PostgreSQL Source Code git master
trgm_regexp.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * trgm_regexp.c
4 * Regular expression matching using trigrams.
5 *
6 * The general idea of trigram index support for a regular expression (regex)
7 * search is to transform the regex into a logical expression on trigrams.
8 * For example:
9 *
10 * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11 *
12 * If a string matches the regex, then it must match the logical expression on
13 * trigrams. The opposite is not necessarily true, however: a string that
14 * matches the logical expression might not match the original regex. Such
15 * false positives are removed via recheck, by running the regular regex match
16 * operator on the retrieved heap tuple.
17 *
18 * Since the trigram expression involves both AND and OR operators, we can't
19 * expect the core index machinery to evaluate it completely. Instead, the
20 * result of regex analysis is a list of trigrams to be sought in the index,
21 * plus a simplified graph that is used by trigramsMatchGraph() to determine
22 * whether a particular indexed value matches the expression.
23 *
24 * Converting a regex to a trigram expression is based on analysis of an
25 * automaton corresponding to the regex. The algorithm consists of four
26 * stages:
27 *
28 * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29 * regexp library, which provides accessors for its opaque regex_t struct
30 * to expose the NFA state graph and the "colors" (sets of equivalent
31 * characters) used as state transition labels.
32 *
33 * 2) Transform the original NFA into an expanded graph, where arcs
34 * are labeled with trigrams that must be present in order to move from
35 * one state to another via the arcs. The trigrams used in this stage
36 * consist of colors, not characters, as in the original NFA.
37 *
38 * 3) Expand the color trigrams into regular trigrams consisting of
39 * characters. If too many distinct trigrams are produced, trigrams are
40 * eliminated and the graph is simplified until it's simple enough.
41 *
42 * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43 * and returned to the caller.
44 *
45 * 1) Compile the regexp to NFA form
46 * ---------------------------------
47 * The automaton returned by the regexp compiler is a graph where vertices
48 * are "states" and arcs are labeled with colors. Each color represents
49 * a set of characters, so that all characters assigned to the same color
50 * are interchangeable, so far as matching the regexp is concerned. There
51 * are two special states: "initial" and "final". A state can have multiple
52 * outgoing arcs labeled with the same color, which makes the automaton
53 * non-deterministic, because it can be in many states simultaneously.
54 *
55 * Note that this NFA is already lossy compared to the original regexp,
56 * since it ignores some regex features such as lookahead constraints and
57 * backref matching. This is OK for our purposes since it's still the case
58 * that only strings matching the NFA can possibly satisfy the regexp.
59 *
60 * 2) Transform the original NFA into an expanded graph
61 * ----------------------------------------------------
62 * In the 2nd stage, the automaton is transformed into a graph based on the
63 * original NFA. Each state in the expanded graph represents a state from
64 * the original NFA, plus a prefix identifying the last two characters
65 * (colors, to be precise) seen before entering the state. There can be
66 * multiple states in the expanded graph for each state in the original NFA,
67 * depending on what characters can precede it. A prefix position can be
68 * "unknown" if it's uncertain what the preceding character was, or "blank"
69 * if the character was a non-word character (we don't need to distinguish
70 * which non-word character it was, so just think of all of them as blanks).
71 *
72 * For convenience in description, call an expanded-state identifier
73 * (two prefix colors plus a state number from the original NFA) an
74 * "enter key".
75 *
76 * Each arc of the expanded graph is labeled with a trigram that must be
77 * present in the string to match. We can construct this from an out-arc of
78 * the underlying NFA state by combining the expanded state's prefix with the
79 * color label of the underlying out-arc, if neither prefix position is
80 * "unknown". But note that some of the colors in the trigram might be
81 * "blank". This is OK since we want to generate word-boundary trigrams as
82 * the regular trigram machinery would, if we know that some word characters
83 * must be adjacent to a word boundary in all strings matching the NFA.
84 *
85 * The expanded graph can also have fewer states than the original NFA,
86 * because we don't bother to make a separate state entry unless the state
87 * is reachable by a valid arc. When an enter key is reachable from a state
88 * of the expanded graph, but we do not know a complete trigram associated
89 * with that transition, we cannot make a valid arc; instead we insert the
90 * enter key into the enterKeys list of the source state. This effectively
91 * means that the two expanded states are not reliably distinguishable based
92 * on examining trigrams.
93 *
94 * So the expanded graph resembles the original NFA, but the arcs are
95 * labeled with trigrams instead of individual characters, and there may be
96 * more or fewer states. It is a lossy representation of the original NFA:
97 * any string that matches the original regexp must match the expanded graph,
98 * but the reverse is not true.
99 *
100 * We build the expanded graph through a breadth-first traversal of states
101 * reachable from the initial state. At each reachable state, we identify the
102 * states reachable from it without traversing a predictable trigram, and add
103 * those states' enter keys to the current state. Then we generate all
104 * out-arcs leading out of this collection of states that have predictable
105 * trigrams, adding their target states to the queue of states to examine.
106 *
107 * When building the graph, if the number of states or arcs exceed pre-defined
108 * limits, we give up and simply mark any states not yet processed as final
109 * states. Roughly speaking, that means that we make use of some portion from
110 * the beginning of the regexp. Also, any colors that have too many member
111 * characters are treated as "unknown", so that we can't derive trigrams
112 * from them.
113 *
114 * 3) Expand the color trigrams into regular trigrams
115 * --------------------------------------------------
116 * The trigrams in the expanded graph are "color trigrams", consisting
117 * of three consecutive colors that must be present in the string. But for
118 * search, we need regular trigrams consisting of characters. In the 3rd
119 * stage, the color trigrams are expanded into regular trigrams. Since each
120 * color can represent many characters, the total number of regular trigrams
121 * after expansion could be very large. Because searching the index for
122 * thousands of trigrams would be slow, and would likely produce so many
123 * false positives that we would have to traverse a large fraction of the
124 * index, the graph is simplified further in a lossy fashion by removing
125 * color trigrams. When a color trigram is removed, the states connected by
126 * any arcs labeled with that trigram are merged.
127 *
128 * Trigrams do not all have equivalent value for searching: some of them are
129 * more frequent and some of them are less frequent. Ideally, we would like
130 * to know the distribution of trigrams, but we don't. But because of padding
131 * we know for sure that the empty character is more frequent than others,
132 * so we can penalize trigrams according to presence of whitespace. The
133 * penalty assigned to each color trigram is the number of simple trigrams
134 * it would produce, times the penalties[] multiplier associated with its
135 * whitespace content. (The penalties[] constants were calculated by analysis
136 * of some real-life text.) We eliminate color trigrams starting with the
137 * highest-penalty one, until we get to a total penalty of no more than
138 * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139 * lead to merging the initial and final states, so we may not be able to
140 * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141 * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142 *
143 * 4) Pack the graph into a compact representation
144 * -----------------------------------------------
145 * The 2nd and 3rd stages might have eliminated or merged many of the states
146 * and trigrams created earlier, so in this final stage, the graph is
147 * compacted and packed into a simpler struct that contains only the
148 * information needed to evaluate it.
149 *
150 * ALGORITHM EXAMPLE:
151 *
152 * Consider the example regex "ab[cd]". This regex is transformed into the
153 * following NFA (for simplicity we show colors as their single members):
154 *
155 * 4#
156 * c/
157 * a b /
158 * 1* --- 2 ---- 3
159 * \
160 * d\
161 * 5#
162 *
163 * We use * to mark initial state and # to mark final state. It's not depicted,
164 * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165 * because this pattern can match to any part of a string.
166 *
167 * As the result of stage 2 we will have the following graph:
168 *
169 * abc abd
170 * 2# <---- 1* ----> 3#
171 *
172 * The process for generating this graph is:
173 * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174 * 2) Add key (UNKNOWN, "a", 2) to state 1.
175 * 3) Add key ("a", "b", 3) to state 1.
176 * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177 * from state 1 to state 2 with label trigram "abc".
178 * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179 * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180 * from state 1 to state 3 with label trigram "abd".
181 * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182 *
183 *
184 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
185 * Portions Copyright (c) 1994, Regents of the University of California
186 *
187 * IDENTIFICATION
188 * contrib/pg_trgm/trgm_regexp.c
189 *
190 *-------------------------------------------------------------------------
191 */
192#include "postgres.h"
193
194#include "catalog/pg_collation_d.h"
195#include "regex/regexport.h"
196#include "trgm.h"
197#include "tsearch/ts_locale.h"
198#include "utils/formatting.h"
199#include "utils/hsearch.h"
200#include "utils/memutils.h"
201#include "varatt.h"
202
203/*
204 * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
205 * for exploring and debugging the algorithm implementation.
206 * This produces three graph files in /tmp, in Graphviz .gv format.
207 * Some progress information is also printed to postmaster stderr.
208 */
209/* #define TRGM_REGEXP_DEBUG */
210
211/*
212 * These parameters are used to limit the amount of work done.
213 * Otherwise regex processing could be too slow and memory-consuming.
214 *
215 * MAX_EXPANDED_STATES - How many states we allow in expanded graph
216 * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
217 * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
218 * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
219 * COLOR_COUNT_LIMIT - Maximum number of characters per color
220 */
221#define MAX_EXPANDED_STATES 128
222#define MAX_EXPANDED_ARCS 1024
223#define MAX_TRGM_COUNT 256
224#define WISH_TRGM_PENALTY 16
225#define COLOR_COUNT_LIMIT 256
226
227/*
228 * Penalty multipliers for trigram counts depending on whitespace contents.
229 * Numbers based on analysis of real-life texts.
230 */
231static const float4 penalties[8] = {
232 1.0f, /* "aaa" */
233 3.5f, /* "aa " */
234 0.0f, /* "a a" (impossible) */
235 0.0f, /* "a " (impossible) */
236 4.2f, /* " aa" */
237 2.1f, /* " a " */
238 25.0f, /* " a" */
239 0.0f /* " " (impossible) */
240};
241
242/* Struct representing a single pg_wchar, converted back to multibyte form */
243typedef struct
244{
247
248/*
249 * Attributes of NFA colors:
250 *
251 * expandable - we know the character expansion of this color
252 * containsNonWord - color contains non-word characters
253 * (which will not be extracted into trigrams)
254 * wordCharsCount - count of word characters in color
255 * wordChars - array of this color's word characters
256 * (which can be extracted into trigrams)
257 *
258 * When expandable is false, the other attributes don't matter; we just
259 * assume this color represents unknown character(s).
260 */
261typedef struct
262{
268
269/*
270 * A "prefix" is information about the colors of the last two characters read
271 * before reaching a specific NFA state. These colors can have special values
272 * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
273 * information, for example because we read some character of an unexpandable
274 * color. COLOR_BLANK means that we read a non-word character.
275 *
276 * We call a prefix ambiguous if at least one of its colors is unknown. It's
277 * fully ambiguous if both are unknown, partially ambiguous if only the first
278 * is unknown. (The case of first color known, second unknown is not valid.)
279 *
280 * Wholly- or partly-blank prefixes are mostly handled the same as regular
281 * color prefixes. This allows us to generate appropriate partly-blank
282 * trigrams when the NFA requires word character(s) to appear adjacent to
283 * non-word character(s).
284 */
285typedef int TrgmColor;
286
287/* We assume that colors returned by the regexp engine cannot be these: */
288#define COLOR_UNKNOWN (-3)
289#define COLOR_BLANK (-4)
290
291typedef struct
292{
293 TrgmColor colors[2];
294} TrgmPrefix;
295
296/*
297 * Color-trigram data type. Note that some elements of the trigram can be
298 * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
299 */
300typedef struct
301{
302 TrgmColor colors[3];
303} ColorTrgm;
304
305/*
306 * Key identifying a state of our expanded graph: color prefix, and number
307 * of the corresponding state in the underlying regex NFA. The color prefix
308 * shows how we reached the regex state (to the extent that we know it).
309 */
310typedef struct
311{
315
316/*
317 * One state of the expanded graph.
318 *
319 * stateKey - ID of this state
320 * arcs - outgoing arcs of this state (List of TrgmArc)
321 * enterKeys - enter keys reachable from this state without reading any
322 * predictable trigram (List of TrgmStateKey)
323 * flags - flag bits
324 * snumber - number of this state (initially assigned as -1, -2, etc,
325 * for debugging purposes only; then at the packaging stage,
326 * surviving states are renumbered with positive numbers)
327 * parent - parent state, if this state has been merged into another
328 * tentFlags - flags this state would acquire via planned merges
329 * tentParent - planned parent state, if considering a merge
330 */
331#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
332#define TSTATE_FIN 0x02 /* flag indicating this state is final */
333
334typedef struct TrgmState
335{
336 TrgmStateKey stateKey; /* hashtable key: must be first field */
339 int flags;
345
346/*
347 * One arc in the expanded graph.
348 */
349typedef struct
350{
351 ColorTrgm ctrgm; /* trigram needed to traverse arc */
352 TrgmState *target; /* next state */
353} TrgmArc;
354
355/*
356 * Information about arc of specific color trigram (used in stage 3)
357 *
358 * Contains pointers to the source and target states.
359 */
360typedef struct
361{
365
366/*
367 * Information about color trigram (used in stage 3)
368 *
369 * ctrgm - trigram itself
370 * cnumber - number of this trigram (used in the packaging stage)
371 * count - number of simple trigrams created from this color trigram
372 * expanded - indicates this color trigram is expanded into simple trigrams
373 * arcs - list of all arcs labeled with this color trigram.
374 */
375typedef struct
376{
379 int count;
384
385/*
386 * Data structure representing all the data we need during regex processing.
387 *
388 * regex - compiled regex
389 * colorInfo - extracted information about regex's colors
390 * ncolors - number of colors in colorInfo[]
391 * states - hashtable of TrgmStates (states of expanded graph)
392 * initState - pointer to initial state of expanded graph
393 * queue - queue of to-be-processed TrgmStates
394 * keysQueue - queue of to-be-processed TrgmStateKeys
395 * arcsCount - total number of arcs of expanded graph (for resource
396 * limiting)
397 * overflowed - we have exceeded resource limit for transformation
398 * colorTrgms - array of all color trigrams present in graph
399 * colorTrgmsCount - count of those color trigrams
400 * totalTrgmCount - total count of extracted simple trigrams
401 */
402typedef struct
403{
404 /* Source regexp, and color information extracted from it (stage 1) */
408
409 /* Expanded graph (stage 2) */
413
414 /* Workspace for stage 2 */
419
420 /* Information about distinct color trigrams in the graph (stage 3) */
424} TrgmNFA;
425
426/*
427 * Final, compact representation of expanded graph.
428 */
429typedef struct
430{
431 int targetState; /* index of target state (zero-based) */
432 int colorTrgm; /* index of color trigram for transition */
434
435typedef struct
436{
437 int arcsCount; /* number of out-arcs for this state */
438 TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
440
441/* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
443{
444 /*
445 * colorTrigramsCount and colorTrigramGroups contain information about how
446 * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
447 * count of color trigrams and "colorTrigramGroups" contains number of
448 * simple trigrams for each color trigram. The array of simple trigrams
449 * (stored separately from this struct) is ordered so that the simple
450 * trigrams for each color trigram are consecutive, and they're in order
451 * by color trigram number.
452 */
454 int *colorTrigramGroups; /* array of size colorTrigramsCount */
455
456 /*
457 * The states of the simplified NFA. State number 0 is always initial
458 * state and state number 1 is always final state.
459 */
461 TrgmPackedState *states; /* array of size statesCount */
462
463 /* Temporary work space for trigramsMatchGraph() */
464 bool *colorTrigramsActive; /* array of size colorTrigramsCount */
465 bool *statesActive; /* array of size statesCount */
466 int *statesQueue; /* array of size statesCount */
467};
468
469/*
470 * Temporary structure for representing an arc during packaging.
471 */
472typedef struct
473{
478
479
480/* prototypes for private functions */
481static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
482 MemoryContext rcontext);
483static void RE_compile(regex_t *regex, text *text_re,
484 int cflags, Oid collation);
485static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
486static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
487static void transformGraph(TrgmNFA *trgmNFA);
488static void processState(TrgmNFA *trgmNFA, TrgmState *state);
489static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
490static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
491static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
492static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
493 TrgmColor co, TrgmStateKey *destKey);
494static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
495static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
496static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
497static bool selectColorTrigrams(TrgmNFA *trgmNFA);
498static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
499static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
500static void mergeStates(TrgmState *state1, TrgmState *state2);
501static int colorTrgmInfoCmp(const void *p1, const void *p2);
502static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
503static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
504static int packArcInfoCmp(const void *a1, const void *a2);
505
506#ifdef TRGM_REGEXP_DEBUG
507static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
508static void printTrgmNFA(TrgmNFA *trgmNFA);
509static void printTrgmColor(StringInfo buf, TrgmColor co);
510static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
511#endif
512
513
514/*
515 * Main entry point to process a regular expression.
516 *
517 * Returns an array of trigrams required by the regular expression, or NULL if
518 * the regular expression was too complex to analyze. In addition, a packed
519 * graph representation of the regex is returned into *graph. The results
520 * must be allocated in rcontext (which might or might not be the current
521 * context).
522 */
523TRGM *
524createTrgmNFA(text *text_re, Oid collation,
525 TrgmPackedGraph **graph, MemoryContext rcontext)
526{
527 TRGM *trg;
528 regex_t regex;
529 MemoryContext tmpcontext;
530 MemoryContext oldcontext;
531
532 /*
533 * This processing generates a great deal of cruft, which we'd like to
534 * clean up before returning (since this function may be called in a
535 * query-lifespan memory context). Make a temp context we can work in so
536 * that cleanup is easy.
537 */
539 "createTrgmNFA temporary context",
541 oldcontext = MemoryContextSwitchTo(tmpcontext);
542
543 /*
544 * Stage 1: Compile the regexp into a NFA, using the regexp library.
545 */
546#ifdef IGNORECASE
547 RE_compile(&regex, text_re,
548 REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
549#else
550 RE_compile(&regex, text_re,
551 REG_ADVANCED | REG_NOSUB, collation);
552#endif
553
554 trg = createTrgmNFAInternal(&regex, graph, rcontext);
555
556 /* Clean up all the cruft we created (including regex) */
557 MemoryContextSwitchTo(oldcontext);
558 MemoryContextDelete(tmpcontext);
559
560 return trg;
561}
562
563/*
564 * Body of createTrgmNFA, exclusive of regex compilation/freeing.
565 */
566static TRGM *
568 MemoryContext rcontext)
569{
570 TRGM *trg;
571 TrgmNFA trgmNFA;
572
573 trgmNFA.regex = regex;
574
575 /* Collect color information from the regex */
576 getColorInfo(regex, &trgmNFA);
577
578#ifdef TRGM_REGEXP_DEBUG
579 printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
580#endif
581
582 /*
583 * Stage 2: Create an expanded graph from the source NFA.
584 */
585 transformGraph(&trgmNFA);
586
587#ifdef TRGM_REGEXP_DEBUG
588 printTrgmNFA(&trgmNFA);
589#endif
590
591 /*
592 * Fail if we were unable to make a nontrivial graph, ie it is possible to
593 * get from the initial state to the final state without reading any
594 * predictable trigram.
595 */
596 if (trgmNFA.initState->flags & TSTATE_FIN)
597 return NULL;
598
599 /*
600 * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
601 */
602 if (!selectColorTrigrams(&trgmNFA))
603 return NULL;
604
605 /*
606 * Stage 4: Expand color trigrams and pack graph into final
607 * representation.
608 */
609 trg = expandColorTrigrams(&trgmNFA, rcontext);
610
611 *graph = packGraph(&trgmNFA, rcontext);
612
613#ifdef TRGM_REGEXP_DEBUG
614 printTrgmPackedGraph(*graph, trg);
615#endif
616
617 return trg;
618}
619
620/*
621 * Main entry point for evaluating a graph during index scanning.
622 *
623 * The check[] array is indexed by trigram number (in the array of simple
624 * trigrams returned by createTrgmNFA), and holds true for those trigrams
625 * that are present in the index entry being checked.
626 */
627bool
629{
630 int i,
631 j,
632 k,
633 queueIn,
634 queueOut;
635
636 /*
637 * Reset temporary working areas.
638 */
639 memset(graph->colorTrigramsActive, 0,
640 sizeof(bool) * graph->colorTrigramsCount);
641 memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
642
643 /*
644 * Check which color trigrams were matched. A match for any simple
645 * trigram associated with a color trigram counts as a match of the color
646 * trigram.
647 */
648 j = 0;
649 for (i = 0; i < graph->colorTrigramsCount; i++)
650 {
651 int cnt = graph->colorTrigramGroups[i];
652
653 for (k = j; k < j + cnt; k++)
654 {
655 if (check[k])
656 {
657 /*
658 * Found one matched trigram in the group. Can skip the rest
659 * of them and go to the next group.
660 */
661 graph->colorTrigramsActive[i] = true;
662 break;
663 }
664 }
665 j = j + cnt;
666 }
667
668 /*
669 * Initialize the statesQueue to hold just the initial state. Note:
670 * statesQueue has room for statesCount entries, which is certainly enough
671 * since no state will be put in the queue more than once. The
672 * statesActive array marks which states have been queued.
673 */
674 graph->statesActive[0] = true;
675 graph->statesQueue[0] = 0;
676 queueIn = 0;
677 queueOut = 1;
678
679 /* Process queued states as long as there are any. */
680 while (queueIn < queueOut)
681 {
682 int stateno = graph->statesQueue[queueIn++];
683 TrgmPackedState *state = &graph->states[stateno];
684 int cnt = state->arcsCount;
685
686 /* Loop over state's out-arcs */
687 for (i = 0; i < cnt; i++)
688 {
689 TrgmPackedArc *arc = &state->arcs[i];
690
691 /*
692 * If corresponding color trigram is present then activate the
693 * corresponding state. We're done if that's the final state,
694 * otherwise queue the state if it's not been queued already.
695 */
696 if (graph->colorTrigramsActive[arc->colorTrgm])
697 {
698 int nextstate = arc->targetState;
699
700 if (nextstate == 1)
701 return true; /* success: final state is reachable */
702
703 if (!graph->statesActive[nextstate])
704 {
705 graph->statesActive[nextstate] = true;
706 graph->statesQueue[queueOut++] = nextstate;
707 }
708 }
709 }
710 }
711
712 /* Queue is empty, so match fails. */
713 return false;
714}
715
716/*
717 * Compile regex string into struct at *regex.
718 * NB: pg_regfree must be applied to regex if this completes successfully.
719 */
720static void
721RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
722{
723 int text_re_len = VARSIZE_ANY_EXHDR(text_re);
724 char *text_re_val = VARDATA_ANY(text_re);
725 pg_wchar *pattern;
726 int pattern_len;
727 int regcomp_result;
728 char errMsg[100];
729
730 /* Convert pattern string to wide characters */
731 pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
732 pattern_len = pg_mb2wchar_with_len(text_re_val,
733 pattern,
734 text_re_len);
735
736 /* Compile regex */
737 regcomp_result = pg_regcomp(regex,
738 pattern,
739 pattern_len,
740 cflags,
741 collation);
742
743 pfree(pattern);
744
745 if (regcomp_result != REG_OKAY)
746 {
747 /* re didn't compile (no need for pg_regfree, if so) */
748 pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
750 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
751 errmsg("invalid regular expression: %s", errMsg)));
752 }
753}
754
755
756/*---------------------
757 * Subroutines for pre-processing the color map (stage 1).
758 *---------------------
759 */
760
761/*
762 * Fill TrgmColorInfo structure for each color using regex export functions.
763 */
764static void
765getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
766{
767 int colorsCount = pg_reg_getnumcolors(regex);
768 int i;
769
770 trgmNFA->ncolors = colorsCount;
771 trgmNFA->colorInfo = (TrgmColorInfo *)
772 palloc0(colorsCount * sizeof(TrgmColorInfo));
773
774 /*
775 * Loop over colors, filling TrgmColorInfo about each. Note we include
776 * WHITE (0) even though we know it'll be reported as non-expandable.
777 */
778 for (i = 0; i < colorsCount; i++)
779 {
780 TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
781 int charsCount = pg_reg_getnumcharacters(regex, i);
783 int j;
784
785 if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
786 {
787 /* Non expandable, or too large to work with */
788 colorInfo->expandable = false;
789 continue;
790 }
791
792 colorInfo->expandable = true;
793 colorInfo->containsNonWord = false;
794 colorInfo->wordChars = palloc_array(trgm_mb_char, charsCount);
795 colorInfo->wordCharsCount = 0;
796
797 /* Extract all the chars in this color */
798 chars = palloc_array(pg_wchar, charsCount);
799 pg_reg_getcharacters(regex, i, chars, charsCount);
800
801 /*
802 * Convert characters back to multibyte form, and save only those that
803 * are word characters. Set "containsNonWord" if any non-word
804 * character. (Note: it'd probably be nicer to keep the chars in
805 * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
806 */
807 for (j = 0; j < charsCount; j++)
808 {
810
811 if (!convertPgWchar(chars[j], &c))
812 continue; /* ok to ignore it altogether */
813 if (ISWORDCHR(c.bytes))
814 colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
815 else
816 colorInfo->containsNonWord = true;
817 }
818
819 pfree(chars);
820 }
821}
822
823/*
824 * Convert pg_wchar to multibyte format.
825 * Returns false if the character should be ignored completely.
826 */
827static bool
829{
830 /* "s" has enough space for a multibyte character and a trailing NUL */
831 char s[MAX_MULTIBYTE_CHAR_LEN + 1];
832
833 /*
834 * We can ignore the NUL character, since it can never appear in a PG text
835 * string. This avoids the need for various special cases when
836 * reconstructing trigrams.
837 */
838 if (c == 0)
839 return false;
840
841 /* Do the conversion, making sure the result is NUL-terminated */
842 memset(s, 0, sizeof(s));
843 pg_wchar2mb_with_len(&c, s, 1);
844
845 /*
846 * In IGNORECASE mode, we can ignore uppercase characters. We assume that
847 * the regex engine generated both uppercase and lowercase equivalents
848 * within each color, since we used the REG_ICASE option; so there's no
849 * need to process the uppercase version.
850 *
851 * XXX this code is dependent on the assumption that str_tolower() works
852 * the same as the regex engine's internal case folding machinery. Might
853 * be wiser to expose pg_wc_tolower and test whether c ==
854 * pg_wc_tolower(c). On the other hand, the trigrams in the index were
855 * created using str_tolower(), so we're probably screwed if there's any
856 * incompatibility anyway.
857 */
858#ifdef IGNORECASE
859 {
860 char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
861
862 if (strcmp(lowerCased, s) != 0)
863 {
864 pfree(lowerCased);
865 return false;
866 }
867 pfree(lowerCased);
868 }
869#endif
870
871 /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
872 memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
873 return true;
874}
875
876
877/*---------------------
878 * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
879 *---------------------
880 */
881
882/*
883 * Transform the graph, given a regex and extracted color information.
884 *
885 * We create and process a queue of expanded-graph states until all the states
886 * are processed.
887 *
888 * This algorithm may be stopped due to resource limitation. In this case we
889 * force every unprocessed branch to immediately finish with matching (this
890 * can give us false positives but no false negatives) by marking all
891 * unprocessed states as final.
892 */
893static void
895{
896 HASHCTL hashCtl;
897 TrgmStateKey initkey;
898 TrgmState *initstate;
899 ListCell *lc;
900
901 /* Initialize this stage's workspace in trgmNFA struct */
902 trgmNFA->queue = NIL;
903 trgmNFA->keysQueue = NIL;
904 trgmNFA->arcsCount = 0;
905 trgmNFA->overflowed = false;
906
907 /* Create hashtable for states */
908 hashCtl.keysize = sizeof(TrgmStateKey);
909 hashCtl.entrysize = sizeof(TrgmState);
910 hashCtl.hcxt = CurrentMemoryContext;
911 trgmNFA->states = hash_create("Trigram NFA",
912 1024,
913 &hashCtl,
915 trgmNFA->nstates = 0;
916
917 /* Create initial state: ambiguous prefix, NFA's initial state */
918 MemSet(&initkey, 0, sizeof(initkey));
919 initkey.prefix.colors[0] = COLOR_UNKNOWN;
920 initkey.prefix.colors[1] = COLOR_UNKNOWN;
921 initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
922
923 initstate = getState(trgmNFA, &initkey);
924 initstate->flags |= TSTATE_INIT;
925 trgmNFA->initState = initstate;
926
927 /*
928 * Recursively build the expanded graph by processing queue of states
929 * (breadth-first search). getState already put initstate in the queue.
930 * Note that getState will append new states to the queue within the loop,
931 * too; this works as long as we don't do repeat fetches using the "lc"
932 * pointer.
933 */
934 foreach(lc, trgmNFA->queue)
935 {
936 TrgmState *state = (TrgmState *) lfirst(lc);
937
938 /*
939 * If we overflowed then just mark state as final. Otherwise do
940 * actual processing.
941 */
942 if (trgmNFA->overflowed)
943 state->flags |= TSTATE_FIN;
944 else
945 processState(trgmNFA, state);
946
947 /* Did we overflow? */
948 if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
950 trgmNFA->overflowed = true;
951 }
952}
953
954/*
955 * Process one state: add enter keys and then add outgoing arcs.
956 */
957static void
959{
960 ListCell *lc;
961
962 /* keysQueue should be NIL already, but make sure */
963 trgmNFA->keysQueue = NIL;
964
965 /*
966 * Add state's own key, and then process all keys added to keysQueue until
967 * queue is finished. But we can quit if the state gets marked final.
968 */
969 addKey(trgmNFA, state, &state->stateKey);
970 foreach(lc, trgmNFA->keysQueue)
971 {
973
974 if (state->flags & TSTATE_FIN)
975 break;
976 addKey(trgmNFA, state, key);
977 }
978
979 /* Release keysQueue to clean up for next cycle */
980 list_free(trgmNFA->keysQueue);
981 trgmNFA->keysQueue = NIL;
982
983 /*
984 * Add outgoing arcs only if state isn't final (we have no interest in
985 * outgoing arcs if we already match)
986 */
987 if (!(state->flags & TSTATE_FIN))
988 addArcs(trgmNFA, state);
989}
990
991/*
992 * Add the given enter key into the state's enterKeys list, and determine
993 * whether this should result in any further enter keys being added.
994 * If so, add those keys to keysQueue so that processState will handle them.
995 *
996 * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
997 * This situation means that we can reach the final state from this expanded
998 * state without reading any predictable trigram, so we must consider this
999 * state as an accepting one.
1000 *
1001 * The given key could be a duplicate of one already in enterKeys, or be
1002 * redundant with some enterKeys. So we check that before doing anything.
1003 *
1004 * Note that we don't generate any actual arcs here. addArcs will do that
1005 * later, after we have identified all the enter keys for this state.
1006 */
1007static void
1009{
1010 regex_arc_t *arcs;
1011 TrgmStateKey destKey;
1012 ListCell *cell;
1013 int i,
1014 arcsCount;
1015
1016 /*
1017 * Ensure any pad bytes in destKey are zero, since it may get used as a
1018 * hashtable key by getState.
1019 */
1020 MemSet(&destKey, 0, sizeof(destKey));
1021
1022 /*
1023 * Compare key to each existing enter key of the state to check for
1024 * redundancy. We can drop either old key(s) or the new key if we find
1025 * redundancy.
1026 */
1027 foreach(cell, state->enterKeys)
1028 {
1029 TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1030
1031 if (existingKey->nstate == key->nstate)
1032 {
1033 if (prefixContains(&existingKey->prefix, &key->prefix))
1034 {
1035 /* This old key already covers the new key. Nothing to do */
1036 return;
1037 }
1038 if (prefixContains(&key->prefix, &existingKey->prefix))
1039 {
1040 /*
1041 * The new key covers this old key. Remove the old key, it's
1042 * no longer needed once we add this key to the list.
1043 */
1044 state->enterKeys = foreach_delete_current(state->enterKeys,
1045 cell);
1046 }
1047 }
1048 }
1049
1050 /* No redundancy, so add this key to the state's list */
1051 state->enterKeys = lappend(state->enterKeys, key);
1052
1053 /* If state is now known final, mark it and we're done */
1054 if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1055 {
1056 state->flags |= TSTATE_FIN;
1057 return;
1058 }
1059
1060 /*
1061 * Loop through all outgoing arcs of the corresponding state in the
1062 * original NFA.
1063 */
1064 arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1065 arcs = palloc_array(regex_arc_t, arcsCount);
1066 pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1067
1068 for (i = 0; i < arcsCount; i++)
1069 {
1070 regex_arc_t *arc = &arcs[i];
1071
1072 if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1073 {
1074 /*
1075 * Start of line/string (^). Trigram extraction treats start of
1076 * line same as start of word: double space prefix is added.
1077 * Hence, make an enter key showing we can reach the arc
1078 * destination with all-blank prefix.
1079 */
1080 destKey.prefix.colors[0] = COLOR_BLANK;
1081 destKey.prefix.colors[1] = COLOR_BLANK;
1082 destKey.nstate = arc->to;
1083
1084 /* Add enter key to this state */
1085 addKeyToQueue(trgmNFA, &destKey);
1086 }
1087 else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1088 {
1089 /*
1090 * End of line/string ($). We must consider this arc as a
1091 * transition that doesn't read anything. The reason for adding
1092 * this enter key to the state is that if the arc leads to the
1093 * NFA's final state, we must mark this expanded state as final.
1094 */
1095 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1096 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1097 destKey.nstate = arc->to;
1098
1099 /* Add enter key to this state */
1100 addKeyToQueue(trgmNFA, &destKey);
1101 }
1102 else if (arc->co >= 0)
1103 {
1104 /* Regular color (including WHITE) */
1105 TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1106
1107 if (colorInfo->expandable)
1108 {
1109 if (colorInfo->containsNonWord &&
1111 {
1112 /*
1113 * We can reach the arc destination after reading a
1114 * non-word character, but the prefix is not something
1115 * that addArc will accept with COLOR_BLANK, so no trigram
1116 * arc can get made for this transition. We must make an
1117 * enter key to show that the arc destination is
1118 * reachable. Set it up with an all-blank prefix, since
1119 * that corresponds to what the trigram extraction code
1120 * will do at a word starting boundary.
1121 */
1122 destKey.prefix.colors[0] = COLOR_BLANK;
1123 destKey.prefix.colors[1] = COLOR_BLANK;
1124 destKey.nstate = arc->to;
1125 addKeyToQueue(trgmNFA, &destKey);
1126 }
1127
1128 if (colorInfo->wordCharsCount > 0 &&
1129 !validArcLabel(key, arc->co))
1130 {
1131 /*
1132 * We can reach the arc destination after reading a word
1133 * character, but the prefix is not something that addArc
1134 * will accept, so no trigram arc can get made for this
1135 * transition. We must make an enter key to show that the
1136 * arc destination is reachable. The prefix for the enter
1137 * key should reflect the info we have for this arc.
1138 */
1139 destKey.prefix.colors[0] = key->prefix.colors[1];
1140 destKey.prefix.colors[1] = arc->co;
1141 destKey.nstate = arc->to;
1142 addKeyToQueue(trgmNFA, &destKey);
1143 }
1144 }
1145 else
1146 {
1147 /*
1148 * Unexpandable color. Add enter key with ambiguous prefix,
1149 * showing we can reach the destination from this state, but
1150 * the preceding colors will be uncertain. (We do not set the
1151 * first prefix color to key->prefix.colors[1], because a
1152 * prefix of known followed by unknown is invalid.)
1153 */
1154 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1155 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1156 destKey.nstate = arc->to;
1157 addKeyToQueue(trgmNFA, &destKey);
1158 }
1159 }
1160 else
1161 {
1162 /* RAINBOW: treat as unexpandable color */
1163 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1164 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1165 destKey.nstate = arc->to;
1166 addKeyToQueue(trgmNFA, &destKey);
1167 }
1168 }
1169
1170 pfree(arcs);
1171}
1172
1173/*
1174 * Add copy of given key to keysQueue for later processing.
1175 */
1176static void
1178{
1180
1181 memcpy(keyCopy, key, sizeof(TrgmStateKey));
1182 trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1183}
1184
1185/*
1186 * Add outgoing arcs from given state, whose enter keys are all now known.
1187 */
1188static void
1190{
1191 TrgmStateKey destKey;
1192 ListCell *cell;
1193 regex_arc_t *arcs;
1194 int arcsCount,
1195 i;
1196
1197 /*
1198 * Ensure any pad bytes in destKey are zero, since it may get used as a
1199 * hashtable key by getState.
1200 */
1201 MemSet(&destKey, 0, sizeof(destKey));
1202
1203 /*
1204 * Iterate over enter keys associated with this expanded-graph state. This
1205 * includes both the state's own stateKey, and any enter keys we added to
1206 * it during addKey (which represent expanded-graph states that are not
1207 * distinguishable from this one by means of trigrams). For each such
1208 * enter key, examine all the out-arcs of the key's underlying NFA state,
1209 * and try to make a trigram arc leading to where the out-arc leads.
1210 * (addArc will deal with whether the arc is valid or not.)
1211 */
1212 foreach(cell, state->enterKeys)
1213 {
1214 TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1215
1216 arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1217 arcs = palloc_array(regex_arc_t, arcsCount);
1218 pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1219
1220 for (i = 0; i < arcsCount; i++)
1221 {
1222 regex_arc_t *arc = &arcs[i];
1223 TrgmColorInfo *colorInfo;
1224
1225 /*
1226 * Ignore non-expandable colors; addKey already handled the case.
1227 *
1228 * We need no special check for WHITE or begin/end pseudocolors
1229 * here. We don't need to do any processing for them, and they
1230 * will be marked non-expandable since the regex engine will have
1231 * reported them that way. We do have to watch out for RAINBOW,
1232 * which has a negative color number.
1233 */
1234 if (arc->co < 0)
1235 continue;
1236 Assert(arc->co < trgmNFA->ncolors);
1237
1238 colorInfo = &trgmNFA->colorInfo[arc->co];
1239 if (!colorInfo->expandable)
1240 continue;
1241
1242 if (colorInfo->containsNonWord)
1243 {
1244 /*
1245 * Color includes non-word character(s).
1246 *
1247 * Generate an arc, treating this transition as occurring on
1248 * BLANK. This allows word-ending trigrams to be manufactured
1249 * if possible.
1250 */
1251 destKey.prefix.colors[0] = key->prefix.colors[1];
1252 destKey.prefix.colors[1] = COLOR_BLANK;
1253 destKey.nstate = arc->to;
1254
1255 addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
1256 }
1257
1258 if (colorInfo->wordCharsCount > 0)
1259 {
1260 /*
1261 * Color includes word character(s).
1262 *
1263 * Generate an arc. Color is pushed into prefix of target
1264 * state.
1265 */
1266 destKey.prefix.colors[0] = key->prefix.colors[1];
1267 destKey.prefix.colors[1] = arc->co;
1268 destKey.nstate = arc->to;
1269
1270 addArc(trgmNFA, state, key, arc->co, &destKey);
1271 }
1272 }
1273
1274 pfree(arcs);
1275 }
1276}
1277
1278/*
1279 * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1280 *
1281 * state: expanded-graph state we want to add an out-arc to
1282 * key: provides prefix colors (key->nstate is not used)
1283 * co: transition color
1284 * destKey: identifier for destination state of expanded graph
1285 */
1286static void
1288 TrgmColor co, TrgmStateKey *destKey)
1289{
1290 TrgmArc *arc;
1291 ListCell *cell;
1292
1293 /* Do nothing if this wouldn't be a valid arc label trigram */
1294 if (!validArcLabel(key, co))
1295 return;
1296
1297 /*
1298 * Check if we are going to reach key which is covered by a key which is
1299 * already listed in this state. If so arc is useless: the NFA can bypass
1300 * it through a path that doesn't require any predictable trigram, so
1301 * whether the arc's trigram is present or not doesn't really matter.
1302 */
1303 foreach(cell, state->enterKeys)
1304 {
1305 TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1306
1307 if (existingKey->nstate == destKey->nstate &&
1308 prefixContains(&existingKey->prefix, &destKey->prefix))
1309 return;
1310 }
1311
1312 /* Checks were successful, add new arc */
1314 arc->target = getState(trgmNFA, destKey);
1315 arc->ctrgm.colors[0] = key->prefix.colors[0];
1316 arc->ctrgm.colors[1] = key->prefix.colors[1];
1317 arc->ctrgm.colors[2] = co;
1318
1319 state->arcs = lappend(state->arcs, arc);
1320 trgmNFA->arcsCount++;
1321}
1322
1323/*
1324 * Can we make a valid trigram arc label from the given prefix and arc color?
1325 *
1326 * This is split out so that tests in addKey and addArc will stay in sync.
1327 */
1328static bool
1330{
1331 /*
1332 * We have to know full trigram in order to add outgoing arc. So we can't
1333 * do it if prefix is ambiguous.
1334 */
1335 if (key->prefix.colors[0] == COLOR_UNKNOWN)
1336 return false;
1337
1338 /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1339 Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1340 /* And we should not be called with an unknown arc color anytime */
1341 Assert(co != COLOR_UNKNOWN);
1342
1343 /*
1344 * We don't bother with making arcs representing three non-word
1345 * characters, since that's useless for trigram extraction.
1346 */
1347 if (key->prefix.colors[0] == COLOR_BLANK &&
1348 key->prefix.colors[1] == COLOR_BLANK &&
1349 co == COLOR_BLANK)
1350 return false;
1351
1352 /*
1353 * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1354 * case doesn't correspond to any trigram the trigram extraction code
1355 * would make. The nonblank-blank-blank case is also not possible with
1356 * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1357 * trigram even if it were valid, for example processing "foo bar" will
1358 * not result in considering the trigram "o ". So if you want to support
1359 * RPADDING = 2, there's more to do than just twiddle this test.)
1360 */
1361 if (key->prefix.colors[0] != COLOR_BLANK &&
1362 key->prefix.colors[1] == COLOR_BLANK)
1363 return false;
1364
1365 /*
1366 * Other combinations involving blank are valid, in particular we assume
1367 * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1368 *
1369 * Note: Using again the example "foo bar", we will not consider the
1370 * trigram " b", though this trigram would be found by the trigram
1371 * extraction code. Since we will find " ba", it doesn't seem worth
1372 * trying to hack the algorithm to generate the additional trigram.
1373 */
1374
1375 /* arc label is valid */
1376 return true;
1377}
1378
1379/*
1380 * Get state of expanded graph for given state key,
1381 * and queue the state for processing if it didn't already exist.
1382 */
1383static TrgmState *
1385{
1387 bool found;
1388
1390 &found);
1391 if (!found)
1392 {
1393 /* New state: initialize and queue it */
1394 state->arcs = NIL;
1395 state->enterKeys = NIL;
1396 state->flags = 0;
1397 /* states are initially given negative numbers */
1398 state->snumber = -(++trgmNFA->nstates);
1399 state->parent = NULL;
1400 state->tentFlags = 0;
1401 state->tentParent = NULL;
1402
1403 trgmNFA->queue = lappend(trgmNFA->queue, state);
1404 }
1405 return state;
1406}
1407
1408/*
1409 * Check if prefix1 "contains" prefix2.
1410 *
1411 * "contains" means that any exact prefix (with no ambiguity) that satisfies
1412 * prefix2 also satisfies prefix1.
1413 */
1414static bool
1416{
1417 if (prefix1->colors[1] == COLOR_UNKNOWN)
1418 {
1419 /* Fully ambiguous prefix contains everything */
1420 return true;
1421 }
1422 else if (prefix1->colors[0] == COLOR_UNKNOWN)
1423 {
1424 /*
1425 * Prefix with only first unknown color contains every prefix with
1426 * same second color.
1427 */
1428 if (prefix1->colors[1] == prefix2->colors[1])
1429 return true;
1430 else
1431 return false;
1432 }
1433 else
1434 {
1435 /* Exact prefix contains only the exact same prefix */
1436 if (prefix1->colors[0] == prefix2->colors[0] &&
1437 prefix1->colors[1] == prefix2->colors[1])
1438 return true;
1439 else
1440 return false;
1441 }
1442}
1443
1444
1445/*---------------------
1446 * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1447 *---------------------
1448 */
1449
1450/*
1451 * Get vector of all color trigrams in graph and select which of them
1452 * to expand into simple trigrams.
1453 *
1454 * Returns true if OK, false if exhausted resource limits.
1455 */
1456static bool
1458{
1459 HASH_SEQ_STATUS scan_status;
1460 int arcsCount = trgmNFA->arcsCount,
1461 i;
1463 ColorTrgmInfo *colorTrgms;
1464 int64 totalTrgmCount;
1465 float4 totalTrgmPenalty;
1466 int cnumber;
1467
1468 /* Collect color trigrams from all arcs */
1469 colorTrgms = palloc0_array(ColorTrgmInfo, arcsCount);
1470 trgmNFA->colorTrgms = colorTrgms;
1471
1472 i = 0;
1473 hash_seq_init(&scan_status, trgmNFA->states);
1474 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1475 {
1476 ListCell *cell;
1477
1478 foreach(cell, state->arcs)
1479 {
1480 TrgmArc *arc = (TrgmArc *) lfirst(cell);
1482 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1483
1484 arcInfo->source = state;
1485 arcInfo->target = arc->target;
1486 trgmInfo->ctrgm = arc->ctrgm;
1487 trgmInfo->cnumber = -1;
1488 /* count and penalty will be set below */
1489 trgmInfo->expanded = true;
1490 trgmInfo->arcs = list_make1(arcInfo);
1491 i++;
1492 }
1493 }
1494 Assert(i == arcsCount);
1495
1496 /* Remove duplicates, merging their arcs lists */
1497 if (arcsCount >= 2)
1498 {
1499 ColorTrgmInfo *p1,
1500 *p2;
1501
1502 /* Sort trigrams to ease duplicate detection */
1503 qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1504
1505 /* p1 is probe point, p2 is last known non-duplicate. */
1506 p2 = colorTrgms;
1507 for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1508 {
1509 if (colorTrgmInfoCmp(p1, p2) > 0)
1510 {
1511 p2++;
1512 *p2 = *p1;
1513 }
1514 else
1515 {
1516 p2->arcs = list_concat(p2->arcs, p1->arcs);
1517 }
1518 }
1519 trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1520 }
1521 else
1522 {
1523 trgmNFA->colorTrgmsCount = arcsCount;
1524 }
1525
1526 /*
1527 * Count number of simple trigrams generated by each color trigram, and
1528 * also compute a penalty value, which is the number of simple trigrams
1529 * times a multiplier that depends on its whitespace content.
1530 *
1531 * Note: per-color-trigram counts cannot overflow an int so long as
1532 * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1533 * 1290. However, the grand total totalTrgmCount might conceivably
1534 * overflow an int, so we use int64 for that within this routine. Also,
1535 * penalties are calculated in float4 arithmetic to avoid any overflow
1536 * worries.
1537 */
1538 totalTrgmCount = 0;
1539 totalTrgmPenalty = 0.0f;
1540 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1541 {
1542 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1543 int j,
1544 count = 1,
1545 typeIndex = 0;
1546
1547 for (j = 0; j < 3; j++)
1548 {
1549 TrgmColor c = trgmInfo->ctrgm.colors[j];
1550
1551 typeIndex *= 2;
1552 if (c == COLOR_BLANK)
1553 typeIndex++;
1554 else
1555 count *= trgmNFA->colorInfo[c].wordCharsCount;
1556 }
1557 trgmInfo->count = count;
1558 totalTrgmCount += count;
1559 trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1560 totalTrgmPenalty += trgmInfo->penalty;
1561 }
1562
1563 /* Sort color trigrams in descending order of their penalties */
1564 qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1566
1567 /*
1568 * Remove color trigrams from the graph so long as total penalty of color
1569 * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1570 * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1571 * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1572 * penalty, since those are the most promising for reducing the total
1573 * penalty. When removing a color trigram we have to merge states
1574 * connected by arcs labeled with that trigram. It's necessary to not
1575 * merge initial and final states, because our graph becomes useless if
1576 * that happens; so we cannot always remove the trigram we'd prefer to.
1577 */
1578 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1579 {
1580 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1581 bool canRemove = true;
1582 ListCell *cell;
1583
1584 /* Done if we've reached the target */
1585 if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
1586 break;
1587
1588#ifdef TRGM_REGEXP_DEBUG
1589 fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1590 trgmInfo->ctrgm.colors[0],
1591 trgmInfo->ctrgm.colors[1],
1592 trgmInfo->ctrgm.colors[2],
1593 trgmInfo->penalty,
1594 list_length(trgmInfo->arcs));
1595#endif
1596
1597 /*
1598 * Does any arc of this color trigram connect initial and final
1599 * states? If so we can't remove it.
1600 */
1601 foreach(cell, trgmInfo->arcs)
1602 {
1603 TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1604 TrgmState *source = arcInfo->source,
1605 *target = arcInfo->target;
1606 int source_flags,
1607 target_flags;
1608
1609#ifdef TRGM_REGEXP_DEBUG
1610 fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1611 -target->snumber, target->flags,
1612 -source->snumber, source->flags);
1613#endif
1614
1615 /* examine parent states, if any merging has already happened */
1616 while (source->parent)
1617 source = source->parent;
1618 while (target->parent)
1619 target = target->parent;
1620
1621#ifdef TRGM_REGEXP_DEBUG
1622 fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1623 -target->snumber, target->flags,
1624 -source->snumber, source->flags);
1625#endif
1626
1627 /* we must also consider merges we are planning right now */
1628 source_flags = source->flags | source->tentFlags;
1629 while (source->tentParent)
1630 {
1631 source = source->tentParent;
1632 source_flags |= source->flags | source->tentFlags;
1633 }
1634 target_flags = target->flags | target->tentFlags;
1635 while (target->tentParent)
1636 {
1637 target = target->tentParent;
1638 target_flags |= target->flags | target->tentFlags;
1639 }
1640
1641#ifdef TRGM_REGEXP_DEBUG
1642 fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1643 -target->snumber, target_flags,
1644 -source->snumber, source_flags);
1645#endif
1646
1647 /* would fully-merged state have both INIT and FIN set? */
1648 if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1650 {
1651 canRemove = false;
1652 break;
1653 }
1654
1655 /* ok so far, so remember planned merge */
1656 if (source != target)
1657 {
1658#ifdef TRGM_REGEXP_DEBUG
1659 fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1660 -target->snumber, -source->snumber);
1661#endif
1662 target->tentParent = source;
1663 source->tentFlags |= target_flags;
1664 }
1665 }
1666
1667 /*
1668 * We must reset all the tentFlags/tentParent fields before
1669 * continuing. tentFlags could only have become set in states that
1670 * are the source or parent or tentative parent of one of the current
1671 * arcs; likewise tentParent could only have become set in states that
1672 * are the target or parent or tentative parent of one of the current
1673 * arcs. There might be some overlap between those sets, but if we
1674 * clear tentFlags in target states as well as source states, we
1675 * should be okay even if we visit a state as target before visiting
1676 * it as a source.
1677 */
1678 foreach(cell, trgmInfo->arcs)
1679 {
1680 TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1681 TrgmState *source = arcInfo->source,
1682 *target = arcInfo->target;
1683 TrgmState *ttarget;
1684
1685 /* no need to touch previously-merged states */
1686 while (source->parent)
1687 source = source->parent;
1688 while (target->parent)
1689 target = target->parent;
1690
1691 while (source)
1692 {
1693 source->tentFlags = 0;
1694 source = source->tentParent;
1695 }
1696
1697 while ((ttarget = target->tentParent) != NULL)
1698 {
1699 target->tentParent = NULL;
1700 target->tentFlags = 0; /* in case it was also a source */
1701 target = ttarget;
1702 }
1703 }
1704
1705 /* Now, move on if we can't drop this trigram */
1706 if (!canRemove)
1707 {
1708#ifdef TRGM_REGEXP_DEBUG
1709 fprintf(stderr, " ... not ok to merge\n");
1710#endif
1711 continue;
1712 }
1713
1714 /* OK, merge states linked by each arc labeled by the trigram */
1715 foreach(cell, trgmInfo->arcs)
1716 {
1717 TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1718 TrgmState *source = arcInfo->source,
1719 *target = arcInfo->target;
1720
1721 while (source->parent)
1722 source = source->parent;
1723 while (target->parent)
1724 target = target->parent;
1725 if (source != target)
1726 {
1727#ifdef TRGM_REGEXP_DEBUG
1728 fprintf(stderr, "merging s%d into s%d\n",
1729 -target->snumber, -source->snumber);
1730#endif
1731 mergeStates(source, target);
1732 /* Assert we didn't merge initial and final states */
1733 Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1735 }
1736 }
1737
1738 /* Mark trigram unexpanded, and update totals */
1739 trgmInfo->expanded = false;
1740 totalTrgmCount -= trgmInfo->count;
1741 totalTrgmPenalty -= trgmInfo->penalty;
1742 }
1743
1744 /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1745 if (totalTrgmCount > MAX_TRGM_COUNT)
1746 return false;
1747
1748 trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1749
1750 /*
1751 * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1752 * and enumerate the color trigrams that are expanded.
1753 */
1754 cnumber = 0;
1755 qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1757 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1758 {
1759 if (colorTrgms[i].expanded)
1760 {
1761 colorTrgms[i].cnumber = cnumber;
1762 cnumber++;
1763 }
1764 }
1765
1766 return true;
1767}
1768
1769/*
1770 * Expand selected color trigrams into regular trigrams.
1771 *
1772 * Returns the TRGM array to be passed to the index machinery.
1773 * The array must be allocated in rcontext.
1774 */
1775static TRGM *
1777{
1778 TRGM *trg;
1779 trgm *p;
1780 int i;
1781 TrgmColorInfo blankColor;
1782 trgm_mb_char blankChar;
1783
1784 /* Set up "blank" color structure containing a single zero character */
1785 memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1786 blankColor.wordCharsCount = 1;
1787 blankColor.wordChars = &blankChar;
1788
1789 /* Construct the trgm array */
1790 trg = (TRGM *)
1791 MemoryContextAllocZero(rcontext,
1792 TRGMHDRSIZE +
1793 trgmNFA->totalTrgmCount * sizeof(trgm));
1794 trg->flag = ARRKEY;
1796 p = GETARR(trg);
1797 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1798 {
1799 ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1800 TrgmColorInfo *c[3];
1801 trgm_mb_char s[3];
1802 int j,
1803 i1,
1804 i2,
1805 i3;
1806
1807 /* Ignore any unexpanded trigrams ... */
1808 if (!colorTrgm->expanded)
1809 continue;
1810
1811 /* Get colors, substituting the dummy struct for COLOR_BLANK */
1812 for (j = 0; j < 3; j++)
1813 {
1814 if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1815 c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1816 else
1817 c[j] = &blankColor;
1818 }
1819
1820 /* Iterate over all possible combinations of colors' characters */
1821 for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1822 {
1823 s[0] = c[0]->wordChars[i1];
1824 for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1825 {
1826 s[1] = c[1]->wordChars[i2];
1827 for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1828 {
1829 s[2] = c[2]->wordChars[i3];
1830 fillTrgm(p, s);
1831 p++;
1832 }
1833 }
1834 }
1835 }
1836
1837 return trg;
1838}
1839
1840/*
1841 * Convert trigram into trgm datatype.
1842 */
1843static void
1845{
1846 char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1847 *p;
1848 int i,
1849 j;
1850
1851 /* Write multibyte string into "str" (we don't need null termination) */
1852 p = str;
1853
1854 for (i = 0; i < 3; i++)
1855 {
1856 if (s[i].bytes[0] != 0)
1857 {
1858 for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1859 *p++ = s[i].bytes[j];
1860 }
1861 else
1862 {
1863 /* Emit a space in place of COLOR_BLANK */
1864 *p++ = ' ';
1865 }
1866 }
1867
1868 /* Convert "str" to a standard trigram (possibly hashing it) */
1869 compact_trigram(ptrgm, str, p - str);
1870}
1871
1872/*
1873 * Merge two states of graph.
1874 */
1875static void
1877{
1878 Assert(state1 != state2);
1879 Assert(!state1->parent);
1880 Assert(!state2->parent);
1881
1882 /* state1 absorbs state2's flags */
1883 state1->flags |= state2->flags;
1884
1885 /* state2, and indirectly all its children, become children of state1 */
1886 state2->parent = state1;
1887}
1888
1889/*
1890 * Compare function for sorting of color trigrams by their colors.
1891 */
1892static int
1893colorTrgmInfoCmp(const void *p1, const void *p2)
1894{
1895 const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1896 const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1897
1898 return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1899}
1900
1901/*
1902 * Compare function for sorting color trigrams in descending order of
1903 * their penalty fields.
1904 */
1905static int
1906colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1907{
1908 float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1909 float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1910
1911 if (penalty1 < penalty2)
1912 return 1;
1913 else if (penalty1 == penalty2)
1914 return 0;
1915 else
1916 return -1;
1917}
1918
1919
1920/*---------------------
1921 * Subroutines for packing the graph into final representation (stage 4).
1922 *---------------------
1923 */
1924
1925/*
1926 * Pack expanded graph into final representation.
1927 *
1928 * The result data must be allocated in rcontext.
1929 */
1930static TrgmPackedGraph *
1932{
1933 int snumber = 2,
1934 arcIndex,
1935 arcsCount;
1936 HASH_SEQ_STATUS scan_status;
1938 TrgmPackArcInfo *arcs;
1939 TrgmPackedArc *packedArcs;
1940 TrgmPackedGraph *result;
1941 int i,
1942 j;
1943
1944 /* Enumerate surviving states, giving init and fin reserved numbers */
1945 hash_seq_init(&scan_status, trgmNFA->states);
1946 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1947 {
1948 while (state->parent)
1949 state = state->parent;
1950
1951 if (state->snumber < 0)
1952 {
1953 if (state->flags & TSTATE_INIT)
1954 state->snumber = 0;
1955 else if (state->flags & TSTATE_FIN)
1956 state->snumber = 1;
1957 else
1958 {
1959 state->snumber = snumber;
1960 snumber++;
1961 }
1962 }
1963 }
1964
1965 /* Collect array of all arcs */
1966 arcs = palloc_array(TrgmPackArcInfo, trgmNFA->arcsCount);
1967 arcIndex = 0;
1968 hash_seq_init(&scan_status, trgmNFA->states);
1969 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1970 {
1972 ListCell *cell;
1973
1974 while (source->parent)
1975 source = source->parent;
1976
1977 foreach(cell, state->arcs)
1978 {
1979 TrgmArc *arc = (TrgmArc *) lfirst(cell);
1980 TrgmState *target = arc->target;
1981
1982 while (target->parent)
1983 target = target->parent;
1984
1985 if (source->snumber != target->snumber)
1986 {
1987 ColorTrgmInfo *ctrgm;
1988
1989 ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1990 trgmNFA->colorTrgms,
1991 trgmNFA->colorTrgmsCount,
1992 sizeof(ColorTrgmInfo),
1994 Assert(ctrgm != NULL);
1995 Assert(ctrgm->expanded);
1996
1997 arcs[arcIndex].sourceState = source->snumber;
1998 arcs[arcIndex].targetState = target->snumber;
1999 arcs[arcIndex].colorTrgm = ctrgm->cnumber;
2000 arcIndex++;
2001 }
2002 }
2003 }
2004
2005 /* Sort arcs to ease duplicate detection */
2006 qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
2007
2008 /* We could have duplicates because states were merged. Remove them. */
2009 if (arcIndex > 1)
2010 {
2011 /* p1 is probe point, p2 is last known non-duplicate. */
2012 TrgmPackArcInfo *p1,
2013 *p2;
2014
2015 p2 = arcs;
2016 for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2017 {
2018 if (packArcInfoCmp(p1, p2) > 0)
2019 {
2020 p2++;
2021 *p2 = *p1;
2022 }
2023 }
2024 arcsCount = (p2 - arcs) + 1;
2025 }
2026 else
2027 arcsCount = arcIndex;
2028
2029 /* Create packed representation */
2030 result = (TrgmPackedGraph *)
2031 MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
2032
2033 /* Pack color trigrams information */
2034 result->colorTrigramsCount = 0;
2035 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2036 {
2037 if (trgmNFA->colorTrgms[i].expanded)
2038 result->colorTrigramsCount++;
2039 }
2040 result->colorTrigramGroups = (int *)
2041 MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2042 j = 0;
2043 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2044 {
2045 if (trgmNFA->colorTrgms[i].expanded)
2046 {
2047 result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2048 j++;
2049 }
2050 }
2051
2052 /* Pack states and arcs information */
2053 result->statesCount = snumber;
2054 result->states = (TrgmPackedState *)
2055 MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2056 packedArcs = (TrgmPackedArc *)
2057 MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2058 j = 0;
2059 for (i = 0; i < snumber; i++)
2060 {
2061 int cnt = 0;
2062
2063 result->states[i].arcs = &packedArcs[j];
2064 while (j < arcsCount && arcs[j].sourceState == i)
2065 {
2066 packedArcs[j].targetState = arcs[j].targetState;
2067 packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2068 cnt++;
2069 j++;
2070 }
2071 result->states[i].arcsCount = cnt;
2072 }
2073
2074 /* Allocate working memory for trigramsMatchGraph() */
2075 result->colorTrigramsActive = (bool *)
2076 MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2077 result->statesActive = (bool *)
2078 MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2079 result->statesQueue = (int *)
2080 MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2081
2082 return result;
2083}
2084
2085/*
2086 * Comparison function for sorting TrgmPackArcInfos.
2087 *
2088 * Compares arcs in following order: sourceState, colorTrgm, targetState.
2089 */
2090static int
2091packArcInfoCmp(const void *a1, const void *a2)
2092{
2093 const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2094 const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2095
2096 if (p1->sourceState < p2->sourceState)
2097 return -1;
2098 if (p1->sourceState > p2->sourceState)
2099 return 1;
2100 if (p1->colorTrgm < p2->colorTrgm)
2101 return -1;
2102 if (p1->colorTrgm > p2->colorTrgm)
2103 return 1;
2104 if (p1->targetState < p2->targetState)
2105 return -1;
2106 if (p1->targetState > p2->targetState)
2107 return 1;
2108 return 0;
2109}
2110
2111
2112/*---------------------
2113 * Debugging functions
2114 *
2115 * These are designed to emit GraphViz files.
2116 *---------------------
2117 */
2118
2119#ifdef TRGM_REGEXP_DEBUG
2120
2121/*
2122 * Print initial NFA, in regexp library's representation
2123 */
2124static void
2125printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2126{
2128 int nstates = pg_reg_getnumstates(regex);
2129 int state;
2130 int i;
2131
2133
2134 appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2135
2136 for (state = 0; state < nstates; state++)
2137 {
2138 regex_arc_t *arcs;
2139 int i,
2140 arcsCount;
2141
2142 appendStringInfo(&buf, "s%d", state);
2143 if (pg_reg_getfinalstate(regex) == state)
2144 appendStringInfoString(&buf, " [shape = doublecircle]");
2145 appendStringInfoString(&buf, ";\n");
2146
2147 arcsCount = pg_reg_getnumoutarcs(regex, state);
2148 arcs = palloc_array(regex_arc_t, arcsCount);
2149 pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2150
2151 for (i = 0; i < arcsCount; i++)
2152 {
2153 appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2154 state, arcs[i].to, arcs[i].co);
2155 }
2156
2157 pfree(arcs);
2158 }
2159
2160 appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2161 appendStringInfo(&buf, " initial -> s%d;\n",
2162 pg_reg_getinitialstate(regex));
2163
2164 /* Print colors */
2165 appendStringInfoString(&buf, " { rank = sink;\n");
2166 appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2167
2168 for (i = 0; i < ncolors; i++)
2169 {
2170 TrgmColorInfo *color = &colors[i];
2171 int j;
2172
2173 appendStringInfo(&buf, "<br/>Color %d: ", i);
2174 if (color->expandable)
2175 {
2176 for (j = 0; j < color->wordCharsCount; j++)
2177 {
2178 char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2179
2180 memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2181 s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2183 }
2184 }
2185 else
2186 appendStringInfoString(&buf, "not expandable");
2187 appendStringInfoChar(&buf, '\n');
2188 }
2189
2190 appendStringInfoString(&buf, " >];\n");
2191 appendStringInfoString(&buf, " }\n");
2192 appendStringInfoString(&buf, "}\n");
2193
2194 {
2195 /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2196 FILE *fp = fopen("/tmp/source.gv", "w");
2197
2198 fprintf(fp, "%s", buf.data);
2199 fclose(fp);
2200 }
2201
2202 pfree(buf.data);
2203}
2204
2205/*
2206 * Print expanded graph.
2207 */
2208static void
2209printTrgmNFA(TrgmNFA *trgmNFA)
2210{
2212 HASH_SEQ_STATUS scan_status;
2214 TrgmState *initstate = NULL;
2215
2217
2218 appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2219
2220 hash_seq_init(&scan_status, trgmNFA->states);
2221 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2222 {
2223 ListCell *cell;
2224
2225 appendStringInfo(&buf, "s%d", -state->snumber);
2226 if (state->flags & TSTATE_FIN)
2227 appendStringInfoString(&buf, " [shape = doublecircle]");
2228 if (state->flags & TSTATE_INIT)
2229 initstate = state;
2230 appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2231 appendStringInfoString(&buf, ";\n");
2232
2233 foreach(cell, state->arcs)
2234 {
2235 TrgmArc *arc = (TrgmArc *) lfirst(cell);
2236
2237 appendStringInfo(&buf, " s%d -> s%d [label = \"",
2238 -state->snumber, -arc->target->snumber);
2239 printTrgmColor(&buf, arc->ctrgm.colors[0]);
2241 printTrgmColor(&buf, arc->ctrgm.colors[1]);
2243 printTrgmColor(&buf, arc->ctrgm.colors[2]);
2244 appendStringInfoString(&buf, "\"];\n");
2245 }
2246 }
2247
2248 if (initstate)
2249 {
2250 appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2251 appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2252 }
2253
2254 appendStringInfoString(&buf, "}\n");
2255
2256 {
2257 /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2258 FILE *fp = fopen("/tmp/transformed.gv", "w");
2259
2260 fprintf(fp, "%s", buf.data);
2261 fclose(fp);
2262 }
2263
2264 pfree(buf.data);
2265}
2266
2267/*
2268 * Print a TrgmColor readably.
2269 */
2270static void
2271printTrgmColor(StringInfo buf, TrgmColor co)
2272{
2273 if (co == COLOR_UNKNOWN)
2275 else if (co == COLOR_BLANK)
2277 else
2278 appendStringInfo(buf, "%d", (int) co);
2279}
2280
2281/*
2282 * Print final packed representation of trigram-based expanded graph.
2283 */
2284static void
2285printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
2286{
2288 trgm *p;
2289 int i;
2290
2292
2293 appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2294
2295 for (i = 0; i < packedGraph->statesCount; i++)
2296 {
2297 TrgmPackedState *state = &packedGraph->states[i];
2298 int j;
2299
2300 appendStringInfo(&buf, " s%d", i);
2301 if (i == 1)
2302 appendStringInfoString(&buf, " [shape = doublecircle]");
2303
2304 appendStringInfo(&buf, " [label = <s%d>];\n", i);
2305
2306 for (j = 0; j < state->arcsCount; j++)
2307 {
2308 TrgmPackedArc *arc = &state->arcs[j];
2309
2310 appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2311 i, arc->targetState, arc->colorTrgm);
2312 }
2313 }
2314
2315 appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2316 appendStringInfo(&buf, " initial -> s%d;\n", 0);
2317
2318 /* Print trigrams */
2319 appendStringInfoString(&buf, " { rank = sink;\n");
2320 appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2321
2322 p = GETARR(trigrams);
2323 for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2324 {
2325 int count = packedGraph->colorTrigramGroups[i];
2326 int j;
2327
2328 appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2329
2330 for (j = 0; j < count; j++)
2331 {
2332 if (j > 0)
2334
2335 /*
2336 * XXX This representation is nice only for all-ASCII trigrams.
2337 */
2338 appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2339 p++;
2340 }
2341 }
2342
2343 appendStringInfoString(&buf, " >];\n");
2344 appendStringInfoString(&buf, " }\n");
2345 appendStringInfoString(&buf, "}\n");
2346
2347 {
2348 /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2349 FILE *fp = fopen("/tmp/packed.gv", "w");
2350
2351 fprintf(fp, "%s", buf.data);
2352 fclose(fp);
2353 }
2354
2355 pfree(buf.data);
2356}
2357
2358#endif /* TRGM_REGEXP_DEBUG */
int64_t int64
Definition: c.h:549
float float4
Definition: c.h:648
#define MemSet(start, val, len)
Definition: c.h:1032
#define fprintf(file, fmt, msg)
Definition: cubescan.l:21
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1415
int64 hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1336
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:150
#define palloc_object(type)
Definition: fe_memutils.h:74
#define palloc_array(type, count)
Definition: fe_memutils.h:76
#define palloc0_array(type, count)
Definition: fe_memutils.h:77
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
Definition: formatting.c:1627
Assert(PointerIsAligned(start, uint64))
const char * str
static const FormData_pg_attribute a1
Definition: heap.c:144
static const FormData_pg_attribute a2
Definition: heap.c:157
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
#define CALCGTSIZE(flag, siglen)
Definition: hstore_gist.c:60
int j
Definition: isn.c:78
int i
Definition: isn.c:77
List * lappend(List *list, void *datum)
Definition: list.c:339
List * list_concat(List *list1, const List *list2)
Definition: list.c:561
void list_free(List *list)
Definition: list.c:1546
unsigned int pg_wchar
Definition: mbprint.c:31
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition: mbutils.c:1011
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition: mbutils.c:989
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
void * palloc(Size size)
Definition: mcxt.c:1365
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:469
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
#define lfirst(lc)
Definition: pg_list.h:172
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define foreach_delete_current(lst, var_or_cell)
Definition: pg_list.h:391
#define list_make1(x1)
Definition: pg_list.h:212
static rewind_source * source
Definition: pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
#define MAX_MULTIBYTE_CHAR_LEN
Definition: pg_wchar.h:33
#define qsort(a, b, c, d)
Definition: port.h:499
unsigned int Oid
Definition: postgres_ext.h:32
char * c
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition: regcomp.c:372
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition: regerror.c:60
#define REG_ICASE
Definition: regex.h:184
#define REG_ADVANCED
Definition: regex.h:181
#define REG_OKAY
Definition: regex.h:215
#define REG_NOSUB
Definition: regex.h:185
#define regex_t
Definition: regex.h:245
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
Definition: regexport.c:134
int pg_reg_getnumcolors(const regex_t *regex)
Definition: regexport.c:174
int pg_reg_getnumstates(const regex_t *regex)
Definition: regexport.c:36
int pg_reg_getfinalstate(const regex_t *regex)
Definition: regexport.c:64
int pg_reg_getinitialstate(const regex_t *regex)
Definition: regexport.c:50
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
Definition: regexport.c:155
int pg_reg_colorisend(const regex_t *regex, int co)
Definition: regexport.c:208
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
Definition: regexport.c:266
int pg_reg_getnumcharacters(const regex_t *regex, int co)
Definition: regexport.c:230
int pg_reg_colorisbegin(const regex_t *regex, int co)
Definition: regexport.c:191
short color
Definition: regguts.h:155
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
ColorTrgm ctrgm
Definition: trgm_regexp.c:377
float4 penalty
Definition: trgm_regexp.c:380
TrgmColor colors[3]
Definition: trgm_regexp.c:302
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
MemoryContext hcxt
Definition: hsearch.h:86
Definition: dynahash.c:222
Definition: pg_list.h:54
Definition: trgm.h:58
uint8 flag
Definition: trgm.h:60
TrgmState * source
Definition: trgm_regexp.c:362
TrgmState * target
Definition: trgm_regexp.c:363
ColorTrgm ctrgm
Definition: trgm_regexp.c:351
TrgmState * target
Definition: trgm_regexp.c:352
trgm_mb_char * wordChars
Definition: trgm_regexp.c:266
bool containsNonWord
Definition: trgm_regexp.c:264
bool overflowed
Definition: trgm_regexp.c:418
int nstates
Definition: trgm_regexp.c:412
int arcsCount
Definition: trgm_regexp.c:417
ColorTrgmInfo * colorTrgms
Definition: trgm_regexp.c:421
List * queue
Definition: trgm_regexp.c:415
int colorTrgmsCount
Definition: trgm_regexp.c:422
TrgmColorInfo * colorInfo
Definition: trgm_regexp.c:406
int totalTrgmCount
Definition: trgm_regexp.c:423
List * keysQueue
Definition: trgm_regexp.c:416
int ncolors
Definition: trgm_regexp.c:407
TrgmState * initState
Definition: trgm_regexp.c:411
regex_t * regex
Definition: trgm_regexp.c:405
HTAB * states
Definition: trgm_regexp.c:410
TrgmPackedState * states
Definition: trgm_regexp.c:461
bool * statesActive
Definition: trgm_regexp.c:465
bool * colorTrigramsActive
Definition: trgm_regexp.c:464
int * colorTrigramGroups
Definition: trgm_regexp.c:454
TrgmPackedArc * arcs
Definition: trgm_regexp.c:438
TrgmColor colors[2]
Definition: trgm_regexp.c:293
TrgmPrefix prefix
Definition: trgm_regexp.c:312
List * arcs
Definition: trgm_regexp.c:337
List * enterKeys
Definition: trgm_regexp.c:338
struct TrgmState * parent
Definition: trgm_regexp.c:341
TrgmStateKey stateKey
Definition: trgm_regexp.c:336
int tentFlags
Definition: trgm_regexp.c:342
struct TrgmState * tentParent
Definition: trgm_regexp.c:343
Definition: regguts.h:296
color co
Definition: regguts.h:298
struct state * to
Definition: regguts.h:300
Definition: regguts.h:323
char bytes[MAX_MULTIBYTE_CHAR_LEN]
Definition: trgm_regexp.c:245
Definition: c.h:706
#define ISWORDCHR(c)
Definition: trgm.h:50
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition: trgm_op.c:248
char trgm[3]
Definition: trgm.h:41
#define GETARR(x)
Definition: trgm.h:97
#define ARRKEY
Definition: trgm.h:87
#define TRGMHDRSIZE
Definition: trgm.h:64
int TrgmColor
Definition: trgm_regexp.c:285
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result)
Definition: trgm_regexp.c:828
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
Definition: trgm_regexp.c:1329
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1384
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:765
#define COLOR_BLANK
Definition: trgm_regexp.c:289
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:567
#define TSTATE_INIT
Definition: trgm_regexp.c:331
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
Definition: trgm_regexp.c:1287
#define MAX_EXPANDED_STATES
Definition: trgm_regexp.c:221
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1906
#define MAX_TRGM_COUNT
Definition: trgm_regexp.c:223
#define MAX_EXPANDED_ARCS
Definition: trgm_regexp.c:222
struct TrgmState TrgmState
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:1457
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
Definition: trgm_regexp.c:1844
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1776
TRGM * createTrgmNFA(text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
Definition: trgm_regexp.c:524
static void transformGraph(TrgmNFA *trgmNFA)
Definition: trgm_regexp.c:894
bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
Definition: trgm_regexp.c:628
static const float4 penalties[8]
Definition: trgm_regexp.c:231
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
Definition: trgm_regexp.c:1177
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
Definition: trgm_regexp.c:1008
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
Definition: trgm_regexp.c:1931
static int packArcInfoCmp(const void *a1, const void *a2)
Definition: trgm_regexp.c:2091
static void processState(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:958
#define COLOR_COUNT_LIMIT
Definition: trgm_regexp.c:225
#define TSTATE_FIN
Definition: trgm_regexp.c:332
#define WISH_TRGM_PENALTY
Definition: trgm_regexp.c:224
static void mergeStates(TrgmState *state1, TrgmState *state2)
Definition: trgm_regexp.c:1876
#define COLOR_UNKNOWN
Definition: trgm_regexp.c:288
static int colorTrgmInfoCmp(const void *p1, const void *p2)
Definition: trgm_regexp.c:1893
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
Definition: trgm_regexp.c:721
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)
Definition: trgm_regexp.c:1189
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
Definition: trgm_regexp.c:1415
static Size VARSIZE_ANY_EXHDR(const void *PTR)
Definition: varatt.h:472
static char * VARDATA_ANY(const void *PTR)
Definition: varatt.h:486
static void SET_VARSIZE(void *PTR, Size len)
Definition: varatt.h:432
static char chars[TZ_MAX_CHARS]
Definition: zic.c:404