PostgreSQL Source Code git master
Loading...
Searching...
No Matches
trgm_regexp.c File Reference
#include "postgres.h"
#include "catalog/pg_collation_d.h"
#include "regex/regexport.h"
#include "trgm.h"
#include "tsearch/ts_locale.h"
#include "utils/formatting.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "varatt.h"
Include dependency graph for trgm_regexp.c:

Go to the source code of this file.

Data Structures

struct  trgm_mb_char
 
struct  TrgmColorInfo
 
struct  TrgmPrefix
 
struct  ColorTrgm
 
struct  TrgmStateKey
 
struct  TrgmState
 
struct  TrgmArc
 
struct  TrgmArcInfo
 
struct  ColorTrgmInfo
 
struct  TrgmNFA
 
struct  TrgmPackedArc
 
struct  TrgmPackedState
 
struct  TrgmPackedGraph
 
struct  TrgmPackArcInfo
 

Macros

#define MAX_EXPANDED_STATES   128
 
#define MAX_EXPANDED_ARCS   1024
 
#define MAX_TRGM_COUNT   256
 
#define WISH_TRGM_PENALTY   16
 
#define COLOR_COUNT_LIMIT   256
 
#define COLOR_UNKNOWN   (-3)
 
#define COLOR_BLANK   (-4)
 
#define TSTATE_INIT   0x01 /* flag indicating this state is initial */
 
#define TSTATE_FIN   0x02 /* flag indicating this state is final */
 

Typedefs

typedef int TrgmColor
 
typedef struct TrgmState TrgmState
 

Functions

static TRGMcreateTrgmNFAInternal (regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
 
static void RE_compile (regex_t *regex, text *text_re, int cflags, Oid collation)
 
static void getColorInfo (regex_t *regex, TrgmNFA *trgmNFA)
 
static int convertPgWchar (pg_wchar c, trgm_mb_char *result)
 
static void transformGraph (TrgmNFA *trgmNFA)
 
static void processState (TrgmNFA *trgmNFA, TrgmState *state)
 
static void addKey (TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
 
static void addKeyToQueue (TrgmNFA *trgmNFA, TrgmStateKey *key)
 
static void addArcs (TrgmNFA *trgmNFA, TrgmState *state)
 
static void addArc (TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)
 
static bool validArcLabel (TrgmStateKey *key, TrgmColor co)
 
static TrgmStategetState (TrgmNFA *trgmNFA, TrgmStateKey *key)
 
static bool prefixContains (TrgmPrefix *prefix1, TrgmPrefix *prefix2)
 
static bool selectColorTrigrams (TrgmNFA *trgmNFA)
 
static TRGMexpandColorTrigrams (TrgmNFA *trgmNFA, MemoryContext rcontext)
 
static void fillTrgm (trgm *ptrgm, trgm_mb_char s[3])
 
static void mergeStates (TrgmState *state1, TrgmState *state2)
 
static int colorTrgmInfoCmp (const void *p1, const void *p2)
 
static int colorTrgmInfoPenaltyCmp (const void *p1, const void *p2)
 
static TrgmPackedGraphpackGraph (TrgmNFA *trgmNFA, MemoryContext rcontext)
 
static int packArcInfoCmp (const void *a1, const void *a2)
 
TRGMcreateTrgmNFA (text *text_re, Oid collation, TrgmPackedGraph **graph, MemoryContext rcontext)
 
bool trigramsMatchGraph (TrgmPackedGraph *graph, bool *check)
 

Variables

static const float4 penalties [8]
 

Macro Definition Documentation

◆ COLOR_BLANK

#define COLOR_BLANK   (-4)

Definition at line 289 of file trgm_regexp.c.

◆ COLOR_COUNT_LIMIT

#define COLOR_COUNT_LIMIT   256

Definition at line 225 of file trgm_regexp.c.

◆ COLOR_UNKNOWN

#define COLOR_UNKNOWN   (-3)

Definition at line 288 of file trgm_regexp.c.

◆ MAX_EXPANDED_ARCS

#define MAX_EXPANDED_ARCS   1024

Definition at line 222 of file trgm_regexp.c.

◆ MAX_EXPANDED_STATES

#define MAX_EXPANDED_STATES   128

Definition at line 221 of file trgm_regexp.c.

◆ MAX_TRGM_COUNT

#define MAX_TRGM_COUNT   256

Definition at line 223 of file trgm_regexp.c.

◆ TSTATE_FIN

#define TSTATE_FIN   0x02 /* flag indicating this state is final */

Definition at line 332 of file trgm_regexp.c.

◆ TSTATE_INIT

#define TSTATE_INIT   0x01 /* flag indicating this state is initial */

Definition at line 331 of file trgm_regexp.c.

◆ WISH_TRGM_PENALTY

#define WISH_TRGM_PENALTY   16

Definition at line 224 of file trgm_regexp.c.

Typedef Documentation

◆ TrgmColor

Definition at line 285 of file trgm_regexp.c.

◆ TrgmState

Function Documentation

◆ addArc()

static void addArc ( TrgmNFA trgmNFA,
TrgmState state,
TrgmStateKey key,
TrgmColor  co,
TrgmStateKey destKey 
)
static

Definition at line 1290 of file trgm_regexp.c.

1292{
1293 TrgmArc *arc;
1294 ListCell *cell;
1295
1296 /* Do nothing if this wouldn't be a valid arc label trigram */
1297 if (!validArcLabel(key, co))
1298 return;
1299
1300 /*
1301 * Check if we are going to reach key which is covered by a key which is
1302 * already listed in this state. If so arc is useless: the NFA can bypass
1303 * it through a path that doesn't require any predictable trigram, so
1304 * whether the arc's trigram is present or not doesn't really matter.
1305 */
1306 foreach(cell, state->enterKeys)
1307 {
1309
1310 if (existingKey->nstate == destKey->nstate &&
1311 prefixContains(&existingKey->prefix, &destKey->prefix))
1312 return;
1313 }
1314
1315 /* Checks were successful, add new arc */
1317 arc->target = getState(trgmNFA, destKey);
1318 arc->ctrgm.colors[0] = key->prefix.colors[0];
1319 arc->ctrgm.colors[1] = key->prefix.colors[1];
1320 arc->ctrgm.colors[2] = co;
1321
1322 state->arcs = lappend(state->arcs, arc);
1323 trgmNFA->arcsCount++;
1324}
#define palloc_object(type)
Definition fe_memutils.h:74
List * lappend(List *list, void *datum)
Definition list.c:339
#define lfirst(lc)
Definition pg_list.h:172
static int fb(int x)
Definition regguts.h:296
static bool validArcLabel(TrgmStateKey *key, TrgmColor co)
static TrgmState * getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)

References fb(), getState(), lappend(), lfirst, palloc_object, prefixContains(), and validArcLabel().

Referenced by addArcs().

◆ addArcs()

static void addArcs ( TrgmNFA trgmNFA,
TrgmState state 
)
static

Definition at line 1192 of file trgm_regexp.c.

1193{
1195 ListCell *cell;
1196 regex_arc_t *arcs;
1197 int arcsCount,
1198 i;
1199
1200 /*
1201 * Ensure any pad bytes in destKey are zero, since it may get used as a
1202 * hashtable key by getState.
1203 */
1204 MemSet(&destKey, 0, sizeof(destKey));
1205
1206 /*
1207 * Iterate over enter keys associated with this expanded-graph state. This
1208 * includes both the state's own stateKey, and any enter keys we added to
1209 * it during addKey (which represent expanded-graph states that are not
1210 * distinguishable from this one by means of trigrams). For each such
1211 * enter key, examine all the out-arcs of the key's underlying NFA state,
1212 * and try to make a trigram arc leading to where the out-arc leads.
1213 * (addArc will deal with whether the arc is valid or not.)
1214 */
1215 foreach(cell, state->enterKeys)
1216 {
1217 TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1218
1219 arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1220 arcs = palloc_array(regex_arc_t, arcsCount);
1221 pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1222
1223 for (i = 0; i < arcsCount; i++)
1224 {
1225 regex_arc_t *arc = &arcs[i];
1226 TrgmColorInfo *colorInfo;
1227
1228 /*
1229 * Ignore non-expandable colors; addKey already handled the case.
1230 *
1231 * We need no special check for WHITE or begin/end pseudocolors
1232 * here. We don't need to do any processing for them, and they
1233 * will be marked non-expandable since the regex engine will have
1234 * reported them that way. We do have to watch out for RAINBOW,
1235 * which has a negative color number.
1236 */
1237 if (arc->co < 0)
1238 continue;
1239 Assert(arc->co < trgmNFA->ncolors);
1240
1241 colorInfo = &trgmNFA->colorInfo[arc->co];
1242 if (!colorInfo->expandable)
1243 continue;
1244
1245 if (colorInfo->containsNonWord)
1246 {
1247 /*
1248 * Color includes non-word character(s).
1249 *
1250 * Generate an arc, treating this transition as occurring on
1251 * BLANK. This allows word-ending trigrams to be manufactured
1252 * if possible.
1253 */
1254 destKey.prefix.colors[0] = key->prefix.colors[1];
1255 destKey.prefix.colors[1] = COLOR_BLANK;
1256 destKey.nstate = arc->to;
1257
1259 }
1260
1261 if (colorInfo->wordCharsCount > 0)
1262 {
1263 /*
1264 * Color includes word character(s).
1265 *
1266 * Generate an arc. Color is pushed into prefix of target
1267 * state.
1268 */
1269 destKey.prefix.colors[0] = key->prefix.colors[1];
1270 destKey.prefix.colors[1] = arc->co;
1271 destKey.nstate = arc->to;
1272
1273 addArc(trgmNFA, state, key, arc->co, &destKey);
1274 }
1275 }
1276
1277 pfree(arcs);
1278 }
1279}
#define Assert(condition)
Definition c.h:885
#define MemSet(start, val, len)
Definition c.h:1035
#define palloc_array(type, count)
Definition fe_memutils.h:76
int i
Definition isn.c:77
void pfree(void *pointer)
Definition mcxt.c:1616
int pg_reg_getnumoutarcs(const regex_t *regex, int st)
Definition regexport.c:134
void pg_reg_getoutarcs(const regex_t *regex, int st, regex_arc_t *arcs, int arcs_len)
Definition regexport.c:155
color co
Definition regguts.h:298
struct state * to
Definition regguts.h:300
#define COLOR_BLANK
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key, TrgmColor co, TrgmStateKey *destKey)

References addArc(), Assert, arc::co, COLOR_BLANK, TrgmColorInfo::containsNonWord, TrgmColorInfo::expandable, fb(), i, lfirst, MemSet, palloc_array, pfree(), pg_reg_getnumoutarcs(), pg_reg_getoutarcs(), arc::to, and TrgmColorInfo::wordCharsCount.

Referenced by processState().

◆ addKey()

static void addKey ( TrgmNFA trgmNFA,
TrgmState state,
TrgmStateKey key 
)
static

Definition at line 1011 of file trgm_regexp.c.

1012{
1013 regex_arc_t *arcs;
1015 ListCell *cell;
1016 int i,
1017 arcsCount;
1018
1019 /*
1020 * Ensure any pad bytes in destKey are zero, since it may get used as a
1021 * hashtable key by getState.
1022 */
1023 MemSet(&destKey, 0, sizeof(destKey));
1024
1025 /*
1026 * Compare key to each existing enter key of the state to check for
1027 * redundancy. We can drop either old key(s) or the new key if we find
1028 * redundancy.
1029 */
1030 foreach(cell, state->enterKeys)
1031 {
1033
1034 if (existingKey->nstate == key->nstate)
1035 {
1036 if (prefixContains(&existingKey->prefix, &key->prefix))
1037 {
1038 /* This old key already covers the new key. Nothing to do */
1039 return;
1040 }
1041 if (prefixContains(&key->prefix, &existingKey->prefix))
1042 {
1043 /*
1044 * The new key covers this old key. Remove the old key, it's
1045 * no longer needed once we add this key to the list.
1046 */
1047 state->enterKeys = foreach_delete_current(state->enterKeys,
1048 cell);
1049 }
1050 }
1051 }
1052
1053 /* No redundancy, so add this key to the state's list */
1054 state->enterKeys = lappend(state->enterKeys, key);
1055
1056 /* If state is now known final, mark it and we're done */
1057 if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1058 {
1059 state->flags |= TSTATE_FIN;
1060 return;
1061 }
1062
1063 /*
1064 * Loop through all outgoing arcs of the corresponding state in the
1065 * original NFA.
1066 */
1067 arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1068 arcs = palloc_array(regex_arc_t, arcsCount);
1069 pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1070
1071 for (i = 0; i < arcsCount; i++)
1072 {
1073 regex_arc_t *arc = &arcs[i];
1074
1075 if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1076 {
1077 /*
1078 * Start of line/string (^). Trigram extraction treats start of
1079 * line same as start of word: double space prefix is added.
1080 * Hence, make an enter key showing we can reach the arc
1081 * destination with all-blank prefix.
1082 */
1083 destKey.prefix.colors[0] = COLOR_BLANK;
1084 destKey.prefix.colors[1] = COLOR_BLANK;
1085 destKey.nstate = arc->to;
1086
1087 /* Add enter key to this state */
1089 }
1090 else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1091 {
1092 /*
1093 * End of line/string ($). We must consider this arc as a
1094 * transition that doesn't read anything. The reason for adding
1095 * this enter key to the state is that if the arc leads to the
1096 * NFA's final state, we must mark this expanded state as final.
1097 */
1098 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1099 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1100 destKey.nstate = arc->to;
1101
1102 /* Add enter key to this state */
1104 }
1105 else if (arc->co >= 0)
1106 {
1107 /* Regular color (including WHITE) */
1108 TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1109
1110 if (colorInfo->expandable)
1111 {
1112 if (colorInfo->containsNonWord &&
1114 {
1115 /*
1116 * We can reach the arc destination after reading a
1117 * non-word character, but the prefix is not something
1118 * that addArc will accept with COLOR_BLANK, so no trigram
1119 * arc can get made for this transition. We must make an
1120 * enter key to show that the arc destination is
1121 * reachable. Set it up with an all-blank prefix, since
1122 * that corresponds to what the trigram extraction code
1123 * will do at a word starting boundary.
1124 */
1125 destKey.prefix.colors[0] = COLOR_BLANK;
1126 destKey.prefix.colors[1] = COLOR_BLANK;
1127 destKey.nstate = arc->to;
1129 }
1130
1131 if (colorInfo->wordCharsCount > 0 &&
1132 !validArcLabel(key, arc->co))
1133 {
1134 /*
1135 * We can reach the arc destination after reading a word
1136 * character, but the prefix is not something that addArc
1137 * will accept, so no trigram arc can get made for this
1138 * transition. We must make an enter key to show that the
1139 * arc destination is reachable. The prefix for the enter
1140 * key should reflect the info we have for this arc.
1141 */
1142 destKey.prefix.colors[0] = key->prefix.colors[1];
1143 destKey.prefix.colors[1] = arc->co;
1144 destKey.nstate = arc->to;
1146 }
1147 }
1148 else
1149 {
1150 /*
1151 * Unexpandable color. Add enter key with ambiguous prefix,
1152 * showing we can reach the destination from this state, but
1153 * the preceding colors will be uncertain. (We do not set the
1154 * first prefix color to key->prefix.colors[1], because a
1155 * prefix of known followed by unknown is invalid.)
1156 */
1157 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1158 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1159 destKey.nstate = arc->to;
1161 }
1162 }
1163 else
1164 {
1165 /* RAINBOW: treat as unexpandable color */
1166 destKey.prefix.colors[0] = COLOR_UNKNOWN;
1167 destKey.prefix.colors[1] = COLOR_UNKNOWN;
1168 destKey.nstate = arc->to;
1170 }
1171 }
1172
1173 pfree(arcs);
1174}
#define foreach_delete_current(lst, var_or_cell)
Definition pg_list.h:391
int pg_reg_getfinalstate(const regex_t *regex)
Definition regexport.c:64
int pg_reg_colorisend(const regex_t *regex, int co)
Definition regexport.c:208
int pg_reg_colorisbegin(const regex_t *regex, int co)
Definition regexport.c:191
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
#define TSTATE_FIN
#define COLOR_UNKNOWN

References addKeyToQueue(), arc::co, COLOR_BLANK, COLOR_UNKNOWN, TrgmColorInfo::containsNonWord, TrgmColorInfo::expandable, fb(), foreach_delete_current, i, lappend(), lfirst, MemSet, palloc_array, pfree(), pg_reg_colorisbegin(), pg_reg_colorisend(), pg_reg_getfinalstate(), pg_reg_getnumoutarcs(), pg_reg_getoutarcs(), prefixContains(), arc::to, TSTATE_FIN, validArcLabel(), and TrgmColorInfo::wordCharsCount.

Referenced by processState().

◆ addKeyToQueue()

static void addKeyToQueue ( TrgmNFA trgmNFA,
TrgmStateKey key 
)
static

Definition at line 1180 of file trgm_regexp.c.

1181{
1183
1184 memcpy(keyCopy, key, sizeof(TrgmStateKey));
1185 trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1186}

References fb(), lappend(), and palloc_object.

Referenced by addKey().

◆ colorTrgmInfoCmp()

static int colorTrgmInfoCmp ( const void p1,
const void p2 
)
static

Definition at line 1896 of file trgm_regexp.c.

1897{
1898 const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1899 const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1900
1901 return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1902}

References fb().

Referenced by packGraph(), and selectColorTrigrams().

◆ colorTrgmInfoPenaltyCmp()

static int colorTrgmInfoPenaltyCmp ( const void p1,
const void p2 
)
static

Definition at line 1909 of file trgm_regexp.c.

1910{
1911 float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1912 float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1913
1914 if (penalty1 < penalty2)
1915 return 1;
1916 else if (penalty1 == penalty2)
1917 return 0;
1918 else
1919 return -1;
1920}
float float4
Definition c.h:655

References fb().

Referenced by selectColorTrigrams().

◆ convertPgWchar()

static int convertPgWchar ( pg_wchar  c,
trgm_mb_char result 
)
static

Definition at line 830 of file trgm_regexp.c.

831{
832 /* "s" has enough space for a multibyte character and a trailing NUL */
833 char s[MAX_MULTIBYTE_CHAR_LEN + 1];
834 int clen;
835
836 /*
837 * We can ignore the NUL character, since it can never appear in a PG text
838 * string. This avoids the need for various special cases when
839 * reconstructing trigrams.
840 */
841 if (c == 0)
842 return 0;
843
844 /* Do the conversion, making sure the result is NUL-terminated */
845 memset(s, 0, sizeof(s));
846 clen = pg_wchar2mb_with_len(&c, s, 1);
847
848 /*
849 * In IGNORECASE mode, we can ignore uppercase characters. We assume that
850 * the regex engine generated both uppercase and lowercase equivalents
851 * within each color, since we used the REG_ICASE option; so there's no
852 * need to process the uppercase version.
853 *
854 * XXX this code is dependent on the assumption that str_tolower() works
855 * the same as the regex engine's internal case folding machinery. Might
856 * be wiser to expose pg_wc_tolower and test whether c ==
857 * pg_wc_tolower(c). On the other hand, the trigrams in the index were
858 * created using str_tolower(), so we're probably screwed if there's any
859 * incompatibility anyway.
860 */
861#ifdef IGNORECASE
862 {
864
865 if (strcmp(lowerCased, s) != 0)
866 {
868 return 0;
869 }
871 }
872#endif
873
874 /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
876 return clen;
877}
char * str_tolower(const char *buff, size_t nbytes, Oid collid)
int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
Definition mbutils.c:1019
#define MAX_MULTIBYTE_CHAR_LEN
Definition pg_wchar.h:33
char * c
char bytes[MAX_MULTIBYTE_CHAR_LEN]

References trgm_mb_char::bytes, fb(), MAX_MULTIBYTE_CHAR_LEN, pfree(), pg_wchar2mb_with_len(), and str_tolower().

Referenced by getColorInfo().

◆ createTrgmNFA()

TRGM * createTrgmNFA ( text text_re,
Oid  collation,
TrgmPackedGraph **  graph,
MemoryContext  rcontext 
)

Definition at line 524 of file trgm_regexp.c.

526{
527 TRGM *trg;
528 regex_t regex;
529 MemoryContext tmpcontext;
530 MemoryContext oldcontext;
531
532 /*
533 * This processing generates a great deal of cruft, which we'd like to
534 * clean up before returning (since this function may be called in a
535 * query-lifespan memory context). Make a temp context we can work in so
536 * that cleanup is easy.
537 */
539 "createTrgmNFA temporary context",
541 oldcontext = MemoryContextSwitchTo(tmpcontext);
542
543 /*
544 * Stage 1: Compile the regexp into a NFA, using the regexp library.
545 */
546#ifdef IGNORECASE
547 RE_compile(&regex, text_re,
548 REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
549#else
550 RE_compile(&regex, text_re,
551 REG_ADVANCED | REG_NOSUB, collation);
552#endif
553
554 trg = createTrgmNFAInternal(&regex, graph, rcontext);
555
556 /* Clean up all the cruft we created (including regex) */
557 MemoryContextSwitchTo(oldcontext);
558 MemoryContextDelete(tmpcontext);
559
560 return trg;
561}
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
#define REG_ICASE
Definition regex.h:184
#define REG_ADVANCED
Definition regex.h:181
#define REG_NOSUB
Definition regex.h:185
#define regex_t
Definition regex.h:245
Definition trgm.h:58
static TRGM * createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, MemoryContext rcontext)
static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, createTrgmNFAInternal(), CurrentMemoryContext, fb(), MemoryContextDelete(), MemoryContextSwitchTo(), RE_compile(), REG_ADVANCED, REG_ICASE, REG_NOSUB, and regex_t.

Referenced by gin_extract_query_trgm(), and gtrgm_consistent().

◆ createTrgmNFAInternal()

static TRGM * createTrgmNFAInternal ( regex_t regex,
TrgmPackedGraph **  graph,
MemoryContext  rcontext 
)
static

Definition at line 567 of file trgm_regexp.c.

569{
570 TRGM *trg;
572
573 trgmNFA.regex = regex;
574
575 /* Collect color information from the regex */
576 getColorInfo(regex, &trgmNFA);
577
578#ifdef TRGM_REGEXP_DEBUG
579 printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
580#endif
581
582 /*
583 * Stage 2: Create an expanded graph from the source NFA.
584 */
586
587#ifdef TRGM_REGEXP_DEBUG
589#endif
590
591 /*
592 * Fail if we were unable to make a nontrivial graph, ie it is possible to
593 * get from the initial state to the final state without reading any
594 * predictable trigram.
595 */
596 if (trgmNFA.initState->flags & TSTATE_FIN)
597 return NULL;
598
599 /*
600 * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
601 */
603 return NULL;
604
605 /*
606 * Stage 4: Expand color trigrams and pack graph into final
607 * representation.
608 */
610
611 *graph = packGraph(&trgmNFA, rcontext);
612
613#ifdef TRGM_REGEXP_DEBUG
614 printTrgmPackedGraph(*graph, trg);
615#endif
616
617 return trg;
618}
regex_t * regex
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
static bool selectColorTrigrams(TrgmNFA *trgmNFA)
static TRGM * expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
static void transformGraph(TrgmNFA *trgmNFA)
static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)

References expandColorTrigrams(), fb(), getColorInfo(), packGraph(), TrgmNFA::regex, selectColorTrigrams(), transformGraph(), and TSTATE_FIN.

Referenced by createTrgmNFA().

◆ expandColorTrigrams()

static TRGM * expandColorTrigrams ( TrgmNFA trgmNFA,
MemoryContext  rcontext 
)
static

Definition at line 1779 of file trgm_regexp.c.

1780{
1781 TRGM *trg;
1782 trgm *p;
1783 int i;
1786
1787 /* Set up "blank" color structure containing a single zero character */
1788 memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1789 blankColor.wordCharsCount = 1;
1790 blankColor.wordChars = &blankChar;
1791
1792 /* Construct the trgm array */
1793 trg = (TRGM *)
1795 TRGMHDRSIZE +
1796 trgmNFA->totalTrgmCount * sizeof(trgm));
1797 trg->flag = ARRKEY;
1798 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1799 p = GETARR(trg);
1800 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1801 {
1802 ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1803 TrgmColorInfo *c[3];
1804 trgm_mb_char s[3];
1805 int j,
1806 i1,
1807 i2,
1808 i3;
1809
1810 /* Ignore any unexpanded trigrams ... */
1811 if (!colorTrgm->expanded)
1812 continue;
1813
1814 /* Get colors, substituting the dummy struct for COLOR_BLANK */
1815 for (j = 0; j < 3; j++)
1816 {
1817 if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1818 c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1819 else
1820 c[j] = &blankColor;
1821 }
1822
1823 /* Iterate over all possible combinations of colors' characters */
1824 for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1825 {
1826 s[0] = c[0]->wordChars[i1];
1827 for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1828 {
1829 s[1] = c[1]->wordChars[i2];
1830 for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1831 {
1832 s[2] = c[2]->wordChars[i3];
1833 fillTrgm(p, s);
1834 p++;
1835 }
1836 }
1837 }
1838 }
1839
1840 return trg;
1841}
#define CALCGTSIZE(flag, siglen)
Definition hstore_gist.c:60
int j
Definition isn.c:78
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition mcxt.c:1266
ColorTrgm ctrgm
TrgmColor colors[3]
uint8 flag
Definition trgm.h:60
char trgm[3]
Definition trgm.h:41
#define GETARR(x)
Definition trgm.h:97
#define ARRKEY
Definition trgm.h:87
#define TRGMHDRSIZE
Definition trgm.h:64
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
static void SET_VARSIZE(void *PTR, Size len)
Definition varatt.h:432

References ARRKEY, CALCGTSIZE, COLOR_BLANK, ColorTrgm::colors, ColorTrgmInfo::ctrgm, ColorTrgmInfo::expanded, fb(), fillTrgm(), TRGM::flag, GETARR, i, j, MemoryContextAllocZero(), SET_VARSIZE(), and TRGMHDRSIZE.

Referenced by createTrgmNFAInternal().

◆ fillTrgm()

static void fillTrgm ( trgm ptrgm,
trgm_mb_char  s[3] 
)
static

Definition at line 1847 of file trgm_regexp.c.

1848{
1849 char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1850 *p;
1851 int i,
1852 j;
1853
1854 /* Write multibyte string into "str" (we don't need null termination) */
1855 p = str;
1856
1857 for (i = 0; i < 3; i++)
1858 {
1859 if (s[i].bytes[0] != 0)
1860 {
1861 for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1862 *p++ = s[i].bytes[j];
1863 }
1864 else
1865 {
1866 /* Emit a space in place of COLOR_BLANK */
1867 *p++ = ' ';
1868 }
1869 }
1870
1871 /* Convert "str" to a standard trigram (possibly hashing it) */
1873}
const char * str
void compact_trigram(trgm *tptr, char *str, int bytelen)
Definition trgm_op.c:329

References trgm_mb_char::bytes, compact_trigram(), fb(), i, j, MAX_MULTIBYTE_CHAR_LEN, and str.

Referenced by expandColorTrigrams().

◆ getColorInfo()

static void getColorInfo ( regex_t regex,
TrgmNFA trgmNFA 
)
static

Definition at line 765 of file trgm_regexp.c.

766{
767 int colorsCount = pg_reg_getnumcolors(regex);
768 int i;
769
770 trgmNFA->ncolors = colorsCount;
771 trgmNFA->colorInfo = (TrgmColorInfo *)
773
774 /*
775 * Loop over colors, filling TrgmColorInfo about each. Note we include
776 * WHITE (0) even though we know it'll be reported as non-expandable.
777 */
778 for (i = 0; i < colorsCount; i++)
779 {
780 TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
783 int j;
784
786 {
787 /* Non expandable, or too large to work with */
788 colorInfo->expandable = false;
789 continue;
790 }
791
792 colorInfo->expandable = true;
793 colorInfo->containsNonWord = false;
795 colorInfo->wordCharsCount = 0;
796
797 /* Extract all the chars in this color */
800
801 /*
802 * Convert characters back to multibyte form, and save only those that
803 * are word characters. Set "containsNonWord" if any non-word
804 * character. (Note: it'd probably be nicer to keep the chars in
805 * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
806 */
807 for (j = 0; j < charsCount; j++)
808 {
810 int clen = convertPgWchar(chars[j], &c);
811
812 if (!clen)
813 continue; /* ok to ignore it altogether */
814 if (ISWORDCHR(c.bytes, clen))
815 colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
816 else
817 colorInfo->containsNonWord = true;
818 }
819
820 pfree(chars);
821 }
822}
unsigned int pg_wchar
Definition mbprint.c:31
void * palloc0(Size size)
Definition mcxt.c:1417
int pg_reg_getnumcolors(const regex_t *regex)
Definition regexport.c:174
void pg_reg_getcharacters(const regex_t *regex, int co, pg_wchar *chars, int chars_len)
Definition regexport.c:266
int pg_reg_getnumcharacters(const regex_t *regex, int co)
Definition regexport.c:230
trgm_mb_char * wordChars
#define ISWORDCHR(c, len)
Definition trgm.h:50
static int convertPgWchar(pg_wchar c, trgm_mb_char *result)
#define COLOR_COUNT_LIMIT
static char chars[TZ_MAX_CHARS]
Definition zic.c:404

References chars, COLOR_COUNT_LIMIT, TrgmColorInfo::containsNonWord, convertPgWchar(), TrgmColorInfo::expandable, fb(), i, ISWORDCHR, j, palloc0(), palloc_array, pfree(), pg_reg_getcharacters(), pg_reg_getnumcharacters(), pg_reg_getnumcolors(), TrgmColorInfo::wordChars, and TrgmColorInfo::wordCharsCount.

Referenced by createTrgmNFAInternal().

◆ getState()

static TrgmState * getState ( TrgmNFA trgmNFA,
TrgmStateKey key 
)
static

Definition at line 1387 of file trgm_regexp.c.

1388{
1390 bool found;
1391
1392 state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1393 &found);
1394 if (!found)
1395 {
1396 /* New state: initialize and queue it */
1397 state->arcs = NIL;
1398 state->enterKeys = NIL;
1399 state->flags = 0;
1400 /* states are initially given negative numbers */
1401 state->snumber = -(++trgmNFA->nstates);
1402 state->parent = NULL;
1403 state->tentFlags = 0;
1404 state->tentParent = NULL;
1405
1406 trgmNFA->queue = lappend(trgmNFA->queue, state);
1407 }
1408 return state;
1409}
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:952
@ HASH_ENTER
Definition hsearch.h:114
#define NIL
Definition pg_list.h:68

References fb(), HASH_ENTER, hash_search(), lappend(), and NIL.

Referenced by addArc(), and transformGraph().

◆ mergeStates()

static void mergeStates ( TrgmState state1,
TrgmState state2 
)
static

Definition at line 1879 of file trgm_regexp.c.

1880{
1881 Assert(state1 != state2);
1882 Assert(!state1->parent);
1883 Assert(!state2->parent);
1884
1885 /* state1 absorbs state2's flags */
1886 state1->flags |= state2->flags;
1887
1888 /* state2, and indirectly all its children, become children of state1 */
1889 state2->parent = state1;
1890}

References Assert, and fb().

Referenced by selectColorTrigrams().

◆ packArcInfoCmp()

static int packArcInfoCmp ( const void a1,
const void a2 
)
static

Definition at line 2094 of file trgm_regexp.c.

2095{
2096 const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2097 const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2098
2099 if (p1->sourceState < p2->sourceState)
2100 return -1;
2101 if (p1->sourceState > p2->sourceState)
2102 return 1;
2103 if (p1->colorTrgm < p2->colorTrgm)
2104 return -1;
2105 if (p1->colorTrgm > p2->colorTrgm)
2106 return 1;
2107 if (p1->targetState < p2->targetState)
2108 return -1;
2109 if (p1->targetState > p2->targetState)
2110 return 1;
2111 return 0;
2112}
static const FormData_pg_attribute a1
Definition heap.c:144
static const FormData_pg_attribute a2
Definition heap.c:157

References a1, a2, and fb().

Referenced by packGraph().

◆ packGraph()

static TrgmPackedGraph * packGraph ( TrgmNFA trgmNFA,
MemoryContext  rcontext 
)
static

Definition at line 1934 of file trgm_regexp.c.

1935{
1936 int snumber = 2,
1937 arcIndex,
1938 arcsCount;
1941 TrgmPackArcInfo *arcs;
1943 TrgmPackedGraph *result;
1944 int i,
1945 j;
1946
1947 /* Enumerate surviving states, giving init and fin reserved numbers */
1949 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1950 {
1951 while (state->parent)
1952 state = state->parent;
1953
1954 if (state->snumber < 0)
1955 {
1956 if (state->flags & TSTATE_INIT)
1957 state->snumber = 0;
1958 else if (state->flags & TSTATE_FIN)
1959 state->snumber = 1;
1960 else
1961 {
1962 state->snumber = snumber;
1963 snumber++;
1964 }
1965 }
1966 }
1967
1968 /* Collect array of all arcs */
1969 arcs = palloc_array(TrgmPackArcInfo, trgmNFA->arcsCount);
1970 arcIndex = 0;
1972 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1973 {
1975 ListCell *cell;
1976
1977 while (source->parent)
1978 source = source->parent;
1979
1980 foreach(cell, state->arcs)
1981 {
1982 TrgmArc *arc = (TrgmArc *) lfirst(cell);
1983 TrgmState *target = arc->target;
1984
1985 while (target->parent)
1986 target = target->parent;
1987
1988 if (source->snumber != target->snumber)
1989 {
1990 ColorTrgmInfo *ctrgm;
1991
1992 ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1993 trgmNFA->colorTrgms,
1994 trgmNFA->colorTrgmsCount,
1995 sizeof(ColorTrgmInfo),
1997 Assert(ctrgm != NULL);
1998 Assert(ctrgm->expanded);
1999
2000 arcs[arcIndex].sourceState = source->snumber;
2001 arcs[arcIndex].targetState = target->snumber;
2002 arcs[arcIndex].colorTrgm = ctrgm->cnumber;
2003 arcIndex++;
2004 }
2005 }
2006 }
2007
2008 /* Sort arcs to ease duplicate detection */
2010
2011 /* We could have duplicates because states were merged. Remove them. */
2012 if (arcIndex > 1)
2013 {
2014 /* p1 is probe point, p2 is last known non-duplicate. */
2016 *p2;
2017
2018 p2 = arcs;
2019 for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2020 {
2021 if (packArcInfoCmp(p1, p2) > 0)
2022 {
2023 p2++;
2024 *p2 = *p1;
2025 }
2026 }
2027 arcsCount = (p2 - arcs) + 1;
2028 }
2029 else
2030 arcsCount = arcIndex;
2031
2032 /* Create packed representation */
2033 result = (TrgmPackedGraph *)
2035
2036 /* Pack color trigrams information */
2037 result->colorTrigramsCount = 0;
2038 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2039 {
2040 if (trgmNFA->colorTrgms[i].expanded)
2041 result->colorTrigramsCount++;
2042 }
2043 result->colorTrigramGroups = (int *)
2044 MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2045 j = 0;
2046 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2047 {
2048 if (trgmNFA->colorTrgms[i].expanded)
2049 {
2050 result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2051 j++;
2052 }
2053 }
2054
2055 /* Pack states and arcs information */
2056 result->statesCount = snumber;
2057 result->states = (TrgmPackedState *)
2058 MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2060 MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2061 j = 0;
2062 for (i = 0; i < snumber; i++)
2063 {
2064 int cnt = 0;
2065
2066 result->states[i].arcs = &packedArcs[j];
2067 while (j < arcsCount && arcs[j].sourceState == i)
2068 {
2070 packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2071 cnt++;
2072 j++;
2073 }
2074 result->states[i].arcsCount = cnt;
2075 }
2076
2077 /* Allocate working memory for trigramsMatchGraph() */
2078 result->colorTrigramsActive = (bool *)
2079 MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2080 result->statesActive = (bool *)
2081 MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2082 result->statesQueue = (int *)
2083 MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2084
2085 return result;
2086}
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1380
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
static rewind_source * source
Definition pg_rewind.c:89
#define qsort(a, b, c, d)
Definition port.h:495
TrgmPackedState * states
bool * colorTrigramsActive
int * colorTrigramGroups
TrgmPackedArc * arcs
struct TrgmState * parent
#define TSTATE_INIT
static int packArcInfoCmp(const void *a1, const void *a2)
static int colorTrgmInfoCmp(const void *p1, const void *p2)

References TrgmPackedState::arcs, TrgmPackedState::arcsCount, Assert, ColorTrgmInfo::cnumber, TrgmPackArcInfo::colorTrgm, colorTrgmInfoCmp(), TrgmPackedGraph::colorTrigramGroups, TrgmPackedGraph::colorTrigramsActive, TrgmPackedGraph::colorTrigramsCount, ColorTrgmInfo::expanded, fb(), hash_seq_init(), hash_seq_search(), i, j, lfirst, MemoryContextAlloc(), packArcInfoCmp(), palloc_array, TrgmState::parent, qsort, TrgmState::snumber, source, TrgmPackArcInfo::sourceState, TrgmPackedGraph::states, TrgmPackedGraph::statesActive, TrgmPackedGraph::statesCount, TrgmPackedGraph::statesQueue, TrgmPackedArc::targetState, TrgmPackArcInfo::targetState, TSTATE_FIN, and TSTATE_INIT.

Referenced by createTrgmNFAInternal().

◆ prefixContains()

static bool prefixContains ( TrgmPrefix prefix1,
TrgmPrefix prefix2 
)
static

Definition at line 1418 of file trgm_regexp.c.

1419{
1420 if (prefix1->colors[1] == COLOR_UNKNOWN)
1421 {
1422 /* Fully ambiguous prefix contains everything */
1423 return true;
1424 }
1425 else if (prefix1->colors[0] == COLOR_UNKNOWN)
1426 {
1427 /*
1428 * Prefix with only first unknown color contains every prefix with
1429 * same second color.
1430 */
1431 if (prefix1->colors[1] == prefix2->colors[1])
1432 return true;
1433 else
1434 return false;
1435 }
1436 else
1437 {
1438 /* Exact prefix contains only the exact same prefix */
1439 if (prefix1->colors[0] == prefix2->colors[0] &&
1440 prefix1->colors[1] == prefix2->colors[1])
1441 return true;
1442 else
1443 return false;
1444 }
1445}

References COLOR_UNKNOWN, and fb().

Referenced by addArc(), and addKey().

◆ processState()

static void processState ( TrgmNFA trgmNFA,
TrgmState state 
)
static

Definition at line 961 of file trgm_regexp.c.

962{
963 ListCell *lc;
964
965 /* keysQueue should be NIL already, but make sure */
966 trgmNFA->keysQueue = NIL;
967
968 /*
969 * Add state's own key, and then process all keys added to keysQueue until
970 * queue is finished. But we can quit if the state gets marked final.
971 */
972 addKey(trgmNFA, state, &state->stateKey);
973 foreach(lc, trgmNFA->keysQueue)
974 {
976
977 if (state->flags & TSTATE_FIN)
978 break;
979 addKey(trgmNFA, state, key);
980 }
981
982 /* Release keysQueue to clean up for next cycle */
983 list_free(trgmNFA->keysQueue);
984 trgmNFA->keysQueue = NIL;
985
986 /*
987 * Add outgoing arcs only if state isn't final (we have no interest in
988 * outgoing arcs if we already match)
989 */
990 if (!(state->flags & TSTATE_FIN))
992}
void list_free(List *list)
Definition list.c:1546
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state)

References addArcs(), addKey(), fb(), lfirst, list_free(), NIL, and TSTATE_FIN.

Referenced by transformGraph().

◆ RE_compile()

static void RE_compile ( regex_t regex,
text text_re,
int  cflags,
Oid  collation 
)
static

Definition at line 721 of file trgm_regexp.c.

722{
725 pg_wchar *pattern;
726 int pattern_len;
727 int regcomp_result;
728 char errMsg[100];
729
730 /* Convert pattern string to wide characters */
731 pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
733 pattern,
735
736 /* Compile regex */
738 pattern,
740 cflags,
741 collation);
742
743 pfree(pattern);
744
746 {
747 /* re didn't compile (no need for pg_regfree, if so) */
748 pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
751 errmsg("invalid regular expression: %s", errMsg)));
752 }
753}
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define ERROR
Definition elog.h:39
#define ereport(elevel,...)
Definition elog.h:150
int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
Definition mbutils.c:997
void * palloc(Size size)
Definition mcxt.c:1387
int pg_regcomp(regex_t *re, const chr *string, size_t len, int flags, Oid collation)
Definition regcomp.c:372
size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
Definition regerror.c:60
#define REG_OKAY
Definition regex.h:215
static Size VARSIZE_ANY_EXHDR(const void *PTR)
Definition varatt.h:472
static char * VARDATA_ANY(const void *PTR)
Definition varatt.h:486

References ereport, errcode(), errmsg(), ERROR, fb(), palloc(), pfree(), pg_mb2wchar_with_len(), pg_regcomp(), pg_regerror(), REG_OKAY, VARDATA_ANY(), and VARSIZE_ANY_EXHDR().

Referenced by createTrgmNFA().

◆ selectColorTrigrams()

static bool selectColorTrigrams ( TrgmNFA trgmNFA)
static

Definition at line 1460 of file trgm_regexp.c.

1461{
1463 int arcsCount = trgmNFA->arcsCount,
1464 i;
1466 ColorTrgmInfo *colorTrgms;
1467 int64 totalTrgmCount;
1469 int cnumber;
1470
1471 /* Collect color trigrams from all arcs */
1472 colorTrgms = palloc0_array(ColorTrgmInfo, arcsCount);
1473 trgmNFA->colorTrgms = colorTrgms;
1474
1475 i = 0;
1477 while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1478 {
1479 ListCell *cell;
1480
1481 foreach(cell, state->arcs)
1482 {
1483 TrgmArc *arc = (TrgmArc *) lfirst(cell);
1485 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1486
1487 arcInfo->source = state;
1488 arcInfo->target = arc->target;
1489 trgmInfo->ctrgm = arc->ctrgm;
1490 trgmInfo->cnumber = -1;
1491 /* count and penalty will be set below */
1492 trgmInfo->expanded = true;
1493 trgmInfo->arcs = list_make1(arcInfo);
1494 i++;
1495 }
1496 }
1497 Assert(i == arcsCount);
1498
1499 /* Remove duplicates, merging their arcs lists */
1500 if (arcsCount >= 2)
1501 {
1503 *p2;
1504
1505 /* Sort trigrams to ease duplicate detection */
1506 qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1507
1508 /* p1 is probe point, p2 is last known non-duplicate. */
1509 p2 = colorTrgms;
1510 for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1511 {
1512 if (colorTrgmInfoCmp(p1, p2) > 0)
1513 {
1514 p2++;
1515 *p2 = *p1;
1516 }
1517 else
1518 {
1519 p2->arcs = list_concat(p2->arcs, p1->arcs);
1520 }
1521 }
1522 trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1523 }
1524 else
1525 {
1526 trgmNFA->colorTrgmsCount = arcsCount;
1527 }
1528
1529 /*
1530 * Count number of simple trigrams generated by each color trigram, and
1531 * also compute a penalty value, which is the number of simple trigrams
1532 * times a multiplier that depends on its whitespace content.
1533 *
1534 * Note: per-color-trigram counts cannot overflow an int so long as
1535 * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1536 * 1290. However, the grand total totalTrgmCount might conceivably
1537 * overflow an int, so we use int64 for that within this routine. Also,
1538 * penalties are calculated in float4 arithmetic to avoid any overflow
1539 * worries.
1540 */
1541 totalTrgmCount = 0;
1542 totalTrgmPenalty = 0.0f;
1543 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1544 {
1545 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1546 int j,
1547 count = 1,
1548 typeIndex = 0;
1549
1550 for (j = 0; j < 3; j++)
1551 {
1553
1554 typeIndex *= 2;
1555 if (c == COLOR_BLANK)
1556 typeIndex++;
1557 else
1558 count *= trgmNFA->colorInfo[c].wordCharsCount;
1559 }
1560 trgmInfo->count = count;
1561 totalTrgmCount += count;
1562 trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1563 totalTrgmPenalty += trgmInfo->penalty;
1564 }
1565
1566 /* Sort color trigrams in descending order of their penalties */
1567 qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1569
1570 /*
1571 * Remove color trigrams from the graph so long as total penalty of color
1572 * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1573 * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1574 * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1575 * penalty, since those are the most promising for reducing the total
1576 * penalty. When removing a color trigram we have to merge states
1577 * connected by arcs labeled with that trigram. It's necessary to not
1578 * merge initial and final states, because our graph becomes useless if
1579 * that happens; so we cannot always remove the trigram we'd prefer to.
1580 */
1581 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1582 {
1583 ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1584 bool canRemove = true;
1585 ListCell *cell;
1586
1587 /* Done if we've reached the target */
1589 break;
1590
1591#ifdef TRGM_REGEXP_DEBUG
1592 fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1593 trgmInfo->ctrgm.colors[0],
1594 trgmInfo->ctrgm.colors[1],
1595 trgmInfo->ctrgm.colors[2],
1596 trgmInfo->penalty,
1597 list_length(trgmInfo->arcs));
1598#endif
1599
1600 /*
1601 * Does any arc of this color trigram connect initial and final
1602 * states? If so we can't remove it.
1603 */
1604 foreach(cell, trgmInfo->arcs)
1605 {
1607 TrgmState *source = arcInfo->source,
1608 *target = arcInfo->target;
1609 int source_flags,
1611
1612#ifdef TRGM_REGEXP_DEBUG
1613 fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1614 -target->snumber, target->flags,
1615 -source->snumber, source->flags);
1616#endif
1617
1618 /* examine parent states, if any merging has already happened */
1619 while (source->parent)
1620 source = source->parent;
1621 while (target->parent)
1622 target = target->parent;
1623
1624#ifdef TRGM_REGEXP_DEBUG
1625 fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1626 -target->snumber, target->flags,
1627 -source->snumber, source->flags);
1628#endif
1629
1630 /* we must also consider merges we are planning right now */
1631 source_flags = source->flags | source->tentFlags;
1632 while (source->tentParent)
1633 {
1634 source = source->tentParent;
1635 source_flags |= source->flags | source->tentFlags;
1636 }
1637 target_flags = target->flags | target->tentFlags;
1638 while (target->tentParent)
1639 {
1640 target = target->tentParent;
1641 target_flags |= target->flags | target->tentFlags;
1642 }
1643
1644#ifdef TRGM_REGEXP_DEBUG
1645 fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1646 -target->snumber, target_flags,
1647 -source->snumber, source_flags);
1648#endif
1649
1650 /* would fully-merged state have both INIT and FIN set? */
1653 {
1654 canRemove = false;
1655 break;
1656 }
1657
1658 /* ok so far, so remember planned merge */
1659 if (source != target)
1660 {
1661#ifdef TRGM_REGEXP_DEBUG
1662 fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1663 -target->snumber, -source->snumber);
1664#endif
1665 target->tentParent = source;
1666 source->tentFlags |= target_flags;
1667 }
1668 }
1669
1670 /*
1671 * We must reset all the tentFlags/tentParent fields before
1672 * continuing. tentFlags could only have become set in states that
1673 * are the source or parent or tentative parent of one of the current
1674 * arcs; likewise tentParent could only have become set in states that
1675 * are the target or parent or tentative parent of one of the current
1676 * arcs. There might be some overlap between those sets, but if we
1677 * clear tentFlags in target states as well as source states, we
1678 * should be okay even if we visit a state as target before visiting
1679 * it as a source.
1680 */
1681 foreach(cell, trgmInfo->arcs)
1682 {
1684 TrgmState *source = arcInfo->source,
1685 *target = arcInfo->target;
1687
1688 /* no need to touch previously-merged states */
1689 while (source->parent)
1690 source = source->parent;
1691 while (target->parent)
1692 target = target->parent;
1693
1694 while (source)
1695 {
1696 source->tentFlags = 0;
1697 source = source->tentParent;
1698 }
1699
1700 while ((ttarget = target->tentParent) != NULL)
1701 {
1702 target->tentParent = NULL;
1703 target->tentFlags = 0; /* in case it was also a source */
1704 target = ttarget;
1705 }
1706 }
1707
1708 /* Now, move on if we can't drop this trigram */
1709 if (!canRemove)
1710 {
1711#ifdef TRGM_REGEXP_DEBUG
1712 fprintf(stderr, " ... not ok to merge\n");
1713#endif
1714 continue;
1715 }
1716
1717 /* OK, merge states linked by each arc labeled by the trigram */
1718 foreach(cell, trgmInfo->arcs)
1719 {
1721 TrgmState *source = arcInfo->source,
1722 *target = arcInfo->target;
1723
1724 while (source->parent)
1725 source = source->parent;
1726 while (target->parent)
1727 target = target->parent;
1728 if (source != target)
1729 {
1730#ifdef TRGM_REGEXP_DEBUG
1731 fprintf(stderr, "merging s%d into s%d\n",
1732 -target->snumber, -source->snumber);
1733#endif
1734 mergeStates(source, target);
1735 /* Assert we didn't merge initial and final states */
1736 Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1738 }
1739 }
1740
1741 /* Mark trigram unexpanded, and update totals */
1742 trgmInfo->expanded = false;
1743 totalTrgmCount -= trgmInfo->count;
1744 totalTrgmPenalty -= trgmInfo->penalty;
1745 }
1746
1747 /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1748 if (totalTrgmCount > MAX_TRGM_COUNT)
1749 return false;
1750
1751 trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1752
1753 /*
1754 * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1755 * and enumerate the color trigrams that are expanded.
1756 */
1757 cnumber = 0;
1758 qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1760 for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1761 {
1762 if (colorTrgms[i].expanded)
1763 {
1764 colorTrgms[i].cnumber = cnumber;
1765 cnumber++;
1766 }
1767 }
1768
1769 return true;
1770}
int64_t int64
Definition c.h:555
#define fprintf(file, fmt, msg)
Definition cubescan.l:21
#define palloc0_array(type, count)
Definition fe_memutils.h:77
List * list_concat(List *list1, const List *list2)
Definition list.c:561
static int list_length(const List *l)
Definition pg_list.h:152
#define list_make1(x1)
Definition pg_list.h:212
int TrgmColor
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
#define MAX_TRGM_COUNT
static const float4 penalties[8]
#define WISH_TRGM_PENALTY
static void mergeStates(TrgmState *state1, TrgmState *state2)

References ColorTrgmInfo::arcs, Assert, ColorTrgmInfo::cnumber, COLOR_BLANK, ColorTrgm::colors, colorTrgmInfoCmp(), colorTrgmInfoPenaltyCmp(), ColorTrgmInfo::ctrgm, fb(), fprintf, hash_seq_init(), hash_seq_search(), i, j, lfirst, list_concat(), list_length(), list_make1, MAX_TRGM_COUNT, mergeStates(), palloc0_array, palloc_object, penalties, qsort, source, TSTATE_FIN, TSTATE_INIT, and WISH_TRGM_PENALTY.

Referenced by createTrgmNFAInternal().

◆ transformGraph()

static void transformGraph ( TrgmNFA trgmNFA)
static

Definition at line 897 of file trgm_regexp.c.

898{
901 TrgmState *initstate;
902 ListCell *lc;
903
904 /* Initialize this stage's workspace in trgmNFA struct */
905 trgmNFA->queue = NIL;
906 trgmNFA->keysQueue = NIL;
907 trgmNFA->arcsCount = 0;
908 trgmNFA->overflowed = false;
909
910 /* Create hashtable for states */
911 hashCtl.keysize = sizeof(TrgmStateKey);
912 hashCtl.entrysize = sizeof(TrgmState);
914 trgmNFA->states = hash_create("Trigram NFA",
915 1024,
916 &hashCtl,
918 trgmNFA->nstates = 0;
919
920 /* Create initial state: ambiguous prefix, NFA's initial state */
921 MemSet(&initkey, 0, sizeof(initkey));
922 initkey.prefix.colors[0] = COLOR_UNKNOWN;
923 initkey.prefix.colors[1] = COLOR_UNKNOWN;
924 initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
925
926 initstate = getState(trgmNFA, &initkey);
927 initstate->flags |= TSTATE_INIT;
928 trgmNFA->initState = initstate;
929
930 /*
931 * Recursively build the expanded graph by processing queue of states
932 * (breadth-first search). getState already put initstate in the queue.
933 * Note that getState will append new states to the queue within the loop,
934 * too; this works as long as we don't do repeat fetches using the "lc"
935 * pointer.
936 */
937 foreach(lc, trgmNFA->queue)
938 {
940
941 /*
942 * If we overflowed then just mark state as final. Otherwise do
943 * actual processing.
944 */
945 if (trgmNFA->overflowed)
946 state->flags |= TSTATE_FIN;
947 else
949
950 /* Did we overflow? */
951 if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
953 trgmNFA->overflowed = true;
954 }
955}
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:358
int64 hash_get_num_entries(HTAB *hashp)
Definition dynahash.c:1336
#define HASH_CONTEXT
Definition hsearch.h:102
#define HASH_ELEM
Definition hsearch.h:95
#define HASH_BLOBS
Definition hsearch.h:97
int pg_reg_getinitialstate(const regex_t *regex)
Definition regexport.c:50
#define MAX_EXPANDED_STATES
#define MAX_EXPANDED_ARCS
static void processState(TrgmNFA *trgmNFA, TrgmState *state)

References COLOR_UNKNOWN, CurrentMemoryContext, fb(), TrgmState::flags, getState(), HASH_BLOBS, HASH_CONTEXT, hash_create(), HASH_ELEM, hash_get_num_entries(), lfirst, MAX_EXPANDED_ARCS, MAX_EXPANDED_STATES, MemSet, NIL, pg_reg_getinitialstate(), processState(), TSTATE_FIN, and TSTATE_INIT.

Referenced by createTrgmNFAInternal().

◆ trigramsMatchGraph()

bool trigramsMatchGraph ( TrgmPackedGraph graph,
bool check 
)

Definition at line 628 of file trgm_regexp.c.

629{
630 int i,
631 j,
632 k,
633 queueIn,
634 queueOut;
635
636 /*
637 * Reset temporary working areas.
638 */
639 memset(graph->colorTrigramsActive, 0,
640 sizeof(bool) * graph->colorTrigramsCount);
641 memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
642
643 /*
644 * Check which color trigrams were matched. A match for any simple
645 * trigram associated with a color trigram counts as a match of the color
646 * trigram.
647 */
648 j = 0;
649 for (i = 0; i < graph->colorTrigramsCount; i++)
650 {
651 int cnt = graph->colorTrigramGroups[i];
652
653 for (k = j; k < j + cnt; k++)
654 {
655 if (check[k])
656 {
657 /*
658 * Found one matched trigram in the group. Can skip the rest
659 * of them and go to the next group.
660 */
661 graph->colorTrigramsActive[i] = true;
662 break;
663 }
664 }
665 j = j + cnt;
666 }
667
668 /*
669 * Initialize the statesQueue to hold just the initial state. Note:
670 * statesQueue has room for statesCount entries, which is certainly enough
671 * since no state will be put in the queue more than once. The
672 * statesActive array marks which states have been queued.
673 */
674 graph->statesActive[0] = true;
675 graph->statesQueue[0] = 0;
676 queueIn = 0;
677 queueOut = 1;
678
679 /* Process queued states as long as there are any. */
680 while (queueIn < queueOut)
681 {
682 int stateno = graph->statesQueue[queueIn++];
684 int cnt = state->arcsCount;
685
686 /* Loop over state's out-arcs */
687 for (i = 0; i < cnt; i++)
688 {
689 TrgmPackedArc *arc = &state->arcs[i];
690
691 /*
692 * If corresponding color trigram is present then activate the
693 * corresponding state. We're done if that's the final state,
694 * otherwise queue the state if it's not been queued already.
695 */
696 if (graph->colorTrigramsActive[arc->colorTrgm])
697 {
698 int nextstate = arc->targetState;
699
700 if (nextstate == 1)
701 return true; /* success: final state is reachable */
702
703 if (!graph->statesActive[nextstate])
704 {
705 graph->statesActive[nextstate] = true;
706 graph->statesQueue[queueOut++] = nextstate;
707 }
708 }
709 }
710 }
711
712 /* Queue is empty, so match fails. */
713 return false;
714}

References TrgmPackedGraph::colorTrigramGroups, TrgmPackedGraph::colorTrigramsActive, TrgmPackedGraph::colorTrigramsCount, fb(), i, j, TrgmPackedGraph::states, TrgmPackedGraph::statesActive, TrgmPackedGraph::statesCount, and TrgmPackedGraph::statesQueue.

Referenced by gin_trgm_consistent(), gin_trgm_triconsistent(), and gtrgm_consistent().

◆ validArcLabel()

static bool validArcLabel ( TrgmStateKey key,
TrgmColor  co 
)
static

Definition at line 1332 of file trgm_regexp.c.

1333{
1334 /*
1335 * We have to know full trigram in order to add outgoing arc. So we can't
1336 * do it if prefix is ambiguous.
1337 */
1338 if (key->prefix.colors[0] == COLOR_UNKNOWN)
1339 return false;
1340
1341 /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1342 Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1343 /* And we should not be called with an unknown arc color anytime */
1344 Assert(co != COLOR_UNKNOWN);
1345
1346 /*
1347 * We don't bother with making arcs representing three non-word
1348 * characters, since that's useless for trigram extraction.
1349 */
1350 if (key->prefix.colors[0] == COLOR_BLANK &&
1351 key->prefix.colors[1] == COLOR_BLANK &&
1352 co == COLOR_BLANK)
1353 return false;
1354
1355 /*
1356 * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1357 * case doesn't correspond to any trigram the trigram extraction code
1358 * would make. The nonblank-blank-blank case is also not possible with
1359 * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1360 * trigram even if it were valid, for example processing "foo bar" will
1361 * not result in considering the trigram "o ". So if you want to support
1362 * RPADDING = 2, there's more to do than just twiddle this test.)
1363 */
1364 if (key->prefix.colors[0] != COLOR_BLANK &&
1365 key->prefix.colors[1] == COLOR_BLANK)
1366 return false;
1367
1368 /*
1369 * Other combinations involving blank are valid, in particular we assume
1370 * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1371 *
1372 * Note: Using again the example "foo bar", we will not consider the
1373 * trigram " b", though this trigram would be found by the trigram
1374 * extraction code. Since we will find " ba", it doesn't seem worth
1375 * trying to hack the algorithm to generate the additional trigram.
1376 */
1377
1378 /* arc label is valid */
1379 return true;
1380}

References Assert, COLOR_BLANK, and COLOR_UNKNOWN.

Referenced by addArc(), and addKey().

Variable Documentation

◆ penalties

const float4 penalties[8]
static
Initial value:
= {
1.0f,
3.5f,
0.0f,
0.0f,
4.2f,
2.1f,
25.0f,
0.0f
}

Definition at line 231 of file trgm_regexp.c.

231 {
232 1.0f, /* "aaa" */
233 3.5f, /* "aa " */
234 0.0f, /* "a a" (impossible) */
235 0.0f, /* "a " (impossible) */
236 4.2f, /* " aa" */
237 2.1f, /* " a " */
238 25.0f, /* " a" */
239 0.0f /* " " (impossible) */
240};

Referenced by selectColorTrigrams().