#include "postgres.h"
#include "catalog/pg_ts_dict.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"

Include dependency graph for unaccent.c:

Data Structures
struct	TrieChar

Typedefs
typedef struct TrieChar	TrieChar

Functions
	PG_MODULE_MAGIC_EXT (.name="unaccent",.version=PG_VERSION)

static TrieChar *	placeChar (TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)

static TrieChar *	initTrie (const char *filename)

static TrieChar *	findReplaceTo (TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)

	PG_FUNCTION_INFO_V1 (unaccent_init)

Datum	unaccent_init (PG_FUNCTION_ARGS)

	PG_FUNCTION_INFO_V1 (unaccent_lexize)

Datum	unaccent_lexize (PG_FUNCTION_ARGS)

	PG_FUNCTION_INFO_V1 (unaccent_dict)

Datum	unaccent_dict (PG_FUNCTION_ARGS)

Typedef Documentation

◆ TrieChar

typedef struct TrieChar TrieChar

Function Documentation

◆ findReplaceTo()

static TrieChar * findReplaceTo	(	TrieChar *	node,
		const unsigned char *	src,
		int	srclen,
		int *	p_matchlen
	)

static

Definition at line 311 of file unaccent.c.

{
    /*
     * Walk the trie along the bytes of src (at most srclen of them) and
     * return the deepest node visited that carries a replacement string, or
     * NULL if none does.  *p_matchlen receives the number of source bytes
     * consumed to reach the returned node; it is left at 0 when there is no
     * match.
     */
    TrieChar   *best = NULL;
    int         consumed = 0;

    /* Always give the caller a defined value, even if nothing matches. */
    *p_matchlen = 0;

    for (; node != NULL && consumed < srclen; node = node->nextChar)
    {
        /* Select this level's entry for the next source byte. */
        node += src[consumed];
        consumed++;

        /* Remember the longest prefix seen so far that has a replacement. */
        if (node->replaceTo != NULL)
        {
            best = node;
            *p_matchlen = consumed;
        }
    }

    return best;
}

References TrieChar::nextChar, and TrieChar::replaceTo.

Referenced by unaccent_lexize().

◆ initTrie()

static TrieChar * initTrie ( const char * filename )

static

Definition at line 97 of file unaccent.c.

{
    /*
     * Read the unaccent rules file identified by "filename" and build a trie
     * from its rules, one placeChar() call per valid line.  Returns the trie
     * root, which is still NULL if no rule was placed.
     *
     * Raises ERROR if the file cannot be opened.  Lines with invalid syntax
     * only produce a WARNING and are ignored.  An error with SQLSTATE
     * ERRCODE_UNTRANSLATABLE_CHARACTER raised while reading is swallowed and
     * reading resumes; any other error is re-thrown.
     */

    /* rootTrie and skip are modified inside PG_TRY and read after it. */
    TrieChar   *volatile rootTrie = NULL;
    MemoryContext ccxt = CurrentMemoryContext;
    tsearch_readline_state trst;
    volatile bool skip;

    filename = get_tsearch_config_filename(filename, "rules");
    if (!tsearch_readline_begin(&trst, filename))
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open unaccent file \"%s\": %m",
                        filename)));

    do
    {
        /*
         * pg_do_encoding_conversion() (called by tsearch_readline()) will
         * emit exception if it finds untranslatable characters in current
         * locale. We just skip such lines, continuing with the next.
         */
        skip = true;

        PG_TRY();
        {
            char       *line;

            while ((line = tsearch_readline(&trst)) != NULL)
            {
                /*----------
                 * The format of each line must be "src" or "src trg", where
                 * src and trg are sequences of one or more non-whitespace
                 * characters, separated by whitespace.  Whitespace at start
                 * or end of line is ignored.  If trg is omitted, an empty
                 * string is used as the replacement.  trg can be optionally
                 * quoted, in which case whitespaces are included in it.
                 *
                 * We use a simple state machine, with states
                 *  0   initial (before src)
                 *  1   in src
                 *  2   in whitespace after src
                 *  3   in trg (non-quoted)
                 *  4   in trg (quoted)
                 *  5   in whitespace after trg
                 *  -1  syntax error detected (two strings)
                 *  -2  syntax error detected (unfinished quoted string)
                 *----------
                 */
                int         state;
                char       *ptr;
                char       *src = NULL;
                char       *trg = NULL;
                char       *trgstore = NULL;
                int         ptrlen;
                int         srclen = 0;
                int         trglen = 0;
                int         trgstorelen = 0;
                bool        trgquoted = false;

                state = 0;
                for (ptr = line; *ptr; ptr += ptrlen)
                {
                    ptrlen = pg_mblen(ptr);
                    /* ignore whitespace, but end src or trg */
                    if (isspace((unsigned char) *ptr))
                    {
                        if (state == 1)
                            state = 2;
                        else if (state == 3)
                            state = 5;
                        /* whitespaces are OK in quoted area */
                        if (state != 4)
                            continue;
                    }
                    switch (state)
                    {
                        case 0:
                            /* start of src */
                            src = ptr;
                            srclen = ptrlen;
                            state = 1;
                            break;
                        case 1:
                            /* continue src */
                            srclen += ptrlen;
                            break;
                        case 2:
                            /* start of trg */
                            if (*ptr == '"')
                            {
                                trgquoted = true;
                                state = 4;
                            }
                            else
                                state = 3;

                            trg = ptr;
                            trglen = ptrlen;
                            break;
                        case 3:
                            /* continue non-quoted trg */
                            trglen += ptrlen;
                            break;
                        case 4:
                            /* continue quoted trg */
                            trglen += ptrlen;

                            /*
                             * If this is a quote, consider it as the end of
                             * trg except if the follow-up character is itself
                             * a quote.
                             */
                            if (*ptr == '"')
                            {
                                if (*(ptr + 1) == '"')
                                {
                                    /* consume the doubled quote as well */
                                    ptr++;
                                    trglen += 1;
                                }
                                else
                                    state = 5;
                            }
                            break;
                        default:
                            /* bogus line format */
                            state = -1;
                            break;
                    }
                }

                if (state == 1 || state == 2)
                {
                    /* trg was omitted, so use "" */
                    trg = "";
                    trglen = 0;
                }

                /* If still in a quoted area, fallback to an error */
                if (state == 4)
                    state = -2;

                /* If trg was quoted, remove its quotes and unescape it */
                if (trgquoted && state > 0)
                {
                    /* Ignore first and end quotes */
                    trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
                    trgstorelen = 0;
                    for (int i = 1; i < trglen - 1; i++)
                    {
                        trgstore[trgstorelen] = trg[i];
                        trgstorelen++;
                        /* skip second double quotes */
                        if (trg[i] == '"' && trg[i + 1] == '"')
                            i++;
                    }
                }
                else
                {
                    trgstore = (char *) palloc(sizeof(char) * trglen);
                    trgstorelen = trglen;

                    /*
                     * trg is still NULL for a blank line (state 0), and
                     * passing NULL to memcpy is undefined even with a zero
                     * length, so only copy when there is something to copy.
                     */
                    if (trgstorelen > 0)
                        memcpy(trgstore, trg, trgstorelen);
                }

                if (state > 0)
                    rootTrie = placeChar(rootTrie,
                                         (unsigned char *) src, srclen,
                                         trgstore, trgstorelen);
                else if (state == -1)
                    ereport(WARNING,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("invalid syntax: more than two strings in unaccent rule")));
                else if (state == -2)
                    ereport(WARNING,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("invalid syntax: unfinished quoted string in unaccent rule")));

                /* placeChar() stores its own copy, so the scratch copy can go */
                pfree(trgstore);
                pfree(line);
            }
            /* reached end of file without an error: stop the outer loop */
            skip = false;
        }
        PG_CATCH();
        {
            ErrorData  *errdata;
            MemoryContext ecxt;

            /* Copy the error data into the context that was current on entry */
            ecxt = MemoryContextSwitchTo(ccxt);
            errdata = CopyErrorData();
            if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
            {
                FlushErrorState();

                /*
                 * The copy was allocated in ccxt, not in the error context
                 * that FlushErrorState() resets, so release it explicitly;
                 * otherwise one ErrorData is leaked per skipped line.
                 */
                FreeErrorData(errdata);
            }
            else
            {
                MemoryContextSwitchTo(ecxt);
                PG_RE_THROW();
            }
        }
        PG_END_TRY();
    }
    while (skip);

    tsearch_readline_end(&trst);

    return rootTrie;
}

References CopyErrorData(), CurrentMemoryContext, ereport, errcode(), errmsg(), ERROR, filename, FlushErrorState(), get_tsearch_config_filename(), i, MemoryContextSwitchTo(), palloc(), pfree(), PG_CATCH, PG_END_TRY, pg_mblen(), PG_RE_THROW, PG_TRY, placeChar(), skip, ErrorData::sqlerrcode, tsearch_readline(), tsearch_readline_begin(), tsearch_readline_end(), and WARNING.

Referenced by unaccent_init().

◆ PG_FUNCTION_INFO_V1() [1/3]

PG_FUNCTION_INFO_V1 ( unaccent_dict )

◆ PG_FUNCTION_INFO_V1() [2/3]

PG_FUNCTION_INFO_V1 ( unaccent_init )

◆ PG_FUNCTION_INFO_V1() [3/3]

PG_FUNCTION_INFO_V1 ( unaccent_lexize )

◆ PG_MODULE_MAGIC_EXT()

PG_MODULE_MAGIC_EXT	(	.	name = `"unaccent"`,
		.	version = `PG_VERSION`
	)

◆ placeChar()

static TrieChar * placeChar	(	TrieChar *	node,
		const unsigned char *	str,
		int	lenstr,
		const char *	replaceTo,
		int	replacelen
	)

static

Definition at line 57 of file unaccent.c.

{
    /*
     * Insert the rule "str (lenstr bytes) -> replaceTo (replacelen bytes)"
     * into the trie level "node", allocating that level first if node is
     * NULL.  Returns the (possibly newly allocated) level so the caller can
     * store it.  A private copy of replaceTo is made.
     */
    TrieChar   *slot;

    /* Each level is a zeroed array of 256 entries, indexed by byte value. */
    if (node == NULL)
        node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);

    Assert(lenstr > 0);         /* else str[0] doesn't exist */

    slot = &node[*str];

    /* More source bytes remain: descend, creating the child level on demand. */
    if (lenstr > 1)
    {
        slot->nextChar = placeChar(slot->nextChar, str + 1, lenstr - 1,
                                   replaceTo, replacelen);
        return node;
    }

    /* Last source byte: an existing replacement wins over the new one. */
    if (slot->replaceTo != NULL)
    {
        ereport(WARNING,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("duplicate source strings, first one will be used")));
        return node;
    }

    /* Store the length and a copy of the replacement bytes. */
    slot->replacelen = replacelen;
    slot->replaceTo = (char *) palloc(replacelen);
    memcpy(slot->replaceTo, replaceTo, replacelen);

    return node;
}

References Assert(), ereport, errcode(), errmsg(), TrieChar::nextChar, palloc(), palloc0(), placeChar(), TrieChar::replacelen, TrieChar::replaceTo, str, and WARNING.

Referenced by initTrie(), and placeChar().

◆ unaccent_dict()

Datum unaccent_dict ( PG_FUNCTION_ARGS )

Definition at line 439 of file unaccent.c.

{
    /*
     * SQL-callable entry point.  With one argument, the text is processed
     * with the "unaccent" dictionary found in this function's own schema;
     * otherwise argument 0 is the dictionary OID and argument 1 the text.
     * Returns the dictionary's output lexeme as text, or a copy of the input
     * when the dictionary produced no lexeme.
     */
    Oid         dictOid;
    int         strArg;
    text       *input;
    TSDictionaryCacheEntry *entry;
    TSLexeme   *lexemes;
    text       *result;

    if (PG_NARGS() != 1)
    {
        /* Caller named the dictionary explicitly. */
        dictOid = PG_GETARG_OID(0);
        strArg = 1;
    }
    else
    {
        /*
         * Look up the dictionary called "unaccent" in the schema that this
         * function itself lives in.
         */
        const char *dictname = "unaccent";
        Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);

        dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
                                  PointerGetDatum(dictname),
                                  ObjectIdGetDatum(procnspid));
        if (!OidIsValid(dictOid))
            ereport(ERROR,
                    (errcode(ERRCODE_UNDEFINED_OBJECT),
                     errmsg("text search dictionary \"%s.%s\" does not exist",
                            get_namespace_name(procnspid), dictname)));
        strArg = 0;
    }

    input = PG_GETARG_TEXT_PP(strArg);
    entry = lookup_ts_dictionary_cache(dictOid);

    /* Invoke the dictionary's lexize function on the raw text bytes. */
    lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(entry->lexize),
                                                         PointerGetDatum(entry->dictData),
                                                         PointerGetDatum(VARDATA_ANY(input)),
                                                         Int32GetDatum(VARSIZE_ANY_EXHDR(input)),
                                                         PointerGetDatum(NULL)));

    PG_FREE_IF_COPY(input, strArg);

    /* No lexeme produced: hand back a fresh copy of the original argument. */
    if (lexemes == NULL || lexemes->lexeme == NULL)
    {
        if (lexemes != NULL)
            pfree(lexemes);
        PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
    }

    /* Convert the lexeme to text, then release the lexize output. */
    result = cstring_to_text(lexemes->lexeme);
    pfree(lexemes->lexeme);
    pfree(lexemes);

    PG_RETURN_TEXT_P(result);
}

References cstring_to_text(), DatumGetPointer(), TSDictionaryCacheEntry::dictData, ereport, errcode(), errmsg(), ERROR, FunctionCall4, get_func_namespace(), get_namespace_name(), GetSysCacheOid2, Int32GetDatum(), TSLexeme::lexeme, TSDictionaryCacheEntry::lexize, lookup_ts_dictionary_cache(), ObjectIdGetDatum(), OidIsValid, pfree(), PG_FREE_IF_COPY, PG_GETARG_OID, PG_GETARG_TEXT_P_COPY, PG_GETARG_TEXT_PP, PG_NARGS, PG_RETURN_TEXT_P, PointerGetDatum(), str, VARDATA_ANY, and VARSIZE_ANY_EXHDR.

◆ unaccent_init()

Datum unaccent_init ( PG_FUNCTION_ARGS )

Definition at line 338 of file unaccent.c.

{
    /*
     * Dictionary init function: argument 0 is the list of dictionary
     * options.  Exactly one "rules" option is required; its value is passed
     * to initTrie() and the resulting trie is returned as the dictionary's
     * private data.  Any other option, a repeated "rules" option, or a
     * missing one raises ERROR.
     */
    List       *options = (List *) PG_GETARG_POINTER(0);
    TrieChar   *trie = NULL;
    bool        haveRules = false;
    ListCell   *cell;

    foreach(cell, options)
    {
        DefElem    *opt = (DefElem *) lfirst(cell);

        /* "rules" is the only option accepted; ereport(ERROR) doesn't return */
        if (strcmp(opt->defname, "rules") != 0)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("unrecognized Unaccent parameter: \"%s\"",
                            opt->defname)));

        if (haveRules)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("multiple Rules parameters")));

        trie = initTrie(defGetString(opt));
        haveRules = true;
    }

    /*
     * A separate flag is needed: initTrie() can legitimately return NULL, so
     * a NULL trie does not mean that no rules file was given.
     */
    if (!haveRules)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("missing Rules parameter")));

    PG_RETURN_POINTER(trie);
}

References defGetString(), DefElem::defname, ereport, errcode(), errmsg(), ERROR, initTrie(), lfirst, PG_GETARG_POINTER, and PG_RETURN_POINTER.

◆ unaccent_lexize()

Datum unaccent_lexize ( PG_FUNCTION_ARGS )

Definition at line 379 of file unaccent.c.

{
    /*
     * Dictionary lexize function: argument 0 is the trie built by
     * unaccent_init(), argument 1 the input bytes and argument 2 their
     * length.  Scans the input left to right, replacing each longest trie
     * match with its replacement string.  Returns a TSLexeme array holding
     * the rewritten string, flagged TSL_FILTER, or NULL when no substitution
     * took place.
     */
    TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
    char       *cursor = (char *) PG_GETARG_POINTER(1);
    int32       remaining = PG_GETARG_INT32(2);
    char       *begin = cursor;
    TSLexeme   *res;
    StringInfoData buf;

    /* The output buffer is created lazily, on the first substitution. */
    buf.data = NULL;

    while (remaining > 0)
    {
        int         matchlen;
        TrieChar   *hit;

        hit = findReplaceTo(rootTrie, (unsigned char *) cursor, remaining,
                            &matchlen);

        if (hit == NULL || hit->replaceTo == NULL)
        {
            /*
             * No rule applies here: step over one character, copying it only
             * if an output buffer already exists.
             */
            matchlen = pg_mblen(cursor);
            if (buf.data != NULL)
                appendBinaryStringInfo(&buf, cursor, matchlen);
        }
        else
        {
            if (buf.data == NULL)
            {
                /* First substitution: start the buffer with the untouched prefix. */
                initStringInfo(&buf);
                if (cursor != begin)
                    appendBinaryStringInfo(&buf, begin, cursor - begin);
            }
            appendBinaryStringInfo(&buf, hit->replaceTo, hit->replacelen);
        }

        cursor += matchlen;
        remaining -= matchlen;
    }

    /* Nothing was substituted, so there is no result to report. */
    if (buf.data == NULL)
        PG_RETURN_POINTER(NULL);

    /* Two zeroed entries; only the first is filled in, the second stays zero. */
    res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
    res->lexeme = buf.data;
    res->flags = TSL_FILTER;

    PG_RETURN_POINTER(res);
}

References appendBinaryStringInfo(), buf, findReplaceTo(), TSLexeme::flags, initStringInfo(), len, TSLexeme::lexeme, palloc0(), PG_GETARG_INT32, PG_GETARG_POINTER, pg_mblen(), PG_RETURN_POINTER, TrieChar::replacelen, TrieChar::replaceTo, and TSL_FILTER.

Data Structures

Typedefs

Functions

Typedef Documentation

◆ TrieChar

Function Documentation

◆ findReplaceTo()

◆ initTrie()

◆ PG_FUNCTION_INFO_V1() [1/3]

◆ PG_FUNCTION_INFO_V1() [2/3]

◆ PG_FUNCTION_INFO_V1() [3/3]

◆ PG_MODULE_MAGIC_EXT()

◆ placeChar()

◆ unaccent_dict()

◆ unaccent_init()

◆ unaccent_lexize()