unaccent_8c_source.html

/*-------------------------------------------------------------------------

 *

 * unaccent.c

 *    Text search unaccent dictionary

 *

 * Copyright (c) 2009-2025, PostgreSQL Global Development Group

 *

 * IDENTIFICATION

 *    contrib/unaccent/unaccent.c

 *

 *-------------------------------------------------------------------------

 */


#include "postgres.h"


#include "catalog/pg_ts_dict.h"

#include "commands/defrem.h"

#include "lib/stringinfo.h"

#include "tsearch/ts_cache.h"

#include "tsearch/ts_locale.h"

#include "tsearch/ts_public.h"

#include "utils/builtins.h"

#include "utils/lsyscache.h"

#include "utils/syscache.h"


/* Module magic block: records this loadable module's name and version. */
PG_MODULE_MAGIC_EXT(
                    .name = "unaccent",
                    .version = PG_VERSION
);


/*

 * An unaccent dictionary uses a trie to find a string to replace.  Each node

 * of the trie is an array of 256 TrieChar structs; the N-th element of the

 * array corresponds to next byte value N.  That element can contain both a

 * replacement string (to be used if the source string ends with this byte)

 * and a link to another trie node (to be followed if there are more bytes).

 *

 * Note that the trie search logic pays no attention to multibyte character

 * boundaries.  This is OK as long as both the data entered into the trie and

 * the data we're trying to look up are validly encoded; no partial-character

 * matches will occur.

 */

typedef struct TrieChar
{
    /* next trie node (array of 256 entries) for longer source strings, or NULL */
    struct TrieChar *nextChar;
    /* replacement bytes if a source string ends here, else NULL; not NUL-terminated */
    char       *replaceTo;
    /* number of bytes stored in replaceTo */
    int         replacelen;
} TrieChar;


/*
 * placeChar - insert a source string and its replacement into the trie.
 *
 * node is the trie node (an array of 256 TrieChar) to insert into; pass NULL
 * to have a new zero-filled node allocated.  str/lenstr are the source bytes
 * (lenstr must be at least 1).  replaceTo/replacelen are the replacement
 * bytes, which are copied into freshly allocated storage.
 *
 * The return value is the node inserted into: the one passed in, or the
 * newly allocated one if node was NULL.  If the same source string was
 * already entered, a WARNING is reported and the earlier replacement is kept.
 */
static TrieChar *
placeChar(TrieChar *node, const unsigned char *str, int lenstr,
          const char *replaceTo, int replacelen)
{
    TrieChar   *level;
    TrieChar   *slot;

    if (node == NULL)
        node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);

    Assert(lenstr > 0);         /* else str[0] doesn't exist */

    /* Descend one trie level per source byte, creating levels on demand */
    level = node;
    slot = &level[*str];
    while (lenstr > 1)
    {
        if (slot->nextChar == NULL)
            slot->nextChar = (TrieChar *) palloc0(sizeof(TrieChar) * 256);

        level = slot->nextChar;
        str++;
        lenstr--;
        slot = &level[*str];
    }

    /* slot is now the entry for the last byte of the source string */
    if (slot->replaceTo != NULL)
    {
        ereport(WARNING,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("duplicate source strings, first one will be used")));
        return node;
    }

    slot->replacelen = replacelen;
    slot->replaceTo = (char *) palloc(replacelen);
    memcpy(slot->replaceTo, replaceTo, replacelen);

    return node;
}


/*
 * initTrie  - create trie from file.
 *
 * filename is the base name of the rules file; the actual path is obtained
 * from get_tsearch_config_filename() with extension "rules".
 *
 * Function converts UTF8-encoded file into current encoding.  Lines that
 * cannot be converted (ERRCODE_UNTRANSLATABLE_CHARACTER) are skipped; any
 * other error is re-thrown.  Syntactically invalid lines produce a WARNING
 * and are ignored, and blank lines are ignored silently.
 *
 * Returns the root node of the trie, or NULL if no rule was entered.
 * Reports ERROR if the file cannot be opened.
 */
static TrieChar *
initTrie(const char *filename)
{
    /* volatile: modified inside PG_TRY and used after a caught error */
    TrieChar   *volatile rootTrie = NULL;
    MemoryContext ccxt = CurrentMemoryContext;
    tsearch_readline_state trst;
    volatile bool skip;

    filename = get_tsearch_config_filename(filename, "rules");
    if (!tsearch_readline_begin(&trst, filename))
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open unaccent file \"%s\": %m",
                        filename)));

    do
    {
        /*
         * pg_do_encoding_conversion() (called by tsearch_readline()) will
         * emit exception if it finds untranslatable characters in current
         * locale. We just skip such lines, continuing with the next.
         */
        skip = true;

        PG_TRY();
        {
            char       *line;

            while ((line = tsearch_readline(&trst)) != NULL)
            {
                /*----------
                 * The format of each line must be "src" or "src trg", where
                 * src and trg are sequences of one or more non-whitespace
                 * characters, separated by whitespace.  Whitespace at start
                 * or end of line is ignored.  If trg is omitted, an empty
                 * string is used as the replacement.  trg can be optionally
                 * quoted, in which case whitespaces are included in it.
                 *
                 * We use a simple state machine, with states
                 *  0   initial (before src)
                 *  1   in src
                 *  2   in whitespace after src
                 *  3   in trg (non-quoted)
                 *  4   in trg (quoted)
                 *  5   in whitespace after trg
                 *  -1  syntax error detected (two strings)
                 *  -2  syntax error detected (unfinished quoted string)
                 *----------
                 */
                int         state;
                char       *ptr;
                char       *src = NULL;
                char       *trg = NULL;
                int         ptrlen;
                int         srclen = 0;
                int         trglen = 0;
                bool        trgquoted = false;

                state = 0;
                for (ptr = line; *ptr; ptr += ptrlen)
                {
                    ptrlen = pg_mblen(ptr);
                    /* ignore whitespace, but end src or trg */
                    if (isspace((unsigned char) *ptr))
                    {
                        if (state == 1)
                            state = 2;
                        else if (state == 3)
                            state = 5;
                        /* whitespaces are OK in quoted area */
                        if (state != 4)
                            continue;
                    }
                    switch (state)
                    {
                        case 0:
                            /* start of src */
                            src = ptr;
                            srclen = ptrlen;
                            state = 1;
                            break;
                        case 1:
                            /* continue src */
                            srclen += ptrlen;
                            break;
                        case 2:
                            /* start of trg */
                            if (*ptr == '"')
                            {
                                trgquoted = true;
                                state = 4;
                            }
                            else
                                state = 3;

                            trg = ptr;
                            trglen = ptrlen;
                            break;
                        case 3:
                            /* continue non-quoted trg */
                            trglen += ptrlen;
                            break;
                        case 4:
                            /* continue quoted trg */
                            trglen += ptrlen;

                            /*
                             * If this is a quote, consider it as the end of
                             * trg except if the follow-up character is itself
                             * a quote.
                             */
                            if (*ptr == '"')
                            {
                                if (*(ptr + 1) == '"')
                                {
                                    /* doubled quote: consume both bytes */
                                    ptr++;
                                    trglen += 1;
                                }
                                else
                                    state = 5;
                            }
                            break;
                        default:
                            /* bogus line format */
                            state = -1;
                            break;
                    }
                }

                if (state == 1 || state == 2)
                {
                    /* trg was omitted, so use "" */
                    trg = "";
                    trglen = 0;
                }

                /* If still in a quoted area, fallback to an error */
                if (state == 4)
                    state = -2;

                if (state > 0)
                {
                    /*
                     * Valid rule.  placeChar() makes its own copy of the
                     * replacement, so an unquoted trg can be passed as-is;
                     * only a quoted trg needs a temporary unescaped copy.
                     */
                    const char *repl = trg;
                    int         repllen = trglen;
                    char       *trgstore = NULL;

                    if (trgquoted)
                    {
                        /*
                         * Remove the surrounding quotes and unescape.  A
                         * quoted trg that reached a valid state includes
                         * both quotes, so trglen is at least 2 here.
                         */
                        int         trgstorelen = 0;

                        trgstore = (char *) palloc(sizeof(char) * (trglen - 2));
                        for (int i = 1; i < trglen - 1; i++)
                        {
                            trgstore[trgstorelen] = trg[i];
                            trgstorelen++;
                            /* skip second double quotes */
                            if (trg[i] == '"' && trg[i + 1] == '"')
                                i++;
                        }
                        repl = trgstore;
                        repllen = trgstorelen;
                    }

                    rootTrie = placeChar(rootTrie,
                                         (unsigned char *) src, srclen,
                                         repl, repllen);

                    if (trgstore != NULL)
                        pfree(trgstore);
                }
                else if (state == -1)
                    ereport(WARNING,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("invalid syntax: more than two strings in unaccent rule")));
                else if (state == -2)
                    ereport(WARNING,
                            (errcode(ERRCODE_CONFIG_FILE_ERROR),
                             errmsg("invalid syntax: unfinished quoted string in unaccent rule")));
                /* else state == 0: blank line, nothing to do */

                pfree(line);
            }
            /* reached end of file without error, so leave the outer loop */
            skip = false;
        }
        PG_CATCH();
        {
            ErrorData  *errdata;
            MemoryContext ecxt;

            /* copy the error data into the caller's context to inspect it */
            ecxt = MemoryContextSwitchTo(ccxt);
            errdata = CopyErrorData();
            if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
            {
                /* skip this line; release the copy so it doesn't accumulate */
                FlushErrorState();
                FreeErrorData(errdata);
            }
            else
            {
                MemoryContextSwitchTo(ecxt);
                PG_RE_THROW();
            }
        }
        PG_END_TRY();
    }
    while (skip);

    tsearch_readline_end(&trst);

    return rootTrie;
}


/*
 * findReplaceTo - look up the longest source string in the trie that is a
 * prefix of src.
 *
 * node is the root trie node (may be NULL, in which case nothing matches);
 * src/srclen are the bytes to examine.
 *
 * Returns the trie entry holding the replacement for the longest match and
 * stores the number of matched source bytes in *p_matchlen.  If nothing
 * matches, returns NULL and sets *p_matchlen to 0.
 */
static TrieChar *
findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
              int *p_matchlen)
{
    TrieChar   *best = NULL;
    TrieChar   *level = node;
    int         pos;

    *p_matchlen = 0;            /* prevent uninitialized-variable warnings */

    for (pos = 0; level != NULL && pos < srclen; pos++)
    {
        TrieChar   *entry = &level[src[pos]];

        /* remember the most recent (hence longest) complete match */
        if (entry->replaceTo != NULL)
        {
            best = entry;
            *p_matchlen = pos + 1;
        }

        level = entry->nextChar;
    }

    return best;
}


/*
 * unaccent_init - build the dictionary's private data from its options.
 *
 * Argument 0 is a List of DefElem options.  Exactly one "rules" option is
 * required; its value names the rules file passed to initTrie().  Any other
 * option, a repeated "rules" option, or a missing one raises an ERROR.
 *
 * Returns the root of the trie built by initTrie() as a pointer Datum.
 */
PG_FUNCTION_INFO_V1(unaccent_init);
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
    List       *options = (List *) PG_GETARG_POINTER(0);
    TrieChar   *trie = NULL;
    bool        haveRules = false;
    ListCell   *cell;

    foreach(cell, options)
    {
        DefElem    *opt = (DefElem *) lfirst(cell);

        /* "rules" is the only option this dictionary accepts */
        if (strcmp(opt->defname, "rules") != 0)
        {
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("unrecognized Unaccent parameter: \"%s\"",
                            opt->defname)));
        }
        else
        {
            if (haveRules)
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("multiple Rules parameters")));

            trie = initTrie(defGetString(opt));
            haveRules = true;
        }
    }

    if (!haveRules)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("missing Rules parameter")));

    PG_RETURN_POINTER(trie);
}


/*
 * unaccent_lexize - apply the trie's replacements to an input string.
 *
 * Argument 0 is the trie root built by unaccent_init(), argument 1 points to
 * the input bytes, and argument 2 is the input length in bytes.
 *
 * The input is scanned left to right.  At each position the longest source
 * string found by findReplaceTo() is replaced; otherwise one character is
 * copied through unchanged.
 *
 * Returns NULL if no substitution was made.  Otherwise returns a
 * zero-initialized two-element TSLexeme array whose first element holds the
 * rewritten string and has the TSL_FILTER flag set.
 */
PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
    TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
    char       *srcchar = (char *) PG_GETARG_POINTER(1);
    int32       len = PG_GETARG_INT32(2);
    char       *srcstart = srcchar;
    TSLexeme   *res;
    StringInfoData buf;

    /* we allocate storage for the buffer only if needed */
    buf.data = NULL;

    while (len > 0)
    {
        TrieChar   *node;
        int         matchlen;

        node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
                             &matchlen);
        if (node && node->replaceTo)
        {
            if (buf.data == NULL)
            {
                /* initialize buffer */
                initStringInfo(&buf);
                /* insert any data we already skipped over */
                if (srcchar != srcstart)
                    appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
            }
            appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
        }
        else
        {
            matchlen = pg_mblen(srcchar);

            /*
             * pg_mblen() only reports the length implied by the leading
             * byte.  Don't step or copy past the end of the input if the
             * last character is truncated.
             */
            if (matchlen > len)
                matchlen = len;

            if (buf.data != NULL)
                appendBinaryStringInfo(&buf, srcchar, matchlen);
        }

        srcchar += matchlen;
        len -= matchlen;
    }

    /* return a result only if we made at least one substitution */
    if (buf.data != NULL)
    {
        res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
        res->lexeme = buf.data;
        res->flags = TSL_FILTER;
    }
    else
        res = NULL;

    PG_RETURN_POINTER(res);
}


/*
 * unaccent_dict - function-like wrapper for the dictionary.
 *
 * Called with one argument (text), it uses the dictionary named "unaccent"
 * in the same schema as this function, raising an ERROR if that dictionary
 * does not exist.  Called with two arguments, argument 0 is the dictionary
 * OID and argument 1 is the text.
 *
 * The dictionary's lexize method is invoked on the text.  If it yields a
 * lexeme, that is returned as text; otherwise a copy of the input text is
 * returned.
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
    text       *input;
    int         textArgno;
    Oid         dictId;
    TSDictionaryCacheEntry *entry;
    TSLexeme   *lexemes;
    text       *result;

    if (PG_NARGS() != 1)
    {
        /* caller specified the dictionary explicitly */
        dictId = PG_GETARG_OID(0);
        textArgno = 1;
    }
    else
    {
        /*
         * Use the "unaccent" dictionary that is in the same schema that this
         * function is in.
         */
        const char *defaultDict = "unaccent";
        Oid         funcNsp = get_func_namespace(fcinfo->flinfo->fn_oid);

        dictId = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
                                 PointerGetDatum(defaultDict),
                                 ObjectIdGetDatum(funcNsp));
        if (!OidIsValid(dictId))
            ereport(ERROR,
                    (errcode(ERRCODE_UNDEFINED_OBJECT),
                     errmsg("text search dictionary \"%s.%s\" does not exist",
                            get_namespace_name(funcNsp), defaultDict)));
        textArgno = 0;
    }

    input = PG_GETARG_TEXT_PP(textArgno);
    entry = lookup_ts_dictionary_cache(dictId);

    lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(entry->lexize),
                                                         PointerGetDatum(entry->dictData),
                                                         PointerGetDatum(VARDATA_ANY(input)),
                                                         Int32GetDatum(VARSIZE_ANY_EXHDR(input)),
                                                         PointerGetDatum(NULL)));

    PG_FREE_IF_COPY(input, textArgno);

    if (lexemes != NULL && lexemes->lexeme != NULL)
    {
        /* the dictionary produced a rewritten string */
        result = cstring_to_text(lexemes->lexeme);
        pfree(lexemes->lexeme);
        pfree(lexemes);
    }
    else
    {
        /* nothing was changed: hand back a copy of the original argument */
        if (lexemes != NULL)
            pfree(lexemes);
        result = PG_GETARG_TEXT_P_COPY(textArgno);
    }

    PG_RETURN_TEXT_P(result);
}

builtins.h

int32
int32_t int32
Definition: c.h:498

OidIsValid
#define OidIsValid(objectId)
Definition: c.h:746

defGetString
char * defGetString(DefElem *def)
Definition: define.c:35

defrem.h

CopyErrorData
ErrorData * CopyErrorData(void)
Definition: elog.c:1751

FlushErrorState
void FlushErrorState(void)
Definition: elog.c:1872

errcode
int errcode(int sqlerrcode)
Definition: elog.c:854

errmsg
int errmsg(const char *fmt,...)
Definition: elog.c:1071

PG_RE_THROW
#define PG_RE_THROW()
Definition: elog.h:404

PG_TRY
#define PG_TRY(...)
Definition: elog.h:371

WARNING
#define WARNING
Definition: elog.h:36

PG_END_TRY
#define PG_END_TRY(...)
Definition: elog.h:396

ERROR
#define ERROR
Definition: elog.h:39

PG_CATCH
#define PG_CATCH(...)
Definition: elog.h:381

ereport
#define ereport(elevel,...)
Definition: elog.h:149

PG_FREE_IF_COPY
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:260

PG_GETARG_OID
#define PG_GETARG_OID(n)
Definition: fmgr.h:275

PG_GETARG_TEXT_PP
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309

PG_GETARG_POINTER
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:276

PG_NARGS
#define PG_NARGS()
Definition: fmgr.h:203

FunctionCall4
#define FunctionCall4(flinfo, arg1, arg2, arg3, arg4)
Definition: fmgr.h:706

PG_RETURN_TEXT_P
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372

PG_GETARG_INT32
#define PG_GETARG_INT32(n)
Definition: fmgr.h:269

PG_RETURN_POINTER
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361

PG_GETARG_TEXT_P_COPY
#define PG_GETARG_TEXT_P_COPY(n)
Definition: fmgr.h:315

PG_FUNCTION_ARGS
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193

Assert
Assert(PointerIsAligned(start, uint64))

str
const char * str
Definition: hashfn_unstable.h:254

i
int i
Definition: isn.c:77

get_func_namespace
Oid get_func_namespace(Oid funcid)
Definition: lsyscache.c:1772

get_namespace_name
char * get_namespace_name(Oid nspid)
Definition: lsyscache.c:3506

lsyscache.h

pg_mblen
int pg_mblen(const char *mbstr)
Definition: mbutils.c:1024

pfree
void pfree(void *pointer)
Definition: mcxt.c:1528

palloc0
void * palloc0(Size size)
Definition: mcxt.c:1351

palloc
void * palloc(Size size)
Definition: mcxt.c:1321

CurrentMemoryContext
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143

MemoryContextSwitchTo
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124

skip
static const struct exclude_list_item skip[]
Definition: pg_checksums.c:107

len
const void size_t len
Definition: pg_crc32c_sse42.c:28

filename
static char * filename
Definition: pg_dumpall.c:124

lfirst
#define lfirst(lc)
Definition: pg_list.h:172

buf
static char * buf
Definition: pg_test_fsync.c:72

pg_ts_dict.h

postgres.h

PointerGetDatum
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327

Datum
uintptr_t Datum
Definition: postgres.h:69

ObjectIdGetDatum
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257

DatumGetPointer
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317

Int32GetDatum
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217

Oid
unsigned int Oid
Definition: postgres_ext.h:30

appendBinaryStringInfo
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:281

initStringInfo
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97

stringinfo.h

DefElem
Definition: parsenodes.h:823

DefElem::defname
char * defname
Definition: parsenodes.h:826

ErrorData
Definition: elog.h:419

ErrorData::sqlerrcode
int sqlerrcode
Definition: elog.h:430

List
Definition: pg_list.h:54

MemoryContextData
Definition: memnodes.h:118

StringInfoData
Definition: stringinfo.h:47

TSDictionaryCacheEntry
Definition: ts_cache.h:52

TSDictionaryCacheEntry::dictData
void * dictData
Definition: ts_cache.h:62

TSDictionaryCacheEntry::lexize
FmgrInfo lexize
Definition: ts_cache.h:59

TSLexeme
Definition: ts_public.h:116

TSLexeme::lexeme
char * lexeme
Definition: ts_public.h:138

TSLexeme::flags
uint16 flags
Definition: ts_public.h:136

TrieChar
Definition: unaccent.c:44

TrieChar::nextChar
struct TrieChar * nextChar
Definition: unaccent.c:45

TrieChar::replacelen
int replacelen
Definition: unaccent.c:47

TrieChar::replaceTo
char * replaceTo
Definition: unaccent.c:46

state
Definition: regguts.h:323

tsearch_readline_state
Definition: ts_locale.h:25

varlena
Definition: c.h:658

syscache.h

GetSysCacheOid2
#define GetSysCacheOid2(cacheId, oidcol, key1, key2)
Definition: syscache.h:111

lookup_ts_dictionary_cache
TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId)
Definition: ts_cache.c:208

ts_cache.h

tsearch_readline_begin
bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename)
Definition: ts_locale.c:89

tsearch_readline
char * tsearch_readline(tsearch_readline_state *stp)
Definition: ts_locale.c:112

tsearch_readline_end
void tsearch_readline_end(tsearch_readline_state *stp)
Definition: ts_locale.c:157

ts_locale.h

ts_public.h

TSL_FILTER
#define TSL_FILTER
Definition: ts_public.h:144

get_tsearch_config_filename
char * get_tsearch_config_filename(const char *basename, const char *extension)
Definition: ts_utils.c:34

initTrie
static TrieChar * initTrie(const char *filename)
Definition: unaccent.c:97

unaccent_init
Datum unaccent_init(PG_FUNCTION_ARGS)
Definition: unaccent.c:338

PG_FUNCTION_INFO_V1
PG_FUNCTION_INFO_V1(unaccent_init)

TrieChar
struct TrieChar TrieChar

unaccent_lexize
Datum unaccent_lexize(PG_FUNCTION_ARGS)
Definition: unaccent.c:379

placeChar
static TrieChar * placeChar(TrieChar *node, const unsigned char *str, int lenstr, const char *replaceTo, int replacelen)
Definition: unaccent.c:57

unaccent_dict
Datum unaccent_dict(PG_FUNCTION_ARGS)
Definition: unaccent.c:439

findReplaceTo
static TrieChar * findReplaceTo(TrieChar *node, const unsigned char *src, int srclen, int *p_matchlen)
Definition: unaccent.c:311

PG_MODULE_MAGIC_EXT
PG_MODULE_MAGIC_EXT(.name="unaccent",.version=PG_VERSION)

ListCell
Definition: pg_list.h:46

VARDATA_ANY
#define VARDATA_ANY(PTR)
Definition: varatt.h:324

VARSIZE_ANY_EXHDR
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317

cstring_to_text
text * cstring_to_text(const char *s)
Definition: varlena.c:192

name
const char * name
Definition: wait_event_funcs.c:28