daitch__mokotoff_8c_source.html

 /*

  * Daitch-Mokotoff Soundex

  *

  * Copyright (c) 2023-2024, PostgreSQL Global Development Group

  *

  * This module was originally sponsored by Finance Norway /

  * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>

  *

  * The implementation of the Daitch-Mokotoff Soundex System aims at correctness

  * and high performance, and can be summarized as follows:

  *

  * - The processing of each phoneme is initiated by an O(1) table lookup.

  * - For phonemes containing more than one character, a coding tree is traversed

  *   to process the complete phoneme.

  * - The (alternate) soundex codes are produced digit by digit in-place in

  *   another tree structure.

  *

  * References:

  *

  * https://www.avotaynu.com/soundex.htm

  * https://www.jewishgen.org/InfoFiles/Soundex.html

  * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex

  * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)

  * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)

  * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)

  *

  * A few notes on other implementations:

  *

  * - All other known implementations have the same unofficial rules for "UE",

  *   these are also adapted by this implementation (0, 1, NC).

  * - The only other known implementation which is capable of generating all

  *   correct soundex codes in all cases is the JOS Soundex Calculator at

  *   https://www.jewishgen.org/jos/jossound.htm

  * - "J" is considered (only) a vowel in dmlat.php

  * - The official rules for "RS" are commented out in dmlat.php

  * - Identical code digits for adjacent letters are not collapsed correctly in

  *   dmsoundex.php when double digit codes are involved. E.g. "BESST" yields

  *   744300 instead of 743000 as for "BEST".

  * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java

  * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java

 */


 #include "postgres.h"


 #include "catalog/pg_type.h"

 #include "mb/pg_wchar.h"

 #include "utils/array.h"

 #include "utils/builtins.h"

 #include "utils/memutils.h"


 /*

  * The soundex coding chart table is adapted from

  * https://www.jewishgen.org/InfoFiles/Soundex.html

  * See daitch_mokotoff_header.pl for details.

 */


 /* Generated coding chart table */

 #include "daitch_mokotoff.h"


 #define DM_CODE_DIGITS 6


 /* Node in soundex code tree */

 typedef struct dm_node

 {

     int         soundex_length; /* Length of generated soundex code */

     char        soundex[DM_CODE_DIGITS];    /* Soundex code */

     int         is_leaf;        /* Candidate for complete soundex code */

     int         last_update;    /* Letter number for last update of node */

     char        code_digit;     /* Last code digit, 0 - 9 */


     /*

      * One or two alternate code digits leading to this node. If there are two

      * digits, one of them is always an 'X'. Repeated code digits and 'X' lead

      * back to the same node.

      */

     char        prev_code_digits[2];

     /* One or two alternate code digits moving forward. */

     char        next_code_digits[2];

     /* ORed together code index(es) used to reach current node. */

     int         prev_code_index;

     int         next_code_index;

     /* Possible nodes branching out from this node - digits 0-9. */

     struct dm_node *children[10];

     /* Next node in linked list. Alternating index for each iteration. */

     struct dm_node *next[2];

 } dm_node;


 /* Template for new node in soundex code tree. */

 static const dm_node start_node = {

     .soundex_length = 0,

     .soundex = "000000",        /* Six digits */

     .is_leaf = 0,

     .last_update = 0,

     .code_digit = '\0',

     .prev_code_digits = {'\0', '\0'},

     .next_code_digits = {'\0', '\0'},

     .prev_code_index = 0,

     .next_code_index = 0,

     .children = {NULL},

     .next = {NULL}

 };


 /* Dummy soundex codes at end of input. */

 static const dm_codes end_codes[2] =

 {

     {

         "X", "X", "X"

     }

 };


 /* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */

 static const char iso8859_1_to_ascii_upper[] =

 "`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~                                  !                             ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY";


 /* Internal C implementation */

 static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex);


 PG_FUNCTION_INFO_V1(daitch_mokotoff);


 Datum

 daitch_mokotoff(PG_FUNCTION_ARGS)

 {

     text       *arg = PG_GETARG_TEXT_PP(0);

     Datum       retval;

     char       *string;

     ArrayBuildState *soundex;

     MemoryContext old_ctx,

                 tmp_ctx;


     /* Work in a temporary context to simplify cleanup. */

     tmp_ctx = AllocSetContextCreate(CurrentMemoryContext,

                                     "daitch_mokotoff temporary context",

                                     ALLOCSET_DEFAULT_SIZES);

     old_ctx = MemoryContextSwitchTo(tmp_ctx);


     /* We must convert the string to UTF-8 if it isn't already. */

     string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg),

                               PG_UTF8);


     /* The result is built in this ArrayBuildState. */

     soundex = initArrayResult(TEXTOID, tmp_ctx, false);


     if (!daitch_mokotoff_coding(string, soundex))

     {

         /* No encodable characters in input */

         MemoryContextSwitchTo(old_ctx);

         MemoryContextDelete(tmp_ctx);

         PG_RETURN_NULL();

     }


     retval = makeArrayResult(soundex, old_ctx);


     MemoryContextSwitchTo(old_ctx);

     MemoryContextDelete(tmp_ctx);


     PG_RETURN_DATUM(retval);

 }


 /* Initialize soundex code tree node for next code digit. */

 static void

 initialize_node(dm_node *node, int last_update)

 {

     if (node->last_update < last_update)

     {

         node->prev_code_digits[0] = node->next_code_digits[0];

         node->prev_code_digits[1] = node->next_code_digits[1];

         node->next_code_digits[0] = '\0';

         node->next_code_digits[1] = '\0';

         node->prev_code_index = node->next_code_index;

         node->next_code_index = 0;

         node->is_leaf = 0;

         node->last_update = last_update;

     }

 }


 /* Update soundex code tree node with next code digit. */

 static void

 add_next_code_digit(dm_node *node, int code_index, char code_digit)

 {

     /* OR in index 1 or 2. */

     node->next_code_index |= code_index;


     if (!node->next_code_digits[0])

         node->next_code_digits[0] = code_digit;

     else if (node->next_code_digits[0] != code_digit)

         node->next_code_digits[1] = code_digit;

 }


 /* Mark soundex code tree node as leaf. */

 static void

 set_leaf(dm_node *first_node[2], dm_node *last_node[2],

          dm_node *node, int ix_node)

 {

     if (!node->is_leaf)

     {

         node->is_leaf = 1;


         if (first_node[ix_node] == NULL)

             first_node[ix_node] = node;

         else

             last_node[ix_node]->next[ix_node] = node;


         last_node[ix_node] = node;

         node->next[ix_node] = NULL;

     }

 }


 /* Find next node corresponding to code digit, or create a new node. */

 static dm_node *

 find_or_create_child_node(dm_node *parent, char code_digit,

                           ArrayBuildState *soundex)

 {

     int         i = code_digit - '0';

     dm_node   **nodes = parent->children;

     dm_node    *node = nodes[i];


     if (node)

     {

         /* Found existing child node. Skip completed nodes. */

         return node->soundex_length < DM_CODE_DIGITS ? node : NULL;

     }


     /* Create new child node. */

     node = palloc_object(dm_node);

     nodes[i] = node;


     *node = start_node;

     memcpy(node->soundex, parent->soundex, sizeof(parent->soundex));

     node->soundex_length = parent->soundex_length;

     node->soundex[node->soundex_length++] = code_digit;

     node->code_digit = code_digit;

     node->next_code_index = node->prev_code_index;


     if (node->soundex_length < DM_CODE_DIGITS)

     {

         return node;

     }

     else

     {

         /* Append completed soundex code to output array. */

         text       *out = cstring_to_text_with_len(node->soundex,

                                                    DM_CODE_DIGITS);


         accumArrayResult(soundex,

                          PointerGetDatum(out),

                          false,

                          TEXTOID,

                          CurrentMemoryContext);

         return NULL;

     }

 }


 /* Update node for next code digit(s). */

 static void

 update_node(dm_node *first_node[2], dm_node *last_node[2],

             dm_node *node, int ix_node,

             int letter_no, int prev_code_index, int next_code_index,

             const char *next_code_digits, int digit_no,

             ArrayBuildState *soundex)

 {

     int         i;

     char        next_code_digit = next_code_digits[digit_no];

     int         num_dirty_nodes = 0;

     dm_node    *dirty_nodes[2];


     initialize_node(node, letter_no);


     if (node->prev_code_index && !(node->prev_code_index & prev_code_index))

     {

         /*

          * If the sound (vowel / consonant) of this letter encoding doesn't

          * correspond to the coding index of the previous letter, we skip this

          * letter encoding. Note that currently, only "J" can be either a

          * vowel or a consonant.

          */

         return;

     }


     if (next_code_digit == 'X' ||

         (digit_no == 0 &&

          (node->prev_code_digits[0] == next_code_digit ||

           node->prev_code_digits[1] == next_code_digit)))

     {

         /* The code digit is the same as one of the previous (i.e. not added). */

         dirty_nodes[num_dirty_nodes++] = node;

     }


     if (next_code_digit != 'X' &&

         (digit_no > 0 ||

          node->prev_code_digits[0] != next_code_digit ||

          node->prev_code_digits[1]))

     {

         /* The code digit is different from one of the previous (i.e. added). */

         node = find_or_create_child_node(node, next_code_digit, soundex);

         if (node)

         {

             initialize_node(node, letter_no);

             dirty_nodes[num_dirty_nodes++] = node;

         }

     }


     for (i = 0; i < num_dirty_nodes; i++)

     {

         /* Add code digit leading to the current node. */

         add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit);


         if (next_code_digits[++digit_no])

         {

             update_node(first_node, last_node, dirty_nodes[i], ix_node,

                         letter_no, prev_code_index, next_code_index,

                         next_code_digits, digit_no,

                         soundex);

         }

         else

         {

             /* Add incomplete leaf node to linked list. */

             set_leaf(first_node, last_node, dirty_nodes[i], ix_node);

         }

     }

 }


 /* Update soundex tree leaf nodes. */

 static void

 update_leaves(dm_node *first_node[2], int *ix_node, int letter_no,

               const dm_codes *codes, const dm_codes *next_codes,

               ArrayBuildState *soundex)

 {

     int         i,

                 j,

                 code_index;

     dm_node    *node,

                *last_node[2];

     const dm_code *code,

                *next_code;

     int         ix_node_next = (*ix_node + 1) & 1;  /* Alternating index: 0, 1 */


     /* Initialize for new linked list of leaves. */

     first_node[ix_node_next] = NULL;

     last_node[ix_node_next] = NULL;


     /* Process all nodes. */

     for (node = first_node[*ix_node]; node; node = node->next[*ix_node])

     {

         /* One or two alternate code sequences. */

         for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++)

         {

             /* Coding for previous letter - before vowel: 1, all other: 2 */

             int         prev_code_index = (code[0][0] > '1') + 1;


             /* One or two alternate next code sequences. */

             for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++)

             {

                 /* Determine which code to use. */

                 if (letter_no == 0)

                 {

                     /* This is the first letter. */

                     code_index = 0;

                 }

                 else if (next_code[0][0] <= '1')

                 {

                     /* The next letter is a vowel. */

                     code_index = 1;

                 }

                 else

                 {

                     /* All other cases. */

                     code_index = 2;

                 }


                 /* One or two sequential code digits. */

                 update_node(first_node, last_node, node, ix_node_next,

                             letter_no, prev_code_index, code_index,

                             code[code_index], 0,

                             soundex);

             }

         }

     }


     *ix_node = ix_node_next;

 }


 /*

  * Return next character, converted from UTF-8 to uppercase ASCII.

  * *ix is the current string index and is incremented by the character length.

  */

 static char

 read_char(const unsigned char *str, int *ix)

 {

     /* Substitute character for skipped code points. */

     const char  na = '\x1a';

     pg_wchar    c;


     /* Decode UTF-8 character to ISO 10646 code point. */

     str += *ix;

     c = utf8_to_unicode(str);


     /* Advance *ix, but (for safety) not if we've reached end of string. */

     if (c)

         *ix += pg_utf_mblen(str);


     /* Convert. */

     if (c >= (unsigned char) '[' && c <= (unsigned char) ']')

     {

         /* ASCII characters [, \, and ] are reserved for conversions below. */

         return na;

     }

     else if (c < 0x60)

     {

         /* Other non-lowercase ASCII characters can be used as-is. */

         return (char) c;

     }

     else if (c < 0x100)

     {

         /* ISO-8859-1 code point; convert to upper-case ASCII via table. */

         return iso8859_1_to_ascii_upper[c - 0x60];

     }

     else

     {

         /* Conversion of non-ASCII characters in the coding chart. */

         switch (c)

         {

             case 0x0104:        /* LATIN CAPITAL LETTER A WITH OGONEK */

             case 0x0105:        /* LATIN SMALL LETTER A WITH OGONEK */

                 return '[';

             case 0x0118:        /* LATIN CAPITAL LETTER E WITH OGONEK */

             case 0x0119:        /* LATIN SMALL LETTER E WITH OGONEK */

                 return '\\';

             case 0x0162:        /* LATIN CAPITAL LETTER T WITH CEDILLA */

             case 0x0163:        /* LATIN SMALL LETTER T WITH CEDILLA */

             case 0x021A:        /* LATIN CAPITAL LETTER T WITH COMMA BELOW */

             case 0x021B:        /* LATIN SMALL LETTER T WITH COMMA BELOW */

                 return ']';

             default:

                 return na;

         }

     }

 }


 /* Read next ASCII character, skipping any characters not in [A-\]]. */

 static char

 read_valid_char(const char *str, int *ix)

 {

     char        c;


     while ((c = read_char((const unsigned char *) str, ix)) != '\0')

     {

         if (c >= 'A' && c <= ']')

             break;

     }


     return c;

 }


 /* Return sound coding for "letter" (letter sequence) */

 static const dm_codes *

 read_letter(const char *str, int *ix)

 {

     char        c,

                 cmp;

     int         i,

                 j;

     const dm_letter *letters;

     const dm_codes *codes;


     /* First letter in sequence. */

     if ((c = read_valid_char(str, ix)) == '\0')

         return NULL;


     letters = &letter_[c - 'A'];

     codes = letters->codes;

     i = *ix;


     /* Any subsequent letters in sequence. */

     while ((letters = letters->letters) && (c = read_valid_char(str, &i)))

     {

         for (j = 0; (cmp = letters[j].letter); j++)

         {

             if (cmp == c)

             {

                 /* Letter found. */

                 letters = &letters[j];

                 if (letters->codes)

                 {

                     /* Coding for letter sequence found. */

                     codes = letters->codes;

                     *ix = i;

                 }

                 break;

             }

         }

         if (!cmp)

         {

             /* The sequence of letters has no coding. */

             break;

         }

     }


     return codes;

 }


 /*

  * Generate all Daitch-Mokotoff soundex codes for word,

  * adding them to the "soundex" ArrayBuildState.

  * Returns false if string has no encodable characters, else true.

  */

 static bool

 daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)

 {

     int         i = 0;

     int         letter_no = 0;

     int         ix_node = 0;

     const dm_codes *codes,

                *next_codes;

     dm_node    *first_node[2],

                *node;


     /* First letter. */

     if (!(codes = read_letter(word, &i)))

     {

         /* No encodable character in input. */

         return false;

     }


     /* Starting point. */

     first_node[ix_node] = palloc_object(dm_node);

     *first_node[ix_node] = start_node;


     /*

      * Loop until either the word input is exhausted, or all generated soundex

      * codes are completed to six digits.

      */

     while (codes && first_node[ix_node])

     {

         next_codes = read_letter(word, &i);


         /* Update leaf nodes. */

         update_leaves(first_node, &ix_node, letter_no,

                       codes, next_codes ? next_codes : end_codes,

                       soundex);


         codes = next_codes;

         letter_no++;

     }


     /* Append all remaining (incomplete) soundex codes to output array. */

     for (node = first_node[ix_node]; node; node = node->next[ix_node])

     {

         text       *out = cstring_to_text_with_len(node->soundex,

                                                    DM_CODE_DIGITS);


         accumArrayResult(soundex,

                          PointerGetDatum(out),

                          false,

                          TEXTOID,

                          CurrentMemoryContext);

     }


     return true;

 }

array.h

accumArrayResult
ArrayBuildState * accumArrayResult(ArrayBuildState *astate, Datum dvalue, bool disnull, Oid element_type, MemoryContext rcontext)
Definition: arrayfuncs.c:5338

initArrayResult
ArrayBuildState * initArrayResult(Oid element_type, MemoryContext rcontext, bool subcontext)
Definition: arrayfuncs.c:5281

makeArrayResult
Datum makeArrayResult(ArrayBuildState *astate, MemoryContext rcontext)
Definition: arrayfuncs.c:5408

builtins.h

daitch_mokotoff_coding
static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)
Definition: daitch_mokotoff.c:519

PG_FUNCTION_INFO_V1
PG_FUNCTION_INFO_V1(daitch_mokotoff)

iso8859_1_to_ascii_upper
static const char iso8859_1_to_ascii_upper[]
Definition: daitch_mokotoff.c:113

update_node
static void update_node(dm_node *first_node[2], dm_node *last_node[2], dm_node *node, int ix_node, int letter_no, int prev_code_index, int next_code_index, const char *next_code_digits, int digit_no, ArrayBuildState *soundex)
Definition: daitch_mokotoff.c:262

dm_node
struct dm_node dm_node

find_or_create_child_node
static dm_node * find_or_create_child_node(dm_node *parent, char code_digit, ArrayBuildState *soundex)
Definition: daitch_mokotoff.c:216

start_node
static const dm_node start_node
Definition: daitch_mokotoff.c:90

daitch_mokotoff
Datum daitch_mokotoff(PG_FUNCTION_ARGS)
Definition: daitch_mokotoff.c:123

end_codes
static const dm_codes end_codes[2]
Definition: daitch_mokotoff.c:105

read_char
static char read_char(const unsigned char *str, int *ix)
Definition: daitch_mokotoff.c:396

update_leaves
static void update_leaves(dm_node *first_node[2], int *ix_node, int letter_no, const dm_codes *codes, const dm_codes *next_codes, ArrayBuildState *soundex)
Definition: daitch_mokotoff.c:332

DM_CODE_DIGITS
#define DM_CODE_DIGITS
Definition: daitch_mokotoff.c:61

read_letter
static const dm_codes * read_letter(const char *str, int *ix)
Definition: daitch_mokotoff.c:467

add_next_code_digit
static void add_next_code_digit(dm_node *node, int code_index, char code_digit)
Definition: daitch_mokotoff.c:182

read_valid_char
static char read_valid_char(const char *str, int *ix)
Definition: daitch_mokotoff.c:451

set_leaf
static void set_leaf(dm_node *first_node[2], dm_node *last_node[2], dm_node *node, int ix_node)
Definition: daitch_mokotoff.c:196

initialize_node
static void initialize_node(dm_node *node, int last_update)
Definition: daitch_mokotoff.c:164

palloc_object
#define palloc_object(type)
Definition: fe_memutils.h:62

PG_GETARG_TEXT_PP
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309

PG_RETURN_NULL
#define PG_RETURN_NULL()
Definition: fmgr.h:345

PG_RETURN_DATUM
#define PG_RETURN_DATUM(x)
Definition: fmgr.h:353

PG_FUNCTION_ARGS
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193

soundex
Datum soundex(PG_FUNCTION_ARGS)
Definition: fuzzystrmatch.c:711

str
const char * str
Definition: hashfn_unstable.h:263

j
int j
Definition: isn.c:74

i
int i
Definition: isn.c:73

utf8_to_unicode
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53

pg_wchar
unsigned int pg_wchar
Definition: mbprint.c:31

pg_server_to_any
char * pg_server_to_any(const char *s, int len, int encoding)
Definition: mbutils.c:749

CurrentMemoryContext
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143

MemoryContextDelete
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454

memutils.h

AllocSetContextCreate
#define AllocSetContextCreate
Definition: memutils.h:129

ALLOCSET_DEFAULT_SIZES
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160

arg
void * arg
Definition: pg_backup_utils.c:27

pg_type.h

pg_wchar.h

pg_utf_mblen
#define pg_utf_mblen
Definition: pg_wchar.h:633

PG_UTF8
@ PG_UTF8
Definition: pg_wchar.h:232

postgres.h

PointerGetDatum
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322

Datum
uintptr_t Datum
Definition: postgres.h:64

c
char * c
Definition: preproc-cursor.c:31

string
char string[11]
Definition: preproc-type.c:52

old_ctx
RT_SCOPE RT_RADIX_TREE *MemoryContext old_ctx
Definition: radixtree.h:1824

MemoryContextSwitchTo
MemoryContextSwitchTo(old_ctx)

cmp
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743

word
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1474

ArrayBuildState
Definition: array.h:188

MemoryContextData
Definition: memnodes.h:118

dm_node
Definition: daitch_mokotoff.c:65

dm_node::is_leaf
int is_leaf
Definition: daitch_mokotoff.c:68

dm_node::prev_code_digits
char prev_code_digits[2]
Definition: daitch_mokotoff.c:77

dm_node::soundex_length
int soundex_length
Definition: daitch_mokotoff.c:66

dm_node::last_update
int last_update
Definition: daitch_mokotoff.c:69

dm_node::children
struct dm_node * children[10]
Definition: daitch_mokotoff.c:84

dm_node::next
struct dm_node * next[2]
Definition: daitch_mokotoff.c:86

dm_node::prev_code_index
int prev_code_index
Definition: daitch_mokotoff.c:81

dm_node::next_code_digits
char next_code_digits[2]
Definition: daitch_mokotoff.c:79

dm_node::soundex
char soundex[DM_CODE_DIGITS]
Definition: daitch_mokotoff.c:67

dm_node::next_code_index
int next_code_index
Definition: daitch_mokotoff.c:82

dm_node::code_digit
char code_digit
Definition: daitch_mokotoff.c:70

varlena
Definition: c.h:687

VARSIZE_ANY_EXHDR
#define VARSIZE_ANY_EXHDR(PTR)
Definition: varatt.h:317

text_to_cstring
char * text_to_cstring(const text *t)
Definition: varlena.c:217

cstring_to_text_with_len
text * cstring_to_text_with_len(const char *s, int len)
Definition: varlena.c:196