PostgreSQL Source Code  git master
dmetaphone.c
Go to the documentation of this file.
1 /*
2  * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
3  *
4  * contrib/fuzzystrmatch/dmetaphone.c
5  *
6  * Double Metaphone computes 2 "sounds like" strings - a primary and an
7  * alternate. In most cases they are the same, but for foreign names
8  * especially they can be a bit different, depending on pronunciation.
9  *
10  * Information on using Double Metaphone can be found at
11  * http://www.codeproject.com/string/dmetaphone1.asp
12  * and the original article describing it can be found at
13  * http://drdobbs.com/184401251
14  *
15  * For PostgreSQL we provide 2 functions - one for the primary and one for
16  * the alternate. That way the functions are pure text->text mappings that
17  * are useful in functional indexes. These are 'dmetaphone' for the
18  * primary and 'dmetaphone_alt' for the alternate.
19  *
20  * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21  * functions looks like this:
22  *
23  * CREATE FUNCTION dmetaphone (text) RETURNS text
24  * LANGUAGE C IMMUTABLE STRICT
25  * AS '$libdir/dmetaphone', 'dmetaphone';
26  *
27  * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28  * LANGUAGE C IMMUTABLE STRICT
29  * AS '$libdir/dmetaphone', 'dmetaphone_alt';
30  *
31  * Note that you have to declare the functions IMMUTABLE if you want to
32  * use them in functional indexes, and you have to declare them as STRICT
33  * as they do not check for NULL input, and will segfault if given NULL input.
34  * (See below for alternative ) Declaring them as STRICT means PostgreSQL
35  * will never call them with NULL, but instead assume the result is NULL,
36  * which is what we (I) want.
37  *
38  * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39  * will detect NULL input and return NULL. The you don't have to declare them
40  * as STRICT.
41  *
42  * There is a small inefficiency here - each function call actually computes
43  * both the primary and the alternate and then throws away the one it doesn't
44  * need. That's the way the perl module was written, because perl can handle
45  * a list return more easily than we can in PostgreSQL. The result has been
46  * fast enough for my needs, but it could maybe be optimized a bit to remove
47  * that behaviour.
48  *
49  */
50 
51 
52 /***************************** COPYRIGHT NOTICES ***********************
53 
54 Most of this code is directly from the Text::DoubleMetaphone perl module
55 version 0.05 available from http://www.cpan.org.
56 It bears this copyright notice:
57 
58 
59  Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60  All rights reserved.
61 
62  This code is based heavily on the C++ implementation by
63  Lawrence Philips and incorporates several bug fixes courtesy
64  of Kevin Atkinson <kevina@users.sourceforge.net>.
65 
66  This module is free software; you may redistribute it and/or
67  modify it under the same terms as Perl itself.
68 
69 The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70 <andrew@dunslane.net> and is covered this copyright:
71 
72  Copyright 2003, North Carolina State Highway Patrol.
73  All rights reserved.
74 
75  Permission to use, copy, modify, and distribute this software and its
76  documentation for any purpose, without fee, and without a written agreement
77  is hereby granted, provided that the above copyright notice and this
78  paragraph and the following two paragraphs appear in all copies.
79 
80  IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81  PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82  INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83  DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 
86  THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89  HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90  HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91  MODIFICATIONS.
92 
93 ***********************************************************************/
94 
95 
96 /* include these first, according to the docs */
97 #ifndef DMETAPHONE_MAIN
98 
99 #include "postgres.h"
100 
101 #include "utils/builtins.h"
102 
103 /* turn off assertions for embedded function */
104 #define NDEBUG
105 
106 #else /* DMETAPHONE_MAIN */
107 
108 /* we need these if we didn't get them from postgres.h */
109 #include <stdio.h>
110 #include <stdlib.h>
111 #include <string.h>
112 #include <stdarg.h>
113 
114 #endif /* DMETAPHONE_MAIN */
115 
116 #include <assert.h>
117 #include <ctype.h>
118 
119 /* prototype for the main function we got from the perl module */
120 static void DoubleMetaphone(char *, char **);
121 
122 #ifndef DMETAPHONE_MAIN
123 
124 /*
125  * The PostgreSQL visible dmetaphone function.
126  */
127 
129 
130 Datum
132 {
133  text *arg;
134  char *aptr,
135  *codes[2],
136  *code;
137 
138 #ifdef DMETAPHONE_NOSTRICT
139  if (PG_ARGISNULL(0))
140  PG_RETURN_NULL();
141 #endif
142  arg = PG_GETARG_TEXT_PP(0);
143  aptr = text_to_cstring(arg);
144 
145  DoubleMetaphone(aptr, codes);
146  code = codes[0];
147  if (!code)
148  code = "";
149 
151 }
152 
153 /*
154  * The PostgreSQL visible dmetaphone_alt function.
155  */
156 
158 
159 Datum
161 {
162  text *arg;
163  char *aptr,
164  *codes[2],
165  *code;
166 
167 #ifdef DMETAPHONE_NOSTRICT
168  if (PG_ARGISNULL(0))
169  PG_RETURN_NULL();
170 #endif
171  arg = PG_GETARG_TEXT_PP(0);
172  aptr = text_to_cstring(arg);
173 
174  DoubleMetaphone(aptr, codes);
175  code = codes[1];
176  if (!code)
177  code = "";
178 
180 }
181 
182 
183 /* here is where we start the code imported from the perl module */
184 
185 /* all memory handling is done with these macros */
186 
187 #define META_MALLOC(v,n,t) \
188  (v = (t*)palloc(((n)*sizeof(t))))
189 
190 #define META_REALLOC(v,n,t) \
191  (v = (t*)repalloc((v),((n)*sizeof(t))))
192 
193 /*
194  * Don't do pfree - it seems to cause a segv sometimes - which might have just
195  * been caused by reloading the module in development.
196  * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
197  * in a case like this.
198  */
199 
200 #define META_FREE(x) ((void)true) /* pfree((x)) */
201 #else /* not defined DMETAPHONE_MAIN */
202 
203 /* use the standard malloc library when not running in PostgreSQL */
204 
205 #define META_MALLOC(v,n,t) \
206  (v = (t*)malloc(((n)*sizeof(t))))
207 
208 #define META_REALLOC(v,n,t) \
209  (v = (t*)realloc((v),((n)*sizeof(t))))
210 
211 #define META_FREE(x) free((x))
212 #endif /* defined DMETAPHONE_MAIN */
213 
214 
215 
216 /* this typedef was originally in the perl module's .h file */
217 
218 typedef struct
219 {
220  char *str;
221  int length;
222  int bufsize;
224 }
225 
226 metastring;
227 
228 /*
229  * remaining perl module funcs unchanged except for declaring them static
230  * and reformatting to PostgreSQL indentation and to fit in 80 cols.
231  *
232  */
233 
234 static metastring *
235 NewMetaString(const char *init_str)
236 {
237  metastring *s;
238  char empty_string[] = "";
239 
240  META_MALLOC(s, 1, metastring);
241  assert(s != NULL);
242 
243  if (init_str == NULL)
244  init_str = empty_string;
245  s->length = strlen(init_str);
246  /* preallocate a bit more for potential growth */
247  s->bufsize = s->length + 7;
248 
249  META_MALLOC(s->str, s->bufsize, char);
250  assert(s->str != NULL);
251 
252  memcpy(s->str, init_str, s->length + 1);
253  s->free_string_on_destroy = 1;
254 
255  return s;
256 }
257 
258 
259 static void
261 {
262  if (s == NULL)
263  return;
264 
265  if (s->free_string_on_destroy && (s->str != NULL))
266  META_FREE(s->str);
267 
268  META_FREE(s);
269 }
270 
271 
272 static void
273 IncreaseBuffer(metastring *s, int chars_needed)
274 {
275  META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
276  assert(s->str != NULL);
277  s->bufsize = s->bufsize + chars_needed + 10;
278 }
279 
280 
281 static void
283 {
284  char *i;
285 
286  for (i = s->str; *i; i++)
287  *i = toupper((unsigned char) *i);
288 }
289 
290 
291 static int
292 IsVowel(metastring *s, int pos)
293 {
294  char c;
295 
296  if ((pos < 0) || (pos >= s->length))
297  return 0;
298 
299  c = *(s->str + pos);
300  if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
301  (c == 'U') || (c == 'Y'))
302  return 1;
303 
304  return 0;
305 }
306 
307 
308 static int
310 {
311  if ((char *) strstr(s->str, "W"))
312  return 1;
313  else if ((char *) strstr(s->str, "K"))
314  return 1;
315  else if ((char *) strstr(s->str, "CZ"))
316  return 1;
317  else if ((char *) strstr(s->str, "WITZ"))
318  return 1;
319  else
320  return 0;
321 }
322 
323 
324 static char
325 GetAt(metastring *s, int pos)
326 {
327  if ((pos < 0) || (pos >= s->length))
328  return '\0';
329 
330  return ((char) *(s->str + pos));
331 }
332 
333 
334 static void
335 SetAt(metastring *s, int pos, char c)
336 {
337  if ((pos < 0) || (pos >= s->length))
338  return;
339 
340  *(s->str + pos) = c;
341 }
342 
343 
344 /*
345  Caveats: the START value is 0 based
346 */
347 static int
348 StringAt(metastring *s, int start, int length,...)
349 {
350  char *test;
351  char *pos;
352  va_list ap;
353 
354  if ((start < 0) || (start >= s->length))
355  return 0;
356 
357  pos = (s->str + start);
358  va_start(ap, length);
359 
360  do
361  {
362  test = va_arg(ap, char *);
363  if (*test && (strncmp(pos, test, length) == 0))
364  {
365  va_end(ap);
366  return 1;
367  }
368  }
369  while (strcmp(test, "") != 0);
370 
371  va_end(ap);
372 
373  return 0;
374 }
375 
376 
377 static void
378 MetaphAdd(metastring *s, const char *new_str)
379 {
380  int add_length;
381 
382  if (new_str == NULL)
383  return;
384 
385  add_length = strlen(new_str);
386  if ((s->length + add_length) > (s->bufsize - 1))
387  IncreaseBuffer(s, add_length);
388 
389  strcat(s->str, new_str);
390  s->length += add_length;
391 }
392 
393 
394 static void
395 DoubleMetaphone(char *str, char **codes)
396 {
397  int length;
398  metastring *original;
399  metastring *primary;
400  metastring *secondary;
401  int current;
402  int last;
403 
404  current = 0;
405  /* we need the real length and last prior to padding */
406  length = strlen(str);
407  last = length - 1;
408  original = NewMetaString(str);
409  /* Pad original so we can index beyond end */
410  MetaphAdd(original, " ");
411 
412  primary = NewMetaString("");
413  secondary = NewMetaString("");
414  primary->free_string_on_destroy = 0;
415  secondary->free_string_on_destroy = 0;
416 
417  MakeUpper(original);
418 
419  /* skip these when at start of word */
420  if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
421  current += 1;
422 
423  /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424  if (GetAt(original, 0) == 'X')
425  {
426  MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
427  MetaphAdd(secondary, "S");
428  current += 1;
429  }
430 
431  /* main loop */
432  while ((primary->length < 4) || (secondary->length < 4))
433  {
434  if (current >= length)
435  break;
436 
437  switch (GetAt(original, current))
438  {
439  case 'A':
440  case 'E':
441  case 'I':
442  case 'O':
443  case 'U':
444  case 'Y':
445  if (current == 0)
446  {
447  /* all init vowels now map to 'A' */
448  MetaphAdd(primary, "A");
449  MetaphAdd(secondary, "A");
450  }
451  current += 1;
452  break;
453 
454  case 'B':
455 
456  /* "-mb", e.g", "dumb", already skipped over... */
457  MetaphAdd(primary, "P");
458  MetaphAdd(secondary, "P");
459 
460  if (GetAt(original, current + 1) == 'B')
461  current += 2;
462  else
463  current += 1;
464  break;
465 
466  case '\xc7': /* C with cedilla */
467  MetaphAdd(primary, "S");
468  MetaphAdd(secondary, "S");
469  current += 1;
470  break;
471 
472  case 'C':
473  /* various germanic */
474  if ((current > 1)
475  && !IsVowel(original, current - 2)
476  && StringAt(original, (current - 1), 3, "ACH", "")
477  && ((GetAt(original, current + 2) != 'I')
478  && ((GetAt(original, current + 2) != 'E')
479  || StringAt(original, (current - 2), 6, "BACHER",
480  "MACHER", ""))))
481  {
482  MetaphAdd(primary, "K");
483  MetaphAdd(secondary, "K");
484  current += 2;
485  break;
486  }
487 
488  /* special case 'caesar' */
489  if ((current == 0)
490  && StringAt(original, current, 6, "CAESAR", ""))
491  {
492  MetaphAdd(primary, "S");
493  MetaphAdd(secondary, "S");
494  current += 2;
495  break;
496  }
497 
498  /* italian 'chianti' */
499  if (StringAt(original, current, 4, "CHIA", ""))
500  {
501  MetaphAdd(primary, "K");
502  MetaphAdd(secondary, "K");
503  current += 2;
504  break;
505  }
506 
507  if (StringAt(original, current, 2, "CH", ""))
508  {
509  /* find 'michael' */
510  if ((current > 0)
511  && StringAt(original, current, 4, "CHAE", ""))
512  {
513  MetaphAdd(primary, "K");
514  MetaphAdd(secondary, "X");
515  current += 2;
516  break;
517  }
518 
519  /* greek roots e.g. 'chemistry', 'chorus' */
520  if ((current == 0)
521  && (StringAt(original, (current + 1), 5,
522  "HARAC", "HARIS", "")
523  || StringAt(original, (current + 1), 3, "HOR",
524  "HYM", "HIA", "HEM", ""))
525  && !StringAt(original, 0, 5, "CHORE", ""))
526  {
527  MetaphAdd(primary, "K");
528  MetaphAdd(secondary, "K");
529  current += 2;
530  break;
531  }
532 
533  /* germanic, greek, or otherwise 'ch' for 'kh' sound */
534  if (
535  (StringAt(original, 0, 4, "VAN ", "VON ", "")
536  || StringAt(original, 0, 3, "SCH", ""))
537  /* 'architect but not 'arch', 'orchestra', 'orchid' */
538  || StringAt(original, (current - 2), 6, "ORCHES",
539  "ARCHIT", "ORCHID", "")
540  || StringAt(original, (current + 2), 1, "T", "S",
541  "")
542  || ((StringAt(original, (current - 1), 1,
543  "A", "O", "U", "E", "")
544  || (current == 0))
545 
546  /*
547  * e.g., 'wachtler', 'wechsler', but not 'tichner'
548  */
549  && StringAt(original, (current + 2), 1, "L", "R",
550  "N", "M", "B", "H", "F", "V", "W",
551  " ", "")))
552  {
553  MetaphAdd(primary, "K");
554  MetaphAdd(secondary, "K");
555  }
556  else
557  {
558  if (current > 0)
559  {
560  if (StringAt(original, 0, 2, "MC", ""))
561  {
562  /* e.g., "McHugh" */
563  MetaphAdd(primary, "K");
564  MetaphAdd(secondary, "K");
565  }
566  else
567  {
568  MetaphAdd(primary, "X");
569  MetaphAdd(secondary, "K");
570  }
571  }
572  else
573  {
574  MetaphAdd(primary, "X");
575  MetaphAdd(secondary, "X");
576  }
577  }
578  current += 2;
579  break;
580  }
581  /* e.g, 'czerny' */
582  if (StringAt(original, current, 2, "CZ", "")
583  && !StringAt(original, (current - 2), 4, "WICZ", ""))
584  {
585  MetaphAdd(primary, "S");
586  MetaphAdd(secondary, "X");
587  current += 2;
588  break;
589  }
590 
591  /* e.g., 'focaccia' */
592  if (StringAt(original, (current + 1), 3, "CIA", ""))
593  {
594  MetaphAdd(primary, "X");
595  MetaphAdd(secondary, "X");
596  current += 3;
597  break;
598  }
599 
600  /* double 'C', but not if e.g. 'McClellan' */
601  if (StringAt(original, current, 2, "CC", "")
602  && !((current == 1) && (GetAt(original, 0) == 'M')))
603  {
604  /* 'bellocchio' but not 'bacchus' */
605  if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
606  && !StringAt(original, (current + 2), 2, "HU", ""))
607  {
608  /* 'accident', 'accede' 'succeed' */
609  if (
610  ((current == 1)
611  && (GetAt(original, current - 1) == 'A'))
612  || StringAt(original, (current - 1), 5, "UCCEE",
613  "UCCES", ""))
614  {
615  MetaphAdd(primary, "KS");
616  MetaphAdd(secondary, "KS");
617  /* 'bacci', 'bertucci', other italian */
618  }
619  else
620  {
621  MetaphAdd(primary, "X");
622  MetaphAdd(secondary, "X");
623  }
624  current += 3;
625  break;
626  }
627  else
628  { /* Pierce's rule */
629  MetaphAdd(primary, "K");
630  MetaphAdd(secondary, "K");
631  current += 2;
632  break;
633  }
634  }
635 
636  if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
637  {
638  MetaphAdd(primary, "K");
639  MetaphAdd(secondary, "K");
640  current += 2;
641  break;
642  }
643 
644  if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
645  {
646  /* italian vs. english */
647  if (StringAt
648  (original, current, 3, "CIO", "CIE", "CIA", ""))
649  {
650  MetaphAdd(primary, "S");
651  MetaphAdd(secondary, "X");
652  }
653  else
654  {
655  MetaphAdd(primary, "S");
656  MetaphAdd(secondary, "S");
657  }
658  current += 2;
659  break;
660  }
661 
662  /* else */
663  MetaphAdd(primary, "K");
664  MetaphAdd(secondary, "K");
665 
666  /* name sent in 'mac caffrey', 'mac gregor */
667  if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
668  current += 3;
669  else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
670  && !StringAt(original, (current + 1), 2,
671  "CE", "CI", ""))
672  current += 2;
673  else
674  current += 1;
675  break;
676 
677  case 'D':
678  if (StringAt(original, current, 2, "DG", ""))
679  {
680  if (StringAt(original, (current + 2), 1,
681  "I", "E", "Y", ""))
682  {
683  /* e.g. 'edge' */
684  MetaphAdd(primary, "J");
685  MetaphAdd(secondary, "J");
686  current += 3;
687  break;
688  }
689  else
690  {
691  /* e.g. 'edgar' */
692  MetaphAdd(primary, "TK");
693  MetaphAdd(secondary, "TK");
694  current += 2;
695  break;
696  }
697  }
698 
699  if (StringAt(original, current, 2, "DT", "DD", ""))
700  {
701  MetaphAdd(primary, "T");
702  MetaphAdd(secondary, "T");
703  current += 2;
704  break;
705  }
706 
707  /* else */
708  MetaphAdd(primary, "T");
709  MetaphAdd(secondary, "T");
710  current += 1;
711  break;
712 
713  case 'F':
714  if (GetAt(original, current + 1) == 'F')
715  current += 2;
716  else
717  current += 1;
718  MetaphAdd(primary, "F");
719  MetaphAdd(secondary, "F");
720  break;
721 
722  case 'G':
723  if (GetAt(original, current + 1) == 'H')
724  {
725  if ((current > 0) && !IsVowel(original, current - 1))
726  {
727  MetaphAdd(primary, "K");
728  MetaphAdd(secondary, "K");
729  current += 2;
730  break;
731  }
732 
733  if (current < 3)
734  {
735  /* 'ghislane', ghiradelli */
736  if (current == 0)
737  {
738  if (GetAt(original, current + 2) == 'I')
739  {
740  MetaphAdd(primary, "J");
741  MetaphAdd(secondary, "J");
742  }
743  else
744  {
745  MetaphAdd(primary, "K");
746  MetaphAdd(secondary, "K");
747  }
748  current += 2;
749  break;
750  }
751  }
752 
753  /*
754  * Parker's rule (with some further refinements) - e.g.,
755  * 'hugh'
756  */
757  if (
758  ((current > 1)
759  && StringAt(original, (current - 2), 1,
760  "B", "H", "D", ""))
761  /* e.g., 'bough' */
762  || ((current > 2)
763  && StringAt(original, (current - 3), 1,
764  "B", "H", "D", ""))
765  /* e.g., 'broughton' */
766  || ((current > 3)
767  && StringAt(original, (current - 4), 1,
768  "B", "H", "")))
769  {
770  current += 2;
771  break;
772  }
773  else
774  {
775  /*
776  * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
777  * 'rough', 'tough'
778  */
779  if ((current > 2)
780  && (GetAt(original, current - 1) == 'U')
781  && StringAt(original, (current - 3), 1, "C",
782  "G", "L", "R", "T", ""))
783  {
784  MetaphAdd(primary, "F");
785  MetaphAdd(secondary, "F");
786  }
787  else if ((current > 0)
788  && GetAt(original, current - 1) != 'I')
789  {
790 
791 
792  MetaphAdd(primary, "K");
793  MetaphAdd(secondary, "K");
794  }
795 
796  current += 2;
797  break;
798  }
799  }
800 
801  if (GetAt(original, current + 1) == 'N')
802  {
803  if ((current == 1) && IsVowel(original, 0)
804  && !SlavoGermanic(original))
805  {
806  MetaphAdd(primary, "KN");
807  MetaphAdd(secondary, "N");
808  }
809  else
810  /* not e.g. 'cagney' */
811  if (!StringAt(original, (current + 2), 2, "EY", "")
812  && (GetAt(original, current + 1) != 'Y')
813  && !SlavoGermanic(original))
814  {
815  MetaphAdd(primary, "N");
816  MetaphAdd(secondary, "KN");
817  }
818  else
819  {
820  MetaphAdd(primary, "KN");
821  MetaphAdd(secondary, "KN");
822  }
823  current += 2;
824  break;
825  }
826 
827  /* 'tagliaro' */
828  if (StringAt(original, (current + 1), 2, "LI", "")
829  && !SlavoGermanic(original))
830  {
831  MetaphAdd(primary, "KL");
832  MetaphAdd(secondary, "L");
833  current += 2;
834  break;
835  }
836 
837  /* -ges-,-gep-,-gel-, -gie- at beginning */
838  if ((current == 0)
839  && ((GetAt(original, current + 1) == 'Y')
840  || StringAt(original, (current + 1), 2, "ES", "EP",
841  "EB", "EL", "EY", "IB", "IL", "IN", "IE",
842  "EI", "ER", "")))
843  {
844  MetaphAdd(primary, "K");
845  MetaphAdd(secondary, "J");
846  current += 2;
847  break;
848  }
849 
850  /* -ger-, -gy- */
851  if (
852  (StringAt(original, (current + 1), 2, "ER", "")
853  || (GetAt(original, current + 1) == 'Y'))
854  && !StringAt(original, 0, 6,
855  "DANGER", "RANGER", "MANGER", "")
856  && !StringAt(original, (current - 1), 1, "E", "I", "")
857  && !StringAt(original, (current - 1), 3, "RGY", "OGY",
858  ""))
859  {
860  MetaphAdd(primary, "K");
861  MetaphAdd(secondary, "J");
862  current += 2;
863  break;
864  }
865 
866  /* italian e.g, 'biaggi' */
867  if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
868  || StringAt(original, (current - 1), 4,
869  "AGGI", "OGGI", ""))
870  {
871  /* obvious germanic */
872  if (
873  (StringAt(original, 0, 4, "VAN ", "VON ", "")
874  || StringAt(original, 0, 3, "SCH", ""))
875  || StringAt(original, (current + 1), 2, "ET", ""))
876  {
877  MetaphAdd(primary, "K");
878  MetaphAdd(secondary, "K");
879  }
880  else
881  {
882  /* always soft if french ending */
883  if (StringAt
884  (original, (current + 1), 4, "IER ", ""))
885  {
886  MetaphAdd(primary, "J");
887  MetaphAdd(secondary, "J");
888  }
889  else
890  {
891  MetaphAdd(primary, "J");
892  MetaphAdd(secondary, "K");
893  }
894  }
895  current += 2;
896  break;
897  }
898 
899  if (GetAt(original, current + 1) == 'G')
900  current += 2;
901  else
902  current += 1;
903  MetaphAdd(primary, "K");
904  MetaphAdd(secondary, "K");
905  break;
906 
907  case 'H':
908  /* only keep if first & before vowel or btw. 2 vowels */
909  if (((current == 0) || IsVowel(original, current - 1))
910  && IsVowel(original, current + 1))
911  {
912  MetaphAdd(primary, "H");
913  MetaphAdd(secondary, "H");
914  current += 2;
915  }
916  else
917  /* also takes care of 'HH' */
918  current += 1;
919  break;
920 
921  case 'J':
922  /* obvious spanish, 'jose', 'san jacinto' */
923  if (StringAt(original, current, 4, "JOSE", "")
924  || StringAt(original, 0, 4, "SAN ", ""))
925  {
926  if (((current == 0)
927  && (GetAt(original, current + 4) == ' '))
928  || StringAt(original, 0, 4, "SAN ", ""))
929  {
930  MetaphAdd(primary, "H");
931  MetaphAdd(secondary, "H");
932  }
933  else
934  {
935  MetaphAdd(primary, "J");
936  MetaphAdd(secondary, "H");
937  }
938  current += 1;
939  break;
940  }
941 
942  if ((current == 0)
943  && !StringAt(original, current, 4, "JOSE", ""))
944  {
945  MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
946  MetaphAdd(secondary, "A");
947  }
948  else
949  {
950  /* spanish pron. of e.g. 'bajador' */
951  if (IsVowel(original, current - 1)
952  && !SlavoGermanic(original)
953  && ((GetAt(original, current + 1) == 'A')
954  || (GetAt(original, current + 1) == 'O')))
955  {
956  MetaphAdd(primary, "J");
957  MetaphAdd(secondary, "H");
958  }
959  else
960  {
961  if (current == last)
962  {
963  MetaphAdd(primary, "J");
964  MetaphAdd(secondary, "");
965  }
966  else
967  {
968  if (!StringAt(original, (current + 1), 1, "L", "T",
969  "K", "S", "N", "M", "B", "Z", "")
970  && !StringAt(original, (current - 1), 1,
971  "S", "K", "L", ""))
972  {
973  MetaphAdd(primary, "J");
974  MetaphAdd(secondary, "J");
975  }
976  }
977  }
978  }
979 
980  if (GetAt(original, current + 1) == 'J') /* it could happen! */
981  current += 2;
982  else
983  current += 1;
984  break;
985 
986  case 'K':
987  if (GetAt(original, current + 1) == 'K')
988  current += 2;
989  else
990  current += 1;
991  MetaphAdd(primary, "K");
992  MetaphAdd(secondary, "K");
993  break;
994 
995  case 'L':
996  if (GetAt(original, current + 1) == 'L')
997  {
998  /* spanish e.g. 'cabrillo', 'gallegos' */
999  if (((current == (length - 3))
1000  && StringAt(original, (current - 1), 4, "ILLO",
1001  "ILLA", "ALLE", ""))
1002  || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
1003  || StringAt(original, last, 1, "A", "O", ""))
1004  && StringAt(original, (current - 1), 4,
1005  "ALLE", "")))
1006  {
1007  MetaphAdd(primary, "L");
1008  MetaphAdd(secondary, "");
1009  current += 2;
1010  break;
1011  }
1012  current += 2;
1013  }
1014  else
1015  current += 1;
1016  MetaphAdd(primary, "L");
1017  MetaphAdd(secondary, "L");
1018  break;
1019 
1020  case 'M':
1021  if ((StringAt(original, (current - 1), 3, "UMB", "")
1022  && (((current + 1) == last)
1023  || StringAt(original, (current + 2), 2, "ER", "")))
1024  /* 'dumb','thumb' */
1025  || (GetAt(original, current + 1) == 'M'))
1026  current += 2;
1027  else
1028  current += 1;
1029  MetaphAdd(primary, "M");
1030  MetaphAdd(secondary, "M");
1031  break;
1032 
1033  case 'N':
1034  if (GetAt(original, current + 1) == 'N')
1035  current += 2;
1036  else
1037  current += 1;
1038  MetaphAdd(primary, "N");
1039  MetaphAdd(secondary, "N");
1040  break;
1041 
1042  case '\xd1': /* N with tilde */
1043  current += 1;
1044  MetaphAdd(primary, "N");
1045  MetaphAdd(secondary, "N");
1046  break;
1047 
1048  case 'P':
1049  if (GetAt(original, current + 1) == 'H')
1050  {
1051  MetaphAdd(primary, "F");
1052  MetaphAdd(secondary, "F");
1053  current += 2;
1054  break;
1055  }
1056 
1057  /* also account for "campbell", "raspberry" */
1058  if (StringAt(original, (current + 1), 1, "P", "B", ""))
1059  current += 2;
1060  else
1061  current += 1;
1062  MetaphAdd(primary, "P");
1063  MetaphAdd(secondary, "P");
1064  break;
1065 
1066  case 'Q':
1067  if (GetAt(original, current + 1) == 'Q')
1068  current += 2;
1069  else
1070  current += 1;
1071  MetaphAdd(primary, "K");
1072  MetaphAdd(secondary, "K");
1073  break;
1074 
1075  case 'R':
1076  /* french e.g. 'rogier', but exclude 'hochmeier' */
1077  if ((current == last)
1078  && !SlavoGermanic(original)
1079  && StringAt(original, (current - 2), 2, "IE", "")
1080  && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1081  {
1082  MetaphAdd(primary, "");
1083  MetaphAdd(secondary, "R");
1084  }
1085  else
1086  {
1087  MetaphAdd(primary, "R");
1088  MetaphAdd(secondary, "R");
1089  }
1090 
1091  if (GetAt(original, current + 1) == 'R')
1092  current += 2;
1093  else
1094  current += 1;
1095  break;
1096 
1097  case 'S':
1098  /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1099  if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1100  {
1101  current += 1;
1102  break;
1103  }
1104 
1105  /* special case 'sugar-' */
1106  if ((current == 0)
1107  && StringAt(original, current, 5, "SUGAR", ""))
1108  {
1109  MetaphAdd(primary, "X");
1110  MetaphAdd(secondary, "S");
1111  current += 1;
1112  break;
1113  }
1114 
1115  if (StringAt(original, current, 2, "SH", ""))
1116  {
1117  /* germanic */
1118  if (StringAt
1119  (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1120  "HOLZ", ""))
1121  {
1122  MetaphAdd(primary, "S");
1123  MetaphAdd(secondary, "S");
1124  }
1125  else
1126  {
1127  MetaphAdd(primary, "X");
1128  MetaphAdd(secondary, "X");
1129  }
1130  current += 2;
1131  break;
1132  }
1133 
1134  /* italian & armenian */
1135  if (StringAt(original, current, 3, "SIO", "SIA", "")
1136  || StringAt(original, current, 4, "SIAN", ""))
1137  {
1138  if (!SlavoGermanic(original))
1139  {
1140  MetaphAdd(primary, "S");
1141  MetaphAdd(secondary, "X");
1142  }
1143  else
1144  {
1145  MetaphAdd(primary, "S");
1146  MetaphAdd(secondary, "S");
1147  }
1148  current += 3;
1149  break;
1150  }
1151 
1152  /*
1153  * german & anglicisations, e.g. 'smith' match 'schmidt',
1154  * 'snider' match 'schneider' also, -sz- in slavic language
1155  * although in hungarian it is pronounced 's'
1156  */
1157  if (((current == 0)
1158  && StringAt(original, (current + 1), 1,
1159  "M", "N", "L", "W", ""))
1160  || StringAt(original, (current + 1), 1, "Z", ""))
1161  {
1162  MetaphAdd(primary, "S");
1163  MetaphAdd(secondary, "X");
1164  if (StringAt(original, (current + 1), 1, "Z", ""))
1165  current += 2;
1166  else
1167  current += 1;
1168  break;
1169  }
1170 
1171  if (StringAt(original, current, 2, "SC", ""))
1172  {
1173  /* Schlesinger's rule */
1174  if (GetAt(original, current + 2) == 'H')
1175  {
1176  /* dutch origin, e.g. 'school', 'schooner' */
1177  if (StringAt(original, (current + 3), 2,
1178  "OO", "ER", "EN",
1179  "UY", "ED", "EM", ""))
1180  {
1181  /* 'schermerhorn', 'schenker' */
1182  if (StringAt(original, (current + 3), 2,
1183  "ER", "EN", ""))
1184  {
1185  MetaphAdd(primary, "X");
1186  MetaphAdd(secondary, "SK");
1187  }
1188  else
1189  {
1190  MetaphAdd(primary, "SK");
1191  MetaphAdd(secondary, "SK");
1192  }
1193  current += 3;
1194  break;
1195  }
1196  else
1197  {
1198  if ((current == 0) && !IsVowel(original, 3)
1199  && (GetAt(original, 3) != 'W'))
1200  {
1201  MetaphAdd(primary, "X");
1202  MetaphAdd(secondary, "S");
1203  }
1204  else
1205  {
1206  MetaphAdd(primary, "X");
1207  MetaphAdd(secondary, "X");
1208  }
1209  current += 3;
1210  break;
1211  }
1212  }
1213 
1214  if (StringAt(original, (current + 2), 1,
1215  "I", "E", "Y", ""))
1216  {
1217  MetaphAdd(primary, "S");
1218  MetaphAdd(secondary, "S");
1219  current += 3;
1220  break;
1221  }
1222  /* else */
1223  MetaphAdd(primary, "SK");
1224  MetaphAdd(secondary, "SK");
1225  current += 3;
1226  break;
1227  }
1228 
1229  /* french e.g. 'resnais', 'artois' */
1230  if ((current == last)
1231  && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1232  {
1233  MetaphAdd(primary, "");
1234  MetaphAdd(secondary, "S");
1235  }
1236  else
1237  {
1238  MetaphAdd(primary, "S");
1239  MetaphAdd(secondary, "S");
1240  }
1241 
1242  if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1243  current += 2;
1244  else
1245  current += 1;
1246  break;
1247 
1248  case 'T':
1249  if (StringAt(original, current, 4, "TION", ""))
1250  {
1251  MetaphAdd(primary, "X");
1252  MetaphAdd(secondary, "X");
1253  current += 3;
1254  break;
1255  }
1256 
1257  if (StringAt(original, current, 3, "TIA", "TCH", ""))
1258  {
1259  MetaphAdd(primary, "X");
1260  MetaphAdd(secondary, "X");
1261  current += 3;
1262  break;
1263  }
1264 
1265  if (StringAt(original, current, 2, "TH", "")
1266  || StringAt(original, current, 3, "TTH", ""))
1267  {
1268  /* special case 'thomas', 'thames' or germanic */
1269  if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1270  || StringAt(original, 0, 4, "VAN ", "VON ", "")
1271  || StringAt(original, 0, 3, "SCH", ""))
1272  {
1273  MetaphAdd(primary, "T");
1274  MetaphAdd(secondary, "T");
1275  }
1276  else
1277  {
1278  MetaphAdd(primary, "0");
1279  MetaphAdd(secondary, "T");
1280  }
1281  current += 2;
1282  break;
1283  }
1284 
1285  if (StringAt(original, (current + 1), 1, "T", "D", ""))
1286  current += 2;
1287  else
1288  current += 1;
1289  MetaphAdd(primary, "T");
1290  MetaphAdd(secondary, "T");
1291  break;
1292 
1293  case 'V':
1294  if (GetAt(original, current + 1) == 'V')
1295  current += 2;
1296  else
1297  current += 1;
1298  MetaphAdd(primary, "F");
1299  MetaphAdd(secondary, "F");
1300  break;
1301 
1302  case 'W':
1303  /* can also be in middle of word */
1304  if (StringAt(original, current, 2, "WR", ""))
1305  {
1306  MetaphAdd(primary, "R");
1307  MetaphAdd(secondary, "R");
1308  current += 2;
1309  break;
1310  }
1311 
1312  if ((current == 0)
1313  && (IsVowel(original, current + 1)
1314  || StringAt(original, current, 2, "WH", "")))
1315  {
1316  /* Wasserman should match Vasserman */
1317  if (IsVowel(original, current + 1))
1318  {
1319  MetaphAdd(primary, "A");
1320  MetaphAdd(secondary, "F");
1321  }
1322  else
1323  {
1324  /* need Uomo to match Womo */
1325  MetaphAdd(primary, "A");
1326  MetaphAdd(secondary, "A");
1327  }
1328  }
1329 
1330  /* Arnow should match Arnoff */
1331  if (((current == last) && IsVowel(original, current - 1))
1332  || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1333  "OWSKI", "OWSKY", "")
1334  || StringAt(original, 0, 3, "SCH", ""))
1335  {
1336  MetaphAdd(primary, "");
1337  MetaphAdd(secondary, "F");
1338  current += 1;
1339  break;
1340  }
1341 
1342  /* polish e.g. 'filipowicz' */
1343  if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1344  {
1345  MetaphAdd(primary, "TS");
1346  MetaphAdd(secondary, "FX");
1347  current += 4;
1348  break;
1349  }
1350 
1351  /* else skip it */
1352  current += 1;
1353  break;
1354 
1355  case 'X':
1356  /* french e.g. breaux */
1357  if (!((current == last)
1358  && (StringAt(original, (current - 3), 3,
1359  "IAU", "EAU", "")
1360  || StringAt(original, (current - 2), 2,
1361  "AU", "OU", ""))))
1362  {
1363  MetaphAdd(primary, "KS");
1364  MetaphAdd(secondary, "KS");
1365  }
1366 
1367 
1368  if (StringAt(original, (current + 1), 1, "C", "X", ""))
1369  current += 2;
1370  else
1371  current += 1;
1372  break;
1373 
1374  case 'Z':
1375  /* chinese pinyin e.g. 'zhao' */
1376  if (GetAt(original, current + 1) == 'H')
1377  {
1378  MetaphAdd(primary, "J");
1379  MetaphAdd(secondary, "J");
1380  current += 2;
1381  break;
1382  }
1383  else if (StringAt(original, (current + 1), 2,
1384  "ZO", "ZI", "ZA", "")
1385  || (SlavoGermanic(original)
1386  && ((current > 0)
1387  && GetAt(original, current - 1) != 'T')))
1388  {
1389  MetaphAdd(primary, "S");
1390  MetaphAdd(secondary, "TS");
1391  }
1392  else
1393  {
1394  MetaphAdd(primary, "S");
1395  MetaphAdd(secondary, "S");
1396  }
1397 
1398  if (GetAt(original, current + 1) == 'Z')
1399  current += 2;
1400  else
1401  current += 1;
1402  break;
1403 
1404  default:
1405  current += 1;
1406  }
1407 
1408  /*
1409  * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1410  * secondary->str);
1411  */
1412  }
1413 
1414 
1415  if (primary->length > 4)
1416  SetAt(primary, 4, '\0');
1417 
1418  if (secondary->length > 4)
1419  SetAt(secondary, 4, '\0');
1420 
1421  *codes = primary->str;
1422  *++codes = secondary->str;
1423 
1424  DestroyMetaString(original);
1425  DestroyMetaString(primary);
1426  DestroyMetaString(secondary);
1427 }
1428 
1429 #ifdef DMETAPHONE_MAIN
1430 
1431 /* just for testing - not part of the perl code */
1432 
1433 main(int argc, char **argv)
1434 {
1435  char *codes[2];
1436 
1437  if (argc > 1)
1438  {
1439  DoubleMetaphone(argv[1], codes);
1440  printf("%s|%s\n", codes[0], codes[1]);
1441  }
1442 }
1443 
1444 #endif
Datum dmetaphone_alt(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:160
int length(const List *list)
Definition: list.c:1271
static metastring * NewMetaString(const char *init_str)
Definition: dmetaphone.c:235
#define META_FREE(x)
Definition: dmetaphone.c:200
static void test(void)
Datum dmetaphone(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:131
static void MetaphAdd(metastring *s, const char *new_str)
Definition: dmetaphone.c:378
int main(int argc, char **argv)
Definition: oid2name.c:545
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:273
static char GetAt(metastring *s, int pos)
Definition: dmetaphone.c:325
#define META_REALLOC(v, n, t)
Definition: dmetaphone.c:190
PG_FUNCTION_INFO_V1(dmetaphone)
char * c
static void SetAt(metastring *s, int pos, char c)
Definition: dmetaphone.c:335
#define assert(TEST)
Definition: imath.c:37
char * str
Definition: dmetaphone.c:220
int free_string_on_destroy
Definition: dmetaphone.c:223
static void DestroyMetaString(metastring *s)
Definition: dmetaphone.c:260
uintptr_t Datum
Definition: postgres.h:372
static void MakeUpper(metastring *s)
Definition: dmetaphone.c:282
static int IsVowel(metastring *s, int pos)
Definition: dmetaphone.c:292
static int StringAt(metastring *s, int start, int length,...)
Definition: dmetaphone.c:348
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:331
static void IncreaseBuffer(metastring *s, int chars_needed)
Definition: dmetaphone.c:273
text * cstring_to_text(const char *s)
Definition: varlena.c:149
#define PG_ARGISNULL(n)
Definition: fmgr.h:174
char * text_to_cstring(const text *t)
Definition: varlena.c:182
int i
void * arg
Definition: c.h:487
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
static int SlavoGermanic(metastring *s)
Definition: dmetaphone.c:309
static void DoubleMetaphone(char *, char **)
Definition: dmetaphone.c:395
#define PG_RETURN_NULL()
Definition: fmgr.h:305
#define META_MALLOC(v, n, t)
Definition: dmetaphone.c:187