PostgreSQL Source Code  git master
dmetaphone.c
Go to the documentation of this file.
/*
 * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
 *
 * contrib/fuzzystrmatch/dmetaphone.c
 *
 * Double Metaphone computes 2 "sounds like" strings - a primary and an
 * alternate. In most cases they are the same, but for foreign names
 * especially they can be a bit different, depending on pronunciation.
 *
 * Information on using Double Metaphone can be found at
 *	 http://www.codeproject.com/string/dmetaphone1.asp
 * and the original article describing it can be found at
 *	 http://drdobbs.com/184401251
 *
 * For PostgreSQL we provide 2 functions - one for the primary and one for
 * the alternate. That way the functions are pure text->text mappings that
 * are useful in functional indexes. These are 'dmetaphone' for the
 * primary and 'dmetaphone_alt' for the alternate.
 *
 * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
 * functions looks like this:
 *
 * CREATE FUNCTION dmetaphone (text) RETURNS text
 *	  LANGUAGE C IMMUTABLE STRICT
 *	  AS '$libdir/dmetaphone', 'dmetaphone';
 *
 * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
 *	  LANGUAGE C IMMUTABLE STRICT
 *	  AS '$libdir/dmetaphone', 'dmetaphone_alt';
 *
 * Note that you have to declare the functions IMMUTABLE if you want to
 * use them in functional indexes, and you have to declare them as STRICT
 * as they do not check for NULL input, and will segfault if given NULL input.
 * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
 * will never call them with NULL, but instead assume the result is NULL,
 * which is what we (I) want.
 *
 * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
 * will detect NULL input and return NULL. Then you don't have to declare
 * them as STRICT.
 *
 * There is a small inefficiency here - each function call actually computes
 * both the primary and the alternate and then throws away the one it doesn't
 * need. That's the way the perl module was written, because perl can handle
 * a list return more easily than we can in PostgreSQL. The result has been
 * fast enough for my needs, but it could maybe be optimized a bit to remove
 * that behaviour.
 *
 */
50 
51 
/***************************** COPYRIGHT NOTICES ***********************

Most of this code is directly from the Text::DoubleMetaphone perl module
version 0.05 available from https://www.cpan.org/.
It bears this copyright notice:


  Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
  All rights reserved.

  This code is based heavily on the C++ implementation by
  Lawrence Philips and incorporates several bug fixes courtesy
  of Kevin Atkinson <kevina@users.sourceforge.net>.

  This module is free software; you may redistribute it and/or
  modify it under the same terms as Perl itself.

The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
<andrew@dunslane.net> and is covered this copyright:

  Copyright 2003, North Carolina State Highway Patrol.
  All rights reserved.

  Permission to use, copy, modify, and distribute this software and its
  documentation for any purpose, without fee, and without a written agreement
  is hereby granted, provided that the above copyright notice and this
  paragraph and the following two paragraphs appear in all copies.

  IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
  PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
  INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
  DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED
  HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
  HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
  MODIFICATIONS.

***********************************************************************/
94 
95 
96 /* include these first, according to the docs */
97 #ifndef DMETAPHONE_MAIN
98 
99 #include "postgres.h"
100 
101 #include "utils/builtins.h"
102 
103 /* turn off assertions for embedded function */
104 #define NDEBUG
105 
106 #else /* DMETAPHONE_MAIN */
107 
108 /* we need these if we didn't get them from postgres.h */
109 #include <stdio.h>
110 #include <stdlib.h>
111 #include <string.h>
112 #include <stdarg.h>
113 
114 #endif /* DMETAPHONE_MAIN */
115 
116 #include <assert.h>
117 #include <ctype.h>
118 
119 /* prototype for the main function we got from the perl module */
120 static void DoubleMetaphone(char *str, char **codes);
121 
122 #ifndef DMETAPHONE_MAIN
123 
124 /*
125  * The PostgreSQL visible dmetaphone function.
126  */
127 
129 
130 Datum
132 {
133  text *arg;
134  char *aptr,
135  *codes[2],
136  *code;
137 
138 #ifdef DMETAPHONE_NOSTRICT
139  if (PG_ARGISNULL(0))
140  PG_RETURN_NULL();
141 #endif
142  arg = PG_GETARG_TEXT_PP(0);
143  aptr = text_to_cstring(arg);
144 
145  DoubleMetaphone(aptr, codes);
146  code = codes[0];
147  if (!code)
148  code = "";
149 
151 }
152 
153 /*
154  * The PostgreSQL visible dmetaphone_alt function.
155  */
156 
158 
159 Datum
161 {
162  text *arg;
163  char *aptr,
164  *codes[2],
165  *code;
166 
167 #ifdef DMETAPHONE_NOSTRICT
168  if (PG_ARGISNULL(0))
169  PG_RETURN_NULL();
170 #endif
171  arg = PG_GETARG_TEXT_PP(0);
172  aptr = text_to_cstring(arg);
173 
174  DoubleMetaphone(aptr, codes);
175  code = codes[1];
176  if (!code)
177  code = "";
178 
180 }
181 
182 
183 /* here is where we start the code imported from the perl module */
184 
185 /* all memory handling is done with these macros */
186 
187 #define META_MALLOC(v,n,t) \
188  (v = (t*)palloc(((n)*sizeof(t))))
189 
190 #define META_REALLOC(v,n,t) \
191  (v = (t*)repalloc((v),((n)*sizeof(t))))
192 
193 /*
194  * Don't do pfree - it seems to cause a SIGSEGV sometimes - which might have just
195  * been caused by reloading the module in development.
196  * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
197  * in a case like this.
198  */
199 
200 #define META_FREE(x) ((void)true) /* pfree((x)) */
201 #else /* not defined DMETAPHONE_MAIN */
202 
203 /* use the standard malloc library when not running in PostgreSQL */
204 
205 #define META_MALLOC(v,n,t) \
206  (v = (t*)malloc(((n)*sizeof(t))))
207 
208 #define META_REALLOC(v,n,t) \
209  (v = (t*)realloc((v),((n)*sizeof(t))))
210 
211 #define META_FREE(x) free((x))
212 #endif /* defined DMETAPHONE_MAIN */
213 
214 
215 
/* this typedef was originally in the perl module's .h file */

/*
 * Growable character buffer used for the padded input word and for the two
 * output codes.
 */
typedef struct
{
	char	   *str;			/* NUL-terminated character data */
	int			length;			/* number of characters in str, without NUL */
	int			bufsize;		/* number of bytes allocated for str */
	int			free_string_on_destroy; /* nonzero: DestroyMetaString also
										 * releases str */
}

metastring;
227 
228 /*
229  * remaining perl module funcs unchanged except for declaring them static
230  * and reformatting to PostgreSQL indentation and to fit in 80 cols.
231  *
232  */
233 
234 static metastring *
235 NewMetaString(const char *init_str)
236 {
237  metastring *s;
238  char empty_string[] = "";
239 
240  META_MALLOC(s, 1, metastring);
241  assert(s != NULL);
242 
243  if (init_str == NULL)
244  init_str = empty_string;
245  s->length = strlen(init_str);
246  /* preallocate a bit more for potential growth */
247  s->bufsize = s->length + 7;
248 
249  META_MALLOC(s->str, s->bufsize, char);
250  assert(s->str != NULL);
251 
252  memcpy(s->str, init_str, s->length + 1);
253  s->free_string_on_destroy = 1;
254 
255  return s;
256 }
257 
258 
259 static void
261 {
262  if (s == NULL)
263  return;
264 
265  if (s->free_string_on_destroy && (s->str != NULL))
266  META_FREE(s->str);
267 
268  META_FREE(s);
269 }
270 
271 
272 static void
273 IncreaseBuffer(metastring *s, int chars_needed)
274 {
275  META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
276  assert(s->str != NULL);
277  s->bufsize = s->bufsize + chars_needed + 10;
278 }
279 
280 
281 static void
283 {
284  char *i;
285 
286  for (i = s->str; *i; i++)
287  *i = toupper((unsigned char) *i);
288 }
289 
290 
291 static int
292 IsVowel(metastring *s, int pos)
293 {
294  char c;
295 
296  if ((pos < 0) || (pos >= s->length))
297  return 0;
298 
299  c = *(s->str + pos);
300  if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
301  (c == 'U') || (c == 'Y'))
302  return 1;
303 
304  return 0;
305 }
306 
307 
308 static int
310 {
311  if ((char *) strstr(s->str, "W"))
312  return 1;
313  else if ((char *) strstr(s->str, "K"))
314  return 1;
315  else if ((char *) strstr(s->str, "CZ"))
316  return 1;
317  else if ((char *) strstr(s->str, "WITZ"))
318  return 1;
319  else
320  return 0;
321 }
322 
323 
324 static char
325 GetAt(metastring *s, int pos)
326 {
327  if ((pos < 0) || (pos >= s->length))
328  return '\0';
329 
330  return ((char) *(s->str + pos));
331 }
332 
333 
334 static void
335 SetAt(metastring *s, int pos, char c)
336 {
337  if ((pos < 0) || (pos >= s->length))
338  return;
339 
340  *(s->str + pos) = c;
341 }
342 
343 
344 /*
345  Caveats: the START value is 0 based
346 */
347 static int
348 StringAt(metastring *s, int start, int length,...)
349 {
350  char *test;
351  char *pos;
352  va_list ap;
353 
354  if ((start < 0) || (start >= s->length))
355  return 0;
356 
357  pos = (s->str + start);
358  va_start(ap, length);
359 
360  do
361  {
362  test = va_arg(ap, char *);
363  if (*test && (strncmp(pos, test, length) == 0))
364  {
365  va_end(ap);
366  return 1;
367  }
368  }
369  while (strcmp(test, "") != 0);
370 
371  va_end(ap);
372 
373  return 0;
374 }
375 
376 
377 static void
378 MetaphAdd(metastring *s, const char *new_str)
379 {
380  int add_length;
381 
382  if (new_str == NULL)
383  return;
384 
385  add_length = strlen(new_str);
386  if ((s->length + add_length) > (s->bufsize - 1))
387  IncreaseBuffer(s, add_length);
388 
389  strcat(s->str, new_str);
390  s->length += add_length;
391 }
392 
393 
394 static void
395 DoubleMetaphone(char *str, char **codes)
396 {
397  int length;
398  metastring *original;
399  metastring *primary;
400  metastring *secondary;
401  int current;
402  int last;
403 
404  current = 0;
405  /* we need the real length and last prior to padding */
406  length = strlen(str);
407  last = length - 1;
408  original = NewMetaString(str);
409  /* Pad original so we can index beyond end */
410  MetaphAdd(original, " ");
411 
412  primary = NewMetaString("");
413  secondary = NewMetaString("");
414  primary->free_string_on_destroy = 0;
415  secondary->free_string_on_destroy = 0;
416 
417  MakeUpper(original);
418 
419  /* skip these when at start of word */
420  if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
421  current += 1;
422 
423  /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424  if (GetAt(original, 0) == 'X')
425  {
426  MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
427  MetaphAdd(secondary, "S");
428  current += 1;
429  }
430 
431  /* main loop */
432  while ((primary->length < 4) || (secondary->length < 4))
433  {
434  if (current >= length)
435  break;
436 
437  switch (GetAt(original, current))
438  {
439  case 'A':
440  case 'E':
441  case 'I':
442  case 'O':
443  case 'U':
444  case 'Y':
445  if (current == 0)
446  {
447  /* all init vowels now map to 'A' */
448  MetaphAdd(primary, "A");
449  MetaphAdd(secondary, "A");
450  }
451  current += 1;
452  break;
453 
454  case 'B':
455 
456  /* "-mb", e.g", "dumb", already skipped over... */
457  MetaphAdd(primary, "P");
458  MetaphAdd(secondary, "P");
459 
460  if (GetAt(original, current + 1) == 'B')
461  current += 2;
462  else
463  current += 1;
464  break;
465 
466  case '\xc7': /* C with cedilla */
467  MetaphAdd(primary, "S");
468  MetaphAdd(secondary, "S");
469  current += 1;
470  break;
471 
472  case 'C':
473  /* various germanic */
474  if ((current > 1)
475  && !IsVowel(original, current - 2)
476  && StringAt(original, (current - 1), 3, "ACH", "")
477  && ((GetAt(original, current + 2) != 'I')
478  && ((GetAt(original, current + 2) != 'E')
479  || StringAt(original, (current - 2), 6, "BACHER",
480  "MACHER", ""))))
481  {
482  MetaphAdd(primary, "K");
483  MetaphAdd(secondary, "K");
484  current += 2;
485  break;
486  }
487 
488  /* special case 'caesar' */
489  if ((current == 0)
490  && StringAt(original, current, 6, "CAESAR", ""))
491  {
492  MetaphAdd(primary, "S");
493  MetaphAdd(secondary, "S");
494  current += 2;
495  break;
496  }
497 
498  /* italian 'chianti' */
499  if (StringAt(original, current, 4, "CHIA", ""))
500  {
501  MetaphAdd(primary, "K");
502  MetaphAdd(secondary, "K");
503  current += 2;
504  break;
505  }
506 
507  if (StringAt(original, current, 2, "CH", ""))
508  {
509  /* find 'michael' */
510  if ((current > 0)
511  && StringAt(original, current, 4, "CHAE", ""))
512  {
513  MetaphAdd(primary, "K");
514  MetaphAdd(secondary, "X");
515  current += 2;
516  break;
517  }
518 
519  /* greek roots e.g. 'chemistry', 'chorus' */
520  if ((current == 0)
521  && (StringAt(original, (current + 1), 5,
522  "HARAC", "HARIS", "")
523  || StringAt(original, (current + 1), 3, "HOR",
524  "HYM", "HIA", "HEM", ""))
525  && !StringAt(original, 0, 5, "CHORE", ""))
526  {
527  MetaphAdd(primary, "K");
528  MetaphAdd(secondary, "K");
529  current += 2;
530  break;
531  }
532 
533  /* germanic, greek, or otherwise 'ch' for 'kh' sound */
534  if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
535  || StringAt(original, 0, 3, "SCH", ""))
536  /* 'architect but not 'arch', 'orchestra', 'orchid' */
537  || StringAt(original, (current - 2), 6, "ORCHES",
538  "ARCHIT", "ORCHID", "")
539  || StringAt(original, (current + 2), 1, "T", "S",
540  "")
541  || ((StringAt(original, (current - 1), 1,
542  "A", "O", "U", "E", "")
543  || (current == 0))
544 
545  /*
546  * e.g., 'wachtler', 'wechsler', but not 'tichner'
547  */
548  && StringAt(original, (current + 2), 1, "L", "R",
549  "N", "M", "B", "H", "F", "V", "W",
550  " ", "")))
551  {
552  MetaphAdd(primary, "K");
553  MetaphAdd(secondary, "K");
554  }
555  else
556  {
557  if (current > 0)
558  {
559  if (StringAt(original, 0, 2, "MC", ""))
560  {
561  /* e.g., "McHugh" */
562  MetaphAdd(primary, "K");
563  MetaphAdd(secondary, "K");
564  }
565  else
566  {
567  MetaphAdd(primary, "X");
568  MetaphAdd(secondary, "K");
569  }
570  }
571  else
572  {
573  MetaphAdd(primary, "X");
574  MetaphAdd(secondary, "X");
575  }
576  }
577  current += 2;
578  break;
579  }
580  /* e.g, 'czerny' */
581  if (StringAt(original, current, 2, "CZ", "")
582  && !StringAt(original, (current - 2), 4, "WICZ", ""))
583  {
584  MetaphAdd(primary, "S");
585  MetaphAdd(secondary, "X");
586  current += 2;
587  break;
588  }
589 
590  /* e.g., 'focaccia' */
591  if (StringAt(original, (current + 1), 3, "CIA", ""))
592  {
593  MetaphAdd(primary, "X");
594  MetaphAdd(secondary, "X");
595  current += 3;
596  break;
597  }
598 
599  /* double 'C', but not if e.g. 'McClellan' */
600  if (StringAt(original, current, 2, "CC", "")
601  && !((current == 1) && (GetAt(original, 0) == 'M')))
602  {
603  /* 'bellocchio' but not 'bacchus' */
604  if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
605  && !StringAt(original, (current + 2), 2, "HU", ""))
606  {
607  /* 'accident', 'accede' 'succeed' */
608  if (((current == 1)
609  && (GetAt(original, current - 1) == 'A'))
610  || StringAt(original, (current - 1), 5, "UCCEE",
611  "UCCES", ""))
612  {
613  MetaphAdd(primary, "KS");
614  MetaphAdd(secondary, "KS");
615  /* 'bacci', 'bertucci', other italian */
616  }
617  else
618  {
619  MetaphAdd(primary, "X");
620  MetaphAdd(secondary, "X");
621  }
622  current += 3;
623  break;
624  }
625  else
626  { /* Pierce's rule */
627  MetaphAdd(primary, "K");
628  MetaphAdd(secondary, "K");
629  current += 2;
630  break;
631  }
632  }
633 
634  if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
635  {
636  MetaphAdd(primary, "K");
637  MetaphAdd(secondary, "K");
638  current += 2;
639  break;
640  }
641 
642  if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
643  {
644  /* italian vs. english */
645  if (StringAt
646  (original, current, 3, "CIO", "CIE", "CIA", ""))
647  {
648  MetaphAdd(primary, "S");
649  MetaphAdd(secondary, "X");
650  }
651  else
652  {
653  MetaphAdd(primary, "S");
654  MetaphAdd(secondary, "S");
655  }
656  current += 2;
657  break;
658  }
659 
660  /* else */
661  MetaphAdd(primary, "K");
662  MetaphAdd(secondary, "K");
663 
664  /* name sent in 'mac caffrey', 'mac gregor */
665  if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
666  current += 3;
667  else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
668  && !StringAt(original, (current + 1), 2,
669  "CE", "CI", ""))
670  current += 2;
671  else
672  current += 1;
673  break;
674 
675  case 'D':
676  if (StringAt(original, current, 2, "DG", ""))
677  {
678  if (StringAt(original, (current + 2), 1,
679  "I", "E", "Y", ""))
680  {
681  /* e.g. 'edge' */
682  MetaphAdd(primary, "J");
683  MetaphAdd(secondary, "J");
684  current += 3;
685  break;
686  }
687  else
688  {
689  /* e.g. 'edgar' */
690  MetaphAdd(primary, "TK");
691  MetaphAdd(secondary, "TK");
692  current += 2;
693  break;
694  }
695  }
696 
697  if (StringAt(original, current, 2, "DT", "DD", ""))
698  {
699  MetaphAdd(primary, "T");
700  MetaphAdd(secondary, "T");
701  current += 2;
702  break;
703  }
704 
705  /* else */
706  MetaphAdd(primary, "T");
707  MetaphAdd(secondary, "T");
708  current += 1;
709  break;
710 
711  case 'F':
712  if (GetAt(original, current + 1) == 'F')
713  current += 2;
714  else
715  current += 1;
716  MetaphAdd(primary, "F");
717  MetaphAdd(secondary, "F");
718  break;
719 
720  case 'G':
721  if (GetAt(original, current + 1) == 'H')
722  {
723  if ((current > 0) && !IsVowel(original, current - 1))
724  {
725  MetaphAdd(primary, "K");
726  MetaphAdd(secondary, "K");
727  current += 2;
728  break;
729  }
730 
731  if (current < 3)
732  {
733  /* 'ghislane', ghiradelli */
734  if (current == 0)
735  {
736  if (GetAt(original, current + 2) == 'I')
737  {
738  MetaphAdd(primary, "J");
739  MetaphAdd(secondary, "J");
740  }
741  else
742  {
743  MetaphAdd(primary, "K");
744  MetaphAdd(secondary, "K");
745  }
746  current += 2;
747  break;
748  }
749  }
750 
751  /*
752  * Parker's rule (with some further refinements) - e.g.,
753  * 'hugh'
754  */
755  if (((current > 1)
756  && StringAt(original, (current - 2), 1,
757  "B", "H", "D", ""))
758  /* e.g., 'bough' */
759  || ((current > 2)
760  && StringAt(original, (current - 3), 1,
761  "B", "H", "D", ""))
762  /* e.g., 'broughton' */
763  || ((current > 3)
764  && StringAt(original, (current - 4), 1,
765  "B", "H", "")))
766  {
767  current += 2;
768  break;
769  }
770  else
771  {
772  /*
773  * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
774  * 'rough', 'tough'
775  */
776  if ((current > 2)
777  && (GetAt(original, current - 1) == 'U')
778  && StringAt(original, (current - 3), 1, "C",
779  "G", "L", "R", "T", ""))
780  {
781  MetaphAdd(primary, "F");
782  MetaphAdd(secondary, "F");
783  }
784  else if ((current > 0)
785  && GetAt(original, current - 1) != 'I')
786  {
787 
788 
789  MetaphAdd(primary, "K");
790  MetaphAdd(secondary, "K");
791  }
792 
793  current += 2;
794  break;
795  }
796  }
797 
798  if (GetAt(original, current + 1) == 'N')
799  {
800  if ((current == 1) && IsVowel(original, 0)
801  && !SlavoGermanic(original))
802  {
803  MetaphAdd(primary, "KN");
804  MetaphAdd(secondary, "N");
805  }
806  else
807  /* not e.g. 'cagney' */
808  if (!StringAt(original, (current + 2), 2, "EY", "")
809  && (GetAt(original, current + 1) != 'Y')
810  && !SlavoGermanic(original))
811  {
812  MetaphAdd(primary, "N");
813  MetaphAdd(secondary, "KN");
814  }
815  else
816  {
817  MetaphAdd(primary, "KN");
818  MetaphAdd(secondary, "KN");
819  }
820  current += 2;
821  break;
822  }
823 
824  /* 'tagliaro' */
825  if (StringAt(original, (current + 1), 2, "LI", "")
826  && !SlavoGermanic(original))
827  {
828  MetaphAdd(primary, "KL");
829  MetaphAdd(secondary, "L");
830  current += 2;
831  break;
832  }
833 
834  /* -ges-,-gep-,-gel-, -gie- at beginning */
835  if ((current == 0)
836  && ((GetAt(original, current + 1) == 'Y')
837  || StringAt(original, (current + 1), 2, "ES", "EP",
838  "EB", "EL", "EY", "IB", "IL", "IN", "IE",
839  "EI", "ER", "")))
840  {
841  MetaphAdd(primary, "K");
842  MetaphAdd(secondary, "J");
843  current += 2;
844  break;
845  }
846 
847  /* -ger-, -gy- */
848  if ((StringAt(original, (current + 1), 2, "ER", "")
849  || (GetAt(original, current + 1) == 'Y'))
850  && !StringAt(original, 0, 6,
851  "DANGER", "RANGER", "MANGER", "")
852  && !StringAt(original, (current - 1), 1, "E", "I", "")
853  && !StringAt(original, (current - 1), 3, "RGY", "OGY", ""))
854  {
855  MetaphAdd(primary, "K");
856  MetaphAdd(secondary, "J");
857  current += 2;
858  break;
859  }
860 
861  /* italian e.g, 'biaggi' */
862  if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
863  || StringAt(original, (current - 1), 4,
864  "AGGI", "OGGI", ""))
865  {
866  /* obvious germanic */
867  if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
868  || StringAt(original, 0, 3, "SCH", ""))
869  || StringAt(original, (current + 1), 2, "ET", ""))
870  {
871  MetaphAdd(primary, "K");
872  MetaphAdd(secondary, "K");
873  }
874  else
875  {
876  /* always soft if french ending */
877  if (StringAt
878  (original, (current + 1), 4, "IER ", ""))
879  {
880  MetaphAdd(primary, "J");
881  MetaphAdd(secondary, "J");
882  }
883  else
884  {
885  MetaphAdd(primary, "J");
886  MetaphAdd(secondary, "K");
887  }
888  }
889  current += 2;
890  break;
891  }
892 
893  if (GetAt(original, current + 1) == 'G')
894  current += 2;
895  else
896  current += 1;
897  MetaphAdd(primary, "K");
898  MetaphAdd(secondary, "K");
899  break;
900 
901  case 'H':
902  /* only keep if first & before vowel or btw. 2 vowels */
903  if (((current == 0) || IsVowel(original, current - 1))
904  && IsVowel(original, current + 1))
905  {
906  MetaphAdd(primary, "H");
907  MetaphAdd(secondary, "H");
908  current += 2;
909  }
910  else
911  /* also takes care of 'HH' */
912  current += 1;
913  break;
914 
915  case 'J':
916  /* obvious spanish, 'jose', 'san jacinto' */
917  if (StringAt(original, current, 4, "JOSE", "")
918  || StringAt(original, 0, 4, "SAN ", ""))
919  {
920  if (((current == 0)
921  && (GetAt(original, current + 4) == ' '))
922  || StringAt(original, 0, 4, "SAN ", ""))
923  {
924  MetaphAdd(primary, "H");
925  MetaphAdd(secondary, "H");
926  }
927  else
928  {
929  MetaphAdd(primary, "J");
930  MetaphAdd(secondary, "H");
931  }
932  current += 1;
933  break;
934  }
935 
936  if ((current == 0)
937  && !StringAt(original, current, 4, "JOSE", ""))
938  {
939  MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
940  MetaphAdd(secondary, "A");
941  }
942  else
943  {
944  /* spanish pron. of e.g. 'bajador' */
945  if (IsVowel(original, current - 1)
946  && !SlavoGermanic(original)
947  && ((GetAt(original, current + 1) == 'A')
948  || (GetAt(original, current + 1) == 'O')))
949  {
950  MetaphAdd(primary, "J");
951  MetaphAdd(secondary, "H");
952  }
953  else
954  {
955  if (current == last)
956  {
957  MetaphAdd(primary, "J");
958  MetaphAdd(secondary, "");
959  }
960  else
961  {
962  if (!StringAt(original, (current + 1), 1, "L", "T",
963  "K", "S", "N", "M", "B", "Z", "")
964  && !StringAt(original, (current - 1), 1,
965  "S", "K", "L", ""))
966  {
967  MetaphAdd(primary, "J");
968  MetaphAdd(secondary, "J");
969  }
970  }
971  }
972  }
973 
974  if (GetAt(original, current + 1) == 'J') /* it could happen! */
975  current += 2;
976  else
977  current += 1;
978  break;
979 
980  case 'K':
981  if (GetAt(original, current + 1) == 'K')
982  current += 2;
983  else
984  current += 1;
985  MetaphAdd(primary, "K");
986  MetaphAdd(secondary, "K");
987  break;
988 
989  case 'L':
990  if (GetAt(original, current + 1) == 'L')
991  {
992  /* spanish e.g. 'cabrillo', 'gallegos' */
993  if (((current == (length - 3))
994  && StringAt(original, (current - 1), 4, "ILLO",
995  "ILLA", "ALLE", ""))
996  || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
997  || StringAt(original, last, 1, "A", "O", ""))
998  && StringAt(original, (current - 1), 4,
999  "ALLE", "")))
1000  {
1001  MetaphAdd(primary, "L");
1002  MetaphAdd(secondary, "");
1003  current += 2;
1004  break;
1005  }
1006  current += 2;
1007  }
1008  else
1009  current += 1;
1010  MetaphAdd(primary, "L");
1011  MetaphAdd(secondary, "L");
1012  break;
1013 
1014  case 'M':
1015  if ((StringAt(original, (current - 1), 3, "UMB", "")
1016  && (((current + 1) == last)
1017  || StringAt(original, (current + 2), 2, "ER", "")))
1018  /* 'dumb','thumb' */
1019  || (GetAt(original, current + 1) == 'M'))
1020  current += 2;
1021  else
1022  current += 1;
1023  MetaphAdd(primary, "M");
1024  MetaphAdd(secondary, "M");
1025  break;
1026 
1027  case 'N':
1028  if (GetAt(original, current + 1) == 'N')
1029  current += 2;
1030  else
1031  current += 1;
1032  MetaphAdd(primary, "N");
1033  MetaphAdd(secondary, "N");
1034  break;
1035 
1036  case '\xd1': /* N with tilde */
1037  current += 1;
1038  MetaphAdd(primary, "N");
1039  MetaphAdd(secondary, "N");
1040  break;
1041 
1042  case 'P':
1043  if (GetAt(original, current + 1) == 'H')
1044  {
1045  MetaphAdd(primary, "F");
1046  MetaphAdd(secondary, "F");
1047  current += 2;
1048  break;
1049  }
1050 
1051  /* also account for "campbell", "raspberry" */
1052  if (StringAt(original, (current + 1), 1, "P", "B", ""))
1053  current += 2;
1054  else
1055  current += 1;
1056  MetaphAdd(primary, "P");
1057  MetaphAdd(secondary, "P");
1058  break;
1059 
1060  case 'Q':
1061  if (GetAt(original, current + 1) == 'Q')
1062  current += 2;
1063  else
1064  current += 1;
1065  MetaphAdd(primary, "K");
1066  MetaphAdd(secondary, "K");
1067  break;
1068 
1069  case 'R':
1070  /* french e.g. 'rogier', but exclude 'hochmeier' */
1071  if ((current == last)
1072  && !SlavoGermanic(original)
1073  && StringAt(original, (current - 2), 2, "IE", "")
1074  && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1075  {
1076  MetaphAdd(primary, "");
1077  MetaphAdd(secondary, "R");
1078  }
1079  else
1080  {
1081  MetaphAdd(primary, "R");
1082  MetaphAdd(secondary, "R");
1083  }
1084 
1085  if (GetAt(original, current + 1) == 'R')
1086  current += 2;
1087  else
1088  current += 1;
1089  break;
1090 
1091  case 'S':
1092  /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1093  if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1094  {
1095  current += 1;
1096  break;
1097  }
1098 
1099  /* special case 'sugar-' */
1100  if ((current == 0)
1101  && StringAt(original, current, 5, "SUGAR", ""))
1102  {
1103  MetaphAdd(primary, "X");
1104  MetaphAdd(secondary, "S");
1105  current += 1;
1106  break;
1107  }
1108 
1109  if (StringAt(original, current, 2, "SH", ""))
1110  {
1111  /* germanic */
1112  if (StringAt
1113  (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1114  "HOLZ", ""))
1115  {
1116  MetaphAdd(primary, "S");
1117  MetaphAdd(secondary, "S");
1118  }
1119  else
1120  {
1121  MetaphAdd(primary, "X");
1122  MetaphAdd(secondary, "X");
1123  }
1124  current += 2;
1125  break;
1126  }
1127 
1128  /* italian & armenian */
1129  if (StringAt(original, current, 3, "SIO", "SIA", "")
1130  || StringAt(original, current, 4, "SIAN", ""))
1131  {
1132  if (!SlavoGermanic(original))
1133  {
1134  MetaphAdd(primary, "S");
1135  MetaphAdd(secondary, "X");
1136  }
1137  else
1138  {
1139  MetaphAdd(primary, "S");
1140  MetaphAdd(secondary, "S");
1141  }
1142  current += 3;
1143  break;
1144  }
1145 
1146  /*
1147  * german & anglicisations, e.g. 'smith' match 'schmidt',
1148  * 'snider' match 'schneider' also, -sz- in slavic language
1149  * although in hungarian it is pronounced 's'
1150  */
1151  if (((current == 0)
1152  && StringAt(original, (current + 1), 1,
1153  "M", "N", "L", "W", ""))
1154  || StringAt(original, (current + 1), 1, "Z", ""))
1155  {
1156  MetaphAdd(primary, "S");
1157  MetaphAdd(secondary, "X");
1158  if (StringAt(original, (current + 1), 1, "Z", ""))
1159  current += 2;
1160  else
1161  current += 1;
1162  break;
1163  }
1164 
1165  if (StringAt(original, current, 2, "SC", ""))
1166  {
1167  /* Schlesinger's rule */
1168  if (GetAt(original, current + 2) == 'H')
1169  {
1170  /* dutch origin, e.g. 'school', 'schooner' */
1171  if (StringAt(original, (current + 3), 2,
1172  "OO", "ER", "EN",
1173  "UY", "ED", "EM", ""))
1174  {
1175  /* 'schermerhorn', 'schenker' */
1176  if (StringAt(original, (current + 3), 2,
1177  "ER", "EN", ""))
1178  {
1179  MetaphAdd(primary, "X");
1180  MetaphAdd(secondary, "SK");
1181  }
1182  else
1183  {
1184  MetaphAdd(primary, "SK");
1185  MetaphAdd(secondary, "SK");
1186  }
1187  current += 3;
1188  break;
1189  }
1190  else
1191  {
1192  if ((current == 0) && !IsVowel(original, 3)
1193  && (GetAt(original, 3) != 'W'))
1194  {
1195  MetaphAdd(primary, "X");
1196  MetaphAdd(secondary, "S");
1197  }
1198  else
1199  {
1200  MetaphAdd(primary, "X");
1201  MetaphAdd(secondary, "X");
1202  }
1203  current += 3;
1204  break;
1205  }
1206  }
1207 
1208  if (StringAt(original, (current + 2), 1,
1209  "I", "E", "Y", ""))
1210  {
1211  MetaphAdd(primary, "S");
1212  MetaphAdd(secondary, "S");
1213  current += 3;
1214  break;
1215  }
1216  /* else */
1217  MetaphAdd(primary, "SK");
1218  MetaphAdd(secondary, "SK");
1219  current += 3;
1220  break;
1221  }
1222 
1223  /* french e.g. 'resnais', 'artois' */
1224  if ((current == last)
1225  && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1226  {
1227  MetaphAdd(primary, "");
1228  MetaphAdd(secondary, "S");
1229  }
1230  else
1231  {
1232  MetaphAdd(primary, "S");
1233  MetaphAdd(secondary, "S");
1234  }
1235 
1236  if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1237  current += 2;
1238  else
1239  current += 1;
1240  break;
1241 
1242  case 'T':
1243  if (StringAt(original, current, 4, "TION", ""))
1244  {
1245  MetaphAdd(primary, "X");
1246  MetaphAdd(secondary, "X");
1247  current += 3;
1248  break;
1249  }
1250 
1251  if (StringAt(original, current, 3, "TIA", "TCH", ""))
1252  {
1253  MetaphAdd(primary, "X");
1254  MetaphAdd(secondary, "X");
1255  current += 3;
1256  break;
1257  }
1258 
1259  if (StringAt(original, current, 2, "TH", "")
1260  || StringAt(original, current, 3, "TTH", ""))
1261  {
1262  /* special case 'thomas', 'thames' or germanic */
1263  if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1264  || StringAt(original, 0, 4, "VAN ", "VON ", "")
1265  || StringAt(original, 0, 3, "SCH", ""))
1266  {
1267  MetaphAdd(primary, "T");
1268  MetaphAdd(secondary, "T");
1269  }
1270  else
1271  {
1272  MetaphAdd(primary, "0");
1273  MetaphAdd(secondary, "T");
1274  }
1275  current += 2;
1276  break;
1277  }
1278 
1279  if (StringAt(original, (current + 1), 1, "T", "D", ""))
1280  current += 2;
1281  else
1282  current += 1;
1283  MetaphAdd(primary, "T");
1284  MetaphAdd(secondary, "T");
1285  break;
1286 
1287  case 'V':
1288  if (GetAt(original, current + 1) == 'V')
1289  current += 2;
1290  else
1291  current += 1;
1292  MetaphAdd(primary, "F");
1293  MetaphAdd(secondary, "F");
1294  break;
1295 
1296  case 'W':
1297  /* can also be in middle of word */
1298  if (StringAt(original, current, 2, "WR", ""))
1299  {
1300  MetaphAdd(primary, "R");
1301  MetaphAdd(secondary, "R");
1302  current += 2;
1303  break;
1304  }
1305 
1306  if ((current == 0)
1307  && (IsVowel(original, current + 1)
1308  || StringAt(original, current, 2, "WH", "")))
1309  {
1310  /* Wasserman should match Vasserman */
1311  if (IsVowel(original, current + 1))
1312  {
1313  MetaphAdd(primary, "A");
1314  MetaphAdd(secondary, "F");
1315  }
1316  else
1317  {
1318  /* need Uomo to match Womo */
1319  MetaphAdd(primary, "A");
1320  MetaphAdd(secondary, "A");
1321  }
1322  }
1323 
1324  /* Arnow should match Arnoff */
1325  if (((current == last) && IsVowel(original, current - 1))
1326  || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1327  "OWSKI", "OWSKY", "")
1328  || StringAt(original, 0, 3, "SCH", ""))
1329  {
1330  MetaphAdd(primary, "");
1331  MetaphAdd(secondary, "F");
1332  current += 1;
1333  break;
1334  }
1335 
1336  /* polish e.g. 'filipowicz' */
1337  if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1338  {
1339  MetaphAdd(primary, "TS");
1340  MetaphAdd(secondary, "FX");
1341  current += 4;
1342  break;
1343  }
1344 
1345  /* else skip it */
1346  current += 1;
1347  break;
1348 
1349  case 'X':
1350  /* french e.g. breaux */
1351  if (!((current == last)
1352  && (StringAt(original, (current - 3), 3,
1353  "IAU", "EAU", "")
1354  || StringAt(original, (current - 2), 2,
1355  "AU", "OU", ""))))
1356  {
1357  MetaphAdd(primary, "KS");
1358  MetaphAdd(secondary, "KS");
1359  }
1360 
1361 
1362  if (StringAt(original, (current + 1), 1, "C", "X", ""))
1363  current += 2;
1364  else
1365  current += 1;
1366  break;
1367 
1368  case 'Z':
1369  /* chinese pinyin e.g. 'zhao' */
1370  if (GetAt(original, current + 1) == 'H')
1371  {
1372  MetaphAdd(primary, "J");
1373  MetaphAdd(secondary, "J");
1374  current += 2;
1375  break;
1376  }
1377  else if (StringAt(original, (current + 1), 2,
1378  "ZO", "ZI", "ZA", "")
1379  || (SlavoGermanic(original)
1380  && ((current > 0)
1381  && GetAt(original, current - 1) != 'T')))
1382  {
1383  MetaphAdd(primary, "S");
1384  MetaphAdd(secondary, "TS");
1385  }
1386  else
1387  {
1388  MetaphAdd(primary, "S");
1389  MetaphAdd(secondary, "S");
1390  }
1391 
1392  if (GetAt(original, current + 1) == 'Z')
1393  current += 2;
1394  else
1395  current += 1;
1396  break;
1397 
1398  default:
1399  current += 1;
1400  }
1401 
1402  /*
1403  * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1404  * secondary->str);
1405  */
1406  }
1407 
1408 
1409  if (primary->length > 4)
1410  SetAt(primary, 4, '\0');
1411 
1412  if (secondary->length > 4)
1413  SetAt(secondary, 4, '\0');
1414 
1415  *codes = primary->str;
1416  *++codes = secondary->str;
1417 
1418  DestroyMetaString(original);
1419  DestroyMetaString(primary);
1420  DestroyMetaString(secondary);
1421 }
1422 
#ifdef DMETAPHONE_MAIN

/* just for testing - not part of the perl code */

/*
 * Test driver: prints "primary|secondary" for the first command line
 * argument.  Without an argument nothing is printed.
 */
int
main(int argc, char **argv)
{
	char	   *codes[2];

	if (argc > 1)
	{
		DoubleMetaphone(argv[1], codes);
		printf("%s|%s\n", codes[0], codes[1]);

		/* the code buffers are not released by DoubleMetaphone itself */
		free(codes[0]);
		free(codes[1]);
	}

	return 0;
}

#endif
Datum dmetaphone(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:131
static char GetAt(metastring *s, int pos)
Definition: dmetaphone.c:325
static void SetAt(metastring *s, int pos, char c)
Definition: dmetaphone.c:335
static void MetaphAdd(metastring *s, const char *new_str)
Definition: dmetaphone.c:378
static int SlavoGermanic(metastring *s)
Definition: dmetaphone.c:309
PG_FUNCTION_INFO_V1(dmetaphone)
static void MakeUpper(metastring *s)
Definition: dmetaphone.c:282
static void IncreaseBuffer(metastring *s, int chars_needed)
Definition: dmetaphone.c:273
static int StringAt(metastring *s, int start, int length,...)
Definition: dmetaphone.c:348
static int IsVowel(metastring *s, int pos)
Definition: dmetaphone.c:292
static metastring * NewMetaString(const char *init_str)
Definition: dmetaphone.c:235
#define META_FREE(x)
Definition: dmetaphone.c:200
Datum dmetaphone_alt(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:160
#define META_REALLOC(v, n, t)
Definition: dmetaphone.c:190
static void DestroyMetaString(metastring *s)
Definition: dmetaphone.c:260
static void DoubleMetaphone(char *str, char **codes)
Definition: dmetaphone.c:395
#define META_MALLOC(v, n, t)
Definition: dmetaphone.c:187
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_ARGISNULL(n)
Definition: fmgr.h:209
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
return str start
const char * str
int i
Definition: isn.c:73
va_end(args)
va_start(args, fmt)
int main(int argc, char **argv)
Definition: oid2name.c:583
void * arg
#define printf(...)
Definition: port.h:244
uintptr_t Datum
Definition: postgres.h:64
static void test(void)
char * c
#define assert(x)
Definition: regcustom.h:56
char * str
Definition: dmetaphone.c:220
int free_string_on_destroy
Definition: dmetaphone.c:223
Definition: c.h:687
char * text_to_cstring(const text *t)
Definition: varlena.c:217
text * cstring_to_text(const char *s)
Definition: varlena.c:184