PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
dmetaphone.c
Go to the documentation of this file.
1 /*
2  * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
3  *
4  * contrib/fuzzystrmatch/dmetaphone.c
5  *
6  * Double Metaphone computes 2 "sounds like" strings - a primary and an
7  * alternate. In most cases they are the same, but for foreign names
8  * especially they can be a bit different, depending on pronunciation.
9  *
10  * Information on using Double Metaphone can be found at
11  * http://www.codeproject.com/string/dmetaphone1.asp
12  * and the original article describing it can be found at
13  * http://drdobbs.com/184401251
14  *
15  * For PostgreSQL we provide 2 functions - one for the primary and one for
16  * the alternate. That way the functions are pure text->text mappings that
17  * are useful in functional indexes. These are 'dmetaphone' for the
18  * primary and 'dmetaphone_alt' for the alternate.
19  *
20  * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21  * functions looks like this:
22  *
23  * CREATE FUNCTION dmetaphone (text) RETURNS text
24  * LANGUAGE C IMMUTABLE STRICT
25  * AS '$libdir/dmetaphone', 'dmetaphone';
26  *
27  * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28  * LANGUAGE C IMMUTABLE STRICT
29  * AS '$libdir/dmetaphone', 'dmetaphone_alt';
30  *
31  * Note that you have to declare the functions IMMUTABLE if you want to
32  * use them in functional indexes, and you have to declare them as STRICT
33  * as they do not check for NULL input, and will segfault if given NULL input.
34  * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
35  * will never call them with NULL, but instead assume the result is NULL,
36  * which is what we (I) want.
37  *
38  * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39  * will detect NULL input and return NULL. Then you don't have to declare them
40  * as STRICT.
41  *
42  * There is a small inefficiency here - each function call actually computes
43  * both the primary and the alternate and then throws away the one it doesn't
44  * need. That's the way the perl module was written, because perl can handle
45  * a list return more easily than we can in PostgreSQL. The result has been
46  * fast enough for my needs, but it could maybe be optimized a bit to remove
47  * that behaviour.
48  *
49  */
50 
51 
52 /***************************** COPYRIGHT NOTICES ***********************
53 
54 Most of this code is directly from the Text::DoubleMetaphone perl module
55 version 0.05 available from http://www.cpan.org.
56 It bears this copyright notice:
57 
58 
59  Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60  All rights reserved.
61 
62  This code is based heavily on the C++ implementation by
63  Lawrence Philips and incorporates several bug fixes courtesy
64  of Kevin Atkinson <kevina@users.sourceforge.net>.
65 
66  This module is free software; you may redistribute it and/or
67  modify it under the same terms as Perl itself.
68 
69 The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70 <andrew@dunslane.net> and is covered this copyright:
71 
72  Copyright 2003, North Carolina State Highway Patrol.
73  All rights reserved.
74 
75  Permission to use, copy, modify, and distribute this software and its
76  documentation for any purpose, without fee, and without a written agreement
77  is hereby granted, provided that the above copyright notice and this
78  paragraph and the following two paragraphs appear in all copies.
79 
80  IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81  PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82  INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83  DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 
86  THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89  HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90  HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91  MODIFICATIONS.
92 
93 ***********************************************************************/
94 
95 
96 
97 
98 
99 /* include these first, according to the docs */
100 #ifndef DMETAPHONE_MAIN
101 
102 #include "postgres.h"
103 
104 #include "utils/builtins.h"
105 
106 /* turn off assertions for embedded function */
107 #define NDEBUG
108 #endif
109 
110 #include <stdio.h>
111 #include <ctype.h>
112 #include <stdlib.h>
113 #include <string.h>
114 #include <stdarg.h>
115 #include <assert.h>
116 
117 /* prototype for the main function we got from the perl module */
118 static void DoubleMetaphone(char *, char **);
119 
120 #ifndef DMETAPHONE_MAIN
121 
122 /*
123  * The PostgreSQL visible dmetaphone function.
124  */
125 
127 
128 Datum
130 {
131  text *arg;
132  char *aptr,
133  *codes[2],
134  *code;
135 
136 #ifdef DMETAPHONE_NOSTRICT
137  if (PG_ARGISNULL(0))
138  PG_RETURN_NULL();
139 #endif
140  arg = PG_GETARG_TEXT_P(0);
141  aptr = text_to_cstring(arg);
142 
143  DoubleMetaphone(aptr, codes);
144  code = codes[0];
145  if (!code)
146  code = "";
147 
149 }
150 
151 /*
152  * The PostgreSQL visible dmetaphone_alt function.
153  */
154 
156 
157 Datum
159 {
160  text *arg;
161  char *aptr,
162  *codes[2],
163  *code;
164 
165 #ifdef DMETAPHONE_NOSTRICT
166  if (PG_ARGISNULL(0))
167  PG_RETURN_NULL();
168 #endif
169  arg = PG_GETARG_TEXT_P(0);
170  aptr = text_to_cstring(arg);
171 
172  DoubleMetaphone(aptr, codes);
173  code = codes[1];
174  if (!code)
175  code = "";
176 
178 }
179 
180 
181 /* here is where we start the code imported from the perl module */
182 
183 /* all memory handling is done with these macros */
184 
/*
 * PostgreSQL build (DMETAPHONE_MAIN not defined): route allocation
 * through palloc/repalloc.
 *
 * META_MALLOC(v,n,t)  assigns to v a new array of n elements of type t.
 * META_REALLOC(v,n,t) resizes the array v to n elements of type t and
 *                     assigns the result back to v.
 */
185 #define META_MALLOC(v,n,t) \
186  (v = (t*)palloc(((n)*sizeof(t))))
187 
188 #define META_REALLOC(v,n,t) \
189  (v = (t*)repalloc((v),((n)*sizeof(t))))
190 
191 /*
192  * Don't do pfree - it seems to cause a segv sometimes - which might have just
193  * been caused by reloading the module in development.
194  * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
195  * in a case like this.
196  */
197 
/* META_FREE is deliberately a no-op here; the pfree call is left commented out */
198 #define META_FREE(x) ((void)true) /* pfree((x)) */
199 #else /* not defined DMETAPHONE_MAIN */
200 
201 /* use the standard malloc library when not running in PostgreSQL */
/*
 * Same three macros as the PostgreSQL branch, mapped onto malloc, realloc
 * and free.  The allocation results are not checked by the macros
 * themselves; callers in this file follow them with assert().
 */
202 
203 #define META_MALLOC(v,n,t) \
204  (v = (t*)malloc(((n)*sizeof(t))))
205 
206 #define META_REALLOC(v,n,t) \
207  (v = (t*)realloc((v),((n)*sizeof(t))))
208 
209 #define META_FREE(x) free((x))
210 #endif /* defined DMETAPHONE_MAIN */
211 
212 
213 
214 /* this typedef was originally in the perl module's .h file */
215 
/*
 * Growable, NUL-terminated character buffer used for the working copy of
 * the input and for the two output codes.
 */
typedef struct
{
    char       *str;                    /* NUL-terminated contents */
    int         length;                 /* characters stored, excluding NUL */
    int         bufsize;                /* bytes allocated for str */
    int         free_string_on_destroy; /* if nonzero, DestroyMetaString
                                         * releases str as well */
}

metastring;
225 
226 /*
227  * remaining perl module funcs unchanged except for declaring them static
228  * and reformatting to PostgreSQL indentation and to fit in 80 cols.
229  *
230  */
231 
232 static metastring *
233 NewMetaString(char *init_str)
234 {
235  metastring *s;
236  char empty_string[] = "";
237 
238  META_MALLOC(s, 1, metastring);
239  assert(s != NULL);
240 
241  if (init_str == NULL)
242  init_str = empty_string;
243  s->length = strlen(init_str);
244  /* preallocate a bit more for potential growth */
245  s->bufsize = s->length + 7;
246 
247  META_MALLOC(s->str, s->bufsize, char);
248  assert(s->str != NULL);
249 
250  memcpy(s->str, init_str, s->length + 1);
251  s->free_string_on_destroy = 1;
252 
253  return s;
254 }
255 
256 
257 static void
259 {
260  if (s == NULL)
261  return;
262 
263  if (s->free_string_on_destroy && (s->str != NULL))
264  META_FREE(s->str);
265 
266  META_FREE(s);
267 }
268 
269 
270 static void
271 IncreaseBuffer(metastring *s, int chars_needed)
272 {
273  META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
274  assert(s->str != NULL);
275  s->bufsize = s->bufsize + chars_needed + 10;
276 }
277 
278 
279 static void
281 {
282  char *i;
283 
284  for (i = s->str; *i; i++)
285  *i = toupper((unsigned char) *i);
286 }
287 
288 
289 static int
290 IsVowel(metastring *s, int pos)
291 {
292  char c;
293 
294  if ((pos < 0) || (pos >= s->length))
295  return 0;
296 
297  c = *(s->str + pos);
298  if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
299  (c == 'U') || (c == 'Y'))
300  return 1;
301 
302  return 0;
303 }
304 
305 
306 static int
308 {
309  if ((char *) strstr(s->str, "W"))
310  return 1;
311  else if ((char *) strstr(s->str, "K"))
312  return 1;
313  else if ((char *) strstr(s->str, "CZ"))
314  return 1;
315  else if ((char *) strstr(s->str, "WITZ"))
316  return 1;
317  else
318  return 0;
319 }
320 
321 
322 static char
323 GetAt(metastring *s, int pos)
324 {
325  if ((pos < 0) || (pos >= s->length))
326  return '\0';
327 
328  return ((char) *(s->str + pos));
329 }
330 
331 
332 static void
333 SetAt(metastring *s, int pos, char c)
334 {
335  if ((pos < 0) || (pos >= s->length))
336  return;
337 
338  *(s->str + pos) = c;
339 }
340 
341 
342 /*
343  Caveats: the START value is 0 based
344 */
345 static int
346 StringAt(metastring *s, int start, int length,...)
347 {
348  char *test;
349  char *pos;
350  va_list ap;
351 
352  if ((start < 0) || (start >= s->length))
353  return 0;
354 
355  pos = (s->str + start);
356  va_start(ap, length);
357 
358  do
359  {
360  test = va_arg(ap, char *);
361  if (*test && (strncmp(pos, test, length) == 0))
362  {
363  va_end(ap);
364  return 1;
365  }
366  }
367  while (strcmp(test, "") != 0);
368 
369  va_end(ap);
370 
371  return 0;
372 }
373 
374 
375 static void
376 MetaphAdd(metastring *s, char *new_str)
377 {
378  int add_length;
379 
380  if (new_str == NULL)
381  return;
382 
383  add_length = strlen(new_str);
384  if ((s->length + add_length) > (s->bufsize - 1))
385  IncreaseBuffer(s, add_length);
386 
387  strcat(s->str, new_str);
388  s->length += add_length;
389 }
390 
391 
/*
 * Compute the primary and secondary Double Metaphone codes of str.
 *
 * str   - NUL-terminated input; it is copied, so the caller's buffer is
 *         not modified.
 * codes - array with room for two pointers; on return codes[0] points to
 *         the primary code and codes[1] to the secondary code.
 *
 * The working copy is upper-cased and padded with one trailing blank so
 * the rules can look just past the last character.  The rule loop stops
 * once both codes hold at least 4 characters or the input is used up, and
 * each code is cut to at most 4 characters before being returned.
 *
 * The two code buffers are not released here (their
 * free_string_on_destroy flag is cleared), so they remain valid for the
 * caller.
 */
392 static void
393 DoubleMetaphone(char *str, char **codes)
394 {
395  int length;
396  metastring *original;
397  metastring *primary;
398  metastring *secondary;
399  int current;
400  int last;
401 
402  current = 0;
403  /* we need the real length and last prior to padding */
404  length = strlen(str);
405  last = length - 1;
406  original = NewMetaString(str);
407  /* Pad original so we can index beyond end */
408  MetaphAdd(original, " ");
409 
410  primary = NewMetaString("");
411  secondary = NewMetaString("");
 /* the code buffers are handed back via codes[], so keep them on destroy */
412  primary->free_string_on_destroy = 0;
413  secondary->free_string_on_destroy = 0;
414 
415  MakeUpper(original);
416 
417  /* skip these when at start of word */
418  if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
419  current += 1;
420 
421  /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
422  if (GetAt(original, 0) == 'X')
423  {
424  MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
425  MetaphAdd(secondary, "S");
426  current += 1;
427  }
428 
429  /* main loop */
 /* runs until both codes have 4+ characters or the input is exhausted */
430  while ((primary->length < 4) || (secondary->length < 4))
431  {
432  if (current >= length)
433  break;
434 
435  switch (GetAt(original, current))
436  {
437  case 'A':
438  case 'E':
439  case 'I':
440  case 'O':
441  case 'U':
442  case 'Y':
443  if (current == 0)
444  {
445  /* all init vowels now map to 'A' */
446  MetaphAdd(primary, "A");
447  MetaphAdd(secondary, "A");
448  }
449  current += 1;
450  break;
451 
452  case 'B':
453 
454  /* "-mb", e.g. "dumb", already skipped over... */
455  MetaphAdd(primary, "P");
456  MetaphAdd(secondary, "P");
457 
458  if (GetAt(original, current + 1) == 'B')
459  current += 2;
460  else
461  current += 1;
462  break;
463 
464  case '\xc7': /* C with cedilla */
465  MetaphAdd(primary, "S");
466  MetaphAdd(secondary, "S");
467  current += 1;
468  break;
469 
470  case 'C':
471  /* various germanic */
472  if ((current > 1)
473  && !IsVowel(original, current - 2)
474  && StringAt(original, (current - 1), 3, "ACH", "")
475  && ((GetAt(original, current + 2) != 'I')
476  && ((GetAt(original, current + 2) != 'E')
477  || StringAt(original, (current - 2), 6, "BACHER",
478  "MACHER", ""))))
479  {
480  MetaphAdd(primary, "K");
481  MetaphAdd(secondary, "K");
482  current += 2;
483  break;
484  }
485 
486  /* special case 'caesar' */
487  if ((current == 0)
488  && StringAt(original, current, 6, "CAESAR", ""))
489  {
490  MetaphAdd(primary, "S");
491  MetaphAdd(secondary, "S");
492  current += 2;
493  break;
494  }
495 
496  /* italian 'chianti' */
497  if (StringAt(original, current, 4, "CHIA", ""))
498  {
499  MetaphAdd(primary, "K");
500  MetaphAdd(secondary, "K");
501  current += 2;
502  break;
503  }
504 
505  if (StringAt(original, current, 2, "CH", ""))
506  {
507  /* find 'michael' */
508  if ((current > 0)
509  && StringAt(original, current, 4, "CHAE", ""))
510  {
511  MetaphAdd(primary, "K");
512  MetaphAdd(secondary, "X");
513  current += 2;
514  break;
515  }
516 
517  /* greek roots e.g. 'chemistry', 'chorus' */
518  if ((current == 0)
519  && (StringAt(original, (current + 1), 5,
520  "HARAC", "HARIS", "")
521  || StringAt(original, (current + 1), 3, "HOR",
522  "HYM", "HIA", "HEM", ""))
523  && !StringAt(original, 0, 5, "CHORE", ""))
524  {
525  MetaphAdd(primary, "K");
526  MetaphAdd(secondary, "K");
527  current += 2;
528  break;
529  }
530 
531  /* germanic, greek, or otherwise 'ch' for 'kh' sound */
532  if (
533  (StringAt(original, 0, 4, "VAN ", "VON ", "")
534  || StringAt(original, 0, 3, "SCH", ""))
535  /* 'architect but not 'arch', 'orchestra', 'orchid' */
536  || StringAt(original, (current - 2), 6, "ORCHES",
537  "ARCHIT", "ORCHID", "")
538  || StringAt(original, (current + 2), 1, "T", "S",
539  "")
540  || ((StringAt(original, (current - 1), 1,
541  "A", "O", "U", "E", "")
542  || (current == 0))
543 
544  /*
545  * e.g., 'wachtler', 'wechsler', but not 'tichner'
546  */
547  && StringAt(original, (current + 2), 1, "L", "R",
548  "N", "M", "B", "H", "F", "V", "W",
549  " ", "")))
550  {
551  MetaphAdd(primary, "K");
552  MetaphAdd(secondary, "K");
553  }
554  else
555  {
556  if (current > 0)
557  {
558  if (StringAt(original, 0, 2, "MC", ""))
559  {
560  /* e.g., "McHugh" */
561  MetaphAdd(primary, "K");
562  MetaphAdd(secondary, "K");
563  }
564  else
565  {
566  MetaphAdd(primary, "X");
567  MetaphAdd(secondary, "K");
568  }
569  }
570  else
571  {
572  MetaphAdd(primary, "X");
573  MetaphAdd(secondary, "X");
574  }
575  }
576  current += 2;
577  break;
578  }
579  /* e.g, 'czerny' */
580  if (StringAt(original, current, 2, "CZ", "")
581  && !StringAt(original, (current - 2), 4, "WICZ", ""))
582  {
583  MetaphAdd(primary, "S");
584  MetaphAdd(secondary, "X");
585  current += 2;
586  break;
587  }
588 
589  /* e.g., 'focaccia' */
590  if (StringAt(original, (current + 1), 3, "CIA", ""))
591  {
592  MetaphAdd(primary, "X");
593  MetaphAdd(secondary, "X");
594  current += 3;
595  break;
596  }
597 
598  /* double 'C', but not if e.g. 'McClellan' */
599  if (StringAt(original, current, 2, "CC", "")
600  && !((current == 1) && (GetAt(original, 0) == 'M')))
601  {
602  /* 'bellocchio' but not 'bacchus' */
603  if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
604  && !StringAt(original, (current + 2), 2, "HU", ""))
605  {
606  /* 'accident', 'accede' 'succeed' */
607  if (
608  ((current == 1)
609  && (GetAt(original, current - 1) == 'A'))
610  || StringAt(original, (current - 1), 5, "UCCEE",
611  "UCCES", ""))
612  {
613  MetaphAdd(primary, "KS");
614  MetaphAdd(secondary, "KS");
615  /* 'bacci', 'bertucci', other italian */
616  }
617  else
618  {
619  MetaphAdd(primary, "X");
620  MetaphAdd(secondary, "X");
621  }
622  current += 3;
623  break;
624  }
625  else
626  { /* Pierce's rule */
627  MetaphAdd(primary, "K");
628  MetaphAdd(secondary, "K");
629  current += 2;
630  break;
631  }
632  }
633 
634  if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
635  {
636  MetaphAdd(primary, "K");
637  MetaphAdd(secondary, "K");
638  current += 2;
639  break;
640  }
641 
642  if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
643  {
644  /* italian vs. english */
645  if (StringAt
646  (original, current, 3, "CIO", "CIE", "CIA", ""))
647  {
648  MetaphAdd(primary, "S");
649  MetaphAdd(secondary, "X");
650  }
651  else
652  {
653  MetaphAdd(primary, "S");
654  MetaphAdd(secondary, "S");
655  }
656  current += 2;
657  break;
658  }
659 
660  /* else */
661  MetaphAdd(primary, "K");
662  MetaphAdd(secondary, "K");
663 
664  /* name sent in 'mac caffrey', 'mac gregor' */
665  if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
666  current += 3;
667  else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
668  && !StringAt(original, (current + 1), 2,
669  "CE", "CI", ""))
670  current += 2;
671  else
672  current += 1;
673  break;
674 
675  case 'D':
676  if (StringAt(original, current, 2, "DG", ""))
677  {
678  if (StringAt(original, (current + 2), 1,
679  "I", "E", "Y", ""))
680  {
681  /* e.g. 'edge' */
682  MetaphAdd(primary, "J");
683  MetaphAdd(secondary, "J");
684  current += 3;
685  break;
686  }
687  else
688  {
689  /* e.g. 'edgar' */
690  MetaphAdd(primary, "TK");
691  MetaphAdd(secondary, "TK");
692  current += 2;
693  break;
694  }
695  }
696 
697  if (StringAt(original, current, 2, "DT", "DD", ""))
698  {
699  MetaphAdd(primary, "T");
700  MetaphAdd(secondary, "T");
701  current += 2;
702  break;
703  }
704 
705  /* else */
706  MetaphAdd(primary, "T");
707  MetaphAdd(secondary, "T");
708  current += 1;
709  break;
710 
711  case 'F':
712  if (GetAt(original, current + 1) == 'F')
713  current += 2;
714  else
715  current += 1;
716  MetaphAdd(primary, "F");
717  MetaphAdd(secondary, "F");
718  break;
719 
720  case 'G':
721  if (GetAt(original, current + 1) == 'H')
722  {
723  if ((current > 0) && !IsVowel(original, current - 1))
724  {
725  MetaphAdd(primary, "K");
726  MetaphAdd(secondary, "K");
727  current += 2;
728  break;
729  }
730 
731  if (current < 3)
732  {
733  /* 'ghislane', ghiradelli */
734  if (current == 0)
735  {
736  if (GetAt(original, current + 2) == 'I')
737  {
738  MetaphAdd(primary, "J");
739  MetaphAdd(secondary, "J");
740  }
741  else
742  {
743  MetaphAdd(primary, "K");
744  MetaphAdd(secondary, "K");
745  }
746  current += 2;
747  break;
748  }
749  }
750 
751  /*
752  * Parker's rule (with some further refinements) - e.g.,
753  * 'hugh'
754  */
755  if (
756  ((current > 1)
757  && StringAt(original, (current - 2), 1,
758  "B", "H", "D", ""))
759  /* e.g., 'bough' */
760  || ((current > 2)
761  && StringAt(original, (current - 3), 1,
762  "B", "H", "D", ""))
763  /* e.g., 'broughton' */
764  || ((current > 3)
765  && StringAt(original, (current - 4), 1,
766  "B", "H", "")))
767  {
768  current += 2;
769  break;
770  }
771  else
772  {
773  /*
774  * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
775  * 'rough', 'tough'
776  */
777  if ((current > 2)
778  && (GetAt(original, current - 1) == 'U')
779  && StringAt(original, (current - 3), 1, "C",
780  "G", "L", "R", "T", ""))
781  {
782  MetaphAdd(primary, "F");
783  MetaphAdd(secondary, "F");
784  }
785  else if ((current > 0)
786  && GetAt(original, current - 1) != 'I')
787  {
788 
789 
790  MetaphAdd(primary, "K");
791  MetaphAdd(secondary, "K");
792  }
793 
794  current += 2;
795  break;
796  }
797  }
798 
799  if (GetAt(original, current + 1) == 'N')
800  {
801  if ((current == 1) && IsVowel(original, 0)
802  && !SlavoGermanic(original))
803  {
804  MetaphAdd(primary, "KN");
805  MetaphAdd(secondary, "N");
806  }
807  else
808  /* not e.g. 'cagney' */
809  if (!StringAt(original, (current + 2), 2, "EY", "")
810  && (GetAt(original, current + 1) != 'Y')
811  && !SlavoGermanic(original))
812  {
813  MetaphAdd(primary, "N");
814  MetaphAdd(secondary, "KN");
815  }
816  else
817  {
818  MetaphAdd(primary, "KN");
819  MetaphAdd(secondary, "KN");
820  }
821  current += 2;
822  break;
823  }
824 
825  /* 'tagliaro' */
826  if (StringAt(original, (current + 1), 2, "LI", "")
827  && !SlavoGermanic(original))
828  {
829  MetaphAdd(primary, "KL");
830  MetaphAdd(secondary, "L");
831  current += 2;
832  break;
833  }
834 
835  /* -ges-,-gep-,-gel-, -gie- at beginning */
836  if ((current == 0)
837  && ((GetAt(original, current + 1) == 'Y')
838  || StringAt(original, (current + 1), 2, "ES", "EP",
839  "EB", "EL", "EY", "IB", "IL", "IN", "IE",
840  "EI", "ER", "")))
841  {
842  MetaphAdd(primary, "K");
843  MetaphAdd(secondary, "J");
844  current += 2;
845  break;
846  }
847 
848  /* -ger-, -gy- */
849  if (
850  (StringAt(original, (current + 1), 2, "ER", "")
851  || (GetAt(original, current + 1) == 'Y'))
852  && !StringAt(original, 0, 6,
853  "DANGER", "RANGER", "MANGER", "")
854  && !StringAt(original, (current - 1), 1, "E", "I", "")
855  && !StringAt(original, (current - 1), 3, "RGY", "OGY",
856  ""))
857  {
858  MetaphAdd(primary, "K");
859  MetaphAdd(secondary, "J");
860  current += 2;
861  break;
862  }
863 
864  /* italian e.g, 'biaggi' */
865  if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
866  || StringAt(original, (current - 1), 4,
867  "AGGI", "OGGI", ""))
868  {
869  /* obvious germanic */
870  if (
871  (StringAt(original, 0, 4, "VAN ", "VON ", "")
872  || StringAt(original, 0, 3, "SCH", ""))
873  || StringAt(original, (current + 1), 2, "ET", ""))
874  {
875  MetaphAdd(primary, "K");
876  MetaphAdd(secondary, "K");
877  }
878  else
879  {
880  /* always soft if french ending */
881  if (StringAt
882  (original, (current + 1), 4, "IER ", ""))
883  {
884  MetaphAdd(primary, "J");
885  MetaphAdd(secondary, "J");
886  }
887  else
888  {
889  MetaphAdd(primary, "J");
890  MetaphAdd(secondary, "K");
891  }
892  }
893  current += 2;
894  break;
895  }
896 
897  if (GetAt(original, current + 1) == 'G')
898  current += 2;
899  else
900  current += 1;
901  MetaphAdd(primary, "K");
902  MetaphAdd(secondary, "K");
903  break;
904 
905  case 'H':
906  /* only keep if first & before vowel or btw. 2 vowels */
907  if (((current == 0) || IsVowel(original, current - 1))
908  && IsVowel(original, current + 1))
909  {
910  MetaphAdd(primary, "H");
911  MetaphAdd(secondary, "H");
912  current += 2;
913  }
914  else
915  /* also takes care of 'HH' */
916  current += 1;
917  break;
918 
919  case 'J':
920  /* obvious spanish, 'jose', 'san jacinto' */
921  if (StringAt(original, current, 4, "JOSE", "")
922  || StringAt(original, 0, 4, "SAN ", ""))
923  {
924  if (((current == 0)
925  && (GetAt(original, current + 4) == ' '))
926  || StringAt(original, 0, 4, "SAN ", ""))
927  {
928  MetaphAdd(primary, "H");
929  MetaphAdd(secondary, "H");
930  }
931  else
932  {
933  MetaphAdd(primary, "J");
934  MetaphAdd(secondary, "H");
935  }
936  current += 1;
937  break;
938  }
939 
940  if ((current == 0)
941  && !StringAt(original, current, 4, "JOSE", ""))
942  {
943  MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
944  MetaphAdd(secondary, "A");
945  }
946  else
947  {
948  /* spanish pron. of e.g. 'bajador' */
949  if (IsVowel(original, current - 1)
950  && !SlavoGermanic(original)
951  && ((GetAt(original, current + 1) == 'A')
952  || (GetAt(original, current + 1) == 'O')))
953  {
954  MetaphAdd(primary, "J");
955  MetaphAdd(secondary, "H");
956  }
957  else
958  {
959  if (current == last)
960  {
961  MetaphAdd(primary, "J");
962  MetaphAdd(secondary, "");
963  }
964  else
965  {
966  if (!StringAt(original, (current + 1), 1, "L", "T",
967  "K", "S", "N", "M", "B", "Z", "")
968  && !StringAt(original, (current - 1), 1,
969  "S", "K", "L", ""))
970  {
971  MetaphAdd(primary, "J");
972  MetaphAdd(secondary, "J");
973  }
974  }
975  }
976  }
977 
978  if (GetAt(original, current + 1) == 'J') /* it could happen! */
979  current += 2;
980  else
981  current += 1;
982  break;
983 
984  case 'K':
985  if (GetAt(original, current + 1) == 'K')
986  current += 2;
987  else
988  current += 1;
989  MetaphAdd(primary, "K");
990  MetaphAdd(secondary, "K");
991  break;
992 
993  case 'L':
994  if (GetAt(original, current + 1) == 'L')
995  {
996  /* spanish e.g. 'cabrillo', 'gallegos' */
997  if (((current == (length - 3))
998  && StringAt(original, (current - 1), 4, "ILLO",
999  "ILLA", "ALLE", ""))
1000  || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
1001  || StringAt(original, last, 1, "A", "O", ""))
1002  && StringAt(original, (current - 1), 4,
1003  "ALLE", "")))
1004  {
1005  MetaphAdd(primary, "L");
1006  MetaphAdd(secondary, "");
1007  current += 2;
1008  break;
1009  }
1010  current += 2;
1011  }
1012  else
1013  current += 1;
1014  MetaphAdd(primary, "L");
1015  MetaphAdd(secondary, "L");
1016  break;
1017 
1018  case 'M':
1019  if ((StringAt(original, (current - 1), 3, "UMB", "")
1020  && (((current + 1) == last)
1021  || StringAt(original, (current + 2), 2, "ER", "")))
1022  /* 'dumb','thumb' */
1023  || (GetAt(original, current + 1) == 'M'))
1024  current += 2;
1025  else
1026  current += 1;
1027  MetaphAdd(primary, "M");
1028  MetaphAdd(secondary, "M");
1029  break;
1030 
1031  case 'N':
1032  if (GetAt(original, current + 1) == 'N')
1033  current += 2;
1034  else
1035  current += 1;
1036  MetaphAdd(primary, "N");
1037  MetaphAdd(secondary, "N");
1038  break;
1039 
1040  case '\xd1': /* N with tilde */
1041  current += 1;
1042  MetaphAdd(primary, "N");
1043  MetaphAdd(secondary, "N");
1044  break;
1045 
1046  case 'P':
1047  if (GetAt(original, current + 1) == 'H')
1048  {
1049  MetaphAdd(primary, "F");
1050  MetaphAdd(secondary, "F");
1051  current += 2;
1052  break;
1053  }
1054 
1055  /* also account for "campbell", "raspberry" */
1056  if (StringAt(original, (current + 1), 1, "P", "B", ""))
1057  current += 2;
1058  else
1059  current += 1;
1060  MetaphAdd(primary, "P");
1061  MetaphAdd(secondary, "P");
1062  break;
1063 
1064  case 'Q':
1065  if (GetAt(original, current + 1) == 'Q')
1066  current += 2;
1067  else
1068  current += 1;
1069  MetaphAdd(primary, "K");
1070  MetaphAdd(secondary, "K");
1071  break;
1072 
1073  case 'R':
1074  /* french e.g. 'rogier', but exclude 'hochmeier' */
1075  if ((current == last)
1076  && !SlavoGermanic(original)
1077  && StringAt(original, (current - 2), 2, "IE", "")
1078  && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1079  {
1080  MetaphAdd(primary, "");
1081  MetaphAdd(secondary, "R");
1082  }
1083  else
1084  {
1085  MetaphAdd(primary, "R");
1086  MetaphAdd(secondary, "R");
1087  }
1088 
1089  if (GetAt(original, current + 1) == 'R')
1090  current += 2;
1091  else
1092  current += 1;
1093  break;
1094 
1095  case 'S':
1096  /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1097  if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1098  {
1099  current += 1;
1100  break;
1101  }
1102 
1103  /* special case 'sugar-' */
1104  if ((current == 0)
1105  && StringAt(original, current, 5, "SUGAR", ""))
1106  {
1107  MetaphAdd(primary, "X");
1108  MetaphAdd(secondary, "S");
1109  current += 1;
1110  break;
1111  }
1112 
1113  if (StringAt(original, current, 2, "SH", ""))
1114  {
1115  /* germanic */
1116  if (StringAt
1117  (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1118  "HOLZ", ""))
1119  {
1120  MetaphAdd(primary, "S");
1121  MetaphAdd(secondary, "S");
1122  }
1123  else
1124  {
1125  MetaphAdd(primary, "X");
1126  MetaphAdd(secondary, "X");
1127  }
1128  current += 2;
1129  break;
1130  }
1131 
1132  /* italian & armenian */
1133  if (StringAt(original, current, 3, "SIO", "SIA", "")
1134  || StringAt(original, current, 4, "SIAN", ""))
1135  {
1136  if (!SlavoGermanic(original))
1137  {
1138  MetaphAdd(primary, "S");
1139  MetaphAdd(secondary, "X");
1140  }
1141  else
1142  {
1143  MetaphAdd(primary, "S");
1144  MetaphAdd(secondary, "S");
1145  }
1146  current += 3;
1147  break;
1148  }
1149 
1150  /*
1151  * german & anglicisations, e.g. 'smith' match 'schmidt',
1152  * 'snider' match 'schneider' also, -sz- in slavic language
1153  * although in hungarian it is pronounced 's'
1154  */
1155  if (((current == 0)
1156  && StringAt(original, (current + 1), 1,
1157  "M", "N", "L", "W", ""))
1158  || StringAt(original, (current + 1), 1, "Z", ""))
1159  {
1160  MetaphAdd(primary, "S");
1161  MetaphAdd(secondary, "X");
1162  if (StringAt(original, (current + 1), 1, "Z", ""))
1163  current += 2;
1164  else
1165  current += 1;
1166  break;
1167  }
1168 
1169  if (StringAt(original, current, 2, "SC", ""))
1170  {
1171  /* Schlesinger's rule */
1172  if (GetAt(original, current + 2) == 'H')
1173  {
1174  /* dutch origin, e.g. 'school', 'schooner' */
1175  if (StringAt(original, (current + 3), 2,
1176  "OO", "ER", "EN",
1177  "UY", "ED", "EM", ""))
1178  {
1179  /* 'schermerhorn', 'schenker' */
1180  if (StringAt(original, (current + 3), 2,
1181  "ER", "EN", ""))
1182  {
1183  MetaphAdd(primary, "X");
1184  MetaphAdd(secondary, "SK");
1185  }
1186  else
1187  {
1188  MetaphAdd(primary, "SK");
1189  MetaphAdd(secondary, "SK");
1190  }
1191  current += 3;
1192  break;
1193  }
1194  else
1195  {
1196  if ((current == 0) && !IsVowel(original, 3)
1197  && (GetAt(original, 3) != 'W'))
1198  {
1199  MetaphAdd(primary, "X");
1200  MetaphAdd(secondary, "S");
1201  }
1202  else
1203  {
1204  MetaphAdd(primary, "X");
1205  MetaphAdd(secondary, "X");
1206  }
1207  current += 3;
1208  break;
1209  }
1210  }
1211 
1212  if (StringAt(original, (current + 2), 1,
1213  "I", "E", "Y", ""))
1214  {
1215  MetaphAdd(primary, "S");
1216  MetaphAdd(secondary, "S");
1217  current += 3;
1218  break;
1219  }
1220  /* else */
1221  MetaphAdd(primary, "SK");
1222  MetaphAdd(secondary, "SK");
1223  current += 3;
1224  break;
1225  }
1226 
1227  /* french e.g. 'resnais', 'artois' */
1228  if ((current == last)
1229  && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1230  {
1231  MetaphAdd(primary, "");
1232  MetaphAdd(secondary, "S");
1233  }
1234  else
1235  {
1236  MetaphAdd(primary, "S");
1237  MetaphAdd(secondary, "S");
1238  }
1239 
1240  if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1241  current += 2;
1242  else
1243  current += 1;
1244  break;
1245 
1246  case 'T':
1247  if (StringAt(original, current, 4, "TION", ""))
1248  {
1249  MetaphAdd(primary, "X");
1250  MetaphAdd(secondary, "X");
1251  current += 3;
1252  break;
1253  }
1254 
1255  if (StringAt(original, current, 3, "TIA", "TCH", ""))
1256  {
1257  MetaphAdd(primary, "X");
1258  MetaphAdd(secondary, "X");
1259  current += 3;
1260  break;
1261  }
1262 
1263  if (StringAt(original, current, 2, "TH", "")
1264  || StringAt(original, current, 3, "TTH", ""))
1265  {
1266  /* special case 'thomas', 'thames' or germanic */
1267  if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1268  || StringAt(original, 0, 4, "VAN ", "VON ", "")
1269  || StringAt(original, 0, 3, "SCH", ""))
1270  {
1271  MetaphAdd(primary, "T");
1272  MetaphAdd(secondary, "T");
1273  }
1274  else
1275  {
1276  MetaphAdd(primary, "0");
1277  MetaphAdd(secondary, "T");
1278  }
1279  current += 2;
1280  break;
1281  }
1282 
1283  if (StringAt(original, (current + 1), 1, "T", "D", ""))
1284  current += 2;
1285  else
1286  current += 1;
1287  MetaphAdd(primary, "T");
1288  MetaphAdd(secondary, "T");
1289  break;
1290 
1291  case 'V':
1292  if (GetAt(original, current + 1) == 'V')
1293  current += 2;
1294  else
1295  current += 1;
1296  MetaphAdd(primary, "F");
1297  MetaphAdd(secondary, "F");
1298  break;
1299 
1300  case 'W':
1301  /* can also be in middle of word */
1302  if (StringAt(original, current, 2, "WR", ""))
1303  {
1304  MetaphAdd(primary, "R");
1305  MetaphAdd(secondary, "R");
1306  current += 2;
1307  break;
1308  }
1309 
1310  if ((current == 0)
1311  && (IsVowel(original, current + 1)
1312  || StringAt(original, current, 2, "WH", "")))
1313  {
1314  /* Wasserman should match Vasserman */
1315  if (IsVowel(original, current + 1))
1316  {
1317  MetaphAdd(primary, "A");
1318  MetaphAdd(secondary, "F");
1319  }
1320  else
1321  {
1322  /* need Uomo to match Womo */
1323  MetaphAdd(primary, "A");
1324  MetaphAdd(secondary, "A");
1325  }
1326  }
1327 
1328  /* Arnow should match Arnoff */
1329  if (((current == last) && IsVowel(original, current - 1))
1330  || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1331  "OWSKI", "OWSKY", "")
1332  || StringAt(original, 0, 3, "SCH", ""))
1333  {
1334  MetaphAdd(primary, "");
1335  MetaphAdd(secondary, "F");
1336  current += 1;
1337  break;
1338  }
1339 
1340  /* polish e.g. 'filipowicz' */
1341  if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1342  {
1343  MetaphAdd(primary, "TS");
1344  MetaphAdd(secondary, "FX");
1345  current += 4;
1346  break;
1347  }
1348 
1349  /* else skip it */
1350  current += 1;
1351  break;
1352 
1353  case 'X':
1354  /* french e.g. breaux */
1355  if (!((current == last)
1356  && (StringAt(original, (current - 3), 3,
1357  "IAU", "EAU", "")
1358  || StringAt(original, (current - 2), 2,
1359  "AU", "OU", ""))))
1360  {
1361  MetaphAdd(primary, "KS");
1362  MetaphAdd(secondary, "KS");
1363  }
1364 
1365 
1366  if (StringAt(original, (current + 1), 1, "C", "X", ""))
1367  current += 2;
1368  else
1369  current += 1;
1370  break;
1371 
1372  case 'Z':
1373  /* chinese pinyin e.g. 'zhao' */
1374  if (GetAt(original, current + 1) == 'H')
1375  {
1376  MetaphAdd(primary, "J");
1377  MetaphAdd(secondary, "J");
1378  current += 2;
1379  break;
1380  }
1381  else if (StringAt(original, (current + 1), 2,
1382  "ZO", "ZI", "ZA", "")
1383  || (SlavoGermanic(original)
1384  && ((current > 0)
1385  && GetAt(original, current - 1) != 'T')))
1386  {
1387  MetaphAdd(primary, "S");
1388  MetaphAdd(secondary, "TS");
1389  }
1390  else
1391  {
1392  MetaphAdd(primary, "S");
1393  MetaphAdd(secondary, "S");
1394  }
1395 
1396  if (GetAt(original, current + 1) == 'Z')
1397  current += 2;
1398  else
1399  current += 1;
1400  break;
1401 
1402  default:
1403  current += 1;
1404  }
1405 
1406  /*
1407  * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1408  * secondary->str);
1409  */
1410  }
1411 
1412 
 /* some rules append two characters at once, so cut both codes to 4 */
1413  if (primary->length > 4)
1414  SetAt(primary, 4, '\0');
1415 
1416  if (secondary->length > 4)
1417  SetAt(secondary, 4, '\0');
1418 
 /* codes[0] receives the primary code, codes[1] the secondary code */
1419  *codes = primary->str;
1420  *++codes = secondary->str;
1421 
1422  DestroyMetaString(original);
1423  DestroyMetaString(primary);
1424  DestroyMetaString(secondary);
1425 }
1426 
1427 #ifdef DMETAPHONE_MAIN
1428 
1429 /* just for testing - not part of the perl code */
1430 
/*
 * Stand-alone test driver (only built with -DDMETAPHONE_MAIN).
 *
 * Prints "primary|secondary" for the first command-line argument.  With
 * no argument it prints nothing.  Always exits with status 0.
 */
int
main(int argc, char **argv)
{
    char       *codes[2];

    if (argc > 1)
    {
        DoubleMetaphone(argv[1], codes);
        printf("%s|%s\n", codes[0], codes[1]);
    }

    return 0;
}
1441 
1442 #endif
Datum dmetaphone_alt(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:158
int length(const List *list)
Definition: list.c:1271
#define META_FREE(x)
Definition: dmetaphone.c:198
static void test(void)
Datum dmetaphone(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:129
static metastring * NewMetaString(char *init_str)
Definition: dmetaphone.c:233
int main(int argc, char **argv)
Definition: oid2name.c:538
static char GetAt(metastring *s, int pos)
Definition: dmetaphone.c:323
#define META_REALLOC(v, n, t)
Definition: dmetaphone.c:188
PG_FUNCTION_INFO_V1(dmetaphone)
char * c
static void SetAt(metastring *s, int pos, char c)
Definition: dmetaphone.c:333
#define assert(TEST)
Definition: imath.c:37
char * str
Definition: dmetaphone.c:218
int free_string_on_destroy
Definition: dmetaphone.c:221
static void DestroyMetaString(metastring *s)
Definition: dmetaphone.c:258
uintptr_t Datum
Definition: postgres.h:374
static void MakeUpper(metastring *s)
Definition: dmetaphone.c:280
static int IsVowel(metastring *s, int pos)
Definition: dmetaphone.c:290
static int StringAt(metastring *s, int start, int length,...)
Definition: dmetaphone.c:346
static void MetaphAdd(metastring *s, char *new_str)
Definition: dmetaphone.c:376
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:314
static void IncreaseBuffer(metastring *s, int chars_needed)
Definition: dmetaphone.c:271
text * cstring_to_text(const char *s)
Definition: varlena.c:151
#define PG_ARGISNULL(n)
Definition: fmgr.h:166
#define NULL
Definition: c.h:226
char * text_to_cstring(const text *t)
Definition: varlena.c:184
#define PG_GETARG_TEXT_P(n)
Definition: fmgr.h:269
int i
void * arg
Definition: c.h:435
#define PG_FUNCTION_ARGS
Definition: fmgr.h:150
static int SlavoGermanic(metastring *s)
Definition: dmetaphone.c:307
static void DoubleMetaphone(char *, char **)
Definition: dmetaphone.c:393
#define PG_RETURN_NULL()
Definition: fmgr.h:289
#define META_MALLOC(v, n, t)
Definition: dmetaphone.c:185