PostgreSQL Source Code git master
dmetaphone.c
Go to the documentation of this file.
1/*
2 * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
3 *
4 * contrib/fuzzystrmatch/dmetaphone.c
5 *
6 * Double Metaphone computes 2 "sounds like" strings - a primary and an
7 * alternate. In most cases they are the same, but for foreign names
8 * especially they can be a bit different, depending on pronunciation.
9 *
10 * Information on using Double Metaphone can be found at
11 * http://www.codeproject.com/string/dmetaphone1.asp
12 * and the original article describing it can be found at
13 * http://drdobbs.com/184401251
14 *
15 * For PostgreSQL we provide 2 functions - one for the primary and one for
16 * the alternate. That way the functions are pure text->text mappings that
17 * are useful in functional indexes. These are 'dmetaphone' for the
18 * primary and 'dmetaphone_alt' for the alternate.
19 *
20 * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21 * functions looks like this:
22 *
23 * CREATE FUNCTION dmetaphone (text) RETURNS text
24 * LANGUAGE C IMMUTABLE STRICT
25 * AS '$libdir/dmetaphone', 'dmetaphone';
26 *
27 * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28 * LANGUAGE C IMMUTABLE STRICT
29 * AS '$libdir/dmetaphone', 'dmetaphone_alt';
30 *
31 * Note that you have to declare the functions IMMUTABLE if you want to
32 * use them in functional indexes, and you have to declare them as STRICT
33 * as they do not check for NULL input, and will segfault if given NULL input.
34 * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
35 * will never call them with NULL, but instead assume the result is NULL,
36 * which is what we (I) want.
37 *
38 * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39 * will detect NULL input and return NULL. Then you don't have to declare them
40 * as STRICT.
41 *
42 * There is a small inefficiency here - each function call actually computes
43 * both the primary and the alternate and then throws away the one it doesn't
44 * need. That's the way the perl module was written, because perl can handle
45 * a list return more easily than we can in PostgreSQL. The result has been
46 * fast enough for my needs, but it could maybe be optimized a bit to remove
47 * that behaviour.
48 *
49 */
50
51
52/***************************** COPYRIGHT NOTICES ***********************
53
54Most of this code is directly from the Text::DoubleMetaphone perl module
55version 0.05 available from https://www.cpan.org/.
56It bears this copyright notice:
57
58
59 Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60 All rights reserved.
61
62 This code is based heavily on the C++ implementation by
63 Lawrence Philips and incorporates several bug fixes courtesy
64 of Kevin Atkinson <kevina@users.sourceforge.net>.
65
66 This module is free software; you may redistribute it and/or
67 modify it under the same terms as Perl itself.
68
69The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70<andrew@dunslane.net> and is covered this copyright:
71
72 Copyright 2003, North Carolina State Highway Patrol.
73 All rights reserved.
74
75 Permission to use, copy, modify, and distribute this software and its
76 documentation for any purpose, without fee, and without a written agreement
77 is hereby granted, provided that the above copyright notice and this
78 paragraph and the following two paragraphs appear in all copies.
79
80 IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81 PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82 INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83 DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85
86 THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89 HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90 HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91 MODIFICATIONS.
92
93***********************************************************************/
94
95
96/* include these first, according to the docs */
97#ifndef DMETAPHONE_MAIN
98
99#include "postgres.h"
100
101#include "utils/builtins.h"
102
103/* turn off assertions for embedded function */
104#define NDEBUG
105
106#else /* DMETAPHONE_MAIN */
107
108/* we need these if we didn't get them from postgres.h */
109#include <stdio.h>
110#include <stdlib.h>
111#include <string.h>
112#include <stdarg.h>
113
114#endif /* DMETAPHONE_MAIN */
115
116#include <assert.h>
117#include <ctype.h>
118
119/* prototype for the main function we got from the perl module */
120static void DoubleMetaphone(char *str, char **codes);
121
122#ifndef DMETAPHONE_MAIN
123
124/*
125 * The PostgreSQL visible dmetaphone function.
126 */
127
129
130Datum
132{
133 text *arg;
134 char *aptr,
135 *codes[2],
136 *code;
137
138#ifdef DMETAPHONE_NOSTRICT
139 if (PG_ARGISNULL(0))
141#endif
143 aptr = text_to_cstring(arg);
144
145 DoubleMetaphone(aptr, codes);
146 code = codes[0];
147 if (!code)
148 code = "";
149
151}
152
153/*
154 * The PostgreSQL visible dmetaphone_alt function.
155 */
156
158
159Datum
161{
162 text *arg;
163 char *aptr,
164 *codes[2],
165 *code;
166
167#ifdef DMETAPHONE_NOSTRICT
168 if (PG_ARGISNULL(0))
170#endif
172 aptr = text_to_cstring(arg);
173
174 DoubleMetaphone(aptr, codes);
175 code = codes[1];
176 if (!code)
177 code = "";
178
180}
181
182
183/* here is where we start the code imported from the perl module */
184
185/* all memory handling is done with these macros */
186
/*
 * META_MALLOC(v,n,t): allocate room for n objects of type t with palloc()
 * and assign the resulting pointer to v.
 */
187#define META_MALLOC(v,n,t) \
188 (v = (t*)palloc(((n)*sizeof(t))))
189
/*
 * META_REALLOC(v,n,t): resize the allocation held in v to n objects of
 * type t with repalloc() and assign the resulting pointer back to v.
 */
190#define META_REALLOC(v,n,t) \
191 (v = (t*)repalloc((v),((n)*sizeof(t))))
192
193/*
194 * Don't do pfree - it seems to cause a SIGSEGV sometimes - which might have just
195 * been caused by reloading the module in development.
196 * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
197 * in a case like this.
198 *
 * META_FREE(x) therefore expands to an expression that does not touch x.
 */
199
200#define META_FREE(x) ((void)true) /* pfree((x)) */
201#else /* DMETAPHONE_MAIN is defined */
202
203/* use the standard malloc library when not running in PostgreSQL */
204
/* same contract as the palloc-based variant, implemented with malloc() */
205#define META_MALLOC(v,n,t) \
206 (v = (t*)malloc(((n)*sizeof(t))))
207
/* same contract as the repalloc-based variant, implemented with realloc() */
208#define META_REALLOC(v,n,t) \
209 (v = (t*)realloc((v),((n)*sizeof(t))))
210
/* in the standalone build memory really is released */
211#define META_FREE(x) free((x))
212#endif /* DMETAPHONE_MAIN */
213
214
215
216/* this typedef was originally in the perl module's .h file */
217
/*
 * Growable, NUL-terminated string used for the working copy of the input
 * and for the two result codes.
 */
typedef struct
{
	char	   *str;			/* NUL-terminated character buffer */
	int			length;			/* number of characters stored, excluding NUL */
	int			bufsize;		/* allocated size of str, in bytes */
	int			free_string_on_destroy; /* nonzero: DestroyMetaString frees
										 * str as well as the struct */
} metastring;
227
228/*
229 * remaining perl module funcs unchanged except for declaring them static
230 * and reformatting to PostgreSQL indentation and to fit in 80 cols.
231 *
232 */
233
234static metastring *
235NewMetaString(const char *init_str)
236{
237 metastring *s;
238 char empty_string[] = "";
239
240 META_MALLOC(s, 1, metastring);
241 assert(s != NULL);
242
243 if (init_str == NULL)
244 init_str = empty_string;
245 s->length = strlen(init_str);
246 /* preallocate a bit more for potential growth */
247 s->bufsize = s->length + 7;
248
249 META_MALLOC(s->str, s->bufsize, char);
250 assert(s->str != NULL);
251
252 memcpy(s->str, init_str, s->length + 1);
254
255 return s;
256}
257
258
259static void
261{
262 if (s == NULL)
263 return;
264
265 if (s->free_string_on_destroy && (s->str != NULL))
266 META_FREE(s->str);
267
268 META_FREE(s);
269}
270
271
272static void
273IncreaseBuffer(metastring *s, int chars_needed)
274{
275 META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
276 assert(s->str != NULL);
277 s->bufsize = s->bufsize + chars_needed + 10;
278}
279
280
281static void
283{
284 char *i;
285
286 for (i = s->str; *i; i++)
287 *i = toupper((unsigned char) *i);
288}
289
290
291static int
292IsVowel(metastring *s, int pos)
293{
294 char c;
295
296 if ((pos < 0) || (pos >= s->length))
297 return 0;
298
299 c = *(s->str + pos);
300 if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
301 (c == 'U') || (c == 'Y'))
302 return 1;
303
304 return 0;
305}
306
307
308static int
310{
311 if (strstr(s->str, "W"))
312 return 1;
313 else if (strstr(s->str, "K"))
314 return 1;
315 else if (strstr(s->str, "CZ"))
316 return 1;
317 else if (strstr(s->str, "WITZ"))
318 return 1;
319 else
320 return 0;
321}
322
323
324static char
325GetAt(metastring *s, int pos)
326{
327 if ((pos < 0) || (pos >= s->length))
328 return '\0';
329
330 return ((char) *(s->str + pos));
331}
332
333
334static void
335SetAt(metastring *s, int pos, char c)
336{
337 if ((pos < 0) || (pos >= s->length))
338 return;
339
340 *(s->str + pos) = c;
341}
342
343
344/*
345 Caveats: the START value is 0 based
346*/
347static int
348StringAt(metastring *s, int start, int length,...)
349{
350 char *test;
351 char *pos;
352 va_list ap;
353
354 if ((start < 0) || (start >= s->length))
355 return 0;
356
357 pos = (s->str + start);
358 va_start(ap, length);
359
360 do
361 {
362 test = va_arg(ap, char *);
363 if (*test && (strncmp(pos, test, length) == 0))
364 {
365 va_end(ap);
366 return 1;
367 }
368 }
369 while (strcmp(test, "") != 0);
370
371 va_end(ap);
372
373 return 0;
374}
375
376
377static void
378MetaphAdd(metastring *s, const char *new_str)
379{
380 int add_length;
381
382 if (new_str == NULL)
383 return;
384
385 add_length = strlen(new_str);
386 if ((s->length + add_length) > (s->bufsize - 1))
387 IncreaseBuffer(s, add_length);
388
389 strcat(s->str, new_str);
390 s->length += add_length;
391}
392
393
/*
 * DoubleMetaphone
 *		Compute the primary and secondary codes for str.
 *
 * str is copied, padded and upper-cased; the caller's string is not
 * modified.  The main loop walks the copy one character (or character
 * group) at a time, appending to both result strings, and stops when both
 * results hold at least four characters or the unpadded input is used up.
 * Each result is then cut to at most four characters.
 *
 * On return codes[0] points to the primary result and codes[1] to the
 * secondary result, so codes must have room for two pointers.  The result
 * buffers have free_string_on_destroy cleared, so DestroyMetaString() does
 * not release them.
 */
394static void
395DoubleMetaphone(char *str, char **codes)
396{
397 int length;
398 metastring *original;
399 metastring *primary;
400 metastring *secondary;
401 int current;
402 int last;
403
404 current = 0;
405 /* we need the real length and last prior to padding */
406 length = strlen(str);
407 last = length - 1;
408 original = NewMetaString(str);
409 /* Pad original so we can index beyond end */
410 MetaphAdd(original, " ");
411
412 primary = NewMetaString("");
413 secondary = NewMetaString("");
 /* the result buffers are handed back through codes[], so keep them */
414 primary->free_string_on_destroy = 0;
415 secondary->free_string_on_destroy = 0;
416
417 MakeUpper(original);
418
419 /* skip these when at start of word */
420 if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
421 current += 1;
422
423 /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424 if (GetAt(original, 0) == 'X')
425 {
426 MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
427 MetaphAdd(secondary, "S");
428 current += 1;
429 }
430
431 /* main loop */
432 while ((primary->length < 4) || (secondary->length < 4))
433 {
 /* length is the unpadded input length, so the padding is never scanned */
434 if (current >= length)
435 break;
436
437 switch (GetAt(original, current))
438 {
439 case 'A':
440 case 'E':
441 case 'I':
442 case 'O':
443 case 'U':
444 case 'Y':
445 if (current == 0)
446 {
447 /* all init vowels now map to 'A' */
448 MetaphAdd(primary, "A");
449 MetaphAdd(secondary, "A");
450 }
451 current += 1;
452 break;
453
454 case 'B':
455
456 /* "-mb", e.g", "dumb", already skipped over... */
457 MetaphAdd(primary, "P");
458 MetaphAdd(secondary, "P");
459
460 if (GetAt(original, current + 1) == 'B')
461 current += 2;
462 else
463 current += 1;
464 break;
465
466 case '\xc7': /* C with cedilla */
467 MetaphAdd(primary, "S");
468 MetaphAdd(secondary, "S");
469 current += 1;
470 break;
471
472 case 'C':
473 /* various germanic */
474 if ((current > 1)
475 && !IsVowel(original, current - 2)
476 && StringAt(original, (current - 1), 3, "ACH", "")
477 && ((GetAt(original, current + 2) != 'I')
478 && ((GetAt(original, current + 2) != 'E')
479 || StringAt(original, (current - 2), 6, "BACHER",
480 "MACHER", ""))))
481 {
482 MetaphAdd(primary, "K");
483 MetaphAdd(secondary, "K");
484 current += 2;
485 break;
486 }
487
488 /* special case 'caesar' */
489 if ((current == 0)
490 && StringAt(original, current, 6, "CAESAR", ""))
491 {
492 MetaphAdd(primary, "S");
493 MetaphAdd(secondary, "S");
494 current += 2;
495 break;
496 }
497
498 /* italian 'chianti' */
499 if (StringAt(original, current, 4, "CHIA", ""))
500 {
501 MetaphAdd(primary, "K");
502 MetaphAdd(secondary, "K");
503 current += 2;
504 break;
505 }
506
507 if (StringAt(original, current, 2, "CH", ""))
508 {
509 /* find 'michael' */
510 if ((current > 0)
511 && StringAt(original, current, 4, "CHAE", ""))
512 {
513 MetaphAdd(primary, "K");
514 MetaphAdd(secondary, "X");
515 current += 2;
516 break;
517 }
518
519 /* greek roots e.g. 'chemistry', 'chorus' */
520 if ((current == 0)
521 && (StringAt(original, (current + 1), 5,
522 "HARAC", "HARIS", "")
523 || StringAt(original, (current + 1), 3, "HOR",
524 "HYM", "HIA", "HEM", ""))
525 && !StringAt(original, 0, 5, "CHORE", ""))
526 {
527 MetaphAdd(primary, "K");
528 MetaphAdd(secondary, "K");
529 current += 2;
530 break;
531 }
532
533 /* germanic, greek, or otherwise 'ch' for 'kh' sound */
534 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
535 || StringAt(original, 0, 3, "SCH", ""))
536 /* 'architect but not 'arch', 'orchestra', 'orchid' */
537 || StringAt(original, (current - 2), 6, "ORCHES",
538 "ARCHIT", "ORCHID", "")
539 || StringAt(original, (current + 2), 1, "T", "S",
540 "")
541 || ((StringAt(original, (current - 1), 1,
542 "A", "O", "U", "E", "")
543 || (current == 0))
544
545 /*
546 * e.g., 'wachtler', 'wechsler', but not 'tichner'
547 */
548 && StringAt(original, (current + 2), 1, "L", "R",
549 "N", "M", "B", "H", "F", "V", "W",
550 " ", "")))
551 {
552 MetaphAdd(primary, "K");
553 MetaphAdd(secondary, "K");
554 }
555 else
556 {
557 if (current > 0)
558 {
559 if (StringAt(original, 0, 2, "MC", ""))
560 {
561 /* e.g., "McHugh" */
562 MetaphAdd(primary, "K");
563 MetaphAdd(secondary, "K");
564 }
565 else
566 {
567 MetaphAdd(primary, "X");
568 MetaphAdd(secondary, "K");
569 }
570 }
571 else
572 {
573 MetaphAdd(primary, "X");
574 MetaphAdd(secondary, "X");
575 }
576 }
577 current += 2;
578 break;
579 }
580 /* e.g, 'czerny' */
581 if (StringAt(original, current, 2, "CZ", "")
582 && !StringAt(original, (current - 2), 4, "WICZ", ""))
583 {
584 MetaphAdd(primary, "S");
585 MetaphAdd(secondary, "X");
586 current += 2;
587 break;
588 }
589
590 /* e.g., 'focaccia' */
591 if (StringAt(original, (current + 1), 3, "CIA", ""))
592 {
593 MetaphAdd(primary, "X");
594 MetaphAdd(secondary, "X");
595 current += 3;
596 break;
597 }
598
599 /* double 'C', but not if e.g. 'McClellan' */
600 if (StringAt(original, current, 2, "CC", "")
601 && !((current == 1) && (GetAt(original, 0) == 'M')))
602 {
603 /* 'bellocchio' but not 'bacchus' */
604 if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
605 && !StringAt(original, (current + 2), 2, "HU", ""))
606 {
607 /* 'accident', 'accede' 'succeed' */
608 if (((current == 1)
609 && (GetAt(original, current - 1) == 'A'))
610 || StringAt(original, (current - 1), 5, "UCCEE",
611 "UCCES", ""))
612 {
613 MetaphAdd(primary, "KS");
614 MetaphAdd(secondary, "KS");
615 /* 'bacci', 'bertucci', other italian */
616 }
617 else
618 {
619 MetaphAdd(primary, "X");
620 MetaphAdd(secondary, "X");
621 }
622 current += 3;
623 break;
624 }
625 else
626 { /* Pierce's rule */
627 MetaphAdd(primary, "K");
628 MetaphAdd(secondary, "K");
629 current += 2;
630 break;
631 }
632 }
633
634 if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
635 {
636 MetaphAdd(primary, "K");
637 MetaphAdd(secondary, "K");
638 current += 2;
639 break;
640 }
641
642 if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
643 {
644 /* italian vs. english */
645 if (StringAt
646 (original, current, 3, "CIO", "CIE", "CIA", ""))
647 {
648 MetaphAdd(primary, "S");
649 MetaphAdd(secondary, "X");
650 }
651 else
652 {
653 MetaphAdd(primary, "S");
654 MetaphAdd(secondary, "S");
655 }
656 current += 2;
657 break;
658 }
659
660 /* else */
661 MetaphAdd(primary, "K");
662 MetaphAdd(secondary, "K");
663
664 /* name sent in 'mac caffrey', 'mac gregor */
665 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
666 current += 3;
667 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
668 && !StringAt(original, (current + 1), 2,
669 "CE", "CI", ""))
670 current += 2;
671 else
672 current += 1;
673 break;
674
675 case 'D':
676 if (StringAt(original, current, 2, "DG", ""))
677 {
678 if (StringAt(original, (current + 2), 1,
679 "I", "E", "Y", ""))
680 {
681 /* e.g. 'edge' */
682 MetaphAdd(primary, "J");
683 MetaphAdd(secondary, "J");
684 current += 3;
685 break;
686 }
687 else
688 {
689 /* e.g. 'edgar' */
690 MetaphAdd(primary, "TK");
691 MetaphAdd(secondary, "TK");
692 current += 2;
693 break;
694 }
695 }
696
697 if (StringAt(original, current, 2, "DT", "DD", ""))
698 {
699 MetaphAdd(primary, "T");
700 MetaphAdd(secondary, "T");
701 current += 2;
702 break;
703 }
704
705 /* else */
706 MetaphAdd(primary, "T");
707 MetaphAdd(secondary, "T");
708 current += 1;
709 break;
710
711 case 'F':
712 if (GetAt(original, current + 1) == 'F')
713 current += 2;
714 else
715 current += 1;
716 MetaphAdd(primary, "F");
717 MetaphAdd(secondary, "F");
718 break;
719
720 case 'G':
721 if (GetAt(original, current + 1) == 'H')
722 {
723 if ((current > 0) && !IsVowel(original, current - 1))
724 {
725 MetaphAdd(primary, "K");
726 MetaphAdd(secondary, "K");
727 current += 2;
728 break;
729 }
730
 /*
 * NOTE(review): the only statement guarded by (current < 3) is the
 * (current == 0) test below, so the outer test is redundant.
 */
731 if (current < 3)
732 {
733 /* 'ghislane', ghiradelli */
734 if (current == 0)
735 {
736 if (GetAt(original, current + 2) == 'I')
737 {
738 MetaphAdd(primary, "J");
739 MetaphAdd(secondary, "J");
740 }
741 else
742 {
743 MetaphAdd(primary, "K");
744 MetaphAdd(secondary, "K");
745 }
746 current += 2;
747 break;
748 }
749 }
750
751 /*
752 * Parker's rule (with some further refinements) - e.g.,
753 * 'hugh'
754 */
755 if (((current > 1)
756 && StringAt(original, (current - 2), 1,
757 "B", "H", "D", ""))
758 /* e.g., 'bough' */
759 || ((current > 2)
760 && StringAt(original, (current - 3), 1,
761 "B", "H", "D", ""))
762 /* e.g., 'broughton' */
763 || ((current > 3)
764 && StringAt(original, (current - 4), 1,
765 "B", "H", "")))
766 {
767 current += 2;
768 break;
769 }
770 else
771 {
772 /*
773 * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
774 * 'rough', 'tough'
775 */
776 if ((current > 2)
777 && (GetAt(original, current - 1) == 'U')
778 && StringAt(original, (current - 3), 1, "C",
779 "G", "L", "R", "T", ""))
780 {
781 MetaphAdd(primary, "F");
782 MetaphAdd(secondary, "F");
783 }
784 else if ((current > 0)
785 && GetAt(original, current - 1) != 'I')
786 {
787
788
789 MetaphAdd(primary, "K");
790 MetaphAdd(secondary, "K");
791 }
792
793 current += 2;
794 break;
795 }
796 }
797
798 if (GetAt(original, current + 1) == 'N')
799 {
800 if ((current == 1) && IsVowel(original, 0)
801 && !SlavoGermanic(original))
802 {
803 MetaphAdd(primary, "KN");
804 MetaphAdd(secondary, "N");
805 }
806 else
807 /* not e.g. 'cagney' */
 /*
 * NOTE(review): the character at current + 1 is known to be 'N' here,
 * so the "!= 'Y'" test below is always true.  Left as is, since
 * changing it would change the generated codes.
 */
808 if (!StringAt(original, (current + 2), 2, "EY", "")
809 && (GetAt(original, current + 1) != 'Y')
810 && !SlavoGermanic(original))
811 {
812 MetaphAdd(primary, "N");
813 MetaphAdd(secondary, "KN");
814 }
815 else
816 {
817 MetaphAdd(primary, "KN");
818 MetaphAdd(secondary, "KN");
819 }
820 current += 2;
821 break;
822 }
823
824 /* 'tagliaro' */
825 if (StringAt(original, (current + 1), 2, "LI", "")
826 && !SlavoGermanic(original))
827 {
828 MetaphAdd(primary, "KL");
829 MetaphAdd(secondary, "L");
830 current += 2;
831 break;
832 }
833
834 /* -ges-,-gep-,-gel-, -gie- at beginning */
835 if ((current == 0)
836 && ((GetAt(original, current + 1) == 'Y')
837 || StringAt(original, (current + 1), 2, "ES", "EP",
838 "EB", "EL", "EY", "IB", "IL", "IN", "IE",
839 "EI", "ER", "")))
840 {
841 MetaphAdd(primary, "K");
842 MetaphAdd(secondary, "J");
843 current += 2;
844 break;
845 }
846
847 /* -ger-, -gy- */
848 if ((StringAt(original, (current + 1), 2, "ER", "")
849 || (GetAt(original, current + 1) == 'Y'))
850 && !StringAt(original, 0, 6,
851 "DANGER", "RANGER", "MANGER", "")
852 && !StringAt(original, (current - 1), 1, "E", "I", "")
853 && !StringAt(original, (current - 1), 3, "RGY", "OGY", ""))
854 {
855 MetaphAdd(primary, "K");
856 MetaphAdd(secondary, "J");
857 current += 2;
858 break;
859 }
860
861 /* italian e.g, 'biaggi' */
862 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
863 || StringAt(original, (current - 1), 4,
864 "AGGI", "OGGI", ""))
865 {
866 /* obvious germanic */
867 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
868 || StringAt(original, 0, 3, "SCH", ""))
869 || StringAt(original, (current + 1), 2, "ET", ""))
870 {
871 MetaphAdd(primary, "K");
872 MetaphAdd(secondary, "K");
873 }
874 else
875 {
876 /* always soft if french ending */
877 if (StringAt
878 (original, (current + 1), 4, "IER ", ""))
879 {
880 MetaphAdd(primary, "J");
881 MetaphAdd(secondary, "J");
882 }
883 else
884 {
885 MetaphAdd(primary, "J");
886 MetaphAdd(secondary, "K");
887 }
888 }
889 current += 2;
890 break;
891 }
892
893 if (GetAt(original, current + 1) == 'G')
894 current += 2;
895 else
896 current += 1;
897 MetaphAdd(primary, "K");
898 MetaphAdd(secondary, "K");
899 break;
900
901 case 'H':
902 /* only keep if first & before vowel or btw. 2 vowels */
903 if (((current == 0) || IsVowel(original, current - 1))
904 && IsVowel(original, current + 1))
905 {
906 MetaphAdd(primary, "H");
907 MetaphAdd(secondary, "H");
908 current += 2;
909 }
910 else
911 /* also takes care of 'HH' */
912 current += 1;
913 break;
914
915 case 'J':
916 /* obvious spanish, 'jose', 'san jacinto' */
917 if (StringAt(original, current, 4, "JOSE", "")
918 || StringAt(original, 0, 4, "SAN ", ""))
919 {
920 if (((current == 0)
921 && (GetAt(original, current + 4) == ' '))
922 || StringAt(original, 0, 4, "SAN ", ""))
923 {
924 MetaphAdd(primary, "H");
925 MetaphAdd(secondary, "H");
926 }
927 else
928 {
929 MetaphAdd(primary, "J");
930 MetaphAdd(secondary, "H");
931 }
932 current += 1;
933 break;
934 }
935
936 if ((current == 0)
937 && !StringAt(original, current, 4, "JOSE", ""))
938 {
939 MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
940 MetaphAdd(secondary, "A");
941 }
942 else
943 {
944 /* spanish pron. of e.g. 'bajador' */
945 if (IsVowel(original, current - 1)
946 && !SlavoGermanic(original)
947 && ((GetAt(original, current + 1) == 'A')
948 || (GetAt(original, current + 1) == 'O')))
949 {
950 MetaphAdd(primary, "J");
951 MetaphAdd(secondary, "H");
952 }
953 else
954 {
955 if (current == last)
956 {
957 MetaphAdd(primary, "J");
958 MetaphAdd(secondary, "");
959 }
960 else
961 {
962 if (!StringAt(original, (current + 1), 1, "L", "T",
963 "K", "S", "N", "M", "B", "Z", "")
964 && !StringAt(original, (current - 1), 1,
965 "S", "K", "L", ""))
966 {
967 MetaphAdd(primary, "J");
968 MetaphAdd(secondary, "J");
969 }
970 }
971 }
972 }
973
974 if (GetAt(original, current + 1) == 'J') /* it could happen! */
975 current += 2;
976 else
977 current += 1;
978 break;
979
980 case 'K':
981 if (GetAt(original, current + 1) == 'K')
982 current += 2;
983 else
984 current += 1;
985 MetaphAdd(primary, "K");
986 MetaphAdd(secondary, "K");
987 break;
988
989 case 'L':
990 if (GetAt(original, current + 1) == 'L')
991 {
992 /* spanish e.g. 'cabrillo', 'gallegos' */
993 if (((current == (length - 3))
994 && StringAt(original, (current - 1), 4, "ILLO",
995 "ILLA", "ALLE", ""))
996 || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
997 || StringAt(original, last, 1, "A", "O", ""))
998 && StringAt(original, (current - 1), 4,
999 "ALLE", "")))
1000 {
1001 MetaphAdd(primary, "L");
1002 MetaphAdd(secondary, "");
1003 current += 2;
1004 break;
1005 }
1006 current += 2;
1007 }
1008 else
1009 current += 1;
1010 MetaphAdd(primary, "L");
1011 MetaphAdd(secondary, "L");
1012 break;
1013
1014 case 'M':
1015 if ((StringAt(original, (current - 1), 3, "UMB", "")
1016 && (((current + 1) == last)
1017 || StringAt(original, (current + 2), 2, "ER", "")))
1018 /* 'dumb','thumb' */
1019 || (GetAt(original, current + 1) == 'M'))
1020 current += 2;
1021 else
1022 current += 1;
1023 MetaphAdd(primary, "M");
1024 MetaphAdd(secondary, "M");
1025 break;
1026
1027 case 'N':
1028 if (GetAt(original, current + 1) == 'N')
1029 current += 2;
1030 else
1031 current += 1;
1032 MetaphAdd(primary, "N");
1033 MetaphAdd(secondary, "N");
1034 break;
1035
1036 case '\xd1': /* N with tilde */
1037 current += 1;
1038 MetaphAdd(primary, "N");
1039 MetaphAdd(secondary, "N");
1040 break;
1041
1042 case 'P':
1043 if (GetAt(original, current + 1) == 'H')
1044 {
1045 MetaphAdd(primary, "F");
1046 MetaphAdd(secondary, "F");
1047 current += 2;
1048 break;
1049 }
1050
1051 /* also account for "campbell", "raspberry" */
1052 if (StringAt(original, (current + 1), 1, "P", "B", ""))
1053 current += 2;
1054 else
1055 current += 1;
1056 MetaphAdd(primary, "P");
1057 MetaphAdd(secondary, "P");
1058 break;
1059
1060 case 'Q':
1061 if (GetAt(original, current + 1) == 'Q')
1062 current += 2;
1063 else
1064 current += 1;
1065 MetaphAdd(primary, "K");
1066 MetaphAdd(secondary, "K");
1067 break;
1068
1069 case 'R':
1070 /* french e.g. 'rogier', but exclude 'hochmeier' */
1071 if ((current == last)
1072 && !SlavoGermanic(original)
1073 && StringAt(original, (current - 2), 2, "IE", "")
1074 && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1075 {
1076 MetaphAdd(primary, "");
1077 MetaphAdd(secondary, "R");
1078 }
1079 else
1080 {
1081 MetaphAdd(primary, "R");
1082 MetaphAdd(secondary, "R");
1083 }
1084
1085 if (GetAt(original, current + 1) == 'R')
1086 current += 2;
1087 else
1088 current += 1;
1089 break;
1090
1091 case 'S':
1092 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1093 if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1094 {
1095 current += 1;
1096 break;
1097 }
1098
1099 /* special case 'sugar-' */
1100 if ((current == 0)
1101 && StringAt(original, current, 5, "SUGAR", ""))
1102 {
1103 MetaphAdd(primary, "X");
1104 MetaphAdd(secondary, "S");
1105 current += 1;
1106 break;
1107 }
1108
1109 if (StringAt(original, current, 2, "SH", ""))
1110 {
1111 /* germanic */
1112 if (StringAt
1113 (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1114 "HOLZ", ""))
1115 {
1116 MetaphAdd(primary, "S");
1117 MetaphAdd(secondary, "S");
1118 }
1119 else
1120 {
1121 MetaphAdd(primary, "X");
1122 MetaphAdd(secondary, "X");
1123 }
1124 current += 2;
1125 break;
1126 }
1127
1128 /* italian & armenian */
1129 if (StringAt(original, current, 3, "SIO", "SIA", "")
1130 || StringAt(original, current, 4, "SIAN", ""))
1131 {
1132 if (!SlavoGermanic(original))
1133 {
1134 MetaphAdd(primary, "S");
1135 MetaphAdd(secondary, "X");
1136 }
1137 else
1138 {
1139 MetaphAdd(primary, "S");
1140 MetaphAdd(secondary, "S");
1141 }
1142 current += 3;
1143 break;
1144 }
1145
1146 /*
1147 * german & anglicisations, e.g. 'smith' match 'schmidt',
1148 * 'snider' match 'schneider' also, -sz- in slavic language
1149 * although in hungarian it is pronounced 's'
1150 */
1151 if (((current == 0)
1152 && StringAt(original, (current + 1), 1,
1153 "M", "N", "L", "W", ""))
1154 || StringAt(original, (current + 1), 1, "Z", ""))
1155 {
1156 MetaphAdd(primary, "S");
1157 MetaphAdd(secondary, "X");
1158 if (StringAt(original, (current + 1), 1, "Z", ""))
1159 current += 2;
1160 else
1161 current += 1;
1162 break;
1163 }
1164
1165 if (StringAt(original, current, 2, "SC", ""))
1166 {
1167 /* Schlesinger's rule */
1168 if (GetAt(original, current + 2) == 'H')
1169 {
1170 /* dutch origin, e.g. 'school', 'schooner' */
1171 if (StringAt(original, (current + 3), 2,
1172 "OO", "ER", "EN",
1173 "UY", "ED", "EM", ""))
1174 {
1175 /* 'schermerhorn', 'schenker' */
1176 if (StringAt(original, (current + 3), 2,
1177 "ER", "EN", ""))
1178 {
1179 MetaphAdd(primary, "X");
1180 MetaphAdd(secondary, "SK");
1181 }
1182 else
1183 {
1184 MetaphAdd(primary, "SK");
1185 MetaphAdd(secondary, "SK");
1186 }
1187 current += 3;
1188 break;
1189 }
1190 else
1191 {
1192 if ((current == 0) && !IsVowel(original, 3)
1193 && (GetAt(original, 3) != 'W'))
1194 {
1195 MetaphAdd(primary, "X");
1196 MetaphAdd(secondary, "S");
1197 }
1198 else
1199 {
1200 MetaphAdd(primary, "X");
1201 MetaphAdd(secondary, "X");
1202 }
1203 current += 3;
1204 break;
1205 }
1206 }
1207
1208 if (StringAt(original, (current + 2), 1,
1209 "I", "E", "Y", ""))
1210 {
1211 MetaphAdd(primary, "S");
1212 MetaphAdd(secondary, "S");
1213 current += 3;
1214 break;
1215 }
1216 /* else */
1217 MetaphAdd(primary, "SK");
1218 MetaphAdd(secondary, "SK");
1219 current += 3;
1220 break;
1221 }
1222
1223 /* french e.g. 'resnais', 'artois' */
1224 if ((current == last)
1225 && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1226 {
1227 MetaphAdd(primary, "");
1228 MetaphAdd(secondary, "S");
1229 }
1230 else
1231 {
1232 MetaphAdd(primary, "S");
1233 MetaphAdd(secondary, "S");
1234 }
1235
1236 if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1237 current += 2;
1238 else
1239 current += 1;
1240 break;
1241
1242 case 'T':
1243 if (StringAt(original, current, 4, "TION", ""))
1244 {
1245 MetaphAdd(primary, "X");
1246 MetaphAdd(secondary, "X");
1247 current += 3;
1248 break;
1249 }
1250
1251 if (StringAt(original, current, 3, "TIA", "TCH", ""))
1252 {
1253 MetaphAdd(primary, "X");
1254 MetaphAdd(secondary, "X");
1255 current += 3;
1256 break;
1257 }
1258
1259 if (StringAt(original, current, 2, "TH", "")
1260 || StringAt(original, current, 3, "TTH", ""))
1261 {
1262 /* special case 'thomas', 'thames' or germanic */
1263 if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1264 || StringAt(original, 0, 4, "VAN ", "VON ", "")
1265 || StringAt(original, 0, 3, "SCH", ""))
1266 {
1267 MetaphAdd(primary, "T");
1268 MetaphAdd(secondary, "T");
1269 }
1270 else
1271 {
1272 MetaphAdd(primary, "0");
1273 MetaphAdd(secondary, "T");
1274 }
1275 current += 2;
1276 break;
1277 }
1278
1279 if (StringAt(original, (current + 1), 1, "T", "D", ""))
1280 current += 2;
1281 else
1282 current += 1;
1283 MetaphAdd(primary, "T");
1284 MetaphAdd(secondary, "T");
1285 break;
1286
1287 case 'V':
1288 if (GetAt(original, current + 1) == 'V')
1289 current += 2;
1290 else
1291 current += 1;
1292 MetaphAdd(primary, "F");
1293 MetaphAdd(secondary, "F");
1294 break;
1295
1296 case 'W':
1297 /* can also be in middle of word */
1298 if (StringAt(original, current, 2, "WR", ""))
1299 {
1300 MetaphAdd(primary, "R");
1301 MetaphAdd(secondary, "R");
1302 current += 2;
1303 break;
1304 }
1305
 /*
 * This branch neither advances current nor leaves the switch, so the
 * tests that follow are still applied to the same 'W'.
 */
1306 if ((current == 0)
1307 && (IsVowel(original, current + 1)
1308 || StringAt(original, current, 2, "WH", "")))
1309 {
1310 /* Wasserman should match Vasserman */
1311 if (IsVowel(original, current + 1))
1312 {
1313 MetaphAdd(primary, "A");
1314 MetaphAdd(secondary, "F");
1315 }
1316 else
1317 {
1318 /* need Uomo to match Womo */
1319 MetaphAdd(primary, "A");
1320 MetaphAdd(secondary, "A");
1321 }
1322 }
1323
1324 /* Arnow should match Arnoff */
1325 if (((current == last) && IsVowel(original, current - 1))
1326 || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1327 "OWSKI", "OWSKY", "")
1328 || StringAt(original, 0, 3, "SCH", ""))
1329 {
1330 MetaphAdd(primary, "");
1331 MetaphAdd(secondary, "F");
1332 current += 1;
1333 break;
1334 }
1335
1336 /* polish e.g. 'filipowicz' */
1337 if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1338 {
1339 MetaphAdd(primary, "TS");
1340 MetaphAdd(secondary, "FX");
1341 current += 4;
1342 break;
1343 }
1344
1345 /* else skip it */
1346 current += 1;
1347 break;
1348
1349 case 'X':
1350 /* french e.g. breaux */
1351 if (!((current == last)
1352 && (StringAt(original, (current - 3), 3,
1353 "IAU", "EAU", "")
1354 || StringAt(original, (current - 2), 2,
1355 "AU", "OU", ""))))
1356 {
1357 MetaphAdd(primary, "KS");
1358 MetaphAdd(secondary, "KS");
1359 }
1360
1361
1362 if (StringAt(original, (current + 1), 1, "C", "X", ""))
1363 current += 2;
1364 else
1365 current += 1;
1366 break;
1367
1368 case 'Z':
1369 /* chinese pinyin e.g. 'zhao' */
1370 if (GetAt(original, current + 1) == 'H')
1371 {
1372 MetaphAdd(primary, "J");
1373 MetaphAdd(secondary, "J");
1374 current += 2;
1375 break;
1376 }
1377 else if (StringAt(original, (current + 1), 2,
1378 "ZO", "ZI", "ZA", "")
1379 || (SlavoGermanic(original)
1380 && ((current > 0)
1381 && GetAt(original, current - 1) != 'T')))
1382 {
1383 MetaphAdd(primary, "S");
1384 MetaphAdd(secondary, "TS");
1385 }
1386 else
1387 {
1388 MetaphAdd(primary, "S");
1389 MetaphAdd(secondary, "S");
1390 }
1391
1392 if (GetAt(original, current + 1) == 'Z')
1393 current += 2;
1394 else
1395 current += 1;
1396 break;
1397
1398 default:
 /* any other character contributes nothing to either code */
1399 current += 1;
1400 }
1401
1402 /*
1403 * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1404 * secondary->str);
1405 */
1406 }
1407
1408
 /*
 * Cut both results to four characters.  SetAt() only stores the
 * terminator; the length fields keep their old values.
 */
1409 if (primary->length > 4)
1410 SetAt(primary, 4, '\0');
1411
1412 if (secondary->length > 4)
1413 SetAt(secondary, 4, '\0');
1414
 /* codes[0] receives the primary result, codes[1] the secondary one */
1415 *codes = primary->str;
1416 *++codes = secondary->str;
1417
1418 DestroyMetaString(original);
1419 DestroyMetaString(primary);
1420 DestroyMetaString(secondary);
1421}
1422
1423#ifdef DMETAPHONE_MAIN
1424
1425/* just for testing - not part of the perl code */
1426
/*
 * Stand-alone test driver, built only when DMETAPHONE_MAIN is defined.
 *
 * Prints "primary|secondary" for the first command-line argument and does
 * nothing when no argument is given.  Always returns 0.
 */
int
main(int argc, char **argv)
{
	char	   *codes[2];

	if (argc > 1)
	{
		DoubleMetaphone(argv[1], codes);
		printf("%s|%s\n", codes[0], codes[1]);
	}

	return 0;
}
1437
1438#endif
Datum dmetaphone(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:131
static char GetAt(metastring *s, int pos)
Definition: dmetaphone.c:325
static void SetAt(metastring *s, int pos, char c)
Definition: dmetaphone.c:335
static void MetaphAdd(metastring *s, const char *new_str)
Definition: dmetaphone.c:378
static int SlavoGermanic(metastring *s)
Definition: dmetaphone.c:309
PG_FUNCTION_INFO_V1(dmetaphone)
static void MakeUpper(metastring *s)
Definition: dmetaphone.c:282
static void IncreaseBuffer(metastring *s, int chars_needed)
Definition: dmetaphone.c:273
static int StringAt(metastring *s, int start, int length,...)
Definition: dmetaphone.c:348
static int IsVowel(metastring *s, int pos)
Definition: dmetaphone.c:292
#define META_FREE(x)
Definition: dmetaphone.c:200
Datum dmetaphone_alt(PG_FUNCTION_ARGS)
Definition: dmetaphone.c:160
#define META_REALLOC(v, n, t)
Definition: dmetaphone.c:190
static void DestroyMetaString(metastring *s)
Definition: dmetaphone.c:260
static void DoubleMetaphone(char *str, char **codes)
Definition: dmetaphone.c:395
#define META_MALLOC(v, n, t)
Definition: dmetaphone.c:187
static metastring * NewMetaString(const char *init_str)
Definition: dmetaphone.c:235
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:309
#define PG_ARGISNULL(n)
Definition: fmgr.h:209
#define PG_RETURN_NULL()
Definition: fmgr.h:345
#define PG_RETURN_TEXT_P(x)
Definition: fmgr.h:372
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
return str start
const char * str
int i
Definition: isn.c:74
int main(int argc, char **argv)
Definition: oid2name.c:583
void * arg
#define printf(...)
Definition: port.h:245
uintptr_t Datum
Definition: postgres.h:69
static void test(void)
char * c
#define assert(x)
Definition: regcustom.h:56
char * str
Definition: dmetaphone.c:220
int free_string_on_destroy
Definition: dmetaphone.c:223
Definition: c.h:658
text * cstring_to_text(const char *s)
Definition: varlena.c:192
char * text_to_cstring(const text *t)
Definition: varlena.c:225