PostgreSQL Source Code git master
Loading...
Searching...
No Matches
dmetaphone.c
Go to the documentation of this file.
1/*
2 * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
3 *
4 * contrib/fuzzystrmatch/dmetaphone.c
5 *
6 * Double Metaphone computes 2 "sounds like" strings - a primary and an
7 * alternate. In most cases they are the same, but for foreign names
8 * especially they can be a bit different, depending on pronunciation.
9 *
10 * Information on using Double Metaphone can be found at
11 * http://www.codeproject.com/string/dmetaphone1.asp
12 * and the original article describing it can be found at
13 * http://drdobbs.com/184401251
14 *
15 * For PostgreSQL we provide 2 functions - one for the primary and one for
16 * the alternate. That way the functions are pure text->text mappings that
17 * are useful in functional indexes. These are 'dmetaphone' for the
18 * primary and 'dmetaphone_alt' for the alternate.
19 *
20 * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21 * functions looks like this:
22 *
23 * CREATE FUNCTION dmetaphone (text) RETURNS text
24 * LANGUAGE C IMMUTABLE STRICT
25 * AS '$libdir/dmetaphone', 'dmetaphone';
26 *
27 * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28 * LANGUAGE C IMMUTABLE STRICT
29 * AS '$libdir/dmetaphone', 'dmetaphone_alt';
30 *
31 * Note that you have to declare the functions IMMUTABLE if you want to
32 * use them in functional indexes, and you have to declare them as STRICT
33 * as they do not check for NULL input, and will segfault if given NULL input.
34 * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
35 * will never call them with NULL, but instead assume the result is NULL,
36 * which is what we (I) want.
37 *
38 * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39 * will detect NULL input and return NULL. Then you don't have to declare them
40 * as STRICT.
41 *
42 * There is a small inefficiency here - each function call actually computes
43 * both the primary and the alternate and then throws away the one it doesn't
44 * need. That's the way the perl module was written, because perl can handle
45 * a list return more easily than we can in PostgreSQL. The result has been
46 * fast enough for my needs, but it could maybe be optimized a bit to remove
47 * that behaviour.
48 *
49 */
50
51
52/***************************** COPYRIGHT NOTICES ***********************
53
54Most of this code is directly from the Text::DoubleMetaphone perl module
55version 0.05 available from https://www.cpan.org/.
56It bears this copyright notice:
57
58
59 Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60 All rights reserved.
61
62 This code is based heavily on the C++ implementation by
63 Lawrence Philips and incorporates several bug fixes courtesy
64 of Kevin Atkinson <kevina@users.sourceforge.net>.
65
66 This module is free software; you may redistribute it and/or
67 modify it under the same terms as Perl itself.
68
69The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70<andrew@dunslane.net> and is covered this copyright:
71
72 Copyright 2003, North Carolina State Highway Patrol.
73 All rights reserved.
74
75 Permission to use, copy, modify, and distribute this software and its
76 documentation for any purpose, without fee, and without a written agreement
77 is hereby granted, provided that the above copyright notice and this
78 paragraph and the following two paragraphs appear in all copies.
79
80 IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81 PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82 INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83 DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85
86 THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89 HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90 HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91 MODIFICATIONS.
92
93***********************************************************************/
94
95
96/* include these first, according to the docs */
97#ifndef DMETAPHONE_MAIN
98
99#include "postgres.h"
100
101#include "utils/builtins.h"
102#include "utils/formatting.h"
103
104/* turn off assertions for embedded function */
105#define NDEBUG
106
107#else /* DMETAPHONE_MAIN */
108
109/* we need these if we didn't get them from postgres.h */
110#include <stdio.h>
111#include <stdlib.h>
112#include <string.h>
113#include <stdarg.h>
114
115#endif /* DMETAPHONE_MAIN */
116
117#include <assert.h>
118#include <ctype.h>
119
120/* prototype for the main function we got from the perl module */
121static void DoubleMetaphone(const char *str, Oid collid, char **codes);
122
123#ifndef DMETAPHONE_MAIN
124
125/*
126 * The PostgreSQL visible dmetaphone function.
127 */
128
130
131Datum
133{
134 text *arg;
135 char *aptr,
136 *codes[2],
137 *code;
138
139#ifdef DMETAPHONE_NOSTRICT
140 if (PG_ARGISNULL(0))
142#endif
145
147 code = codes[0];
148 if (!code)
149 code = "";
150
152}
153
154/*
155 * The PostgreSQL visible dmetaphone_alt function.
156 */
157
159
160Datum
162{
163 text *arg;
164 char *aptr,
165 *codes[2],
166 *code;
167
168#ifdef DMETAPHONE_NOSTRICT
169 if (PG_ARGISNULL(0))
171#endif
174
176 code = codes[1];
177 if (!code)
178 code = "";
179
181}
182
183
184/* here is where we start the code imported from the perl module */
185
/*
 * All memory handling goes through the three macros below.
 *
 * META_MALLOC(v,n,t)	assigns to v a palloc'd array of n elements of type t.
 * META_REALLOC(v,n,t)	resizes v with repalloc to n elements of type t and
 *						stores the result back in v.
 * META_FREE(x)			is deliberately a no-op in this build; see below.
 */

#define META_MALLOC(v,n,t) \
	((v) = (t *) palloc((n) * sizeof(t)))

#define META_REALLOC(v,n,t) \
	((v) = (t *) repalloc((v), (n) * sizeof(t)))

/*
 * pfree is intentionally not called: it seemed to cause a SIGSEGV now and
 * then, which may only have been a side effect of reloading the module
 * during development.  Cleanup is left to the memory context instead; Tom
 * Lane says an explicit pfree shouldn't be necessary in a case like this.
 */

#define META_FREE(x) ((void)true)	/* pfree((x)) */
202#else /* not defined DMETAPHONE_MAIN */
203
/*
 * Outside PostgreSQL the same three macros map onto the standard C
 * allocator: malloc, realloc and free.
 */

#define META_MALLOC(v,n,t) \
	((v) = (t *) malloc((n) * sizeof(t)))

#define META_REALLOC(v,n,t) \
	((v) = (t *) realloc((v), (n) * sizeof(t)))

#define META_FREE(x) free((x))
213#endif /* defined DMETAPHONE_MAIN */
214
215
216
217/* this typedef was originally in the perl module's .h file */
218
219typedef struct
220{
221 char *str;
225}
226
228
229/*
230 * remaining perl module funcs unchanged except for declaring them static
231 * and reformatting to PostgreSQL indentation and to fit in 80 cols.
232 *
233 */
234
235static metastring *
237{
238 metastring *s;
239 char empty_string[] = "";
240
241 META_MALLOC(s, 1, metastring);
242 assert(s != NULL);
243
244 if (init_str == NULL)
246 s->length = strlen(init_str);
247 /* preallocate a bit more for potential growth */
248 s->bufsize = s->length + 7;
249
250 META_MALLOC(s->str, s->bufsize, char);
251 assert(s->str != NULL);
252
253 memcpy(s->str, init_str, s->length + 1);
255
256 return s;
257}
258
259
260static void
262{
263 if (s == NULL)
264 return;
265
266 if (s->free_string_on_destroy && (s->str != NULL))
267 META_FREE(s->str);
268
269 META_FREE(s);
270}
271
272
273static void
275{
276 META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
277 assert(s->str != NULL);
278 s->bufsize = s->bufsize + chars_needed + 10;
279}
280
281
282static metastring *
284{
285 char *newstr;
287
291
292 return newms;
293}
294
295
296static int
297IsVowel(metastring *s, int pos)
298{
299 char c;
300
301 if ((pos < 0) || (pos >= s->length))
302 return 0;
303
304 c = *(s->str + pos);
305 if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
306 (c == 'U') || (c == 'Y'))
307 return 1;
308
309 return 0;
310}
311
312
313static int
315{
316 if (strstr(s->str, "W"))
317 return 1;
318 else if (strstr(s->str, "K"))
319 return 1;
320 else if (strstr(s->str, "CZ"))
321 return 1;
322 else if (strstr(s->str, "WITZ"))
323 return 1;
324 else
325 return 0;
326}
327
328
329static char
330GetAt(metastring *s, int pos)
331{
332 if ((pos < 0) || (pos >= s->length))
333 return '\0';
334
335 return *(s->str + pos);
336}
337
338
339static void
340SetAt(metastring *s, int pos, char c)
341{
342 if ((pos < 0) || (pos >= s->length))
343 return;
344
345 *(s->str + pos) = c;
346}
347
348
349/*
350 Caveats: the START value is 0 based
351*/
352static int
353StringAt(metastring *s, int start, int length,...)
354{
355 char *test;
356 char *pos;
357 va_list ap;
358
359 if ((start < 0) || (start >= s->length))
360 return 0;
361
362 pos = (s->str + start);
363 va_start(ap, length);
364
365 do
366 {
367 test = va_arg(ap, char *);
368 if (*test && (strncmp(pos, test, length) == 0))
369 {
370 va_end(ap);
371 return 1;
372 }
373 }
374 while (strcmp(test, "") != 0);
375
376 va_end(ap);
377
378 return 0;
379}
380
381
382static void
384{
385 int add_length;
386
387 if (new_str == NULL)
388 return;
389
391 if ((s->length + add_length) > (s->bufsize - 1))
393
394 strcat(s->str, new_str);
395 s->length += add_length;
396}
397
398
399static void
400DoubleMetaphone(const char *str, Oid collid, char **codes)
401{
402 int length;
403 metastring *original;
404 metastring *primary;
406 int current;
407 int last;
408
409 current = 0;
410 /* we need the real length and last prior to padding */
411 length = strlen(str);
412 last = length - 1;
413 original = NewMetaString(str);
414 /* Pad original so we can index beyond end */
415 MetaphAdd(original, " ");
416
417 primary = NewMetaString("");
419 primary->free_string_on_destroy = 0;
420 secondary->free_string_on_destroy = 0;
421
422 original = MakeUpper(original, collid);
423
424 /* skip these when at start of word */
425 if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
426 current += 1;
427
428 /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
429 if (GetAt(original, 0) == 'X')
430 {
431 MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
432 MetaphAdd(secondary, "S");
433 current += 1;
434 }
435
436 /* main loop */
437 while ((primary->length < 4) || (secondary->length < 4))
438 {
439 if (current >= length)
440 break;
441
442 switch (GetAt(original, current))
443 {
444 case 'A':
445 case 'E':
446 case 'I':
447 case 'O':
448 case 'U':
449 case 'Y':
450 if (current == 0)
451 {
452 /* all init vowels now map to 'A' */
453 MetaphAdd(primary, "A");
454 MetaphAdd(secondary, "A");
455 }
456 current += 1;
457 break;
458
459 case 'B':
460
461 /* "-mb", e.g", "dumb", already skipped over... */
462 MetaphAdd(primary, "P");
463 MetaphAdd(secondary, "P");
464
465 if (GetAt(original, current + 1) == 'B')
466 current += 2;
467 else
468 current += 1;
469 break;
470
471 case '\xc7': /* C with cedilla */
472 MetaphAdd(primary, "S");
473 MetaphAdd(secondary, "S");
474 current += 1;
475 break;
476
477 case 'C':
478 /* various germanic */
479 if ((current > 1)
480 && !IsVowel(original, current - 2)
481 && StringAt(original, (current - 1), 3, "ACH", "")
482 && ((GetAt(original, current + 2) != 'I')
483 && ((GetAt(original, current + 2) != 'E')
484 || StringAt(original, (current - 2), 6, "BACHER",
485 "MACHER", ""))))
486 {
487 MetaphAdd(primary, "K");
488 MetaphAdd(secondary, "K");
489 current += 2;
490 break;
491 }
492
493 /* special case 'caesar' */
494 if ((current == 0)
495 && StringAt(original, current, 6, "CAESAR", ""))
496 {
497 MetaphAdd(primary, "S");
498 MetaphAdd(secondary, "S");
499 current += 2;
500 break;
501 }
502
503 /* italian 'chianti' */
504 if (StringAt(original, current, 4, "CHIA", ""))
505 {
506 MetaphAdd(primary, "K");
507 MetaphAdd(secondary, "K");
508 current += 2;
509 break;
510 }
511
512 if (StringAt(original, current, 2, "CH", ""))
513 {
514 /* find 'michael' */
515 if ((current > 0)
516 && StringAt(original, current, 4, "CHAE", ""))
517 {
518 MetaphAdd(primary, "K");
519 MetaphAdd(secondary, "X");
520 current += 2;
521 break;
522 }
523
524 /* greek roots e.g. 'chemistry', 'chorus' */
525 if ((current == 0)
526 && (StringAt(original, (current + 1), 5,
527 "HARAC", "HARIS", "")
528 || StringAt(original, (current + 1), 3, "HOR",
529 "HYM", "HIA", "HEM", ""))
530 && !StringAt(original, 0, 5, "CHORE", ""))
531 {
532 MetaphAdd(primary, "K");
533 MetaphAdd(secondary, "K");
534 current += 2;
535 break;
536 }
537
538 /* germanic, greek, or otherwise 'ch' for 'kh' sound */
539 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
540 || StringAt(original, 0, 3, "SCH", ""))
541 /* 'architect but not 'arch', 'orchestra', 'orchid' */
542 || StringAt(original, (current - 2), 6, "ORCHES",
543 "ARCHIT", "ORCHID", "")
544 || StringAt(original, (current + 2), 1, "T", "S",
545 "")
546 || ((StringAt(original, (current - 1), 1,
547 "A", "O", "U", "E", "")
548 || (current == 0))
549
550 /*
551 * e.g., 'wachtler', 'wechsler', but not 'tichner'
552 */
553 && StringAt(original, (current + 2), 1, "L", "R",
554 "N", "M", "B", "H", "F", "V", "W",
555 " ", "")))
556 {
557 MetaphAdd(primary, "K");
558 MetaphAdd(secondary, "K");
559 }
560 else
561 {
562 if (current > 0)
563 {
564 if (StringAt(original, 0, 2, "MC", ""))
565 {
566 /* e.g., "McHugh" */
567 MetaphAdd(primary, "K");
568 MetaphAdd(secondary, "K");
569 }
570 else
571 {
572 MetaphAdd(primary, "X");
573 MetaphAdd(secondary, "K");
574 }
575 }
576 else
577 {
578 MetaphAdd(primary, "X");
579 MetaphAdd(secondary, "X");
580 }
581 }
582 current += 2;
583 break;
584 }
585 /* e.g, 'czerny' */
586 if (StringAt(original, current, 2, "CZ", "")
587 && !StringAt(original, (current - 2), 4, "WICZ", ""))
588 {
589 MetaphAdd(primary, "S");
590 MetaphAdd(secondary, "X");
591 current += 2;
592 break;
593 }
594
595 /* e.g., 'focaccia' */
596 if (StringAt(original, (current + 1), 3, "CIA", ""))
597 {
598 MetaphAdd(primary, "X");
599 MetaphAdd(secondary, "X");
600 current += 3;
601 break;
602 }
603
604 /* double 'C', but not if e.g. 'McClellan' */
605 if (StringAt(original, current, 2, "CC", "")
606 && !((current == 1) && (GetAt(original, 0) == 'M')))
607 {
608 /* 'bellocchio' but not 'bacchus' */
609 if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
610 && !StringAt(original, (current + 2), 2, "HU", ""))
611 {
612 /* 'accident', 'accede' 'succeed' */
613 if (((current == 1)
614 && (GetAt(original, current - 1) == 'A'))
615 || StringAt(original, (current - 1), 5, "UCCEE",
616 "UCCES", ""))
617 {
618 MetaphAdd(primary, "KS");
619 MetaphAdd(secondary, "KS");
620 /* 'bacci', 'bertucci', other italian */
621 }
622 else
623 {
624 MetaphAdd(primary, "X");
625 MetaphAdd(secondary, "X");
626 }
627 current += 3;
628 break;
629 }
630 else
631 { /* Pierce's rule */
632 MetaphAdd(primary, "K");
633 MetaphAdd(secondary, "K");
634 current += 2;
635 break;
636 }
637 }
638
639 if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
640 {
641 MetaphAdd(primary, "K");
642 MetaphAdd(secondary, "K");
643 current += 2;
644 break;
645 }
646
647 if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
648 {
649 /* italian vs. english */
650 if (StringAt
651 (original, current, 3, "CIO", "CIE", "CIA", ""))
652 {
653 MetaphAdd(primary, "S");
654 MetaphAdd(secondary, "X");
655 }
656 else
657 {
658 MetaphAdd(primary, "S");
659 MetaphAdd(secondary, "S");
660 }
661 current += 2;
662 break;
663 }
664
665 /* else */
666 MetaphAdd(primary, "K");
667 MetaphAdd(secondary, "K");
668
669 /* name sent in 'mac caffrey', 'mac gregor */
670 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
671 current += 3;
672 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
673 && !StringAt(original, (current + 1), 2,
674 "CE", "CI", ""))
675 current += 2;
676 else
677 current += 1;
678 break;
679
680 case 'D':
681 if (StringAt(original, current, 2, "DG", ""))
682 {
683 if (StringAt(original, (current + 2), 1,
684 "I", "E", "Y", ""))
685 {
686 /* e.g. 'edge' */
687 MetaphAdd(primary, "J");
688 MetaphAdd(secondary, "J");
689 current += 3;
690 break;
691 }
692 else
693 {
694 /* e.g. 'edgar' */
695 MetaphAdd(primary, "TK");
696 MetaphAdd(secondary, "TK");
697 current += 2;
698 break;
699 }
700 }
701
702 if (StringAt(original, current, 2, "DT", "DD", ""))
703 {
704 MetaphAdd(primary, "T");
705 MetaphAdd(secondary, "T");
706 current += 2;
707 break;
708 }
709
710 /* else */
711 MetaphAdd(primary, "T");
712 MetaphAdd(secondary, "T");
713 current += 1;
714 break;
715
716 case 'F':
717 if (GetAt(original, current + 1) == 'F')
718 current += 2;
719 else
720 current += 1;
721 MetaphAdd(primary, "F");
722 MetaphAdd(secondary, "F");
723 break;
724
725 case 'G':
726 if (GetAt(original, current + 1) == 'H')
727 {
728 if ((current > 0) && !IsVowel(original, current - 1))
729 {
730 MetaphAdd(primary, "K");
731 MetaphAdd(secondary, "K");
732 current += 2;
733 break;
734 }
735
736 if (current < 3)
737 {
738 /* 'ghislane', ghiradelli */
739 if (current == 0)
740 {
741 if (GetAt(original, current + 2) == 'I')
742 {
743 MetaphAdd(primary, "J");
744 MetaphAdd(secondary, "J");
745 }
746 else
747 {
748 MetaphAdd(primary, "K");
749 MetaphAdd(secondary, "K");
750 }
751 current += 2;
752 break;
753 }
754 }
755
756 /*
757 * Parker's rule (with some further refinements) - e.g.,
758 * 'hugh'
759 */
760 if (((current > 1)
761 && StringAt(original, (current - 2), 1,
762 "B", "H", "D", ""))
763 /* e.g., 'bough' */
764 || ((current > 2)
765 && StringAt(original, (current - 3), 1,
766 "B", "H", "D", ""))
767 /* e.g., 'broughton' */
768 || ((current > 3)
769 && StringAt(original, (current - 4), 1,
770 "B", "H", "")))
771 {
772 current += 2;
773 break;
774 }
775 else
776 {
777 /*
778 * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
779 * 'rough', 'tough'
780 */
781 if ((current > 2)
782 && (GetAt(original, current - 1) == 'U')
783 && StringAt(original, (current - 3), 1, "C",
784 "G", "L", "R", "T", ""))
785 {
786 MetaphAdd(primary, "F");
787 MetaphAdd(secondary, "F");
788 }
789 else if ((current > 0)
790 && GetAt(original, current - 1) != 'I')
791 {
792
793
794 MetaphAdd(primary, "K");
795 MetaphAdd(secondary, "K");
796 }
797
798 current += 2;
799 break;
800 }
801 }
802
803 if (GetAt(original, current + 1) == 'N')
804 {
805 if ((current == 1) && IsVowel(original, 0)
806 && !SlavoGermanic(original))
807 {
808 MetaphAdd(primary, "KN");
809 MetaphAdd(secondary, "N");
810 }
811 else
812 /* not e.g. 'cagney' */
813 if (!StringAt(original, (current + 2), 2, "EY", "")
814 && (GetAt(original, current + 1) != 'Y')
815 && !SlavoGermanic(original))
816 {
817 MetaphAdd(primary, "N");
818 MetaphAdd(secondary, "KN");
819 }
820 else
821 {
822 MetaphAdd(primary, "KN");
823 MetaphAdd(secondary, "KN");
824 }
825 current += 2;
826 break;
827 }
828
829 /* 'tagliaro' */
830 if (StringAt(original, (current + 1), 2, "LI", "")
831 && !SlavoGermanic(original))
832 {
833 MetaphAdd(primary, "KL");
834 MetaphAdd(secondary, "L");
835 current += 2;
836 break;
837 }
838
839 /* -ges-,-gep-,-gel-, -gie- at beginning */
840 if ((current == 0)
841 && ((GetAt(original, current + 1) == 'Y')
842 || StringAt(original, (current + 1), 2, "ES", "EP",
843 "EB", "EL", "EY", "IB", "IL", "IN", "IE",
844 "EI", "ER", "")))
845 {
846 MetaphAdd(primary, "K");
847 MetaphAdd(secondary, "J");
848 current += 2;
849 break;
850 }
851
852 /* -ger-, -gy- */
853 if ((StringAt(original, (current + 1), 2, "ER", "")
854 || (GetAt(original, current + 1) == 'Y'))
855 && !StringAt(original, 0, 6,
856 "DANGER", "RANGER", "MANGER", "")
857 && !StringAt(original, (current - 1), 1, "E", "I", "")
858 && !StringAt(original, (current - 1), 3, "RGY", "OGY", ""))
859 {
860 MetaphAdd(primary, "K");
861 MetaphAdd(secondary, "J");
862 current += 2;
863 break;
864 }
865
866 /* italian e.g, 'biaggi' */
867 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
868 || StringAt(original, (current - 1), 4,
869 "AGGI", "OGGI", ""))
870 {
871 /* obvious germanic */
872 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
873 || StringAt(original, 0, 3, "SCH", ""))
874 || StringAt(original, (current + 1), 2, "ET", ""))
875 {
876 MetaphAdd(primary, "K");
877 MetaphAdd(secondary, "K");
878 }
879 else
880 {
881 /* always soft if french ending */
882 if (StringAt
883 (original, (current + 1), 4, "IER ", ""))
884 {
885 MetaphAdd(primary, "J");
886 MetaphAdd(secondary, "J");
887 }
888 else
889 {
890 MetaphAdd(primary, "J");
891 MetaphAdd(secondary, "K");
892 }
893 }
894 current += 2;
895 break;
896 }
897
898 if (GetAt(original, current + 1) == 'G')
899 current += 2;
900 else
901 current += 1;
902 MetaphAdd(primary, "K");
903 MetaphAdd(secondary, "K");
904 break;
905
906 case 'H':
907 /* only keep if first & before vowel or btw. 2 vowels */
908 if (((current == 0) || IsVowel(original, current - 1))
909 && IsVowel(original, current + 1))
910 {
911 MetaphAdd(primary, "H");
912 MetaphAdd(secondary, "H");
913 current += 2;
914 }
915 else
916 /* also takes care of 'HH' */
917 current += 1;
918 break;
919
920 case 'J':
921 /* obvious spanish, 'jose', 'san jacinto' */
922 if (StringAt(original, current, 4, "JOSE", "")
923 || StringAt(original, 0, 4, "SAN ", ""))
924 {
925 if (((current == 0)
926 && (GetAt(original, current + 4) == ' '))
927 || StringAt(original, 0, 4, "SAN ", ""))
928 {
929 MetaphAdd(primary, "H");
930 MetaphAdd(secondary, "H");
931 }
932 else
933 {
934 MetaphAdd(primary, "J");
935 MetaphAdd(secondary, "H");
936 }
937 current += 1;
938 break;
939 }
940
941 if ((current == 0)
942 && !StringAt(original, current, 4, "JOSE", ""))
943 {
944 MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
945 MetaphAdd(secondary, "A");
946 }
947 else
948 {
949 /* spanish pron. of e.g. 'bajador' */
950 if (IsVowel(original, current - 1)
951 && !SlavoGermanic(original)
952 && ((GetAt(original, current + 1) == 'A')
953 || (GetAt(original, current + 1) == 'O')))
954 {
955 MetaphAdd(primary, "J");
956 MetaphAdd(secondary, "H");
957 }
958 else
959 {
960 if (current == last)
961 {
962 MetaphAdd(primary, "J");
963 MetaphAdd(secondary, "");
964 }
965 else
966 {
967 if (!StringAt(original, (current + 1), 1, "L", "T",
968 "K", "S", "N", "M", "B", "Z", "")
969 && !StringAt(original, (current - 1), 1,
970 "S", "K", "L", ""))
971 {
972 MetaphAdd(primary, "J");
973 MetaphAdd(secondary, "J");
974 }
975 }
976 }
977 }
978
979 if (GetAt(original, current + 1) == 'J') /* it could happen! */
980 current += 2;
981 else
982 current += 1;
983 break;
984
985 case 'K':
986 if (GetAt(original, current + 1) == 'K')
987 current += 2;
988 else
989 current += 1;
990 MetaphAdd(primary, "K");
991 MetaphAdd(secondary, "K");
992 break;
993
994 case 'L':
995 if (GetAt(original, current + 1) == 'L')
996 {
997 /* spanish e.g. 'cabrillo', 'gallegos' */
998 if (((current == (length - 3))
999 && StringAt(original, (current - 1), 4, "ILLO",
1000 "ILLA", "ALLE", ""))
1001 || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
1002 || StringAt(original, last, 1, "A", "O", ""))
1003 && StringAt(original, (current - 1), 4,
1004 "ALLE", "")))
1005 {
1006 MetaphAdd(primary, "L");
1007 MetaphAdd(secondary, "");
1008 current += 2;
1009 break;
1010 }
1011 current += 2;
1012 }
1013 else
1014 current += 1;
1015 MetaphAdd(primary, "L");
1016 MetaphAdd(secondary, "L");
1017 break;
1018
1019 case 'M':
1020 if ((StringAt(original, (current - 1), 3, "UMB", "")
1021 && (((current + 1) == last)
1022 || StringAt(original, (current + 2), 2, "ER", "")))
1023 /* 'dumb','thumb' */
1024 || (GetAt(original, current + 1) == 'M'))
1025 current += 2;
1026 else
1027 current += 1;
1028 MetaphAdd(primary, "M");
1029 MetaphAdd(secondary, "M");
1030 break;
1031
1032 case 'N':
1033 if (GetAt(original, current + 1) == 'N')
1034 current += 2;
1035 else
1036 current += 1;
1037 MetaphAdd(primary, "N");
1038 MetaphAdd(secondary, "N");
1039 break;
1040
1041 case '\xd1': /* N with tilde */
1042 current += 1;
1043 MetaphAdd(primary, "N");
1044 MetaphAdd(secondary, "N");
1045 break;
1046
1047 case 'P':
1048 if (GetAt(original, current + 1) == 'H')
1049 {
1050 MetaphAdd(primary, "F");
1051 MetaphAdd(secondary, "F");
1052 current += 2;
1053 break;
1054 }
1055
1056 /* also account for "campbell", "raspberry" */
1057 if (StringAt(original, (current + 1), 1, "P", "B", ""))
1058 current += 2;
1059 else
1060 current += 1;
1061 MetaphAdd(primary, "P");
1062 MetaphAdd(secondary, "P");
1063 break;
1064
1065 case 'Q':
1066 if (GetAt(original, current + 1) == 'Q')
1067 current += 2;
1068 else
1069 current += 1;
1070 MetaphAdd(primary, "K");
1071 MetaphAdd(secondary, "K");
1072 break;
1073
1074 case 'R':
1075 /* french e.g. 'rogier', but exclude 'hochmeier' */
1076 if ((current == last)
1077 && !SlavoGermanic(original)
1078 && StringAt(original, (current - 2), 2, "IE", "")
1079 && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1080 {
1081 MetaphAdd(primary, "");
1082 MetaphAdd(secondary, "R");
1083 }
1084 else
1085 {
1086 MetaphAdd(primary, "R");
1087 MetaphAdd(secondary, "R");
1088 }
1089
1090 if (GetAt(original, current + 1) == 'R')
1091 current += 2;
1092 else
1093 current += 1;
1094 break;
1095
1096 case 'S':
1097 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1098 if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1099 {
1100 current += 1;
1101 break;
1102 }
1103
1104 /* special case 'sugar-' */
1105 if ((current == 0)
1106 && StringAt(original, current, 5, "SUGAR", ""))
1107 {
1108 MetaphAdd(primary, "X");
1109 MetaphAdd(secondary, "S");
1110 current += 1;
1111 break;
1112 }
1113
1114 if (StringAt(original, current, 2, "SH", ""))
1115 {
1116 /* germanic */
1117 if (StringAt
1118 (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1119 "HOLZ", ""))
1120 {
1121 MetaphAdd(primary, "S");
1122 MetaphAdd(secondary, "S");
1123 }
1124 else
1125 {
1126 MetaphAdd(primary, "X");
1127 MetaphAdd(secondary, "X");
1128 }
1129 current += 2;
1130 break;
1131 }
1132
1133 /* italian & armenian */
1134 if (StringAt(original, current, 3, "SIO", "SIA", "")
1135 || StringAt(original, current, 4, "SIAN", ""))
1136 {
1137 if (!SlavoGermanic(original))
1138 {
1139 MetaphAdd(primary, "S");
1140 MetaphAdd(secondary, "X");
1141 }
1142 else
1143 {
1144 MetaphAdd(primary, "S");
1145 MetaphAdd(secondary, "S");
1146 }
1147 current += 3;
1148 break;
1149 }
1150
1151 /*
1152 * german & anglicisations, e.g. 'smith' match 'schmidt',
1153 * 'snider' match 'schneider' also, -sz- in slavic language
1154 * although in hungarian it is pronounced 's'
1155 */
1156 if (((current == 0)
1157 && StringAt(original, (current + 1), 1,
1158 "M", "N", "L", "W", ""))
1159 || StringAt(original, (current + 1), 1, "Z", ""))
1160 {
1161 MetaphAdd(primary, "S");
1162 MetaphAdd(secondary, "X");
1163 if (StringAt(original, (current + 1), 1, "Z", ""))
1164 current += 2;
1165 else
1166 current += 1;
1167 break;
1168 }
1169
1170 if (StringAt(original, current, 2, "SC", ""))
1171 {
1172 /* Schlesinger's rule */
1173 if (GetAt(original, current + 2) == 'H')
1174 {
1175 /* dutch origin, e.g. 'school', 'schooner' */
1176 if (StringAt(original, (current + 3), 2,
1177 "OO", "ER", "EN",
1178 "UY", "ED", "EM", ""))
1179 {
1180 /* 'schermerhorn', 'schenker' */
1181 if (StringAt(original, (current + 3), 2,
1182 "ER", "EN", ""))
1183 {
1184 MetaphAdd(primary, "X");
1185 MetaphAdd(secondary, "SK");
1186 }
1187 else
1188 {
1189 MetaphAdd(primary, "SK");
1190 MetaphAdd(secondary, "SK");
1191 }
1192 current += 3;
1193 break;
1194 }
1195 else
1196 {
1197 if ((current == 0) && !IsVowel(original, 3)
1198 && (GetAt(original, 3) != 'W'))
1199 {
1200 MetaphAdd(primary, "X");
1201 MetaphAdd(secondary, "S");
1202 }
1203 else
1204 {
1205 MetaphAdd(primary, "X");
1206 MetaphAdd(secondary, "X");
1207 }
1208 current += 3;
1209 break;
1210 }
1211 }
1212
1213 if (StringAt(original, (current + 2), 1,
1214 "I", "E", "Y", ""))
1215 {
1216 MetaphAdd(primary, "S");
1217 MetaphAdd(secondary, "S");
1218 current += 3;
1219 break;
1220 }
1221 /* else */
1222 MetaphAdd(primary, "SK");
1223 MetaphAdd(secondary, "SK");
1224 current += 3;
1225 break;
1226 }
1227
1228 /* french e.g. 'resnais', 'artois' */
1229 if ((current == last)
1230 && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1231 {
1232 MetaphAdd(primary, "");
1233 MetaphAdd(secondary, "S");
1234 }
1235 else
1236 {
1237 MetaphAdd(primary, "S");
1238 MetaphAdd(secondary, "S");
1239 }
1240
1241 if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1242 current += 2;
1243 else
1244 current += 1;
1245 break;
1246
1247 case 'T':
1248 if (StringAt(original, current, 4, "TION", ""))
1249 {
1250 MetaphAdd(primary, "X");
1251 MetaphAdd(secondary, "X");
1252 current += 3;
1253 break;
1254 }
1255
1256 if (StringAt(original, current, 3, "TIA", "TCH", ""))
1257 {
1258 MetaphAdd(primary, "X");
1259 MetaphAdd(secondary, "X");
1260 current += 3;
1261 break;
1262 }
1263
1264 if (StringAt(original, current, 2, "TH", "")
1265 || StringAt(original, current, 3, "TTH", ""))
1266 {
1267 /* special case 'thomas', 'thames' or germanic */
1268 if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1269 || StringAt(original, 0, 4, "VAN ", "VON ", "")
1270 || StringAt(original, 0, 3, "SCH", ""))
1271 {
1272 MetaphAdd(primary, "T");
1273 MetaphAdd(secondary, "T");
1274 }
1275 else
1276 {
1277 MetaphAdd(primary, "0");
1278 MetaphAdd(secondary, "T");
1279 }
1280 current += 2;
1281 break;
1282 }
1283
1284 if (StringAt(original, (current + 1), 1, "T", "D", ""))
1285 current += 2;
1286 else
1287 current += 1;
1288 MetaphAdd(primary, "T");
1289 MetaphAdd(secondary, "T");
1290 break;
1291
1292 case 'V':
1293 if (GetAt(original, current + 1) == 'V')
1294 current += 2;
1295 else
1296 current += 1;
1297 MetaphAdd(primary, "F");
1298 MetaphAdd(secondary, "F");
1299 break;
1300
1301 case 'W':
1302 /* can also be in middle of word */
1303 if (StringAt(original, current, 2, "WR", ""))
1304 {
1305 MetaphAdd(primary, "R");
1306 MetaphAdd(secondary, "R");
1307 current += 2;
1308 break;
1309 }
1310
1311 if ((current == 0)
1312 && (IsVowel(original, current + 1)
1313 || StringAt(original, current, 2, "WH", "")))
1314 {
1315 /* Wasserman should match Vasserman */
1316 if (IsVowel(original, current + 1))
1317 {
1318 MetaphAdd(primary, "A");
1319 MetaphAdd(secondary, "F");
1320 }
1321 else
1322 {
1323 /* need Uomo to match Womo */
1324 MetaphAdd(primary, "A");
1325 MetaphAdd(secondary, "A");
1326 }
1327 }
1328
1329 /* Arnow should match Arnoff */
1330 if (((current == last) && IsVowel(original, current - 1))
1331 || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1332 "OWSKI", "OWSKY", "")
1333 || StringAt(original, 0, 3, "SCH", ""))
1334 {
1335 MetaphAdd(primary, "");
1336 MetaphAdd(secondary, "F");
1337 current += 1;
1338 break;
1339 }
1340
1341 /* polish e.g. 'filipowicz' */
1342 if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1343 {
1344 MetaphAdd(primary, "TS");
1345 MetaphAdd(secondary, "FX");
1346 current += 4;
1347 break;
1348 }
1349
1350 /* else skip it */
1351 current += 1;
1352 break;
1353
1354 case 'X':
1355 /* french e.g. breaux */
1356 if (!((current == last)
1357 && (StringAt(original, (current - 3), 3,
1358 "IAU", "EAU", "")
1359 || StringAt(original, (current - 2), 2,
1360 "AU", "OU", ""))))
1361 {
1362 MetaphAdd(primary, "KS");
1363 MetaphAdd(secondary, "KS");
1364 }
1365
1366
1367 if (StringAt(original, (current + 1), 1, "C", "X", ""))
1368 current += 2;
1369 else
1370 current += 1;
1371 break;
1372
1373 case 'Z':
1374 /* chinese pinyin e.g. 'zhao' */
1375 if (GetAt(original, current + 1) == 'H')
1376 {
1377 MetaphAdd(primary, "J");
1378 MetaphAdd(secondary, "J");
1379 current += 2;
1380 break;
1381 }
1382 else if (StringAt(original, (current + 1), 2,
1383 "ZO", "ZI", "ZA", "")
1384 || (SlavoGermanic(original)
1385 && ((current > 0)
1386 && GetAt(original, current - 1) != 'T')))
1387 {
1388 MetaphAdd(primary, "S");
1389 MetaphAdd(secondary, "TS");
1390 }
1391 else
1392 {
1393 MetaphAdd(primary, "S");
1394 MetaphAdd(secondary, "S");
1395 }
1396
1397 if (GetAt(original, current + 1) == 'Z')
1398 current += 2;
1399 else
1400 current += 1;
1401 break;
1402
1403 default:
1404 current += 1;
1405 }
1406
1407 /*
1408 * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1409 * secondary->str);
1410 */
1411 }
1412
1413
1414 if (primary->length > 4)
1415 SetAt(primary, 4, '\0');
1416
1417 if (secondary->length > 4)
1418 SetAt(secondary, 4, '\0');
1419
1420 *codes = primary->str;
1421 *++codes = secondary->str;
1422
1423 DestroyMetaString(original);
1424 DestroyMetaString(primary);
1426}
1427
1428#ifdef DMETAPHONE_MAIN
1429
1430/* just for testing - not part of the perl code */
1431
1432main(int argc, char **argv)
1433{
1434 char *codes[2];
1435
1436 if (argc > 1)
1437 {
1439 printf("%s|%s\n", codes[0], codes[1]);
1440 }
1441}
1442
1443#endif
Oid collid
int main(void)
Datum dmetaphone(PG_FUNCTION_ARGS)
Definition dmetaphone.c:132
static char GetAt(metastring *s, int pos)
Definition dmetaphone.c:330
static void SetAt(metastring *s, int pos, char c)
Definition dmetaphone.c:340
static void MetaphAdd(metastring *s, const char *new_str)
Definition dmetaphone.c:383
static int SlavoGermanic(metastring *s)
Definition dmetaphone.c:314
static metastring * MakeUpper(metastring *s, Oid collid)
Definition dmetaphone.c:283
static void IncreaseBuffer(metastring *s, int chars_needed)
Definition dmetaphone.c:274
static int StringAt(metastring *s, int start, int length,...)
Definition dmetaphone.c:353
static void DoubleMetaphone(const char *str, Oid collid, char **codes)
Definition dmetaphone.c:400
static int IsVowel(metastring *s, int pos)
Definition dmetaphone.c:297
#define META_FREE(x)
Definition dmetaphone.c:201
Datum dmetaphone_alt(PG_FUNCTION_ARGS)
Definition dmetaphone.c:161
#define META_REALLOC(v, n, t)
Definition dmetaphone.c:191
static void DestroyMetaString(metastring *s)
Definition dmetaphone.c:261
#define META_MALLOC(v, n, t)
Definition dmetaphone.c:188
static metastring * NewMetaString(const char *init_str)
Definition dmetaphone.c:236
#define PG_GETARG_TEXT_PP(n)
Definition fmgr.h:310
#define PG_ARGISNULL(n)
Definition fmgr.h:209
#define PG_RETURN_NULL()
Definition fmgr.h:346
#define PG_FUNCTION_INFO_V1(funcname)
Definition fmgr.h:417
#define PG_RETURN_TEXT_P(x)
Definition fmgr.h:374
#define PG_GET_COLLATION()
Definition fmgr.h:198
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
char * str_toupper(const char *buff, size_t nbytes, Oid collid)
return str start
const char * str
void * arg
#define printf(...)
Definition port.h:266
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
static void test(void)
char * c
static int fb(int x)
#define assert(x)
Definition regcustom.h:57
char * str
Definition dmetaphone.c:221
int free_string_on_destroy
Definition dmetaphone.c:224
Definition c.h:706
text * cstring_to_text(const char *s)
Definition varlena.c:181
char * text_to_cstring(const text *t)
Definition varlena.c:214