2 * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
4 * contrib/fuzzystrmatch/dmetaphone.c
6 * Double Metaphone computes 2 "sounds like" strings - a primary and an
7 * alternate. In most cases they are the same, but for foreign names
8 * especially they can be a bit different, depending on pronunciation.
10 * Information on using Double Metaphone can be found at
11 * http://www.codeproject.com/string/dmetaphone1.asp
12 * and the original article describing it can be found at
13 * http://drdobbs.com/184401251
15 * For PostgreSQL we provide 2 functions - one for the primary and one for
16 * the alternate. That way the functions are pure text->text mappings that
17 * are useful in functional indexes. These are 'dmetaphone' for the
18 * primary and 'dmetaphone_alt' for the alternate.
20 * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21 * functions looks like this:
23 * CREATE FUNCTION dmetaphone (text) RETURNS text
24 * LANGUAGE C IMMUTABLE STRICT
25 * AS '$libdir/dmetaphone', 'dmetaphone';
27 * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28 * LANGUAGE C IMMUTABLE STRICT
29 * AS '$libdir/dmetaphone', 'dmetaphone_alt';
31 * Note that you have to declare the functions IMMUTABLE if you want to
32 * use them in functional indexes, and you have to declare them as STRICT
33 * as they do not check for NULL input, and will segfault if given NULL input.
34 * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
35 * will never call them with NULL, but instead assume the result is NULL,
36 * which is what we (I) want.
38 * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39 * will detect NULL input and return NULL. Then you don't have to declare them as STRICT.
42 * There is a small inefficiency here - each function call actually computes
43 * both the primary and the alternate and then throws away the one it doesn't
44 * need. That's the way the perl module was written, because perl can handle
45 * a list return more easily than we can in PostgreSQL. The result has been
46 * fast enough for my needs, but it could maybe be optimized a bit to remove that inefficiency.
52 /***************************** COPYRIGHT NOTICES ***********************
54 Most of this code is directly from the Text::DoubleMetaphone perl module
55 version 0.05 available from https://www.cpan.org/.
56 It bears this copyright notice:
59 Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
62 This code is based heavily on the C++ implementation by
63 Lawrence Philips and incorporates several bug fixes courtesy
64 of Kevin Atkinson <kevina@users.sourceforge.net>.
66 This module is free software; you may redistribute it and/or
67 modify it under the same terms as Perl itself.
69 The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70 <andrew@dunslane.net> and is covered by this copyright:
72 Copyright 2003, North Carolina State Highway Patrol.
75 Permission to use, copy, modify, and distribute this software and its
76 documentation for any purpose, without fee, and without a written agreement
77 is hereby granted, provided that the above copyright notice and this
78 paragraph and the following two paragraphs appear in all copies.
80 IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81 PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82 INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83 DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
86 THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89 HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90 HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
93 ***********************************************************************/
96 /* include these first, according to the docs */
97 #ifndef DMETAPHONE_MAIN
101 #include "utils/builtins.h"
103 /* turn off assertions for embedded function */
106 #else /* DMETAPHONE_MAIN */
108 /* we need these if we didn't get them from postgres.h */
114 #endif /* DMETAPHONE_MAIN */
119 /* prototype for the main function we got from the perl module */
120 static void DoubleMetaphone(char *str
, char **codes
);
122 #ifndef DMETAPHONE_MAIN
125 * The PostgreSQL visible dmetaphone function.
128 PG_FUNCTION_INFO_V1(dmetaphone
);
131 dmetaphone(PG_FUNCTION_ARGS
)
138 #ifdef DMETAPHONE_NOSTRICT
142 arg
= PG_GETARG_TEXT_PP(0);
143 aptr
= text_to_cstring(arg
);
145 DoubleMetaphone(aptr
, codes
);
150 PG_RETURN_TEXT_P(cstring_to_text(code
));
154 * The PostgreSQL visible dmetaphone_alt function.
157 PG_FUNCTION_INFO_V1(dmetaphone_alt
);
160 dmetaphone_alt(PG_FUNCTION_ARGS
)
167 #ifdef DMETAPHONE_NOSTRICT
171 arg
= PG_GETARG_TEXT_PP(0);
172 aptr
= text_to_cstring(arg
);
174 DoubleMetaphone(aptr
, codes
);
179 PG_RETURN_TEXT_P(cstring_to_text(code
));
183 /* here is where we start the code imported from the perl module */
185 /* all memory handling is done with these macros */
187 #define META_MALLOC(v,n,t) \
188 (v = (t*)palloc(((n)*sizeof(t))))
190 #define META_REALLOC(v,n,t) \
191 (v = (t*)repalloc((v),((n)*sizeof(t))))
194 * Don't do pfree - it seems to cause a SIGSEGV sometimes - which might have just
195 * been caused by reloading the module in development.
196 * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
197 * in a case like this.
200 #define META_FREE(x) ((void)true) /* pfree((x)) */
201 #else /* not defined DMETAPHONE_MAIN */
203 /* use the standard malloc library when not running in PostgreSQL */
205 #define META_MALLOC(v,n,t) \
206 (v = (t*)malloc(((n)*sizeof(t))))
208 #define META_REALLOC(v,n,t) \
209 (v = (t*)realloc((v),((n)*sizeof(t))))
211 #define META_FREE(x) free((x))
212 #endif /* defined DMETAPHONE_MAIN */
216 /* this typedef was originally in the perl module's .h file */
223 int free_string_on_destroy
;
229 * remaining perl module funcs unchanged except for declaring them static
230 * and reformatting to PostgreSQL indentation and to fit in 80 cols.
235 NewMetaString(const char *init_str
)
238 char empty_string
[] = "";
240 META_MALLOC(s
, 1, metastring
);
243 if (init_str
== NULL
)
244 init_str
= empty_string
;
245 s
->length
= strlen(init_str
);
246 /* preallocate a bit more for potential growth */
247 s
->bufsize
= s
->length
+ 7;
249 META_MALLOC(s
->str
, s
->bufsize
, char);
250 assert(s
->str
!= NULL
);
252 memcpy(s
->str
, init_str
, s
->length
+ 1);
253 s
->free_string_on_destroy
= 1;
260 DestroyMetaString(metastring
*s
)
265 if (s
->free_string_on_destroy
&& (s
->str
!= NULL
))
273 IncreaseBuffer(metastring
*s
, int chars_needed
)
275 META_REALLOC(s
->str
, (s
->bufsize
+ chars_needed
+ 10), char);
276 assert(s
->str
!= NULL
);
277 s
->bufsize
= s
->bufsize
+ chars_needed
+ 10;
282 MakeUpper(metastring
*s
)
286 for (i
= s
->str
; *i
; i
++)
287 *i
= toupper((unsigned char) *i
);
292 IsVowel(metastring
*s
, int pos
)
296 if ((pos
< 0) || (pos
>= s
->length
))
300 if ((c
== 'A') || (c
== 'E') || (c
== 'I') || (c
== 'O') ||
301 (c
== 'U') || (c
== 'Y'))
309 SlavoGermanic(metastring
*s
)
311 if ((char *) strstr(s
->str
, "W"))
313 else if ((char *) strstr(s
->str
, "K"))
315 else if ((char *) strstr(s
->str
, "CZ"))
317 else if ((char *) strstr(s
->str
, "WITZ"))
325 GetAt(metastring
*s
, int pos
)
327 if ((pos
< 0) || (pos
>= s
->length
))
330 return ((char) *(s
->str
+ pos
));
335 SetAt(metastring
*s
, int pos
, char c
)
337 if ((pos
< 0) || (pos
>= s
->length
))
345 Caveats: the START value is 0 based
348 StringAt(metastring
*s
, int start
, int length
,...)
354 if ((start
< 0) || (start
>= s
->length
))
357 pos
= (s
->str
+ start
);
358 va_start(ap
, length
);
362 test
= va_arg(ap
, char *);
363 if (*test
&& (strncmp(pos
, test
, length
) == 0))
369 while (strcmp(test
, "") != 0);
378 MetaphAdd(metastring
*s
, const char *new_str
)
385 add_length
= strlen(new_str
);
386 if ((s
->length
+ add_length
) > (s
->bufsize
- 1))
387 IncreaseBuffer(s
, add_length
);
389 strcat(s
->str
, new_str
);
390 s
->length
+= add_length
;
395 DoubleMetaphone(char *str
, char **codes
)
398 metastring
*original
;
400 metastring
*secondary
;
405 /* we need the real length and last prior to padding */
406 length
= strlen(str
);
408 original
= NewMetaString(str
);
409 /* Pad original so we can index beyond end */
410 MetaphAdd(original
, " ");
412 primary
= NewMetaString("");
413 secondary
= NewMetaString("");
414 primary
->free_string_on_destroy
= 0;
415 secondary
->free_string_on_destroy
= 0;
419 /* skip these when at start of word */
420 if (StringAt(original
, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
423 /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424 if (GetAt(original
, 0) == 'X')
426 MetaphAdd(primary
, "S"); /* 'Z' maps to 'S' */
427 MetaphAdd(secondary
, "S");
432 while ((primary
->length
< 4) || (secondary
->length
< 4))
434 if (current
>= length
)
437 switch (GetAt(original
, current
))
447 /* all init vowels now map to 'A' */
448 MetaphAdd(primary
, "A");
449 MetaphAdd(secondary
, "A");
456 /* "-mb", e.g", "dumb", already skipped over... */
457 MetaphAdd(primary
, "P");
458 MetaphAdd(secondary
, "P");
460 if (GetAt(original
, current
+ 1) == 'B')
466 case '\xc7': /* C with cedilla */
467 MetaphAdd(primary
, "S");
468 MetaphAdd(secondary
, "S");
473 /* various germanic */
475 && !IsVowel(original
, current
- 2)
476 && StringAt(original
, (current
- 1), 3, "ACH", "")
477 && ((GetAt(original
, current
+ 2) != 'I')
478 && ((GetAt(original
, current
+ 2) != 'E')
479 || StringAt(original
, (current
- 2), 6, "BACHER",
482 MetaphAdd(primary
, "K");
483 MetaphAdd(secondary
, "K");
488 /* special case 'caesar' */
490 && StringAt(original
, current
, 6, "CAESAR", ""))
492 MetaphAdd(primary
, "S");
493 MetaphAdd(secondary
, "S");
498 /* italian 'chianti' */
499 if (StringAt(original
, current
, 4, "CHIA", ""))
501 MetaphAdd(primary
, "K");
502 MetaphAdd(secondary
, "K");
507 if (StringAt(original
, current
, 2, "CH", ""))
511 && StringAt(original
, current
, 4, "CHAE", ""))
513 MetaphAdd(primary
, "K");
514 MetaphAdd(secondary
, "X");
519 /* greek roots e.g. 'chemistry', 'chorus' */
521 && (StringAt(original
, (current
+ 1), 5,
522 "HARAC", "HARIS", "")
523 || StringAt(original
, (current
+ 1), 3, "HOR",
524 "HYM", "HIA", "HEM", ""))
525 && !StringAt(original
, 0, 5, "CHORE", ""))
527 MetaphAdd(primary
, "K");
528 MetaphAdd(secondary
, "K");
533 /* germanic, greek, or otherwise 'ch' for 'kh' sound */
534 if ((StringAt(original
, 0, 4, "VAN ", "VON ", "")
535 || StringAt(original
, 0, 3, "SCH", ""))
536 /* 'architect but not 'arch', 'orchestra', 'orchid' */
537 || StringAt(original
, (current
- 2), 6, "ORCHES",
538 "ARCHIT", "ORCHID", "")
539 || StringAt(original
, (current
+ 2), 1, "T", "S",
541 || ((StringAt(original
, (current
- 1), 1,
542 "A", "O", "U", "E", "")
546 * e.g., 'wachtler', 'wechsler', but not 'tichner'
548 && StringAt(original
, (current
+ 2), 1, "L", "R",
549 "N", "M", "B", "H", "F", "V", "W",
552 MetaphAdd(primary
, "K");
553 MetaphAdd(secondary
, "K");
559 if (StringAt(original
, 0, 2, "MC", ""))
562 MetaphAdd(primary
, "K");
563 MetaphAdd(secondary
, "K");
567 MetaphAdd(primary
, "X");
568 MetaphAdd(secondary
, "K");
573 MetaphAdd(primary
, "X");
574 MetaphAdd(secondary
, "X");
581 if (StringAt(original
, current
, 2, "CZ", "")
582 && !StringAt(original
, (current
- 2), 4, "WICZ", ""))
584 MetaphAdd(primary
, "S");
585 MetaphAdd(secondary
, "X");
590 /* e.g., 'focaccia' */
591 if (StringAt(original
, (current
+ 1), 3, "CIA", ""))
593 MetaphAdd(primary
, "X");
594 MetaphAdd(secondary
, "X");
599 /* double 'C', but not if e.g. 'McClellan' */
600 if (StringAt(original
, current
, 2, "CC", "")
601 && !((current
== 1) && (GetAt(original
, 0) == 'M')))
603 /* 'bellocchio' but not 'bacchus' */
604 if (StringAt(original
, (current
+ 2), 1, "I", "E", "H", "")
605 && !StringAt(original
, (current
+ 2), 2, "HU", ""))
607 /* 'accident', 'accede' 'succeed' */
609 && (GetAt(original
, current
- 1) == 'A'))
610 || StringAt(original
, (current
- 1), 5, "UCCEE",
613 MetaphAdd(primary
, "KS");
614 MetaphAdd(secondary
, "KS");
615 /* 'bacci', 'bertucci', other italian */
619 MetaphAdd(primary
, "X");
620 MetaphAdd(secondary
, "X");
626 { /* Pierce's rule */
627 MetaphAdd(primary
, "K");
628 MetaphAdd(secondary
, "K");
634 if (StringAt(original
, current
, 2, "CK", "CG", "CQ", ""))
636 MetaphAdd(primary
, "K");
637 MetaphAdd(secondary
, "K");
642 if (StringAt(original
, current
, 2, "CI", "CE", "CY", ""))
644 /* italian vs. english */
646 (original
, current
, 3, "CIO", "CIE", "CIA", ""))
648 MetaphAdd(primary
, "S");
649 MetaphAdd(secondary
, "X");
653 MetaphAdd(primary
, "S");
654 MetaphAdd(secondary
, "S");
661 MetaphAdd(primary
, "K");
662 MetaphAdd(secondary
, "K");
664 /* name sent in 'mac caffrey', 'mac gregor */
665 if (StringAt(original
, (current
+ 1), 2, " C", " Q", " G", ""))
667 else if (StringAt(original
, (current
+ 1), 1, "C", "K", "Q", "")
668 && !StringAt(original
, (current
+ 1), 2,
676 if (StringAt(original
, current
, 2, "DG", ""))
678 if (StringAt(original
, (current
+ 2), 1,
682 MetaphAdd(primary
, "J");
683 MetaphAdd(secondary
, "J");
690 MetaphAdd(primary
, "TK");
691 MetaphAdd(secondary
, "TK");
697 if (StringAt(original
, current
, 2, "DT", "DD", ""))
699 MetaphAdd(primary
, "T");
700 MetaphAdd(secondary
, "T");
706 MetaphAdd(primary
, "T");
707 MetaphAdd(secondary
, "T");
712 if (GetAt(original
, current
+ 1) == 'F')
716 MetaphAdd(primary
, "F");
717 MetaphAdd(secondary
, "F");
721 if (GetAt(original
, current
+ 1) == 'H')
723 if ((current
> 0) && !IsVowel(original
, current
- 1))
725 MetaphAdd(primary
, "K");
726 MetaphAdd(secondary
, "K");
733 /* 'ghislane', ghiradelli */
736 if (GetAt(original
, current
+ 2) == 'I')
738 MetaphAdd(primary
, "J");
739 MetaphAdd(secondary
, "J");
743 MetaphAdd(primary
, "K");
744 MetaphAdd(secondary
, "K");
752 * Parker's rule (with some further refinements) - e.g.,
756 && StringAt(original
, (current
- 2), 1,
760 && StringAt(original
, (current
- 3), 1,
762 /* e.g., 'broughton' */
764 && StringAt(original
, (current
- 4), 1,
773 * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
777 && (GetAt(original
, current
- 1) == 'U')
778 && StringAt(original
, (current
- 3), 1, "C",
779 "G", "L", "R", "T", ""))
781 MetaphAdd(primary
, "F");
782 MetaphAdd(secondary
, "F");
784 else if ((current
> 0)
785 && GetAt(original
, current
- 1) != 'I')
789 MetaphAdd(primary
, "K");
790 MetaphAdd(secondary
, "K");
798 if (GetAt(original
, current
+ 1) == 'N')
800 if ((current
== 1) && IsVowel(original
, 0)
801 && !SlavoGermanic(original
))
803 MetaphAdd(primary
, "KN");
804 MetaphAdd(secondary
, "N");
807 /* not e.g. 'cagney' */
808 if (!StringAt(original
, (current
+ 2), 2, "EY", "")
809 && (GetAt(original
, current
+ 1) != 'Y')
810 && !SlavoGermanic(original
))
812 MetaphAdd(primary
, "N");
813 MetaphAdd(secondary
, "KN");
817 MetaphAdd(primary
, "KN");
818 MetaphAdd(secondary
, "KN");
825 if (StringAt(original
, (current
+ 1), 2, "LI", "")
826 && !SlavoGermanic(original
))
828 MetaphAdd(primary
, "KL");
829 MetaphAdd(secondary
, "L");
834 /* -ges-,-gep-,-gel-, -gie- at beginning */
836 && ((GetAt(original
, current
+ 1) == 'Y')
837 || StringAt(original
, (current
+ 1), 2, "ES", "EP",
838 "EB", "EL", "EY", "IB", "IL", "IN", "IE",
841 MetaphAdd(primary
, "K");
842 MetaphAdd(secondary
, "J");
848 if ((StringAt(original
, (current
+ 1), 2, "ER", "")
849 || (GetAt(original
, current
+ 1) == 'Y'))
850 && !StringAt(original
, 0, 6,
851 "DANGER", "RANGER", "MANGER", "")
852 && !StringAt(original
, (current
- 1), 1, "E", "I", "")
853 && !StringAt(original
, (current
- 1), 3, "RGY", "OGY", ""))
855 MetaphAdd(primary
, "K");
856 MetaphAdd(secondary
, "J");
861 /* italian e.g, 'biaggi' */
862 if (StringAt(original
, (current
+ 1), 1, "E", "I", "Y", "")
863 || StringAt(original
, (current
- 1), 4,
866 /* obvious germanic */
867 if ((StringAt(original
, 0, 4, "VAN ", "VON ", "")
868 || StringAt(original
, 0, 3, "SCH", ""))
869 || StringAt(original
, (current
+ 1), 2, "ET", ""))
871 MetaphAdd(primary
, "K");
872 MetaphAdd(secondary
, "K");
876 /* always soft if french ending */
878 (original
, (current
+ 1), 4, "IER ", ""))
880 MetaphAdd(primary
, "J");
881 MetaphAdd(secondary
, "J");
885 MetaphAdd(primary
, "J");
886 MetaphAdd(secondary
, "K");
893 if (GetAt(original
, current
+ 1) == 'G')
897 MetaphAdd(primary
, "K");
898 MetaphAdd(secondary
, "K");
902 /* only keep if first & before vowel or btw. 2 vowels */
903 if (((current
== 0) || IsVowel(original
, current
- 1))
904 && IsVowel(original
, current
+ 1))
906 MetaphAdd(primary
, "H");
907 MetaphAdd(secondary
, "H");
911 /* also takes care of 'HH' */
916 /* obvious spanish, 'jose', 'san jacinto' */
917 if (StringAt(original
, current
, 4, "JOSE", "")
918 || StringAt(original
, 0, 4, "SAN ", ""))
921 && (GetAt(original
, current
+ 4) == ' '))
922 || StringAt(original
, 0, 4, "SAN ", ""))
924 MetaphAdd(primary
, "H");
925 MetaphAdd(secondary
, "H");
929 MetaphAdd(primary
, "J");
930 MetaphAdd(secondary
, "H");
937 && !StringAt(original
, current
, 4, "JOSE", ""))
939 MetaphAdd(primary
, "J"); /* Yankelovich/Jankelowicz */
940 MetaphAdd(secondary
, "A");
944 /* spanish pron. of e.g. 'bajador' */
945 if (IsVowel(original
, current
- 1)
946 && !SlavoGermanic(original
)
947 && ((GetAt(original
, current
+ 1) == 'A')
948 || (GetAt(original
, current
+ 1) == 'O')))
950 MetaphAdd(primary
, "J");
951 MetaphAdd(secondary
, "H");
957 MetaphAdd(primary
, "J");
958 MetaphAdd(secondary
, "");
962 if (!StringAt(original
, (current
+ 1), 1, "L", "T",
963 "K", "S", "N", "M", "B", "Z", "")
964 && !StringAt(original
, (current
- 1), 1,
967 MetaphAdd(primary
, "J");
968 MetaphAdd(secondary
, "J");
974 if (GetAt(original
, current
+ 1) == 'J') /* it could happen! */
981 if (GetAt(original
, current
+ 1) == 'K')
985 MetaphAdd(primary
, "K");
986 MetaphAdd(secondary
, "K");
990 if (GetAt(original
, current
+ 1) == 'L')
992 /* spanish e.g. 'cabrillo', 'gallegos' */
993 if (((current
== (length
- 3))
994 && StringAt(original
, (current
- 1), 4, "ILLO",
996 || ((StringAt(original
, (last
- 1), 2, "AS", "OS", "")
997 || StringAt(original
, last
, 1, "A", "O", ""))
998 && StringAt(original
, (current
- 1), 4,
1001 MetaphAdd(primary
, "L");
1002 MetaphAdd(secondary
, "");
1010 MetaphAdd(primary
, "L");
1011 MetaphAdd(secondary
, "L");
1015 if ((StringAt(original
, (current
- 1), 3, "UMB", "")
1016 && (((current
+ 1) == last
)
1017 || StringAt(original
, (current
+ 2), 2, "ER", "")))
1018 /* 'dumb','thumb' */
1019 || (GetAt(original
, current
+ 1) == 'M'))
1023 MetaphAdd(primary
, "M");
1024 MetaphAdd(secondary
, "M");
1028 if (GetAt(original
, current
+ 1) == 'N')
1032 MetaphAdd(primary
, "N");
1033 MetaphAdd(secondary
, "N");
1036 case '\xd1': /* N with tilde */
1038 MetaphAdd(primary
, "N");
1039 MetaphAdd(secondary
, "N");
1043 if (GetAt(original
, current
+ 1) == 'H')
1045 MetaphAdd(primary
, "F");
1046 MetaphAdd(secondary
, "F");
1051 /* also account for "campbell", "raspberry" */
1052 if (StringAt(original
, (current
+ 1), 1, "P", "B", ""))
1056 MetaphAdd(primary
, "P");
1057 MetaphAdd(secondary
, "P");
1061 if (GetAt(original
, current
+ 1) == 'Q')
1065 MetaphAdd(primary
, "K");
1066 MetaphAdd(secondary
, "K");
1070 /* french e.g. 'rogier', but exclude 'hochmeier' */
1071 if ((current
== last
)
1072 && !SlavoGermanic(original
)
1073 && StringAt(original
, (current
- 2), 2, "IE", "")
1074 && !StringAt(original
, (current
- 4), 2, "ME", "MA", ""))
1076 MetaphAdd(primary
, "");
1077 MetaphAdd(secondary
, "R");
1081 MetaphAdd(primary
, "R");
1082 MetaphAdd(secondary
, "R");
1085 if (GetAt(original
, current
+ 1) == 'R')
1092 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1093 if (StringAt(original
, (current
- 1), 3, "ISL", "YSL", ""))
1099 /* special case 'sugar-' */
1101 && StringAt(original
, current
, 5, "SUGAR", ""))
1103 MetaphAdd(primary
, "X");
1104 MetaphAdd(secondary
, "S");
1109 if (StringAt(original
, current
, 2, "SH", ""))
1113 (original
, (current
+ 1), 4, "HEIM", "HOEK", "HOLM",
1116 MetaphAdd(primary
, "S");
1117 MetaphAdd(secondary
, "S");
1121 MetaphAdd(primary
, "X");
1122 MetaphAdd(secondary
, "X");
1128 /* italian & armenian */
1129 if (StringAt(original
, current
, 3, "SIO", "SIA", "")
1130 || StringAt(original
, current
, 4, "SIAN", ""))
1132 if (!SlavoGermanic(original
))
1134 MetaphAdd(primary
, "S");
1135 MetaphAdd(secondary
, "X");
1139 MetaphAdd(primary
, "S");
1140 MetaphAdd(secondary
, "S");
1147 * german & anglicisations, e.g. 'smith' match 'schmidt',
1148 * 'snider' match 'schneider' also, -sz- in slavic language
1149 * although in hungarian it is pronounced 's'
1152 && StringAt(original
, (current
+ 1), 1,
1153 "M", "N", "L", "W", ""))
1154 || StringAt(original
, (current
+ 1), 1, "Z", ""))
1156 MetaphAdd(primary
, "S");
1157 MetaphAdd(secondary
, "X");
1158 if (StringAt(original
, (current
+ 1), 1, "Z", ""))
1165 if (StringAt(original
, current
, 2, "SC", ""))
1167 /* Schlesinger's rule */
1168 if (GetAt(original
, current
+ 2) == 'H')
1170 /* dutch origin, e.g. 'school', 'schooner' */
1171 if (StringAt(original
, (current
+ 3), 2,
1173 "UY", "ED", "EM", ""))
1175 /* 'schermerhorn', 'schenker' */
1176 if (StringAt(original
, (current
+ 3), 2,
1179 MetaphAdd(primary
, "X");
1180 MetaphAdd(secondary
, "SK");
1184 MetaphAdd(primary
, "SK");
1185 MetaphAdd(secondary
, "SK");
1192 if ((current
== 0) && !IsVowel(original
, 3)
1193 && (GetAt(original
, 3) != 'W'))
1195 MetaphAdd(primary
, "X");
1196 MetaphAdd(secondary
, "S");
1200 MetaphAdd(primary
, "X");
1201 MetaphAdd(secondary
, "X");
1208 if (StringAt(original
, (current
+ 2), 1,
1211 MetaphAdd(primary
, "S");
1212 MetaphAdd(secondary
, "S");
1217 MetaphAdd(primary
, "SK");
1218 MetaphAdd(secondary
, "SK");
1223 /* french e.g. 'resnais', 'artois' */
1224 if ((current
== last
)
1225 && StringAt(original
, (current
- 2), 2, "AI", "OI", ""))
1227 MetaphAdd(primary
, "");
1228 MetaphAdd(secondary
, "S");
1232 MetaphAdd(primary
, "S");
1233 MetaphAdd(secondary
, "S");
1236 if (StringAt(original
, (current
+ 1), 1, "S", "Z", ""))
1243 if (StringAt(original
, current
, 4, "TION", ""))
1245 MetaphAdd(primary
, "X");
1246 MetaphAdd(secondary
, "X");
1251 if (StringAt(original
, current
, 3, "TIA", "TCH", ""))
1253 MetaphAdd(primary
, "X");
1254 MetaphAdd(secondary
, "X");
1259 if (StringAt(original
, current
, 2, "TH", "")
1260 || StringAt(original
, current
, 3, "TTH", ""))
1262 /* special case 'thomas', 'thames' or germanic */
1263 if (StringAt(original
, (current
+ 2), 2, "OM", "AM", "")
1264 || StringAt(original
, 0, 4, "VAN ", "VON ", "")
1265 || StringAt(original
, 0, 3, "SCH", ""))
1267 MetaphAdd(primary
, "T");
1268 MetaphAdd(secondary
, "T");
1272 MetaphAdd(primary
, "0");
1273 MetaphAdd(secondary
, "T");
1279 if (StringAt(original
, (current
+ 1), 1, "T", "D", ""))
1283 MetaphAdd(primary
, "T");
1284 MetaphAdd(secondary
, "T");
1288 if (GetAt(original
, current
+ 1) == 'V')
1292 MetaphAdd(primary
, "F");
1293 MetaphAdd(secondary
, "F");
1297 /* can also be in middle of word */
1298 if (StringAt(original
, current
, 2, "WR", ""))
1300 MetaphAdd(primary
, "R");
1301 MetaphAdd(secondary
, "R");
1307 && (IsVowel(original
, current
+ 1)
1308 || StringAt(original
, current
, 2, "WH", "")))
1310 /* Wasserman should match Vasserman */
1311 if (IsVowel(original
, current
+ 1))
1313 MetaphAdd(primary
, "A");
1314 MetaphAdd(secondary
, "F");
1318 /* need Uomo to match Womo */
1319 MetaphAdd(primary
, "A");
1320 MetaphAdd(secondary
, "A");
1324 /* Arnow should match Arnoff */
1325 if (((current
== last
) && IsVowel(original
, current
- 1))
1326 || StringAt(original
, (current
- 1), 5, "EWSKI", "EWSKY",
1327 "OWSKI", "OWSKY", "")
1328 || StringAt(original
, 0, 3, "SCH", ""))
1330 MetaphAdd(primary
, "");
1331 MetaphAdd(secondary
, "F");
1336 /* polish e.g. 'filipowicz' */
1337 if (StringAt(original
, current
, 4, "WICZ", "WITZ", ""))
1339 MetaphAdd(primary
, "TS");
1340 MetaphAdd(secondary
, "FX");
1350 /* french e.g. breaux */
1351 if (!((current
== last
)
1352 && (StringAt(original
, (current
- 3), 3,
1354 || StringAt(original
, (current
- 2), 2,
1357 MetaphAdd(primary
, "KS");
1358 MetaphAdd(secondary
, "KS");
1362 if (StringAt(original
, (current
+ 1), 1, "C", "X", ""))
1369 /* chinese pinyin e.g. 'zhao' */
1370 if (GetAt(original
, current
+ 1) == 'H')
1372 MetaphAdd(primary
, "J");
1373 MetaphAdd(secondary
, "J");
1377 else if (StringAt(original
, (current
+ 1), 2,
1378 "ZO", "ZI", "ZA", "")
1379 || (SlavoGermanic(original
)
1381 && GetAt(original
, current
- 1) != 'T')))
1383 MetaphAdd(primary
, "S");
1384 MetaphAdd(secondary
, "TS");
1388 MetaphAdd(primary
, "S");
1389 MetaphAdd(secondary
, "S");
1392 if (GetAt(original
, current
+ 1) == 'Z')
1403 * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1409 if (primary
->length
> 4)
1410 SetAt(primary
, 4, '\0');
1412 if (secondary
->length
> 4)
1413 SetAt(secondary
, 4, '\0');
1415 *codes
= primary
->str
;
1416 *++codes
= secondary
->str
;
1418 DestroyMetaString(original
);
1419 DestroyMetaString(primary
);
1420 DestroyMetaString(secondary
);
1423 #ifdef DMETAPHONE_MAIN
1425 /* just for testing - not part of the perl code */
1427 main(int argc
, char **argv
)
1433 DoubleMetaphone(argv
[1], codes
);
1434 printf("%s|%s\n", codes
[0], codes
[1]);