Restore initdb's old behavior of always setting the lc_xxx GUCs.
[pgsql.git] / contrib / fuzzystrmatch / dmetaphone.c
blobf8f2c2b447d2b1ab1e2c0a32083e96166234e7c3
1 /*
2 * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
4 * contrib/fuzzystrmatch/dmetaphone.c
6 * Double Metaphone computes 2 "sounds like" strings - a primary and an
7 * alternate. In most cases they are the same, but for foreign names
8 * especially they can be a bit different, depending on pronunciation.
10 * Information on using Double Metaphone can be found at
11 * http://www.codeproject.com/string/dmetaphone1.asp
12 * and the original article describing it can be found at
13 * http://drdobbs.com/184401251
15 * For PostgreSQL we provide 2 functions - one for the primary and one for
16 * the alternate. That way the functions are pure text->text mappings that
17 * are useful in functional indexes. These are 'dmetaphone' for the
18 * primary and 'dmetaphone_alt' for the alternate.
20 * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
21 * functions looks like this:
23 * CREATE FUNCTION dmetaphone (text) RETURNS text
24 * LANGUAGE C IMMUTABLE STRICT
25 * AS '$libdir/dmetaphone', 'dmetaphone';
27 * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
28 * LANGUAGE C IMMUTABLE STRICT
29 * AS '$libdir/dmetaphone', 'dmetaphone_alt';
31 * Note that you have to declare the functions IMMUTABLE if you want to
32 * use them in functional indexes, and you have to declare them as STRICT
33 * as they do not check for NULL input, and will segfault if given NULL input.
34 * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
35 * will never call them with NULL, but instead assume the result is NULL,
36 * which is what we (I) want.
38 * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
39 * will detect NULL input and return NULL. Then you don't have to declare them
40 * as STRICT.
42 * There is a small inefficiency here - each function call actually computes
43 * both the primary and the alternate and then throws away the one it doesn't
44 * need. That's the way the perl module was written, because perl can handle
45 * a list return more easily than we can in PostgreSQL. The result has been
46 * fast enough for my needs, but it could maybe be optimized a bit to remove
47 * that behaviour.
52 /***************************** COPYRIGHT NOTICES ***********************
54 Most of this code is directly from the Text::DoubleMetaphone perl module
55 version 0.05 available from https://www.cpan.org/.
56 It bears this copyright notice:
59 Copyright 2000, Maurice Aubrey <maurice@hevanet.com>.
60 All rights reserved.
62 This code is based heavily on the C++ implementation by
63 Lawrence Philips and incorporates several bug fixes courtesy
64 of Kevin Atkinson <kevina@users.sourceforge.net>.
66 This module is free software; you may redistribute it and/or
67 modify it under the same terms as Perl itself.
69 The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
70 <andrew@dunslane.net> and is covered by this copyright:
72 Copyright 2003, North Carolina State Highway Patrol.
73 All rights reserved.
75 Permission to use, copy, modify, and distribute this software and its
76 documentation for any purpose, without fee, and without a written agreement
77 is hereby granted, provided that the above copyright notice and this
78 paragraph and the following two paragraphs appear in all copies.
80 IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
81 PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
82 INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
83 DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
84 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
86 THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
87 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
88 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED
89 HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
90 HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
91 MODIFICATIONS.
93 ***********************************************************************/
96 /* include these first, according to the docs */
97 #ifndef DMETAPHONE_MAIN
99 #include "postgres.h"
101 #include "utils/builtins.h"
103 /* turn off assertions for embedded function */
104 #define NDEBUG
106 #else /* DMETAPHONE_MAIN */
108 /* we need these if we didn't get them from postgres.h */
109 #include <stdio.h>
110 #include <stdlib.h>
111 #include <string.h>
112 #include <stdarg.h>
114 #endif /* DMETAPHONE_MAIN */
116 #include <assert.h>
117 #include <ctype.h>
119 /* prototype for the main function we got from the perl module */
120 static void DoubleMetaphone(char *str, char **codes);
122 #ifndef DMETAPHONE_MAIN
125 * The PostgreSQL visible dmetaphone function.
128 PG_FUNCTION_INFO_V1(dmetaphone);
130 Datum
131 dmetaphone(PG_FUNCTION_ARGS)
133 text *arg;
134 char *aptr,
135 *codes[2],
136 *code;
138 #ifdef DMETAPHONE_NOSTRICT
139 if (PG_ARGISNULL(0))
140 PG_RETURN_NULL();
141 #endif
142 arg = PG_GETARG_TEXT_PP(0);
143 aptr = text_to_cstring(arg);
145 DoubleMetaphone(aptr, codes);
146 code = codes[0];
147 if (!code)
148 code = "";
150 PG_RETURN_TEXT_P(cstring_to_text(code));
154 * The PostgreSQL visible dmetaphone_alt function.
157 PG_FUNCTION_INFO_V1(dmetaphone_alt);
159 Datum
160 dmetaphone_alt(PG_FUNCTION_ARGS)
162 text *arg;
163 char *aptr,
164 *codes[2],
165 *code;
167 #ifdef DMETAPHONE_NOSTRICT
168 if (PG_ARGISNULL(0))
169 PG_RETURN_NULL();
170 #endif
171 arg = PG_GETARG_TEXT_PP(0);
172 aptr = text_to_cstring(arg);
174 DoubleMetaphone(aptr, codes);
175 code = codes[1];
176 if (!code)
177 code = "";
179 PG_RETURN_TEXT_P(cstring_to_text(code));
/* here is where we start the code imported from the perl module */

/*
 * All memory handling is done with these macros.
 *
 * META_MALLOC(v,n,t)  - point v at freshly allocated room for n objects of
 *						 type t
 * META_REALLOC(v,n,t) - resize v to hold n objects of type t
 * META_FREE(x)		   - release x (a no-op in this build, see below)
 *
 * In this branch (DMETAPHONE_MAIN not defined) they map onto
 * palloc/repalloc.
 */
#define META_MALLOC(v,n,t) \
	((v) = (t *) palloc((n) * sizeof(t)))

#define META_REALLOC(v,n,t) \
	((v) = (t *) repalloc((v), (n) * sizeof(t)))

/*
 * Don't do pfree - it seems to cause a SIGSEGV sometimes - which might have
 * just been caused by reloading the module in development.
 * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
 * in a case like this.
 */
#define META_FREE(x) ((void)true) /* pfree((x)) */
201 #else /* not defined DMETAPHONE_MAIN */
/*
 * Use the standard malloc library when not running in PostgreSQL.
 *
 * Same contract as the palloc-based variants: the result is assigned to v.
 * Callers check the assigned pointer themselves (with assert()).
 */
#define META_MALLOC(v,n,t) \
	((v) = (t *) malloc((n) * sizeof(t)))

#define META_REALLOC(v,n,t) \
	((v) = (t *) realloc((v), (n) * sizeof(t)))

#define META_FREE(x) free((x))
212 #endif /* defined DMETAPHONE_MAIN */
/*
 * metastring - a growable character buffer used for the working copy of
 * the input and for the two result codes.
 *
 * (this typedef was originally in the perl module's .h file)
 */
typedef struct
{
	char	   *str;			/* NUL-terminated contents */
	int			length;			/* characters in str, not counting the NUL */
	int			bufsize;		/* bytes allocated for str */
	int			free_string_on_destroy; /* nonzero: DestroyMetaString also
										 * releases str */
}
metastring;
229 * remaining perl module funcs unchanged except for declaring them static
230 * and reformatting to PostgreSQL indentation and to fit in 80 cols.
234 static metastring *
235 NewMetaString(const char *init_str)
237 metastring *s;
238 char empty_string[] = "";
240 META_MALLOC(s, 1, metastring);
241 assert(s != NULL);
243 if (init_str == NULL)
244 init_str = empty_string;
245 s->length = strlen(init_str);
246 /* preallocate a bit more for potential growth */
247 s->bufsize = s->length + 7;
249 META_MALLOC(s->str, s->bufsize, char);
250 assert(s->str != NULL);
252 memcpy(s->str, init_str, s->length + 1);
253 s->free_string_on_destroy = 1;
255 return s;
259 static void
260 DestroyMetaString(metastring *s)
262 if (s == NULL)
263 return;
265 if (s->free_string_on_destroy && (s->str != NULL))
266 META_FREE(s->str);
268 META_FREE(s);
272 static void
273 IncreaseBuffer(metastring *s, int chars_needed)
275 META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
276 assert(s->str != NULL);
277 s->bufsize = s->bufsize + chars_needed + 10;
281 static void
282 MakeUpper(metastring *s)
284 char *i;
286 for (i = s->str; *i; i++)
287 *i = toupper((unsigned char) *i);
291 static int
292 IsVowel(metastring *s, int pos)
294 char c;
296 if ((pos < 0) || (pos >= s->length))
297 return 0;
299 c = *(s->str + pos);
300 if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
301 (c == 'U') || (c == 'Y'))
302 return 1;
304 return 0;
308 static int
309 SlavoGermanic(metastring *s)
311 if ((char *) strstr(s->str, "W"))
312 return 1;
313 else if ((char *) strstr(s->str, "K"))
314 return 1;
315 else if ((char *) strstr(s->str, "CZ"))
316 return 1;
317 else if ((char *) strstr(s->str, "WITZ"))
318 return 1;
319 else
320 return 0;
324 static char
325 GetAt(metastring *s, int pos)
327 if ((pos < 0) || (pos >= s->length))
328 return '\0';
330 return ((char) *(s->str + pos));
334 static void
335 SetAt(metastring *s, int pos, char c)
337 if ((pos < 0) || (pos >= s->length))
338 return;
340 *(s->str + pos) = c;
345 Caveats: the START value is 0 based
347 static int
348 StringAt(metastring *s, int start, int length,...)
350 char *test;
351 char *pos;
352 va_list ap;
354 if ((start < 0) || (start >= s->length))
355 return 0;
357 pos = (s->str + start);
358 va_start(ap, length);
362 test = va_arg(ap, char *);
363 if (*test && (strncmp(pos, test, length) == 0))
365 va_end(ap);
366 return 1;
369 while (strcmp(test, "") != 0);
371 va_end(ap);
373 return 0;
377 static void
378 MetaphAdd(metastring *s, const char *new_str)
380 int add_length;
382 if (new_str == NULL)
383 return;
385 add_length = strlen(new_str);
386 if ((s->length + add_length) > (s->bufsize - 1))
387 IncreaseBuffer(s, add_length);
389 strcat(s->str, new_str);
390 s->length += add_length;
/*
 * DoubleMetaphone - compute the primary and secondary codes for str.
 *
 * str is copied into a working metastring ("original"), padded at the end
 * so the rules can index beyond the last letter, and upper-cased; the
 * caller's buffer itself is only read.  The main loop then looks at the
 * character at "current", appends zero or more code characters to the
 * primary and secondary results, and advances "current" by however many
 * input characters the matching rule consumed.  The loop ends once both
 * results are at least 4 characters long or "current" reaches the
 * unpadded input length.  Results longer than 4 characters are cut at 4.
 *
 * On return codes[0] points to the primary code and codes[1] to the
 * secondary code.
 */
394 static void
395 DoubleMetaphone(char *str, char **codes)
397 int length;
398 metastring *original;
399 metastring *primary;
400 metastring *secondary;
401 int current;
402 int last;
404 current = 0;
405 /* we need the real length and last prior to padding */
406 length = strlen(str);
407 last = length - 1;
408 original = NewMetaString(str);
409 /* Pad original so we can index beyond end */
410 MetaphAdd(original, " ");
412 primary = NewMetaString("");
413 secondary = NewMetaString("");
/* the result strings are handed back through codes[], so keep them alive */
414 primary->free_string_on_destroy = 0;
415 secondary->free_string_on_destroy = 0;
417 MakeUpper(original);
419 /* skip these when at start of word */
420 if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
421 current += 1;
423 /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
424 if (GetAt(original, 0) == 'X')
426 MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */
427 MetaphAdd(secondary, "S");
428 current += 1;
431 /* main loop */
432 while ((primary->length < 4) || (secondary->length < 4))
434 if (current >= length)
435 break;
437 switch (GetAt(original, current))
439 case 'A':
440 case 'E':
441 case 'I':
442 case 'O':
443 case 'U':
444 case 'Y':
445 if (current == 0)
447 /* all init vowels now map to 'A' */
448 MetaphAdd(primary, "A");
449 MetaphAdd(secondary, "A");
451 current += 1;
452 break;
454 case 'B':
456 /* "-mb", e.g", "dumb", already skipped over... */
457 MetaphAdd(primary, "P");
458 MetaphAdd(secondary, "P");
460 if (GetAt(original, current + 1) == 'B')
461 current += 2;
462 else
463 current += 1;
464 break;
466 case '\xc7': /* C with cedilla */
467 MetaphAdd(primary, "S");
468 MetaphAdd(secondary, "S");
469 current += 1;
470 break;
472 case 'C':
473 /* various germanic */
474 if ((current > 1)
475 && !IsVowel(original, current - 2)
476 && StringAt(original, (current - 1), 3, "ACH", "")
477 && ((GetAt(original, current + 2) != 'I')
478 && ((GetAt(original, current + 2) != 'E')
479 || StringAt(original, (current - 2), 6, "BACHER",
480 "MACHER", ""))))
482 MetaphAdd(primary, "K");
483 MetaphAdd(secondary, "K");
484 current += 2;
485 break;
488 /* special case 'caesar' */
489 if ((current == 0)
490 && StringAt(original, current, 6, "CAESAR", ""))
492 MetaphAdd(primary, "S");
493 MetaphAdd(secondary, "S");
494 current += 2;
495 break;
498 /* italian 'chianti' */
499 if (StringAt(original, current, 4, "CHIA", ""))
501 MetaphAdd(primary, "K");
502 MetaphAdd(secondary, "K");
503 current += 2;
504 break;
507 if (StringAt(original, current, 2, "CH", ""))
509 /* find 'michael' */
510 if ((current > 0)
511 && StringAt(original, current, 4, "CHAE", ""))
513 MetaphAdd(primary, "K");
514 MetaphAdd(secondary, "X");
515 current += 2;
516 break;
519 /* greek roots e.g. 'chemistry', 'chorus' */
520 if ((current == 0)
521 && (StringAt(original, (current + 1), 5,
522 "HARAC", "HARIS", "")
523 || StringAt(original, (current + 1), 3, "HOR",
524 "HYM", "HIA", "HEM", ""))
525 && !StringAt(original, 0, 5, "CHORE", ""))
527 MetaphAdd(primary, "K");
528 MetaphAdd(secondary, "K");
529 current += 2;
530 break;
533 /* germanic, greek, or otherwise 'ch' for 'kh' sound */
534 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
535 || StringAt(original, 0, 3, "SCH", ""))
536 /* 'architect but not 'arch', 'orchestra', 'orchid' */
537 || StringAt(original, (current - 2), 6, "ORCHES",
538 "ARCHIT", "ORCHID", "")
539 || StringAt(original, (current + 2), 1, "T", "S",
541 || ((StringAt(original, (current - 1), 1,
542 "A", "O", "U", "E", "")
543 || (current == 0))
546 * e.g., 'wachtler', 'wechsler', but not 'tichner'
548 && StringAt(original, (current + 2), 1, "L", "R",
549 "N", "M", "B", "H", "F", "V", "W",
550 " ", "")))
552 MetaphAdd(primary, "K");
553 MetaphAdd(secondary, "K");
555 else
557 if (current > 0)
559 if (StringAt(original, 0, 2, "MC", ""))
561 /* e.g., "McHugh" */
562 MetaphAdd(primary, "K");
563 MetaphAdd(secondary, "K");
565 else
567 MetaphAdd(primary, "X");
568 MetaphAdd(secondary, "K");
571 else
573 MetaphAdd(primary, "X");
574 MetaphAdd(secondary, "X");
577 current += 2;
578 break;
580 /* e.g, 'czerny' */
581 if (StringAt(original, current, 2, "CZ", "")
582 && !StringAt(original, (current - 2), 4, "WICZ", ""))
584 MetaphAdd(primary, "S");
585 MetaphAdd(secondary, "X");
586 current += 2;
587 break;
590 /* e.g., 'focaccia' */
591 if (StringAt(original, (current + 1), 3, "CIA", ""))
593 MetaphAdd(primary, "X");
594 MetaphAdd(secondary, "X");
595 current += 3;
596 break;
599 /* double 'C', but not if e.g. 'McClellan' */
600 if (StringAt(original, current, 2, "CC", "")
601 && !((current == 1) && (GetAt(original, 0) == 'M')))
603 /* 'bellocchio' but not 'bacchus' */
604 if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
605 && !StringAt(original, (current + 2), 2, "HU", ""))
607 /* 'accident', 'accede' 'succeed' */
608 if (((current == 1)
609 && (GetAt(original, current - 1) == 'A'))
610 || StringAt(original, (current - 1), 5, "UCCEE",
611 "UCCES", ""))
613 MetaphAdd(primary, "KS");
614 MetaphAdd(secondary, "KS");
615 /* 'bacci', 'bertucci', other italian */
617 else
619 MetaphAdd(primary, "X");
620 MetaphAdd(secondary, "X");
622 current += 3;
623 break;
625 else
626 { /* Pierce's rule */
627 MetaphAdd(primary, "K");
628 MetaphAdd(secondary, "K");
629 current += 2;
630 break;
634 if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
636 MetaphAdd(primary, "K");
637 MetaphAdd(secondary, "K");
638 current += 2;
639 break;
642 if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
644 /* italian vs. english */
645 if (StringAt
646 (original, current, 3, "CIO", "CIE", "CIA", ""))
648 MetaphAdd(primary, "S");
649 MetaphAdd(secondary, "X");
651 else
653 MetaphAdd(primary, "S");
654 MetaphAdd(secondary, "S");
656 current += 2;
657 break;
660 /* else */
661 MetaphAdd(primary, "K");
662 MetaphAdd(secondary, "K");
664 /* name sent in 'mac caffrey', 'mac gregor */
665 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
666 current += 3;
667 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
668 && !StringAt(original, (current + 1), 2,
669 "CE", "CI", ""))
670 current += 2;
671 else
672 current += 1;
673 break;
675 case 'D':
676 if (StringAt(original, current, 2, "DG", ""))
678 if (StringAt(original, (current + 2), 1,
679 "I", "E", "Y", ""))
681 /* e.g. 'edge' */
682 MetaphAdd(primary, "J");
683 MetaphAdd(secondary, "J");
684 current += 3;
685 break;
687 else
689 /* e.g. 'edgar' */
690 MetaphAdd(primary, "TK");
691 MetaphAdd(secondary, "TK");
692 current += 2;
693 break;
697 if (StringAt(original, current, 2, "DT", "DD", ""))
699 MetaphAdd(primary, "T");
700 MetaphAdd(secondary, "T");
701 current += 2;
702 break;
705 /* else */
706 MetaphAdd(primary, "T");
707 MetaphAdd(secondary, "T");
708 current += 1;
709 break;
711 case 'F':
712 if (GetAt(original, current + 1) == 'F')
713 current += 2;
714 else
715 current += 1;
716 MetaphAdd(primary, "F");
717 MetaphAdd(secondary, "F");
718 break;
720 case 'G':
721 if (GetAt(original, current + 1) == 'H')
723 if ((current > 0) && !IsVowel(original, current - 1))
725 MetaphAdd(primary, "K");
726 MetaphAdd(secondary, "K");
727 current += 2;
728 break;
731 if (current < 3)
733 /* 'ghislane', ghiradelli */
734 if (current == 0)
736 if (GetAt(original, current + 2) == 'I')
738 MetaphAdd(primary, "J");
739 MetaphAdd(secondary, "J");
741 else
743 MetaphAdd(primary, "K");
744 MetaphAdd(secondary, "K");
746 current += 2;
747 break;
752 * Parker's rule (with some further refinements) - e.g.,
753 * 'hugh'
755 if (((current > 1)
756 && StringAt(original, (current - 2), 1,
757 "B", "H", "D", ""))
758 /* e.g., 'bough' */
759 || ((current > 2)
760 && StringAt(original, (current - 3), 1,
761 "B", "H", "D", ""))
762 /* e.g., 'broughton' */
763 || ((current > 3)
764 && StringAt(original, (current - 4), 1,
765 "B", "H", "")))
767 current += 2;
768 break;
770 else
773 * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
774 * 'rough', 'tough'
776 if ((current > 2)
777 && (GetAt(original, current - 1) == 'U')
778 && StringAt(original, (current - 3), 1, "C",
779 "G", "L", "R", "T", ""))
781 MetaphAdd(primary, "F");
782 MetaphAdd(secondary, "F");
784 else if ((current > 0)
785 && GetAt(original, current - 1) != 'I')
789 MetaphAdd(primary, "K");
790 MetaphAdd(secondary, "K");
793 current += 2;
794 break;
798 if (GetAt(original, current + 1) == 'N')
800 if ((current == 1) && IsVowel(original, 0)
801 && !SlavoGermanic(original))
803 MetaphAdd(primary, "KN");
804 MetaphAdd(secondary, "N");
806 else
807 /* not e.g. 'cagney' */
808 if (!StringAt(original, (current + 2), 2, "EY", "")
809 && (GetAt(original, current + 1) != 'Y')
810 && !SlavoGermanic(original))
812 MetaphAdd(primary, "N");
813 MetaphAdd(secondary, "KN");
815 else
817 MetaphAdd(primary, "KN");
818 MetaphAdd(secondary, "KN");
820 current += 2;
821 break;
824 /* 'tagliaro' */
825 if (StringAt(original, (current + 1), 2, "LI", "")
826 && !SlavoGermanic(original))
828 MetaphAdd(primary, "KL");
829 MetaphAdd(secondary, "L");
830 current += 2;
831 break;
834 /* -ges-,-gep-,-gel-, -gie- at beginning */
835 if ((current == 0)
836 && ((GetAt(original, current + 1) == 'Y')
837 || StringAt(original, (current + 1), 2, "ES", "EP",
838 "EB", "EL", "EY", "IB", "IL", "IN", "IE",
839 "EI", "ER", "")))
841 MetaphAdd(primary, "K");
842 MetaphAdd(secondary, "J");
843 current += 2;
844 break;
847 /* -ger-, -gy- */
848 if ((StringAt(original, (current + 1), 2, "ER", "")
849 || (GetAt(original, current + 1) == 'Y'))
850 && !StringAt(original, 0, 6,
851 "DANGER", "RANGER", "MANGER", "")
852 && !StringAt(original, (current - 1), 1, "E", "I", "")
853 && !StringAt(original, (current - 1), 3, "RGY", "OGY", ""))
855 MetaphAdd(primary, "K");
856 MetaphAdd(secondary, "J");
857 current += 2;
858 break;
861 /* italian e.g, 'biaggi' */
862 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
863 || StringAt(original, (current - 1), 4,
864 "AGGI", "OGGI", ""))
866 /* obvious germanic */
867 if ((StringAt(original, 0, 4, "VAN ", "VON ", "")
868 || StringAt(original, 0, 3, "SCH", ""))
869 || StringAt(original, (current + 1), 2, "ET", ""))
871 MetaphAdd(primary, "K");
872 MetaphAdd(secondary, "K");
874 else
876 /* always soft if french ending */
877 if (StringAt
878 (original, (current + 1), 4, "IER ", ""))
880 MetaphAdd(primary, "J");
881 MetaphAdd(secondary, "J");
883 else
885 MetaphAdd(primary, "J");
886 MetaphAdd(secondary, "K");
889 current += 2;
890 break;
893 if (GetAt(original, current + 1) == 'G')
894 current += 2;
895 else
896 current += 1;
897 MetaphAdd(primary, "K");
898 MetaphAdd(secondary, "K");
899 break;
901 case 'H':
902 /* only keep if first & before vowel or btw. 2 vowels */
903 if (((current == 0) || IsVowel(original, current - 1))
904 && IsVowel(original, current + 1))
906 MetaphAdd(primary, "H");
907 MetaphAdd(secondary, "H");
908 current += 2;
910 else
911 /* also takes care of 'HH' */
912 current += 1;
913 break;
915 case 'J':
916 /* obvious spanish, 'jose', 'san jacinto' */
917 if (StringAt(original, current, 4, "JOSE", "")
918 || StringAt(original, 0, 4, "SAN ", ""))
920 if (((current == 0)
921 && (GetAt(original, current + 4) == ' '))
922 || StringAt(original, 0, 4, "SAN ", ""))
924 MetaphAdd(primary, "H");
925 MetaphAdd(secondary, "H");
927 else
929 MetaphAdd(primary, "J");
930 MetaphAdd(secondary, "H");
932 current += 1;
933 break;
936 if ((current == 0)
937 && !StringAt(original, current, 4, "JOSE", ""))
939 MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */
940 MetaphAdd(secondary, "A");
942 else
944 /* spanish pron. of e.g. 'bajador' */
945 if (IsVowel(original, current - 1)
946 && !SlavoGermanic(original)
947 && ((GetAt(original, current + 1) == 'A')
948 || (GetAt(original, current + 1) == 'O')))
950 MetaphAdd(primary, "J");
951 MetaphAdd(secondary, "H");
953 else
955 if (current == last)
957 MetaphAdd(primary, "J");
958 MetaphAdd(secondary, "");
960 else
962 if (!StringAt(original, (current + 1), 1, "L", "T",
963 "K", "S", "N", "M", "B", "Z", "")
964 && !StringAt(original, (current - 1), 1,
965 "S", "K", "L", ""))
967 MetaphAdd(primary, "J");
968 MetaphAdd(secondary, "J");
974 if (GetAt(original, current + 1) == 'J') /* it could happen! */
975 current += 2;
976 else
977 current += 1;
978 break;
980 case 'K':
981 if (GetAt(original, current + 1) == 'K')
982 current += 2;
983 else
984 current += 1;
985 MetaphAdd(primary, "K");
986 MetaphAdd(secondary, "K");
987 break;
989 case 'L':
990 if (GetAt(original, current + 1) == 'L')
992 /* spanish e.g. 'cabrillo', 'gallegos' */
993 if (((current == (length - 3))
994 && StringAt(original, (current - 1), 4, "ILLO",
995 "ILLA", "ALLE", ""))
996 || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
997 || StringAt(original, last, 1, "A", "O", ""))
998 && StringAt(original, (current - 1), 4,
999 "ALLE", "")))
1001 MetaphAdd(primary, "L");
1002 MetaphAdd(secondary, "");
1003 current += 2;
1004 break;
1006 current += 2;
1008 else
1009 current += 1;
1010 MetaphAdd(primary, "L");
1011 MetaphAdd(secondary, "L");
1012 break;
1014 case 'M':
1015 if ((StringAt(original, (current - 1), 3, "UMB", "")
1016 && (((current + 1) == last)
1017 || StringAt(original, (current + 2), 2, "ER", "")))
1018 /* 'dumb','thumb' */
1019 || (GetAt(original, current + 1) == 'M'))
1020 current += 2;
1021 else
1022 current += 1;
1023 MetaphAdd(primary, "M");
1024 MetaphAdd(secondary, "M");
1025 break;
1027 case 'N':
1028 if (GetAt(original, current + 1) == 'N')
1029 current += 2;
1030 else
1031 current += 1;
1032 MetaphAdd(primary, "N");
1033 MetaphAdd(secondary, "N");
1034 break;
1036 case '\xd1': /* N with tilde */
1037 current += 1;
1038 MetaphAdd(primary, "N");
1039 MetaphAdd(secondary, "N");
1040 break;
1042 case 'P':
1043 if (GetAt(original, current + 1) == 'H')
1045 MetaphAdd(primary, "F");
1046 MetaphAdd(secondary, "F");
1047 current += 2;
1048 break;
1051 /* also account for "campbell", "raspberry" */
1052 if (StringAt(original, (current + 1), 1, "P", "B", ""))
1053 current += 2;
1054 else
1055 current += 1;
1056 MetaphAdd(primary, "P");
1057 MetaphAdd(secondary, "P");
1058 break;
1060 case 'Q':
1061 if (GetAt(original, current + 1) == 'Q')
1062 current += 2;
1063 else
1064 current += 1;
1065 MetaphAdd(primary, "K");
1066 MetaphAdd(secondary, "K");
1067 break;
1069 case 'R':
1070 /* french e.g. 'rogier', but exclude 'hochmeier' */
1071 if ((current == last)
1072 && !SlavoGermanic(original)
1073 && StringAt(original, (current - 2), 2, "IE", "")
1074 && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
1076 MetaphAdd(primary, "");
1077 MetaphAdd(secondary, "R");
1079 else
1081 MetaphAdd(primary, "R");
1082 MetaphAdd(secondary, "R");
1085 if (GetAt(original, current + 1) == 'R')
1086 current += 2;
1087 else
1088 current += 1;
1089 break;
1091 case 'S':
1092 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
1093 if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
1095 current += 1;
1096 break;
1099 /* special case 'sugar-' */
1100 if ((current == 0)
1101 && StringAt(original, current, 5, "SUGAR", ""))
1103 MetaphAdd(primary, "X");
1104 MetaphAdd(secondary, "S");
1105 current += 1;
1106 break;
1109 if (StringAt(original, current, 2, "SH", ""))
1111 /* germanic */
1112 if (StringAt
1113 (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
1114 "HOLZ", ""))
1116 MetaphAdd(primary, "S");
1117 MetaphAdd(secondary, "S");
1119 else
1121 MetaphAdd(primary, "X");
1122 MetaphAdd(secondary, "X");
1124 current += 2;
1125 break;
1128 /* italian & armenian */
1129 if (StringAt(original, current, 3, "SIO", "SIA", "")
1130 || StringAt(original, current, 4, "SIAN", ""))
1132 if (!SlavoGermanic(original))
1134 MetaphAdd(primary, "S");
1135 MetaphAdd(secondary, "X");
1137 else
1139 MetaphAdd(primary, "S");
1140 MetaphAdd(secondary, "S");
1142 current += 3;
1143 break;
1147 * german & anglicisations, e.g. 'smith' match 'schmidt',
1148 * 'snider' match 'schneider' also, -sz- in slavic language
1149 * although in hungarian it is pronounced 's'
1151 if (((current == 0)
1152 && StringAt(original, (current + 1), 1,
1153 "M", "N", "L", "W", ""))
1154 || StringAt(original, (current + 1), 1, "Z", ""))
1156 MetaphAdd(primary, "S");
1157 MetaphAdd(secondary, "X");
1158 if (StringAt(original, (current + 1), 1, "Z", ""))
1159 current += 2;
1160 else
1161 current += 1;
1162 break;
1165 if (StringAt(original, current, 2, "SC", ""))
1167 /* Schlesinger's rule */
1168 if (GetAt(original, current + 2) == 'H')
1170 /* dutch origin, e.g. 'school', 'schooner' */
1171 if (StringAt(original, (current + 3), 2,
1172 "OO", "ER", "EN",
1173 "UY", "ED", "EM", ""))
1175 /* 'schermerhorn', 'schenker' */
1176 if (StringAt(original, (current + 3), 2,
1177 "ER", "EN", ""))
1179 MetaphAdd(primary, "X");
1180 MetaphAdd(secondary, "SK");
1182 else
1184 MetaphAdd(primary, "SK");
1185 MetaphAdd(secondary, "SK");
1187 current += 3;
1188 break;
1190 else
1192 if ((current == 0) && !IsVowel(original, 3)
1193 && (GetAt(original, 3) != 'W'))
1195 MetaphAdd(primary, "X");
1196 MetaphAdd(secondary, "S");
1198 else
1200 MetaphAdd(primary, "X");
1201 MetaphAdd(secondary, "X");
1203 current += 3;
1204 break;
1208 if (StringAt(original, (current + 2), 1,
1209 "I", "E", "Y", ""))
1211 MetaphAdd(primary, "S");
1212 MetaphAdd(secondary, "S");
1213 current += 3;
1214 break;
1216 /* else */
1217 MetaphAdd(primary, "SK");
1218 MetaphAdd(secondary, "SK");
1219 current += 3;
1220 break;
1223 /* french e.g. 'resnais', 'artois' */
1224 if ((current == last)
1225 && StringAt(original, (current - 2), 2, "AI", "OI", ""))
1227 MetaphAdd(primary, "");
1228 MetaphAdd(secondary, "S");
1230 else
1232 MetaphAdd(primary, "S");
1233 MetaphAdd(secondary, "S");
1236 if (StringAt(original, (current + 1), 1, "S", "Z", ""))
1237 current += 2;
1238 else
1239 current += 1;
1240 break;
1242 case 'T':
1243 if (StringAt(original, current, 4, "TION", ""))
1245 MetaphAdd(primary, "X");
1246 MetaphAdd(secondary, "X");
1247 current += 3;
1248 break;
1251 if (StringAt(original, current, 3, "TIA", "TCH", ""))
1253 MetaphAdd(primary, "X");
1254 MetaphAdd(secondary, "X");
1255 current += 3;
1256 break;
1259 if (StringAt(original, current, 2, "TH", "")
1260 || StringAt(original, current, 3, "TTH", ""))
1262 /* special case 'thomas', 'thames' or germanic */
1263 if (StringAt(original, (current + 2), 2, "OM", "AM", "")
1264 || StringAt(original, 0, 4, "VAN ", "VON ", "")
1265 || StringAt(original, 0, 3, "SCH", ""))
1267 MetaphAdd(primary, "T");
1268 MetaphAdd(secondary, "T");
1270 else
1272 MetaphAdd(primary, "0");
1273 MetaphAdd(secondary, "T");
1275 current += 2;
1276 break;
1279 if (StringAt(original, (current + 1), 1, "T", "D", ""))
1280 current += 2;
1281 else
1282 current += 1;
1283 MetaphAdd(primary, "T");
1284 MetaphAdd(secondary, "T");
1285 break;
1287 case 'V':
1288 if (GetAt(original, current + 1) == 'V')
1289 current += 2;
1290 else
1291 current += 1;
1292 MetaphAdd(primary, "F");
1293 MetaphAdd(secondary, "F");
1294 break;
1296 case 'W':
1297 /* can also be in middle of word */
1298 if (StringAt(original, current, 2, "WR", ""))
1300 MetaphAdd(primary, "R");
1301 MetaphAdd(secondary, "R");
1302 current += 2;
1303 break;
1306 if ((current == 0)
1307 && (IsVowel(original, current + 1)
1308 || StringAt(original, current, 2, "WH", "")))
1310 /* Wasserman should match Vasserman */
1311 if (IsVowel(original, current + 1))
1313 MetaphAdd(primary, "A");
1314 MetaphAdd(secondary, "F");
1316 else
1318 /* need Uomo to match Womo */
1319 MetaphAdd(primary, "A");
1320 MetaphAdd(secondary, "A");
1324 /* Arnow should match Arnoff */
1325 if (((current == last) && IsVowel(original, current - 1))
1326 || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
1327 "OWSKI", "OWSKY", "")
1328 || StringAt(original, 0, 3, "SCH", ""))
1330 MetaphAdd(primary, "");
1331 MetaphAdd(secondary, "F");
1332 current += 1;
1333 break;
1336 /* polish e.g. 'filipowicz' */
1337 if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
1339 MetaphAdd(primary, "TS");
1340 MetaphAdd(secondary, "FX");
1341 current += 4;
1342 break;
1345 /* else skip it */
1346 current += 1;
1347 break;
1349 case 'X':
1350 /* french e.g. breaux */
1351 if (!((current == last)
1352 && (StringAt(original, (current - 3), 3,
1353 "IAU", "EAU", "")
1354 || StringAt(original, (current - 2), 2,
1355 "AU", "OU", ""))))
1357 MetaphAdd(primary, "KS");
1358 MetaphAdd(secondary, "KS");
1362 if (StringAt(original, (current + 1), 1, "C", "X", ""))
1363 current += 2;
1364 else
1365 current += 1;
1366 break;
1368 case 'Z':
1369 /* chinese pinyin e.g. 'zhao' */
1370 if (GetAt(original, current + 1) == 'H')
1372 MetaphAdd(primary, "J");
1373 MetaphAdd(secondary, "J");
1374 current += 2;
1375 break;
1377 else if (StringAt(original, (current + 1), 2,
1378 "ZO", "ZI", "ZA", "")
1379 || (SlavoGermanic(original)
1380 && ((current > 0)
1381 && GetAt(original, current - 1) != 'T')))
1383 MetaphAdd(primary, "S");
1384 MetaphAdd(secondary, "TS");
1386 else
1388 MetaphAdd(primary, "S");
1389 MetaphAdd(secondary, "S");
1392 if (GetAt(original, current + 1) == 'Z')
1393 current += 2;
1394 else
1395 current += 1;
1396 break;
1398 default:
1399 current += 1;
1403 * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
1404 * secondary->str);
/* cut each result down to at most 4 characters */
1409 if (primary->length > 4)
1410 SetAt(primary, 4, '\0');
1412 if (secondary->length > 4)
1413 SetAt(secondary, 4, '\0');
/* codes[0] receives the primary code, codes[1] the secondary code */
1415 *codes = primary->str;
1416 *++codes = secondary->str;
/* drop the working copy and the two wrappers; the result strings survive */
1418 DestroyMetaString(original);
1419 DestroyMetaString(primary);
1420 DestroyMetaString(secondary);
#ifdef DMETAPHONE_MAIN

/*
 * just for testing - not part of the perl code
 *
 * Stand-alone driver: prints "primary|secondary" for the first command-line
 * argument.  With no argument it prints nothing.  Always returns 0.
 */
int
main(int argc, char **argv)
{
	char	   *codes[2];

	if (argc > 1)
	{
		DoubleMetaphone(argv[1], codes);
		printf("%s|%s\n", codes[0], codes[1]);

		/* DoubleMetaphone leaves the result strings for the caller to free */
		META_FREE(codes[0]);
		META_FREE(codes[1]);
	}

	return 0;
}

#endif