3 # Generation of types and lookup tables for Daitch-Mokotoff soundex.
5 # Copyright (c) 2023, PostgreSQL Global Development Group
7 # This module was originally sponsored by Finance Norway /
8 # Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
# Exactly one command-line argument is required: the path of the file to
# generate. The trailing "\n" in the message keeps Perl from appending the
# script name and line number to the usage text.
14 die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
15 my $output_file = $ARGV[0];
17 # Open the output file for writing ('>' creates it or truncates an existing one).
# On failure, abort and include the OS error text from $! in the message.
18 open my $OUTPUT, '>', $output_file
19 or die "Could not open output file $output_file: $!\n";
21 # Parse code table and generate tree for letter transitions.
23 my $table = [ {}, [ [ "", "", "" ] ] ];
27 my ($letters, $codes) = split(/\s+/);
28 my @codes = map { [ split(/,/) ] } split(/\|/, $codes);
30 my $key = "codes_" . join("_or_", map { join("_", @
$_) } @codes);
35 . join(", ", map { "\"$_\"" } @
$_) . "\n\t}"
39 for my $letter (split(/,/, $letters))
41 my $ref = $table->[0];
42 # Link each character to the next in the letter combination.
43 my @c = split(//, $letter);
47 $ref->{$c} //= [ {}, undef ];
51 # The sound code for the letter combination is stored at the last character.
52 $ref->{$last_c}[1] = $key;
59 * Constants and lookup tables for Daitch-Mokotoff Soundex
61 * Copyright (c) 2023, PostgreSQL Global Development Group
63 * This file is generated by daitch_mokotoff_header.pl
66 /* Coding chart table: Soundex codes */
67 typedef char dm_code[2 + 1]; /* One or two sequential code digits + NUL */
68 typedef dm_code dm_codes[3]; /* Start of name, before a vowel, any other */
70 /* Coding chart table: Letter in input sequence */
73 char letter; /* Present letter in sequence */
74 const struct dm_letter *letters; /* List of possible successive letters */
75 const dm_codes *codes; /* Code sequence(s) for complete sequence */
78 typedef struct dm_letter dm_letter;
80 /* Codes for letter sequence at start of name, before a vowel, and any other. */
83 for my $key (sort keys %codes)
85 print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
92 /* Coding for alternative following letters in sequence. */
97 my ($ref, $letter) = @_;
102 for my $key (sort keys %$h)
105 my $children = "NULL";
106 if (defined $ref->[0])
108 $children = "letter_$letter$key";
109 hash2code
($ref, "$letter$key");
111 my $codes = $ref->[1] // "NULL";
112 push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
115 print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
118 print $OUTPUT "$_,\n";
120 print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
121 print $OUTPUT "};\n";
# Emit the dm_letter arrays for the whole transition tree held in $table.
# The empty prefix means the top-level array is printed as "letter_", and
# nested arrays get the accumulated letter sequence appended to that name.
124 hash2code
($table, '');
128 # Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
130 # The conversion from the coding chart to the table should be
131 # self-explanatory, but note the differences stated below.
135 # The non-ASCII letters in the coding chart are coded with substitute
136 # lowercase ASCII letters, which sort after the uppercase ASCII letters:
138 # Ą => a (use '[' for table lookup)
139 # Ę => e (use '\\' for table lookup)
140 # Ţ => t (use ']' for table lookup)
142 # The rule for "UE" does not correspond to the coding chart; however,
143 # it is used by all other known implementations, including the one at
144 # https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
146 # Note that the implementation assumes that vowels are assigned code
147 # 0 or 1. "J" can be either a vowel or a consonant.
190 SCHTSCH,SCHTSH,SCHTCH	2,4,4
192 SHTCH,SHCH,SHTSH	2,4,4
193 SHT,SCHT,SCHD	2,43,43
199 SZT,SHD,SZD,SD	2,43,43