Restore initdb's old behavior of always setting the lc_xxx GUCs.
[pgsql.git] / contrib / fuzzystrmatch / daitch_mokotoff_header.pl
blob51a40e774898a7c24651ccce74e1fbe9f6c38779
1 #!/usr/bin/perl
3 # Generation of types and lookup tables for Daitch-Mokotoff soundex.
5 # Copyright (c) 2023, PostgreSQL Global Development Group
7 # This module was originally sponsored by Finance Norway /
8 # Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
11 use strict;
12 use warnings;
# Exactly one argument is expected: the path of the header file to generate.
@ARGV == 1
  or die "Usage: $0 OUTPUT_FILE\n";
my ($output_file) = @ARGV;

# Create (or truncate) the output file; all generated C code is written here.
open(my $OUTPUT, '>', $output_file)
  or die "Could not open output file $output_file: $!\n";
# Parse code table and generate tree for letter transitions.
#
# Each line of the __DATA__ section has the form "LETTERS CODES", where
# LETTERS is a comma-separated list of letter sequences that share a coding,
# and CODES is one or two '|'-separated alternatives, each of which is a
# comma-separated triple of codes.
#
# %codes maps a generated C identifier ("codes_...") to the C initializer
# text for that set of alternatives.  $table->[0] is the root hash of the
# letter tree: each character maps to a node [ successors, codes key ],
# where successors is a hash of the same shape (or undef for a leaf).
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (my $line = <DATA>)
{
	chomp $line;

	# A blank line would otherwise produce an empty "codes_" table.
	next if $line !~ /\S/;

	my ($letters, $codes) = split(/\s+/, $line);
	die "Malformed coding chart line $.: \"$line\"\n"
	  unless defined $codes && length $letters;

	my @codes = map { [ split(/,/) ] } split(/\|/, $codes);

	# The generated arrays are declared as dm_codes[2], each dm_codes holds
	# three dm_code strings, and a dm_code holds at most two characters.
	# Reject anything else here rather than emitting a wrong table.
	die "Expected one or two code alternatives at line $.: \"$line\"\n"
	  unless @codes == 1 || @codes == 2;
	for my $alternative (@codes)
	{
		die "Expected three codes per alternative at line $.: \"$line\"\n"
		  unless @$alternative == 3;
		die "Expected codes of one or two characters at line $.: \"$line\"\n"
		  if grep { length($_) < 1 || length($_) > 2 } @$alternative;
	}

	my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
	my $val = join(
		",\n",
		map {
			"\t{\n\t\t"
			  . join(", ", map { "\"$_\"" } @$_) . "\n\t}"
		} @codes);
	$codes{$key} = $val;

	for my $letter (split(/,/, $letters))
	{
		my $ref = $table->[0];
		# Link each character to the next in the letter combination.
		my @c = split(//, $letter);
		my $last_c = pop(@c);
		for my $c (@c)
		{
			$ref->{$c} //= [ {}, undef ];
			# The node may already exist as a leaf without successors.
			$ref->{$c}[0] //= {};
			$ref = $ref->{$c}[0];
		}
		# The sound code for the letter combination is stored at the last character.
		$ref->{$last_c}[1] = $key;
	}
}
close(DATA);
# Emit the fixed preamble of the generated C header: the banner comment and
# the type definitions used by the tables that follow.
print $OUTPUT <<EOF;
/*
 * Constants and lookup tables for Daitch-Mokotoff Soundex
 *
 * Copyright (c) 2023, PostgreSQL Global Development Group
 *
 * This file is generated by daitch_mokotoff_header.pl
 */

/* Coding chart table: Soundex codes */
typedef char dm_code[2 + 1];	/* One or two sequential code digits + NUL */
typedef dm_code dm_codes[3];	/* Start of name, before a vowel, any other */

/* Coding chart table: Letter in input sequence */
struct dm_letter
{
	char		letter;			/* Present letter in sequence */
	const struct dm_letter *letters;	/* List of possible successive letters */
	const dm_codes *codes;		/* Code sequence(s) for complete sequence */
};

typedef struct dm_letter dm_letter;

/* Codes for letter sequence at start of name, before a vowel, and any other. */
EOF

# Emit one dm_codes array per distinct coding, in sorted key order so that
# the output does not depend on hash ordering.
for my $key (sort keys %codes)
{
	print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
	  . $codes{$key}
	  . "\n};\n";
}

print $OUTPUT <<EOF;

/* Coding for alternative following letters in sequence. */
EOF
# Write the C lookup table for one node of the letter tree to $OUTPUT.
#
# $ref is a tree node whose element 0 is a hash from the next character to
# its node; $letter is the letter sequence leading to this node and becomes
# the suffix of the emitted array name ("letter_$letter").
#
# Tables for child nodes are written before the table that refers to them.
sub hash2code
{
	my ($ref, $letter) = @_;

	my $successors = $ref->[0];
	my @entries;
	foreach my $char (sort keys %$successors)
	{
		my $node = $successors->{$char};
		my $sequence = $letter . $char;

		# Recurse first, so that the child table is emitted ahead of us.
		my $has_children = defined $node->[0];
		hash2code($node, $sequence) if $has_children;

		my $children = $has_children ? "letter_$sequence" : "NULL";
		my $codes = defined $node->[1] ? $node->[1] : "NULL";
		push @entries, "\t{\n\t\t'$char', $children, $codes\n\t}";
	}

	# Every table ends with a sentinel entry for the NUL character.
	print $OUTPUT join('',
		"static const dm_letter letter_$letter\[\] =\n{\n",
		(map { "$_,\n" } @entries),
		"\t{\n\t\t'\\0'\n\t}\n",
		"};\n");
}
# Emit the letter tables, starting from the root of the transition tree.
hash2code($table, '');

# Buffered write errors (e.g. a full disk) may only be reported when the
# handle is closed, so a failed close must not be silently ignored.
close($OUTPUT)
  or die "Could not close output file $output_file: $!\n";
128 # Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
130 # The conversion from the coding chart to the table should be self
131 # explanatory, but note the differences stated below.
133 # X = NC (not coded)
135 # The non-ASCII letters in the coding chart are coded with substitute
136 # lowercase ASCII letters, which sort after the uppercase ASCII letters:
138 # Ą => a (use '[' for table lookup)
139 # Ę => e (use '\\' for table lookup)
140 # Ţ => t (use ']' for table lookup)
142 # The rule for "UE" does not correspond to the coding chart, however
143 # it is used by all other known implementations, including the one at
144 # https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
146 # Note that the implementation assumes that vowels are assigned code
147 # 0 or 1. "J" can be either a vowel or a consonant.
150 __DATA__
151 AI,AJ,AY 0,1,X
152 AU 0,7,X
153 a X,X,6|X,X,X
154 A 0,X,X
155 B 7,7,7
156 CHS 5,54,54
157 CH 5,5,5|4,4,4
158 CK 5,5,5|45,45,45
159 CZ,CS,CSZ,CZS 4,4,4
160 C 5,5,5|4,4,4
161 DRZ,DRS 4,4,4
162 DS,DSH,DSZ 4,4,4
163 DZ,DZH,DZS 4,4,4
164 D,DT 3,3,3
165 EI,EJ,EY 0,1,X
166 EU 1,1,X
167 e X,X,6|X,X,X
168 E 0,X,X
169 FB 7,7,7
170 F 7,7,7
171 G 5,5,5
172 H 5,5,X
173 IA,IE,IO,IU 1,X,X
174 I 0,X,X
175 J 1,X,X|4,4,4
176 KS 5,54,54
177 KH 5,5,5
178 K 5,5,5
179 L 8,8,8
180 MN 66,66,66
181 M 6,6,6
182 NM 66,66,66
183 N 6,6,6
184 OI,OJ,OY 0,1,X
185 O 0,X,X
186 P,PF,PH 7,7,7
187 Q 5,5,5
188 RZ,RS 94,94,94|4,4,4
189 R 9,9,9
190 SCHTSCH,SCHTSH,SCHTCH 2,4,4
191 SCH 4,4,4
192 SHTCH,SHCH,SHTSH 2,4,4
193 SHT,SCHT,SCHD 2,43,43
194 SH 4,4,4
195 STCH,STSCH,SC 2,4,4
196 STRZ,STRS,STSH 2,4,4
197 ST 2,43,43
198 SZCZ,SZCS 2,4,4
199 SZT,SHD,SZD,SD 2,43,43
200 SZ 4,4,4
201 S 4,4,4
202 TCH,TTCH,TTSCH 4,4,4
203 TH 3,3,3
204 TRZ,TRS 4,4,4
205 TSCH,TSH 4,4,4
206 TS,TTS,TTSZ,TC 4,4,4
207 TZ,TTZ,TZS,TSZ 4,4,4
208 t 3,3,3|4,4,4
209 T 3,3,3
210 UI,UJ,UY,UE 0,1,X
211 U 0,X,X
212 V 7,7,7
213 W 7,7,7
214 X 5,54,54
215 Y 1,X,X
216 ZDZ,ZDZH,ZHDZH 2,4,4
217 ZD,ZHD 2,43,43
218 ZH,ZS,ZSCH,ZSH 4,4,4
219 Z 4,4,4