misc: a few portability tweaks
[rb-79.git] / tools / gen-unicode-transforms.pl
blobc4c1ecf5b5277d59b6db292332aa5edd65127049
1 #!/usr/bin/perl -w -CS
3 # Copyright (c) 2017, De Rais <derais@cock.li>
4 #
5 # Permission to use, copy, modify, and/or distribute this software for
6 # any purpose with or without fee is hereby granted, provided that the
7 # above copyright notice and this permission notice appear in all
8 # copies.
9 #
10 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
11 # WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
12 # WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
13 # AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
14 # DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
15 # PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
16 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
17 # PERFORMANCE OF THIS SOFTWARE.
19 # This tool creates a header file for use in sanitize-comment.c.
20 # The goal is to turn Unicode input into Unicode input that can be
21 # easily searched (via regex or such). To do this, there are a few
22 # conceptual steps.
24 # - Perform every transformation listed in confusables.txt. (This
25 # turns £ into L).
27 # - Replace every character with its Decompositional_Mapping, if
28 # appropriate. (This turns ᴬ into A.)
30 # - Delete every character whose General_Category is Mark, Control,
31 # Format, Modifier_Symbol, Punctuation (this gets rid of *, the
32 # ( and ) from ⑨ , etc.)
34 # - Delete every character whose General_Category is Space, but
35 # is not U+0020. (This gets rid of U+200A HAIR SPACE, etc.)
37 # The problem is that these operations do not obviously compose
38 # nicely. It's not assured that a Decompositional_Mapping won't
39 # result in a confusable character, nor that a confusable transformation
40 # will never result in a decomposable character. So we (conceptually)
41 # create the identity transformation, then repeatedly hit it with
42 # all four of these transformations until it (hopefully) stabilizes.
43 # There's no guarantee that it will actually stabilize, which is
44 # why it's broken out into a separate script.
46 use strict;
47 use warnings;
48 use utf8;
50 use Data::Dumper;
52 $| = 1;
54 # Key: a string like "00A3". Value: an array like [ "004C" ]
55 my $transform = {};
57 # We start off with the recommendations of confusables.txt
58 #open CONFUSABLES, "wget -q -O- http://www.unicode.org/Public/security/latest/confusables.txt|" or die;
59 open CONFUSABLES, "<confusables.txt" or die;
60 while (<CONFUSABLES>) {
61 my ($from, $to) = $_ =~ m/^\s*([0-9A-Fa-f]+)\s*;\s*([0-9A-Fa-f ]+)\s*;/;
63 if (not $to) {
64 next;
67 $transform->{$from} = [ split / /, $to ];
69 close CONFUSABLES;
71 # We now add the decompositional mappings
72 #open EVERYTHING, "wget -q -O- http://www.unicode.org/Public/UNIDATA/UnicodeData.txt|" or die;
73 open EVERYTHING, "<UnicodeData.txt" or die;
74 while (<EVERYTHING>) {
75 my @contents = split /;/,$_;
76 my $codepoint = $contents[0];
77 my $codepoint_h = hex($codepoint);
78 my $name = $contents[1];
79 my $general_category = $contents[2];
80 my $decompositional_mapping = $contents[5];
81 $decompositional_mapping =~ s/<[^>]*>\s*//;
83 if ($general_category =~ /^(M|C|P|Sk)/ and $codepoint ne "0020") {
84 $transform->{$codepoint} = [];
85 next;
88 # These take up more space than they're worth
89 if ($name =~ /^(ARABIC LIGATURE|CJK|HIRANGA|HANGUL|TAMIL|CANADIAN)/) {
90 $transform->{$codepoint} = [];
91 next;
94 # Now, do the decompositions and the confusables have a
95 # contest? Confusables win.
96 if ($transform->{$codepoint}) {
97 next;
100 # Otherwise, put in stuff for decomposition
101 if ($decompositional_mapping ne "") {
102 $transform->{$codepoint} = [ split / /, $decompositional_mapping ];
105 close EVERYTHING;
107 # A bunch of CJK stuff
108 for (my $codepoint_h = 0x31c0; $codepoint_h <= 0x9fff; ++$codepoint_h) {
109 my $codepoint = uc sprintf "%04x", $codepoint_h;
110 $transform->{$codepoint} = [];
113 # A bunch of Arabic stuff
114 for (my $codepoint_h = 0x1ee00; $codepoint_h <= 0x1eeff; ++$codepoint_h) {
115 my $codepoint = uc sprintf "%04x", $codepoint_h;
116 $transform->{$codepoint} = [];
119 # Some minor languages that don't really look like Latin-1
120 for (my $codepoint_h = 0x1b00; $codepoint_h <= 0x1ccf; ++$codepoint_h) {
121 my $codepoint = uc sprintf "%04x", $codepoint_h;
122 $transform->{$codepoint} = [];
125 # Squared capital letters that haven't caught up
126 for (my $codepoint_h = 0x1f170; $codepoint_h <= 0x1f189; ++$codepoint_h) {
127 my $dest = uc sprintf "%04x", ($codepoint_h - 0x1f170 + 0x41);
128 my $codepoint = uc sprintf "%04x", $codepoint_h;
129 $transform->{$codepoint} = [ $dest ];
130 $codepoint = uc sprintf "%04x", ($codepoint_h - 0x1f170 + 0x1f130);
131 $transform->{$codepoint} = [ $dest ];
134 # Some of the transforms are dumb.
135 delete $transform->{"0030"}; # 0
136 delete $transform->{"0031"}; # 1
137 delete $transform->{"0049"}; # I
138 delete $transform->{"006D"}; # m
139 delete $transform->{"0077"}; # w
140 delete $transform->{"007C"}; # |
141 $transform->{"FF29"} = [ "0049" ]; # I should go to I, not l
143 # The Unicode Consortium will never take these
144 $transform->{"00A9"} = [ "0043" ];
145 $transform->{"00AE"} = [ "0052" ];
146 $transform->{"00DF"} = [ "0042" ];
147 $transform->{"01AB"} = [ "0074" ];
148 $transform->{"0272"} = [ "006E" ];
149 $transform->{"0274"} = [ "004E" ];
150 $transform->{"0291"} = [ "007A" ];
151 $transform->{"0298"} = [ "004F" ];
152 $transform->{"029F"} = [ "004C" ];
153 $transform->{"02B3"} = [ "0072" ];
154 $transform->{"0629"} = [ "006F" ];
155 $transform->{"0644"} = [ "004A" ];
156 $transform->{"1472"} = [ "0062" ];
157 $transform->{"1473"} = [ "0062" ];
158 $transform->{"1D07"} = [ "0045" ];
159 $transform->{"2117"} = [ "0050" ];
160 $transform->{"2365"} = [ "004F" ];
161 $transform->{"A793"} = [ "0065" ];
163 $transform->{"2776"} = [ "0028", "0031", "0029" ];
164 $transform->{"2777"} = [ "0028", "0032", "0029" ];
165 $transform->{"2778"} = [ "0028", "0033", "0029" ];
166 $transform->{"2779"} = [ "0028", "0034", "0029" ];
167 $transform->{"277A"} = [ "0028", "0035", "0029" ];
168 $transform->{"277B"} = [ "0028", "0036", "0029" ];
169 $transform->{"277C"} = [ "0028", "0037", "0029" ];
170 $transform->{"277D"} = [ "0028", "0038", "0029" ];
171 $transform->{"277E"} = [ "0028", "0039", "0029" ];
172 $transform->{"277F"} = [ "0028", "0031", "0030", "0029" ];
174 $transform->{"2780"} = [ "0028", "0031", "0029" ];
175 $transform->{"2781"} = [ "0028", "0032", "0029" ];
176 $transform->{"2782"} = [ "0028", "0033", "0029" ];
177 $transform->{"2783"} = [ "0028", "0034", "0029" ];
178 $transform->{"2784"} = [ "0028", "0035", "0029" ];
179 $transform->{"2785"} = [ "0028", "0036", "0029" ];
180 $transform->{"2786"} = [ "0028", "0037", "0029" ];
181 $transform->{"2787"} = [ "0028", "0038", "0029" ];
182 $transform->{"2788"} = [ "0028", "0039", "0029" ];
183 $transform->{"2789"} = [ "0028", "0031", "0030", "0029" ];
185 $transform->{"278A"} = [ "0028", "0031", "0029" ];
186 $transform->{"278B"} = [ "0028", "0032", "0029" ];
187 $transform->{"278C"} = [ "0028", "0033", "0029" ];
188 $transform->{"278D"} = [ "0028", "0034", "0029" ];
189 $transform->{"278E"} = [ "0028", "0035", "0029" ];
190 $transform->{"278F"} = [ "0028", "0036", "0029" ];
191 $transform->{"2790"} = [ "0028", "0037", "0029" ];
192 $transform->{"2791"} = [ "0028", "0038", "0029" ];
193 $transform->{"2792"} = [ "0028", "0039", "0029" ];
194 $transform->{"2793"} = [ "0028", "0031", "0030", "0029" ];
196 # I disagree with m ~ rn and w ~ vv - moved to the iteration
197 $transform->{"0460"} = [ "0077" ];
198 # $transform->{"0461"} = [ "0077" ];
199 # $transform->{"047D"} = [ "0077" ];
200 # $transform->{"04CE"} = [ "006D" ];
201 # $transform->{"1D55e"} = [ "006D" ];
202 # $transform->{"1D568"} = [ "0077" ];
203 # $transform->{"1D592"} = [ "006D" ];
204 # $transform->{"1D59C"} = [ "0077" ];
205 # $transform->{"1D5C6"} = [ "006D" ];
206 # $transform->{"1D5D0"} = [ "0077" ];
207 # $transform->{"1D5FA"} = [ "006D" ];
208 # $transform->{"1D604"} = [ "0077" ];
209 # $transform->{"1D62E"} = [ "0077" ];
210 # $transform->{"1D638"} = [ "006D" ];
211 # $transform->{"1D662"} = [ "006D" ];
212 # $transform->{"1D66C"} = [ "0077" ];
213 # $transform->{"1D696"} = [ "006D" ];
214 # $transform->{"1D6A0"} = [ "0077" ];
216 # The unicode consortium might take these, but I don't care to wait
217 $transform->{"0138"} = [ "006B" ];
218 $transform->{"0185"} = [ "0062" ];
219 $transform->{"01A9"} = [ "03A3" ];
220 $transform->{"01F6"} = [ "0048" ];
221 $transform->{"024B"} = [ "0071" ];
222 $transform->{"0262"} = [ "0047" ];
223 $transform->{"0262"} = [ "0047" ];
224 $transform->{"0278"} = [ "03A6" ];
225 $transform->{"0280"} = [ "0052" ];
226 $transform->{"028A"} = [ "0055" ];
227 $transform->{"028C"} = [ "039B" ];
228 $transform->{"028D"} = [ "004D" ];
229 $transform->{"0299"} = [ "0042" ];
230 $transform->{"029C"} = [ "0048" ];
231 $transform->{"03C0"} = [ "006E" ];
232 $transform->{"03C7"} = [ "0058" ];
233 $transform->{"03DD"} = [ "0066" ];
234 $transform->{"0423"} = [ "0079" ];
235 $transform->{"0427"} = [ "0079" ];
236 $transform->{"0447"} = [ "0079" ];
237 $transform->{"04B6"} = [ "0079" ];
238 $transform->{"04B7"} = [ "0079" ];
239 $transform->{"1471"} = [ "0064" ];
240 $transform->{"1D00"} = [ "0041" ];
241 $transform->{"1D05"} = [ "0044" ];
242 $transform->{"1D0A"} = [ "004A" ];
243 $transform->{"1D18"} = [ "0050" ];
244 $transform->{"1D1B"} = [ "0054" ];
245 $transform->{"1E9F"} = [ "03B4" ];
246 $transform->{"A727"} = [ "0068" ];
248 delete $transform->{"03A3"};
250 # Now we have to run the transform on itself.
252 my $times = 0;
253 my $need_another_run = 1;
254 while ($need_another_run > 0) {
255 $need_another_run = 0;
256 $times++;
257 if ($times > 8) {
258 die "Look, we seem to be in some kind of a feedback loop here.";
261 foreach my $k (%$transform) {
262 my $prev = Dumper($transform->{$k});
263 $prev =~ s{\s+}{ }gs;
264 if (not exists($transform->{$k})) {
265 next;
268 my @new = ();
269 foreach my $v (@{$transform->{$k}}) {
270 if (exists($transform->{$v})) {
271 push @new, @{$transform->{$v}};
272 } else {
273 push @new, $v;
277 # I disagree that w~vv and m~rn
278 if ($#new == 1 and $new[0] eq "0072" and $new[1] eq "006E") {
279 $transform->{$k} = [ "006D" ];
280 } elsif ($#new == 1 and $new[0] eq "0076" and $new[1] eq "0076") {
281 $transform->{$k} = [ "0077" ];
282 } else {
283 $transform->{$k} = [ @new ];
286 my $now = Dumper($transform->{$k});
287 $now =~ s{\s+}{ }gs;
289 if ($prev ne $now) {
290 $need_another_run = 1;
295 print "/* Autogenerated by gen-unicode-transforms.pl */\n";
296 print "struct translate {\n";
297 print " wchar_t from_s;\n";
298 print " wchar_t from_t;\n";
299 print " const char *to;\n";
300 print "};\n";
301 print "\n";
302 print "static struct translate translates[] = {\n";
304 my $k_s = 0x20;
305 my $k_t = 0x20;
306 my $last_repl_str = "nope";
307 my $last_desc_str = "nope";
308 my $last_len = 0;
310 foreach my $k (sort { hex($a) <=> hex($b) } keys %$transform) {
311 my $kh = hex($k);
313 my $repl_str = "";
314 my $desc_str = "";
315 my $len = 0;
316 foreach my $v (@{$transform->{$k}}) {
317 my $vh = hex($v);
318 if ($vh > 0 and $vh < 127) {
319 $repl_str = $repl_str . chr($vh);
320 $len += 1;
321 } elsif ($vh < 0x10000) {
322 $repl_str = $repl_str . (sprintf "\\u%04x", $vh);
323 $len += 6;
324 } else {
325 $repl_str = $repl_str . (sprintf "\\U%08x", $vh);
326 $len += 10;
329 if (chr($vh) eq "\n") {
330 $desc_str = $desc_str . "\\n";
331 } elsif (chr($vh) eq "\t") {
332 $desc_str = $desc_str . "\\t";
333 } else {
334 $desc_str = $desc_str . chr($vh);
339 # Are we just continuing the range?
340 if ($kh == $k_t + 1 and $last_repl_str eq $repl_str) {
341 $k_t = $kh;
342 next;
345 # Okay, we're not. We've got to print out that last thing
346 if ($k_s != 0x20) {
347 print " {";
348 printf ".from_s = 0x%x", $k_s;
349 printf ", .from_t = 0x%x", $k_t;
350 printf ", .to = \"%s\" }, ", $last_repl_str;
351 print " "x(30 - $last_len);
352 printf "/* \"%s\"..\"%s\"", chr($k_s), chr($k_t);
353 printf " -> \"%s\" */\n", $last_desc_str;
356 # Now we start a new range
357 $k_s = $kh;
358 $k_t = $kh;
359 $last_repl_str = $repl_str;
360 $last_desc_str = $desc_str;
361 $last_len = $len;
364 # And print the last thing out
365 print " {";
366 printf ".from_s = 0x%x", $k_s;
367 printf ", .from_t = 0x%x", $k_t;
368 printf ", .to = \"%s\" }, ", $last_repl_str;
369 print " "x(20 - $last_len);
370 printf "/* \"%s\"..\"%s\"", chr($k_s), chr($k_t);
371 printf " -> \"%s\" */\n", $last_repl_str;
372 print "};";