msado15: Implement Dispatch functions in Fields.
[wine.git] / tools / make_unicode
blob93d6a9346c212316ce549daa9ad3efb6df47978e
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "13.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $REPORTS = "http://www.unicode.org/reports";
30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
33 # Sort keys file
34 my $SORTKEYS = "tr10/allkeys.txt";
36 # Default char for undefined mappings
37 my $DEF_CHAR = ord '?';
39 # Last valid Unicode character
40 my $MAX_CHAR = 0x10ffff;
42 my @allfiles =
44 "CodpageFiles/037.txt",
45 "CodpageFiles/437.txt",
46 "CodpageFiles/500.txt",
47 "CodpageFiles/708.txt",
48 "CodpageFiles/737.txt",
49 "CodpageFiles/775.txt",
50 "CodpageFiles/850.txt",
51 "CodpageFiles/852.txt",
52 "CodpageFiles/855.txt",
53 "CodpageFiles/857.txt",
54 "CodpageFiles/860.txt",
55 "CodpageFiles/861.txt",
56 "CodpageFiles/862.txt",
57 "CodpageFiles/863.txt",
58 "CodpageFiles/864.txt",
59 "CodpageFiles/865.txt",
60 "CodpageFiles/866.txt",
61 "CodpageFiles/869.txt",
62 "CodpageFiles/874.txt",
63 "CodpageFiles/875.txt",
64 "CodpageFiles/932.txt",
65 "CodpageFiles/936.txt",
66 "CodpageFiles/949.txt",
67 "CodpageFiles/950.txt",
68 "CodpageFiles/1026.txt",
69 "CodpageFiles/1250.txt",
70 "CodpageFiles/1251.txt",
71 "CodpageFiles/1252.txt",
72 "CodpageFiles/1253.txt",
73 "CodpageFiles/1254.txt",
74 "CodpageFiles/1255.txt",
75 "CodpageFiles/1256.txt",
76 "CodpageFiles/1257.txt",
77 "CodpageFiles/1258.txt",
78 "CodpageFiles/1361.txt",
79 "CodpageFiles/10000.txt",
80 "CodpageFiles/10001.txt",
81 "CodpageFiles/10002.txt",
82 "CodpageFiles/10003.txt",
83 "CodpageFiles/10004.txt",
84 "CodpageFiles/10005.txt",
85 "CodpageFiles/10006.txt",
86 "CodpageFiles/10007.txt",
87 "CodpageFiles/10008.txt",
88 "CodpageFiles/10010.txt",
89 "CodpageFiles/10017.txt",
90 "CodpageFiles/10021.txt",
91 "CodpageFiles/10029.txt",
92 "CodpageFiles/10079.txt",
93 "CodpageFiles/10081.txt",
94 "CodpageFiles/10082.txt",
95 "CodpageFiles/20127.txt",
96 "CodpageFiles/20866.txt",
97 "CodpageFiles/21866.txt",
98 "CodpageFiles/28591.txt",
99 "CodpageFiles/28592.txt",
100 "CodpageFiles/28593.txt",
101 "CodpageFiles/28594.txt",
102 "CodpageFiles/28595.txt",
103 "CodpageFiles/28596.txt",
104 "CodpageFiles/28597.txt",
105 "CodpageFiles/28598.txt",
106 "CodpageFiles/28599.txt",
107 "CodpageFiles/28603.txt",
108 "CodpageFiles/28605.txt",
112 my %ctype =
114 # CT_CTYPE1
115 "upper" => 0x0001,
116 "lower" => 0x0002,
117 "digit" => 0x0004,
118 "space" => 0x0008,
119 "punct" => 0x0010,
120 "cntrl" => 0x0020,
121 "blank" => 0x0040,
122 "xdigit" => 0x0080,
123 "alpha" => 0x0100 | 0x80000000,
124 "defin" => 0x0200,
125 # CT_CTYPE3 in high 16 bits
126 "nonspacing" => 0x00010000,
127 "diacritic" => 0x00020000,
128 "vowelmark" => 0x00040000,
129 "symbol" => 0x00080000,
130 "katakana" => 0x00100000,
131 "hiragana" => 0x00200000,
132 "halfwidth" => 0x00400000,
133 "fullwidth" => 0x00800000,
134 "ideograph" => 0x01000000,
135 "kashida" => 0x02000000,
136 "lexical" => 0x04000000,
137 "highsurrogate" => 0x08000000,
138 "lowsurrogate" => 0x10000000,
141 my %bracket_types =
143 "o" => 0x0000,
144 "c" => 0x0001,
147 my %indic_types =
149 "Other" => 0x0000,
150 "Bindu" => 0x0001,
151 "Visarga" => 0x0002,
152 "Avagraha" => 0x0003,
153 "Nukta" => 0x0004,
154 "Virama" => 0x0005,
155 "Vowel_Independent" => 0x0006,
156 "Vowel_Dependent" => 0x0007,
157 "Vowel" => 0x0008,
158 "Consonant_Placeholder" => 0x0009,
159 "Consonant" => 0x000a,
160 "Consonant_Dead" => 0x000b,
161 "Consonant_Succeeding_Repha" => 0x000c,
162 "Consonant_Subjoined" => 0x000d,
163 "Consonant_Medial" => 0x000e,
164 "Consonant_Final" => 0x000f,
165 "Consonant_Head_Letter" => 0x0010,
166 "Modifying_Letter" => 0x0011,
167 "Tone_Letter" => 0x0012,
168 "Tone_Mark" => 0x0013,
169 "Register_Shifter" => 0x0014,
170 "Consonant_Preceding_Repha" => 0x0015,
171 "Pure_Killer" => 0x0016,
172 "Invisible_Stacker" => 0x0017,
173 "Gemination_Mark" => 0x0018,
174 "Cantillation_Mark" => 0x0019,
175 "Non_Joiner" => 0x001a,
176 "Joiner" => 0x001b,
177 "Number_Joiner" => 0x001c,
178 "Number" => 0x001d,
179 "Brahmi_Joining_Number" => 0x001e,
180 "Consonant_With_Stacker" => 0x001f,
181 "Consonant_Prefixed" => 0x0020,
182 "Syllable_Modifier" => 0x0021,
183 "Consonant_Killer" => 0x0022,
184 "Consonant_Initial_Postfixed" => 0x0023,
187 my %matra_types =
189 "Right" => 0x01,
190 "Left" => 0x02,
191 "Visual_Order_Left" => 0x03,
192 "Left_And_Right" => 0x04,
193 "Top" => 0x05,
194 "Bottom" => 0x06,
195 "Top_And_Bottom" => 0x07,
196 "Top_And_Right" => 0x08,
197 "Top_And_Left" => 0x09,
198 "Top_And_Left_And_Right" => 0x0a,
199 "Bottom_And_Right" => 0x0b,
200 "Top_And_Bottom_And_Right" => 0x0c,
201 "Overstruck" => 0x0d,
202 "Invisible" => 0x0e,
203 "Bottom_And_Left" => 0x0f,
204 "Top_And_Bottom_And_Left" => 0x10,
207 my %break_types =
209 "BK" => 0x0001,
210 "CR" => 0x0002,
211 "LF" => 0x0003,
212 "CM" => 0x0004,
213 "SG" => 0x0005,
214 "GL" => 0x0006,
215 "CB" => 0x0007,
216 "SP" => 0x0008,
217 "ZW" => 0x0009,
218 "NL" => 0x000a,
219 "WJ" => 0x000b,
220 "JL" => 0x000c,
221 "JV" => 0x000d,
222 "JT" => 0x000e,
223 "H2" => 0x000f,
224 "H3" => 0x0010,
225 "XX" => 0x0011,
226 "OP" => 0x0012,
227 "CL" => 0x0013,
228 "CP" => 0x0014,
229 "QU" => 0x0015,
230 "NS" => 0x0016,
231 "EX" => 0x0017,
232 "SY" => 0x0018,
233 "IS" => 0x0019,
234 "PR" => 0x001a,
235 "PO" => 0x001b,
236 "NU" => 0x001c,
237 "AL" => 0x001d,
238 "ID" => 0x001e,
239 "IN" => 0x001f,
240 "HY" => 0x0020,
241 "BB" => 0x0021,
242 "BA" => 0x0022,
243 "SA" => 0x0023,
244 "AI" => 0x0024,
245 "B2" => 0x0025,
246 "HL" => 0x0026,
247 "CJ" => 0x0027,
248 "RI" => 0x0028,
249 "EB" => 0x0029,
250 "EM" => 0x002a,
251 "ZWJ" => 0x002b,
254 my %vertical_types =
256 "R" => 0x0000,
257 "U" => 0x0001,
258 "Tr" => 0x0002,
259 "Tu" => 0x0003,
262 my %categories =
264 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
265 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
266 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
267 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
268 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
269 "Me" => $ctype{"defin"}, # Mark, Enclosing
270 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
271 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
272 "No" => $ctype{"defin"}, # Number, Other
273 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
274 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
275 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
276 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
277 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
278 "Cs" => $ctype{"defin"}, # Other, Surrogate
279 "Co" => $ctype{"defin"}, # Other, Private Use
280 "Cn" => $ctype{"defin"}, # Other, Not Assigned
281 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
282 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
283 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
284 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
285 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
286 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
287 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
288 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
289 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
290 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
291 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
292 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
293 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
296 # a few characters need additional categories that cannot be determined automatically
297 my %special_categories =
299 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
300 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
301 "space" => [ 0x09..0x0d, 0x85 ],
302 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
303 "cntrl" => [ 0x070f, 0x200c, 0x200d,
304 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
305 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
306 0xfff9, 0xfffa, 0xfffb ],
307 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
308 0xd7, 0xf7 ],
309 "digit" => [ 0xb2, 0xb3, 0xb9 ],
310 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
311 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
312 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
313 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
314 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
315 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
316 0x02b9..0x02ba, 0x02c6..0x02cf ],
317 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
318 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
319 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
320 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
321 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
322 0x3131..0x3164 ],
323 "ideograph" => [ 0x3006..0x3007 ],
324 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
325 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
326 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
327 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
328 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
329 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
330 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
331 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
332 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
333 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
334 "kashida" => [ 0x0640 ],
337 my %directions =
339 "L" => 1, # Left-to-Right
340 "R" => 2, # Right-to-Left
341 "AL" => 12, # Right-to-Left Arabic
342 "EN" => 3, # European Number
343 "ES" => 4, # European Number Separator
344 "ET" => 5, # European Number Terminator
345 "AN" => 6, # Arabic Number
346 "CS" => 7, # Common Number Separator
347 "NSM" => 13, # Non-Spacing Mark
348 "BN" => 14, # Boundary Neutral
349 "B" => 8, # Paragraph Separator
350 "S" => 9, # Segment Separator
351 "WS" => 10, # Whitespace
352 "ON" => 11, # Other Neutrals
353 "LRE" => 15, # Left-to-Right Embedding
354 "LRO" => 15, # Left-to-Right Override
355 "RLE" => 15, # Right-to-Left Embedding
356 "RLO" => 15, # Right-to-Left Override
357 "PDF" => 15, # Pop Directional Format
358 "LRI" => 15, # Left-to-Right Isolate
359 "RLI" => 15, # Right-to-Left Isolate
360 "FSI" => 15, # First Strong Isolate
361 "PDI" => 15 # Pop Directional Isolate
364 my %c2_types =
366 "L" => 1, # C2_LEFTTORIGHT
367 "R" => 2, # C2_RIGHTTOLEFT
368 "AL" => 2, # C2_RIGHTTOLEFT
369 "EN" => 3, # C2_EUROPENUMBER
370 "ES" => 4, # C2_EUROPESEPARATOR
371 "ET" => 5, # C2_EUROPETERMINATOR
372 "AN" => 6, # C2_ARABICNUMBER
373 "CS" => 7, # C2_COMMONSEPARATOR
374 "NSM" => 11, # C2_OTHERNEUTRAL
375 "BN" => 0, # C2_NOTAPPLICABLE
376 "B" => 8, # C2_BLOCKSEPARATOR
377 "S" => 9, # C2_SEGMENTSEPARATOR
378 "WS" => 10, # C2_WHITESPACE
379 "ON" => 11, # C2_OTHERNEUTRAL
380 "LRE" => 11, # C2_OTHERNEUTRAL
381 "LRO" => 11, # C2_OTHERNEUTRAL
382 "RLE" => 11, # C2_OTHERNEUTRAL
383 "RLO" => 11, # C2_OTHERNEUTRAL
384 "PDF" => 11, # C2_OTHERNEUTRAL
385 "LRI" => 11, # C2_OTHERNEUTRAL
386 "RLI" => 11, # C2_OTHERNEUTRAL
387 "FSI" => 11, # C2_OTHERNEUTRAL
388 "PDI" => 11 # C2_OTHERNEUTRAL
391 my %bidi_types =
393 "ON" => 0, # Other Neutrals
394 "L" => 1, # Left-to-Right
395 "R" => 2, # Right-to-Left
396 "AN" => 3, # Arabic Number
397 "EN" => 4, # European Number
398 "AL" => 5, # Right-to-Left Arabic
399 "NSM" => 6, # Non-Spacing Mark
400 "CS" => 7, # Common Number Separator
401 "ES" => 8, # European Number Separator
402 "ET" => 9, # European Number Terminator
403 "BN" => 10, # Boundary Neutral
404 "S" => 11, # Segment Separator
405 "WS" => 12, # Whitespace
406 "B" => 13, # Paragraph Separator
407 "RLO" => 14, # Right-to-Left Override
408 "RLE" => 15, # Right-to-Left Embedding
409 "LRO" => 16, # Left-to-Right Override
410 "LRE" => 17, # Left-to-Right Embedding
411 "PDF" => 18, # Pop Directional Format
412 "LRI" => 19, # Left-to-Right Isolate
413 "RLI" => 20, # Right-to-Left Isolate
414 "FSI" => 21, # First Strong Isolate
415 "PDI" => 22 # Pop Directional Isolate
418 my %joining_types =
420 "U" => 0, # Non_Joining
421 "L" => 1, # Left_Joining
422 "R" => 2, # Right_Joining
423 "D" => 3, # Dual_Joining
424 "C" => 3, # Join_Causing
425 "ALAPH" => 4, # Syriac ALAPH
426 "DALATH RISH" => 5, # Syriac DALATH RISH group
427 "T" => 6, # Transparent
430 my @cp2uni = ();
431 my @glyph2uni = ();
432 my @lead_bytes = ();
433 my @uni2cp = ();
434 my @tolower_table = ();
435 my @toupper_table = ();
436 my @digitmap_table = ();
437 my @category_table = ();
438 my @initial_joining_table = ();
439 my @direction_table = ();
440 my @decomp_table = ();
441 my @combining_class_table = ();
442 my @decomp_compat_table = ();
443 my @comp_exclusions = ();
444 my @idna_decomp_table = ();
445 my @idna_disallowed = ();
446 my %registry_keys;
447 my $default_char;
448 my $default_wchar;
450 my %joining_forms =
452 "isolated" => [],
453 "final" => [],
454 "initial" => [],
455 "medial" => []
458 sub to_utf16(@)
460 my @ret;
461 foreach my $ch (@_)
463 if ($ch < 0x10000)
465 push @ret, $ch;
467 else
469 my $val = $ch - 0x10000;
470 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
473 return @ret;
476 ################################################################
477 # fetch a unicode.org file and open it
478 sub open_data_file($$)
480 my ($base, $name) = @_;
481 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
482 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
483 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
484 local *FILE;
486 if ($base =~ /.*\/([^\/]+)\.zip$/)
488 my $zip = "$1$suffix.zip";
489 unless (-f "$cache/$zip")
491 system "mkdir", "-p", $cache;
492 print "Fetching $base...\n";
493 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
495 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
497 else
499 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
500 unless (-f $dest)
502 system "mkdir", "-p", $dir;
503 print "Fetching $base/$name...\n";
504 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
506 open FILE, "<$dest" or die "cannot open $dest";
508 return *FILE;
511 ################################################################
512 # recursively get the decomposition for a character
513 sub get_decomposition($$);
514 sub get_decomposition($$)
516 my ($char, $table) = @_;
517 my @ret;
519 return $char unless defined $table->[$char];
520 foreach my $ch (@{$table->[$char]})
522 push @ret, get_decomposition( $ch, $table );
524 return @ret;
527 ################################################################
528 # get the composition that results in a given character
529 sub get_composition($$)
531 my ($ch, $compat) = @_;
532 return () unless defined $decomp_table[$ch]; # no decomposition
533 my @ret = @{$decomp_table[$ch]};
534 return () if @ret < 2; # singleton decomposition
535 return () if $comp_exclusions[$ch]; # composition exclusion
536 return () if $combining_class_table[$ch]; # non-starter
537 return () if $combining_class_table[$ret[0]]; # first char is non-starter
538 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
539 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
540 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
541 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
542 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
543 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
544 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
545 return @ret;
548 ################################################################
549 # recursively build decompositions
550 sub build_decompositions(@)
552 my @src = @_;
553 my @dst;
555 for (my $i = 0; $i < @src; $i++)
557 next unless defined $src[$i];
558 my @decomp = to_utf16( get_decomposition( $i, \@src ));
559 $dst[$i] = \@decomp;
561 return @dst;
564 ################################################################
565 # compose Hangul sequences
566 sub compose_hangul(@)
568 my $SBASE = 0xac00;
569 my $LBASE = 0x1100;
570 my $VBASE = 0x1161;
571 my $TBASE = 0x11a7;
572 my $LCOUNT = 19;
573 my $VCOUNT = 21;
574 my $TCOUNT = 28;
575 my $NCOUNT = $VCOUNT * $TCOUNT;
576 my $SCOUNT = $LCOUNT * $NCOUNT;
578 my @seq = @_;
579 my @ret;
580 my $i;
582 for ($i = 0; $i < @seq; $i++)
584 my $ch = $seq[$i];
585 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
586 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
588 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
589 $i++;
591 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
592 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
594 $ch += $seq[$i+1] - $TBASE;
595 $i++;
597 push @ret, $ch;
599 return @ret;
602 ################################################################
603 # remove linguistic-only mappings from the case table
604 sub remove_linguistic_mappings($$)
606 my ($upper, $lower) = @_;
608 # remove case mappings that don't round-trip
610 for (my $i = 0; $i < @{$upper}; $i++)
612 next unless defined ${$upper}[$i];
613 my $ch = ${$upper}[$i];
614 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
616 for (my $i = 0; $i < @{$lower}; $i++)
618 next unless defined ${$lower}[$i];
619 my $ch = ${$lower}[$i];
620 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
624 ################################################################
625 # read in the Unicode database files
626 sub load_data()
628 my $start;
630 # now build mappings from the decomposition field of the Unicode database
632 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
633 while (<$UNICODE_DATA>)
635 # Decode the fields ...
636 my ($code, $name, $cat, $comb, $bidi,
637 $decomp, $dec, $dig, $num, $mirror,
638 $oldname, $comment, $upper, $lower, $title) = split /;/;
639 my $src = hex $code;
641 die "unknown category $cat" unless defined $categories{$cat};
642 die "unknown directionality $bidi" unless defined $directions{$bidi};
644 $category_table[$src] = $categories{$cat};
645 $direction_table[$src] = $bidi;
646 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
648 $initial_joining_table[$src] = $joining_types{"T"};
650 else
652 $initial_joining_table[$src] = $joining_types{"U"};
655 if ($lower ne "")
657 $tolower_table[$src] = hex $lower;
659 if ($upper ne "")
661 $toupper_table[$src] = hex $upper;
663 if ($dec ne "")
665 $category_table[$src] |= $ctype{"digit"};
667 if ($dig ne "")
669 $digitmap_table[$src] = ord $dig;
671 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
673 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
674 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
675 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
676 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
677 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
678 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
679 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
680 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
681 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
682 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
683 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
684 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
686 # copy the category and direction for everything between First/Last pairs
687 if ($name =~ /, First>/) { $start = $src; }
688 if ($name =~ /, Last>/)
690 while ($start < $src)
692 $category_table[$start] = $category_table[$src];
693 $direction_table[$start] = $direction_table[$src];
694 $combining_class_table[$start] = $combining_class_table[$src];
695 $start++;
699 next if $decomp eq ""; # no decomposition, skip it
701 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
703 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
704 $decomp_compat_table[$src] = \@seq;
707 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
709 # decomposition of the form "<foo> 1234" -> use char if type is known
710 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
712 ${joining_forms{$1}}[hex $2] = $src;
715 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
717 # decomposition "<compat> 0020 1234" -> combining accent
719 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
721 # store decomposition
722 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
724 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
726 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
728 # Single char decomposition
729 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
733 close $UNICODE_DATA;
735 # patch the category of some special characters
737 for (my $i = 0; $i < @decomp_table; $i++)
739 next unless defined $decomp_table[$i];
740 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
742 foreach my $cat (keys %special_categories)
744 my $flag = $ctype{$cat};
745 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
747 for (my $i = 0; $i < @decomp_compat_table; $i++)
749 next unless defined $decomp_compat_table[$i];
750 next unless @{$decomp_compat_table[$i]} == 2;
751 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
754 # load the composition exclusions
756 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
757 while (<$EXCL>)
759 s/\#.*//; # remove comments
760 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
762 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
764 elsif (/^([0-9a-fA-F]+)\s*$/)
766 $comp_exclusions[hex $1] = 1;
769 close $EXCL;
771 # load the IDNA mappings
773 @idna_decomp_table = @decomp_compat_table;
774 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
775 while (<$IDNA>)
777 s/\#.*//; # remove comments
778 next if /^\s*$/;
779 my ($char, $type, $mapping) = split /;/;
780 my ($ch1, $ch2);
781 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
783 $ch1 = hex $1;
784 $ch2 = hex $2;
786 elsif ($char =~ /([0-9a-fA-F]+)/)
788 $ch1 = $ch2 = hex $1;
791 if ($type =~ /mapped/ || $type =~ /deviation/)
793 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
794 my @seq = map { hex $_; } split /\s+/, $mapping;
795 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
797 elsif ($type =~ /valid/)
800 elsif ($type =~ /ignored/)
802 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
804 elsif ($type =~ /disallowed/)
806 foreach my $i ($ch1 .. $ch2)
808 $idna_decomp_table[$i] = undef;
809 $idna_disallowed[$i] = 1;
813 close $IDNA;
817 ################################################################
818 # add a new registry key
819 sub add_registry_key($$)
821 my ($key, $defval) = @_;
822 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
825 ################################################################
826 # add a new registry value
827 sub add_registry_value($$$)
829 my ($key, $name, $value) = @_;
830 add_registry_key( $key, undef );
831 push @{$registry_keys{$key}}, "'$name' = s '$value'";
834 ################################################################
835 # define a new lead byte
836 sub add_lead_byte($)
838 my $ch = shift;
839 return if defined $cp2uni[$ch];
840 push @lead_bytes, $ch;
841 $cp2uni[$ch] = 0;
844 ################################################################
845 # define a new char mapping
846 sub add_mapping($$)
848 my ($cp, $uni) = @_;
849 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
850 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
851 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
854 ################################################################
855 # get a mapping including glyph chars for MB_USEGLYPHCHARS
856 sub get_glyphs_mapping(@)
858 my @table = @_;
860 for (my $i = 0; $i < @glyph2uni; $i++)
862 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
864 return @table;
867 ################################################################
868 # build EUC-JP table from the JIS 0208/0212 files
869 sub dump_eucjp_codepage()
871 @cp2uni = ();
872 @glyph2uni = ();
873 @lead_bytes = ();
874 @uni2cp = ();
875 $default_char = $DEF_CHAR;
876 $default_wchar = 0x30fb;
878 # ASCII chars
879 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
881 # lead bytes
882 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
884 # JIS X 0201 right plane
885 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
887 # undefined chars
888 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
889 $cp2uni[0xa0] = 0xf8f0;
890 $cp2uni[0xff] = 0xf8f3;
892 # Fix backslash conversion
893 add_mapping( 0xa1c0, 0xff3c );
895 # Add private mappings for rows undefined in JIS 0208/0212
896 my $private = 0xe000;
897 foreach my $hi (0xf5 .. 0xfe)
899 foreach my $lo (0xa1 .. 0xfe)
901 add_mapping( ($hi << 8) + $lo, $private++ );
904 foreach my $hi (0xf5 .. 0xfe)
906 foreach my $lo (0x21 .. 0x7e)
908 add_mapping( ($hi << 8) + $lo, $private++ );
912 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
913 while (<$INPUT>)
915 next if /^\#/; # skip comments
916 next if /^$/; # skip empty lines
917 next if /\x1a/; # skip ^Z
918 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
920 add_mapping( 0x8080 + hex $1, hex $2 );
921 next;
923 die "Unrecognized line $_\n";
925 close $INPUT;
927 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
928 while (<$INPUT>)
930 next if /^\#/; # skip comments
931 next if /^$/; # skip empty lines
932 next if /\x1a/; # skip ^Z
933 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
935 add_mapping( 0x8000 + hex $1, hex $2 );
936 next;
938 die "Unrecognized line $_\n";
940 close $INPUT;
942 output_codepage_file( 20932 );
946 ################################################################
947 # build the sort keys table
948 sub dump_sortkeys($)
950 my $filename = shift;
951 my @sortkeys = ();
953 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
954 while (<$INPUT>)
956 next if /^\#/; # skip comments
957 next if /^$/; # skip empty lines
958 next if /\x1a/; # skip ^Z
959 next if /^\@version/; # skip @version header
960 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
962 my ($uni,$variable) = (hex $1, $2);
963 next if $uni > 65535;
964 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
965 next;
967 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
969 # multiple character sequence, ignored for now
970 next;
972 die "$SORTKEYS: Unrecognized line $_\n";
974 close $INPUT;
976 # compress the keys to 32 bit:
977 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
979 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
980 ${$a}[2] <=> ${$b}[2] or
981 ${$a}[3] <=> ${$b}[3] or
982 ${$a}[4] <=> ${$b}[4] or
983 $a cmp $b; } @sortkeys;
985 my ($n2, $n3) = (1, 1);
986 my @keys = (-1, -1, -1, -1, -1 );
987 my @flatkeys = ();
989 for (my $i = 0; $i < @sortkeys; $i++)
991 next unless defined $sortkeys[$i];
992 my @current = @{$sortkeys[$i]};
993 if ($current[1] == $keys[1])
995 if ($current[2] == $keys[2])
997 if ($current[3] == $keys[3])
999 # nothing
1001 else
1003 $keys[3] = $current[3];
1004 $n3++;
1005 die if ($n3 >= 16);
1008 else
1010 $keys[2] = $current[2];
1011 $keys[3] = $current[3];
1012 $n2++;
1013 $n3 = 1;
1014 die if ($n2 >= 256);
1017 else
1019 $keys[1] = $current[1];
1020 $keys[2] = $current[2];
1021 $keys[3] = $current[3];
1022 $n2 = 1;
1023 $n3 = 1;
1026 if ($current[2]) { $current[2] = $n2; }
1027 if ($current[3]) { $current[3] = $n3; }
1028 if ($current[4]) { $current[4] = 1; }
1030 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1033 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1034 printf "Building $filename\n";
1035 printf OUTPUT "/* Unicode collation element table */\n";
1036 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1037 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1038 print OUTPUT "#include \"windef.h\"\n\n";
1040 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1042 close OUTPUT;
1043 save_file($filename);
1047 ################################################################
1048 # dump an array of integers
1049 sub dump_array($$@)
1051 my ($bit_width, $default, @array) = @_;
1052 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1053 my $i;
1054 my $ret = " ";
1055 for ($i = 0; $i < $#array; $i++)
1057 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1058 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1060 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1061 return $ret;
1065 ################################################################
1066 # dump an SBCS mapping table in binary format
1067 sub dump_binary_sbcs_table($)
1069 my $codepage = shift;
1071 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1072 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1074 print OUTPUT pack "S<*", @header;
1075 print OUTPUT pack "C12", (0) x 12;
1076 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1078 if (@glyph2uni)
1080 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1082 else
1084 print OUTPUT pack "S<*", 0;
1087 print OUTPUT pack "S<*", 0, 0;
1089 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1093 ################################################################
1094 # dump a DBCS mapping table in binary format
1095 sub dump_binary_dbcs_table($)
1097 my $codepage = shift;
1098 my @lb_ranges = get_lb_ranges();
1099 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1101 my @offsets = (0) x 256;
1102 my $pos = 0;
1103 foreach my $i (@lead_bytes)
1105 $offsets[$i] = ($pos += 256);
1106 $cp2uni[$i] = 0;
1109 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1111 print OUTPUT pack "S<*", @header;
1112 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1113 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1114 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1116 foreach my $i (@lead_bytes)
1118 my $base = $i << 8;
1119 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1122 print OUTPUT pack "S<", 4;
1123 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1127 ################################################################
1128 # get the list of defined lead byte ranges
1129 sub get_lb_ranges()
1131 my @list = ();
1132 my @ranges = ();
1134 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1135 my $on = 0;
1136 for (my $i = 0; $i < 256; $i++)
1138 if ($on)
1140 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1142 else
1144 if ($list[$i]) { push @ranges, $i; $on = 1; }
1147 if ($on) { push @ranges, 0xff; }
1148 return @ranges;
1151 ################################################################
1152 # dump the Indic Syllabic Category table
1153 sub dump_indic($)
1155 my $filename = shift;
1156 my @indic_table;
1158 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1159 while (<$INPUT>)
1161 next if /^\#/; # skip comments
1162 next if /^\s*$/; # skip empty lines
1163 next if /\x1a/; # skip ^Z
1164 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1166 my $type = $2;
1167 die "unknown indic $type" unless defined $indic_types{$type};
1168 if (hex $1 < 65536)
1170 $indic_table[hex $1] = $indic_types{$type};
1172 next;
1174 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1176 my $type = $3;
1177 die "unknown indic $type" unless defined $indic_types{$type};
1178 if (hex $1 < 65536 and hex $2 < 65536)
1180 foreach my $i (hex $1 .. hex $2)
1182 $indic_table[$i] = $indic_types{$type};
1185 next;
1187 die "malformed line $_";
1189 close $INPUT;
1191 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1192 while (<$INPUT>)
1194 next if /^\#/; # skip comments
1195 next if /^\s*$/; # skip empty lines
1196 next if /\x1a/; # skip ^Z
1197 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1199 my $type = $2;
1200 die "unknown matra $type" unless defined $matra_types{$type};
1201 $indic_table[hex $1] |= $matra_types{$type} << 8;
1202 next;
1204 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1206 my $type = $3;
1207 die "unknown matra $type" unless defined $matra_types{$type};
1208 foreach my $i (hex $1 .. hex $2)
1210 $indic_table[$i] |= $matra_types{$type} << 8;
1212 next;
1214 die "malformed line $_";
1216 close $INPUT;
1218 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1219 print "Building $filename\n";
1220 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1221 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1222 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1223 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1224 print OUTPUT "#include \"windef.h\"\n\n";
1226 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1228 close OUTPUT;
1229 save_file($filename);
1232 ################################################################
1233 # dump the Line Break Properties table
1234 sub dump_linebreak($)
1236 my $filename = shift;
1237 my @break_table;
1239 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1240 while (<$INPUT>)
1242 next if /^\#/; # skip comments
1243 next if /^\s*$/; # skip empty lines
1244 next if /\x1a/; # skip ^Z
1245 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1247 my $type = $2;
1248 die "unknown breaktype $type" unless defined $break_types{$type};
1249 $break_table[hex $1] = $break_types{$type};
1250 next;
1252 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1254 my $type = $3;
1255 die "unknown breaktype $type" unless defined $break_types{$type};
1256 foreach my $i (hex $1 .. hex $2)
1258 $break_table[$i] = $break_types{$type};
1260 next;
1262 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1264 my $type = $2;
1265 die "unknown breaktype $type" unless defined $break_types{$type};
1266 $break_table[hex $1] = $break_types{$type};
1267 next;
1269 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1271 my $type = $3;
1272 die "unknown breaktype $type" unless defined $break_types{$type};
1273 foreach my $i (hex $1 .. hex $2)
1275 $break_table[$i] = $break_types{$type};
1277 next;
1279 die "malformed line $_";
1281 close $INPUT;
1283 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1284 print "Building $filename\n";
1285 print OUTPUT "/* Unicode Line Break Properties */\n";
1286 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1287 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1288 print OUTPUT "#include \"windef.h\"\n\n";
1290 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1292 close OUTPUT;
1293 save_file($filename);
1296 my %scripts =
1298 "Unknown" => 0,
1299 "Common" => 1,
1300 "Inherited" => 2,
1301 "Arabic" => 3,
1302 "Armenian" => 4,
1303 "Avestan" => 5,
1304 "Balinese" => 6,
1305 "Bamum" => 7,
1306 "Batak" => 8,
1307 "Bengali" => 9,
1308 "Bopomofo" => 10,
1309 "Brahmi" => 11,
1310 "Braille" => 12,
1311 "Buginese" => 13,
1312 "Buhid" => 14,
1313 "Canadian_Aboriginal" => 15,
1314 "Carian" => 16,
1315 "Cham" => 17,
1316 "Cherokee" => 18,
1317 "Coptic" => 19,
1318 "Cuneiform" => 20,
1319 "Cypriot" => 21,
1320 "Cyrillic" => 22,
1321 "Deseret" => 23,
1322 "Devanagari" => 24,
1323 "Egyptian_Hieroglyphs" => 25,
1324 "Ethiopic" => 26,
1325 "Georgian" => 27,
1326 "Glagolitic" => 28,
1327 "Gothic" => 29,
1328 "Greek" => 30,
1329 "Gujarati" => 31,
1330 "Gurmukhi" => 32,
1331 "Han" => 33,
1332 "Hangul" => 34,
1333 "Hanunoo" => 35,
1334 "Hebrew" => 36,
1335 "Hiragana" => 37,
1336 "Imperial_Aramaic" => 38,
1337 "Inscriptional_Pahlavi" => 39,
1338 "Inscriptional_Parthian" => 40,
1339 "Javanese" => 41,
1340 "Kaithi" => 42,
1341 "Kannada" => 43,
1342 "Katakana" => 44,
1343 "Kayah_Li" => 45,
1344 "Kharoshthi" => 46,
1345 "Khmer" => 47,
1346 "Lao" => 48,
1347 "Latin" => 49,
1348 "Lepcha" => 50,
1349 "Limbu" => 51,
1350 "Linear_B" => 52,
1351 "Lisu" => 53,
1352 "Lycian" => 54,
1353 "Lydian" => 55,
1354 "Malayalam" => 56,
1355 "Mandaic" => 57,
1356 "Meetei_Mayek" => 58,
1357 "Mongolian" => 59,
1358 "Myanmar" => 60,
1359 "New_Tai_Lue" => 61,
1360 "Nko" => 62,
1361 "Ogham" => 63,
1362 "Ol_Chiki" => 64,
1363 "Old_Italic" => 65,
1364 "Old_Persian" => 66,
1365 "Old_South_Arabian" => 67,
1366 "Old_Turkic" => 68,
1367 "Oriya" => 69,
1368 "Osmanya" => 70,
1369 "Phags_Pa" => 71,
1370 "Phoenician" => 72,
1371 "Rejang" => 73,
1372 "Runic" => 74,
1373 "Samaritan" => 75,
1374 "Saurashtra" => 76,
1375 "Shavian" => 77,
1376 "Sinhala" => 78,
1377 "Sundanese" => 79,
1378 "Syloti_Nagri" => 80,
1379 "Syriac" => 81,
1380 "Tagalog" => 82,
1381 "Tagbanwa" => 83,
1382 "Tai_Le" => 84,
1383 "Tai_Tham" => 85,
1384 "Tai_Viet" => 86,
1385 "Tamil" => 87,
1386 "Telugu" => 88,
1387 "Thaana" => 89,
1388 "Thai" => 90,
1389 "Tibetan" => 91,
1390 "Tifinagh" => 92,
1391 "Ugaritic" => 93,
1392 "Vai" => 94,
1393 "Yi" => 95,
1394 # Win8/Win8.1
1395 "Chakma" => 96,
1396 "Meroitic_Cursive" => 97,
1397 "Meroitic_Hieroglyphs" => 98,
1398 "Miao" => 99,
1399 "Sharada" => 100,
1400 "Sora_Sompeng" => 101,
1401 "Takri" => 102,
1402 # Win10
1403 "Bassa_Vah" => 103,
1404 "Caucasian_Albanian" => 104,
1405 "Duployan" => 105,
1406 "Elbasan" => 106,
1407 "Grantha" => 107,
1408 "Khojki" => 108,
1409 "Khudawadi" => 109,
1410 "Linear_A" => 110,
1411 "Mahajani" => 111,
1412 "Manichaean" => 112,
1413 "Mende_Kikakui" => 113,
1414 "Modi" => 114,
1415 "Mro" => 115,
1416 "Nabataean" => 116,
1417 "Old_North_Arabian" => 117,
1418 "Old_Permic" => 118,
1419 "Pahawh_Hmong" => 119,
1420 "Palmyrene" => 120,
1421 "Pau_Cin_Hau" => 121,
1422 "Psalter_Pahlavi" => 122,
1423 "Siddham" => 123,
1424 "Tirhuta" => 124,
1425 "Warang_Citi" => 125,
1426 # Win10 RS1
1427 "Adlam" => 126,
1428 "Ahom" => 127,
1429 "Anatolian_Hieroglyphs" => 128,
1430 "Bhaiksuki" => 129,
1431 "Hatran" => 130,
1432 "Marchen" => 131,
1433 "Multani" => 132,
1434 "Newa" => 133,
1435 "Old_Hungarian" => 134,
1436 "Osage" => 135,
1437 "SignWriting" => 136,
1438 "Tangut" => 137,
1439 # Win10 RS4
1440 "Masaram_Gondi" => 138,
1441 "Nushu" => 139,
1442 "Soyombo" => 140,
1443 "Zanabazar_Square" => 141,
1444 # Win10 1903
1445 "Dogra" => 142,
1446 "Gunjala_Gondi" => 143,
1447 "Hanifi_Rohingya" => 144,
1448 "Makasar" => 145,
1449 "Medefaidrin" => 146,
1450 "Old_Sogdian" => 147,
1451 "Sogdian" => 148,
1452 # Win10 2004
1453 "Elymaic" => 149,
1454 "Nyiakeng_Puachue_Hmong" => 150,
1455 "Nandinagari" => 151,
1456 "Wancho" => 152,
1459 ################################################################
1460 # dump Script IDs table
1461 sub dump_scripts($)
1463 my $filename = shift;
1464 my $header = $filename;
1465 my @scripts_table;
1466 my $script_index;
1467 my $i;
1469 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1470 # Fill the table
1471 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1472 while (<$INPUT>)
1474 my $type = "";
1476 next if /^\#/; # skip comments
1477 next if /^\s*$/; # skip empty lines
1478 next if /\x1a/; # skip ^Z
1479 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1481 $type = $2;
1482 if (defined $scripts{$type})
1484 $scripts_table[hex $1] = $scripts{$type};
1486 next;
1488 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1490 $type = $3;
1491 if (defined $scripts{$type})
1493 foreach my $i (hex $1 .. hex $2)
1495 $scripts_table[$i] = $scripts{$type};
1498 next;
1502 close $INPUT;
1504 $header = "$filename.h";
1505 open OUTPUT,">$header.new" or die "Cannot create $header";
1506 print "Building $header\n";
1507 print OUTPUT "/* Unicode Script IDs */\n";
1508 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1509 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1511 print OUTPUT "enum unicode_script_id {\n";
1512 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1514 print OUTPUT " Script_$script = $scripts{$script},\n";
1516 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1517 print OUTPUT "};\n";
1519 close OUTPUT;
1520 save_file($header);
1522 $filename = "$filename.c";
1523 open OUTPUT,">$filename.new" or die "Cannot create $header";
1524 print "Building $filename\n";
1525 print OUTPUT "/* Unicode Script IDs */\n";
1526 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1527 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1528 print OUTPUT "#include \"windef.h\"\n\n";
1530 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1531 close OUTPUT;
1532 save_file($filename);
1535 ################################################################
1536 # dump the BiDi mirroring table
1537 sub dump_mirroring($)
1539 my $filename = shift;
1540 my @mirror_table = ();
1542 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1543 while (<$INPUT>)
1545 next if /^\#/; # skip comments
1546 next if /^$/; # skip empty lines
1547 next if /\x1a/; # skip ^Z
1548 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1550 $mirror_table[hex $1] = hex $2;
1551 next;
1553 die "malformed line $_";
1555 close $INPUT;
1557 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1558 print "Building $filename\n";
1559 print OUTPUT "/* Unicode BiDi mirroring */\n";
1560 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1561 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1562 print OUTPUT "#include \"windef.h\"\n\n";
1563 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1564 close OUTPUT;
1565 save_file($filename);
1568 ################################################################
1569 # dump the Bidi Brackets
1570 sub dump_bracket($)
1572 my $filename = shift;
1573 my @bracket_table;
1575 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1576 while (<$INPUT>)
1578 next if /^\#/; # skip comments
1579 next if /^\s*$/; # skip empty lines
1580 next if /\x1a/; # skip ^Z
1581 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1583 my $type = $3;
1584 die "unknown bracket $type" unless defined $bracket_types{$type};
1585 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1586 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1587 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1588 next;
1590 die "malformed line $_";
1592 close $INPUT;
1594 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1595 print "Building $filename\n";
1596 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1597 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1598 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1599 print OUTPUT "#include \"windef.h\"\n\n";
1601 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1603 close OUTPUT;
1604 save_file($filename);
1607 ################################################################
1608 # dump the Arabic shaping table
1609 sub dump_shaping($)
1611 my $filename = shift;
1612 my @joining_table = @initial_joining_table;
1614 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1615 while (<$INPUT>)
1617 next if /^\#/; # skip comments
1618 next if /^\s*$/; # skip empty lines
1619 next if /\x1a/; # skip ^Z
1620 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1622 my $type = $2;
1623 $joining_table[hex $1] = $joining_types{$type};
1624 next;
1626 die "malformed line $_";
1628 close $INPUT;
1630 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1631 print "Building $filename\n";
1632 print OUTPUT "/* Unicode Arabic shaping */\n";
1633 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1634 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1635 print OUTPUT "#include \"windef.h\"\n\n";
1637 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1639 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1640 for (my $i = 0x600; $i <= 0x6ff; $i++)
1642 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1643 ${joining_forms{"isolated"}}[$i] || $i,
1644 ${joining_forms{"final"}}[$i] || $i,
1645 ${joining_forms{"initial"}}[$i] || $i,
1646 ${joining_forms{"medial"}}[$i] || $i;
1648 print OUTPUT "};\n";
1650 close OUTPUT;
1651 save_file($filename);
1654 ################################################################
1655 # dump the Arabic shaping table
1656 sub dump_arabic_shaping($)
1658 my $filename = shift;
1659 my @joining_table = @initial_joining_table;
1661 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1662 while (<$INPUT>)
1664 next if /^\#/; # skip comments
1665 next if /^\s*$/; # skip empty lines
1666 next if /\x1a/; # skip ^Z
1667 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1669 my $type = $2;
1670 my $group = $3;
1672 if ($group eq "ALAPH" || $group eq "DALATH RISH")
1674 $joining_table[hex $1] = $joining_types{$group};
1676 else
1678 $joining_table[hex $1] = $joining_types{$type};
1681 next;
1683 die "malformed line $_";
1685 close $INPUT;
1687 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1688 print "Building $filename\n";
1689 print OUTPUT "/* Unicode Arabic shaping */\n";
1690 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1691 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1692 print OUTPUT "#include \"windef.h\"\n\n";
1694 dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );
1696 close OUTPUT;
1697 save_file($filename);
1700 ################################################################
1701 # dump the Vertical Orientation table
1702 sub dump_vertical($)
1704 my $filename = shift;
1705 my @vertical_table;
1707 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1708 while (<$INPUT>)
1710 next if /^\#/; # skip comments
1711 next if /^\s*$/; # skip empty lines
1712 next if /\x1a/; # skip ^Z
1713 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1715 my $type = $2;
1716 die "unknown vertical $type" unless defined $vertical_types{$type};
1717 if (hex $1 < 65536)
1719 $vertical_table[hex $1] = $vertical_types{$type};
1721 next;
1723 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1725 my $type = $3;
1726 die "unknown vertical $type" unless defined $vertical_types{$type};
1727 foreach my $i (hex $1 .. hex $2)
1729 $vertical_table[$i] = $vertical_types{$type};
1731 next;
1733 die "malformed line $_";
1735 close $INPUT;
1737 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1738 print "Building $filename\n";
1739 print OUTPUT "/* Unicode Vertical Orientation */\n";
1740 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1741 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1742 print OUTPUT "#include \"windef.h\"\n\n";
1744 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1746 close OUTPUT;
1747 save_file($filename);
1750 ################################################################
1751 # dump the digit folding tables
1752 sub dump_digit_folding($)
1754 my ($filename) = shift;
1755 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1756 print "Building $filename\n";
1757 print OUTPUT "/* Unicode digit folding mappings */\n";
1758 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1759 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1760 print OUTPUT "#include \"windef.h\"\n\n";
1762 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1763 close OUTPUT;
1764 save_file($filename);
1768 ################################################################
1769 # compress a mapping table by removing identical rows
1770 sub compress_array($$@)
1772 my $rows = shift;
1773 my $def = shift;
1774 my @table = @_;
1775 my $len = @table / $rows;
1776 my @array;
1777 my $data = "";
1779 # try to merge table rows
1780 for (my $row = 0; $row < $rows; $row++)
1782 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1783 my $pos = index $data, $rowtxt;
1784 if ($pos == -1)
1786 # check if the tail of the data can match the start of the new row
1787 my $first = substr( $rowtxt, 0, 1 );
1788 for (my $i = length($data) - 1; $i > 0; $i--)
1790 $pos = index( substr( $data, -$i ), $first );
1791 last if $pos == -1;
1792 $i -= $pos;
1793 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1794 substr( $data, -$i ) = "";
1795 last;
1797 $pos = length $data;
1798 $data .= $rowtxt;
1800 $array[$row] = $rows + $pos;
1802 return @array, unpack "U*", $data;
1805 ################################################################
1806 # dump a char -> 16-bit value mapping table using two-level tables
1807 sub dump_two_level_mapping($$@)
1809 my $name = shift;
1810 my $def = shift;
1811 my $size = shift;
1812 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1813 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1814 my @array = compress_array( 256, 0, @row_array[0..4095] );
1816 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1818 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1819 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1820 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1821 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1824 ################################################################
1825 # dump a char -> value mapping table using three-level tables
1826 sub dump_three_level_mapping($$@)
1828 my $name = shift;
1829 my $def = shift;
1830 my $size = shift;
1831 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1832 my $level3 = ($MAX_CHAR + 1) / 16;
1833 my $level2 = $level3 / 16;
1834 my $level1 = $level2 / 16;
1835 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1836 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1837 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1839 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1840 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1842 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1843 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1844 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1845 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1846 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1849 ################################################################
1850 # dump a binary case mapping table in l_intl.nls format
1851 sub dump_binary_case_table(@)
1853 my (@table) = @_;
1854 my $max_char = 0x10000;
1855 my $level1 = $max_char / 16;
1856 my $level2 = $level1 / 16;
1858 my @difftable;
1859 for (my $i = 0; $i < @table; $i++)
1861 next unless defined $table[$i];
1862 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1865 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1866 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1867 my $offset = @array - $level1;
1868 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1869 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1872 ################################################################
1873 # dump case mappings for l_intl.nls
1874 sub dump_intl_nls($)
1876 my @upper_table = @toupper_table;
1877 my @lower_table = @tolower_table;
1878 remove_linguistic_mappings( \@upper_table, \@lower_table );
1880 my $upper = dump_binary_case_table( @upper_table );
1881 my $lower = dump_binary_case_table( @lower_table );
1883 my $filename = shift;
1884 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1885 printf "Building $filename\n";
1887 binmode OUTPUT;
1888 print OUTPUT pack "S<", 1; # version
1889 print OUTPUT $upper;
1890 print OUTPUT $lower;
1891 close OUTPUT;
1892 save_file($filename);
1896 ################################################################
1897 # dump the bidi direction table
1898 sub dump_bidi_dir_table($)
1900 my $filename = shift;
1901 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1902 printf "Building $filename\n";
1903 printf OUTPUT "/* Unicode BiDi direction table */\n";
1904 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1905 printf OUTPUT "#include \"windef.h\"\n\n";
1907 my @table;
1909 for (my $i = 0; $i < 65536; $i++)
1911 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1914 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1916 close OUTPUT;
1917 save_file($filename);
1921 sub rol($$)
1923 my ($byte, $count) = @_;
1924 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1927 ################################################################
1928 # compress the character properties table
1929 sub compress_char_props_table($@)
1931 my $rows = shift;
1932 my @table = @_;
1933 my $len = @table / $rows;
1934 my $pos = 0;
1935 my @array = (0) x $rows;
1936 my %sequences;
1938 # add some predefined sequences
1939 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1941 # try to merge table rows
1942 for (my $row = 0; $row < $rows; $row++)
1944 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1945 my $rowtxt = pack "L*", @table_row;
1946 if (defined($sequences{$rowtxt}))
1948 # reuse an existing row
1949 $array[$row] = $sequences{$rowtxt};
1951 else
1953 # create a new row
1954 $sequences{$rowtxt} = $array[$row] = ++$pos;
1955 push @array, @table_row;
1958 return @array;
1961 ################################################################
1962 # dump a normalization table in binary format
1963 sub dump_norm_table($)
1965 my $filename = shift;
1967 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
1968 my %decomp = ( "nfc" => \@decomp_table,
1969 "nfd" => \@decomp_table,
1970 "nfkc" => \@decomp_compat_table,
1971 "nfkd" => \@decomp_compat_table ,
1972 "idna" => \@idna_decomp_table );
1974 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1975 print "Building $filename\n";
1977 my $type = $filename;
1978 $type =~ s!.*/norm(\w+)\.nls!$1!;
1980 my $compose = $forms{$type} & 1;
1981 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
1983 my @version = split /\./, $UNIVERSION;
1985 # combining classes
1987 my @classes;
1988 my @class_values;
1990 foreach my $c (grep defined, @combining_class_table)
1992 $classes[$c] = 1 if $c < 0x100;
1994 for (my $i = 0; $i < @classes; $i++)
1996 next unless defined $classes[$i];
1997 $classes[$i] = @class_values;
1998 push @class_values, $i;
2000 push @class_values, 0 if (@class_values % 2);
2001 die "too many classes" if @class_values >= 0x40;
2003 # character properties
2005 my @char_props;
2006 my @decomposed;
2007 my @comp_hash_table;
2008 my $comp_hash_size = $compose ? 254 : 0;
2010 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2012 next unless defined $combining_class_table[$i];
2013 if (defined $decomp{$type}->[$i])
2015 my @dec = get_decomposition( $i, $decomp{$type} );
2016 if ($compose && (my @comp = get_composition( $i, $compat )))
2018 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2019 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2021 my $val = 0;
2022 foreach my $d (@dec)
2024 $val = $combining_class_table[$d];
2025 last if $val;
2027 $char_props[$i] = $classes[$val];
2029 else
2031 $char_props[$i] = 0xbf;
2033 @dec = compose_hangul( @dec ) if $compose;
2034 @dec = to_utf16( @dec );
2035 push @dec, 0 if @dec >= 7;
2036 $decomposed[$i] = \@dec;
2038 else
2040 if ($combining_class_table[$i] == 0x100)
2042 $char_props[$i] = 0x7f;
2044 elsif ($combining_class_table[$i])
2046 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2048 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2050 $char_props[$i] = 0xff;
2052 else
2054 $char_props[$i] = 0;
2059 if ($compose)
2061 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2063 my @comp = get_composition( $i, $compat );
2064 next unless @comp;
2065 if ($combining_class_table[$comp[1]])
2067 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2068 $char_props[$comp[1]] |= 0x40;
2070 else
2072 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2073 $char_props[$comp[1]] |= 0xc0;
2078 # surrogates
2079 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2080 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2082 # Hangul
2083 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2084 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2085 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2087 # invalid chars
2088 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2089 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2090 foreach my $i (0x00..0x10)
2092 $char_props[($i << 16) | 0xfffe] = 0xff;
2093 $char_props[($i << 16) | 0xffff] = 0xff;
2096 # decomposition hash table
2098 my @decomp_hash_table;
2099 my @decomp_hash_index;
2100 my @decomp_hash_data;
2101 my $decomp_hash_size = 944;
2103 # build string of character data, reusing substrings when possible
2104 my $decomp_char_data = "";
2105 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2107 my $str = pack "U*", @{$i};
2108 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2110 for (my $i = 0; $i < @decomposed; $i++)
2112 next unless defined $decomposed[$i];
2113 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2114 die "sequence not found" if $pos == -1;
2115 my $len = @{$decomposed[$i]};
2116 $len = 7 if $len > 7;
2117 my $hash = $i % $decomp_hash_size;
2118 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2120 for (my $i = 0; $i < $decomp_hash_size; $i++)
2122 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2123 next unless defined $decomp_hash_table[$i];
2124 if (@{$decomp_hash_table[$i]} == 1)
2126 my $entry = $decomp_hash_table[$i]->[0];
2127 if ($char_props[$entry->[0]] == 0xbf)
2129 $decomp_hash_index[$i] = $entry->[1];
2130 next;
2133 foreach my $entry (@{$decomp_hash_table[$i]})
2135 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2138 push @decomp_hash_data, 0, 0;
2140 # composition hash table
2142 my @comp_hash_index;
2143 my @comp_hash_data;
2144 if (@comp_hash_table)
2146 for (my $i = 0; $i < $comp_hash_size; $i++)
2148 $comp_hash_index[$i] = @comp_hash_data;
2149 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2151 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2152 push @comp_hash_data, 0, 0, 0;
2155 my $level1 = ($MAX_CHAR + 1) / 128;
2156 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2158 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2159 0, $decomp_hash_size, $comp_hash_size, 0 );
2160 my @tables = (0) x 8;
2162 $tables[0] = 16 + @header + @tables;
2163 $tables[1] = $tables[0] + @class_values / 2;
2164 $tables[2] = $tables[1] + $level1 / 2;
2165 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2166 $tables[4] = $tables[3] + @decomp_hash_index;
2167 $tables[5] = $tables[4] + @decomp_hash_data;
2168 $tables[6] = $tables[5] + length $decomp_char_data;
2169 $tables[7] = $tables[6] + @comp_hash_index;
2171 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2172 print OUTPUT pack "S<*", @header;
2173 print OUTPUT pack "S<*", @tables;
2174 print OUTPUT pack "C*", @class_values;
2176 print OUTPUT pack "C*", @rows[0..$level1-1];
2177 print OUTPUT pack "C*", @rows[$level1..$#rows];
2178 print OUTPUT pack "S<*", @decomp_hash_index;
2179 print OUTPUT pack "S<*", @decomp_hash_data;
2180 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2181 print OUTPUT pack "S<*", @comp_hash_index;
2182 print OUTPUT pack "S<*", @comp_hash_data;
2184 close OUTPUT;
2185 save_file($filename);
2187 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2191 ################################################################
2192 # output a codepage definition file from the global tables
2193 sub output_codepage_file($)
2195 my $codepage = shift;
2197 my $output = sprintf "nls/c_%03d.nls", $codepage;
2198 open OUTPUT,">$output.new" or die "Cannot create $output";
2200 printf "Building %s\n", $output;
2201 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2202 else { dump_binary_dbcs_table( $codepage ); }
2204 close OUTPUT;
2205 save_file($output);
2207 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2210 ################################################################
2211 # output a codepage table from a Microsoft-style mapping file
2212 sub dump_msdata_codepage($)
2214 my $filename = shift;
2216 my $state = "";
2217 my ($codepage, $width, $count);
2218 my ($lb_cur, $lb_end);
2220 @cp2uni = ();
2221 @glyph2uni = ();
2222 @lead_bytes = ();
2223 @uni2cp = ();
2224 $default_char = $DEF_CHAR;
2225 $default_wchar = $DEF_CHAR;
2227 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2229 while (<$INPUT>)
2231 next if /^;/; # skip comments
2232 next if /^\s*$/; # skip empty lines
2233 next if /\x1a/; # skip ^Z
2234 last if /^ENDCODEPAGE/;
2236 if (/^CODEPAGE\s+(\d+)/)
2238 $codepage = $1;
2239 next;
2241 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2243 $width = $1;
2244 $default_char = hex $2;
2245 $default_wchar = hex $3;
2246 next;
2248 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2250 $state = $1;
2251 $count = $2;
2252 next;
2254 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2256 if ($state eq "MBTABLE")
2258 my $cp = hex $1;
2259 my $uni = hex $2;
2260 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2261 next;
2263 if ($state eq "GLYPHTABLE")
2265 my $cp = hex $1;
2266 my $uni = hex $2;
2267 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2268 next;
2270 if ($state eq "WCTABLE")
2272 my $uni = hex $1;
2273 my $cp = hex $2;
2274 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2275 next;
2277 if ($state eq "DBCSRANGE")
2279 my $start = hex $1;
2280 my $end = hex $2;
2281 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2282 $lb_cur = $start;
2283 $lb_end = $end;
2284 next;
2286 if ($state eq "DBCSTABLE")
2288 my $mb = hex $1;
2289 my $uni = hex $2;
2290 my $cp = ($lb_cur << 8) | $mb;
2291 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2292 if (!--$count)
2294 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2296 next;
2299 die "$filename: Unrecognized line $_\n";
2301 close $INPUT;
2303 output_codepage_file( $codepage );
2306 ################################################################
2307 # align a string length
2308 sub align_string($$)
2310 my ($align, $str) = @_;
2311 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2312 return $str;
2315 ################################################################
2316 # pack a GUID string
2317 sub pack_guid($)
2319 $_ = shift;
2320 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2321 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2324 ################################################################
2325 # comparison function for compression sort
2326 sub cmp_compression
2328 return scalar @{$a} <=> scalar @{$b} ||
2329 $a->[4] <=> $b->[4] ||
2330 $a->[5] <=> $b->[5] ||
2331 $a->[6] <=> $b->[6] ||
2332 $a->[7] <=> $b->[7] ||
2333 $a->[8] <=> $b->[8] ||
2334 $a->[9] <=> $b->[9] ||
2335 $a->[10] <=> $b->[10] ||
2336 $a->[11] <=> $b->[11] ||
2337 $a->[12] <=> $b->[12];
2340 ################################################################
2341 # build a binary sort keys table
2342 sub dump_sortkey_table($$)
2344 my ($filename, $download) = @_;
2346 my @keys;
2347 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2348 my @multiple_weights;
2349 my @expansions;
2350 my @compressions;
2351 my %exceptions;
2352 my %guids;
2353 my %compr_flags;
2354 my %locales;
2355 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2356 my $jamostr = "";
2358 my $re_hex = '0x[0-9A-Fa-f]+';
2359 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2360 $guids{$default_guid} = { };
2362 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2364 my $KEYS = open_data_file( $MSDATA, $download );
2366 printf "Building $filename\n";
2368 while (<$KEYS>)
2370 s/\s*;.*$//;
2371 next if /^\s*$/; # skip empty lines
2372 if (/^\s*(SORTKEY|SORTTABLES)/)
2374 $part = $1;
2375 next;
2377 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2379 $part = $section = "";
2380 next;
2382 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2384 $section = $1;
2385 $guid = undef;
2386 next;
2388 next unless $part;
2389 if ("$part.$section" eq "SORTKEY.DEFAULT")
2391 if (/^\s*($re_hex)\s+$re_key/)
2393 $keys[hex $1] = [ split(/\s+/,$2) ];
2394 next;
2397 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2399 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2401 $version = hex $1;
2402 next;
2404 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2406 # ignore for now
2407 next;
2410 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2411 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2412 "$part.$section" eq "SORTTABLES.INVERSECASING")
2414 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2416 $guid = lc $1;
2417 $guids{$guid} = { } unless defined $guids{$guid};
2418 $guids{$guid}->{flags} |= $flags{$section};
2419 next;
2421 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2423 $locales{$1} = $guid;
2424 next;
2427 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2429 if (/^\s*(\d+)\s+(\d+)/)
2431 push @multiple_weights, $1, $2;
2432 next;
2435 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2437 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2439 my $pos = scalar @expansions / 2;
2440 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2441 push @expansions, hex $2, hex $3;
2442 next;
2445 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2447 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2449 $keys[hex $1] = $keys[hex $2];
2450 next;
2453 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2455 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2457 if ($subsection || !$guid) # start a new one
2459 $guid = lc $1;
2460 $subsection = "";
2461 $guids{$guid} = { } unless defined $guids{$guid};
2462 $guids{$guid}->{flags} |= $flags{$2} if $2;
2463 $guids{$guid}->{compr} = @compressions;
2464 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2465 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2466 push @compressions, [ ];
2468 else # merge with current one
2470 $guids{lc $1} = { } unless defined $guids{lc $1};
2471 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2472 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2473 $compr_flags{lc $1} = $compr_flags{$guid};
2475 next;
2477 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2479 $locales{$1} = $guid;
2480 next;
2482 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2484 $subsection = $1;
2485 next;
2487 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2489 my @comp = map { hex $_; } split(/\s+/,$1);
2490 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2491 # add compression flags
2492 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2493 next;
2496 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2498 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2500 $guid = lc $1;
2501 $guids{$guid} = { } unless defined $guids{lc $1};
2502 $ling_flag = ($2 ? "+" : "-");
2503 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2504 next;
2506 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2508 $locales{$1} = $guid;
2509 next;
2511 if (/^\s*($re_hex)\s+$re_key/)
2513 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2514 next;
2517 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2519 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2521 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2522 next;
2525 die "$download: $part.$section: unrecognized line $_\n";
2527 close $KEYS;
2529 # Sortkey table
2531 my $table;
2532 for (my $i = 0; $i < 0x10000; $i++)
2534 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2535 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2538 foreach my $id (sort keys %exceptions)
2540 my $pos = length($table) / 4;
2541 my @exc = @{$exceptions{$id}};
2542 my @filled;
2543 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2544 my $guid = substr( $id, 0, -1 );
2545 $guids{$guid}->{$key} = $pos;
2546 $pos += 0x100;
2547 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2548 for (my $j = 0; $j < 0x10000; $j++)
2550 next unless defined $exc[$j] || defined $flags[$j];
2551 $filled[$j >> 8] = 1;
2552 $j |= 0xff;
2554 for (my $j = 0; $j < 0x100; $j++)
2556 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2557 $pos += 0x100 if $filled[$j];
2559 for (my $j = 0; $j < 0x10000; $j++)
2561 next unless $filled[$j >> 8];
2562 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2563 $k[3] |= $flags[$j] || 0;
2564 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2568 # Case mapping tables
2570 # standard table
2571 my @casemaps;
2572 my @upper = @toupper_table;
2573 my @lower = @tolower_table;
2574 remove_linguistic_mappings( \@upper, \@lower );
2575 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2577 # linguistic table
2578 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2580 # Turkish table
2581 @upper = @toupper_table;
2582 @lower = @tolower_table;
2583 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2584 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2585 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2586 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2588 # Char type table
2590 my @table;
2591 my $types = "";
2592 my %typestr;
2593 for (my $i = 0; $i < 0x10000; $i++)
2595 my $str = pack "S<3",
2596 ($category_table[$i] || 0) & 0xffff,
2597 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2598 ($category_table[$i] || 0) >> 16;
2600 if (!defined($typestr{$str}))
2602 $typestr{$str} = length($types) / 6;
2603 $types .= $str;
2605 $table[$i] = $typestr{$str};
2608 my @rows = compress_array( 4096, 0, @table[0..65535] );
2609 my @array = compress_array( 256, 0, @rows[0..4095] );
2610 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2611 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2613 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2614 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2615 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2617 # Sort tables
2619 # guids
2620 my $sorttables = pack "L<2", $version, scalar %guids;
2621 foreach my $id (sort keys %guids)
2623 my %guid = %{$guids{$id}};
2624 my $flags = $guid{flags} || 0;
2625 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2626 $sorttables .= pack_guid($id) . pack "L<5",
2627 $flags,
2628 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2629 $guid{except} || 0,
2630 $guid{ling_except} || 0,
2631 $map / 2;
2634 # expansions
2635 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2637 # compressions
2638 $sorttables .= pack "L<", scalar @compressions;
2639 my $rowstr = "";
2640 foreach my $c (@compressions)
2642 my $pos = length($rowstr) / 2;
2643 my $min = 0xffff;
2644 my $max = 0;
2645 my @lengths = (0) x 8;
2646 foreach my $r (sort cmp_compression @{$c})
2648 my @row = @{$r};
2649 $lengths[scalar @row - 6]++;
2650 foreach my $val (@row[4..$#row])
2652 $min = $val if $min > $val;
2653 $max = $val if $max < $val;
2655 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2656 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2658 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2660 $sorttables .= $rowstr;
2662 # multiple weights
2663 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2665 # jamo sort
2666 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2668 # Locales
2670 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2671 foreach my $loc (sort keys %locales)
2673 # skip specific locales that match more general ones
2674 my @parts = split /[-_]/, $loc;
2675 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2676 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2677 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2680 # File header
2682 my @header;
2683 $header[0] = 16;
2684 $header[1] = $header[0] + length $table;
2685 $header[2] = $header[1] + length $casemaps;
2686 $header[3] = $header[2] + length $chartypes;
2688 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2689 print OUTPUT pack "L<*", @header;
2690 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2691 close OUTPUT;
2692 save_file($filename);
2696 ################################################################
2697 # build the script to create registry keys
2698 sub dump_registry_script($%)
2700 my ($filename, %keys) = @_;
2701 my $indent = 1;
2703 printf "Building %s\n", $filename;
2704 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2705 print OUTPUT "HKLM\n{\n";
2706 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2708 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2709 $indent++;
2711 foreach my $k (sort keys %keys)
2713 my @subkeys = split /\\/, $k;
2714 my ($def, @vals) = @{$keys{$k}};
2715 for (my $i = 0; $i < @subkeys; $i++)
2717 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2718 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2719 $indent++;
2721 foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2722 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2724 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2725 close OUTPUT;
2726 save_file($filename);
2730 ################################################################
2731 # save a file if modified
2732 sub save_file($)
2734 my $file = shift;
2735 if (-f $file && !system "cmp $file $file.new >/dev/null")
2737 unlink "$file.new";
2739 else
2741 rename "$file.new", "$file";
2746 ################################################################
2747 # main routine
2749 chdir ".." if -f "./make_unicode";
2750 load_data();
2751 dump_sortkeys( "dlls/kernelbase/collation.c" );
2752 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2753 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2754 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2755 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2756 dump_mirroring( "dlls/dwrite/mirror.c" );
2757 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2758 dump_bracket( "dlls/dwrite/bracket.c" );
2759 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2760 dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
2761 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2762 dump_linebreak( "dlls/dwrite/linebreak.c" );
2763 dump_scripts( "dlls/dwrite/scripts" );
2764 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2765 dump_vertical( "dlls/gdi32/vertical.c" );
2766 dump_vertical( "dlls/wineps.drv/vertical.c" );
2767 dump_intl_nls("nls/l_intl.nls");
2768 dump_norm_table( "nls/normnfc.nls" );
2769 dump_norm_table( "nls/normnfd.nls" );
2770 dump_norm_table( "nls/normnfkc.nls" );
2771 dump_norm_table( "nls/normnfkd.nls" );
2772 dump_norm_table( "nls/normidna.nls" );
2773 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2774 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2775 dump_eucjp_codepage();
2776 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2778 exit 0;
2780 # Local Variables:
2781 # compile-command: "./make_unicode"
2782 # End: