ntdll: Quiet some noisy ETW FIXMEs.
[wine.git] / tools / make_unicode
blob1e0dbe419e195740ebe07796b2eee5f0b188d7fc
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "13.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
30 my $REPORTS = "http://www.unicode.org/reports";
31 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
32 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
34 # Sort keys file
35 my $SORTKEYS = "tr10/allkeys.txt";
37 # Default char for undefined mappings
38 my $DEF_CHAR = ord '?';
40 # Last valid Unicode character
41 my $MAX_CHAR = 0x10ffff;
43 my @allfiles =
45 "CodpageFiles/037.txt",
46 "CodpageFiles/437.txt",
47 "CodpageFiles/500.txt",
48 "CodpageFiles/708.txt",
49 "CodpageFiles/737.txt",
50 "CodpageFiles/775.txt",
51 "CodpageFiles/850.txt",
52 "CodpageFiles/852.txt",
53 "CodpageFiles/855.txt",
54 "CodpageFiles/857.txt",
55 "CodpageFiles/860.txt",
56 "CodpageFiles/861.txt",
57 "CodpageFiles/862.txt",
58 "CodpageFiles/863.txt",
59 "CodpageFiles/864.txt",
60 "CodpageFiles/865.txt",
61 "CodpageFiles/866.txt",
62 "CodpageFiles/869.txt",
63 "CodpageFiles/874.txt",
64 "CodpageFiles/875.txt",
65 "CodpageFiles/932.txt",
66 "CodpageFiles/936.txt",
67 "CodpageFiles/949.txt",
68 "CodpageFiles/950.txt",
69 "CodpageFiles/1026.txt",
70 "CodpageFiles/1250.txt",
71 "CodpageFiles/1251.txt",
72 "CodpageFiles/1252.txt",
73 "CodpageFiles/1253.txt",
74 "CodpageFiles/1254.txt",
75 "CodpageFiles/1255.txt",
76 "CodpageFiles/1256.txt",
77 "CodpageFiles/1257.txt",
78 "CodpageFiles/1258.txt",
79 "CodpageFiles/1361.txt",
80 "CodpageFiles/10000.txt",
81 "CodpageFiles/10001.txt",
82 "CodpageFiles/10002.txt",
83 "CodpageFiles/10003.txt",
84 "CodpageFiles/10004.txt",
85 "CodpageFiles/10005.txt",
86 "CodpageFiles/10006.txt",
87 "CodpageFiles/10007.txt",
88 "CodpageFiles/10008.txt",
89 "CodpageFiles/10010.txt",
90 "CodpageFiles/10017.txt",
91 "CodpageFiles/10021.txt",
92 "CodpageFiles/10029.txt",
93 "CodpageFiles/10079.txt",
94 "CodpageFiles/10081.txt",
95 "CodpageFiles/10082.txt",
96 "CodpageFiles/20127.txt",
97 "CodpageFiles/20866.txt",
98 "CodpageFiles/21866.txt",
99 "CodpageFiles/28591.txt",
100 "CodpageFiles/28592.txt",
101 "CodpageFiles/28593.txt",
102 "CodpageFiles/28594.txt",
103 "CodpageFiles/28595.txt",
104 "CodpageFiles/28596.txt",
105 "CodpageFiles/28597.txt",
106 "CodpageFiles/28598.txt",
107 "CodpageFiles/28599.txt",
108 "CodpageFiles/28603.txt",
109 "CodpageFiles/28605.txt",
113 my %ctype =
115 # CT_CTYPE1
116 "upper" => 0x0001,
117 "lower" => 0x0002,
118 "digit" => 0x0004,
119 "space" => 0x0008,
120 "punct" => 0x0010,
121 "cntrl" => 0x0020,
122 "blank" => 0x0040,
123 "xdigit" => 0x0080,
124 "alpha" => 0x0100 | 0x80000000,
125 "defin" => 0x0200,
126 # CT_CTYPE3 in high 16 bits
127 "nonspacing" => 0x00010000,
128 "diacritic" => 0x00020000,
129 "vowelmark" => 0x00040000,
130 "symbol" => 0x00080000,
131 "katakana" => 0x00100000,
132 "hiragana" => 0x00200000,
133 "halfwidth" => 0x00400000,
134 "fullwidth" => 0x00800000,
135 "ideograph" => 0x01000000,
136 "kashida" => 0x02000000,
137 "lexical" => 0x04000000,
138 "highsurrogate" => 0x08000000,
139 "lowsurrogate" => 0x10000000,
142 my %bracket_types =
144 "o" => 0x0000,
145 "c" => 0x0001,
148 my %indic_types =
150 "Other" => 0x0000,
151 "Bindu" => 0x0001,
152 "Visarga" => 0x0002,
153 "Avagraha" => 0x0003,
154 "Nukta" => 0x0004,
155 "Virama" => 0x0005,
156 "Vowel_Independent" => 0x0006,
157 "Vowel_Dependent" => 0x0007,
158 "Vowel" => 0x0008,
159 "Consonant_Placeholder" => 0x0009,
160 "Consonant" => 0x000a,
161 "Consonant_Dead" => 0x000b,
162 "Consonant_Succeeding_Repha" => 0x000c,
163 "Consonant_Subjoined" => 0x000d,
164 "Consonant_Medial" => 0x000e,
165 "Consonant_Final" => 0x000f,
166 "Consonant_Head_Letter" => 0x0010,
167 "Modifying_Letter" => 0x0011,
168 "Tone_Letter" => 0x0012,
169 "Tone_Mark" => 0x0013,
170 "Register_Shifter" => 0x0014,
171 "Consonant_Preceding_Repha" => 0x0015,
172 "Pure_Killer" => 0x0016,
173 "Invisible_Stacker" => 0x0017,
174 "Gemination_Mark" => 0x0018,
175 "Cantillation_Mark" => 0x0019,
176 "Non_Joiner" => 0x001a,
177 "Joiner" => 0x001b,
178 "Number_Joiner" => 0x001c,
179 "Number" => 0x001d,
180 "Brahmi_Joining_Number" => 0x001e,
181 "Consonant_With_Stacker" => 0x001f,
182 "Consonant_Prefixed" => 0x0020,
183 "Syllable_Modifier" => 0x0021,
184 "Consonant_Killer" => 0x0022,
185 "Consonant_Initial_Postfixed" => 0x0023,
188 my %matra_types =
190 "Right" => 0x01,
191 "Left" => 0x02,
192 "Visual_Order_Left" => 0x03,
193 "Left_And_Right" => 0x04,
194 "Top" => 0x05,
195 "Bottom" => 0x06,
196 "Top_And_Bottom" => 0x07,
197 "Top_And_Right" => 0x08,
198 "Top_And_Left" => 0x09,
199 "Top_And_Left_And_Right" => 0x0a,
200 "Bottom_And_Right" => 0x0b,
201 "Top_And_Bottom_And_Right" => 0x0c,
202 "Overstruck" => 0x0d,
203 "Invisible" => 0x0e,
204 "Bottom_And_Left" => 0x0f,
205 "Top_And_Bottom_And_Left" => 0x10,
208 my %break_types =
210 "BK" => 0x0001,
211 "CR" => 0x0002,
212 "LF" => 0x0003,
213 "CM" => 0x0004,
214 "SG" => 0x0005,
215 "GL" => 0x0006,
216 "CB" => 0x0007,
217 "SP" => 0x0008,
218 "ZW" => 0x0009,
219 "NL" => 0x000a,
220 "WJ" => 0x000b,
221 "JL" => 0x000c,
222 "JV" => 0x000d,
223 "JT" => 0x000e,
224 "H2" => 0x000f,
225 "H3" => 0x0010,
226 "XX" => 0x0011,
227 "OP" => 0x0012,
228 "CL" => 0x0013,
229 "CP" => 0x0014,
230 "QU" => 0x0015,
231 "NS" => 0x0016,
232 "EX" => 0x0017,
233 "SY" => 0x0018,
234 "IS" => 0x0019,
235 "PR" => 0x001a,
236 "PO" => 0x001b,
237 "NU" => 0x001c,
238 "AL" => 0x001d,
239 "ID" => 0x001e,
240 "IN" => 0x001f,
241 "HY" => 0x0020,
242 "BB" => 0x0021,
243 "BA" => 0x0022,
244 "SA" => 0x0023,
245 "AI" => 0x0024,
246 "B2" => 0x0025,
247 "HL" => 0x0026,
248 "CJ" => 0x0027,
249 "RI" => 0x0028,
250 "EB" => 0x0029,
251 "EM" => 0x002a,
252 "ZWJ" => 0x002b,
255 my %vertical_types =
257 "R" => 0x0000,
258 "U" => 0x0001,
259 "Tr" => 0x0002,
260 "Tu" => 0x0003,
263 my %categories =
265 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
266 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
267 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
268 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
269 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
270 "Me" => $ctype{"defin"}, # Mark, Enclosing
271 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
272 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
273 "No" => $ctype{"defin"}, # Number, Other
274 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
275 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
276 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
277 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
278 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
279 "Cs" => $ctype{"defin"}, # Other, Surrogate
280 "Co" => $ctype{"defin"}, # Other, Private Use
281 "Cn" => $ctype{"defin"}, # Other, Not Assigned
282 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
283 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
284 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
285 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
286 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
287 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
288 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
289 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
290 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
291 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
292 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
293 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
294 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
297 # a few characters need additional categories that cannot be determined automatically
298 my %special_categories =
300 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
301 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
302 "space" => [ 0x09..0x0d, 0x85 ],
303 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
304 "cntrl" => [ 0x070f, 0x200c, 0x200d,
305 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
306 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
307 0xfff9, 0xfffa, 0xfffb ],
308 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
309 0xd7, 0xf7 ],
310 "digit" => [ 0xb2, 0xb3, 0xb9 ],
311 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
312 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
313 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
314 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
315 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
316 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
317 0x02b9..0x02ba, 0x02c6..0x02cf ],
318 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
319 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
320 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
321 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
322 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
323 0x3131..0x3164 ],
324 "ideograph" => [ 0x3006..0x3007 ],
325 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
326 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
327 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
328 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
329 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
330 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
331 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
332 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
333 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
334 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
335 "kashida" => [ 0x0640 ],
338 my %directions =
340 "L" => 1, # Left-to-Right
341 "R" => 2, # Right-to-Left
342 "AL" => 12, # Right-to-Left Arabic
343 "EN" => 3, # European Number
344 "ES" => 4, # European Number Separator
345 "ET" => 5, # European Number Terminator
346 "AN" => 6, # Arabic Number
347 "CS" => 7, # Common Number Separator
348 "NSM" => 13, # Non-Spacing Mark
349 "BN" => 14, # Boundary Neutral
350 "B" => 8, # Paragraph Separator
351 "S" => 9, # Segment Separator
352 "WS" => 10, # Whitespace
353 "ON" => 11, # Other Neutrals
354 "LRE" => 15, # Left-to-Right Embedding
355 "LRO" => 15, # Left-to-Right Override
356 "RLE" => 15, # Right-to-Left Embedding
357 "RLO" => 15, # Right-to-Left Override
358 "PDF" => 15, # Pop Directional Format
359 "LRI" => 15, # Left-to-Right Isolate
360 "RLI" => 15, # Right-to-Left Isolate
361 "FSI" => 15, # First Strong Isolate
362 "PDI" => 15 # Pop Directional Isolate
365 my %c2_types =
367 "L" => 1, # C2_LEFTTORIGHT
368 "R" => 2, # C2_RIGHTTOLEFT
369 "AL" => 2, # C2_RIGHTTOLEFT
370 "EN" => 3, # C2_EUROPENUMBER
371 "ES" => 4, # C2_EUROPESEPARATOR
372 "ET" => 5, # C2_EUROPETERMINATOR
373 "AN" => 6, # C2_ARABICNUMBER
374 "CS" => 7, # C2_COMMONSEPARATOR
375 "NSM" => 11, # C2_OTHERNEUTRAL
376 "BN" => 0, # C2_NOTAPPLICABLE
377 "B" => 8, # C2_BLOCKSEPARATOR
378 "S" => 9, # C2_SEGMENTSEPARATOR
379 "WS" => 10, # C2_WHITESPACE
380 "ON" => 11, # C2_OTHERNEUTRAL
381 "LRE" => 11, # C2_OTHERNEUTRAL
382 "LRO" => 11, # C2_OTHERNEUTRAL
383 "RLE" => 11, # C2_OTHERNEUTRAL
384 "RLO" => 11, # C2_OTHERNEUTRAL
385 "PDF" => 11, # C2_OTHERNEUTRAL
386 "LRI" => 11, # C2_OTHERNEUTRAL
387 "RLI" => 11, # C2_OTHERNEUTRAL
388 "FSI" => 11, # C2_OTHERNEUTRAL
389 "PDI" => 11 # C2_OTHERNEUTRAL
392 my %bidi_types =
394 "ON" => 0, # Other Neutrals
395 "L" => 1, # Left-to-Right
396 "R" => 2, # Right-to-Left
397 "AN" => 3, # Arabic Number
398 "EN" => 4, # European Number
399 "AL" => 5, # Right-to-Left Arabic
400 "NSM" => 6, # Non-Spacing Mark
401 "CS" => 7, # Common Number Separator
402 "ES" => 8, # European Number Separator
403 "ET" => 9, # European Number Terminator
404 "BN" => 10, # Boundary Neutral
405 "S" => 11, # Segment Separator
406 "WS" => 12, # Whitespace
407 "B" => 13, # Paragraph Separator
408 "RLO" => 14, # Right-to-Left Override
409 "RLE" => 15, # Right-to-Left Embedding
410 "LRO" => 16, # Left-to-Right Override
411 "LRE" => 17, # Left-to-Right Embedding
412 "PDF" => 18, # Pop Directional Format
413 "LRI" => 19, # Left-to-Right Isolate
414 "RLI" => 20, # Right-to-Left Isolate
415 "FSI" => 21, # First Strong Isolate
416 "PDI" => 22 # Pop Directional Isolate
419 my %joining_types =
421 "U" => 0, # Non_Joining
422 "L" => 1, # Left_Joining
423 "R" => 2, # Right_Joining
424 "D" => 3, # Dual_Joining
425 "C" => 3, # Join_Causing
426 "ALAPH" => 4, # Syriac ALAPH
427 "DALATH RISH" => 5, # Syriac DALATH RISH group
428 "T" => 6, # Transparent
431 my @cp2uni = ();
432 my @glyph2uni = ();
433 my @lead_bytes = ();
434 my @uni2cp = ();
435 my @tolower_table = ();
436 my @toupper_table = ();
437 my @digitmap_table = ();
438 my @category_table = ();
439 my @initial_joining_table = ();
440 my @direction_table = ();
441 my @decomp_table = ();
442 my @combining_class_table = ();
443 my @decomp_compat_table = ();
444 my @comp_exclusions = ();
445 my @idna_decomp_table = ();
446 my @idna_disallowed = ();
447 my %registry_keys;
448 my $default_char;
449 my $default_wchar;
451 my %joining_forms =
453 "isolated" => [],
454 "final" => [],
455 "initial" => [],
456 "medial" => []
459 sub to_utf16(@)
461 my @ret;
462 foreach my $ch (@_)
464 if ($ch < 0x10000)
466 push @ret, $ch;
468 else
470 my $val = $ch - 0x10000;
471 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
474 return @ret;
477 ################################################################
478 # fetch a unicode.org file and open it
479 sub open_data_file($$)
481 my ($base, $name) = @_;
482 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
483 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
484 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
485 local *FILE;
487 if ($base =~ /.*\/([^\/]+)\.zip$/)
489 my $zip = "$1$suffix.zip";
490 unless (-f "$cache/$zip")
492 system "mkdir", "-p", $cache;
493 print "Fetching $base...\n";
494 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
496 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
498 else
500 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
501 unless (-f $dest)
503 system "mkdir", "-p", $dir;
504 print "Fetching $base/$name...\n";
505 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
507 open FILE, "<$dest" or die "cannot open $dest";
509 return *FILE;
512 ################################################################
513 # recursively get the decomposition for a character
514 sub get_decomposition($$);
515 sub get_decomposition($$)
517 my ($char, $table) = @_;
518 my @ret;
520 return $char unless defined $table->[$char];
521 foreach my $ch (@{$table->[$char]})
523 push @ret, get_decomposition( $ch, $table );
525 return @ret;
528 ################################################################
529 # get the composition that results in a given character
530 sub get_composition($$)
532 my ($ch, $compat) = @_;
533 return () unless defined $decomp_table[$ch]; # no decomposition
534 my @ret = @{$decomp_table[$ch]};
535 return () if @ret < 2; # singleton decomposition
536 return () if $comp_exclusions[$ch]; # composition exclusion
537 return () if $combining_class_table[$ch]; # non-starter
538 return () if $combining_class_table[$ret[0]]; # first char is non-starter
539 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
540 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
541 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
542 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
543 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
544 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
545 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
546 return @ret;
549 ################################################################
550 # recursively build decompositions
551 sub build_decompositions(@)
553 my @src = @_;
554 my @dst;
556 for (my $i = 0; $i < @src; $i++)
558 next unless defined $src[$i];
559 my @decomp = to_utf16( get_decomposition( $i, \@src ));
560 $dst[$i] = \@decomp;
562 return @dst;
565 ################################################################
566 # compose Hangul sequences
567 sub compose_hangul(@)
569 my $SBASE = 0xac00;
570 my $LBASE = 0x1100;
571 my $VBASE = 0x1161;
572 my $TBASE = 0x11a7;
573 my $LCOUNT = 19;
574 my $VCOUNT = 21;
575 my $TCOUNT = 28;
576 my $NCOUNT = $VCOUNT * $TCOUNT;
577 my $SCOUNT = $LCOUNT * $NCOUNT;
579 my @seq = @_;
580 my @ret;
581 my $i;
583 for ($i = 0; $i < @seq; $i++)
585 my $ch = $seq[$i];
586 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
587 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
589 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
590 $i++;
592 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
593 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
595 $ch += $seq[$i+1] - $TBASE;
596 $i++;
598 push @ret, $ch;
600 return @ret;
603 ################################################################
604 # remove linguistic-only mappings from the case table
605 sub remove_linguistic_mappings($$)
607 my ($upper, $lower) = @_;
609 # remove case mappings that don't round-trip
611 for (my $i = 0; $i < @{$upper}; $i++)
613 next unless defined ${$upper}[$i];
614 my $ch = ${$upper}[$i];
615 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
617 for (my $i = 0; $i < @{$lower}; $i++)
619 next unless defined ${$lower}[$i];
620 my $ch = ${$lower}[$i];
621 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
625 ################################################################
626 # read in the Unicode database files
627 sub load_data()
629 my $start;
631 # now build mappings from the decomposition field of the Unicode database
633 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
634 while (<$UNICODE_DATA>)
636 # Decode the fields ...
637 my ($code, $name, $cat, $comb, $bidi,
638 $decomp, $dec, $dig, $num, $mirror,
639 $oldname, $comment, $upper, $lower, $title) = split /;/;
640 my $src = hex $code;
642 die "unknown category $cat" unless defined $categories{$cat};
643 die "unknown directionality $bidi" unless defined $directions{$bidi};
645 $category_table[$src] = $categories{$cat};
646 $direction_table[$src] = $bidi;
647 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
649 $initial_joining_table[$src] = $joining_types{"T"};
651 else
653 $initial_joining_table[$src] = $joining_types{"U"};
656 if ($lower ne "")
658 $tolower_table[$src] = hex $lower;
660 if ($upper ne "")
662 $toupper_table[$src] = hex $upper;
664 if ($dec ne "")
666 $category_table[$src] |= $ctype{"digit"};
668 if ($dig ne "")
670 $digitmap_table[$src] = ord $dig;
672 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
674 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
675 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
676 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
677 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
678 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
679 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
680 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
681 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
682 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
683 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
684 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
685 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
687 # copy the category and direction for everything between First/Last pairs
688 if ($name =~ /, First>/) { $start = $src; }
689 if ($name =~ /, Last>/)
691 while ($start < $src)
693 $category_table[$start] = $category_table[$src];
694 $direction_table[$start] = $direction_table[$src];
695 $combining_class_table[$start] = $combining_class_table[$src];
696 $start++;
700 next if $decomp eq ""; # no decomposition, skip it
702 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
704 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
705 $decomp_compat_table[$src] = \@seq;
708 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
710 # decomposition of the form "<foo> 1234" -> use char if type is known
711 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
713 ${joining_forms{$1}}[hex $2] = $src;
716 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
718 # decomposition "<compat> 0020 1234" -> combining accent
720 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
722 # store decomposition
723 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
725 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
727 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
729 # Single char decomposition
730 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
734 close $UNICODE_DATA;
736 # patch the category of some special characters
738 for (my $i = 0; $i < @decomp_table; $i++)
740 next unless defined $decomp_table[$i];
741 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
743 foreach my $cat (keys %special_categories)
745 my $flag = $ctype{$cat};
746 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
748 for (my $i = 0; $i < @decomp_compat_table; $i++)
750 next unless defined $decomp_compat_table[$i];
751 next unless @{$decomp_compat_table[$i]} == 2;
752 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
755 # load the composition exclusions
757 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
758 while (<$EXCL>)
760 s/\#.*//; # remove comments
761 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
763 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
765 elsif (/^([0-9a-fA-F]+)\s*$/)
767 $comp_exclusions[hex $1] = 1;
770 close $EXCL;
772 # load the IDNA mappings
774 @idna_decomp_table = @decomp_compat_table;
775 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
776 while (<$IDNA>)
778 s/\#.*//; # remove comments
779 next if /^\s*$/;
780 my ($char, $type, $mapping) = split /;/;
781 my ($ch1, $ch2);
782 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
784 $ch1 = hex $1;
785 $ch2 = hex $2;
787 elsif ($char =~ /([0-9a-fA-F]+)/)
789 $ch1 = $ch2 = hex $1;
792 if ($type =~ /mapped/ || $type =~ /deviation/)
794 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
795 my @seq = map { hex $_; } split /\s+/, $mapping;
796 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
798 elsif ($type =~ /valid/)
801 elsif ($type =~ /ignored/)
803 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
805 elsif ($type =~ /disallowed/)
807 foreach my $i ($ch1 .. $ch2)
809 $idna_decomp_table[$i] = undef;
810 $idna_disallowed[$i] = 1;
814 close $IDNA;
818 ################################################################
819 # add a new registry key
820 sub add_registry_key($$)
822 my ($key, $defval) = @_;
823 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
826 ################################################################
827 # add a new registry value
828 sub add_registry_value($$$)
830 my ($key, $name, $value) = @_;
831 add_registry_key( $key, undef );
832 push @{$registry_keys{$key}}, "'$name' = s '$value'";
835 ################################################################
836 # define a new lead byte
837 sub add_lead_byte($)
839 my $ch = shift;
840 return if defined $cp2uni[$ch];
841 push @lead_bytes, $ch;
842 $cp2uni[$ch] = 0;
845 ################################################################
846 # define a new char mapping
847 sub add_mapping($$)
849 my ($cp, $uni) = @_;
850 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
851 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
852 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
855 ################################################################
856 # get a mapping including glyph chars for MB_USEGLYPHCHARS
857 sub get_glyphs_mapping(@)
859 my @table = @_;
861 for (my $i = 0; $i < @glyph2uni; $i++)
863 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
865 return @table;
868 ################################################################
869 # build EUC-JP table from the JIS 0208/0212 files
870 sub dump_eucjp_codepage()
872 @cp2uni = ();
873 @glyph2uni = ();
874 @lead_bytes = ();
875 @uni2cp = ();
876 $default_char = $DEF_CHAR;
877 $default_wchar = 0x30fb;
879 # ASCII chars
880 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
882 # lead bytes
883 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
885 # JIS X 0201 right plane
886 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
888 # undefined chars
889 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
890 $cp2uni[0xa0] = 0xf8f0;
891 $cp2uni[0xff] = 0xf8f3;
893 # Fix backslash conversion
894 add_mapping( 0xa1c0, 0xff3c );
896 # Add private mappings for rows undefined in JIS 0208/0212
897 my $private = 0xe000;
898 foreach my $hi (0xf5 .. 0xfe)
900 foreach my $lo (0xa1 .. 0xfe)
902 add_mapping( ($hi << 8) + $lo, $private++ );
905 foreach my $hi (0xf5 .. 0xfe)
907 foreach my $lo (0x21 .. 0x7e)
909 add_mapping( ($hi << 8) + $lo, $private++ );
913 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
914 while (<$INPUT>)
916 next if /^\#/; # skip comments
917 next if /^$/; # skip empty lines
918 next if /\x1a/; # skip ^Z
919 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
921 add_mapping( 0x8080 + hex $1, hex $2 );
922 next;
924 die "Unrecognized line $_\n";
926 close $INPUT;
928 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
929 while (<$INPUT>)
931 next if /^\#/; # skip comments
932 next if /^$/; # skip empty lines
933 next if /\x1a/; # skip ^Z
934 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
936 add_mapping( 0x8000 + hex $1, hex $2 );
937 next;
939 die "Unrecognized line $_\n";
941 close $INPUT;
943 output_codepage_file( 20932 );
946 ################################################################
947 # build Korean Wansung table from the KSX1001 file
948 sub dump_krwansung_codepage(@)
950 my @cp949 = @_;
951 @cp2uni = ();
952 @glyph2uni = ();
953 @lead_bytes = ();
954 @uni2cp = ();
955 $default_char = 0x3f;
956 $default_wchar = 0x003f;
958 # ASCII and undefined chars
959 foreach my $i (0x00 .. 0x9f) { add_mapping( $i, $i ); }
960 add_mapping( 0xa0, 0xf8e6 );
961 add_mapping( 0xad, 0xf8e7 );
962 add_mapping( 0xae, 0xf8e8 );
963 add_mapping( 0xaf, 0xf8e9 );
964 add_mapping( 0xfe, 0xf8ea );
965 add_mapping( 0xff, 0xf8eb );
967 my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
968 while (<$INPUT>)
970 next if /^\#/; # skip comments
971 next if /^$/; # skip empty lines
972 next if /\x1a/; # skip ^Z
973 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
975 add_mapping( 0x8080 + hex $1, hex $2 );
976 next;
978 die "Unrecognized line $_\n";
980 close $INPUT;
982 # get some extra mappings from cp 949
983 my @defined_lb;
984 map { $defined_lb[$_] = 1; } @lead_bytes;
985 foreach my $i (0x0000 .. 0xffff)
987 next if ($i >= 0x1100 && $i <= 0x11ff); # range not used in 20949
988 next unless defined $cp949[$i];
989 if ($cp949[$i] >= 0xff)
991 # only add chars for lead bytes that exist in 20949
992 my $hi = $cp949[$i] >> 8;
993 my $lo = $cp949[$i] & 0xff;
994 next unless $defined_lb[$hi];
995 next unless $lo >= 0xa1 && $lo <= 0xfe;
997 add_mapping( $cp949[$i], $i );
1000 output_codepage_file( 20949 );
1003 ################################################################
1004 # build the sort keys table
1005 sub dump_sortkeys($)
1007 my $filename = shift;
1008 my @sortkeys = ();
1010 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
1011 while (<$INPUT>)
1013 next if /^\#/; # skip comments
1014 next if /^$/; # skip empty lines
1015 next if /\x1a/; # skip ^Z
1016 next if /^\@version/; # skip @version header
1017 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1019 my ($uni,$variable) = (hex $1, $2);
1020 next if $uni > 65535;
1021 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
1022 next;
1024 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1026 # multiple character sequence, ignored for now
1027 next;
1029 die "$SORTKEYS: Unrecognized line $_\n";
1031 close $INPUT;
1033 # compress the keys to 32 bit:
1034 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
1036 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
1037 ${$a}[2] <=> ${$b}[2] or
1038 ${$a}[3] <=> ${$b}[3] or
1039 ${$a}[4] <=> ${$b}[4] or
1040 $a cmp $b; } @sortkeys;
1042 my ($n2, $n3) = (1, 1);
1043 my @keys = (-1, -1, -1, -1, -1 );
1044 my @flatkeys = ();
1046 for (my $i = 0; $i < @sortkeys; $i++)
1048 next unless defined $sortkeys[$i];
1049 my @current = @{$sortkeys[$i]};
1050 if ($current[1] == $keys[1])
1052 if ($current[2] == $keys[2])
1054 if ($current[3] == $keys[3])
1056 # nothing
1058 else
1060 $keys[3] = $current[3];
1061 $n3++;
1062 die if ($n3 >= 16);
1065 else
1067 $keys[2] = $current[2];
1068 $keys[3] = $current[3];
1069 $n2++;
1070 $n3 = 1;
1071 die if ($n2 >= 256);
1074 else
1076 $keys[1] = $current[1];
1077 $keys[2] = $current[2];
1078 $keys[3] = $current[3];
1079 $n2 = 1;
1080 $n3 = 1;
1083 if ($current[2]) { $current[2] = $n2; }
1084 if ($current[3]) { $current[3] = $n3; }
1085 if ($current[4]) { $current[4] = 1; }
1087 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1090 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1091 printf "Building $filename\n";
1092 printf OUTPUT "/* Unicode collation element table */\n";
1093 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1094 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1095 print OUTPUT "#include \"windef.h\"\n\n";
1097 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1099 close OUTPUT;
1100 save_file($filename);
1104 ################################################################
1105 # dump an array of integers
1106 sub dump_array($$@)
1108 my ($bit_width, $default, @array) = @_;
1109 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1110 my $i;
1111 my $ret = " ";
1112 for ($i = 0; $i < $#array; $i++)
1114 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1115 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1117 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1118 return $ret;
1122 ################################################################
1123 # dump an SBCS mapping table in binary format
1124 sub dump_binary_sbcs_table($)
1126 my $codepage = shift;
1128 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1129 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1131 print OUTPUT pack "S<*", @header;
1132 print OUTPUT pack "C12", (0) x 12;
1133 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1135 if (@glyph2uni)
1137 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1139 else
1141 print OUTPUT pack "S<*", 0;
1144 print OUTPUT pack "S<*", 0, 0;
1146 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1150 ################################################################
1151 # dump a DBCS mapping table in binary format
1152 sub dump_binary_dbcs_table($)
1154 my $codepage = shift;
1155 my @lb_ranges = get_lb_ranges();
1156 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1158 my @offsets = (0) x 256;
1159 my $pos = 0;
1160 foreach my $i (@lead_bytes)
1162 $offsets[$i] = ($pos += 256);
1163 $cp2uni[$i] = 0;
1166 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1168 print OUTPUT pack "S<*", @header;
1169 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1170 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1171 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1173 foreach my $i (@lead_bytes)
1175 my $base = $i << 8;
1176 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1179 print OUTPUT pack "S<", 4;
1180 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1184 ################################################################
1185 # get the list of defined lead byte ranges
1186 sub get_lb_ranges()
1188 my @list = ();
1189 my @ranges = ();
1191 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1192 my $on = 0;
1193 for (my $i = 0; $i < 256; $i++)
1195 if ($on)
1197 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1199 else
1201 if ($list[$i]) { push @ranges, $i; $on = 1; }
1204 if ($on) { push @ranges, 0xff; }
1205 return @ranges;
1208 ################################################################
1209 # dump the Indic Syllabic Category table
1210 sub dump_indic($)
1212 my $filename = shift;
1213 my @indic_table;
1215 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1216 while (<$INPUT>)
1218 next if /^\#/; # skip comments
1219 next if /^\s*$/; # skip empty lines
1220 next if /\x1a/; # skip ^Z
1221 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1223 my $type = $2;
1224 die "unknown indic $type" unless defined $indic_types{$type};
1225 if (hex $1 < 65536)
1227 $indic_table[hex $1] = $indic_types{$type};
1229 next;
1231 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1233 my $type = $3;
1234 die "unknown indic $type" unless defined $indic_types{$type};
1235 if (hex $1 < 65536 and hex $2 < 65536)
1237 foreach my $i (hex $1 .. hex $2)
1239 $indic_table[$i] = $indic_types{$type};
1242 next;
1244 die "malformed line $_";
1246 close $INPUT;
1248 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1249 while (<$INPUT>)
1251 next if /^\#/; # skip comments
1252 next if /^\s*$/; # skip empty lines
1253 next if /\x1a/; # skip ^Z
1254 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1256 my $type = $2;
1257 die "unknown matra $type" unless defined $matra_types{$type};
1258 $indic_table[hex $1] |= $matra_types{$type} << 8;
1259 next;
1261 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1263 my $type = $3;
1264 die "unknown matra $type" unless defined $matra_types{$type};
1265 foreach my $i (hex $1 .. hex $2)
1267 $indic_table[$i] |= $matra_types{$type} << 8;
1269 next;
1271 die "malformed line $_";
1273 close $INPUT;
1275 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1276 print "Building $filename\n";
1277 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1278 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1279 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1280 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1281 print OUTPUT "#include \"windef.h\"\n\n";
1283 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1285 close OUTPUT;
1286 save_file($filename);
1289 ################################################################
1290 # dump the Line Break Properties table
1291 sub dump_linebreak($)
1293 my $filename = shift;
1294 my @break_table;
1296 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1297 while (<$INPUT>)
1299 next if /^\#/; # skip comments
1300 next if /^\s*$/; # skip empty lines
1301 next if /\x1a/; # skip ^Z
1302 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1304 my $type = $2;
1305 die "unknown breaktype $type" unless defined $break_types{$type};
1306 $break_table[hex $1] = $break_types{$type};
1307 next;
1309 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1311 my $type = $3;
1312 die "unknown breaktype $type" unless defined $break_types{$type};
1313 foreach my $i (hex $1 .. hex $2)
1315 $break_table[$i] = $break_types{$type};
1317 next;
1319 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1321 my $type = $2;
1322 die "unknown breaktype $type" unless defined $break_types{$type};
1323 $break_table[hex $1] = $break_types{$type};
1324 next;
1326 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1328 my $type = $3;
1329 die "unknown breaktype $type" unless defined $break_types{$type};
1330 foreach my $i (hex $1 .. hex $2)
1332 $break_table[$i] = $break_types{$type};
1334 next;
1336 die "malformed line $_";
1338 close $INPUT;
1340 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1341 print "Building $filename\n";
1342 print OUTPUT "/* Unicode Line Break Properties */\n";
1343 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1344 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1345 print OUTPUT "#include \"windef.h\"\n\n";
1347 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1349 close OUTPUT;
1350 save_file($filename);
1353 my %scripts =
1355 "Unknown" => 0,
1356 "Common" => 1,
1357 "Inherited" => 2,
1358 "Arabic" => 3,
1359 "Armenian" => 4,
1360 "Avestan" => 5,
1361 "Balinese" => 6,
1362 "Bamum" => 7,
1363 "Batak" => 8,
1364 "Bengali" => 9,
1365 "Bopomofo" => 10,
1366 "Brahmi" => 11,
1367 "Braille" => 12,
1368 "Buginese" => 13,
1369 "Buhid" => 14,
1370 "Canadian_Aboriginal" => 15,
1371 "Carian" => 16,
1372 "Cham" => 17,
1373 "Cherokee" => 18,
1374 "Coptic" => 19,
1375 "Cuneiform" => 20,
1376 "Cypriot" => 21,
1377 "Cyrillic" => 22,
1378 "Deseret" => 23,
1379 "Devanagari" => 24,
1380 "Egyptian_Hieroglyphs" => 25,
1381 "Ethiopic" => 26,
1382 "Georgian" => 27,
1383 "Glagolitic" => 28,
1384 "Gothic" => 29,
1385 "Greek" => 30,
1386 "Gujarati" => 31,
1387 "Gurmukhi" => 32,
1388 "Han" => 33,
1389 "Hangul" => 34,
1390 "Hanunoo" => 35,
1391 "Hebrew" => 36,
1392 "Hiragana" => 37,
1393 "Imperial_Aramaic" => 38,
1394 "Inscriptional_Pahlavi" => 39,
1395 "Inscriptional_Parthian" => 40,
1396 "Javanese" => 41,
1397 "Kaithi" => 42,
1398 "Kannada" => 43,
1399 "Katakana" => 44,
1400 "Kayah_Li" => 45,
1401 "Kharoshthi" => 46,
1402 "Khmer" => 47,
1403 "Lao" => 48,
1404 "Latin" => 49,
1405 "Lepcha" => 50,
1406 "Limbu" => 51,
1407 "Linear_B" => 52,
1408 "Lisu" => 53,
1409 "Lycian" => 54,
1410 "Lydian" => 55,
1411 "Malayalam" => 56,
1412 "Mandaic" => 57,
1413 "Meetei_Mayek" => 58,
1414 "Mongolian" => 59,
1415 "Myanmar" => 60,
1416 "New_Tai_Lue" => 61,
1417 "Nko" => 62,
1418 "Ogham" => 63,
1419 "Ol_Chiki" => 64,
1420 "Old_Italic" => 65,
1421 "Old_Persian" => 66,
1422 "Old_South_Arabian" => 67,
1423 "Old_Turkic" => 68,
1424 "Oriya" => 69,
1425 "Osmanya" => 70,
1426 "Phags_Pa" => 71,
1427 "Phoenician" => 72,
1428 "Rejang" => 73,
1429 "Runic" => 74,
1430 "Samaritan" => 75,
1431 "Saurashtra" => 76,
1432 "Shavian" => 77,
1433 "Sinhala" => 78,
1434 "Sundanese" => 79,
1435 "Syloti_Nagri" => 80,
1436 "Syriac" => 81,
1437 "Tagalog" => 82,
1438 "Tagbanwa" => 83,
1439 "Tai_Le" => 84,
1440 "Tai_Tham" => 85,
1441 "Tai_Viet" => 86,
1442 "Tamil" => 87,
1443 "Telugu" => 88,
1444 "Thaana" => 89,
1445 "Thai" => 90,
1446 "Tibetan" => 91,
1447 "Tifinagh" => 92,
1448 "Ugaritic" => 93,
1449 "Vai" => 94,
1450 "Yi" => 95,
1451 # Win8/Win8.1
1452 "Chakma" => 96,
1453 "Meroitic_Cursive" => 97,
1454 "Meroitic_Hieroglyphs" => 98,
1455 "Miao" => 99,
1456 "Sharada" => 100,
1457 "Sora_Sompeng" => 101,
1458 "Takri" => 102,
1459 # Win10
1460 "Bassa_Vah" => 103,
1461 "Caucasian_Albanian" => 104,
1462 "Duployan" => 105,
1463 "Elbasan" => 106,
1464 "Grantha" => 107,
1465 "Khojki" => 108,
1466 "Khudawadi" => 109,
1467 "Linear_A" => 110,
1468 "Mahajani" => 111,
1469 "Manichaean" => 112,
1470 "Mende_Kikakui" => 113,
1471 "Modi" => 114,
1472 "Mro" => 115,
1473 "Nabataean" => 116,
1474 "Old_North_Arabian" => 117,
1475 "Old_Permic" => 118,
1476 "Pahawh_Hmong" => 119,
1477 "Palmyrene" => 120,
1478 "Pau_Cin_Hau" => 121,
1479 "Psalter_Pahlavi" => 122,
1480 "Siddham" => 123,
1481 "Tirhuta" => 124,
1482 "Warang_Citi" => 125,
1483 # Win10 RS1
1484 "Adlam" => 126,
1485 "Ahom" => 127,
1486 "Anatolian_Hieroglyphs" => 128,
1487 "Bhaiksuki" => 129,
1488 "Hatran" => 130,
1489 "Marchen" => 131,
1490 "Multani" => 132,
1491 "Newa" => 133,
1492 "Old_Hungarian" => 134,
1493 "Osage" => 135,
1494 "SignWriting" => 136,
1495 "Tangut" => 137,
1496 # Win10 RS4
1497 "Masaram_Gondi" => 138,
1498 "Nushu" => 139,
1499 "Soyombo" => 140,
1500 "Zanabazar_Square" => 141,
1501 # Win10 1903
1502 "Dogra" => 142,
1503 "Gunjala_Gondi" => 143,
1504 "Hanifi_Rohingya" => 144,
1505 "Makasar" => 145,
1506 "Medefaidrin" => 146,
1507 "Old_Sogdian" => 147,
1508 "Sogdian" => 148,
1509 # Win10 2004
1510 "Elymaic" => 149,
1511 "Nyiakeng_Puachue_Hmong" => 150,
1512 "Nandinagari" => 151,
1513 "Wancho" => 152,
1516 ################################################################
1517 # dump Script IDs table
1518 sub dump_scripts($)
1520 my $filename = shift;
1521 my $header = $filename;
1522 my @scripts_table;
1523 my $script_index;
1524 my $i;
1526 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1527 # Fill the table
1528 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1529 while (<$INPUT>)
1531 my $type = "";
1533 next if /^\#/; # skip comments
1534 next if /^\s*$/; # skip empty lines
1535 next if /\x1a/; # skip ^Z
1536 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1538 $type = $2;
1539 if (defined $scripts{$type})
1541 $scripts_table[hex $1] = $scripts{$type};
1543 next;
1545 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1547 $type = $3;
1548 if (defined $scripts{$type})
1550 foreach my $i (hex $1 .. hex $2)
1552 $scripts_table[$i] = $scripts{$type};
1555 next;
1559 close $INPUT;
1561 $header = "$filename.h";
1562 open OUTPUT,">$header.new" or die "Cannot create $header";
1563 print "Building $header\n";
1564 print OUTPUT "/* Unicode Script IDs */\n";
1565 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1566 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1568 print OUTPUT "enum unicode_script_id {\n";
1569 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1571 print OUTPUT " Script_$script = $scripts{$script},\n";
1573 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1574 print OUTPUT "};\n";
1576 close OUTPUT;
1577 save_file($header);
1579 $filename = "$filename.c";
1580 open OUTPUT,">$filename.new" or die "Cannot create $header";
1581 print "Building $filename\n";
1582 print OUTPUT "/* Unicode Script IDs */\n";
1583 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1584 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1585 print OUTPUT "#include \"windef.h\"\n\n";
1587 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1588 close OUTPUT;
1589 save_file($filename);
1592 ################################################################
1593 # dump the BiDi mirroring table
1594 sub dump_mirroring($)
1596 my $filename = shift;
1597 my @mirror_table = ();
1599 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1600 while (<$INPUT>)
1602 next if /^\#/; # skip comments
1603 next if /^$/; # skip empty lines
1604 next if /\x1a/; # skip ^Z
1605 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1607 $mirror_table[hex $1] = hex $2;
1608 next;
1610 die "malformed line $_";
1612 close $INPUT;
1614 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1615 print "Building $filename\n";
1616 print OUTPUT "/* Unicode BiDi mirroring */\n";
1617 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1618 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1619 print OUTPUT "#include \"windef.h\"\n\n";
1620 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1621 close OUTPUT;
1622 save_file($filename);
1625 ################################################################
1626 # dump the Bidi Brackets
1627 sub dump_bracket($)
1629 my $filename = shift;
1630 my @bracket_table;
1632 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1633 while (<$INPUT>)
1635 next if /^\#/; # skip comments
1636 next if /^\s*$/; # skip empty lines
1637 next if /\x1a/; # skip ^Z
1638 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1640 my $type = $3;
1641 die "unknown bracket $type" unless defined $bracket_types{$type};
1642 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1643 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1644 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1645 next;
1647 die "malformed line $_";
1649 close $INPUT;
1651 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1652 print "Building $filename\n";
1653 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1654 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1655 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1656 print OUTPUT "#include \"windef.h\"\n\n";
1658 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1660 close OUTPUT;
1661 save_file($filename);
1664 ################################################################
1665 # dump the Arabic shaping table
1666 sub dump_shaping($)
1668 my $filename = shift;
1669 my @joining_table = @initial_joining_table;
1671 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1672 while (<$INPUT>)
1674 next if /^\#/; # skip comments
1675 next if /^\s*$/; # skip empty lines
1676 next if /\x1a/; # skip ^Z
1677 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1679 my $type = $2;
1680 $joining_table[hex $1] = $joining_types{$type};
1681 next;
1683 die "malformed line $_";
1685 close $INPUT;
1687 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1688 print "Building $filename\n";
1689 print OUTPUT "/* Unicode Arabic shaping */\n";
1690 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1691 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1692 print OUTPUT "#include \"windef.h\"\n\n";
1694 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1696 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1697 for (my $i = 0x600; $i <= 0x6ff; $i++)
1699 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1700 ${joining_forms{"isolated"}}[$i] || $i,
1701 ${joining_forms{"final"}}[$i] || $i,
1702 ${joining_forms{"initial"}}[$i] || $i,
1703 ${joining_forms{"medial"}}[$i] || $i;
1705 print OUTPUT "};\n";
1707 close OUTPUT;
1708 save_file($filename);
1711 ################################################################
1712 # dump the Arabic shaping table
1713 sub dump_arabic_shaping($)
1715 my $filename = shift;
1716 my @joining_table = @initial_joining_table;
1718 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1719 while (<$INPUT>)
1721 next if /^\#/; # skip comments
1722 next if /^\s*$/; # skip empty lines
1723 next if /\x1a/; # skip ^Z
1724 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1726 my $type = $2;
1727 my $group = $3;
1729 if ($group eq "ALAPH" || $group eq "DALATH RISH")
1731 $joining_table[hex $1] = $joining_types{$group};
1733 else
1735 $joining_table[hex $1] = $joining_types{$type};
1738 next;
1740 die "malformed line $_";
1742 close $INPUT;
1744 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1745 print "Building $filename\n";
1746 print OUTPUT "/* Unicode Arabic shaping */\n";
1747 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1748 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1749 print OUTPUT "#include \"windef.h\"\n\n";
1751 dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );
1753 close OUTPUT;
1754 save_file($filename);
1757 ################################################################
1758 # dump the Vertical Orientation table
1759 sub dump_vertical($)
1761 my $filename = shift;
1762 my @vertical_table;
1764 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1765 while (<$INPUT>)
1767 next if /^\#/; # skip comments
1768 next if /^\s*$/; # skip empty lines
1769 next if /\x1a/; # skip ^Z
1770 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1772 my $type = $2;
1773 die "unknown vertical $type" unless defined $vertical_types{$type};
1774 if (hex $1 < 65536)
1776 $vertical_table[hex $1] = $vertical_types{$type};
1778 next;
1780 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1782 my $type = $3;
1783 die "unknown vertical $type" unless defined $vertical_types{$type};
1784 foreach my $i (hex $1 .. hex $2)
1786 $vertical_table[$i] = $vertical_types{$type};
1788 next;
1790 die "malformed line $_";
1792 close $INPUT;
1794 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1795 print "Building $filename\n";
1796 print OUTPUT "/* Unicode Vertical Orientation */\n";
1797 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1798 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1799 print OUTPUT "#include \"windef.h\"\n\n";
1801 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1803 close OUTPUT;
1804 save_file($filename);
1807 ################################################################
1808 # dump the digit folding tables
1809 sub dump_digit_folding($)
1811 my ($filename) = shift;
1812 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1813 print "Building $filename\n";
1814 print OUTPUT "/* Unicode digit folding mappings */\n";
1815 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1816 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1817 print OUTPUT "#include \"windef.h\"\n\n";
1819 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1820 close OUTPUT;
1821 save_file($filename);
1825 ################################################################
1826 # compress a mapping table by removing identical rows
1827 sub compress_array($$@)
1829 my $rows = shift;
1830 my $def = shift;
1831 my @table = @_;
1832 my $len = @table / $rows;
1833 my @array;
1834 my $data = "";
1836 # try to merge table rows
1837 for (my $row = 0; $row < $rows; $row++)
1839 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1840 my $pos = index $data, $rowtxt;
1841 if ($pos == -1)
1843 # check if the tail of the data can match the start of the new row
1844 my $first = substr( $rowtxt, 0, 1 );
1845 for (my $i = length($data) - 1; $i > 0; $i--)
1847 $pos = index( substr( $data, -$i ), $first );
1848 last if $pos == -1;
1849 $i -= $pos;
1850 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1851 substr( $data, -$i ) = "";
1852 last;
1854 $pos = length $data;
1855 $data .= $rowtxt;
1857 $array[$row] = $rows + $pos;
1859 return @array, unpack "U*", $data;
1862 ################################################################
1863 # dump a char -> 16-bit value mapping table using two-level tables
1864 sub dump_two_level_mapping($$@)
1866 my $name = shift;
1867 my $def = shift;
1868 my $size = shift;
1869 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1870 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1871 my @array = compress_array( 256, 0, @row_array[0..4095] );
1873 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1875 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1876 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1877 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1878 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1881 ################################################################
1882 # dump a char -> value mapping table using three-level tables
1883 sub dump_three_level_mapping($$@)
1885 my $name = shift;
1886 my $def = shift;
1887 my $size = shift;
1888 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1889 my $level3 = ($MAX_CHAR + 1) / 16;
1890 my $level2 = $level3 / 16;
1891 my $level1 = $level2 / 16;
1892 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1893 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1894 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1896 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1897 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1899 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1900 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1901 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1902 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1903 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1906 ################################################################
1907 # dump a binary case mapping table in l_intl.nls format
1908 sub dump_binary_case_table(@)
1910 my (@table) = @_;
1911 my $max_char = 0x10000;
1912 my $level1 = $max_char / 16;
1913 my $level2 = $level1 / 16;
1915 my @difftable;
1916 for (my $i = 0; $i < @table; $i++)
1918 next unless defined $table[$i];
1919 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1922 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1923 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1924 my $offset = @array - $level1;
1925 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1926 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1929 ################################################################
1930 # dump case mappings for l_intl.nls
1931 sub dump_intl_nls($)
1933 my @upper_table = @toupper_table;
1934 my @lower_table = @tolower_table;
1935 remove_linguistic_mappings( \@upper_table, \@lower_table );
1937 my $upper = dump_binary_case_table( @upper_table );
1938 my $lower = dump_binary_case_table( @lower_table );
1940 my $filename = shift;
1941 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1942 printf "Building $filename\n";
1944 binmode OUTPUT;
1945 print OUTPUT pack "S<", 1; # version
1946 print OUTPUT $upper;
1947 print OUTPUT $lower;
1948 close OUTPUT;
1949 save_file($filename);
1953 ################################################################
1954 # dump the bidi direction table
1955 sub dump_bidi_dir_table($)
1957 my $filename = shift;
1958 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1959 printf "Building $filename\n";
1960 printf OUTPUT "/* Unicode BiDi direction table */\n";
1961 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1962 printf OUTPUT "#include \"windef.h\"\n\n";
1964 my @table;
1966 for (my $i = 0; $i < 65536; $i++)
1968 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1971 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1973 close OUTPUT;
1974 save_file($filename);
1978 sub rol($$)
1980 my ($byte, $count) = @_;
1981 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1984 ################################################################
1985 # compress the character properties table
1986 sub compress_char_props_table($@)
1988 my $rows = shift;
1989 my @table = @_;
1990 my $len = @table / $rows;
1991 my $pos = 0;
1992 my @array = (0) x $rows;
1993 my %sequences;
1995 # add some predefined sequences
1996 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1998 # try to merge table rows
1999 for (my $row = 0; $row < $rows; $row++)
2001 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
2002 my $rowtxt = pack "L*", @table_row;
2003 if (defined($sequences{$rowtxt}))
2005 # reuse an existing row
2006 $array[$row] = $sequences{$rowtxt};
2008 else
2010 # create a new row
2011 $sequences{$rowtxt} = $array[$row] = ++$pos;
2012 push @array, @table_row;
2015 return @array;
2018 ################################################################
2019 # dump a normalization table in binary format
2020 sub dump_norm_table($)
2022 my $filename = shift;
2024 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2025 my %decomp = ( "nfc" => \@decomp_table,
2026 "nfd" => \@decomp_table,
2027 "nfkc" => \@decomp_compat_table,
2028 "nfkd" => \@decomp_compat_table ,
2029 "idna" => \@idna_decomp_table );
2031 open OUTPUT,">$filename.new" or die "Cannot create $filename";
2032 print "Building $filename\n";
2034 my $type = $filename;
2035 $type =~ s!.*/norm(\w+)\.nls!$1!;
2037 my $compose = $forms{$type} & 1;
2038 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2040 my @version = split /\./, $UNIVERSION;
2042 # combining classes
2044 my @classes;
2045 my @class_values;
2047 foreach my $c (grep defined, @combining_class_table)
2049 $classes[$c] = 1 if $c < 0x100;
2051 for (my $i = 0; $i < @classes; $i++)
2053 next unless defined $classes[$i];
2054 $classes[$i] = @class_values;
2055 push @class_values, $i;
2057 push @class_values, 0 if (@class_values % 2);
2058 die "too many classes" if @class_values >= 0x40;
2060 # character properties
2062 my @char_props;
2063 my @decomposed;
2064 my @comp_hash_table;
2065 my $comp_hash_size = $compose ? 254 : 0;
2067 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2069 next unless defined $combining_class_table[$i];
2070 if (defined $decomp{$type}->[$i])
2072 my @dec = get_decomposition( $i, $decomp{$type} );
2073 if ($compose && (my @comp = get_composition( $i, $compat )))
2075 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2076 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2078 my $val = 0;
2079 foreach my $d (@dec)
2081 $val = $combining_class_table[$d];
2082 last if $val;
2084 $char_props[$i] = $classes[$val];
2086 else
2088 $char_props[$i] = 0xbf;
2090 @dec = compose_hangul( @dec ) if $compose;
2091 @dec = to_utf16( @dec );
2092 push @dec, 0 if @dec >= 7;
2093 $decomposed[$i] = \@dec;
2095 else
2097 if ($combining_class_table[$i] == 0x100)
2099 $char_props[$i] = 0x7f;
2101 elsif ($combining_class_table[$i])
2103 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2105 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2107 $char_props[$i] = 0xff;
2109 else
2111 $char_props[$i] = 0;
2116 if ($compose)
2118 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2120 my @comp = get_composition( $i, $compat );
2121 next unless @comp;
2122 if ($combining_class_table[$comp[1]])
2124 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2125 $char_props[$comp[1]] |= 0x40;
2127 else
2129 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2130 $char_props[$comp[1]] |= 0xc0;
2135 # surrogates
2136 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2137 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2139 # Hangul
2140 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2141 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2142 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2144 # invalid chars
2145 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2146 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2147 foreach my $i (0x00..0x10)
2149 $char_props[($i << 16) | 0xfffe] = 0xff;
2150 $char_props[($i << 16) | 0xffff] = 0xff;
2153 # decomposition hash table
2155 my @decomp_hash_table;
2156 my @decomp_hash_index;
2157 my @decomp_hash_data;
2158 my $decomp_hash_size = 944;
2160 # build string of character data, reusing substrings when possible
2161 my $decomp_char_data = "";
2162 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2164 my $str = pack "U*", @{$i};
2165 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2167 for (my $i = 0; $i < @decomposed; $i++)
2169 next unless defined $decomposed[$i];
2170 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2171 die "sequence not found" if $pos == -1;
2172 my $len = @{$decomposed[$i]};
2173 $len = 7 if $len > 7;
2174 my $hash = $i % $decomp_hash_size;
2175 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2177 for (my $i = 0; $i < $decomp_hash_size; $i++)
2179 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2180 next unless defined $decomp_hash_table[$i];
2181 if (@{$decomp_hash_table[$i]} == 1)
2183 my $entry = $decomp_hash_table[$i]->[0];
2184 if ($char_props[$entry->[0]] == 0xbf)
2186 $decomp_hash_index[$i] = $entry->[1];
2187 next;
2190 foreach my $entry (@{$decomp_hash_table[$i]})
2192 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2195 push @decomp_hash_data, 0, 0;
2197 # composition hash table
2199 my @comp_hash_index;
2200 my @comp_hash_data;
2201 if (@comp_hash_table)
2203 for (my $i = 0; $i < $comp_hash_size; $i++)
2205 $comp_hash_index[$i] = @comp_hash_data;
2206 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2208 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2209 push @comp_hash_data, 0, 0, 0;
2212 my $level1 = ($MAX_CHAR + 1) / 128;
2213 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2215 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2216 0, $decomp_hash_size, $comp_hash_size, 0 );
2217 my @tables = (0) x 8;
2219 $tables[0] = 16 + @header + @tables;
2220 $tables[1] = $tables[0] + @class_values / 2;
2221 $tables[2] = $tables[1] + $level1 / 2;
2222 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2223 $tables[4] = $tables[3] + @decomp_hash_index;
2224 $tables[5] = $tables[4] + @decomp_hash_data;
2225 $tables[6] = $tables[5] + length $decomp_char_data;
2226 $tables[7] = $tables[6] + @comp_hash_index;
2228 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2229 print OUTPUT pack "S<*", @header;
2230 print OUTPUT pack "S<*", @tables;
2231 print OUTPUT pack "C*", @class_values;
2233 print OUTPUT pack "C*", @rows[0..$level1-1];
2234 print OUTPUT pack "C*", @rows[$level1..$#rows];
2235 print OUTPUT pack "S<*", @decomp_hash_index;
2236 print OUTPUT pack "S<*", @decomp_hash_data;
2237 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2238 print OUTPUT pack "S<*", @comp_hash_index;
2239 print OUTPUT pack "S<*", @comp_hash_data;
2241 close OUTPUT;
2242 save_file($filename);
2244 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2248 ################################################################
2249 # output a codepage definition file from the global tables
2250 sub output_codepage_file($)
2252 my $codepage = shift;
2254 my $output = sprintf "nls/c_%03d.nls", $codepage;
2255 open OUTPUT,">$output.new" or die "Cannot create $output";
2257 printf "Building %s\n", $output;
2258 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2259 else { dump_binary_dbcs_table( $codepage ); }
2261 close OUTPUT;
2262 save_file($output);
2264 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2267 ################################################################
2268 # output a codepage table from a Microsoft-style mapping file
2269 sub dump_msdata_codepage($)
2271 my $filename = shift;
2273 my $state = "";
2274 my ($codepage, $width, $count);
2275 my ($lb_cur, $lb_end);
2277 @cp2uni = ();
2278 @glyph2uni = ();
2279 @lead_bytes = ();
2280 @uni2cp = ();
2281 $default_char = $DEF_CHAR;
2282 $default_wchar = $DEF_CHAR;
2284 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2286 while (<$INPUT>)
2288 next if /^;/; # skip comments
2289 next if /^\s*$/; # skip empty lines
2290 next if /\x1a/; # skip ^Z
2291 last if /^ENDCODEPAGE/;
2293 if (/^CODEPAGE\s+(\d+)/)
2295 $codepage = $1;
2296 next;
2298 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2300 $width = $1;
2301 $default_char = hex $2;
2302 $default_wchar = hex $3;
2303 next;
2305 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2307 $state = $1;
2308 $count = $2;
2309 next;
2311 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2313 if ($state eq "MBTABLE")
2315 my $cp = hex $1;
2316 my $uni = hex $2;
2317 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2318 next;
2320 if ($state eq "GLYPHTABLE")
2322 my $cp = hex $1;
2323 my $uni = hex $2;
2324 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2325 next;
2327 if ($state eq "WCTABLE")
2329 my $uni = hex $1;
2330 my $cp = hex $2;
2331 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2332 next;
2334 if ($state eq "DBCSRANGE")
2336 my $start = hex $1;
2337 my $end = hex $2;
2338 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2339 $lb_cur = $start;
2340 $lb_end = $end;
2341 next;
2343 if ($state eq "DBCSTABLE")
2345 my $mb = hex $1;
2346 my $uni = hex $2;
2347 my $cp = ($lb_cur << 8) | $mb;
2348 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2349 if (!--$count)
2351 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2353 next;
2356 die "$filename: Unrecognized line $_\n";
2358 close $INPUT;
2360 output_codepage_file( $codepage );
2362 if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
2365 ################################################################
2366 # align a string length
2367 sub align_string($$)
2369 my ($align, $str) = @_;
2370 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2371 return $str;
2374 ################################################################
2375 # pack a GUID string
2376 sub pack_guid($)
2378 $_ = shift;
2379 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2380 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2383 ################################################################
2384 # comparison function for compression sort
2385 sub cmp_compression
2387 return scalar @{$a} <=> scalar @{$b} ||
2388 $a->[4] <=> $b->[4] ||
2389 $a->[5] <=> $b->[5] ||
2390 $a->[6] <=> $b->[6] ||
2391 $a->[7] <=> $b->[7] ||
2392 $a->[8] <=> $b->[8] ||
2393 $a->[9] <=> $b->[9] ||
2394 $a->[10] <=> $b->[10] ||
2395 $a->[11] <=> $b->[11] ||
2396 $a->[12] <=> $b->[12];
2399 ################################################################
2400 # build a binary sort keys table
2401 sub dump_sortkey_table($$)
2403 my ($filename, $download) = @_;
2405 my @keys;
2406 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2407 my @multiple_weights;
2408 my @expansions;
2409 my @compressions;
2410 my %exceptions;
2411 my %guids;
2412 my %compr_flags;
2413 my %locales;
2414 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2415 my $jamostr = "";
2417 my $re_hex = '0x[0-9A-Fa-f]+';
2418 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2419 $guids{$default_guid} = { };
2421 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2423 my $KEYS = open_data_file( $MSDATA, $download );
2425 printf "Building $filename\n";
2427 while (<$KEYS>)
2429 s/\s*;.*$//;
2430 next if /^\s*$/; # skip empty lines
2431 if (/^\s*(SORTKEY|SORTTABLES)/)
2433 $part = $1;
2434 next;
2436 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2438 $part = $section = "";
2439 next;
2441 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2443 $section = $1;
2444 $guid = undef;
2445 next;
2447 next unless $part;
2448 if ("$part.$section" eq "SORTKEY.DEFAULT")
2450 if (/^\s*($re_hex)\s+$re_key/)
2452 $keys[hex $1] = [ split(/\s+/,$2) ];
2453 next;
2456 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2458 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2460 $version = hex $1;
2461 next;
2463 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2465 # ignore for now
2466 next;
2469 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2470 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2471 "$part.$section" eq "SORTTABLES.INVERSECASING")
2473 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2475 $guid = lc $1;
2476 $guids{$guid} = { } unless defined $guids{$guid};
2477 $guids{$guid}->{flags} |= $flags{$section};
2478 next;
2480 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2482 $locales{$1} = $guid;
2483 next;
2486 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2488 if (/^\s*(\d+)\s+(\d+)/)
2490 push @multiple_weights, $1, $2;
2491 next;
2494 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2496 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2498 my $pos = scalar @expansions / 2;
2499 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2500 push @expansions, hex $2, hex $3;
2501 next;
2504 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2506 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2508 $keys[hex $1] = $keys[hex $2];
2509 next;
2512 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2514 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2516 if ($subsection || !$guid) # start a new one
2518 $guid = lc $1;
2519 $subsection = "";
2520 $guids{$guid} = { } unless defined $guids{$guid};
2521 $guids{$guid}->{flags} |= $flags{$2} if $2;
2522 $guids{$guid}->{compr} = @compressions;
2523 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2524 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2525 push @compressions, [ ];
2527 else # merge with current one
2529 $guids{lc $1} = { } unless defined $guids{lc $1};
2530 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2531 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2532 $compr_flags{lc $1} = $compr_flags{$guid};
2534 next;
2536 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2538 $locales{$1} = $guid;
2539 next;
2541 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2543 $subsection = $1;
2544 next;
2546 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2548 my @comp = map { hex $_; } split(/\s+/,$1);
2549 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2550 # add compression flags
2551 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2552 next;
2555 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2557 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2559 $guid = lc $1;
2560 $guids{$guid} = { } unless defined $guids{lc $1};
2561 $ling_flag = ($2 ? "+" : "-");
2562 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2563 next;
2565 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2567 $locales{$1} = $guid;
2568 next;
2570 if (/^\s*($re_hex)\s+$re_key/)
2572 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2573 next;
2576 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2578 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2580 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2581 next;
2584 die "$download: $part.$section: unrecognized line $_\n";
2586 close $KEYS;
2588 # Sortkey table
2590 my $table;
2591 for (my $i = 0; $i < 0x10000; $i++)
2593 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2594 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2597 foreach my $id (sort keys %exceptions)
2599 my $pos = length($table) / 4;
2600 my @exc = @{$exceptions{$id}};
2601 my @filled;
2602 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2603 my $guid = substr( $id, 0, -1 );
2604 $guids{$guid}->{$key} = $pos;
2605 $pos += 0x100;
2606 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2607 for (my $j = 0; $j < 0x10000; $j++)
2609 next unless defined $exc[$j] || defined $flags[$j];
2610 $filled[$j >> 8] = 1;
2611 $j |= 0xff;
2613 for (my $j = 0; $j < 0x100; $j++)
2615 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2616 $pos += 0x100 if $filled[$j];
2618 for (my $j = 0; $j < 0x10000; $j++)
2620 next unless $filled[$j >> 8];
2621 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2622 $k[3] |= $flags[$j] || 0;
2623 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2627 # Case mapping tables
2629 # standard table
2630 my @casemaps;
2631 my @upper = @toupper_table;
2632 my @lower = @tolower_table;
2633 remove_linguistic_mappings( \@upper, \@lower );
2634 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2636 # linguistic table
2637 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2639 # Turkish table
2640 @upper = @toupper_table;
2641 @lower = @tolower_table;
2642 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2643 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2644 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2645 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2647 # Char type table
2649 my @table;
2650 my $types = "";
2651 my %typestr;
2652 for (my $i = 0; $i < 0x10000; $i++)
2654 my $str = pack "S<3",
2655 ($category_table[$i] || 0) & 0xffff,
2656 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2657 ($category_table[$i] || 0) >> 16;
2659 if (!defined($typestr{$str}))
2661 $typestr{$str} = length($types) / 6;
2662 $types .= $str;
2664 $table[$i] = $typestr{$str};
2667 my @rows = compress_array( 4096, 0, @table[0..65535] );
2668 my @array = compress_array( 256, 0, @rows[0..4095] );
2669 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2670 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2672 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2673 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2674 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2676 # Sort tables
2678 # guids
2679 my $sorttables = pack "L<2", $version, scalar %guids;
2680 foreach my $id (sort keys %guids)
2682 my %guid = %{$guids{$id}};
2683 my $flags = $guid{flags} || 0;
2684 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2685 $sorttables .= pack_guid($id) . pack "L<5",
2686 $flags,
2687 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2688 $guid{except} || 0,
2689 $guid{ling_except} || 0,
2690 $map / 2;
2693 # expansions
2694 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2696 # compressions
2697 $sorttables .= pack "L<", scalar @compressions;
2698 my $rowstr = "";
2699 foreach my $c (@compressions)
2701 my $pos = length($rowstr) / 2;
2702 my $min = 0xffff;
2703 my $max = 0;
2704 my @lengths = (0) x 8;
2705 foreach my $r (sort cmp_compression @{$c})
2707 my @row = @{$r};
2708 $lengths[scalar @row - 6]++;
2709 foreach my $val (@row[4..$#row])
2711 $min = $val if $min > $val;
2712 $max = $val if $max < $val;
2714 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2715 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2717 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2719 $sorttables .= $rowstr;
2721 # multiple weights
2722 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2724 # jamo sort
2725 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2727 # Locales
2729 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2730 foreach my $loc (sort keys %locales)
2732 # skip specific locales that match more general ones
2733 my @parts = split /[-_]/, $loc;
2734 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2735 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2736 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2739 # File header
2741 my @header;
2742 $header[0] = 16;
2743 $header[1] = $header[0] + length $table;
2744 $header[2] = $header[1] + length $casemaps;
2745 $header[3] = $header[2] + length $chartypes;
2747 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2748 print OUTPUT pack "L<*", @header;
2749 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2750 close OUTPUT;
2751 save_file($filename);
2755 ################################################################
2756 # build the script to create registry keys
2757 sub dump_registry_script($%)
2759 my ($filename, %keys) = @_;
2760 my $indent = 1;
2762 printf "Building %s\n", $filename;
2763 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2764 print OUTPUT "HKLM\n{\n";
2765 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2767 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2768 $indent++;
2770 foreach my $k (sort keys %keys)
2772 my @subkeys = split /\\/, $k;
2773 my ($def, @vals) = @{$keys{$k}};
2774 for (my $i = 0; $i < @subkeys; $i++)
2776 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2777 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2778 $indent++;
2780 foreach my $v (sort @vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2781 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2783 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2784 close OUTPUT;
2785 save_file($filename);
2789 ################################################################
2790 # save a file if modified
2791 sub save_file($)
2793 my $file = shift;
2794 if (-f $file && !system "cmp $file $file.new >/dev/null")
2796 unlink "$file.new";
2798 else
2800 rename "$file.new", "$file";
2805 ################################################################
2806 # main routine
2808 chdir ".." if -f "./make_unicode";
2809 load_data();
2810 dump_sortkeys( "dlls/kernelbase/collation.c" );
2811 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2812 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2813 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2814 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2815 dump_mirroring( "dlls/dwrite/mirror.c" );
2816 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2817 dump_bracket( "dlls/dwrite/bracket.c" );
2818 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2819 dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
2820 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2821 dump_linebreak( "dlls/dwrite/linebreak.c" );
2822 dump_scripts( "dlls/dwrite/scripts" );
2823 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2824 dump_vertical( "dlls/gdi32/vertical.c" );
2825 dump_vertical( "dlls/wineps.drv/vertical.c" );
2826 dump_intl_nls("nls/l_intl.nls");
2827 dump_norm_table( "nls/normnfc.nls" );
2828 dump_norm_table( "nls/normnfd.nls" );
2829 dump_norm_table( "nls/normnfkc.nls" );
2830 dump_norm_table( "nls/normnfkd.nls" );
2831 dump_norm_table( "nls/normidna.nls" );
2832 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2833 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2834 dump_eucjp_codepage();
2835 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2837 exit 0;
2839 # Local Variables:
2840 # compile-command: "./make_unicode"
2841 # End: