winhttp: Call read_data() at least once in drain_content().
[wine.git] / tools / make_unicode
blob4f060876b67f5dba1a35f2b5757572dcba418abe
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "14.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
30 my $REPORTS = "http://www.unicode.org/reports";
31 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
32 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
34 # Sort keys file
35 my $SORTKEYS = "tr10/allkeys.txt";
37 # Default char for undefined mappings
38 my $DEF_CHAR = ord '?';
40 # Last valid Unicode character
41 my $MAX_CHAR = 0x10ffff;
43 my @allfiles =
45 "CodpageFiles/037.txt",
46 "CodpageFiles/437.txt",
47 "CodpageFiles/500.txt",
48 "CodpageFiles/708.txt",
49 "CodpageFiles/720.txt",
50 "CodpageFiles/737.txt",
51 "CodpageFiles/775.txt",
52 "CodpageFiles/850.txt",
53 "CodpageFiles/852.txt",
54 "CodpageFiles/855.txt",
55 "CodpageFiles/857.txt",
56 "CodpageFiles/860.txt",
57 "CodpageFiles/861.txt",
58 "CodpageFiles/862.txt",
59 "CodpageFiles/863.txt",
60 "CodpageFiles/864.txt",
61 "CodpageFiles/865.txt",
62 "CodpageFiles/866.txt",
63 "CodpageFiles/869.txt",
64 "CodpageFiles/874.txt",
65 "CodpageFiles/875.txt",
66 "CodpageFiles/932.txt",
67 "CodpageFiles/936.txt",
68 "CodpageFiles/949.txt",
69 "CodpageFiles/950.txt",
70 "CodpageFiles/1026.txt",
71 "CodpageFiles/1250.txt",
72 "CodpageFiles/1251.txt",
73 "CodpageFiles/1252.txt",
74 "CodpageFiles/1253.txt",
75 "CodpageFiles/1254.txt",
76 "CodpageFiles/1255.txt",
77 "CodpageFiles/1256.txt",
78 "CodpageFiles/1257.txt",
79 "CodpageFiles/1258.txt",
80 "CodpageFiles/1361.txt",
81 "CodpageFiles/10000.txt",
82 "CodpageFiles/10001.txt",
83 "CodpageFiles/10002.txt",
84 "CodpageFiles/10003.txt",
85 "CodpageFiles/10004.txt",
86 "CodpageFiles/10005.txt",
87 "CodpageFiles/10006.txt",
88 "CodpageFiles/10007.txt",
89 "CodpageFiles/10008.txt",
90 "CodpageFiles/10010.txt",
91 "CodpageFiles/10017.txt",
92 "CodpageFiles/10021.txt",
93 "CodpageFiles/10029.txt",
94 "CodpageFiles/10079.txt",
95 "CodpageFiles/10081.txt",
96 "CodpageFiles/10082.txt",
97 "CodpageFiles/20127.txt",
98 "CodpageFiles/20866.txt",
99 "CodpageFiles/21866.txt",
100 "CodpageFiles/28591.txt",
101 "CodpageFiles/28592.txt",
102 "CodpageFiles/28593.txt",
103 "CodpageFiles/28594.txt",
104 "CodpageFiles/28595.txt",
105 "CodpageFiles/28596.txt",
106 "CodpageFiles/28597.txt",
107 "CodpageFiles/28598.txt",
108 "CodpageFiles/28599.txt",
109 "CodpageFiles/28603.txt",
110 "CodpageFiles/28605.txt",
114 my %ctype =
116 # CT_CTYPE1
117 "upper" => 0x0001,
118 "lower" => 0x0002,
119 "digit" => 0x0004,
120 "space" => 0x0008,
121 "punct" => 0x0010,
122 "cntrl" => 0x0020,
123 "blank" => 0x0040,
124 "xdigit" => 0x0080,
125 "alpha" => 0x0100 | 0x80000000,
126 "defin" => 0x0200,
127 # CT_CTYPE3 in high 16 bits
128 "nonspacing" => 0x00010000,
129 "diacritic" => 0x00020000,
130 "vowelmark" => 0x00040000,
131 "symbol" => 0x00080000,
132 "katakana" => 0x00100000,
133 "hiragana" => 0x00200000,
134 "halfwidth" => 0x00400000,
135 "fullwidth" => 0x00800000,
136 "ideograph" => 0x01000000,
137 "kashida" => 0x02000000,
138 "lexical" => 0x04000000,
139 "highsurrogate" => 0x08000000,
140 "lowsurrogate" => 0x10000000,
143 my %bracket_types =
145 "o" => 0x0000,
146 "c" => 0x0001,
149 my %indic_types =
151 "Other" => 0x0000,
152 "Bindu" => 0x0001,
153 "Visarga" => 0x0002,
154 "Avagraha" => 0x0003,
155 "Nukta" => 0x0004,
156 "Virama" => 0x0005,
157 "Vowel_Independent" => 0x0006,
158 "Vowel_Dependent" => 0x0007,
159 "Vowel" => 0x0008,
160 "Consonant_Placeholder" => 0x0009,
161 "Consonant" => 0x000a,
162 "Consonant_Dead" => 0x000b,
163 "Consonant_Succeeding_Repha" => 0x000c,
164 "Consonant_Subjoined" => 0x000d,
165 "Consonant_Medial" => 0x000e,
166 "Consonant_Final" => 0x000f,
167 "Consonant_Head_Letter" => 0x0010,
168 "Modifying_Letter" => 0x0011,
169 "Tone_Letter" => 0x0012,
170 "Tone_Mark" => 0x0013,
171 "Register_Shifter" => 0x0014,
172 "Consonant_Preceding_Repha" => 0x0015,
173 "Pure_Killer" => 0x0016,
174 "Invisible_Stacker" => 0x0017,
175 "Gemination_Mark" => 0x0018,
176 "Cantillation_Mark" => 0x0019,
177 "Non_Joiner" => 0x001a,
178 "Joiner" => 0x001b,
179 "Number_Joiner" => 0x001c,
180 "Number" => 0x001d,
181 "Brahmi_Joining_Number" => 0x001e,
182 "Consonant_With_Stacker" => 0x001f,
183 "Consonant_Prefixed" => 0x0020,
184 "Syllable_Modifier" => 0x0021,
185 "Consonant_Killer" => 0x0022,
186 "Consonant_Initial_Postfixed" => 0x0023,
189 my %matra_types =
191 "Right" => 0x01,
192 "Left" => 0x02,
193 "Visual_Order_Left" => 0x03,
194 "Left_And_Right" => 0x04,
195 "Top" => 0x05,
196 "Bottom" => 0x06,
197 "Top_And_Bottom" => 0x07,
198 "Top_And_Right" => 0x08,
199 "Top_And_Left" => 0x09,
200 "Top_And_Left_And_Right" => 0x0a,
201 "Bottom_And_Right" => 0x0b,
202 "Top_And_Bottom_And_Right" => 0x0c,
203 "Overstruck" => 0x0d,
204 "Invisible" => 0x0e,
205 "Bottom_And_Left" => 0x0f,
206 "Top_And_Bottom_And_Left" => 0x10,
209 my %break_types =
211 "BK" => 0x0001,
212 "CR" => 0x0002,
213 "LF" => 0x0003,
214 "CM" => 0x0004,
215 "SG" => 0x0005,
216 "GL" => 0x0006,
217 "CB" => 0x0007,
218 "SP" => 0x0008,
219 "ZW" => 0x0009,
220 "NL" => 0x000a,
221 "WJ" => 0x000b,
222 "JL" => 0x000c,
223 "JV" => 0x000d,
224 "JT" => 0x000e,
225 "H2" => 0x000f,
226 "H3" => 0x0010,
227 "XX" => 0x0011,
228 "OP" => 0x0012,
229 "CL" => 0x0013,
230 "CP" => 0x0014,
231 "QU" => 0x0015,
232 "NS" => 0x0016,
233 "EX" => 0x0017,
234 "SY" => 0x0018,
235 "IS" => 0x0019,
236 "PR" => 0x001a,
237 "PO" => 0x001b,
238 "NU" => 0x001c,
239 "AL" => 0x001d,
240 "ID" => 0x001e,
241 "IN" => 0x001f,
242 "HY" => 0x0020,
243 "BB" => 0x0021,
244 "BA" => 0x0022,
245 "SA" => 0x0023,
246 "AI" => 0x0024,
247 "B2" => 0x0025,
248 "HL" => 0x0026,
249 "CJ" => 0x0027,
250 "RI" => 0x0028,
251 "EB" => 0x0029,
252 "EM" => 0x002a,
253 "ZWJ" => 0x002b,
256 my %vertical_types =
258 "R" => 0x0000,
259 "U" => 0x0001,
260 "Tr" => 0x0002,
261 "Tu" => 0x0003,
264 my %categories =
266 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
267 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
268 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
269 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
270 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
271 "Me" => $ctype{"defin"}, # Mark, Enclosing
272 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
273 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
274 "No" => $ctype{"defin"}, # Number, Other
275 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
276 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
277 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
278 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
279 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
280 "Cs" => $ctype{"defin"}, # Other, Surrogate
281 "Co" => $ctype{"defin"}, # Other, Private Use
282 "Cn" => $ctype{"defin"}, # Other, Not Assigned
283 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
284 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
285 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
286 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
287 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
288 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
289 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
290 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
291 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
292 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
293 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
294 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
295 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
298 # a few characters need additional categories that cannot be determined automatically
299 my %special_categories =
301 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
302 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
303 "space" => [ 0x09..0x0d, 0x85 ],
304 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
305 "cntrl" => [ 0x070f, 0x200c, 0x200d,
306 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
307 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
308 0xfff9, 0xfffa, 0xfffb ],
309 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
310 0xd7, 0xf7 ],
311 "digit" => [ 0xb2, 0xb3, 0xb9 ],
312 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
313 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
314 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
315 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
316 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
317 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
318 0x02b9..0x02ba, 0x02c6..0x02cf ],
319 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
320 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
321 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
322 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
323 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
324 0x3131..0x3164 ],
325 "ideograph" => [ 0x3006..0x3007 ],
326 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
327 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
328 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
329 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
330 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
331 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
332 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
333 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
334 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
335 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
336 "kashida" => [ 0x0640 ],
339 my %directions =
341 "L" => 1, # Left-to-Right
342 "R" => 2, # Right-to-Left
343 "AL" => 12, # Right-to-Left Arabic
344 "EN" => 3, # European Number
345 "ES" => 4, # European Number Separator
346 "ET" => 5, # European Number Terminator
347 "AN" => 6, # Arabic Number
348 "CS" => 7, # Common Number Separator
349 "NSM" => 13, # Non-Spacing Mark
350 "BN" => 14, # Boundary Neutral
351 "B" => 8, # Paragraph Separator
352 "S" => 9, # Segment Separator
353 "WS" => 10, # Whitespace
354 "ON" => 11, # Other Neutrals
355 "LRE" => 15, # Left-to-Right Embedding
356 "LRO" => 15, # Left-to-Right Override
357 "RLE" => 15, # Right-to-Left Embedding
358 "RLO" => 15, # Right-to-Left Override
359 "PDF" => 15, # Pop Directional Format
360 "LRI" => 15, # Left-to-Right Isolate
361 "RLI" => 15, # Right-to-Left Isolate
362 "FSI" => 15, # First Strong Isolate
363 "PDI" => 15 # Pop Directional Isolate
366 my %c2_types =
368 "L" => 1, # C2_LEFTTORIGHT
369 "R" => 2, # C2_RIGHTTOLEFT
370 "AL" => 2, # C2_RIGHTTOLEFT
371 "EN" => 3, # C2_EUROPENUMBER
372 "ES" => 4, # C2_EUROPESEPARATOR
373 "ET" => 5, # C2_EUROPETERMINATOR
374 "AN" => 6, # C2_ARABICNUMBER
375 "CS" => 7, # C2_COMMONSEPARATOR
376 "NSM" => 11, # C2_OTHERNEUTRAL
377 "BN" => 0, # C2_NOTAPPLICABLE
378 "B" => 8, # C2_BLOCKSEPARATOR
379 "S" => 9, # C2_SEGMENTSEPARATOR
380 "WS" => 10, # C2_WHITESPACE
381 "ON" => 11, # C2_OTHERNEUTRAL
382 "LRE" => 11, # C2_OTHERNEUTRAL
383 "LRO" => 11, # C2_OTHERNEUTRAL
384 "RLE" => 11, # C2_OTHERNEUTRAL
385 "RLO" => 11, # C2_OTHERNEUTRAL
386 "PDF" => 11, # C2_OTHERNEUTRAL
387 "LRI" => 11, # C2_OTHERNEUTRAL
388 "RLI" => 11, # C2_OTHERNEUTRAL
389 "FSI" => 11, # C2_OTHERNEUTRAL
390 "PDI" => 11 # C2_OTHERNEUTRAL
393 my %bidi_types =
395 "ON" => 0, # Other Neutrals
396 "L" => 1, # Left-to-Right
397 "R" => 2, # Right-to-Left
398 "AN" => 3, # Arabic Number
399 "EN" => 4, # European Number
400 "AL" => 5, # Right-to-Left Arabic
401 "NSM" => 6, # Non-Spacing Mark
402 "CS" => 7, # Common Number Separator
403 "ES" => 8, # European Number Separator
404 "ET" => 9, # European Number Terminator
405 "BN" => 10, # Boundary Neutral
406 "S" => 11, # Segment Separator
407 "WS" => 12, # Whitespace
408 "B" => 13, # Paragraph Separator
409 "RLO" => 14, # Right-to-Left Override
410 "RLE" => 15, # Right-to-Left Embedding
411 "LRO" => 16, # Left-to-Right Override
412 "LRE" => 17, # Left-to-Right Embedding
413 "PDF" => 18, # Pop Directional Format
414 "LRI" => 19, # Left-to-Right Isolate
415 "RLI" => 20, # Right-to-Left Isolate
416 "FSI" => 21, # First Strong Isolate
417 "PDI" => 22 # Pop Directional Isolate
420 my %joining_types =
422 "U" => 0, # Non_Joining
423 "L" => 1, # Left_Joining
424 "R" => 2, # Right_Joining
425 "D" => 3, # Dual_Joining
426 "C" => 3, # Join_Causing
427 "ALAPH" => 4, # Syriac ALAPH
428 "DALATH RISH" => 5, # Syriac DALATH RISH group
429 "T" => 6, # Transparent
432 my @cp2uni = ();
433 my @glyph2uni = ();
434 my @lead_bytes = ();
435 my @uni2cp = ();
436 my @tolower_table = ();
437 my @toupper_table = ();
438 my @digitmap_table = ();
439 my @category_table = ();
440 my @initial_joining_table = ();
441 my @direction_table = ();
442 my @decomp_table = ();
443 my @combining_class_table = ();
444 my @decomp_compat_table = ();
445 my @comp_exclusions = ();
446 my @idna_decomp_table = ();
447 my @idna_disallowed = ();
448 my %registry_keys;
449 my $default_char;
450 my $default_wchar;
452 my %joining_forms =
454 "isolated" => [],
455 "final" => [],
456 "initial" => [],
457 "medial" => []
460 sub to_utf16(@)
462 my @ret;
463 foreach my $ch (@_)
465 if ($ch < 0x10000)
467 push @ret, $ch;
469 else
471 my $val = $ch - 0x10000;
472 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
475 return @ret;
478 ################################################################
479 # fetch a unicode.org file and open it
480 sub open_data_file($$)
482 my ($base, $name) = @_;
483 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
484 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
485 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
486 local *FILE;
488 if ($base =~ /.*\/([^\/]+)\.zip$/)
490 my $zip = "$1$suffix.zip";
491 unless (-f "$cache/$zip")
493 system "mkdir", "-p", $cache;
494 print "Fetching $base...\n";
495 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
497 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
499 else
501 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
502 unless (-f $dest)
504 system "mkdir", "-p", $dir;
505 print "Fetching $base/$name...\n";
506 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
508 open FILE, "<$dest" or die "cannot open $dest";
510 return *FILE;
513 ################################################################
514 # recursively get the decomposition for a character
515 sub get_decomposition($$);
516 sub get_decomposition($$)
518 my ($char, $table) = @_;
519 my @ret;
521 return $char unless defined $table->[$char];
522 foreach my $ch (@{$table->[$char]})
524 push @ret, get_decomposition( $ch, $table );
526 return @ret;
529 ################################################################
530 # get the composition that results in a given character
531 sub get_composition($$)
533 my ($ch, $compat) = @_;
534 return () unless defined $decomp_table[$ch]; # no decomposition
535 my @ret = @{$decomp_table[$ch]};
536 return () if @ret < 2; # singleton decomposition
537 return () if $comp_exclusions[$ch]; # composition exclusion
538 return () if $combining_class_table[$ch]; # non-starter
539 return () if $combining_class_table[$ret[0]]; # first char is non-starter
540 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
541 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
542 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
543 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
544 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
545 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
546 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
547 return @ret;
550 ################################################################
551 # recursively build decompositions
552 sub build_decompositions(@)
554 my @src = @_;
555 my @dst;
557 for (my $i = 0; $i < @src; $i++)
559 next unless defined $src[$i];
560 my @decomp = to_utf16( get_decomposition( $i, \@src ));
561 $dst[$i] = \@decomp;
563 return @dst;
566 ################################################################
567 # compose Hangul sequences
568 sub compose_hangul(@)
570 my $SBASE = 0xac00;
571 my $LBASE = 0x1100;
572 my $VBASE = 0x1161;
573 my $TBASE = 0x11a7;
574 my $LCOUNT = 19;
575 my $VCOUNT = 21;
576 my $TCOUNT = 28;
577 my $NCOUNT = $VCOUNT * $TCOUNT;
578 my $SCOUNT = $LCOUNT * $NCOUNT;
580 my @seq = @_;
581 my @ret;
582 my $i;
584 for ($i = 0; $i < @seq; $i++)
586 my $ch = $seq[$i];
587 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
588 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
590 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
591 $i++;
593 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
594 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
596 $ch += $seq[$i+1] - $TBASE;
597 $i++;
599 push @ret, $ch;
601 return @ret;
604 ################################################################
605 # remove linguistic-only mappings from the case table
606 sub remove_linguistic_mappings($$)
608 my ($upper, $lower) = @_;
610 # remove case mappings that don't round-trip
612 for (my $i = 0; $i < @{$upper}; $i++)
614 next unless defined ${$upper}[$i];
615 my $ch = ${$upper}[$i];
616 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
618 for (my $i = 0; $i < @{$lower}; $i++)
620 next unless defined ${$lower}[$i];
621 my $ch = ${$lower}[$i];
622 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
626 ################################################################
627 # read in the Unicode database files
628 sub load_data()
630 my $start;
632 # now build mappings from the decomposition field of the Unicode database
634 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
635 while (<$UNICODE_DATA>)
637 # Decode the fields ...
638 my ($code, $name, $cat, $comb, $bidi,
639 $decomp, $dec, $dig, $num, $mirror,
640 $oldname, $comment, $upper, $lower, $title) = split /;/;
641 my $src = hex $code;
643 die "unknown category $cat" unless defined $categories{$cat};
644 die "unknown directionality $bidi" unless defined $directions{$bidi};
646 $category_table[$src] = $categories{$cat};
647 $direction_table[$src] = $bidi;
648 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
650 $initial_joining_table[$src] = $joining_types{"T"};
652 else
654 $initial_joining_table[$src] = $joining_types{"U"};
657 if ($lower ne "")
659 $tolower_table[$src] = hex $lower;
661 if ($upper ne "")
663 $toupper_table[$src] = hex $upper;
665 if ($dec ne "")
667 $category_table[$src] |= $ctype{"digit"};
669 if ($dig ne "")
671 $digitmap_table[$src] = ord $dig;
673 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
675 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
676 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
677 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
678 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
679 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
680 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
681 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
682 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
683 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
684 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
685 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
686 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
688 # copy the category and direction for everything between First/Last pairs
689 if ($name =~ /, First>/) { $start = $src; }
690 if ($name =~ /, Last>/)
692 while ($start < $src)
694 $category_table[$start] = $category_table[$src];
695 $direction_table[$start] = $direction_table[$src];
696 $combining_class_table[$start] = $combining_class_table[$src];
697 $start++;
701 next if $decomp eq ""; # no decomposition, skip it
703 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
705 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
706 $decomp_compat_table[$src] = \@seq;
709 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
711 # decomposition of the form "<foo> 1234" -> use char if type is known
712 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
714 ${joining_forms{$1}}[hex $2] = $src;
717 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
719 # decomposition "<compat> 0020 1234" -> combining accent
721 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
723 # store decomposition
724 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
726 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
728 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
730 # Single char decomposition
731 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
735 close $UNICODE_DATA;
737 # patch the category of some special characters
739 for (my $i = 0; $i < @decomp_table; $i++)
741 next unless defined $decomp_table[$i];
742 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
744 foreach my $cat (keys %special_categories)
746 my $flag = $ctype{$cat};
747 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
749 for (my $i = 0; $i < @decomp_compat_table; $i++)
751 next unless defined $decomp_compat_table[$i];
752 next unless @{$decomp_compat_table[$i]} == 2;
753 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
756 # load the composition exclusions
758 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
759 while (<$EXCL>)
761 s/\#.*//; # remove comments
762 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
764 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
766 elsif (/^([0-9a-fA-F]+)\s*$/)
768 $comp_exclusions[hex $1] = 1;
771 close $EXCL;
773 # load the IDNA mappings
775 @idna_decomp_table = @decomp_compat_table;
776 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
777 while (<$IDNA>)
779 s/\#.*//; # remove comments
780 next if /^\s*$/;
781 my ($char, $type, $mapping) = split /;/;
782 my ($ch1, $ch2);
783 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
785 $ch1 = hex $1;
786 $ch2 = hex $2;
788 elsif ($char =~ /([0-9a-fA-F]+)/)
790 $ch1 = $ch2 = hex $1;
793 if ($type =~ /mapped/ || $type =~ /deviation/)
795 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
796 my @seq = map { hex $_; } split /\s+/, $mapping;
797 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
799 elsif ($type =~ /valid/)
802 elsif ($type =~ /ignored/)
804 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
806 elsif ($type =~ /disallowed/)
808 foreach my $i ($ch1 .. $ch2)
810 $idna_decomp_table[$i] = undef;
811 $idna_disallowed[$i] = 1;
815 close $IDNA;
819 ################################################################
820 # add a new registry key
821 sub add_registry_key($$)
823 my ($key, $defval) = @_;
824 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
827 ################################################################
828 # add a new registry value
829 sub add_registry_value($$$)
831 my ($key, $name, $value) = @_;
832 add_registry_key( $key, undef );
833 push @{$registry_keys{$key}}, "'$name' = s '$value'";
836 ################################################################
837 # define a new lead byte
838 sub add_lead_byte($)
840 my $ch = shift;
841 return if defined $cp2uni[$ch];
842 push @lead_bytes, $ch;
843 $cp2uni[$ch] = 0;
846 ################################################################
847 # define a new char mapping
848 sub add_mapping($$)
850 my ($cp, $uni) = @_;
851 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
852 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
853 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
856 ################################################################
857 # get a mapping including glyph chars for MB_USEGLYPHCHARS
858 sub get_glyphs_mapping(@)
860 my @table = @_;
862 for (my $i = 0; $i < @glyph2uni; $i++)
864 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
866 return @table;
869 ################################################################
870 # build EUC-JP table from the JIS 0208/0212 files
871 sub dump_eucjp_codepage()
873 @cp2uni = ();
874 @glyph2uni = ();
875 @lead_bytes = ();
876 @uni2cp = ();
877 $default_char = $DEF_CHAR;
878 $default_wchar = 0x30fb;
880 # ASCII chars
881 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
883 # lead bytes
884 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
886 # JIS X 0201 right plane
887 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
889 # undefined chars
890 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
891 $cp2uni[0xa0] = 0xf8f0;
892 $cp2uni[0xff] = 0xf8f3;
894 # Fix backslash conversion
895 add_mapping( 0xa1c0, 0xff3c );
897 # Add private mappings for rows undefined in JIS 0208/0212
898 my $private = 0xe000;
899 foreach my $hi (0xf5 .. 0xfe)
901 foreach my $lo (0xa1 .. 0xfe)
903 add_mapping( ($hi << 8) + $lo, $private++ );
906 foreach my $hi (0xf5 .. 0xfe)
908 foreach my $lo (0x21 .. 0x7e)
910 add_mapping( ($hi << 8) + $lo, $private++ );
914 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
915 while (<$INPUT>)
917 next if /^\#/; # skip comments
918 next if /^$/; # skip empty lines
919 next if /\x1a/; # skip ^Z
920 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
922 add_mapping( 0x8080 + hex $1, hex $2 );
923 next;
925 die "Unrecognized line $_\n";
927 close $INPUT;
929 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
930 while (<$INPUT>)
932 next if /^\#/; # skip comments
933 next if /^$/; # skip empty lines
934 next if /\x1a/; # skip ^Z
935 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
937 add_mapping( 0x8000 + hex $1, hex $2 );
938 next;
940 die "Unrecognized line $_\n";
942 close $INPUT;
944 output_codepage_file( 20932 );
947 ################################################################
948 # build Korean Wansung table from the KSX1001 file
949 sub dump_krwansung_codepage(@)
951 my @cp949 = @_;
952 @cp2uni = ();
953 @glyph2uni = ();
954 @lead_bytes = ();
955 @uni2cp = ();
956 $default_char = 0x3f;
957 $default_wchar = 0x003f;
959 # ASCII and undefined chars
960 foreach my $i (0x00 .. 0x9f) { add_mapping( $i, $i ); }
961 add_mapping( 0xa0, 0xf8e6 );
962 add_mapping( 0xad, 0xf8e7 );
963 add_mapping( 0xae, 0xf8e8 );
964 add_mapping( 0xaf, 0xf8e9 );
965 add_mapping( 0xfe, 0xf8ea );
966 add_mapping( 0xff, 0xf8eb );
968 my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
969 while (<$INPUT>)
971 next if /^\#/; # skip comments
972 next if /^$/; # skip empty lines
973 next if /\x1a/; # skip ^Z
974 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
976 add_mapping( 0x8080 + hex $1, hex $2 );
977 next;
979 die "Unrecognized line $_\n";
981 close $INPUT;
983 # get some extra mappings from cp 949
984 my @defined_lb;
985 map { $defined_lb[$_] = 1; } @lead_bytes;
986 foreach my $i (0x0000 .. 0xffff)
988 next if ($i >= 0x1100 && $i <= 0x11ff); # range not used in 20949
989 next unless defined $cp949[$i];
990 if ($cp949[$i] >= 0xff)
992 # only add chars for lead bytes that exist in 20949
993 my $hi = $cp949[$i] >> 8;
994 my $lo = $cp949[$i] & 0xff;
995 next unless $defined_lb[$hi];
996 next unless $lo >= 0xa1 && $lo <= 0xfe;
998 add_mapping( $cp949[$i], $i );
1001 output_codepage_file( 20949 );
1004 ################################################################
1005 # build the sort keys table
1006 sub dump_sortkeys($)
1008 my $filename = shift;
1009 my @sortkeys = ();
1011 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
1012 while (<$INPUT>)
1014 next if /^\#/; # skip comments
1015 next if /^$/; # skip empty lines
1016 next if /\x1a/; # skip ^Z
1017 next if /^\@version/; # skip @version header
1018 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1020 my ($uni,$variable) = (hex $1, $2);
1021 next if $uni > 65535;
1022 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
1023 next;
1025 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1027 # multiple character sequence, ignored for now
1028 next;
1030 die "$SORTKEYS: Unrecognized line $_\n";
1032 close $INPUT;
1034 # compress the keys to 32 bit:
1035 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
1037 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
1038 ${$a}[2] <=> ${$b}[2] or
1039 ${$a}[3] <=> ${$b}[3] or
1040 ${$a}[4] <=> ${$b}[4] or
1041 $a cmp $b; } @sortkeys;
1043 my ($n2, $n3) = (1, 1);
1044 my @keys = (-1, -1, -1, -1, -1 );
1045 my @flatkeys = ();
1047 for (my $i = 0; $i < @sortkeys; $i++)
1049 next unless defined $sortkeys[$i];
1050 my @current = @{$sortkeys[$i]};
1051 if ($current[1] == $keys[1])
1053 if ($current[2] == $keys[2])
1055 if ($current[3] == $keys[3])
1057 # nothing
1059 else
1061 $keys[3] = $current[3];
1062 $n3++;
1063 die if ($n3 >= 16);
1066 else
1068 $keys[2] = $current[2];
1069 $keys[3] = $current[3];
1070 $n2++;
1071 $n3 = 1;
1072 die if ($n2 >= 256);
1075 else
1077 $keys[1] = $current[1];
1078 $keys[2] = $current[2];
1079 $keys[3] = $current[3];
1080 $n2 = 1;
1081 $n3 = 1;
1084 if ($current[2]) { $current[2] = $n2; }
1085 if ($current[3]) { $current[3] = $n3; }
1086 if ($current[4]) { $current[4] = 1; }
1088 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1091 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1092 printf "Building $filename\n";
1093 printf OUTPUT "/* Unicode collation element table */\n";
1094 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1095 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1096 print OUTPUT "#include \"windef.h\"\n\n";
1098 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1100 close OUTPUT;
1101 save_file($filename);
1105 ################################################################
1106 # dump an array of integers
1107 sub dump_array($$@)
1109 my ($bit_width, $default, @array) = @_;
1110 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1111 my $i;
1112 my $ret = " ";
1113 for ($i = 0; $i < $#array; $i++)
1115 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1116 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1118 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1119 return $ret;
1123 ################################################################
1124 # dump an SBCS mapping table in binary format
1125 sub dump_binary_sbcs_table($)
1127 my $codepage = shift;
1129 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1130 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1132 print OUTPUT pack "S<*", @header;
1133 print OUTPUT pack "C12", (0) x 12;
1134 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1136 if (@glyph2uni)
1138 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1140 else
1142 print OUTPUT pack "S<*", 0;
1145 print OUTPUT pack "S<*", 0, 0;
1147 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1151 ################################################################
1152 # dump a DBCS mapping table in binary format
1153 sub dump_binary_dbcs_table($)
1155 my $codepage = shift;
1156 my @lb_ranges = get_lb_ranges();
1157 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1159 my @offsets = (0) x 256;
1160 my $pos = 0;
1161 foreach my $i (@lead_bytes)
1163 $offsets[$i] = ($pos += 256);
1164 $cp2uni[$i] = 0;
1167 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1169 print OUTPUT pack "S<*", @header;
1170 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1171 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1172 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1174 foreach my $i (@lead_bytes)
1176 my $base = $i << 8;
1177 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1180 print OUTPUT pack "S<", 4;
1181 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1185 ################################################################
1186 # get the list of defined lead byte ranges
1187 sub get_lb_ranges()
1189 my @list = ();
1190 my @ranges = ();
1192 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1193 my $on = 0;
1194 for (my $i = 0; $i < 256; $i++)
1196 if ($on)
1198 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1200 else
1202 if ($list[$i]) { push @ranges, $i; $on = 1; }
1205 if ($on) { push @ranges, 0xff; }
1206 return @ranges;
1209 ################################################################
1210 # dump the Indic Syllabic Category table
1211 sub dump_indic($)
1213 my $filename = shift;
1214 my @indic_table;
1216 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1217 while (<$INPUT>)
1219 next if /^\#/; # skip comments
1220 next if /^\s*$/; # skip empty lines
1221 next if /\x1a/; # skip ^Z
1222 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1224 my $type = $2;
1225 die "unknown indic $type" unless defined $indic_types{$type};
1226 if (hex $1 < 65536)
1228 $indic_table[hex $1] = $indic_types{$type};
1230 next;
1232 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1234 my $type = $3;
1235 die "unknown indic $type" unless defined $indic_types{$type};
1236 if (hex $1 < 65536 and hex $2 < 65536)
1238 foreach my $i (hex $1 .. hex $2)
1240 $indic_table[$i] = $indic_types{$type};
1243 next;
1245 die "malformed line $_";
1247 close $INPUT;
1249 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1250 while (<$INPUT>)
1252 next if /^\#/; # skip comments
1253 next if /^\s*$/; # skip empty lines
1254 next if /\x1a/; # skip ^Z
1255 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1257 my $type = $2;
1258 die "unknown matra $type" unless defined $matra_types{$type};
1259 $indic_table[hex $1] |= $matra_types{$type} << 8;
1260 next;
1262 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1264 my $type = $3;
1265 die "unknown matra $type" unless defined $matra_types{$type};
1266 foreach my $i (hex $1 .. hex $2)
1268 $indic_table[$i] |= $matra_types{$type} << 8;
1270 next;
1272 die "malformed line $_";
1274 close $INPUT;
1276 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1277 print "Building $filename\n";
1278 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1279 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1280 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1281 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1282 print OUTPUT "#include \"windef.h\"\n\n";
1284 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1286 close OUTPUT;
1287 save_file($filename);
1290 ################################################################
1291 # dump the Line Break Properties table
1292 sub dump_linebreak($)
1294 my $filename = shift;
1295 my @break_table;
1297 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1298 while (<$INPUT>)
1300 next if /^\#/; # skip comments
1301 next if /^\s*$/; # skip empty lines
1302 next if /\x1a/; # skip ^Z
1303 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1305 my $type = $2;
1306 die "unknown breaktype $type" unless defined $break_types{$type};
1307 $break_table[hex $1] = $break_types{$type};
1308 next;
1310 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1312 my $type = $3;
1313 die "unknown breaktype $type" unless defined $break_types{$type};
1314 foreach my $i (hex $1 .. hex $2)
1316 $break_table[$i] = $break_types{$type};
1318 next;
1320 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1322 my $type = $2;
1323 die "unknown breaktype $type" unless defined $break_types{$type};
1324 $break_table[hex $1] = $break_types{$type};
1325 next;
1327 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1329 my $type = $3;
1330 die "unknown breaktype $type" unless defined $break_types{$type};
1331 foreach my $i (hex $1 .. hex $2)
1333 $break_table[$i] = $break_types{$type};
1335 next;
1337 die "malformed line $_";
1339 close $INPUT;
1341 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1342 print "Building $filename\n";
1343 print OUTPUT "/* Unicode Line Break Properties */\n";
1344 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1345 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1346 print OUTPUT "#include \"windef.h\"\n\n";
1348 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1350 close OUTPUT;
1351 save_file($filename);
1354 my %scripts =
1356 "Unknown" => 0,
1357 "Common" => 1,
1358 "Inherited" => 2,
1359 "Arabic" => 3,
1360 "Armenian" => 4,
1361 "Avestan" => 5,
1362 "Balinese" => 6,
1363 "Bamum" => 7,
1364 "Batak" => 8,
1365 "Bengali" => 9,
1366 "Bopomofo" => 10,
1367 "Brahmi" => 11,
1368 "Braille" => 12,
1369 "Buginese" => 13,
1370 "Buhid" => 14,
1371 "Canadian_Aboriginal" => 15,
1372 "Carian" => 16,
1373 "Cham" => 17,
1374 "Cherokee" => 18,
1375 "Coptic" => 19,
1376 "Cuneiform" => 20,
1377 "Cypriot" => 21,
1378 "Cyrillic" => 22,
1379 "Deseret" => 23,
1380 "Devanagari" => 24,
1381 "Egyptian_Hieroglyphs" => 25,
1382 "Ethiopic" => 26,
1383 "Georgian" => 27,
1384 "Glagolitic" => 28,
1385 "Gothic" => 29,
1386 "Greek" => 30,
1387 "Gujarati" => 31,
1388 "Gurmukhi" => 32,
1389 "Han" => 33,
1390 "Hangul" => 34,
1391 "Hanunoo" => 35,
1392 "Hebrew" => 36,
1393 "Hiragana" => 37,
1394 "Imperial_Aramaic" => 38,
1395 "Inscriptional_Pahlavi" => 39,
1396 "Inscriptional_Parthian" => 40,
1397 "Javanese" => 41,
1398 "Kaithi" => 42,
1399 "Kannada" => 43,
1400 "Katakana" => 44,
1401 "Kayah_Li" => 45,
1402 "Kharoshthi" => 46,
1403 "Khmer" => 47,
1404 "Lao" => 48,
1405 "Latin" => 49,
1406 "Lepcha" => 50,
1407 "Limbu" => 51,
1408 "Linear_B" => 52,
1409 "Lisu" => 53,
1410 "Lycian" => 54,
1411 "Lydian" => 55,
1412 "Malayalam" => 56,
1413 "Mandaic" => 57,
1414 "Meetei_Mayek" => 58,
1415 "Mongolian" => 59,
1416 "Myanmar" => 60,
1417 "New_Tai_Lue" => 61,
1418 "Nko" => 62,
1419 "Ogham" => 63,
1420 "Ol_Chiki" => 64,
1421 "Old_Italic" => 65,
1422 "Old_Persian" => 66,
1423 "Old_South_Arabian" => 67,
1424 "Old_Turkic" => 68,
1425 "Oriya" => 69,
1426 "Osmanya" => 70,
1427 "Phags_Pa" => 71,
1428 "Phoenician" => 72,
1429 "Rejang" => 73,
1430 "Runic" => 74,
1431 "Samaritan" => 75,
1432 "Saurashtra" => 76,
1433 "Shavian" => 77,
1434 "Sinhala" => 78,
1435 "Sundanese" => 79,
1436 "Syloti_Nagri" => 80,
1437 "Syriac" => 81,
1438 "Tagalog" => 82,
1439 "Tagbanwa" => 83,
1440 "Tai_Le" => 84,
1441 "Tai_Tham" => 85,
1442 "Tai_Viet" => 86,
1443 "Tamil" => 87,
1444 "Telugu" => 88,
1445 "Thaana" => 89,
1446 "Thai" => 90,
1447 "Tibetan" => 91,
1448 "Tifinagh" => 92,
1449 "Ugaritic" => 93,
1450 "Vai" => 94,
1451 "Yi" => 95,
1452 # Win8/Win8.1
1453 "Chakma" => 96,
1454 "Meroitic_Cursive" => 97,
1455 "Meroitic_Hieroglyphs" => 98,
1456 "Miao" => 99,
1457 "Sharada" => 100,
1458 "Sora_Sompeng" => 101,
1459 "Takri" => 102,
1460 # Win10
1461 "Bassa_Vah" => 103,
1462 "Caucasian_Albanian" => 104,
1463 "Duployan" => 105,
1464 "Elbasan" => 106,
1465 "Grantha" => 107,
1466 "Khojki" => 108,
1467 "Khudawadi" => 109,
1468 "Linear_A" => 110,
1469 "Mahajani" => 111,
1470 "Manichaean" => 112,
1471 "Mende_Kikakui" => 113,
1472 "Modi" => 114,
1473 "Mro" => 115,
1474 "Nabataean" => 116,
1475 "Old_North_Arabian" => 117,
1476 "Old_Permic" => 118,
1477 "Pahawh_Hmong" => 119,
1478 "Palmyrene" => 120,
1479 "Pau_Cin_Hau" => 121,
1480 "Psalter_Pahlavi" => 122,
1481 "Siddham" => 123,
1482 "Tirhuta" => 124,
1483 "Warang_Citi" => 125,
1484 # Win10 RS1
1485 "Adlam" => 126,
1486 "Ahom" => 127,
1487 "Anatolian_Hieroglyphs" => 128,
1488 "Bhaiksuki" => 129,
1489 "Hatran" => 130,
1490 "Marchen" => 131,
1491 "Multani" => 132,
1492 "Newa" => 133,
1493 "Old_Hungarian" => 134,
1494 "Osage" => 135,
1495 "SignWriting" => 136,
1496 "Tangut" => 137,
1497 # Win10 RS4
1498 "Masaram_Gondi" => 138,
1499 "Nushu" => 139,
1500 "Soyombo" => 140,
1501 "Zanabazar_Square" => 141,
1502 # Win10 1903
1503 "Dogra" => 142,
1504 "Gunjala_Gondi" => 143,
1505 "Hanifi_Rohingya" => 144,
1506 "Makasar" => 145,
1507 "Medefaidrin" => 146,
1508 "Old_Sogdian" => 147,
1509 "Sogdian" => 148,
1510 # Win10 2004
1511 "Elymaic" => 149,
1512 "Nyiakeng_Puachue_Hmong" => 150,
1513 "Nandinagari" => 151,
1514 "Wancho" => 152,
1517 ################################################################
1518 # dump Script IDs table
1519 sub dump_scripts($)
1521 my $filename = shift;
1522 my $header = $filename;
1523 my @scripts_table;
1524 my $script_index;
1525 my $i;
1527 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1528 # Fill the table
1529 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1530 while (<$INPUT>)
1532 my $type = "";
1534 next if /^\#/; # skip comments
1535 next if /^\s*$/; # skip empty lines
1536 next if /\x1a/; # skip ^Z
1537 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1539 $type = $2;
1540 if (defined $scripts{$type})
1542 $scripts_table[hex $1] = $scripts{$type};
1544 next;
1546 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1548 $type = $3;
1549 if (defined $scripts{$type})
1551 foreach my $i (hex $1 .. hex $2)
1553 $scripts_table[$i] = $scripts{$type};
1556 next;
1560 close $INPUT;
1562 $header = "$filename.h";
1563 open OUTPUT,">$header.new" or die "Cannot create $header";
1564 print "Building $header\n";
1565 print OUTPUT "/* Unicode Script IDs */\n";
1566 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1567 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1569 print OUTPUT "enum unicode_script_id {\n";
1570 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1572 print OUTPUT " Script_$script = $scripts{$script},\n";
1574 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1575 print OUTPUT "};\n";
1577 close OUTPUT;
1578 save_file($header);
1580 $filename = "$filename.c";
1581 open OUTPUT,">$filename.new" or die "Cannot create $header";
1582 print "Building $filename\n";
1583 print OUTPUT "/* Unicode Script IDs */\n";
1584 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1585 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1586 print OUTPUT "#include \"windef.h\"\n\n";
1588 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1589 close OUTPUT;
1590 save_file($filename);
1593 ################################################################
1594 # dump the BiDi mirroring table
1595 sub dump_mirroring($)
1597 my $filename = shift;
1598 my @mirror_table = ();
1600 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1601 while (<$INPUT>)
1603 next if /^\#/; # skip comments
1604 next if /^$/; # skip empty lines
1605 next if /\x1a/; # skip ^Z
1606 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1608 $mirror_table[hex $1] = hex $2;
1609 next;
1611 die "malformed line $_";
1613 close $INPUT;
1615 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1616 print "Building $filename\n";
1617 print OUTPUT "/* Unicode BiDi mirroring */\n";
1618 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1619 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1620 print OUTPUT "#include \"windef.h\"\n\n";
1621 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1622 close OUTPUT;
1623 save_file($filename);
1626 ################################################################
1627 # dump the Bidi Brackets
1628 sub dump_bracket($)
1630 my $filename = shift;
1631 my @bracket_table;
1633 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1634 while (<$INPUT>)
1636 next if /^\#/; # skip comments
1637 next if /^\s*$/; # skip empty lines
1638 next if /\x1a/; # skip ^Z
1639 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1641 my $type = $3;
1642 die "unknown bracket $type" unless defined $bracket_types{$type};
1643 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1644 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1645 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1646 next;
1648 die "malformed line $_";
1650 close $INPUT;
1652 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1653 print "Building $filename\n";
1654 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1655 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1656 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1657 print OUTPUT "#include \"windef.h\"\n\n";
1659 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1661 close OUTPUT;
1662 save_file($filename);
1665 ################################################################
1666 # dump the Arabic shaping table
1667 sub dump_shaping($)
1669 my $filename = shift;
1670 my @joining_table = @initial_joining_table;
1672 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1673 while (<$INPUT>)
1675 next if /^\#/; # skip comments
1676 next if /^\s*$/; # skip empty lines
1677 next if /\x1a/; # skip ^Z
1678 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1680 my $type = $2;
1681 $joining_table[hex $1] = $joining_types{$type};
1682 next;
1684 die "malformed line $_";
1686 close $INPUT;
1688 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1689 print "Building $filename\n";
1690 print OUTPUT "/* Unicode Arabic shaping */\n";
1691 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1692 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1693 print OUTPUT "#include \"windef.h\"\n\n";
1695 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1697 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1698 for (my $i = 0x600; $i <= 0x6ff; $i++)
1700 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1701 ${joining_forms{"isolated"}}[$i] || $i,
1702 ${joining_forms{"final"}}[$i] || $i,
1703 ${joining_forms{"initial"}}[$i] || $i,
1704 ${joining_forms{"medial"}}[$i] || $i;
1706 print OUTPUT "};\n";
1708 close OUTPUT;
1709 save_file($filename);
1712 ################################################################
1713 # dump the Arabic shaping table
1714 sub dump_arabic_shaping($)
1716 my $filename = shift;
1717 my @joining_table = @initial_joining_table;
1719 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1720 while (<$INPUT>)
1722 next if /^\#/; # skip comments
1723 next if /^\s*$/; # skip empty lines
1724 next if /\x1a/; # skip ^Z
1725 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1727 my $type = $2;
1728 my $group = $3;
1730 if ($group eq "ALAPH" || $group eq "DALATH RISH")
1732 $joining_table[hex $1] = $joining_types{$group};
1734 else
1736 $joining_table[hex $1] = $joining_types{$type};
1739 next;
1741 die "malformed line $_";
1743 close $INPUT;
1745 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1746 print "Building $filename\n";
1747 print OUTPUT "/* Unicode Arabic shaping */\n";
1748 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1749 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1750 print OUTPUT "#include \"windef.h\"\n\n";
1752 dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );
1754 close OUTPUT;
1755 save_file($filename);
1758 ################################################################
1759 # dump the Vertical Orientation table
1760 sub dump_vertical($)
1762 my $filename = shift;
1763 my @vertical_table;
1765 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1766 while (<$INPUT>)
1768 next if /^\#/; # skip comments
1769 next if /^\s*$/; # skip empty lines
1770 next if /\x1a/; # skip ^Z
1771 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1773 my $type = $2;
1774 die "unknown vertical $type" unless defined $vertical_types{$type};
1775 if (hex $1 < 65536)
1777 $vertical_table[hex $1] = $vertical_types{$type};
1779 next;
1781 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1783 my $type = $3;
1784 die "unknown vertical $type" unless defined $vertical_types{$type};
1785 foreach my $i (hex $1 .. hex $2)
1787 $vertical_table[$i] = $vertical_types{$type};
1789 next;
1791 die "malformed line $_";
1793 close $INPUT;
1795 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1796 print "Building $filename\n";
1797 print OUTPUT "/* Unicode Vertical Orientation */\n";
1798 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1799 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1800 print OUTPUT "#include \"windef.h\"\n\n";
1802 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1804 close OUTPUT;
1805 save_file($filename);
1808 ################################################################
1809 # dump the digit folding tables
1810 sub dump_digit_folding($)
1812 my ($filename) = shift;
1813 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1814 print "Building $filename\n";
1815 print OUTPUT "/* Unicode digit folding mappings */\n";
1816 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1817 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1818 print OUTPUT "#include \"windef.h\"\n\n";
1820 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1821 close OUTPUT;
1822 save_file($filename);
1826 ################################################################
1827 # compress a mapping table by removing identical rows
1828 sub compress_array($$@)
1830 my $rows = shift;
1831 my $def = shift;
1832 my @table = @_;
1833 my $len = @table / $rows;
1834 my @array;
1835 my $data = "";
1837 # try to merge table rows
1838 for (my $row = 0; $row < $rows; $row++)
1840 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1841 my $pos = index $data, $rowtxt;
1842 if ($pos == -1)
1844 # check if the tail of the data can match the start of the new row
1845 my $first = substr( $rowtxt, 0, 1 );
1846 for (my $i = length($data) - 1; $i > 0; $i--)
1848 $pos = index( substr( $data, -$i ), $first );
1849 last if $pos == -1;
1850 $i -= $pos;
1851 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1852 substr( $data, -$i ) = "";
1853 last;
1855 $pos = length $data;
1856 $data .= $rowtxt;
1858 $array[$row] = $rows + $pos;
1860 return @array, unpack "U*", $data;
1863 ################################################################
1864 # dump a char -> 16-bit value mapping table using two-level tables
1865 sub dump_two_level_mapping($$@)
1867 my $name = shift;
1868 my $def = shift;
1869 my $size = shift;
1870 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1871 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1872 my @array = compress_array( 256, 0, @row_array[0..4095] );
1874 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1876 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1877 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1878 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1879 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1882 ################################################################
1883 # dump a char -> value mapping table using three-level tables
1884 sub dump_three_level_mapping($$@)
1886 my $name = shift;
1887 my $def = shift;
1888 my $size = shift;
1889 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1890 my $level3 = ($MAX_CHAR + 1) / 16;
1891 my $level2 = $level3 / 16;
1892 my $level1 = $level2 / 16;
1893 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1894 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1895 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1897 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1898 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1900 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1901 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1902 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1903 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1904 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1907 ################################################################
1908 # dump a binary case mapping table in l_intl.nls format
1909 sub dump_binary_case_table(@)
1911 my (@table) = @_;
1912 my $max_char = 0x10000;
1913 my $level1 = $max_char / 16;
1914 my $level2 = $level1 / 16;
1916 my @difftable;
1917 for (my $i = 0; $i < @table; $i++)
1919 next unless defined $table[$i];
1920 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1923 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1924 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1925 my $offset = @array - $level1;
1926 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1927 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1930 ################################################################
1931 # dump case mappings for l_intl.nls
1932 sub dump_intl_nls($)
1934 my @upper_table = @toupper_table;
1935 my @lower_table = @tolower_table;
1936 remove_linguistic_mappings( \@upper_table, \@lower_table );
1938 my $upper = dump_binary_case_table( @upper_table );
1939 my $lower = dump_binary_case_table( @lower_table );
1941 my $filename = shift;
1942 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1943 printf "Building $filename\n";
1945 binmode OUTPUT;
1946 print OUTPUT pack "S<", 1; # version
1947 print OUTPUT $upper;
1948 print OUTPUT $lower;
1949 close OUTPUT;
1950 save_file($filename);
1954 ################################################################
1955 # dump the bidi direction table
1956 sub dump_bidi_dir_table($)
1958 my $filename = shift;
1959 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1960 printf "Building $filename\n";
1961 printf OUTPUT "/* Unicode BiDi direction table */\n";
1962 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1963 printf OUTPUT "#include \"windef.h\"\n\n";
1965 my @table;
1967 for (my $i = 0; $i < 65536; $i++)
1969 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1972 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1974 close OUTPUT;
1975 save_file($filename);
1979 sub rol($$)
1981 my ($byte, $count) = @_;
1982 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1985 ################################################################
1986 # compress the character properties table
1987 sub compress_char_props_table($@)
1989 my $rows = shift;
1990 my @table = @_;
1991 my $len = @table / $rows;
1992 my $pos = 0;
1993 my @array = (0) x $rows;
1994 my %sequences;
1996 # add some predefined sequences
1997 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1999 # try to merge table rows
2000 for (my $row = 0; $row < $rows; $row++)
2002 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
2003 my $rowtxt = pack "L*", @table_row;
2004 if (defined($sequences{$rowtxt}))
2006 # reuse an existing row
2007 $array[$row] = $sequences{$rowtxt};
2009 else
2011 # create a new row
2012 $sequences{$rowtxt} = $array[$row] = ++$pos;
2013 push @array, @table_row;
2016 return @array;
2019 ################################################################
2020 # dump a normalization table in binary format
2021 sub dump_norm_table($)
2023 my $filename = shift;
2025 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2026 my %decomp = ( "nfc" => \@decomp_table,
2027 "nfd" => \@decomp_table,
2028 "nfkc" => \@decomp_compat_table,
2029 "nfkd" => \@decomp_compat_table ,
2030 "idna" => \@idna_decomp_table );
2032 open OUTPUT,">$filename.new" or die "Cannot create $filename";
2033 print "Building $filename\n";
2035 my $type = $filename;
2036 $type =~ s!.*/norm(\w+)\.nls!$1!;
2038 my $compose = $forms{$type} & 1;
2039 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2041 my @version = split /\./, $UNIVERSION;
2043 # combining classes
2045 my @classes;
2046 my @class_values;
2048 foreach my $c (grep defined, @combining_class_table)
2050 $classes[$c] = 1 if $c < 0x100;
2052 for (my $i = 0; $i < @classes; $i++)
2054 next unless defined $classes[$i];
2055 $classes[$i] = @class_values;
2056 push @class_values, $i;
2058 push @class_values, 0 if (@class_values % 2);
2059 die "too many classes" if @class_values >= 0x40;
2061 # character properties
2063 my @char_props;
2064 my @decomposed;
2065 my @comp_hash_table;
2066 my $comp_hash_size = $compose ? 254 : 0;
2068 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2070 next unless defined $combining_class_table[$i];
2071 if (defined $decomp{$type}->[$i])
2073 my @dec = get_decomposition( $i, $decomp{$type} );
2074 if ($compose && (my @comp = get_composition( $i, $compat )))
2076 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2077 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2079 my $val = 0;
2080 foreach my $d (@dec)
2082 $val = $combining_class_table[$d];
2083 last if $val;
2085 $char_props[$i] = $classes[$val];
2087 else
2089 $char_props[$i] = 0xbf;
2091 @dec = compose_hangul( @dec ) if $compose;
2092 @dec = to_utf16( @dec );
2093 push @dec, 0 if @dec >= 7;
2094 $decomposed[$i] = \@dec;
2096 else
2098 if ($combining_class_table[$i] == 0x100)
2100 $char_props[$i] = 0x7f;
2102 elsif ($combining_class_table[$i])
2104 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2106 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2108 $char_props[$i] = 0xff;
2110 else
2112 $char_props[$i] = 0;
2117 if ($compose)
2119 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2121 my @comp = get_composition( $i, $compat );
2122 next unless @comp;
2123 if ($combining_class_table[$comp[1]])
2125 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2126 $char_props[$comp[1]] |= 0x40;
2128 else
2130 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2131 $char_props[$comp[1]] |= 0xc0;
2136 # surrogates
2137 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2138 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2140 # Hangul
2141 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2142 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2143 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2145 # invalid chars
2146 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2147 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2148 foreach my $i (0x00..0x10)
2150 $char_props[($i << 16) | 0xfffe] = 0xff;
2151 $char_props[($i << 16) | 0xffff] = 0xff;
2154 # decomposition hash table
2156 my @decomp_hash_table;
2157 my @decomp_hash_index;
2158 my @decomp_hash_data;
2159 my $decomp_hash_size = 944;
2161 # build string of character data, reusing substrings when possible
2162 my $decomp_char_data = "";
2163 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2165 my $str = pack "U*", @{$i};
2166 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2168 for (my $i = 0; $i < @decomposed; $i++)
2170 next unless defined $decomposed[$i];
2171 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2172 die "sequence not found" if $pos == -1;
2173 my $len = @{$decomposed[$i]};
2174 $len = 7 if $len > 7;
2175 my $hash = $i % $decomp_hash_size;
2176 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2178 for (my $i = 0; $i < $decomp_hash_size; $i++)
2180 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2181 next unless defined $decomp_hash_table[$i];
2182 if (@{$decomp_hash_table[$i]} == 1)
2184 my $entry = $decomp_hash_table[$i]->[0];
2185 if ($char_props[$entry->[0]] == 0xbf)
2187 $decomp_hash_index[$i] = $entry->[1];
2188 next;
2191 foreach my $entry (@{$decomp_hash_table[$i]})
2193 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2196 push @decomp_hash_data, 0, 0;
2198 # composition hash table
2200 my @comp_hash_index;
2201 my @comp_hash_data;
2202 if (@comp_hash_table)
2204 for (my $i = 0; $i < $comp_hash_size; $i++)
2206 $comp_hash_index[$i] = @comp_hash_data;
2207 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2209 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2210 push @comp_hash_data, 0, 0, 0;
2213 my $level1 = ($MAX_CHAR + 1) / 128;
2214 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2216 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2217 0, $decomp_hash_size, $comp_hash_size, 0 );
2218 my @tables = (0) x 8;
2220 $tables[0] = 16 + @header + @tables;
2221 $tables[1] = $tables[0] + @class_values / 2;
2222 $tables[2] = $tables[1] + $level1 / 2;
2223 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2224 $tables[4] = $tables[3] + @decomp_hash_index;
2225 $tables[5] = $tables[4] + @decomp_hash_data;
2226 $tables[6] = $tables[5] + length $decomp_char_data;
2227 $tables[7] = $tables[6] + @comp_hash_index;
2229 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2230 print OUTPUT pack "S<*", @header;
2231 print OUTPUT pack "S<*", @tables;
2232 print OUTPUT pack "C*", @class_values;
2234 print OUTPUT pack "C*", @rows[0..$level1-1];
2235 print OUTPUT pack "C*", @rows[$level1..$#rows];
2236 print OUTPUT pack "S<*", @decomp_hash_index;
2237 print OUTPUT pack "S<*", @decomp_hash_data;
2238 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2239 print OUTPUT pack "S<*", @comp_hash_index;
2240 print OUTPUT pack "S<*", @comp_hash_data;
2242 close OUTPUT;
2243 save_file($filename);
2245 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2249 ################################################################
2250 # output a codepage definition file from the global tables
2251 sub output_codepage_file($)
2253 my $codepage = shift;
2255 my $output = sprintf "nls/c_%03d.nls", $codepage;
2256 open OUTPUT,">$output.new" or die "Cannot create $output";
2258 printf "Building %s\n", $output;
2259 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2260 else { dump_binary_dbcs_table( $codepage ); }
2262 close OUTPUT;
2263 save_file($output);
2265 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2268 ################################################################
2269 # output a codepage table from a Microsoft-style mapping file
2270 sub dump_msdata_codepage($)
2272 my $filename = shift;
2274 my $state = "";
2275 my ($codepage, $width, $count);
2276 my ($lb_cur, $lb_end);
2278 @cp2uni = ();
2279 @glyph2uni = ();
2280 @lead_bytes = ();
2281 @uni2cp = ();
2282 $default_char = $DEF_CHAR;
2283 $default_wchar = $DEF_CHAR;
2285 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2287 while (<$INPUT>)
2289 next if /^;/; # skip comments
2290 next if /^\s*$/; # skip empty lines
2291 next if /\x1a/; # skip ^Z
2292 last if /^ENDCODEPAGE/;
2294 if (/^CODEPAGE\s+(\d+)/)
2296 $codepage = $1;
2297 next;
2299 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2301 $width = $1;
2302 $default_char = hex $2;
2303 $default_wchar = hex $3;
2304 next;
2306 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2308 $state = $1;
2309 $count = $2;
2310 next;
2312 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2314 if ($state eq "MBTABLE")
2316 my $cp = hex $1;
2317 my $uni = hex $2;
2318 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2319 next;
2321 if ($state eq "GLYPHTABLE")
2323 my $cp = hex $1;
2324 my $uni = hex $2;
2325 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2326 next;
2328 if ($state eq "WCTABLE")
2330 my $uni = hex $1;
2331 my $cp = hex $2;
2332 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2333 next;
2335 if ($state eq "DBCSRANGE")
2337 my $start = hex $1;
2338 my $end = hex $2;
2339 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2340 $lb_cur = $start;
2341 $lb_end = $end;
2342 next;
2344 if ($state eq "DBCSTABLE")
2346 my $mb = hex $1;
2347 my $uni = hex $2;
2348 my $cp = ($lb_cur << 8) | $mb;
2349 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2350 if (!--$count)
2352 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2354 next;
2357 die "$filename: Unrecognized line $_\n";
2359 close $INPUT;
2361 output_codepage_file( $codepage );
2363 if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
2366 ################################################################
2367 # align a string length
2368 sub align_string($$)
2370 my ($align, $str) = @_;
2371 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2372 return $str;
2375 ################################################################
2376 # pack a GUID string
2377 sub pack_guid($)
2379 $_ = shift;
2380 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2381 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2384 ################################################################
2385 # comparison function for compression sort
2386 sub cmp_compression
2388 return scalar @{$a} <=> scalar @{$b} ||
2389 $a->[4] <=> $b->[4] ||
2390 $a->[5] <=> $b->[5] ||
2391 $a->[6] <=> $b->[6] ||
2392 $a->[7] <=> $b->[7] ||
2393 $a->[8] <=> $b->[8] ||
2394 $a->[9] <=> $b->[9] ||
2395 $a->[10] <=> $b->[10] ||
2396 $a->[11] <=> $b->[11] ||
2397 $a->[12] <=> $b->[12];
2400 ################################################################
2401 # build a binary sort keys table
2402 sub dump_sortkey_table($$)
2404 my ($filename, $download) = @_;
2406 my @keys;
2407 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2408 my @multiple_weights;
2409 my @expansions;
2410 my @compressions;
2411 my %exceptions;
2412 my %guids;
2413 my %compr_flags;
2414 my %locales;
2415 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2416 my $jamostr = "";
2418 my $re_hex = '0x[0-9A-Fa-f]+';
2419 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2420 $guids{$default_guid} = { };
2422 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2424 my $KEYS = open_data_file( $MSDATA, $download );
2426 printf "Building $filename\n";
2428 while (<$KEYS>)
2430 s/\s*;.*$//;
2431 next if /^\s*$/; # skip empty lines
2432 if (/^\s*(SORTKEY|SORTTABLES)/)
2434 $part = $1;
2435 next;
2437 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2439 $part = $section = "";
2440 next;
2442 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2444 $section = $1;
2445 $guid = undef;
2446 next;
2448 next unless $part;
2449 if ("$part.$section" eq "SORTKEY.DEFAULT")
2451 if (/^\s*($re_hex)\s+$re_key/)
2453 $keys[hex $1] = [ split(/\s+/,$2) ];
2454 next;
2457 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2459 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2461 $version = hex $1;
2462 next;
2464 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2466 # ignore for now
2467 next;
2470 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2471 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2472 "$part.$section" eq "SORTTABLES.INVERSECASING")
2474 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2476 $guid = lc $1;
2477 $guids{$guid} = { } unless defined $guids{$guid};
2478 $guids{$guid}->{flags} |= $flags{$section};
2479 next;
2481 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2483 $locales{$1} = $guid;
2484 next;
2487 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2489 if (/^\s*(\d+)\s+(\d+)/)
2491 push @multiple_weights, $1, $2;
2492 next;
2495 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2497 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2499 my $pos = scalar @expansions / 2;
2500 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2501 push @expansions, hex $2, hex $3;
2502 next;
2505 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2507 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2509 $keys[hex $1] = $keys[hex $2];
2510 next;
2513 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2515 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2517 if ($subsection || !$guid) # start a new one
2519 $guid = lc $1;
2520 $subsection = "";
2521 $guids{$guid} = { } unless defined $guids{$guid};
2522 $guids{$guid}->{flags} |= $flags{$2} if $2;
2523 $guids{$guid}->{compr} = @compressions;
2524 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2525 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2526 push @compressions, [ ];
2528 else # merge with current one
2530 $guids{lc $1} = { } unless defined $guids{lc $1};
2531 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2532 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2533 $compr_flags{lc $1} = $compr_flags{$guid};
2535 next;
2537 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2539 $locales{$1} = $guid;
2540 next;
2542 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2544 $subsection = $1;
2545 next;
2547 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2549 my @comp = map { hex $_; } split(/\s+/,$1);
2550 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2551 # add compression flags
2552 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2553 next;
2556 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2558 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2560 $guid = lc $1;
2561 $guids{$guid} = { } unless defined $guids{lc $1};
2562 $ling_flag = ($2 ? "+" : "-");
2563 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2564 next;
2566 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2568 $locales{$1} = $guid;
2569 next;
2571 if (/^\s*($re_hex)\s+$re_key/)
2573 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2574 next;
2577 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2579 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2581 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2582 next;
2585 die "$download: $part.$section: unrecognized line $_\n";
2587 close $KEYS;
2589 # Sortkey table
2591 my $table;
2592 for (my $i = 0; $i < 0x10000; $i++)
2594 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2595 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2598 foreach my $id (sort keys %exceptions)
2600 my $pos = length($table) / 4;
2601 my @exc = @{$exceptions{$id}};
2602 my @filled;
2603 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2604 my $guid = substr( $id, 0, -1 );
2605 $guids{$guid}->{$key} = $pos;
2606 $pos += 0x100;
2607 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2608 for (my $j = 0; $j < 0x10000; $j++)
2610 next unless defined $exc[$j] || defined $flags[$j];
2611 $filled[$j >> 8] = 1;
2612 $j |= 0xff;
2614 for (my $j = 0; $j < 0x100; $j++)
2616 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2617 $pos += 0x100 if $filled[$j];
2619 for (my $j = 0; $j < 0x10000; $j++)
2621 next unless $filled[$j >> 8];
2622 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2623 $k[3] |= $flags[$j] || 0;
2624 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2628 # Case mapping tables
2630 # standard table
2631 my @casemaps;
2632 my @upper = @toupper_table;
2633 my @lower = @tolower_table;
2634 remove_linguistic_mappings( \@upper, \@lower );
2635 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2637 # linguistic table
2638 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2640 # Turkish table
2641 @upper = @toupper_table;
2642 @lower = @tolower_table;
2643 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2644 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2645 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2646 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2648 # Char type table
2650 my @table;
2651 my $types = "";
2652 my %typestr;
2653 for (my $i = 0; $i < 0x10000; $i++)
2655 my $str = pack "S<3",
2656 ($category_table[$i] || 0) & 0xffff,
2657 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2658 ($category_table[$i] || 0) >> 16;
2660 if (!defined($typestr{$str}))
2662 $typestr{$str} = length($types) / 6;
2663 $types .= $str;
2665 $table[$i] = $typestr{$str};
2668 my @rows = compress_array( 4096, 0, @table[0..65535] );
2669 my @array = compress_array( 256, 0, @rows[0..4095] );
2670 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2671 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2673 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2674 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2675 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2677 # Sort tables
2679 # guids
2680 my $sorttables = pack "L<2", $version, scalar %guids;
2681 foreach my $id (sort keys %guids)
2683 my %guid = %{$guids{$id}};
2684 my $flags = $guid{flags} || 0;
2685 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2686 $sorttables .= pack_guid($id) . pack "L<5",
2687 $flags,
2688 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2689 $guid{except} || 0,
2690 $guid{ling_except} || 0,
2691 $map / 2;
2694 # expansions
2695 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2697 # compressions
2698 $sorttables .= pack "L<", scalar @compressions;
2699 my $rowstr = "";
2700 foreach my $c (@compressions)
2702 my $pos = length($rowstr) / 2;
2703 my $min = 0xffff;
2704 my $max = 0;
2705 my @lengths = (0) x 8;
2706 foreach my $r (sort cmp_compression @{$c})
2708 my @row = @{$r};
2709 $lengths[scalar @row - 6]++;
2710 foreach my $val (@row[4..$#row])
2712 $min = $val if $min > $val;
2713 $max = $val if $max < $val;
2715 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2716 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2718 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2720 $sorttables .= $rowstr;
2722 # multiple weights
2723 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2725 # jamo sort
2726 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2728 # Locales
2730 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2731 foreach my $loc (sort keys %locales)
2733 # skip specific locales that match more general ones
2734 my @parts = split /[-_]/, $loc;
2735 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2736 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2737 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2740 # File header
2742 my @header;
2743 $header[0] = 16;
2744 $header[1] = $header[0] + length $table;
2745 $header[2] = $header[1] + length $casemaps;
2746 $header[3] = $header[2] + length $chartypes;
2748 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2749 print OUTPUT pack "L<*", @header;
2750 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2751 close OUTPUT;
2752 save_file($filename);
2756 ################################################################
2757 # build the script to create registry keys
2758 sub dump_registry_script($%)
2760 my ($filename, %keys) = @_;
2761 my $indent = 1;
2763 printf "Building %s\n", $filename;
2764 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2765 print OUTPUT "HKLM\n{\n";
2766 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2768 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2769 $indent++;
2771 foreach my $k (sort keys %keys)
2773 my @subkeys = split /\\/, $k;
2774 my ($def, @vals) = @{$keys{$k}};
2775 for (my $i = 0; $i < @subkeys; $i++)
2777 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2778 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2779 $indent++;
2781 foreach my $v (sort @vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2782 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2784 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2785 close OUTPUT;
2786 save_file($filename);
2790 ################################################################
2791 # save a file if modified
2792 sub save_file($)
2794 my $file = shift;
2795 if (-f $file && !system "cmp $file $file.new >/dev/null")
2797 unlink "$file.new";
2799 else
2801 rename "$file.new", "$file";
2806 ################################################################
2807 # main routine
2809 chdir ".." if -f "./make_unicode";
2810 load_data();
2811 dump_sortkeys( "dlls/kernelbase/collation.c" );
2812 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2813 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2814 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2815 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2816 dump_mirroring( "dlls/dwrite/mirror.c" );
2817 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2818 dump_bracket( "dlls/dwrite/bracket.c" );
2819 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2820 dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
2821 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2822 dump_linebreak( "dlls/dwrite/linebreak.c" );
2823 dump_scripts( "dlls/dwrite/scripts" );
2824 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2825 dump_vertical( "dlls/gdi32/vertical.c" );
2826 dump_vertical( "dlls/wineps.drv/vertical.c" );
2827 dump_intl_nls("nls/l_intl.nls");
2828 dump_norm_table( "nls/normnfc.nls" );
2829 dump_norm_table( "nls/normnfd.nls" );
2830 dump_norm_table( "nls/normnfkc.nls" );
2831 dump_norm_table( "nls/normnfkd.nls" );
2832 dump_norm_table( "nls/normidna.nls" );
2833 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2834 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2835 dump_eucjp_codepage();
2836 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2838 exit 0;
2840 # Local Variables:
2841 # compile-command: "./make_unicode"
2842 # End: