windowscodecs: Add stub for DdsFrameDecode.
[wine.git] / tools / make_unicode
blobec5d8eafb43567f6f6074f67e9439f1c6b0a2e7a
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "13.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $REPORTS = "http://www.unicode.org/reports";
30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
33 # Sort keys file
34 my $SORTKEYS = "tr10/allkeys.txt";
36 # Default char for undefined mappings
37 my $DEF_CHAR = ord '?';
39 # Last valid Unicode character
40 my $MAX_CHAR = 0x10ffff;
42 my @allfiles =
44 "CodpageFiles/037.txt",
45 "CodpageFiles/437.txt",
46 "CodpageFiles/500.txt",
47 "CodpageFiles/737.txt",
48 "CodpageFiles/775.txt",
49 "CodpageFiles/850.txt",
50 "CodpageFiles/852.txt",
51 "CodpageFiles/855.txt",
52 "CodpageFiles/857.txt",
53 "CodpageFiles/860.txt",
54 "CodpageFiles/861.txt",
55 "CodpageFiles/862.txt",
56 "CodpageFiles/863.txt",
57 "CodpageFiles/864.txt",
58 "CodpageFiles/865.txt",
59 "CodpageFiles/866.txt",
60 "CodpageFiles/869.txt",
61 "CodpageFiles/874.txt",
62 "CodpageFiles/875.txt",
63 "CodpageFiles/932.txt",
64 "CodpageFiles/936.txt",
65 "CodpageFiles/949.txt",
66 "CodpageFiles/950.txt",
67 "CodpageFiles/1026.txt",
68 "CodpageFiles/1250.txt",
69 "CodpageFiles/1251.txt",
70 "CodpageFiles/1252.txt",
71 "CodpageFiles/1253.txt",
72 "CodpageFiles/1254.txt",
73 "CodpageFiles/1255.txt",
74 "CodpageFiles/1256.txt",
75 "CodpageFiles/1257.txt",
76 "CodpageFiles/1258.txt",
77 "CodpageFiles/1361.txt",
78 "CodpageFiles/10000.txt",
79 "CodpageFiles/10001.txt",
80 "CodpageFiles/10002.txt",
81 "CodpageFiles/10003.txt",
82 "CodpageFiles/10004.txt",
83 "CodpageFiles/10005.txt",
84 "CodpageFiles/10006.txt",
85 "CodpageFiles/10007.txt",
86 "CodpageFiles/10008.txt",
87 "CodpageFiles/10010.txt",
88 "CodpageFiles/10017.txt",
89 "CodpageFiles/10021.txt",
90 "CodpageFiles/10029.txt",
91 "CodpageFiles/10079.txt",
92 "CodpageFiles/10081.txt",
93 "CodpageFiles/10082.txt",
94 "CodpageFiles/20127.txt",
95 "CodpageFiles/20866.txt",
96 "CodpageFiles/21866.txt",
97 "CodpageFiles/28591.txt",
98 "CodpageFiles/28592.txt",
99 "CodpageFiles/28593.txt",
100 "CodpageFiles/28594.txt",
101 "CodpageFiles/28595.txt",
102 "CodpageFiles/28596.txt",
103 "CodpageFiles/28597.txt",
104 "CodpageFiles/28598.txt",
105 "CodpageFiles/28599.txt",
106 "CodpageFiles/28603.txt",
107 "CodpageFiles/28605.txt",
111 my %ctype =
113 # CT_CTYPE1
114 "upper" => 0x0001,
115 "lower" => 0x0002,
116 "digit" => 0x0004,
117 "space" => 0x0008,
118 "punct" => 0x0010,
119 "cntrl" => 0x0020,
120 "blank" => 0x0040,
121 "xdigit" => 0x0080,
122 "alpha" => 0x0100 | 0x80000000,
123 "defin" => 0x0200,
124 # CT_CTYPE3 in high 16 bits
125 "nonspacing" => 0x00010000,
126 "diacritic" => 0x00020000,
127 "vowelmark" => 0x00040000,
128 "symbol" => 0x00080000,
129 "katakana" => 0x00100000,
130 "hiragana" => 0x00200000,
131 "halfwidth" => 0x00400000,
132 "fullwidth" => 0x00800000,
133 "ideograph" => 0x01000000,
134 "kashida" => 0x02000000,
135 "lexical" => 0x04000000,
136 "highsurrogate" => 0x08000000,
137 "lowsurrogate" => 0x10000000,
140 my %bracket_types =
142 "o" => 0x0000,
143 "c" => 0x0001,
146 my %indic_types =
148 "Other" => 0x0000,
149 "Bindu" => 0x0001,
150 "Visarga" => 0x0002,
151 "Avagraha" => 0x0003,
152 "Nukta" => 0x0004,
153 "Virama" => 0x0005,
154 "Vowel_Independent" => 0x0006,
155 "Vowel_Dependent" => 0x0007,
156 "Vowel" => 0x0008,
157 "Consonant_Placeholder" => 0x0009,
158 "Consonant" => 0x000a,
159 "Consonant_Dead" => 0x000b,
160 "Consonant_Succeeding_Repha" => 0x000c,
161 "Consonant_Subjoined" => 0x000d,
162 "Consonant_Medial" => 0x000e,
163 "Consonant_Final" => 0x000f,
164 "Consonant_Head_Letter" => 0x0010,
165 "Modifying_Letter" => 0x0011,
166 "Tone_Letter" => 0x0012,
167 "Tone_Mark" => 0x0013,
168 "Register_Shifter" => 0x0014,
169 "Consonant_Preceding_Repha" => 0x0015,
170 "Pure_Killer" => 0x0016,
171 "Invisible_Stacker" => 0x0017,
172 "Gemination_Mark" => 0x0018,
173 "Cantillation_Mark" => 0x0019,
174 "Non_Joiner" => 0x001a,
175 "Joiner" => 0x001b,
176 "Number_Joiner" => 0x001c,
177 "Number" => 0x001d,
178 "Brahmi_Joining_Number" => 0x001e,
179 "Consonant_With_Stacker" => 0x001f,
180 "Consonant_Prefixed" => 0x0020,
181 "Syllable_Modifier" => 0x0021,
182 "Consonant_Killer" => 0x0022,
183 "Consonant_Initial_Postfixed" => 0x0023,
186 my %matra_types =
188 "Right" => 0x01,
189 "Left" => 0x02,
190 "Visual_Order_Left" => 0x03,
191 "Left_And_Right" => 0x04,
192 "Top" => 0x05,
193 "Bottom" => 0x06,
194 "Top_And_Bottom" => 0x07,
195 "Top_And_Right" => 0x08,
196 "Top_And_Left" => 0x09,
197 "Top_And_Left_And_Right" => 0x0a,
198 "Bottom_And_Right" => 0x0b,
199 "Top_And_Bottom_And_Right" => 0x0c,
200 "Overstruck" => 0x0d,
201 "Invisible" => 0x0e,
202 "Bottom_And_Left" => 0x0f,
203 "Top_And_Bottom_And_Left" => 0x10,
206 my %break_types =
208 "BK" => 0x0001,
209 "CR" => 0x0002,
210 "LF" => 0x0003,
211 "CM" => 0x0004,
212 "SG" => 0x0005,
213 "GL" => 0x0006,
214 "CB" => 0x0007,
215 "SP" => 0x0008,
216 "ZW" => 0x0009,
217 "NL" => 0x000a,
218 "WJ" => 0x000b,
219 "JL" => 0x000c,
220 "JV" => 0x000d,
221 "JT" => 0x000e,
222 "H2" => 0x000f,
223 "H3" => 0x0010,
224 "XX" => 0x0011,
225 "OP" => 0x0012,
226 "CL" => 0x0013,
227 "CP" => 0x0014,
228 "QU" => 0x0015,
229 "NS" => 0x0016,
230 "EX" => 0x0017,
231 "SY" => 0x0018,
232 "IS" => 0x0019,
233 "PR" => 0x001a,
234 "PO" => 0x001b,
235 "NU" => 0x001c,
236 "AL" => 0x001d,
237 "ID" => 0x001e,
238 "IN" => 0x001f,
239 "HY" => 0x0020,
240 "BB" => 0x0021,
241 "BA" => 0x0022,
242 "SA" => 0x0023,
243 "AI" => 0x0024,
244 "B2" => 0x0025,
245 "HL" => 0x0026,
246 "CJ" => 0x0027,
247 "RI" => 0x0028,
248 "EB" => 0x0029,
249 "EM" => 0x002a,
250 "ZWJ" => 0x002b,
253 my %vertical_types =
255 "R" => 0x0000,
256 "U" => 0x0001,
257 "Tr" => 0x0002,
258 "Tu" => 0x0003,
261 my %categories =
263 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
264 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
265 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
266 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
267 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
268 "Me" => $ctype{"defin"}, # Mark, Enclosing
269 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
270 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
271 "No" => $ctype{"defin"}, # Number, Other
272 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
273 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
274 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
275 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
276 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
277 "Cs" => $ctype{"defin"}, # Other, Surrogate
278 "Co" => $ctype{"defin"}, # Other, Private Use
279 "Cn" => $ctype{"defin"}, # Other, Not Assigned
280 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
281 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
282 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
283 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
284 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
285 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
286 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
287 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
288 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
289 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
290 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
291 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
292 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
295 # a few characters need additional categories that cannot be determined automatically
296 my %special_categories =
298 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
299 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
300 "space" => [ 0x09..0x0d, 0x85 ],
301 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
302 "cntrl" => [ 0x070f, 0x200c, 0x200d,
303 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
304 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
305 0xfff9, 0xfffa, 0xfffb ],
306 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
307 0xd7, 0xf7 ],
308 "digit" => [ 0xb2, 0xb3, 0xb9 ],
309 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
310 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
311 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
312 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
313 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
314 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
315 0x02b9..0x02ba, 0x02c6..0x02cf ],
316 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
317 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
318 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
319 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
320 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
321 0x3131..0x3164 ],
322 "ideograph" => [ 0x3006..0x3007 ],
323 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
324 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
325 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
326 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
327 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
328 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
329 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
330 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
331 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
332 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
333 "kashida" => [ 0x0640 ],
336 my %directions =
338 "L" => 1, # Left-to-Right
339 "R" => 2, # Right-to-Left
340 "AL" => 12, # Right-to-Left Arabic
341 "EN" => 3, # European Number
342 "ES" => 4, # European Number Separator
343 "ET" => 5, # European Number Terminator
344 "AN" => 6, # Arabic Number
345 "CS" => 7, # Common Number Separator
346 "NSM" => 13, # Non-Spacing Mark
347 "BN" => 14, # Boundary Neutral
348 "B" => 8, # Paragraph Separator
349 "S" => 9, # Segment Separator
350 "WS" => 10, # Whitespace
351 "ON" => 11, # Other Neutrals
352 "LRE" => 15, # Left-to-Right Embedding
353 "LRO" => 15, # Left-to-Right Override
354 "RLE" => 15, # Right-to-Left Embedding
355 "RLO" => 15, # Right-to-Left Override
356 "PDF" => 15, # Pop Directional Format
357 "LRI" => 15, # Left-to-Right Isolate
358 "RLI" => 15, # Right-to-Left Isolate
359 "FSI" => 15, # First Strong Isolate
360 "PDI" => 15 # Pop Directional Isolate
363 my %c2_types =
365 "L" => 1, # C2_LEFTTORIGHT
366 "R" => 2, # C2_RIGHTTOLEFT
367 "AL" => 2, # C2_RIGHTTOLEFT
368 "EN" => 3, # C2_EUROPENUMBER
369 "ES" => 4, # C2_EUROPESEPARATOR
370 "ET" => 5, # C2_EUROPETERMINATOR
371 "AN" => 6, # C2_ARABICNUMBER
372 "CS" => 7, # C2_COMMONSEPARATOR
373 "NSM" => 11, # C2_OTHERNEUTRAL
374 "BN" => 0, # C2_NOTAPPLICABLE
375 "B" => 8, # C2_BLOCKSEPARATOR
376 "S" => 9, # C2_SEGMENTSEPARATOR
377 "WS" => 10, # C2_WHITESPACE
378 "ON" => 11, # C2_OTHERNEUTRAL
379 "LRE" => 11, # C2_OTHERNEUTRAL
380 "LRO" => 11, # C2_OTHERNEUTRAL
381 "RLE" => 11, # C2_OTHERNEUTRAL
382 "RLO" => 11, # C2_OTHERNEUTRAL
383 "PDF" => 11, # C2_OTHERNEUTRAL
384 "LRI" => 11, # C2_OTHERNEUTRAL
385 "RLI" => 11, # C2_OTHERNEUTRAL
386 "FSI" => 11, # C2_OTHERNEUTRAL
387 "PDI" => 11 # C2_OTHERNEUTRAL
390 my %bidi_types =
392 "ON" => 0, # Other Neutrals
393 "L" => 1, # Left-to-Right
394 "R" => 2, # Right-to-Left
395 "AN" => 3, # Arabic Number
396 "EN" => 4, # European Number
397 "AL" => 5, # Right-to-Left Arabic
398 "NSM" => 6, # Non-Spacing Mark
399 "CS" => 7, # Common Number Separator
400 "ES" => 8, # European Number Separator
401 "ET" => 9, # European Number Terminator
402 "BN" => 10, # Boundary Neutral
403 "S" => 11, # Segment Separator
404 "WS" => 12, # Whitespace
405 "B" => 13, # Paragraph Separator
406 "RLO" => 14, # Right-to-Left Override
407 "RLE" => 15, # Right-to-Left Embedding
408 "LRO" => 16, # Left-to-Right Override
409 "LRE" => 17, # Left-to-Right Embedding
410 "PDF" => 18, # Pop Directional Format
411 "LRI" => 19, # Left-to-Right Isolate
412 "RLI" => 20, # Right-to-Left Isolate
413 "FSI" => 21, # First Strong Isolate
414 "PDI" => 22 # Pop Directional Isolate
417 my %joining_types =
419 "U" => 0, # Non_Joining
420 "T" => 1, # Transparent
421 "R" => 2, # Right_Joining
422 "L" => 3, # Left_Joining
423 "D" => 4, # Dual_Joining
424 "C" => 5, # Join_Causing
427 my @cp2uni = ();
428 my @glyph2uni = ();
429 my @lead_bytes = ();
430 my @uni2cp = ();
431 my @tolower_table = ();
432 my @toupper_table = ();
433 my @digitmap_table = ();
434 my @category_table = ();
435 my @joining_table = ();
436 my @direction_table = ();
437 my @decomp_table = ();
438 my @combining_class_table = ();
439 my @decomp_compat_table = ();
440 my @comp_exclusions = ();
441 my @idna_decomp_table = ();
442 my @idna_disallowed = ();
443 my %registry_keys;
444 my $default_char;
445 my $default_wchar;
447 my %joining_forms =
449 "isolated" => [],
450 "final" => [],
451 "initial" => [],
452 "medial" => []
455 sub to_utf16(@)
457 my @ret;
458 foreach my $ch (@_)
460 if ($ch < 0x10000)
462 push @ret, $ch;
464 else
466 my $val = $ch - 0x10000;
467 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
470 return @ret;
473 ################################################################
474 # fetch a unicode.org file and open it
475 sub open_data_file($$)
477 my ($base, $name) = @_;
478 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
479 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
480 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
481 local *FILE;
483 if ($base =~ /.*\/([^\/]+)\.zip$/)
485 my $zip = "$1$suffix.zip";
486 unless (-f "$cache/$zip")
488 system "mkdir", "-p", $cache;
489 print "Fetching $base...\n";
490 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
492 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
494 else
496 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
497 unless (-f $dest)
499 system "mkdir", "-p", $dir;
500 print "Fetching $base/$name...\n";
501 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
503 open FILE, "<$dest" or die "cannot open $dest";
505 return *FILE;
508 ################################################################
509 # recursively get the decomposition for a character
510 sub get_decomposition($$);
511 sub get_decomposition($$)
513 my ($char, $table) = @_;
514 my @ret;
516 return $char unless defined $table->[$char];
517 foreach my $ch (@{$table->[$char]})
519 push @ret, get_decomposition( $ch, $table );
521 return @ret;
524 ################################################################
525 # get the composition that results in a given character
526 sub get_composition($$)
528 my ($ch, $compat) = @_;
529 return () unless defined $decomp_table[$ch]; # no decomposition
530 my @ret = @{$decomp_table[$ch]};
531 return () if @ret < 2; # singleton decomposition
532 return () if $comp_exclusions[$ch]; # composition exclusion
533 return () if $combining_class_table[$ch]; # non-starter
534 return () if $combining_class_table[$ret[0]]; # first char is non-starter
535 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
536 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
537 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
538 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
539 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
540 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
541 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
542 return @ret;
545 ################################################################
546 # recursively build decompositions
547 sub build_decompositions(@)
549 my @src = @_;
550 my @dst;
552 for (my $i = 0; $i < @src; $i++)
554 next unless defined $src[$i];
555 my @decomp = to_utf16( get_decomposition( $i, \@src ));
556 $dst[$i] = \@decomp;
558 return @dst;
561 ################################################################
562 # compose Hangul sequences
563 sub compose_hangul(@)
565 my $SBASE = 0xac00;
566 my $LBASE = 0x1100;
567 my $VBASE = 0x1161;
568 my $TBASE = 0x11a7;
569 my $LCOUNT = 19;
570 my $VCOUNT = 21;
571 my $TCOUNT = 28;
572 my $NCOUNT = $VCOUNT * $TCOUNT;
573 my $SCOUNT = $LCOUNT * $NCOUNT;
575 my @seq = @_;
576 my @ret;
577 my $i;
579 for ($i = 0; $i < @seq; $i++)
581 my $ch = $seq[$i];
582 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
583 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
585 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
586 $i++;
588 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
589 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
591 $ch += $seq[$i+1] - $TBASE;
592 $i++;
594 push @ret, $ch;
596 return @ret;
599 ################################################################
600 # remove linguistic-only mappings from the case table
601 sub remove_linguistic_mappings($$)
603 my ($upper, $lower) = @_;
605 # remove case mappings that don't round-trip
607 for (my $i = 0; $i < @{$upper}; $i++)
609 next unless defined ${$upper}[$i];
610 my $ch = ${$upper}[$i];
611 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
613 for (my $i = 0; $i < @{$lower}; $i++)
615 next unless defined ${$lower}[$i];
616 my $ch = ${$lower}[$i];
617 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
621 ################################################################
622 # read in the Unicode database files
623 sub load_data()
625 my $start;
627 # now build mappings from the decomposition field of the Unicode database
629 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
630 while (<$UNICODE_DATA>)
632 # Decode the fields ...
633 my ($code, $name, $cat, $comb, $bidi,
634 $decomp, $dec, $dig, $num, $mirror,
635 $oldname, $comment, $upper, $lower, $title) = split /;/;
636 my $src = hex $code;
638 die "unknown category $cat" unless defined $categories{$cat};
639 die "unknown directionality $bidi" unless defined $directions{$bidi};
641 $category_table[$src] = $categories{$cat};
642 $direction_table[$src] = $bidi;
643 $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
645 if ($lower ne "")
647 $tolower_table[$src] = hex $lower;
649 if ($upper ne "")
651 $toupper_table[$src] = hex $upper;
653 if ($dec ne "")
655 $category_table[$src] |= $ctype{"digit"};
657 if ($dig ne "")
659 $digitmap_table[$src] = ord $dig;
661 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
663 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
664 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
665 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
666 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
667 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
668 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
669 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
670 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
671 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
672 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
673 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
674 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
676 # copy the category and direction for everything between First/Last pairs
677 if ($name =~ /, First>/) { $start = $src; }
678 if ($name =~ /, Last>/)
680 while ($start < $src)
682 $category_table[$start] = $category_table[$src];
683 $direction_table[$start] = $direction_table[$src];
684 $combining_class_table[$start] = $combining_class_table[$src];
685 $start++;
689 next if $decomp eq ""; # no decomposition, skip it
691 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
693 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
694 $decomp_compat_table[$src] = \@seq;
697 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
699 # decomposition of the form "<foo> 1234" -> use char if type is known
700 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
702 ${joining_forms{$1}}[hex $2] = $src;
705 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
707 # decomposition "<compat> 0020 1234" -> combining accent
709 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
711 # store decomposition
712 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
714 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
716 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
718 # Single char decomposition
719 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
723 close $UNICODE_DATA;
725 # patch the category of some special characters
727 for (my $i = 0; $i < @decomp_table; $i++)
729 next unless defined $decomp_table[$i];
730 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
732 foreach my $cat (keys %special_categories)
734 my $flag = $ctype{$cat};
735 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
737 for (my $i = 0; $i < @decomp_compat_table; $i++)
739 next unless defined $decomp_compat_table[$i];
740 next unless @{$decomp_compat_table[$i]} == 2;
741 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
744 # load the composition exclusions
746 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
747 while (<$EXCL>)
749 s/\#.*//; # remove comments
750 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
752 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
754 elsif (/^([0-9a-fA-F]+)\s*$/)
756 $comp_exclusions[hex $1] = 1;
759 close $EXCL;
761 # load the IDNA mappings
763 @idna_decomp_table = @decomp_compat_table;
764 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
765 while (<$IDNA>)
767 s/\#.*//; # remove comments
768 next if /^\s*$/;
769 my ($char, $type, $mapping) = split /;/;
770 my ($ch1, $ch2);
771 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
773 $ch1 = hex $1;
774 $ch2 = hex $2;
776 elsif ($char =~ /([0-9a-fA-F]+)/)
778 $ch1 = $ch2 = hex $1;
781 if ($type =~ /mapped/ || $type =~ /deviation/)
783 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
784 my @seq = map { hex $_; } split /\s+/, $mapping;
785 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
787 elsif ($type =~ /valid/)
790 elsif ($type =~ /ignored/)
792 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
794 elsif ($type =~ /disallowed/)
796 foreach my $i ($ch1 .. $ch2)
798 $idna_decomp_table[$i] = undef;
799 $idna_disallowed[$i] = 1;
803 close $IDNA;
807 ################################################################
808 # add a new registry key
809 sub add_registry_key($$)
811 my ($key, $defval) = @_;
812 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
815 ################################################################
816 # add a new registry value
817 sub add_registry_value($$$)
819 my ($key, $name, $value) = @_;
820 add_registry_key( $key, undef );
821 push @{$registry_keys{$key}}, "'$name' = s '$value'";
824 ################################################################
825 # define a new lead byte
826 sub add_lead_byte($)
828 my $ch = shift;
829 return if defined $cp2uni[$ch];
830 push @lead_bytes, $ch;
831 $cp2uni[$ch] = 0;
834 ################################################################
835 # define a new char mapping
836 sub add_mapping($$)
838 my ($cp, $uni) = @_;
839 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
840 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
841 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
844 ################################################################
845 # get a mapping including glyph chars for MB_USEGLYPHCHARS
846 sub get_glyphs_mapping(@)
848 my @table = @_;
850 for (my $i = 0; $i < @glyph2uni; $i++)
852 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
854 return @table;
857 ################################################################
858 # build EUC-JP table from the JIS 0208/0212 files
859 sub dump_eucjp_codepage()
861 @cp2uni = ();
862 @glyph2uni = ();
863 @lead_bytes = ();
864 @uni2cp = ();
865 $default_char = $DEF_CHAR;
866 $default_wchar = 0x30fb;
868 # ASCII chars
869 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
871 # lead bytes
872 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
874 # JIS X 0201 right plane
875 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
877 # undefined chars
878 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
879 $cp2uni[0xa0] = 0xf8f0;
880 $cp2uni[0xff] = 0xf8f3;
882 # Fix backslash conversion
883 add_mapping( 0xa1c0, 0xff3c );
885 # Add private mappings for rows undefined in JIS 0208/0212
886 my $private = 0xe000;
887 foreach my $hi (0xf5 .. 0xfe)
889 foreach my $lo (0xa1 .. 0xfe)
891 add_mapping( ($hi << 8) + $lo, $private++ );
894 foreach my $hi (0xf5 .. 0xfe)
896 foreach my $lo (0x21 .. 0x7e)
898 add_mapping( ($hi << 8) + $lo, $private++ );
902 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
903 while (<$INPUT>)
905 next if /^\#/; # skip comments
906 next if /^$/; # skip empty lines
907 next if /\x1a/; # skip ^Z
908 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
910 add_mapping( 0x8080 + hex $1, hex $2 );
911 next;
913 die "Unrecognized line $_\n";
915 close $INPUT;
917 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
918 while (<$INPUT>)
920 next if /^\#/; # skip comments
921 next if /^$/; # skip empty lines
922 next if /\x1a/; # skip ^Z
923 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
925 add_mapping( 0x8000 + hex $1, hex $2 );
926 next;
928 die "Unrecognized line $_\n";
930 close $INPUT;
932 output_codepage_file( 20932 );
936 ################################################################
937 # build the sort keys table
938 sub dump_sortkeys($)
940 my $filename = shift;
941 my @sortkeys = ();
943 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
944 while (<$INPUT>)
946 next if /^\#/; # skip comments
947 next if /^$/; # skip empty lines
948 next if /\x1a/; # skip ^Z
949 next if /^\@version/; # skip @version header
950 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
952 my ($uni,$variable) = (hex $1, $2);
953 next if $uni > 65535;
954 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
955 next;
957 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
959 # multiple character sequence, ignored for now
960 next;
962 die "$SORTKEYS: Unrecognized line $_\n";
964 close $INPUT;
966 # compress the keys to 32 bit:
967 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
969 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
970 ${$a}[2] <=> ${$b}[2] or
971 ${$a}[3] <=> ${$b}[3] or
972 ${$a}[4] <=> ${$b}[4] or
973 $a cmp $b; } @sortkeys;
975 my ($n2, $n3) = (1, 1);
976 my @keys = (-1, -1, -1, -1, -1 );
977 my @flatkeys = ();
979 for (my $i = 0; $i < @sortkeys; $i++)
981 next unless defined $sortkeys[$i];
982 my @current = @{$sortkeys[$i]};
983 if ($current[1] == $keys[1])
985 if ($current[2] == $keys[2])
987 if ($current[3] == $keys[3])
989 # nothing
991 else
993 $keys[3] = $current[3];
994 $n3++;
995 die if ($n3 >= 16);
998 else
1000 $keys[2] = $current[2];
1001 $keys[3] = $current[3];
1002 $n2++;
1003 $n3 = 1;
1004 die if ($n2 >= 256);
1007 else
1009 $keys[1] = $current[1];
1010 $keys[2] = $current[2];
1011 $keys[3] = $current[3];
1012 $n2 = 1;
1013 $n3 = 1;
1016 if ($current[2]) { $current[2] = $n2; }
1017 if ($current[3]) { $current[3] = $n3; }
1018 if ($current[4]) { $current[4] = 1; }
1020 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1023 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1024 printf "Building $filename\n";
1025 printf OUTPUT "/* Unicode collation element table */\n";
1026 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1027 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1028 print OUTPUT "#include \"windef.h\"\n\n";
1030 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1032 close OUTPUT;
1033 save_file($filename);
1037 ################################################################
1038 # dump an array of integers
1039 sub dump_array($$@)
1041 my ($bit_width, $default, @array) = @_;
1042 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1043 my $i;
1044 my $ret = " ";
1045 for ($i = 0; $i < $#array; $i++)
1047 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1048 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1050 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1051 return $ret;
1055 ################################################################
1056 # dump an SBCS mapping table in binary format
1057 sub dump_binary_sbcs_table($)
1059 my $codepage = shift;
1061 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1062 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1064 print OUTPUT pack "S<*", @header;
1065 print OUTPUT pack "C12", (0) x 12;
1066 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1068 if (@glyph2uni)
1070 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1072 else
1074 print OUTPUT pack "S<*", 0;
1077 print OUTPUT pack "S<*", 0, 0;
1079 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1083 ################################################################
1084 # dump a DBCS mapping table in binary format
1085 sub dump_binary_dbcs_table($)
1087 my $codepage = shift;
1088 my @lb_ranges = get_lb_ranges();
1089 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1091 my @offsets = (0) x 256;
1092 my $pos = 0;
1093 foreach my $i (@lead_bytes)
1095 $offsets[$i] = ($pos += 256);
1096 $cp2uni[$i] = 0;
1099 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1101 print OUTPUT pack "S<*", @header;
1102 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1103 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1104 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1106 foreach my $i (@lead_bytes)
1108 my $base = $i << 8;
1109 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1112 print OUTPUT pack "S<", 4;
1113 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1117 ################################################################
1118 # get the list of defined lead byte ranges
1119 sub get_lb_ranges()
1121 my @list = ();
1122 my @ranges = ();
1124 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1125 my $on = 0;
1126 for (my $i = 0; $i < 256; $i++)
1128 if ($on)
1130 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1132 else
1134 if ($list[$i]) { push @ranges, $i; $on = 1; }
1137 if ($on) { push @ranges, 0xff; }
1138 return @ranges;
1141 ################################################################
1142 # dump the Indic Syllabic Category table
1143 sub dump_indic($)
1145 my $filename = shift;
1146 my @indic_table;
1148 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1149 while (<$INPUT>)
1151 next if /^\#/; # skip comments
1152 next if /^\s*$/; # skip empty lines
1153 next if /\x1a/; # skip ^Z
1154 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1156 my $type = $2;
1157 die "unknown indic $type" unless defined $indic_types{$type};
1158 if (hex $1 < 65536)
1160 $indic_table[hex $1] = $indic_types{$type};
1162 next;
1164 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1166 my $type = $3;
1167 die "unknown indic $type" unless defined $indic_types{$type};
1168 if (hex $1 < 65536 and hex $2 < 65536)
1170 foreach my $i (hex $1 .. hex $2)
1172 $indic_table[$i] = $indic_types{$type};
1175 next;
1177 die "malformed line $_";
1179 close $INPUT;
1181 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1182 while (<$INPUT>)
1184 next if /^\#/; # skip comments
1185 next if /^\s*$/; # skip empty lines
1186 next if /\x1a/; # skip ^Z
1187 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1189 my $type = $2;
1190 die "unknown matra $type" unless defined $matra_types{$type};
1191 $indic_table[hex $1] |= $matra_types{$type} << 8;
1192 next;
1194 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1196 my $type = $3;
1197 die "unknown matra $type" unless defined $matra_types{$type};
1198 foreach my $i (hex $1 .. hex $2)
1200 $indic_table[$i] |= $matra_types{$type} << 8;
1202 next;
1204 die "malformed line $_";
1206 close $INPUT;
1208 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1209 print "Building $filename\n";
1210 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1211 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1212 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1213 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1214 print OUTPUT "#include \"windef.h\"\n\n";
1216 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1218 close OUTPUT;
1219 save_file($filename);
1222 ################################################################
1223 # dump the Line Break Properties table
1224 sub dump_linebreak($)
1226 my $filename = shift;
1227 my @break_table;
1228 my $next_group = 0;
1230 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1231 while (<$INPUT>)
1233 next if /^\#/; # skip comments
1234 next if /^\s*$/; # skip empty lines
1235 next if /\x1a/; # skip ^Z
1236 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1238 my $type = $2;
1239 die "unknown breaktype $type" unless defined $break_types{$type};
1240 $break_table[hex $1] = $break_types{$type};
1241 next;
1243 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1245 my $type = $3;
1246 die "unknown breaktype $type" unless defined $break_types{$type};
1247 foreach my $i (hex $1 .. hex $2)
1249 $break_table[$i] = $break_types{$type};
1251 next;
1253 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1255 my $type = $2;
1256 die "unknown breaktype $type" unless defined $break_types{$type};
1257 $break_table[hex $1] = $break_types{$type};
1258 next;
1260 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1262 my $type = $3;
1263 die "unknown breaktype $type" unless defined $break_types{$type};
1264 foreach my $i (hex $1 .. hex $2)
1266 $break_table[$i] = $break_types{$type};
1268 next;
1270 die "malformed line $_";
1272 close $INPUT;
1274 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1275 print "Building $filename\n";
1276 print OUTPUT "/* Unicode Line Break Properties */\n";
1277 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1278 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1279 print OUTPUT "#include \"windef.h\"\n\n";
1281 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1283 close OUTPUT;
1284 save_file($filename);
1287 my %scripts =
1289 "Unknown" => 0,
1290 "Common" => 1,
1291 "Inherited" => 2,
1292 "Arabic" => 3,
1293 "Armenian" => 4,
1294 "Avestan" => 5,
1295 "Balinese" => 6,
1296 "Bamum" => 7,
1297 "Batak" => 8,
1298 "Bengali" => 9,
1299 "Bopomofo" => 10,
1300 "Brahmi" => 11,
1301 "Braille" => 12,
1302 "Buginese" => 13,
1303 "Buhid" => 14,
1304 "Canadian_Aboriginal" => 15,
1305 "Carian" => 16,
1306 "Cham" => 17,
1307 "Cherokee" => 18,
1308 "Coptic" => 19,
1309 "Cuneiform" => 20,
1310 "Cypriot" => 21,
1311 "Cyrillic" => 22,
1312 "Deseret" => 23,
1313 "Devanagari" => 24,
1314 "Egyptian_Hieroglyphs" => 25,
1315 "Ethiopic" => 26,
1316 "Georgian" => 27,
1317 "Glagolitic" => 28,
1318 "Gothic" => 29,
1319 "Greek" => 30,
1320 "Gujarati" => 31,
1321 "Gurmukhi" => 32,
1322 "Han" => 33,
1323 "Hangul" => 34,
1324 "Hanunoo" => 35,
1325 "Hebrew" => 36,
1326 "Hiragana" => 37,
1327 "Imperial_Aramaic" => 38,
1328 "Inscriptional_Pahlavi" => 39,
1329 "Inscriptional_Parthian" => 40,
1330 "Javanese" => 41,
1331 "Kaithi" => 42,
1332 "Kannada" => 43,
1333 "Katakana" => 44,
1334 "Kayah_Li" => 45,
1335 "Kharoshthi" => 46,
1336 "Khmer" => 47,
1337 "Lao" => 48,
1338 "Latin" => 49,
1339 "Lepcha" => 50,
1340 "Limbu" => 51,
1341 "Linear_B" => 52,
1342 "Lisu" => 53,
1343 "Lycian" => 54,
1344 "Lydian" => 55,
1345 "Malayalam" => 56,
1346 "Mandaic" => 57,
1347 "Meetei_Mayek" => 58,
1348 "Mongolian" => 59,
1349 "Myanmar" => 60,
1350 "New_Tai_Lue" => 61,
1351 "Nko" => 62,
1352 "Ogham" => 63,
1353 "Ol_Chiki" => 64,
1354 "Old_Italic" => 65,
1355 "Old_Persian" => 66,
1356 "Old_South_Arabian" => 67,
1357 "Old_Turkic" => 68,
1358 "Oriya" => 69,
1359 "Osmanya" => 70,
1360 "Phags_Pa" => 71,
1361 "Phoenician" => 72,
1362 "Rejang" => 73,
1363 "Runic" => 74,
1364 "Samaritan" => 75,
1365 "Saurashtra" => 76,
1366 "Shavian" => 77,
1367 "Sinhala" => 78,
1368 "Sundanese" => 79,
1369 "Syloti_Nagri" => 80,
1370 "Syriac" => 81,
1371 "Tagalog" => 82,
1372 "Tagbanwa" => 83,
1373 "Tai_Le" => 84,
1374 "Tai_Tham" => 85,
1375 "Tai_Viet" => 86,
1376 "Tamil" => 87,
1377 "Telugu" => 88,
1378 "Thaana" => 89,
1379 "Thai" => 90,
1380 "Tibetan" => 91,
1381 "Tifinagh" => 92,
1382 "Ugaritic" => 93,
1383 "Vai" => 94,
1384 "Yi" => 95,
1385 # Win8/Win8.1
1386 "Chakma" => 96,
1387 "Meroitic_Cursive" => 97,
1388 "Meroitic_Hieroglyphs" => 98,
1389 "Miao" => 99,
1390 "Sharada" => 100,
1391 "Sora_Sompeng" => 101,
1392 "Takri" => 102,
1393 # Win10
1394 "Bassa_Vah" => 103,
1395 "Caucasian_Albanian" => 104,
1396 "Duployan" => 105,
1397 "Elbasan" => 106,
1398 "Grantha" => 107,
1399 "Khojki" => 108,
1400 "Khudawadi" => 109,
1401 "Linear_A" => 110,
1402 "Mahajani" => 111,
1403 "Manichaean" => 112,
1404 "Mende_Kikakui" => 113,
1405 "Modi" => 114,
1406 "Mro" => 115,
1407 "Nabataean" => 116,
1408 "Old_North_Arabian" => 117,
1409 "Old_Permic" => 118,
1410 "Pahawh_Hmong" => 119,
1411 "Palmyrene" => 120,
1412 "Pau_Cin_Hau" => 121,
1413 "Psalter_Pahlavi" => 122,
1414 "Siddham" => 123,
1415 "Tirhuta" => 124,
1416 "Warang_Citi" => 125,
1417 # Win10 RS1
1418 "Adlam" => 126,
1419 "Ahom" => 127,
1420 "Anatolian_Hieroglyphs" => 128,
1421 "Bhaiksuki" => 129,
1422 "Hatran" => 130,
1423 "Marchen" => 131,
1424 "Multani" => 132,
1425 "Newa" => 133,
1426 "Old_Hungarian" => 134,
1427 "Osage" => 135,
1428 "SignWriting" => 136,
1429 "Tangut" => 137,
1430 # Win10 RS4
1431 "Masaram_Gondi" => 138,
1432 "Nushu" => 139,
1433 "Soyombo" => 140,
1434 "Zanabazar_Square" => 141,
1435 # Win10 1903
1436 "Dogra" => 142,
1437 "Gunjala_Gondi" => 143,
1438 "Hanifi_Rohingya" => 144,
1439 "Makasar" => 145,
1440 "Medefaidrin" => 146,
1441 "Old_Sogdian" => 147,
1442 "Sogdian" => 148,
1445 ################################################################
1446 # dump Script IDs table
1447 sub dump_scripts($)
1449 my $filename = shift;
1450 my $header = $filename;
1451 my @scripts_table;
1452 my $script_index;
1453 my $i;
1455 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1456 # Fill the table
1457 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1458 while (<$INPUT>)
1460 my $type = "";
1462 next if /^\#/; # skip comments
1463 next if /^\s*$/; # skip empty lines
1464 next if /\x1a/; # skip ^Z
1465 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1467 $type = $2;
1468 if (defined $scripts{$type})
1470 $scripts_table[hex $1] = $scripts{$type};
1472 next;
1474 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1476 $type = $3;
1477 if (defined $scripts{$type})
1479 foreach my $i (hex $1 .. hex $2)
1481 $scripts_table[$i] = $scripts{$type};
1484 next;
1488 close $INPUT;
1490 $header = "$filename.h";
1491 open OUTPUT,">$header.new" or die "Cannot create $header";
1492 print "Building $header\n";
1493 print OUTPUT "/* Unicode Script IDs */\n";
1494 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1495 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1497 print OUTPUT "enum unicode_script_id {\n";
1498 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1500 print OUTPUT " Script_$script = $scripts{$script},\n";
1502 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1503 print OUTPUT "};\n";
1505 close OUTPUT;
1506 save_file($header);
1508 $filename = "$filename.c";
1509 open OUTPUT,">$filename.new" or die "Cannot create $header";
1510 print "Building $filename\n";
1511 print OUTPUT "/* Unicode Script IDs */\n";
1512 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1513 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1514 print OUTPUT "#include \"windef.h\"\n\n";
1516 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1517 close OUTPUT;
1518 save_file($filename);
1521 ################################################################
1522 # dump the BiDi mirroring table
1523 sub dump_mirroring($)
1525 my $filename = shift;
1526 my @mirror_table = ();
1528 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1529 while (<$INPUT>)
1531 next if /^\#/; # skip comments
1532 next if /^$/; # skip empty lines
1533 next if /\x1a/; # skip ^Z
1534 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1536 $mirror_table[hex $1] = hex $2;
1537 next;
1539 die "malformed line $_";
1541 close $INPUT;
1543 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1544 print "Building $filename\n";
1545 print OUTPUT "/* Unicode BiDi mirroring */\n";
1546 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1547 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1548 print OUTPUT "#include \"windef.h\"\n\n";
1549 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1550 close OUTPUT;
1551 save_file($filename);
1554 ################################################################
1555 # dump the Bidi Brackets
1556 sub dump_bracket($)
1558 my $filename = shift;
1559 my @bracket_table;
1561 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1562 while (<$INPUT>)
1564 next if /^\#/; # skip comments
1565 next if /^\s*$/; # skip empty lines
1566 next if /\x1a/; # skip ^Z
1567 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1569 my $type = $3;
1570 die "unknown bracket $type" unless defined $bracket_types{$type};
1571 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1572 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1573 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1574 next;
1576 die "malformed line $_";
1578 close $INPUT;
1580 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1581 print "Building $filename\n";
1582 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1583 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1584 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1585 print OUTPUT "#include \"windef.h\"\n\n";
1587 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1589 close OUTPUT;
1590 save_file($filename);
1593 ################################################################
1594 # dump the Arabic shaping table
1595 sub dump_shaping($)
1597 my $filename = shift;
1598 my %groups;
1599 my $next_group = 0;
1601 $groups{"No_Joining_Group"} = $next_group++;
1603 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1604 while (<$INPUT>)
1606 next if /^\#/; # skip comments
1607 next if /^\s*$/; # skip empty lines
1608 next if /\x1a/; # skip ^Z
1609 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1611 my $type = $2;
1612 my $group = $3;
1613 $groups{$group} = $next_group++ unless defined $groups{$group};
1614 $joining_table[hex $1] = $joining_types{$type} | ($groups{$group} << 8);
1615 next;
1617 die "malformed line $_";
1619 close $INPUT;
1621 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1622 print "Building $filename\n";
1623 print OUTPUT "/* Unicode Arabic shaping */\n";
1624 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1625 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1626 print OUTPUT "#include \"windef.h\"\n\n";
1628 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1630 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1631 for (my $i = 0x600; $i <= 0x6ff; $i++)
1633 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1634 ${joining_forms{"isolated"}}[$i] || $i,
1635 ${joining_forms{"final"}}[$i] || $i,
1636 ${joining_forms{"initial"}}[$i] || $i,
1637 ${joining_forms{"medial"}}[$i] || $i;
1639 print OUTPUT "};\n";
1641 close OUTPUT;
1642 save_file($filename);
1645 ################################################################
1646 # dump the Vertical Orientation table
1647 sub dump_vertical($)
1649 my $filename = shift;
1650 my @vertical_table;
1652 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1653 while (<$INPUT>)
1655 next if /^\#/; # skip comments
1656 next if /^\s*$/; # skip empty lines
1657 next if /\x1a/; # skip ^Z
1658 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1660 my $type = $2;
1661 die "unknown vertical $type" unless defined $vertical_types{$type};
1662 if (hex $1 < 65536)
1664 $vertical_table[hex $1] = $vertical_types{$type};
1666 next;
1668 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1670 my $type = $3;
1671 die "unknown vertical $type" unless defined $vertical_types{$type};
1672 foreach my $i (hex $1 .. hex $2)
1674 $vertical_table[$i] = $vertical_types{$type};
1676 next;
1678 die "malformed line $_";
1680 close $INPUT;
1682 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1683 print "Building $filename\n";
1684 print OUTPUT "/* Unicode Vertical Orientation */\n";
1685 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1686 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1687 print OUTPUT "#include \"windef.h\"\n\n";
1689 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1691 close OUTPUT;
1692 save_file($filename);
1695 ################################################################
1696 # dump the digit folding tables
1697 sub dump_digit_folding($)
1699 my ($filename) = shift;
1700 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1701 print "Building $filename\n";
1702 print OUTPUT "/* Unicode digit folding mappings */\n";
1703 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1704 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1705 print OUTPUT "#include \"windef.h\"\n\n";
1707 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1708 close OUTPUT;
1709 save_file($filename);
1713 ################################################################
1714 # dump the case mapping tables
1715 sub dump_case_mappings($)
1717 my $filename = shift;
1718 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1719 print "Building $filename\n";
1720 print OUTPUT "/* Unicode case mappings */\n";
1721 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1722 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1723 print OUTPUT "#include \"windef.h\"\n\n";
1725 my @upper = @toupper_table;
1726 my @lower = @tolower_table;
1727 remove_linguistic_mappings( \@upper, \@lower );
1729 dump_case_table( "wine_casemap_lower", @lower );
1730 print OUTPUT "\n";
1731 dump_case_table( "wine_casemap_upper", @upper );
1732 close OUTPUT;
1733 save_file($filename);
1737 ################################################################
1738 # dump a case mapping table
1739 sub dump_case_table($@)
1741 my ($name,@table) = @_;
1743 for (my $i = 0; $i < 65536; $i++)
1745 next unless defined $table[$i];
1746 $table[$i] = ($table[$i] - $i) & 0xffff;
1749 my @array = compress_array( 256, 0, @table[0..65535] );
1751 printf OUTPUT "const WCHAR %s[%d] =\n", $name, scalar @array;
1752 printf OUTPUT "{\n /* index */\n";
1753 printf OUTPUT "%s,\n", dump_array( 16, 0, @array[0..255] );
1754 printf OUTPUT " /* data */\n";
1755 printf OUTPUT "%s", dump_array( 16, 0, @array[256..$#array] );
1756 printf OUTPUT "\n};\n";
1759 ################################################################
1760 # compress a mapping table by removing identical rows
1761 sub compress_array($$@)
1763 my $rows = shift;
1764 my $def = shift;
1765 my @table = @_;
1766 my $len = @table / $rows;
1767 my @array;
1768 my $data = "";
1770 # try to merge table rows
1771 for (my $row = 0; $row < $rows; $row++)
1773 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1774 my $pos = index $data, $rowtxt;
1775 if ($pos == -1)
1777 # check if the tail of the data can match the start of the new row
1778 my $first = substr( $rowtxt, 0, 1 );
1779 for (my $i = length($data) - 1; $i > 0; $i--)
1781 $pos = index( substr( $data, -$i ), $first );
1782 last if $pos == -1;
1783 $i -= $pos;
1784 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1785 substr( $data, -$i ) = "";
1786 last;
1788 $pos = length $data;
1789 $data .= $rowtxt;
1791 $array[$row] = $rows + $pos;
1793 return @array, unpack "U*", $data;
1796 ################################################################
1797 # dump a simple char -> 16-bit value mapping table
1798 sub dump_simple_mapping($$@)
1800 my $name = shift;
1801 my $def = shift;
1802 my @array = compress_array( 256, $def, @_[0..65535] );
1804 printf OUTPUT "const unsigned short %s[%d] =\n{\n", $name, $#array+1;
1805 printf OUTPUT " /* offsets */\n%s,\n", dump_array( 16, 0, @array[0..255] );
1806 printf OUTPUT " /* values */\n%s\n};\n", dump_array( 16, 0, @array[256..$#array] );
1809 ################################################################
1810 # dump a char -> 16-bit value mapping table using two-level tables
1811 sub dump_two_level_mapping($$@)
1813 my $name = shift;
1814 my $def = shift;
1815 my $size = shift;
1816 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1817 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1818 my @array = compress_array( 256, 0, @row_array[0..4095] );
1820 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1822 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1823 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1824 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1825 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1828 ################################################################
1829 # dump a char -> value mapping table using three-level tables
1830 sub dump_three_level_mapping($$@)
1832 my $name = shift;
1833 my $def = shift;
1834 my $size = shift;
1835 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1836 my $level3 = ($MAX_CHAR + 1) / 16;
1837 my $level2 = $level3 / 16;
1838 my $level1 = $level2 / 16;
1839 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1840 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1841 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1843 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1844 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1846 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1847 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1848 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1849 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1850 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1853 ################################################################
1854 # dump a binary case mapping table in l_intl.nls format
1855 sub dump_binary_case_table(@)
1857 my (@table) = @_;
1858 my $max_char = 0x10000;
1859 my $level1 = $max_char / 16;
1860 my $level2 = $level1 / 16;
1862 my @difftable;
1863 for (my $i = 0; $i < @table; $i++)
1865 next unless defined $table[$i];
1866 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1869 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1870 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1871 my $offset = @array - $level1;
1872 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1873 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1876 ################################################################
1877 # dump case mappings for l_intl.nls
1878 sub dump_intl_nls($)
1880 my @upper_table = @toupper_table;
1881 my @lower_table = @tolower_table;
1882 remove_linguistic_mappings( \@upper_table, \@lower_table );
1884 my $upper = dump_binary_case_table( @upper_table );
1885 my $lower = dump_binary_case_table( @lower_table );
1887 my $filename = shift;
1888 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1889 printf "Building $filename\n";
1891 binmode OUTPUT;
1892 print OUTPUT pack "S<", 1; # version
1893 print OUTPUT $upper;
1894 print OUTPUT $lower;
1895 close OUTPUT;
1896 save_file($filename);
1900 ################################################################
1901 # dump the bidi direction table
1902 sub dump_bidi_dir_table($)
1904 my $filename = shift;
1905 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1906 printf "Building $filename\n";
1907 printf OUTPUT "/* Unicode BiDi direction table */\n";
1908 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1909 printf OUTPUT "#include \"windef.h\"\n\n";
1911 my @table;
1913 for (my $i = 0; $i < 65536; $i++)
1915 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1918 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1920 close OUTPUT;
1921 save_file($filename);
1925 ################################################################
1926 # dump the ctype tables
1927 sub dump_ctype_tables($)
1929 my $filename = shift;
1930 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1931 printf "Building $filename\n";
1932 printf OUTPUT "/* Unicode ctype tables */\n";
1933 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1934 printf OUTPUT "#include \"windef.h\"\n\n";
1936 my @table = map { ($_ || 0) & 0xffff; } @category_table;
1938 # add the direction in the high 4 bits of the category
1939 for (my $i = 0; $i < 65536; $i++)
1941 $table[$i] |= $directions{$direction_table[$i]} << 12 if defined $direction_table[$i];
1944 dump_simple_mapping( "wine_wctype_table", 0, @table );
1946 close OUTPUT;
1947 save_file($filename);
1951 sub rol($$)
1953 my ($byte, $count) = @_;
1954 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1957 ################################################################
1958 # compress the character properties table
1959 sub compress_char_props_table($@)
1961 my $rows = shift;
1962 my @table = @_;
1963 my $len = @table / $rows;
1964 my $pos = 0;
1965 my @array = (0) x $rows;
1966 my %sequences;
1968 # add some predefined sequences
1969 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1971 # try to merge table rows
1972 for (my $row = 0; $row < $rows; $row++)
1974 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1975 my $rowtxt = pack "L*", @table_row;
1976 if (defined($sequences{$rowtxt}))
1978 # reuse an existing row
1979 $array[$row] = $sequences{$rowtxt};
1981 else
1983 # create a new row
1984 $sequences{$rowtxt} = $array[$row] = ++$pos;
1985 push @array, @table_row;
1988 return @array;
1991 ################################################################
1992 # dump a normalization table in binary format
1993 sub dump_norm_table($)
1995 my $filename = shift;
1997 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
1998 my %decomp = ( "nfc" => \@decomp_table,
1999 "nfd" => \@decomp_table,
2000 "nfkc" => \@decomp_compat_table,
2001 "nfkd" => \@decomp_compat_table ,
2002 "idna" => \@idna_decomp_table );
2004 open OUTPUT,">$filename.new" or die "Cannot create $filename";
2005 print "Building $filename\n";
2007 my $type = $filename;
2008 $type =~ s!.*/norm(\w+)\.nls!$1!;
2010 my $compose = $forms{$type} & 1;
2011 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2013 my @version = split /\./, $UNIVERSION;
2015 # combining classes
2017 my @classes;
2018 my @class_values;
2020 foreach my $c (grep defined, @combining_class_table)
2022 $classes[$c] = 1 if $c < 0x100;
2024 for (my $i = 0; $i < @classes; $i++)
2026 next unless defined $classes[$i];
2027 $classes[$i] = @class_values;
2028 push @class_values, $i;
2030 push @class_values, 0 if (@class_values % 2);
2031 die "too many classes" if @class_values >= 0x40;
2033 # character properties
2035 my @char_props;
2036 my @decomposed;
2037 my @comp_hash_table;
2038 my $comp_hash_size = $compose ? 254 : 0;
2040 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2042 next unless defined $combining_class_table[$i];
2043 if (defined $decomp{$type}->[$i])
2045 my @dec = get_decomposition( $i, $decomp{$type} );
2046 if ($compose && (my @comp = get_composition( $i, $compat )))
2048 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2049 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2051 my $val = 0;
2052 foreach my $d (@dec)
2054 $val = $combining_class_table[$d];
2055 last if $val;
2057 $char_props[$i] = $classes[$val];
2059 else
2061 $char_props[$i] = 0xbf;
2063 @dec = compose_hangul( @dec ) if $compose;
2064 @dec = to_utf16( @dec );
2065 push @dec, 0 if @dec >= 7;
2066 $decomposed[$i] = \@dec;
2068 else
2070 if ($combining_class_table[$i] == 0x100)
2072 $char_props[$i] = 0x7f;
2074 elsif ($combining_class_table[$i])
2076 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2078 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2080 $char_props[$i] = 0xff;
2082 else
2084 $char_props[$i] = 0;
2089 if ($compose)
2091 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2093 my @comp = get_composition( $i, $compat );
2094 next unless @comp;
2095 if ($combining_class_table[$comp[1]])
2097 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2098 $char_props[$comp[1]] |= 0x40;
2100 else
2102 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2103 $char_props[$comp[1]] |= 0xc0;
2108 # surrogates
2109 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2110 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2112 # Hangul
2113 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2114 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2115 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2117 # invalid chars
2118 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2119 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2120 foreach my $i (0x00..0x10)
2122 $char_props[($i << 16) | 0xfffe] = 0xff;
2123 $char_props[($i << 16) | 0xffff] = 0xff;
2126 # decomposition hash table
2128 my @decomp_hash_table;
2129 my @decomp_hash_index;
2130 my @decomp_hash_data;
2131 my $decomp_hash_size = 944;
2133 # build string of character data, reusing substrings when possible
2134 my $decomp_char_data = "";
2135 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2137 my $str = pack "U*", @{$i};
2138 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2140 for (my $i = 0; $i < @decomposed; $i++)
2142 next unless defined $decomposed[$i];
2143 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2144 die "sequence not found" if $pos == -1;
2145 my $len = @{$decomposed[$i]};
2146 $len = 7 if $len > 7;
2147 my $hash = $i % $decomp_hash_size;
2148 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2150 for (my $i = 0; $i < $decomp_hash_size; $i++)
2152 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2153 next unless defined $decomp_hash_table[$i];
2154 if (@{$decomp_hash_table[$i]} == 1)
2156 my $entry = $decomp_hash_table[$i]->[0];
2157 if ($char_props[$entry->[0]] == 0xbf)
2159 $decomp_hash_index[$i] = $entry->[1];
2160 next;
2163 foreach my $entry (@{$decomp_hash_table[$i]})
2165 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2168 push @decomp_hash_data, 0, 0;
2170 # composition hash table
2172 my @comp_hash_index;
2173 my @comp_hash_data;
2174 if (@comp_hash_table)
2176 for (my $i = 0; $i < $comp_hash_size; $i++)
2178 $comp_hash_index[$i] = @comp_hash_data;
2179 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2181 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2182 push @comp_hash_data, 0, 0, 0;
2185 my $level1 = ($MAX_CHAR + 1) / 128;
2186 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2188 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2189 0, $decomp_hash_size, $comp_hash_size, 0 );
2190 my @tables = (0) x 8;
2192 $tables[0] = 16 + @header + @tables;
2193 $tables[1] = $tables[0] + @class_values / 2;
2194 $tables[2] = $tables[1] + $level1 / 2;
2195 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2196 $tables[4] = $tables[3] + @decomp_hash_index;
2197 $tables[5] = $tables[4] + @decomp_hash_data;
2198 $tables[6] = $tables[5] + length $decomp_char_data;
2199 $tables[7] = $tables[6] + @comp_hash_index;
2201 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2202 print OUTPUT pack "S<*", @header;
2203 print OUTPUT pack "S<*", @tables;
2204 print OUTPUT pack "C*", @class_values;
2206 print OUTPUT pack "C*", @rows[0..$level1-1];
2207 print OUTPUT pack "C*", @rows[$level1..$#rows];
2208 print OUTPUT pack "S<*", @decomp_hash_index;
2209 print OUTPUT pack "S<*", @decomp_hash_data;
2210 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2211 print OUTPUT pack "S<*", @comp_hash_index;
2212 print OUTPUT pack "S<*", @comp_hash_data;
2214 close OUTPUT;
2215 save_file($filename);
2217 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2221 ################################################################
2222 # output a codepage definition file from the global tables
2223 sub output_codepage_file($)
2225 my $codepage = shift;
2227 my $output = sprintf "nls/c_%03d.nls", $codepage;
2228 open OUTPUT,">$output.new" or die "Cannot create $output";
2230 printf "Building %s\n", $output;
2231 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2232 else { dump_binary_dbcs_table( $codepage ); }
2234 close OUTPUT;
2235 save_file($output);
2237 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2240 ################################################################
2241 # output a codepage table from a Microsoft-style mapping file
2242 sub dump_msdata_codepage($)
2244 my $filename = shift;
2246 my $state = "";
2247 my ($codepage, $width, $count);
2248 my ($lb_cur, $lb_end);
2250 @cp2uni = ();
2251 @glyph2uni = ();
2252 @lead_bytes = ();
2253 @uni2cp = ();
2254 $default_char = $DEF_CHAR;
2255 $default_wchar = $DEF_CHAR;
2257 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2259 while (<$INPUT>)
2261 next if /^;/; # skip comments
2262 next if /^\s*$/; # skip empty lines
2263 next if /\x1a/; # skip ^Z
2264 last if /^ENDCODEPAGE/;
2266 if (/^CODEPAGE\s+(\d+)/)
2268 $codepage = $1;
2269 next;
2271 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2273 $width = $1;
2274 $default_char = hex $2;
2275 $default_wchar = hex $3;
2276 next;
2278 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2280 $state = $1;
2281 $count = $2;
2282 next;
2284 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2286 if ($state eq "MBTABLE")
2288 my $cp = hex $1;
2289 my $uni = hex $2;
2290 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2291 next;
2293 if ($state eq "GLYPHTABLE")
2295 my $cp = hex $1;
2296 my $uni = hex $2;
2297 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2298 next;
2300 if ($state eq "WCTABLE")
2302 my $uni = hex $1;
2303 my $cp = hex $2;
2304 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2305 next;
2307 if ($state eq "DBCSRANGE")
2309 my $start = hex $1;
2310 my $end = hex $2;
2311 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2312 $lb_cur = $start;
2313 $lb_end = $end;
2314 next;
2316 if ($state eq "DBCSTABLE")
2318 my $mb = hex $1;
2319 my $uni = hex $2;
2320 my $cp = ($lb_cur << 8) | $mb;
2321 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2322 if (!--$count)
2324 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2326 next;
2329 die "$filename: Unrecognized line $_\n";
2331 close $INPUT;
2333 output_codepage_file( $codepage );
2336 ################################################################
2337 # align a string length
2338 sub align_string($$)
2340 my ($align, $str) = @_;
2341 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2342 return $str;
2345 ################################################################
2346 # pack a GUID string
2347 sub pack_guid($)
2349 $_ = shift;
2350 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2351 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2354 ################################################################
2355 # comparison function for compression sort
2356 sub cmp_compression
2358 return scalar @{$a} <=> scalar @{$b} ||
2359 $a->[4] <=> $b->[4] ||
2360 $a->[5] <=> $b->[5] ||
2361 $a->[6] <=> $b->[6] ||
2362 $a->[7] <=> $b->[7] ||
2363 $a->[8] <=> $b->[8] ||
2364 $a->[9] <=> $b->[9] ||
2365 $a->[10] <=> $b->[10] ||
2366 $a->[11] <=> $b->[11] ||
2367 $a->[12] <=> $b->[12];
2370 ################################################################
2371 # build a binary sort keys table
2372 sub dump_sortkey_table($$)
2374 my ($filename, $download) = @_;
2376 my @keys;
2377 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2378 my @multiple_weights;
2379 my @expansions;
2380 my @compressions;
2381 my %exceptions;
2382 my %guids;
2383 my %compr_flags;
2384 my %locales;
2385 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2386 my $jamostr = "";
2388 my $re_hex = '0x[0-9A-Fa-f]+';
2389 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2390 $guids{$default_guid} = { };
2392 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2394 my $KEYS = open_data_file( $MSDATA, $download );
2396 printf "Building $filename\n";
2398 while (<$KEYS>)
2400 s/\s*;.*$//;
2401 next if /^\s*$/; # skip empty lines
2402 if (/^\s*(SORTKEY|SORTTABLES)/)
2404 $part = $1;
2405 next;
2407 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2409 $part = $section = "";
2410 next;
2412 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2414 $section = $1;
2415 $guid = undef;
2416 next;
2418 next unless $part;
2419 if ("$part.$section" eq "SORTKEY.DEFAULT")
2421 if (/^\s*($re_hex)\s+$re_key/)
2423 $keys[hex $1] = [ split(/\s+/,$2) ];
2424 next;
2427 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2429 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2431 $version = hex $1;
2432 next;
2434 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2436 # ignore for now
2437 next;
2440 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2441 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2442 "$part.$section" eq "SORTTABLES.INVERSECASING")
2444 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2446 $guid = lc $1;
2447 $guids{$guid} = { } unless defined $guids{$guid};
2448 $guids{$guid}->{flags} |= $flags{$section};
2449 next;
2451 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2453 $locales{$1} = $guid;
2454 next;
2457 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2459 if (/^\s*(\d+)\s+(\d+)/)
2461 push @multiple_weights, $1, $2;
2462 next;
2465 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2467 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2469 my $pos = scalar @expansions / 2;
2470 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2471 push @expansions, hex $2, hex $3;
2472 next;
2475 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2477 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2479 $keys[hex $1] = $keys[hex $2];
2480 next;
2483 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2485 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2487 if ($subsection || !$guid) # start a new one
2489 $guid = lc $1;
2490 $subsection = "";
2491 $guids{$guid} = { } unless defined $guids{$guid};
2492 $guids{$guid}->{flags} |= $flags{$2} if $2;
2493 $guids{$guid}->{compr} = @compressions;
2494 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2495 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2496 push @compressions, [ ];
2498 else # merge with current one
2500 $guids{lc $1} = { } unless defined $guids{lc $1};
2501 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2502 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2503 $compr_flags{lc $1} = $compr_flags{$guid};
2505 next;
2507 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2509 $locales{$1} = $guid;
2510 next;
2512 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2514 $subsection = $1;
2515 next;
2517 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2519 my @comp = map { hex $_; } split(/\s+/,$1);
2520 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2521 # add compression flags
2522 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2523 next;
2526 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2528 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2530 $guid = lc $1;
2531 $guids{$guid} = { } unless defined $guids{lc $1};
2532 $ling_flag = ($2 ? "+" : "-");
2533 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2534 next;
2536 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2538 $locales{$1} = $guid;
2539 next;
2541 if (/^\s*($re_hex)\s+$re_key/)
2543 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2544 next;
2547 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2549 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2551 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2552 next;
2555 die "$download: $part.$section: unrecognized line $_\n";
2557 close $KEYS;
2559 # Sortkey table
2561 my $table;
2562 for (my $i = 0; $i < 0x10000; $i++)
2564 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2565 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2568 foreach my $id (sort keys %exceptions)
2570 my $pos = length($table) / 4;
2571 my @exc = @{$exceptions{$id}};
2572 my @filled;
2573 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2574 my $guid = substr( $id, 0, -1 );
2575 $guids{$guid}->{$key} = $pos;
2576 $pos += 0x100;
2577 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2578 for (my $j = 0; $j < 0x10000; $j++)
2580 next unless defined $exc[$j] || defined $flags[$j];
2581 $filled[$j >> 8] = 1;
2582 $j |= 0xff;
2584 for (my $j = 0; $j < 0x100; $j++)
2586 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2587 $pos += 0x100 if $filled[$j];
2589 for (my $j = 0; $j < 0x10000; $j++)
2591 next unless $filled[$j >> 8];
2592 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2593 $k[3] |= $flags[$j] || 0;
2594 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2598 # Case mapping tables
2600 # standard table
2601 my @casemaps;
2602 my @upper = @toupper_table;
2603 my @lower = @tolower_table;
2604 remove_linguistic_mappings( \@upper, \@lower );
2605 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2607 # linguistic table
2608 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2610 # Turkish table
2611 @upper = @toupper_table;
2612 @lower = @tolower_table;
2613 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2614 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2615 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2616 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2618 # Char type table
2620 my @table;
2621 my $types = "";
2622 my %typestr;
2623 for (my $i = 0; $i < 0x10000; $i++)
2625 my $str = pack "S<3",
2626 ($category_table[$i] || 0) & 0xffff,
2627 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2628 ($category_table[$i] || 0) >> 16;
2630 if (!defined($typestr{$str}))
2632 $typestr{$str} = length($types) / 6;
2633 $types .= $str;
2635 $table[$i] = $typestr{$str};
2638 my @rows = compress_array( 4096, 0, @table[0..65535] );
2639 my @array = compress_array( 256, 0, @rows[0..4095] );
2640 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2641 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2643 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2644 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2645 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2647 # Sort tables
2649 # guids
2650 my $sorttables = pack "L<2", $version, scalar %guids;
2651 foreach my $id (sort keys %guids)
2653 my %guid = %{$guids{$id}};
2654 my $flags = $guid{flags} || 0;
2655 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2656 $sorttables .= pack_guid($id) . pack "L<5",
2657 $flags,
2658 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2659 $guid{except} || 0,
2660 $guid{ling_except} || 0,
2661 $map / 2;
2664 # expansions
2665 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2667 # compressions
2668 $sorttables .= pack "L<", scalar @compressions;
2669 my $rowstr = "";
2670 foreach my $c (@compressions)
2672 my $pos = length($rowstr) / 2;
2673 my $min = 0xffff;
2674 my $max = 0;
2675 my @lengths = (0) x 8;
2676 foreach my $r (sort cmp_compression @{$c})
2678 my @row = @{$r};
2679 $lengths[scalar @row - 6]++;
2680 foreach my $val (@row[4..$#row])
2682 $min = $val if $min > $val;
2683 $max = $val if $max < $val;
2685 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2686 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2688 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2690 $sorttables .= $rowstr;
2692 # multiple weights
2693 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2695 # jamo sort
2696 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2698 # Locales
2700 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2701 foreach my $loc (sort keys %locales)
2703 # skip specific locales that match more general ones
2704 my @parts = split /[-_]/, $loc;
2705 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2706 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2707 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2710 # File header
2712 my @header;
2713 $header[0] = 16;
2714 $header[1] = $header[0] + length $table;
2715 $header[2] = $header[1] + length $casemaps;
2716 $header[3] = $header[2] + length $chartypes;
2718 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2719 print OUTPUT pack "L<*", @header;
2720 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2721 close OUTPUT;
2722 save_file($filename);
2726 ################################################################
2727 # build the script to create registry keys
2728 sub dump_registry_script($%)
2730 my ($filename, %keys) = @_;
2731 my $indent = 1;
2733 printf "Building %s\n", $filename;
2734 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2735 print OUTPUT "HKLM\n{\n";
2736 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2738 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2739 $indent++;
2741 foreach my $k (sort keys %keys)
2743 my @subkeys = split /\\/, $k;
2744 my ($def, @vals) = @{$keys{$k}};
2745 for (my $i = 0; $i < @subkeys; $i++)
2747 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2748 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2749 $indent++;
2751 foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2752 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2754 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2755 close OUTPUT;
2756 save_file($filename);
2760 ################################################################
2761 # save a file if modified
2762 sub save_file($)
2764 my $file = shift;
2765 if (-f $file && !system "cmp $file $file.new >/dev/null")
2767 unlink "$file.new";
2769 else
2771 rename "$file.new", "$file";
2776 ################################################################
2777 # main routine
2779 chdir ".." if -f "./make_unicode";
2780 load_data();
2781 dump_case_mappings( "libs/port/casemap.c" );
2782 dump_sortkeys( "dlls/kernelbase/collation.c" );
2783 dump_ctype_tables( "libs/port/wctype.c" );
2784 dump_bidi_dir_table( "dlls/gdi32/direction.c" );
2785 dump_bidi_dir_table( "dlls/usp10/direction.c" );
2786 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2787 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2788 dump_mirroring( "dlls/usp10/mirror.c" );
2789 dump_mirroring( "dlls/dwrite/mirror.c" );
2790 dump_bracket( "dlls/usp10/bracket.c" );
2791 dump_bracket( "dlls/dwrite/bracket.c" );
2792 dump_shaping( "dlls/usp10/shaping.c" );
2793 dump_linebreak( "dlls/usp10/linebreak.c" );
2794 dump_linebreak( "dlls/dwrite/linebreak.c" );
2795 dump_scripts( "dlls/dwrite/scripts" );
2796 dump_indic( "dlls/usp10/indicsyllable.c" );
2797 dump_vertical( "dlls/gdi32/vertical.c" );
2798 dump_vertical( "dlls/wineps.drv/vertical.c" );
2799 dump_intl_nls("nls/l_intl.nls");
2800 dump_norm_table( "nls/normnfc.nls" );
2801 dump_norm_table( "nls/normnfd.nls" );
2802 dump_norm_table( "nls/normnfkc.nls" );
2803 dump_norm_table( "nls/normnfkd.nls" );
2804 dump_norm_table( "nls/normidna.nls" );
2805 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2806 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2807 dump_eucjp_codepage();
2808 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2810 exit 0;
2812 # Local Variables:
2813 # compile-command: "./make_unicode"
2814 # End: