evr: Fix typo in video_mixer_init_dxva_videodesc() (Coverity).
[wine.git] / tools / make_unicode
blobe18ec0a206fa64cf69cc3def534a3bf8b4bc8118
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "13.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $REPORTS = "http://www.unicode.org/reports";
30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
33 # Sort keys file
34 my $SORTKEYS = "tr10/allkeys.txt";
36 # Default char for undefined mappings
37 my $DEF_CHAR = ord '?';
39 # Last valid Unicode character
40 my $MAX_CHAR = 0x10ffff;
42 my @allfiles =
44 "CodpageFiles/037.txt",
45 "CodpageFiles/437.txt",
46 "CodpageFiles/500.txt",
47 "CodpageFiles/737.txt",
48 "CodpageFiles/775.txt",
49 "CodpageFiles/850.txt",
50 "CodpageFiles/852.txt",
51 "CodpageFiles/855.txt",
52 "CodpageFiles/857.txt",
53 "CodpageFiles/860.txt",
54 "CodpageFiles/861.txt",
55 "CodpageFiles/862.txt",
56 "CodpageFiles/863.txt",
57 "CodpageFiles/864.txt",
58 "CodpageFiles/865.txt",
59 "CodpageFiles/866.txt",
60 "CodpageFiles/869.txt",
61 "CodpageFiles/874.txt",
62 "CodpageFiles/875.txt",
63 "CodpageFiles/932.txt",
64 "CodpageFiles/936.txt",
65 "CodpageFiles/949.txt",
66 "CodpageFiles/950.txt",
67 "CodpageFiles/1026.txt",
68 "CodpageFiles/1250.txt",
69 "CodpageFiles/1251.txt",
70 "CodpageFiles/1252.txt",
71 "CodpageFiles/1253.txt",
72 "CodpageFiles/1254.txt",
73 "CodpageFiles/1255.txt",
74 "CodpageFiles/1256.txt",
75 "CodpageFiles/1257.txt",
76 "CodpageFiles/1258.txt",
77 "CodpageFiles/1361.txt",
78 "CodpageFiles/10000.txt",
79 "CodpageFiles/10001.txt",
80 "CodpageFiles/10002.txt",
81 "CodpageFiles/10003.txt",
82 "CodpageFiles/10004.txt",
83 "CodpageFiles/10005.txt",
84 "CodpageFiles/10006.txt",
85 "CodpageFiles/10007.txt",
86 "CodpageFiles/10008.txt",
87 "CodpageFiles/10010.txt",
88 "CodpageFiles/10017.txt",
89 "CodpageFiles/10021.txt",
90 "CodpageFiles/10029.txt",
91 "CodpageFiles/10079.txt",
92 "CodpageFiles/10081.txt",
93 "CodpageFiles/10082.txt",
94 "CodpageFiles/20127.txt",
95 "CodpageFiles/20866.txt",
96 "CodpageFiles/21866.txt",
97 "CodpageFiles/28591.txt",
98 "CodpageFiles/28592.txt",
99 "CodpageFiles/28593.txt",
100 "CodpageFiles/28594.txt",
101 "CodpageFiles/28595.txt",
102 "CodpageFiles/28596.txt",
103 "CodpageFiles/28597.txt",
104 "CodpageFiles/28598.txt",
105 "CodpageFiles/28599.txt",
106 "CodpageFiles/28603.txt",
107 "CodpageFiles/28605.txt",
111 my %ctype =
113 # CT_CTYPE1
114 "upper" => 0x0001,
115 "lower" => 0x0002,
116 "digit" => 0x0004,
117 "space" => 0x0008,
118 "punct" => 0x0010,
119 "cntrl" => 0x0020,
120 "blank" => 0x0040,
121 "xdigit" => 0x0080,
122 "alpha" => 0x0100 | 0x80000000,
123 "defin" => 0x0200,
124 # CT_CTYPE3 in high 16 bits
125 "nonspacing" => 0x00010000,
126 "diacritic" => 0x00020000,
127 "vowelmark" => 0x00040000,
128 "symbol" => 0x00080000,
129 "katakana" => 0x00100000,
130 "hiragana" => 0x00200000,
131 "halfwidth" => 0x00400000,
132 "fullwidth" => 0x00800000,
133 "ideograph" => 0x01000000,
134 "kashida" => 0x02000000,
135 "lexical" => 0x04000000,
136 "highsurrogate" => 0x08000000,
137 "lowsurrogate" => 0x10000000,
140 my %bracket_types =
142 "o" => 0x0000,
143 "c" => 0x0001,
146 my %indic_types =
148 "Other" => 0x0000,
149 "Bindu" => 0x0001,
150 "Visarga" => 0x0002,
151 "Avagraha" => 0x0003,
152 "Nukta" => 0x0004,
153 "Virama" => 0x0005,
154 "Vowel_Independent" => 0x0006,
155 "Vowel_Dependent" => 0x0007,
156 "Vowel" => 0x0008,
157 "Consonant_Placeholder" => 0x0009,
158 "Consonant" => 0x000a,
159 "Consonant_Dead" => 0x000b,
160 "Consonant_Succeeding_Repha" => 0x000c,
161 "Consonant_Subjoined" => 0x000d,
162 "Consonant_Medial" => 0x000e,
163 "Consonant_Final" => 0x000f,
164 "Consonant_Head_Letter" => 0x0010,
165 "Modifying_Letter" => 0x0011,
166 "Tone_Letter" => 0x0012,
167 "Tone_Mark" => 0x0013,
168 "Register_Shifter" => 0x0014,
169 "Consonant_Preceding_Repha" => 0x0015,
170 "Pure_Killer" => 0x0016,
171 "Invisible_Stacker" => 0x0017,
172 "Gemination_Mark" => 0x0018,
173 "Cantillation_Mark" => 0x0019,
174 "Non_Joiner" => 0x001a,
175 "Joiner" => 0x001b,
176 "Number_Joiner" => 0x001c,
177 "Number" => 0x001d,
178 "Brahmi_Joining_Number" => 0x001e,
179 "Consonant_With_Stacker" => 0x001f,
180 "Consonant_Prefixed" => 0x0020,
181 "Syllable_Modifier" => 0x0021,
182 "Consonant_Killer" => 0x0022,
183 "Consonant_Initial_Postfixed" => 0x0023,
186 my %matra_types =
188 "Right" => 0x01,
189 "Left" => 0x02,
190 "Visual_Order_Left" => 0x03,
191 "Left_And_Right" => 0x04,
192 "Top" => 0x05,
193 "Bottom" => 0x06,
194 "Top_And_Bottom" => 0x07,
195 "Top_And_Right" => 0x08,
196 "Top_And_Left" => 0x09,
197 "Top_And_Left_And_Right" => 0x0a,
198 "Bottom_And_Right" => 0x0b,
199 "Top_And_Bottom_And_Right" => 0x0c,
200 "Overstruck" => 0x0d,
201 "Invisible" => 0x0e,
202 "Bottom_And_Left" => 0x0f,
203 "Top_And_Bottom_And_Left" => 0x10,
206 my %break_types =
208 "BK" => 0x0001,
209 "CR" => 0x0002,
210 "LF" => 0x0003,
211 "CM" => 0x0004,
212 "SG" => 0x0005,
213 "GL" => 0x0006,
214 "CB" => 0x0007,
215 "SP" => 0x0008,
216 "ZW" => 0x0009,
217 "NL" => 0x000a,
218 "WJ" => 0x000b,
219 "JL" => 0x000c,
220 "JV" => 0x000d,
221 "JT" => 0x000e,
222 "H2" => 0x000f,
223 "H3" => 0x0010,
224 "XX" => 0x0011,
225 "OP" => 0x0012,
226 "CL" => 0x0013,
227 "CP" => 0x0014,
228 "QU" => 0x0015,
229 "NS" => 0x0016,
230 "EX" => 0x0017,
231 "SY" => 0x0018,
232 "IS" => 0x0019,
233 "PR" => 0x001a,
234 "PO" => 0x001b,
235 "NU" => 0x001c,
236 "AL" => 0x001d,
237 "ID" => 0x001e,
238 "IN" => 0x001f,
239 "HY" => 0x0020,
240 "BB" => 0x0021,
241 "BA" => 0x0022,
242 "SA" => 0x0023,
243 "AI" => 0x0024,
244 "B2" => 0x0025,
245 "HL" => 0x0026,
246 "CJ" => 0x0027,
247 "RI" => 0x0028,
248 "EB" => 0x0029,
249 "EM" => 0x002a,
250 "ZWJ" => 0x002b,
253 my %vertical_types =
255 "R" => 0x0000,
256 "U" => 0x0001,
257 "Tr" => 0x0002,
258 "Tu" => 0x0003,
261 my %categories =
263 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
264 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
265 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
266 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
267 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
268 "Me" => $ctype{"defin"}, # Mark, Enclosing
269 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
270 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
271 "No" => $ctype{"defin"}, # Number, Other
272 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
273 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
274 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
275 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
276 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
277 "Cs" => $ctype{"defin"}, # Other, Surrogate
278 "Co" => $ctype{"defin"}, # Other, Private Use
279 "Cn" => $ctype{"defin"}, # Other, Not Assigned
280 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
281 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
282 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
283 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
284 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
285 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
286 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
287 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
288 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
289 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
290 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
291 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
292 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
295 # a few characters need additional categories that cannot be determined automatically
296 my %special_categories =
298 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
299 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
300 "space" => [ 0x09..0x0d, 0x85 ],
301 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
302 "cntrl" => [ 0x070f, 0x200c, 0x200d,
303 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
304 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
305 0xfff9, 0xfffa, 0xfffb ],
306 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
307 0xd7, 0xf7 ],
308 "digit" => [ 0xb2, 0xb3, 0xb9 ],
309 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
310 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
311 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
312 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
313 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
314 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
315 0x02b9..0x02ba, 0x02c6..0x02cf ],
316 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
317 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
318 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
319 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
320 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
321 0x3131..0x3164 ],
322 "ideograph" => [ 0x3006..0x3007 ],
323 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
324 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
325 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
326 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
327 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
328 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
329 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
330 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
331 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
332 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
333 "kashida" => [ 0x0640 ],
336 my %directions =
338 "L" => 1, # Left-to-Right
339 "R" => 2, # Right-to-Left
340 "AL" => 12, # Right-to-Left Arabic
341 "EN" => 3, # European Number
342 "ES" => 4, # European Number Separator
343 "ET" => 5, # European Number Terminator
344 "AN" => 6, # Arabic Number
345 "CS" => 7, # Common Number Separator
346 "NSM" => 13, # Non-Spacing Mark
347 "BN" => 14, # Boundary Neutral
348 "B" => 8, # Paragraph Separator
349 "S" => 9, # Segment Separator
350 "WS" => 10, # Whitespace
351 "ON" => 11, # Other Neutrals
352 "LRE" => 15, # Left-to-Right Embedding
353 "LRO" => 15, # Left-to-Right Override
354 "RLE" => 15, # Right-to-Left Embedding
355 "RLO" => 15, # Right-to-Left Override
356 "PDF" => 15, # Pop Directional Format
357 "LRI" => 15, # Left-to-Right Isolate
358 "RLI" => 15, # Right-to-Left Isolate
359 "FSI" => 15, # First Strong Isolate
360 "PDI" => 15 # Pop Directional Isolate
363 my %c2_types =
365 "L" => 1, # C2_LEFTTORIGHT
366 "R" => 2, # C2_RIGHTTOLEFT
367 "AL" => 2, # C2_RIGHTTOLEFT
368 "EN" => 3, # C2_EUROPENUMBER
369 "ES" => 4, # C2_EUROPESEPARATOR
370 "ET" => 5, # C2_EUROPETERMINATOR
371 "AN" => 6, # C2_ARABICNUMBER
372 "CS" => 7, # C2_COMMONSEPARATOR
373 "NSM" => 11, # C2_OTHERNEUTRAL
374 "BN" => 0, # C2_NOTAPPLICABLE
375 "B" => 8, # C2_BLOCKSEPARATOR
376 "S" => 9, # C2_SEGMENTSEPARATOR
377 "WS" => 10, # C2_WHITESPACE
378 "ON" => 11, # C2_OTHERNEUTRAL
379 "LRE" => 11, # C2_OTHERNEUTRAL
380 "LRO" => 11, # C2_OTHERNEUTRAL
381 "RLE" => 11, # C2_OTHERNEUTRAL
382 "RLO" => 11, # C2_OTHERNEUTRAL
383 "PDF" => 11, # C2_OTHERNEUTRAL
384 "LRI" => 11, # C2_OTHERNEUTRAL
385 "RLI" => 11, # C2_OTHERNEUTRAL
386 "FSI" => 11, # C2_OTHERNEUTRAL
387 "PDI" => 11 # C2_OTHERNEUTRAL
390 my %bidi_types =
392 "ON" => 0, # Other Neutrals
393 "L" => 1, # Left-to-Right
394 "R" => 2, # Right-to-Left
395 "AN" => 3, # Arabic Number
396 "EN" => 4, # European Number
397 "AL" => 5, # Right-to-Left Arabic
398 "NSM" => 6, # Non-Spacing Mark
399 "CS" => 7, # Common Number Separator
400 "ES" => 8, # European Number Separator
401 "ET" => 9, # European Number Terminator
402 "BN" => 10, # Boundary Neutral
403 "S" => 11, # Segment Separator
404 "WS" => 12, # Whitespace
405 "B" => 13, # Paragraph Separator
406 "RLO" => 14, # Right-to-Left Override
407 "RLE" => 15, # Right-to-Left Embedding
408 "LRO" => 16, # Left-to-Right Override
409 "LRE" => 17, # Left-to-Right Embedding
410 "PDF" => 18, # Pop Directional Format
411 "LRI" => 19, # Left-to-Right Isolate
412 "RLI" => 20, # Right-to-Left Isolate
413 "FSI" => 21, # First Strong Isolate
414 "PDI" => 22 # Pop Directional Isolate
417 my %joining_types =
419 "U" => 0, # Non_Joining
420 "T" => 1, # Transparent
421 "R" => 2, # Right_Joining
422 "L" => 3, # Left_Joining
423 "D" => 4, # Dual_Joining
424 "C" => 5, # Join_Causing
427 my @cp2uni = ();
428 my @glyph2uni = ();
429 my @lead_bytes = ();
430 my @uni2cp = ();
431 my @tolower_table = ();
432 my @toupper_table = ();
433 my @digitmap_table = ();
434 my @category_table = ();
435 my @joining_table = ();
436 my @direction_table = ();
437 my @decomp_table = ();
438 my @combining_class_table = ();
439 my @decomp_compat_table = ();
440 my @comp_exclusions = ();
441 my @idna_decomp_table = ();
442 my @idna_disallowed = ();
443 my %registry_keys;
444 my $default_char;
445 my $default_wchar;
447 my %joining_forms =
449 "isolated" => [],
450 "final" => [],
451 "initial" => [],
452 "medial" => []
455 sub to_utf16(@)
457 my @ret;
458 foreach my $ch (@_)
460 if ($ch < 0x10000)
462 push @ret, $ch;
464 else
466 my $val = $ch - 0x10000;
467 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
470 return @ret;
473 ################################################################
474 # fetch a unicode.org file and open it
475 sub open_data_file($$)
477 my ($base, $name) = @_;
478 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
479 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
480 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
481 local *FILE;
483 if ($base =~ /.*\/([^\/]+)\.zip$/)
485 my $zip = "$1$suffix.zip";
486 unless (-f "$cache/$zip")
488 system "mkdir", "-p", $cache;
489 print "Fetching $base...\n";
490 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
492 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
494 else
496 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
497 unless (-f $dest)
499 system "mkdir", "-p", $dir;
500 print "Fetching $base/$name...\n";
501 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
503 open FILE, "<$dest" or die "cannot open $dest";
505 return *FILE;
508 ################################################################
509 # recursively get the decomposition for a character
510 sub get_decomposition($$);
511 sub get_decomposition($$)
513 my ($char, $table) = @_;
514 my @ret;
516 return $char unless defined $table->[$char];
517 foreach my $ch (@{$table->[$char]})
519 push @ret, get_decomposition( $ch, $table );
521 return @ret;
524 ################################################################
525 # get the composition that results in a given character
526 sub get_composition($$)
528 my ($ch, $compat) = @_;
529 return () unless defined $decomp_table[$ch]; # no decomposition
530 my @ret = @{$decomp_table[$ch]};
531 return () if @ret < 2; # singleton decomposition
532 return () if $comp_exclusions[$ch]; # composition exclusion
533 return () if $combining_class_table[$ch]; # non-starter
534 return () if $combining_class_table[$ret[0]]; # first char is non-starter
535 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
536 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
537 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
538 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
539 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
540 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
541 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
542 return @ret;
545 ################################################################
546 # recursively build decompositions
547 sub build_decompositions(@)
549 my @src = @_;
550 my @dst;
552 for (my $i = 0; $i < @src; $i++)
554 next unless defined $src[$i];
555 my @decomp = to_utf16( get_decomposition( $i, \@src ));
556 $dst[$i] = \@decomp;
558 return @dst;
561 ################################################################
562 # compose Hangul sequences
563 sub compose_hangul(@)
565 my $SBASE = 0xac00;
566 my $LBASE = 0x1100;
567 my $VBASE = 0x1161;
568 my $TBASE = 0x11a7;
569 my $LCOUNT = 19;
570 my $VCOUNT = 21;
571 my $TCOUNT = 28;
572 my $NCOUNT = $VCOUNT * $TCOUNT;
573 my $SCOUNT = $LCOUNT * $NCOUNT;
575 my @seq = @_;
576 my @ret;
577 my $i;
579 for ($i = 0; $i < @seq; $i++)
581 my $ch = $seq[$i];
582 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
583 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
585 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
586 $i++;
588 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
589 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
591 $ch += $seq[$i+1] - $TBASE;
592 $i++;
594 push @ret, $ch;
596 return @ret;
599 ################################################################
600 # remove linguistic-only mappings from the case table
601 sub remove_linguistic_mappings($$)
603 my ($upper, $lower) = @_;
605 # remove case mappings that don't round-trip
607 for (my $i = 0; $i < @{$upper}; $i++)
609 next unless defined ${$upper}[$i];
610 my $ch = ${$upper}[$i];
611 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
613 for (my $i = 0; $i < @{$lower}; $i++)
615 next unless defined ${$lower}[$i];
616 my $ch = ${$lower}[$i];
617 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
621 ################################################################
622 # read in the Unicode database files
623 sub load_data()
625 my $start;
627 # now build mappings from the decomposition field of the Unicode database
629 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
630 while (<$UNICODE_DATA>)
632 # Decode the fields ...
633 my ($code, $name, $cat, $comb, $bidi,
634 $decomp, $dec, $dig, $num, $mirror,
635 $oldname, $comment, $upper, $lower, $title) = split /;/;
636 my $src = hex $code;
638 die "unknown category $cat" unless defined $categories{$cat};
639 die "unknown directionality $bidi" unless defined $directions{$bidi};
641 $category_table[$src] = $categories{$cat};
642 $direction_table[$src] = $bidi;
643 $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
645 if ($lower ne "")
647 $tolower_table[$src] = hex $lower;
649 if ($upper ne "")
651 $toupper_table[$src] = hex $upper;
653 if ($dec ne "")
655 $category_table[$src] |= $ctype{"digit"};
657 if ($dig ne "")
659 $digitmap_table[$src] = ord $dig;
661 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
663 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
664 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
665 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
666 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
667 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
668 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
669 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
670 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
671 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
672 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
673 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
674 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
676 # copy the category and direction for everything between First/Last pairs
677 if ($name =~ /, First>/) { $start = $src; }
678 if ($name =~ /, Last>/)
680 while ($start < $src)
682 $category_table[$start] = $category_table[$src];
683 $direction_table[$start] = $direction_table[$src];
684 $combining_class_table[$start] = $combining_class_table[$src];
685 $start++;
689 next if $decomp eq ""; # no decomposition, skip it
691 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
693 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
694 $decomp_compat_table[$src] = \@seq;
697 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
699 # decomposition of the form "<foo> 1234" -> use char if type is known
700 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
702 ${joining_forms{$1}}[hex $2] = $src;
705 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
707 # decomposition "<compat> 0020 1234" -> combining accent
709 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
711 # store decomposition
712 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
714 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
716 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
718 # Single char decomposition
719 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
723 close $UNICODE_DATA;
725 # patch the category of some special characters
727 for (my $i = 0; $i < @decomp_table; $i++)
729 next unless defined $decomp_table[$i];
730 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
732 foreach my $cat (keys %special_categories)
734 my $flag = $ctype{$cat};
735 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
737 for (my $i = 0; $i < @decomp_compat_table; $i++)
739 next unless defined $decomp_compat_table[$i];
740 next unless @{$decomp_compat_table[$i]} == 2;
741 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
744 # load the composition exclusions
746 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
747 while (<$EXCL>)
749 s/\#.*//; # remove comments
750 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
752 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
754 elsif (/^([0-9a-fA-F]+)\s*$/)
756 $comp_exclusions[hex $1] = 1;
759 close $EXCL;
761 # load the IDNA mappings
763 @idna_decomp_table = @decomp_compat_table;
764 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
765 while (<$IDNA>)
767 s/\#.*//; # remove comments
768 next if /^\s*$/;
769 my ($char, $type, $mapping) = split /;/;
770 my ($ch1, $ch2);
771 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
773 $ch1 = hex $1;
774 $ch2 = hex $2;
776 elsif ($char =~ /([0-9a-fA-F]+)/)
778 $ch1 = $ch2 = hex $1;
781 if ($type =~ /mapped/ || $type =~ /deviation/)
783 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
784 my @seq = map { hex $_; } split /\s+/, $mapping;
785 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
787 elsif ($type =~ /valid/)
790 elsif ($type =~ /ignored/)
792 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
794 elsif ($type =~ /disallowed/)
796 foreach my $i ($ch1 .. $ch2)
798 $idna_decomp_table[$i] = undef;
799 $idna_disallowed[$i] = 1;
803 close $IDNA;
807 ################################################################
808 # add a new registry key
809 sub add_registry_key($$)
811 my ($key, $defval) = @_;
812 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
815 ################################################################
816 # add a new registry value
817 sub add_registry_value($$$)
819 my ($key, $name, $value) = @_;
820 add_registry_key( $key, undef );
821 push @{$registry_keys{$key}}, "'$name' = s '$value'";
824 ################################################################
825 # define a new lead byte
826 sub add_lead_byte($)
828 my $ch = shift;
829 return if defined $cp2uni[$ch];
830 push @lead_bytes, $ch;
831 $cp2uni[$ch] = 0;
834 ################################################################
835 # define a new char mapping
836 sub add_mapping($$)
838 my ($cp, $uni) = @_;
839 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
840 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
841 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
844 ################################################################
845 # get a mapping including glyph chars for MB_USEGLYPHCHARS
846 sub get_glyphs_mapping(@)
848 my @table = @_;
850 for (my $i = 0; $i < @glyph2uni; $i++)
852 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
854 return @table;
857 ################################################################
858 # build EUC-JP table from the JIS 0208/0212 files
859 sub dump_eucjp_codepage()
861 @cp2uni = ();
862 @glyph2uni = ();
863 @lead_bytes = ();
864 @uni2cp = ();
865 $default_char = $DEF_CHAR;
866 $default_wchar = 0x30fb;
868 # ASCII chars
869 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
871 # lead bytes
872 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
874 # JIS X 0201 right plane
875 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
877 # undefined chars
878 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
879 $cp2uni[0xa0] = 0xf8f0;
880 $cp2uni[0xff] = 0xf8f3;
882 # Fix backslash conversion
883 add_mapping( 0xa1c0, 0xff3c );
885 # Add private mappings for rows undefined in JIS 0208/0212
886 my $private = 0xe000;
887 foreach my $hi (0xf5 .. 0xfe)
889 foreach my $lo (0xa1 .. 0xfe)
891 add_mapping( ($hi << 8) + $lo, $private++ );
894 foreach my $hi (0xf5 .. 0xfe)
896 foreach my $lo (0x21 .. 0x7e)
898 add_mapping( ($hi << 8) + $lo, $private++ );
902 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
903 while (<$INPUT>)
905 next if /^\#/; # skip comments
906 next if /^$/; # skip empty lines
907 next if /\x1a/; # skip ^Z
908 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
910 add_mapping( 0x8080 + hex $1, hex $2 );
911 next;
913 die "Unrecognized line $_\n";
915 close $INPUT;
917 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
918 while (<$INPUT>)
920 next if /^\#/; # skip comments
921 next if /^$/; # skip empty lines
922 next if /\x1a/; # skip ^Z
923 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
925 add_mapping( 0x8000 + hex $1, hex $2 );
926 next;
928 die "Unrecognized line $_\n";
930 close $INPUT;
932 output_codepage_file( 20932 );
936 ################################################################
937 # build the sort keys table
938 sub dump_sortkeys($)
940 my $filename = shift;
941 my @sortkeys = ();
943 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
944 while (<$INPUT>)
946 next if /^\#/; # skip comments
947 next if /^$/; # skip empty lines
948 next if /\x1a/; # skip ^Z
949 next if /^\@version/; # skip @version header
950 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
952 my ($uni,$variable) = (hex $1, $2);
953 next if $uni > 65535;
954 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
955 next;
957 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
959 # multiple character sequence, ignored for now
960 next;
962 die "$SORTKEYS: Unrecognized line $_\n";
964 close $INPUT;
966 # compress the keys to 32 bit:
967 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
969 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
970 ${$a}[2] <=> ${$b}[2] or
971 ${$a}[3] <=> ${$b}[3] or
972 ${$a}[4] <=> ${$b}[4] or
973 $a cmp $b; } @sortkeys;
975 my ($n2, $n3) = (1, 1);
976 my @keys = (-1, -1, -1, -1, -1 );
977 my @flatkeys = ();
979 for (my $i = 0; $i < @sortkeys; $i++)
981 next unless defined $sortkeys[$i];
982 my @current = @{$sortkeys[$i]};
983 if ($current[1] == $keys[1])
985 if ($current[2] == $keys[2])
987 if ($current[3] == $keys[3])
989 # nothing
991 else
993 $keys[3] = $current[3];
994 $n3++;
995 die if ($n3 >= 16);
998 else
1000 $keys[2] = $current[2];
1001 $keys[3] = $current[3];
1002 $n2++;
1003 $n3 = 1;
1004 die if ($n2 >= 256);
1007 else
1009 $keys[1] = $current[1];
1010 $keys[2] = $current[2];
1011 $keys[3] = $current[3];
1012 $n2 = 1;
1013 $n3 = 1;
1016 if ($current[2]) { $current[2] = $n2; }
1017 if ($current[3]) { $current[3] = $n3; }
1018 if ($current[4]) { $current[4] = 1; }
1020 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1023 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1024 printf "Building $filename\n";
1025 printf OUTPUT "/* Unicode collation element table */\n";
1026 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1027 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1028 print OUTPUT "#include \"windef.h\"\n\n";
1030 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1032 close OUTPUT;
1033 save_file($filename);
1037 ################################################################
1038 # dump an array of integers
1039 sub dump_array($$@)
1041 my ($bit_width, $default, @array) = @_;
1042 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1043 my $i;
1044 my $ret = " ";
1045 for ($i = 0; $i < $#array; $i++)
1047 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1048 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1050 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1051 return $ret;
1055 ################################################################
1056 # dump an SBCS mapping table in binary format
1057 sub dump_binary_sbcs_table($)
1059 my $codepage = shift;
1061 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1062 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1064 print OUTPUT pack "S<*", @header;
1065 print OUTPUT pack "C12", (0) x 12;
1066 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1068 if (@glyph2uni)
1070 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1072 else
1074 print OUTPUT pack "S<*", 0;
1077 print OUTPUT pack "S<*", 0, 0;
1079 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1083 ################################################################
1084 # dump a DBCS mapping table in binary format
1085 sub dump_binary_dbcs_table($)
1087 my $codepage = shift;
1088 my @lb_ranges = get_lb_ranges();
1089 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1091 my @offsets = (0) x 256;
1092 my $pos = 0;
1093 foreach my $i (@lead_bytes)
1095 $offsets[$i] = ($pos += 256);
1096 $cp2uni[$i] = 0;
1099 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1101 print OUTPUT pack "S<*", @header;
1102 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1103 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1104 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1106 foreach my $i (@lead_bytes)
1108 my $base = $i << 8;
1109 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1112 print OUTPUT pack "S<", 4;
1113 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1117 ################################################################
1118 # get the list of defined lead byte ranges
1119 sub get_lb_ranges()
1121 my @list = ();
1122 my @ranges = ();
1124 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1125 my $on = 0;
1126 for (my $i = 0; $i < 256; $i++)
1128 if ($on)
1130 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1132 else
1134 if ($list[$i]) { push @ranges, $i; $on = 1; }
1137 if ($on) { push @ranges, 0xff; }
1138 return @ranges;
1141 ################################################################
1142 # dump the Indic Syllabic Category table
1143 sub dump_indic($)
1145 my $filename = shift;
1146 my @indic_table;
1148 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1149 while (<$INPUT>)
1151 next if /^\#/; # skip comments
1152 next if /^\s*$/; # skip empty lines
1153 next if /\x1a/; # skip ^Z
1154 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1156 my $type = $2;
1157 die "unknown indic $type" unless defined $indic_types{$type};
1158 if (hex $1 < 65536)
1160 $indic_table[hex $1] = $indic_types{$type};
1162 next;
1164 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1166 my $type = $3;
1167 die "unknown indic $type" unless defined $indic_types{$type};
1168 if (hex $1 < 65536 and hex $2 < 65536)
1170 foreach my $i (hex $1 .. hex $2)
1172 $indic_table[$i] = $indic_types{$type};
1175 next;
1177 die "malformed line $_";
1179 close $INPUT;
1181 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1182 while (<$INPUT>)
1184 next if /^\#/; # skip comments
1185 next if /^\s*$/; # skip empty lines
1186 next if /\x1a/; # skip ^Z
1187 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1189 my $type = $2;
1190 die "unknown matra $type" unless defined $matra_types{$type};
1191 $indic_table[hex $1] |= $matra_types{$type} << 8;
1192 next;
1194 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1196 my $type = $3;
1197 die "unknown matra $type" unless defined $matra_types{$type};
1198 foreach my $i (hex $1 .. hex $2)
1200 $indic_table[$i] |= $matra_types{$type} << 8;
1202 next;
1204 die "malformed line $_";
1206 close $INPUT;
1208 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1209 print "Building $filename\n";
1210 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1211 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1212 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1213 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1214 print OUTPUT "#include \"windef.h\"\n\n";
1216 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1218 close OUTPUT;
1219 save_file($filename);
1222 ################################################################
1223 # dump the Line Break Properties table
1224 sub dump_linebreak($)
1226 my $filename = shift;
1227 my @break_table;
1228 my $next_group = 0;
1230 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1231 while (<$INPUT>)
1233 next if /^\#/; # skip comments
1234 next if /^\s*$/; # skip empty lines
1235 next if /\x1a/; # skip ^Z
1236 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1238 my $type = $2;
1239 die "unknown breaktype $type" unless defined $break_types{$type};
1240 $break_table[hex $1] = $break_types{$type};
1241 next;
1243 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1245 my $type = $3;
1246 die "unknown breaktype $type" unless defined $break_types{$type};
1247 foreach my $i (hex $1 .. hex $2)
1249 $break_table[$i] = $break_types{$type};
1251 next;
1253 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1255 my $type = $2;
1256 die "unknown breaktype $type" unless defined $break_types{$type};
1257 $break_table[hex $1] = $break_types{$type};
1258 next;
1260 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1262 my $type = $3;
1263 die "unknown breaktype $type" unless defined $break_types{$type};
1264 foreach my $i (hex $1 .. hex $2)
1266 $break_table[$i] = $break_types{$type};
1268 next;
1270 die "malformed line $_";
1272 close $INPUT;
1274 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1275 print "Building $filename\n";
1276 print OUTPUT "/* Unicode Line Break Properties */\n";
1277 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1278 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1279 print OUTPUT "#include \"windef.h\"\n\n";
1281 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1283 close OUTPUT;
1284 save_file($filename);
1287 my %scripts =
1289 "Unknown" => 0,
1290 "Common" => 1,
1291 "Inherited" => 2,
1292 "Arabic" => 3,
1293 "Armenian" => 4,
1294 "Avestan" => 5,
1295 "Balinese" => 6,
1296 "Bamum" => 7,
1297 "Batak" => 8,
1298 "Bengali" => 9,
1299 "Bopomofo" => 10,
1300 "Brahmi" => 11,
1301 "Braille" => 12,
1302 "Buginese" => 13,
1303 "Buhid" => 14,
1304 "Canadian_Aboriginal" => 15,
1305 "Carian" => 16,
1306 "Cham" => 17,
1307 "Cherokee" => 18,
1308 "Coptic" => 19,
1309 "Cuneiform" => 20,
1310 "Cypriot" => 21,
1311 "Cyrillic" => 22,
1312 "Deseret" => 23,
1313 "Devanagari" => 24,
1314 "Egyptian_Hieroglyphs" => 25,
1315 "Ethiopic" => 26,
1316 "Georgian" => 27,
1317 "Glagolitic" => 28,
1318 "Gothic" => 29,
1319 "Greek" => 30,
1320 "Gujarati" => 31,
1321 "Gurmukhi" => 32,
1322 "Han" => 33,
1323 "Hangul" => 34,
1324 "Hanunoo" => 35,
1325 "Hebrew" => 36,
1326 "Hiragana" => 37,
1327 "Imperial_Aramaic" => 38,
1328 "Inscriptional_Pahlavi" => 39,
1329 "Inscriptional_Parthian" => 40,
1330 "Javanese" => 41,
1331 "Kaithi" => 42,
1332 "Kannada" => 43,
1333 "Katakana" => 44,
1334 "Kayah_Li" => 45,
1335 "Kharoshthi" => 46,
1336 "Khmer" => 47,
1337 "Lao" => 48,
1338 "Latin" => 49,
1339 "Lepcha" => 50,
1340 "Limbu" => 51,
1341 "Linear_B" => 52,
1342 "Lisu" => 53,
1343 "Lycian" => 54,
1344 "Lydian" => 55,
1345 "Malayalam" => 56,
1346 "Mandaic" => 57,
1347 "Meetei_Mayek" => 58,
1348 "Mongolian" => 59,
1349 "Myanmar" => 60,
1350 "New_Tai_Lue" => 61,
1351 "Nko" => 62,
1352 "Ogham" => 63,
1353 "Ol_Chiki" => 64,
1354 "Old_Italic" => 65,
1355 "Old_Persian" => 66,
1356 "Old_South_Arabian" => 67,
1357 "Old_Turkic" => 68,
1358 "Oriya" => 69,
1359 "Osmanya" => 70,
1360 "Phags_Pa" => 71,
1361 "Phoenician" => 72,
1362 "Rejang" => 73,
1363 "Runic" => 74,
1364 "Samaritan" => 75,
1365 "Saurashtra" => 76,
1366 "Shavian" => 77,
1367 "Sinhala" => 78,
1368 "Sundanese" => 79,
1369 "Syloti_Nagri" => 80,
1370 "Syriac" => 81,
1371 "Tagalog" => 82,
1372 "Tagbanwa" => 83,
1373 "Tai_Le" => 84,
1374 "Tai_Tham" => 85,
1375 "Tai_Viet" => 86,
1376 "Tamil" => 87,
1377 "Telugu" => 88,
1378 "Thaana" => 89,
1379 "Thai" => 90,
1380 "Tibetan" => 91,
1381 "Tifinagh" => 92,
1382 "Ugaritic" => 93,
1383 "Vai" => 94,
1384 "Yi" => 95,
1385 # Win8/Win8.1
1386 "Chakma" => 96,
1387 "Meroitic_Cursive" => 97,
1388 "Meroitic_Hieroglyphs" => 98,
1389 "Miao" => 99,
1390 "Sharada" => 100,
1391 "Sora_Sompeng" => 101,
1392 "Takri" => 102,
1393 # Win10
1394 "Bassa_Vah" => 103,
1395 "Caucasian_Albanian" => 104,
1396 "Duployan" => 105,
1397 "Elbasan" => 106,
1398 "Grantha" => 107,
1399 "Khojki" => 108,
1400 "Khudawadi" => 109,
1401 "Linear_A" => 110,
1402 "Mahajani" => 111,
1403 "Manichaean" => 112,
1404 "Mende_Kikakui" => 113,
1405 "Modi" => 114,
1406 "Mro" => 115,
1407 "Nabataean" => 116,
1408 "Old_North_Arabian" => 117,
1409 "Old_Permic" => 118,
1410 "Pahawh_Hmong" => 119,
1411 "Palmyrene" => 120,
1412 "Pau_Cin_Hau" => 121,
1413 "Psalter_Pahlavi" => 122,
1414 "Siddham" => 123,
1415 "Tirhuta" => 124,
1416 "Warang_Citi" => 125,
1417 # Win10 RS1
1418 "Adlam" => 126,
1419 "Ahom" => 127,
1420 "Anatolian_Hieroglyphs" => 128,
1421 "Bhaiksuki" => 129,
1422 "Hatran" => 130,
1423 "Marchen" => 131,
1424 "Multani" => 132,
1425 "Newa" => 133,
1426 "Old_Hungarian" => 134,
1427 "Osage" => 135,
1428 "SignWriting" => 136,
1429 "Tangut" => 137,
1430 # Win10 RS4
1431 "Masaram_Gondi" => 138,
1432 "Nushu" => 139,
1433 "Soyombo" => 140,
1434 "Zanabazar_Square" => 141,
1435 # Win10 1903
1436 "Dogra" => 142,
1437 "Gunjala_Gondi" => 143,
1438 "Hanifi_Rohingya" => 144,
1439 "Makasar" => 145,
1440 "Medefaidrin" => 146,
1441 "Old_Sogdian" => 147,
1442 "Sogdian" => 148,
1443 # Win10 2004
1444 "Elymaic" => 149,
1445 "Nyiakeng_Puachue_Hmong" => 150,
1446 "Nandinagari" => 151,
1447 "Wancho" => 152,
1450 ################################################################
1451 # dump Script IDs table
1452 sub dump_scripts($)
1454 my $filename = shift;
1455 my $header = $filename;
1456 my @scripts_table;
1457 my $script_index;
1458 my $i;
1460 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1461 # Fill the table
1462 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1463 while (<$INPUT>)
1465 my $type = "";
1467 next if /^\#/; # skip comments
1468 next if /^\s*$/; # skip empty lines
1469 next if /\x1a/; # skip ^Z
1470 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1472 $type = $2;
1473 if (defined $scripts{$type})
1475 $scripts_table[hex $1] = $scripts{$type};
1477 next;
1479 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1481 $type = $3;
1482 if (defined $scripts{$type})
1484 foreach my $i (hex $1 .. hex $2)
1486 $scripts_table[$i] = $scripts{$type};
1489 next;
1493 close $INPUT;
1495 $header = "$filename.h";
1496 open OUTPUT,">$header.new" or die "Cannot create $header";
1497 print "Building $header\n";
1498 print OUTPUT "/* Unicode Script IDs */\n";
1499 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1500 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1502 print OUTPUT "enum unicode_script_id {\n";
1503 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1505 print OUTPUT " Script_$script = $scripts{$script},\n";
1507 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1508 print OUTPUT "};\n";
1510 close OUTPUT;
1511 save_file($header);
1513 $filename = "$filename.c";
1514 open OUTPUT,">$filename.new" or die "Cannot create $header";
1515 print "Building $filename\n";
1516 print OUTPUT "/* Unicode Script IDs */\n";
1517 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1518 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1519 print OUTPUT "#include \"windef.h\"\n\n";
1521 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1522 close OUTPUT;
1523 save_file($filename);
1526 ################################################################
1527 # dump the BiDi mirroring table
1528 sub dump_mirroring($)
1530 my $filename = shift;
1531 my @mirror_table = ();
1533 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1534 while (<$INPUT>)
1536 next if /^\#/; # skip comments
1537 next if /^$/; # skip empty lines
1538 next if /\x1a/; # skip ^Z
1539 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1541 $mirror_table[hex $1] = hex $2;
1542 next;
1544 die "malformed line $_";
1546 close $INPUT;
1548 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1549 print "Building $filename\n";
1550 print OUTPUT "/* Unicode BiDi mirroring */\n";
1551 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1552 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1553 print OUTPUT "#include \"windef.h\"\n\n";
1554 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1555 close OUTPUT;
1556 save_file($filename);
1559 ################################################################
1560 # dump the Bidi Brackets
1561 sub dump_bracket($)
1563 my $filename = shift;
1564 my @bracket_table;
1566 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1567 while (<$INPUT>)
1569 next if /^\#/; # skip comments
1570 next if /^\s*$/; # skip empty lines
1571 next if /\x1a/; # skip ^Z
1572 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1574 my $type = $3;
1575 die "unknown bracket $type" unless defined $bracket_types{$type};
1576 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1577 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1578 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1579 next;
1581 die "malformed line $_";
1583 close $INPUT;
1585 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1586 print "Building $filename\n";
1587 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1588 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1589 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1590 print OUTPUT "#include \"windef.h\"\n\n";
1592 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1594 close OUTPUT;
1595 save_file($filename);
1598 ################################################################
1599 # dump the Arabic shaping table
1600 sub dump_shaping($)
1602 my $filename = shift;
1603 my %groups;
1604 my $next_group = 0;
1606 $groups{"No_Joining_Group"} = $next_group++;
1608 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1609 while (<$INPUT>)
1611 next if /^\#/; # skip comments
1612 next if /^\s*$/; # skip empty lines
1613 next if /\x1a/; # skip ^Z
1614 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1616 my $type = $2;
1617 my $group = $3;
1618 $groups{$group} = $next_group++ unless defined $groups{$group};
1619 $joining_table[hex $1] = $joining_types{$type} | ($groups{$group} << 8);
1620 next;
1622 die "malformed line $_";
1624 close $INPUT;
1626 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1627 print "Building $filename\n";
1628 print OUTPUT "/* Unicode Arabic shaping */\n";
1629 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1630 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1631 print OUTPUT "#include \"windef.h\"\n\n";
1633 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1635 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1636 for (my $i = 0x600; $i <= 0x6ff; $i++)
1638 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1639 ${joining_forms{"isolated"}}[$i] || $i,
1640 ${joining_forms{"final"}}[$i] || $i,
1641 ${joining_forms{"initial"}}[$i] || $i,
1642 ${joining_forms{"medial"}}[$i] || $i;
1644 print OUTPUT "};\n";
1646 close OUTPUT;
1647 save_file($filename);
1650 ################################################################
1651 # dump the Vertical Orientation table
1652 sub dump_vertical($)
1654 my $filename = shift;
1655 my @vertical_table;
1657 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1658 while (<$INPUT>)
1660 next if /^\#/; # skip comments
1661 next if /^\s*$/; # skip empty lines
1662 next if /\x1a/; # skip ^Z
1663 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1665 my $type = $2;
1666 die "unknown vertical $type" unless defined $vertical_types{$type};
1667 if (hex $1 < 65536)
1669 $vertical_table[hex $1] = $vertical_types{$type};
1671 next;
1673 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1675 my $type = $3;
1676 die "unknown vertical $type" unless defined $vertical_types{$type};
1677 foreach my $i (hex $1 .. hex $2)
1679 $vertical_table[$i] = $vertical_types{$type};
1681 next;
1683 die "malformed line $_";
1685 close $INPUT;
1687 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1688 print "Building $filename\n";
1689 print OUTPUT "/* Unicode Vertical Orientation */\n";
1690 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1691 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1692 print OUTPUT "#include \"windef.h\"\n\n";
1694 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1696 close OUTPUT;
1697 save_file($filename);
1700 ################################################################
1701 # dump the digit folding tables
1702 sub dump_digit_folding($)
1704 my ($filename) = shift;
1705 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1706 print "Building $filename\n";
1707 print OUTPUT "/* Unicode digit folding mappings */\n";
1708 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1709 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1710 print OUTPUT "#include \"windef.h\"\n\n";
1712 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1713 close OUTPUT;
1714 save_file($filename);
1718 ################################################################
1719 # dump the case mapping tables
1720 sub dump_case_mappings($)
1722 my $filename = shift;
1723 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1724 print "Building $filename\n";
1725 print OUTPUT "/* Unicode case mappings */\n";
1726 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1727 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1728 print OUTPUT "#include \"windef.h\"\n\n";
1730 my @upper = @toupper_table;
1731 my @lower = @tolower_table;
1732 remove_linguistic_mappings( \@upper, \@lower );
1734 dump_case_table( "wine_casemap_lower", @lower );
1735 print OUTPUT "\n";
1736 dump_case_table( "wine_casemap_upper", @upper );
1737 close OUTPUT;
1738 save_file($filename);
1742 ################################################################
1743 # dump a case mapping table
1744 sub dump_case_table($@)
1746 my ($name,@table) = @_;
1748 for (my $i = 0; $i < 65536; $i++)
1750 next unless defined $table[$i];
1751 $table[$i] = ($table[$i] - $i) & 0xffff;
1754 my @array = compress_array( 256, 0, @table[0..65535] );
1756 printf OUTPUT "const WCHAR %s[%d] =\n", $name, scalar @array;
1757 printf OUTPUT "{\n /* index */\n";
1758 printf OUTPUT "%s,\n", dump_array( 16, 0, @array[0..255] );
1759 printf OUTPUT " /* data */\n";
1760 printf OUTPUT "%s", dump_array( 16, 0, @array[256..$#array] );
1761 printf OUTPUT "\n};\n";
1764 ################################################################
1765 # compress a mapping table by removing identical rows
1766 sub compress_array($$@)
1768 my $rows = shift;
1769 my $def = shift;
1770 my @table = @_;
1771 my $len = @table / $rows;
1772 my @array;
1773 my $data = "";
1775 # try to merge table rows
1776 for (my $row = 0; $row < $rows; $row++)
1778 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1779 my $pos = index $data, $rowtxt;
1780 if ($pos == -1)
1782 # check if the tail of the data can match the start of the new row
1783 my $first = substr( $rowtxt, 0, 1 );
1784 for (my $i = length($data) - 1; $i > 0; $i--)
1786 $pos = index( substr( $data, -$i ), $first );
1787 last if $pos == -1;
1788 $i -= $pos;
1789 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1790 substr( $data, -$i ) = "";
1791 last;
1793 $pos = length $data;
1794 $data .= $rowtxt;
1796 $array[$row] = $rows + $pos;
1798 return @array, unpack "U*", $data;
1801 ################################################################
1802 # dump a simple char -> 16-bit value mapping table
1803 sub dump_simple_mapping($$@)
1805 my $name = shift;
1806 my $def = shift;
1807 my @array = compress_array( 256, $def, @_[0..65535] );
1809 printf OUTPUT "const unsigned short %s[%d] =\n{\n", $name, $#array+1;
1810 printf OUTPUT " /* offsets */\n%s,\n", dump_array( 16, 0, @array[0..255] );
1811 printf OUTPUT " /* values */\n%s\n};\n", dump_array( 16, 0, @array[256..$#array] );
1814 ################################################################
1815 # dump a char -> 16-bit value mapping table using two-level tables
1816 sub dump_two_level_mapping($$@)
1818 my $name = shift;
1819 my $def = shift;
1820 my $size = shift;
1821 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1822 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1823 my @array = compress_array( 256, 0, @row_array[0..4095] );
1825 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1827 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1828 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1829 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1830 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1833 ################################################################
1834 # dump a char -> value mapping table using three-level tables
1835 sub dump_three_level_mapping($$@)
1837 my $name = shift;
1838 my $def = shift;
1839 my $size = shift;
1840 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1841 my $level3 = ($MAX_CHAR + 1) / 16;
1842 my $level2 = $level3 / 16;
1843 my $level1 = $level2 / 16;
1844 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1845 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1846 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1848 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1849 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1851 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1852 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1853 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1854 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1855 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1858 ################################################################
1859 # dump a binary case mapping table in l_intl.nls format
1860 sub dump_binary_case_table(@)
1862 my (@table) = @_;
1863 my $max_char = 0x10000;
1864 my $level1 = $max_char / 16;
1865 my $level2 = $level1 / 16;
1867 my @difftable;
1868 for (my $i = 0; $i < @table; $i++)
1870 next unless defined $table[$i];
1871 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1874 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1875 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1876 my $offset = @array - $level1;
1877 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1878 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1881 ################################################################
1882 # dump case mappings for l_intl.nls
1883 sub dump_intl_nls($)
1885 my @upper_table = @toupper_table;
1886 my @lower_table = @tolower_table;
1887 remove_linguistic_mappings( \@upper_table, \@lower_table );
1889 my $upper = dump_binary_case_table( @upper_table );
1890 my $lower = dump_binary_case_table( @lower_table );
1892 my $filename = shift;
1893 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1894 printf "Building $filename\n";
1896 binmode OUTPUT;
1897 print OUTPUT pack "S<", 1; # version
1898 print OUTPUT $upper;
1899 print OUTPUT $lower;
1900 close OUTPUT;
1901 save_file($filename);
1905 ################################################################
1906 # dump the bidi direction table
1907 sub dump_bidi_dir_table($)
1909 my $filename = shift;
1910 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1911 printf "Building $filename\n";
1912 printf OUTPUT "/* Unicode BiDi direction table */\n";
1913 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1914 printf OUTPUT "#include \"windef.h\"\n\n";
1916 my @table;
1918 for (my $i = 0; $i < 65536; $i++)
1920 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1923 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1925 close OUTPUT;
1926 save_file($filename);
1930 ################################################################
1931 # dump the ctype tables
1932 sub dump_ctype_tables($)
1934 my $filename = shift;
1935 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1936 printf "Building $filename\n";
1937 printf OUTPUT "/* Unicode ctype tables */\n";
1938 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1939 printf OUTPUT "#include \"windef.h\"\n\n";
1941 my @table = map { ($_ || 0) & 0xffff; } @category_table;
1943 # add the direction in the high 4 bits of the category
1944 for (my $i = 0; $i < 65536; $i++)
1946 $table[$i] |= $directions{$direction_table[$i]} << 12 if defined $direction_table[$i];
1949 dump_simple_mapping( "wine_wctype_table", 0, @table );
1951 close OUTPUT;
1952 save_file($filename);
1956 sub rol($$)
1958 my ($byte, $count) = @_;
1959 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1962 ################################################################
1963 # compress the character properties table
1964 sub compress_char_props_table($@)
1966 my $rows = shift;
1967 my @table = @_;
1968 my $len = @table / $rows;
1969 my $pos = 0;
1970 my @array = (0) x $rows;
1971 my %sequences;
1973 # add some predefined sequences
1974 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1976 # try to merge table rows
1977 for (my $row = 0; $row < $rows; $row++)
1979 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1980 my $rowtxt = pack "L*", @table_row;
1981 if (defined($sequences{$rowtxt}))
1983 # reuse an existing row
1984 $array[$row] = $sequences{$rowtxt};
1986 else
1988 # create a new row
1989 $sequences{$rowtxt} = $array[$row] = ++$pos;
1990 push @array, @table_row;
1993 return @array;
1996 ################################################################
1997 # dump a normalization table in binary format
1998 sub dump_norm_table($)
2000 my $filename = shift;
2002 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2003 my %decomp = ( "nfc" => \@decomp_table,
2004 "nfd" => \@decomp_table,
2005 "nfkc" => \@decomp_compat_table,
2006 "nfkd" => \@decomp_compat_table ,
2007 "idna" => \@idna_decomp_table );
2009 open OUTPUT,">$filename.new" or die "Cannot create $filename";
2010 print "Building $filename\n";
2012 my $type = $filename;
2013 $type =~ s!.*/norm(\w+)\.nls!$1!;
2015 my $compose = $forms{$type} & 1;
2016 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2018 my @version = split /\./, $UNIVERSION;
2020 # combining classes
2022 my @classes;
2023 my @class_values;
2025 foreach my $c (grep defined, @combining_class_table)
2027 $classes[$c] = 1 if $c < 0x100;
2029 for (my $i = 0; $i < @classes; $i++)
2031 next unless defined $classes[$i];
2032 $classes[$i] = @class_values;
2033 push @class_values, $i;
2035 push @class_values, 0 if (@class_values % 2);
2036 die "too many classes" if @class_values >= 0x40;
2038 # character properties
2040 my @char_props;
2041 my @decomposed;
2042 my @comp_hash_table;
2043 my $comp_hash_size = $compose ? 254 : 0;
2045 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2047 next unless defined $combining_class_table[$i];
2048 if (defined $decomp{$type}->[$i])
2050 my @dec = get_decomposition( $i, $decomp{$type} );
2051 if ($compose && (my @comp = get_composition( $i, $compat )))
2053 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2054 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2056 my $val = 0;
2057 foreach my $d (@dec)
2059 $val = $combining_class_table[$d];
2060 last if $val;
2062 $char_props[$i] = $classes[$val];
2064 else
2066 $char_props[$i] = 0xbf;
2068 @dec = compose_hangul( @dec ) if $compose;
2069 @dec = to_utf16( @dec );
2070 push @dec, 0 if @dec >= 7;
2071 $decomposed[$i] = \@dec;
2073 else
2075 if ($combining_class_table[$i] == 0x100)
2077 $char_props[$i] = 0x7f;
2079 elsif ($combining_class_table[$i])
2081 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2083 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2085 $char_props[$i] = 0xff;
2087 else
2089 $char_props[$i] = 0;
2094 if ($compose)
2096 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2098 my @comp = get_composition( $i, $compat );
2099 next unless @comp;
2100 if ($combining_class_table[$comp[1]])
2102 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2103 $char_props[$comp[1]] |= 0x40;
2105 else
2107 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2108 $char_props[$comp[1]] |= 0xc0;
2113 # surrogates
2114 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2115 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2117 # Hangul
2118 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2119 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2120 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2122 # invalid chars
2123 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2124 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2125 foreach my $i (0x00..0x10)
2127 $char_props[($i << 16) | 0xfffe] = 0xff;
2128 $char_props[($i << 16) | 0xffff] = 0xff;
2131 # decomposition hash table
2133 my @decomp_hash_table;
2134 my @decomp_hash_index;
2135 my @decomp_hash_data;
2136 my $decomp_hash_size = 944;
2138 # build string of character data, reusing substrings when possible
2139 my $decomp_char_data = "";
2140 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2142 my $str = pack "U*", @{$i};
2143 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2145 for (my $i = 0; $i < @decomposed; $i++)
2147 next unless defined $decomposed[$i];
2148 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2149 die "sequence not found" if $pos == -1;
2150 my $len = @{$decomposed[$i]};
2151 $len = 7 if $len > 7;
2152 my $hash = $i % $decomp_hash_size;
2153 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2155 for (my $i = 0; $i < $decomp_hash_size; $i++)
2157 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2158 next unless defined $decomp_hash_table[$i];
2159 if (@{$decomp_hash_table[$i]} == 1)
2161 my $entry = $decomp_hash_table[$i]->[0];
2162 if ($char_props[$entry->[0]] == 0xbf)
2164 $decomp_hash_index[$i] = $entry->[1];
2165 next;
2168 foreach my $entry (@{$decomp_hash_table[$i]})
2170 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2173 push @decomp_hash_data, 0, 0;
2175 # composition hash table
2177 my @comp_hash_index;
2178 my @comp_hash_data;
2179 if (@comp_hash_table)
2181 for (my $i = 0; $i < $comp_hash_size; $i++)
2183 $comp_hash_index[$i] = @comp_hash_data;
2184 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2186 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2187 push @comp_hash_data, 0, 0, 0;
2190 my $level1 = ($MAX_CHAR + 1) / 128;
2191 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2193 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2194 0, $decomp_hash_size, $comp_hash_size, 0 );
2195 my @tables = (0) x 8;
2197 $tables[0] = 16 + @header + @tables;
2198 $tables[1] = $tables[0] + @class_values / 2;
2199 $tables[2] = $tables[1] + $level1 / 2;
2200 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2201 $tables[4] = $tables[3] + @decomp_hash_index;
2202 $tables[5] = $tables[4] + @decomp_hash_data;
2203 $tables[6] = $tables[5] + length $decomp_char_data;
2204 $tables[7] = $tables[6] + @comp_hash_index;
2206 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2207 print OUTPUT pack "S<*", @header;
2208 print OUTPUT pack "S<*", @tables;
2209 print OUTPUT pack "C*", @class_values;
2211 print OUTPUT pack "C*", @rows[0..$level1-1];
2212 print OUTPUT pack "C*", @rows[$level1..$#rows];
2213 print OUTPUT pack "S<*", @decomp_hash_index;
2214 print OUTPUT pack "S<*", @decomp_hash_data;
2215 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2216 print OUTPUT pack "S<*", @comp_hash_index;
2217 print OUTPUT pack "S<*", @comp_hash_data;
2219 close OUTPUT;
2220 save_file($filename);
2222 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2226 ################################################################
2227 # output a codepage definition file from the global tables
2228 sub output_codepage_file($)
2230 my $codepage = shift;
2232 my $output = sprintf "nls/c_%03d.nls", $codepage;
2233 open OUTPUT,">$output.new" or die "Cannot create $output";
2235 printf "Building %s\n", $output;
2236 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2237 else { dump_binary_dbcs_table( $codepage ); }
2239 close OUTPUT;
2240 save_file($output);
2242 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2245 ################################################################
2246 # output a codepage table from a Microsoft-style mapping file
2247 sub dump_msdata_codepage($)
2249 my $filename = shift;
2251 my $state = "";
2252 my ($codepage, $width, $count);
2253 my ($lb_cur, $lb_end);
2255 @cp2uni = ();
2256 @glyph2uni = ();
2257 @lead_bytes = ();
2258 @uni2cp = ();
2259 $default_char = $DEF_CHAR;
2260 $default_wchar = $DEF_CHAR;
2262 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2264 while (<$INPUT>)
2266 next if /^;/; # skip comments
2267 next if /^\s*$/; # skip empty lines
2268 next if /\x1a/; # skip ^Z
2269 last if /^ENDCODEPAGE/;
2271 if (/^CODEPAGE\s+(\d+)/)
2273 $codepage = $1;
2274 next;
2276 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2278 $width = $1;
2279 $default_char = hex $2;
2280 $default_wchar = hex $3;
2281 next;
2283 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2285 $state = $1;
2286 $count = $2;
2287 next;
2289 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2291 if ($state eq "MBTABLE")
2293 my $cp = hex $1;
2294 my $uni = hex $2;
2295 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2296 next;
2298 if ($state eq "GLYPHTABLE")
2300 my $cp = hex $1;
2301 my $uni = hex $2;
2302 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2303 next;
2305 if ($state eq "WCTABLE")
2307 my $uni = hex $1;
2308 my $cp = hex $2;
2309 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2310 next;
2312 if ($state eq "DBCSRANGE")
2314 my $start = hex $1;
2315 my $end = hex $2;
2316 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2317 $lb_cur = $start;
2318 $lb_end = $end;
2319 next;
2321 if ($state eq "DBCSTABLE")
2323 my $mb = hex $1;
2324 my $uni = hex $2;
2325 my $cp = ($lb_cur << 8) | $mb;
2326 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2327 if (!--$count)
2329 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2331 next;
2334 die "$filename: Unrecognized line $_\n";
2336 close $INPUT;
2338 output_codepage_file( $codepage );
2341 ################################################################
2342 # align a string length
2343 sub align_string($$)
2345 my ($align, $str) = @_;
2346 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2347 return $str;
2350 ################################################################
2351 # pack a GUID string
2352 sub pack_guid($)
2354 $_ = shift;
2355 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2356 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2359 ################################################################
2360 # comparison function for compression sort
2361 sub cmp_compression
2363 return scalar @{$a} <=> scalar @{$b} ||
2364 $a->[4] <=> $b->[4] ||
2365 $a->[5] <=> $b->[5] ||
2366 $a->[6] <=> $b->[6] ||
2367 $a->[7] <=> $b->[7] ||
2368 $a->[8] <=> $b->[8] ||
2369 $a->[9] <=> $b->[9] ||
2370 $a->[10] <=> $b->[10] ||
2371 $a->[11] <=> $b->[11] ||
2372 $a->[12] <=> $b->[12];
2375 ################################################################
2376 # build a binary sort keys table
2377 sub dump_sortkey_table($$)
2379 my ($filename, $download) = @_;
2381 my @keys;
2382 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2383 my @multiple_weights;
2384 my @expansions;
2385 my @compressions;
2386 my %exceptions;
2387 my %guids;
2388 my %compr_flags;
2389 my %locales;
2390 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2391 my $jamostr = "";
2393 my $re_hex = '0x[0-9A-Fa-f]+';
2394 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2395 $guids{$default_guid} = { };
2397 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2399 my $KEYS = open_data_file( $MSDATA, $download );
2401 printf "Building $filename\n";
2403 while (<$KEYS>)
2405 s/\s*;.*$//;
2406 next if /^\s*$/; # skip empty lines
2407 if (/^\s*(SORTKEY|SORTTABLES)/)
2409 $part = $1;
2410 next;
2412 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2414 $part = $section = "";
2415 next;
2417 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2419 $section = $1;
2420 $guid = undef;
2421 next;
2423 next unless $part;
2424 if ("$part.$section" eq "SORTKEY.DEFAULT")
2426 if (/^\s*($re_hex)\s+$re_key/)
2428 $keys[hex $1] = [ split(/\s+/,$2) ];
2429 next;
2432 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2434 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2436 $version = hex $1;
2437 next;
2439 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2441 # ignore for now
2442 next;
2445 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2446 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2447 "$part.$section" eq "SORTTABLES.INVERSECASING")
2449 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2451 $guid = lc $1;
2452 $guids{$guid} = { } unless defined $guids{$guid};
2453 $guids{$guid}->{flags} |= $flags{$section};
2454 next;
2456 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2458 $locales{$1} = $guid;
2459 next;
2462 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2464 if (/^\s*(\d+)\s+(\d+)/)
2466 push @multiple_weights, $1, $2;
2467 next;
2470 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2472 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2474 my $pos = scalar @expansions / 2;
2475 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2476 push @expansions, hex $2, hex $3;
2477 next;
2480 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2482 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2484 $keys[hex $1] = $keys[hex $2];
2485 next;
2488 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2490 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2492 if ($subsection || !$guid) # start a new one
2494 $guid = lc $1;
2495 $subsection = "";
2496 $guids{$guid} = { } unless defined $guids{$guid};
2497 $guids{$guid}->{flags} |= $flags{$2} if $2;
2498 $guids{$guid}->{compr} = @compressions;
2499 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2500 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2501 push @compressions, [ ];
2503 else # merge with current one
2505 $guids{lc $1} = { } unless defined $guids{lc $1};
2506 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2507 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2508 $compr_flags{lc $1} = $compr_flags{$guid};
2510 next;
2512 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2514 $locales{$1} = $guid;
2515 next;
2517 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2519 $subsection = $1;
2520 next;
2522 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2524 my @comp = map { hex $_; } split(/\s+/,$1);
2525 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2526 # add compression flags
2527 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2528 next;
2531 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2533 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2535 $guid = lc $1;
2536 $guids{$guid} = { } unless defined $guids{lc $1};
2537 $ling_flag = ($2 ? "+" : "-");
2538 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2539 next;
2541 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2543 $locales{$1} = $guid;
2544 next;
2546 if (/^\s*($re_hex)\s+$re_key/)
2548 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2549 next;
2552 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2554 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2556 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2557 next;
2560 die "$download: $part.$section: unrecognized line $_\n";
2562 close $KEYS;
2564 # Sortkey table
2566 my $table;
2567 for (my $i = 0; $i < 0x10000; $i++)
2569 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2570 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2573 foreach my $id (sort keys %exceptions)
2575 my $pos = length($table) / 4;
2576 my @exc = @{$exceptions{$id}};
2577 my @filled;
2578 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2579 my $guid = substr( $id, 0, -1 );
2580 $guids{$guid}->{$key} = $pos;
2581 $pos += 0x100;
2582 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2583 for (my $j = 0; $j < 0x10000; $j++)
2585 next unless defined $exc[$j] || defined $flags[$j];
2586 $filled[$j >> 8] = 1;
2587 $j |= 0xff;
2589 for (my $j = 0; $j < 0x100; $j++)
2591 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2592 $pos += 0x100 if $filled[$j];
2594 for (my $j = 0; $j < 0x10000; $j++)
2596 next unless $filled[$j >> 8];
2597 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2598 $k[3] |= $flags[$j] || 0;
2599 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2603 # Case mapping tables
2605 # standard table
2606 my @casemaps;
2607 my @upper = @toupper_table;
2608 my @lower = @tolower_table;
2609 remove_linguistic_mappings( \@upper, \@lower );
2610 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2612 # linguistic table
2613 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2615 # Turkish table
2616 @upper = @toupper_table;
2617 @lower = @tolower_table;
2618 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2619 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2620 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2621 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2623 # Char type table
2625 my @table;
2626 my $types = "";
2627 my %typestr;
2628 for (my $i = 0; $i < 0x10000; $i++)
2630 my $str = pack "S<3",
2631 ($category_table[$i] || 0) & 0xffff,
2632 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2633 ($category_table[$i] || 0) >> 16;
2635 if (!defined($typestr{$str}))
2637 $typestr{$str} = length($types) / 6;
2638 $types .= $str;
2640 $table[$i] = $typestr{$str};
2643 my @rows = compress_array( 4096, 0, @table[0..65535] );
2644 my @array = compress_array( 256, 0, @rows[0..4095] );
2645 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2646 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2648 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2649 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2650 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2652 # Sort tables
2654 # guids
2655 my $sorttables = pack "L<2", $version, scalar %guids;
2656 foreach my $id (sort keys %guids)
2658 my %guid = %{$guids{$id}};
2659 my $flags = $guid{flags} || 0;
2660 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2661 $sorttables .= pack_guid($id) . pack "L<5",
2662 $flags,
2663 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2664 $guid{except} || 0,
2665 $guid{ling_except} || 0,
2666 $map / 2;
2669 # expansions
2670 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2672 # compressions
2673 $sorttables .= pack "L<", scalar @compressions;
2674 my $rowstr = "";
2675 foreach my $c (@compressions)
2677 my $pos = length($rowstr) / 2;
2678 my $min = 0xffff;
2679 my $max = 0;
2680 my @lengths = (0) x 8;
2681 foreach my $r (sort cmp_compression @{$c})
2683 my @row = @{$r};
2684 $lengths[scalar @row - 6]++;
2685 foreach my $val (@row[4..$#row])
2687 $min = $val if $min > $val;
2688 $max = $val if $max < $val;
2690 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2691 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2693 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2695 $sorttables .= $rowstr;
2697 # multiple weights
2698 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2700 # jamo sort
2701 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2703 # Locales
2705 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2706 foreach my $loc (sort keys %locales)
2708 # skip specific locales that match more general ones
2709 my @parts = split /[-_]/, $loc;
2710 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2711 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2712 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2715 # File header
2717 my @header;
2718 $header[0] = 16;
2719 $header[1] = $header[0] + length $table;
2720 $header[2] = $header[1] + length $casemaps;
2721 $header[3] = $header[2] + length $chartypes;
2723 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2724 print OUTPUT pack "L<*", @header;
2725 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2726 close OUTPUT;
2727 save_file($filename);
2731 ################################################################
2732 # build the script to create registry keys
2733 sub dump_registry_script($%)
2735 my ($filename, %keys) = @_;
2736 my $indent = 1;
2738 printf "Building %s\n", $filename;
2739 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2740 print OUTPUT "HKLM\n{\n";
2741 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2743 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2744 $indent++;
2746 foreach my $k (sort keys %keys)
2748 my @subkeys = split /\\/, $k;
2749 my ($def, @vals) = @{$keys{$k}};
2750 for (my $i = 0; $i < @subkeys; $i++)
2752 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2753 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2754 $indent++;
2756 foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2757 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2759 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2760 close OUTPUT;
2761 save_file($filename);
2765 ################################################################
2766 # save a file if modified
2767 sub save_file($)
2769 my $file = shift;
2770 if (-f $file && !system "cmp $file $file.new >/dev/null")
2772 unlink "$file.new";
2774 else
2776 rename "$file.new", "$file";
2781 ################################################################
2782 # main routine
2784 chdir ".." if -f "./make_unicode";
2785 load_data();
2786 dump_case_mappings( "libs/port/casemap.c" );
2787 dump_sortkeys( "dlls/kernelbase/collation.c" );
2788 dump_ctype_tables( "libs/port/wctype.c" );
2789 dump_bidi_dir_table( "dlls/gdi32/direction.c" );
2790 dump_bidi_dir_table( "dlls/usp10/direction.c" );
2791 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2792 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2793 dump_mirroring( "dlls/usp10/mirror.c" );
2794 dump_mirroring( "dlls/dwrite/mirror.c" );
2795 dump_bracket( "dlls/usp10/bracket.c" );
2796 dump_bracket( "dlls/dwrite/bracket.c" );
2797 dump_shaping( "dlls/usp10/shaping.c" );
2798 dump_linebreak( "dlls/usp10/linebreak.c" );
2799 dump_linebreak( "dlls/dwrite/linebreak.c" );
2800 dump_scripts( "dlls/dwrite/scripts" );
2801 dump_indic( "dlls/usp10/indicsyllable.c" );
2802 dump_vertical( "dlls/gdi32/vertical.c" );
2803 dump_vertical( "dlls/wineps.drv/vertical.c" );
2804 dump_intl_nls("nls/l_intl.nls");
2805 dump_norm_table( "nls/normnfc.nls" );
2806 dump_norm_table( "nls/normnfd.nls" );
2807 dump_norm_table( "nls/normnfkc.nls" );
2808 dump_norm_table( "nls/normnfkd.nls" );
2809 dump_norm_table( "nls/normidna.nls" );
2810 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2811 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2812 dump_eucjp_codepage();
2813 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2815 exit 0;
2817 # Local Variables:
2818 # compile-command: "./make_unicode"
2819 # End: