qcap: Move the video capture streaming thread to vfwcapture.c.
[wine.git] / tools / make_unicode
blob8d5747ce253a812bc8cca963016fbec3130c522c
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "13.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $REPORTS = "http://www.unicode.org/reports";
30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
33 # Sort keys file
34 my $SORTKEYS = "tr10/allkeys.txt";
36 # Default char for undefined mappings
37 my $DEF_CHAR = ord '?';
39 # Last valid Unicode character
40 my $MAX_CHAR = 0x10ffff;
42 my @allfiles =
44 "CodpageFiles/037.txt",
45 "CodpageFiles/437.txt",
46 "CodpageFiles/500.txt",
47 "CodpageFiles/708.txt",
48 "CodpageFiles/737.txt",
49 "CodpageFiles/775.txt",
50 "CodpageFiles/850.txt",
51 "CodpageFiles/852.txt",
52 "CodpageFiles/855.txt",
53 "CodpageFiles/857.txt",
54 "CodpageFiles/860.txt",
55 "CodpageFiles/861.txt",
56 "CodpageFiles/862.txt",
57 "CodpageFiles/863.txt",
58 "CodpageFiles/864.txt",
59 "CodpageFiles/865.txt",
60 "CodpageFiles/866.txt",
61 "CodpageFiles/869.txt",
62 "CodpageFiles/874.txt",
63 "CodpageFiles/875.txt",
64 "CodpageFiles/932.txt",
65 "CodpageFiles/936.txt",
66 "CodpageFiles/949.txt",
67 "CodpageFiles/950.txt",
68 "CodpageFiles/1026.txt",
69 "CodpageFiles/1250.txt",
70 "CodpageFiles/1251.txt",
71 "CodpageFiles/1252.txt",
72 "CodpageFiles/1253.txt",
73 "CodpageFiles/1254.txt",
74 "CodpageFiles/1255.txt",
75 "CodpageFiles/1256.txt",
76 "CodpageFiles/1257.txt",
77 "CodpageFiles/1258.txt",
78 "CodpageFiles/1361.txt",
79 "CodpageFiles/10000.txt",
80 "CodpageFiles/10001.txt",
81 "CodpageFiles/10002.txt",
82 "CodpageFiles/10003.txt",
83 "CodpageFiles/10004.txt",
84 "CodpageFiles/10005.txt",
85 "CodpageFiles/10006.txt",
86 "CodpageFiles/10007.txt",
87 "CodpageFiles/10008.txt",
88 "CodpageFiles/10010.txt",
89 "CodpageFiles/10017.txt",
90 "CodpageFiles/10021.txt",
91 "CodpageFiles/10029.txt",
92 "CodpageFiles/10079.txt",
93 "CodpageFiles/10081.txt",
94 "CodpageFiles/10082.txt",
95 "CodpageFiles/20127.txt",
96 "CodpageFiles/20866.txt",
97 "CodpageFiles/21866.txt",
98 "CodpageFiles/28591.txt",
99 "CodpageFiles/28592.txt",
100 "CodpageFiles/28593.txt",
101 "CodpageFiles/28594.txt",
102 "CodpageFiles/28595.txt",
103 "CodpageFiles/28596.txt",
104 "CodpageFiles/28597.txt",
105 "CodpageFiles/28598.txt",
106 "CodpageFiles/28599.txt",
107 "CodpageFiles/28603.txt",
108 "CodpageFiles/28605.txt",
112 my %ctype =
114 # CT_CTYPE1
115 "upper" => 0x0001,
116 "lower" => 0x0002,
117 "digit" => 0x0004,
118 "space" => 0x0008,
119 "punct" => 0x0010,
120 "cntrl" => 0x0020,
121 "blank" => 0x0040,
122 "xdigit" => 0x0080,
123 "alpha" => 0x0100 | 0x80000000,
124 "defin" => 0x0200,
125 # CT_CTYPE3 in high 16 bits
126 "nonspacing" => 0x00010000,
127 "diacritic" => 0x00020000,
128 "vowelmark" => 0x00040000,
129 "symbol" => 0x00080000,
130 "katakana" => 0x00100000,
131 "hiragana" => 0x00200000,
132 "halfwidth" => 0x00400000,
133 "fullwidth" => 0x00800000,
134 "ideograph" => 0x01000000,
135 "kashida" => 0x02000000,
136 "lexical" => 0x04000000,
137 "highsurrogate" => 0x08000000,
138 "lowsurrogate" => 0x10000000,
141 my %bracket_types =
143 "o" => 0x0000,
144 "c" => 0x0001,
147 my %indic_types =
149 "Other" => 0x0000,
150 "Bindu" => 0x0001,
151 "Visarga" => 0x0002,
152 "Avagraha" => 0x0003,
153 "Nukta" => 0x0004,
154 "Virama" => 0x0005,
155 "Vowel_Independent" => 0x0006,
156 "Vowel_Dependent" => 0x0007,
157 "Vowel" => 0x0008,
158 "Consonant_Placeholder" => 0x0009,
159 "Consonant" => 0x000a,
160 "Consonant_Dead" => 0x000b,
161 "Consonant_Succeeding_Repha" => 0x000c,
162 "Consonant_Subjoined" => 0x000d,
163 "Consonant_Medial" => 0x000e,
164 "Consonant_Final" => 0x000f,
165 "Consonant_Head_Letter" => 0x0010,
166 "Modifying_Letter" => 0x0011,
167 "Tone_Letter" => 0x0012,
168 "Tone_Mark" => 0x0013,
169 "Register_Shifter" => 0x0014,
170 "Consonant_Preceding_Repha" => 0x0015,
171 "Pure_Killer" => 0x0016,
172 "Invisible_Stacker" => 0x0017,
173 "Gemination_Mark" => 0x0018,
174 "Cantillation_Mark" => 0x0019,
175 "Non_Joiner" => 0x001a,
176 "Joiner" => 0x001b,
177 "Number_Joiner" => 0x001c,
178 "Number" => 0x001d,
179 "Brahmi_Joining_Number" => 0x001e,
180 "Consonant_With_Stacker" => 0x001f,
181 "Consonant_Prefixed" => 0x0020,
182 "Syllable_Modifier" => 0x0021,
183 "Consonant_Killer" => 0x0022,
184 "Consonant_Initial_Postfixed" => 0x0023,
187 my %matra_types =
189 "Right" => 0x01,
190 "Left" => 0x02,
191 "Visual_Order_Left" => 0x03,
192 "Left_And_Right" => 0x04,
193 "Top" => 0x05,
194 "Bottom" => 0x06,
195 "Top_And_Bottom" => 0x07,
196 "Top_And_Right" => 0x08,
197 "Top_And_Left" => 0x09,
198 "Top_And_Left_And_Right" => 0x0a,
199 "Bottom_And_Right" => 0x0b,
200 "Top_And_Bottom_And_Right" => 0x0c,
201 "Overstruck" => 0x0d,
202 "Invisible" => 0x0e,
203 "Bottom_And_Left" => 0x0f,
204 "Top_And_Bottom_And_Left" => 0x10,
207 my %break_types =
209 "BK" => 0x0001,
210 "CR" => 0x0002,
211 "LF" => 0x0003,
212 "CM" => 0x0004,
213 "SG" => 0x0005,
214 "GL" => 0x0006,
215 "CB" => 0x0007,
216 "SP" => 0x0008,
217 "ZW" => 0x0009,
218 "NL" => 0x000a,
219 "WJ" => 0x000b,
220 "JL" => 0x000c,
221 "JV" => 0x000d,
222 "JT" => 0x000e,
223 "H2" => 0x000f,
224 "H3" => 0x0010,
225 "XX" => 0x0011,
226 "OP" => 0x0012,
227 "CL" => 0x0013,
228 "CP" => 0x0014,
229 "QU" => 0x0015,
230 "NS" => 0x0016,
231 "EX" => 0x0017,
232 "SY" => 0x0018,
233 "IS" => 0x0019,
234 "PR" => 0x001a,
235 "PO" => 0x001b,
236 "NU" => 0x001c,
237 "AL" => 0x001d,
238 "ID" => 0x001e,
239 "IN" => 0x001f,
240 "HY" => 0x0020,
241 "BB" => 0x0021,
242 "BA" => 0x0022,
243 "SA" => 0x0023,
244 "AI" => 0x0024,
245 "B2" => 0x0025,
246 "HL" => 0x0026,
247 "CJ" => 0x0027,
248 "RI" => 0x0028,
249 "EB" => 0x0029,
250 "EM" => 0x002a,
251 "ZWJ" => 0x002b,
254 my %vertical_types =
256 "R" => 0x0000,
257 "U" => 0x0001,
258 "Tr" => 0x0002,
259 "Tu" => 0x0003,
262 my %categories =
264 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
265 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
266 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
267 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
268 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
269 "Me" => $ctype{"defin"}, # Mark, Enclosing
270 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
271 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
272 "No" => $ctype{"defin"}, # Number, Other
273 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
274 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
275 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
276 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
277 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
278 "Cs" => $ctype{"defin"}, # Other, Surrogate
279 "Co" => $ctype{"defin"}, # Other, Private Use
280 "Cn" => $ctype{"defin"}, # Other, Not Assigned
281 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
282 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
283 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
284 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
285 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
286 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
287 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
288 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
289 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
290 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
291 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
292 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
293 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
296 # a few characters need additional categories that cannot be determined automatically
297 my %special_categories =
299 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
300 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
301 "space" => [ 0x09..0x0d, 0x85 ],
302 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
303 "cntrl" => [ 0x070f, 0x200c, 0x200d,
304 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
305 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
306 0xfff9, 0xfffa, 0xfffb ],
307 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
308 0xd7, 0xf7 ],
309 "digit" => [ 0xb2, 0xb3, 0xb9 ],
310 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
311 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
312 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
313 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
314 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
315 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
316 0x02b9..0x02ba, 0x02c6..0x02cf ],
317 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
318 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
319 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
320 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
321 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
322 0x3131..0x3164 ],
323 "ideograph" => [ 0x3006..0x3007 ],
324 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
325 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
326 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
327 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
328 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
329 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
330 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
331 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
332 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
333 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
334 "kashida" => [ 0x0640 ],
337 my %directions =
339 "L" => 1, # Left-to-Right
340 "R" => 2, # Right-to-Left
341 "AL" => 12, # Right-to-Left Arabic
342 "EN" => 3, # European Number
343 "ES" => 4, # European Number Separator
344 "ET" => 5, # European Number Terminator
345 "AN" => 6, # Arabic Number
346 "CS" => 7, # Common Number Separator
347 "NSM" => 13, # Non-Spacing Mark
348 "BN" => 14, # Boundary Neutral
349 "B" => 8, # Paragraph Separator
350 "S" => 9, # Segment Separator
351 "WS" => 10, # Whitespace
352 "ON" => 11, # Other Neutrals
353 "LRE" => 15, # Left-to-Right Embedding
354 "LRO" => 15, # Left-to-Right Override
355 "RLE" => 15, # Right-to-Left Embedding
356 "RLO" => 15, # Right-to-Left Override
357 "PDF" => 15, # Pop Directional Format
358 "LRI" => 15, # Left-to-Right Isolate
359 "RLI" => 15, # Right-to-Left Isolate
360 "FSI" => 15, # First Strong Isolate
361 "PDI" => 15 # Pop Directional Isolate
364 my %c2_types =
366 "L" => 1, # C2_LEFTTORIGHT
367 "R" => 2, # C2_RIGHTTOLEFT
368 "AL" => 2, # C2_RIGHTTOLEFT
369 "EN" => 3, # C2_EUROPENUMBER
370 "ES" => 4, # C2_EUROPESEPARATOR
371 "ET" => 5, # C2_EUROPETERMINATOR
372 "AN" => 6, # C2_ARABICNUMBER
373 "CS" => 7, # C2_COMMONSEPARATOR
374 "NSM" => 11, # C2_OTHERNEUTRAL
375 "BN" => 0, # C2_NOTAPPLICABLE
376 "B" => 8, # C2_BLOCKSEPARATOR
377 "S" => 9, # C2_SEGMENTSEPARATOR
378 "WS" => 10, # C2_WHITESPACE
379 "ON" => 11, # C2_OTHERNEUTRAL
380 "LRE" => 11, # C2_OTHERNEUTRAL
381 "LRO" => 11, # C2_OTHERNEUTRAL
382 "RLE" => 11, # C2_OTHERNEUTRAL
383 "RLO" => 11, # C2_OTHERNEUTRAL
384 "PDF" => 11, # C2_OTHERNEUTRAL
385 "LRI" => 11, # C2_OTHERNEUTRAL
386 "RLI" => 11, # C2_OTHERNEUTRAL
387 "FSI" => 11, # C2_OTHERNEUTRAL
388 "PDI" => 11 # C2_OTHERNEUTRAL
391 my %bidi_types =
393 "ON" => 0, # Other Neutrals
394 "L" => 1, # Left-to-Right
395 "R" => 2, # Right-to-Left
396 "AN" => 3, # Arabic Number
397 "EN" => 4, # European Number
398 "AL" => 5, # Right-to-Left Arabic
399 "NSM" => 6, # Non-Spacing Mark
400 "CS" => 7, # Common Number Separator
401 "ES" => 8, # European Number Separator
402 "ET" => 9, # European Number Terminator
403 "BN" => 10, # Boundary Neutral
404 "S" => 11, # Segment Separator
405 "WS" => 12, # Whitespace
406 "B" => 13, # Paragraph Separator
407 "RLO" => 14, # Right-to-Left Override
408 "RLE" => 15, # Right-to-Left Embedding
409 "LRO" => 16, # Left-to-Right Override
410 "LRE" => 17, # Left-to-Right Embedding
411 "PDF" => 18, # Pop Directional Format
412 "LRI" => 19, # Left-to-Right Isolate
413 "RLI" => 20, # Right-to-Left Isolate
414 "FSI" => 21, # First Strong Isolate
415 "PDI" => 22 # Pop Directional Isolate
418 my %joining_types =
420 "U" => 0, # Non_Joining
421 "T" => 1, # Transparent
422 "R" => 2, # Right_Joining
423 "L" => 3, # Left_Joining
424 "D" => 4, # Dual_Joining
425 "C" => 5, # Join_Causing
428 my @cp2uni = ();
429 my @glyph2uni = ();
430 my @lead_bytes = ();
431 my @uni2cp = ();
432 my @tolower_table = ();
433 my @toupper_table = ();
434 my @digitmap_table = ();
435 my @category_table = ();
436 my @joining_table = ();
437 my @direction_table = ();
438 my @decomp_table = ();
439 my @combining_class_table = ();
440 my @decomp_compat_table = ();
441 my @comp_exclusions = ();
442 my @idna_decomp_table = ();
443 my @idna_disallowed = ();
444 my %registry_keys;
445 my $default_char;
446 my $default_wchar;
448 my %joining_forms =
450 "isolated" => [],
451 "final" => [],
452 "initial" => [],
453 "medial" => []
456 sub to_utf16(@)
458 my @ret;
459 foreach my $ch (@_)
461 if ($ch < 0x10000)
463 push @ret, $ch;
465 else
467 my $val = $ch - 0x10000;
468 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
471 return @ret;
474 ################################################################
475 # fetch a unicode.org file and open it
476 sub open_data_file($$)
478 my ($base, $name) = @_;
479 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
480 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
481 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
482 local *FILE;
484 if ($base =~ /.*\/([^\/]+)\.zip$/)
486 my $zip = "$1$suffix.zip";
487 unless (-f "$cache/$zip")
489 system "mkdir", "-p", $cache;
490 print "Fetching $base...\n";
491 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
493 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
495 else
497 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
498 unless (-f $dest)
500 system "mkdir", "-p", $dir;
501 print "Fetching $base/$name...\n";
502 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
504 open FILE, "<$dest" or die "cannot open $dest";
506 return *FILE;
509 ################################################################
510 # recursively get the decomposition for a character
511 sub get_decomposition($$);
512 sub get_decomposition($$)
514 my ($char, $table) = @_;
515 my @ret;
517 return $char unless defined $table->[$char];
518 foreach my $ch (@{$table->[$char]})
520 push @ret, get_decomposition( $ch, $table );
522 return @ret;
525 ################################################################
526 # get the composition that results in a given character
527 sub get_composition($$)
529 my ($ch, $compat) = @_;
530 return () unless defined $decomp_table[$ch]; # no decomposition
531 my @ret = @{$decomp_table[$ch]};
532 return () if @ret < 2; # singleton decomposition
533 return () if $comp_exclusions[$ch]; # composition exclusion
534 return () if $combining_class_table[$ch]; # non-starter
535 return () if $combining_class_table[$ret[0]]; # first char is non-starter
536 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
537 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
538 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
539 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
540 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
541 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
542 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
543 return @ret;
546 ################################################################
547 # recursively build decompositions
548 sub build_decompositions(@)
550 my @src = @_;
551 my @dst;
553 for (my $i = 0; $i < @src; $i++)
555 next unless defined $src[$i];
556 my @decomp = to_utf16( get_decomposition( $i, \@src ));
557 $dst[$i] = \@decomp;
559 return @dst;
562 ################################################################
563 # compose Hangul sequences
564 sub compose_hangul(@)
566 my $SBASE = 0xac00;
567 my $LBASE = 0x1100;
568 my $VBASE = 0x1161;
569 my $TBASE = 0x11a7;
570 my $LCOUNT = 19;
571 my $VCOUNT = 21;
572 my $TCOUNT = 28;
573 my $NCOUNT = $VCOUNT * $TCOUNT;
574 my $SCOUNT = $LCOUNT * $NCOUNT;
576 my @seq = @_;
577 my @ret;
578 my $i;
580 for ($i = 0; $i < @seq; $i++)
582 my $ch = $seq[$i];
583 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
584 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
586 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
587 $i++;
589 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
590 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
592 $ch += $seq[$i+1] - $TBASE;
593 $i++;
595 push @ret, $ch;
597 return @ret;
600 ################################################################
601 # remove linguistic-only mappings from the case table
602 sub remove_linguistic_mappings($$)
604 my ($upper, $lower) = @_;
606 # remove case mappings that don't round-trip
608 for (my $i = 0; $i < @{$upper}; $i++)
610 next unless defined ${$upper}[$i];
611 my $ch = ${$upper}[$i];
612 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
614 for (my $i = 0; $i < @{$lower}; $i++)
616 next unless defined ${$lower}[$i];
617 my $ch = ${$lower}[$i];
618 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
622 ################################################################
623 # read in the Unicode database files
624 sub load_data()
626 my $start;
628 # now build mappings from the decomposition field of the Unicode database
630 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
631 while (<$UNICODE_DATA>)
633 # Decode the fields ...
634 my ($code, $name, $cat, $comb, $bidi,
635 $decomp, $dec, $dig, $num, $mirror,
636 $oldname, $comment, $upper, $lower, $title) = split /;/;
637 my $src = hex $code;
639 die "unknown category $cat" unless defined $categories{$cat};
640 die "unknown directionality $bidi" unless defined $directions{$bidi};
642 $category_table[$src] = $categories{$cat};
643 $direction_table[$src] = $bidi;
644 $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
646 if ($lower ne "")
648 $tolower_table[$src] = hex $lower;
650 if ($upper ne "")
652 $toupper_table[$src] = hex $upper;
654 if ($dec ne "")
656 $category_table[$src] |= $ctype{"digit"};
658 if ($dig ne "")
660 $digitmap_table[$src] = ord $dig;
662 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
664 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
665 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
666 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
667 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
668 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
669 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
670 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
671 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
672 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
673 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
674 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
675 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
677 # copy the category and direction for everything between First/Last pairs
678 if ($name =~ /, First>/) { $start = $src; }
679 if ($name =~ /, Last>/)
681 while ($start < $src)
683 $category_table[$start] = $category_table[$src];
684 $direction_table[$start] = $direction_table[$src];
685 $combining_class_table[$start] = $combining_class_table[$src];
686 $start++;
690 next if $decomp eq ""; # no decomposition, skip it
692 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
694 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
695 $decomp_compat_table[$src] = \@seq;
698 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
700 # decomposition of the form "<foo> 1234" -> use char if type is known
701 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
703 ${joining_forms{$1}}[hex $2] = $src;
706 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
708 # decomposition "<compat> 0020 1234" -> combining accent
710 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
712 # store decomposition
713 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
715 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
717 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
719 # Single char decomposition
720 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
724 close $UNICODE_DATA;
726 # patch the category of some special characters
728 for (my $i = 0; $i < @decomp_table; $i++)
730 next unless defined $decomp_table[$i];
731 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
733 foreach my $cat (keys %special_categories)
735 my $flag = $ctype{$cat};
736 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
738 for (my $i = 0; $i < @decomp_compat_table; $i++)
740 next unless defined $decomp_compat_table[$i];
741 next unless @{$decomp_compat_table[$i]} == 2;
742 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
745 # load the composition exclusions
747 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
748 while (<$EXCL>)
750 s/\#.*//; # remove comments
751 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
753 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
755 elsif (/^([0-9a-fA-F]+)\s*$/)
757 $comp_exclusions[hex $1] = 1;
760 close $EXCL;
762 # load the IDNA mappings
764 @idna_decomp_table = @decomp_compat_table;
765 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
766 while (<$IDNA>)
768 s/\#.*//; # remove comments
769 next if /^\s*$/;
770 my ($char, $type, $mapping) = split /;/;
771 my ($ch1, $ch2);
772 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
774 $ch1 = hex $1;
775 $ch2 = hex $2;
777 elsif ($char =~ /([0-9a-fA-F]+)/)
779 $ch1 = $ch2 = hex $1;
782 if ($type =~ /mapped/ || $type =~ /deviation/)
784 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
785 my @seq = map { hex $_; } split /\s+/, $mapping;
786 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
788 elsif ($type =~ /valid/)
791 elsif ($type =~ /ignored/)
793 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
795 elsif ($type =~ /disallowed/)
797 foreach my $i ($ch1 .. $ch2)
799 $idna_decomp_table[$i] = undef;
800 $idna_disallowed[$i] = 1;
804 close $IDNA;
808 ################################################################
809 # add a new registry key
810 sub add_registry_key($$)
812 my ($key, $defval) = @_;
813 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
816 ################################################################
817 # add a new registry value
818 sub add_registry_value($$$)
820 my ($key, $name, $value) = @_;
821 add_registry_key( $key, undef );
822 push @{$registry_keys{$key}}, "'$name' = s '$value'";
825 ################################################################
826 # define a new lead byte
827 sub add_lead_byte($)
829 my $ch = shift;
830 return if defined $cp2uni[$ch];
831 push @lead_bytes, $ch;
832 $cp2uni[$ch] = 0;
835 ################################################################
836 # define a new char mapping
837 sub add_mapping($$)
839 my ($cp, $uni) = @_;
840 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
841 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
842 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
845 ################################################################
846 # get a mapping including glyph chars for MB_USEGLYPHCHARS
847 sub get_glyphs_mapping(@)
849 my @table = @_;
851 for (my $i = 0; $i < @glyph2uni; $i++)
853 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
855 return @table;
858 ################################################################
859 # build EUC-JP table from the JIS 0208/0212 files
860 sub dump_eucjp_codepage()
862 @cp2uni = ();
863 @glyph2uni = ();
864 @lead_bytes = ();
865 @uni2cp = ();
866 $default_char = $DEF_CHAR;
867 $default_wchar = 0x30fb;
869 # ASCII chars
870 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
872 # lead bytes
873 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
875 # JIS X 0201 right plane
876 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
878 # undefined chars
879 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
880 $cp2uni[0xa0] = 0xf8f0;
881 $cp2uni[0xff] = 0xf8f3;
883 # Fix backslash conversion
884 add_mapping( 0xa1c0, 0xff3c );
886 # Add private mappings for rows undefined in JIS 0208/0212
887 my $private = 0xe000;
888 foreach my $hi (0xf5 .. 0xfe)
890 foreach my $lo (0xa1 .. 0xfe)
892 add_mapping( ($hi << 8) + $lo, $private++ );
895 foreach my $hi (0xf5 .. 0xfe)
897 foreach my $lo (0x21 .. 0x7e)
899 add_mapping( ($hi << 8) + $lo, $private++ );
903 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
904 while (<$INPUT>)
906 next if /^\#/; # skip comments
907 next if /^$/; # skip empty lines
908 next if /\x1a/; # skip ^Z
909 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
911 add_mapping( 0x8080 + hex $1, hex $2 );
912 next;
914 die "Unrecognized line $_\n";
916 close $INPUT;
918 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
919 while (<$INPUT>)
921 next if /^\#/; # skip comments
922 next if /^$/; # skip empty lines
923 next if /\x1a/; # skip ^Z
924 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
926 add_mapping( 0x8000 + hex $1, hex $2 );
927 next;
929 die "Unrecognized line $_\n";
931 close $INPUT;
933 output_codepage_file( 20932 );
937 ################################################################
938 # build the sort keys table
939 sub dump_sortkeys($)
941 my $filename = shift;
942 my @sortkeys = ();
944 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
945 while (<$INPUT>)
947 next if /^\#/; # skip comments
948 next if /^$/; # skip empty lines
949 next if /\x1a/; # skip ^Z
950 next if /^\@version/; # skip @version header
951 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
953 my ($uni,$variable) = (hex $1, $2);
954 next if $uni > 65535;
955 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
956 next;
958 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
960 # multiple character sequence, ignored for now
961 next;
963 die "$SORTKEYS: Unrecognized line $_\n";
965 close $INPUT;
967 # compress the keys to 32 bit:
968 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
970 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
971 ${$a}[2] <=> ${$b}[2] or
972 ${$a}[3] <=> ${$b}[3] or
973 ${$a}[4] <=> ${$b}[4] or
974 $a cmp $b; } @sortkeys;
976 my ($n2, $n3) = (1, 1);
977 my @keys = (-1, -1, -1, -1, -1 );
978 my @flatkeys = ();
980 for (my $i = 0; $i < @sortkeys; $i++)
982 next unless defined $sortkeys[$i];
983 my @current = @{$sortkeys[$i]};
984 if ($current[1] == $keys[1])
986 if ($current[2] == $keys[2])
988 if ($current[3] == $keys[3])
990 # nothing
992 else
994 $keys[3] = $current[3];
995 $n3++;
996 die if ($n3 >= 16);
999 else
1001 $keys[2] = $current[2];
1002 $keys[3] = $current[3];
1003 $n2++;
1004 $n3 = 1;
1005 die if ($n2 >= 256);
1008 else
1010 $keys[1] = $current[1];
1011 $keys[2] = $current[2];
1012 $keys[3] = $current[3];
1013 $n2 = 1;
1014 $n3 = 1;
1017 if ($current[2]) { $current[2] = $n2; }
1018 if ($current[3]) { $current[3] = $n3; }
1019 if ($current[4]) { $current[4] = 1; }
1021 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1024 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1025 printf "Building $filename\n";
1026 printf OUTPUT "/* Unicode collation element table */\n";
1027 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1028 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1029 print OUTPUT "#include \"windef.h\"\n\n";
1031 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1033 close OUTPUT;
1034 save_file($filename);
1038 ################################################################
1039 # dump an array of integers
1040 sub dump_array($$@)
1042 my ($bit_width, $default, @array) = @_;
1043 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1044 my $i;
1045 my $ret = " ";
1046 for ($i = 0; $i < $#array; $i++)
1048 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1049 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1051 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1052 return $ret;
1056 ################################################################
1057 # dump an SBCS mapping table in binary format
1058 sub dump_binary_sbcs_table($)
1060 my $codepage = shift;
1062 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1063 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1065 print OUTPUT pack "S<*", @header;
1066 print OUTPUT pack "C12", (0) x 12;
1067 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1069 if (@glyph2uni)
1071 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1073 else
1075 print OUTPUT pack "S<*", 0;
1078 print OUTPUT pack "S<*", 0, 0;
1080 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1084 ################################################################
1085 # dump a DBCS mapping table in binary format
1086 sub dump_binary_dbcs_table($)
1088 my $codepage = shift;
1089 my @lb_ranges = get_lb_ranges();
1090 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1092 my @offsets = (0) x 256;
1093 my $pos = 0;
1094 foreach my $i (@lead_bytes)
1096 $offsets[$i] = ($pos += 256);
1097 $cp2uni[$i] = 0;
1100 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1102 print OUTPUT pack "S<*", @header;
1103 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1104 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1105 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1107 foreach my $i (@lead_bytes)
1109 my $base = $i << 8;
1110 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1113 print OUTPUT pack "S<", 4;
1114 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1118 ################################################################
1119 # get the list of defined lead byte ranges
1120 sub get_lb_ranges()
1122 my @list = ();
1123 my @ranges = ();
1125 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1126 my $on = 0;
1127 for (my $i = 0; $i < 256; $i++)
1129 if ($on)
1131 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1133 else
1135 if ($list[$i]) { push @ranges, $i; $on = 1; }
1138 if ($on) { push @ranges, 0xff; }
1139 return @ranges;
1142 ################################################################
1143 # dump the Indic Syllabic Category table
1144 sub dump_indic($)
1146 my $filename = shift;
1147 my @indic_table;
1149 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1150 while (<$INPUT>)
1152 next if /^\#/; # skip comments
1153 next if /^\s*$/; # skip empty lines
1154 next if /\x1a/; # skip ^Z
1155 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1157 my $type = $2;
1158 die "unknown indic $type" unless defined $indic_types{$type};
1159 if (hex $1 < 65536)
1161 $indic_table[hex $1] = $indic_types{$type};
1163 next;
1165 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1167 my $type = $3;
1168 die "unknown indic $type" unless defined $indic_types{$type};
1169 if (hex $1 < 65536 and hex $2 < 65536)
1171 foreach my $i (hex $1 .. hex $2)
1173 $indic_table[$i] = $indic_types{$type};
1176 next;
1178 die "malformed line $_";
1180 close $INPUT;
1182 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1183 while (<$INPUT>)
1185 next if /^\#/; # skip comments
1186 next if /^\s*$/; # skip empty lines
1187 next if /\x1a/; # skip ^Z
1188 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1190 my $type = $2;
1191 die "unknown matra $type" unless defined $matra_types{$type};
1192 $indic_table[hex $1] |= $matra_types{$type} << 8;
1193 next;
1195 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1197 my $type = $3;
1198 die "unknown matra $type" unless defined $matra_types{$type};
1199 foreach my $i (hex $1 .. hex $2)
1201 $indic_table[$i] |= $matra_types{$type} << 8;
1203 next;
1205 die "malformed line $_";
1207 close $INPUT;
1209 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1210 print "Building $filename\n";
1211 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1212 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1213 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1214 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1215 print OUTPUT "#include \"windef.h\"\n\n";
1217 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1219 close OUTPUT;
1220 save_file($filename);
1223 ################################################################
1224 # dump the Line Break Properties table
1225 sub dump_linebreak($)
1227 my $filename = shift;
1228 my @break_table;
1229 my $next_group = 0;
1231 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1232 while (<$INPUT>)
1234 next if /^\#/; # skip comments
1235 next if /^\s*$/; # skip empty lines
1236 next if /\x1a/; # skip ^Z
1237 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1239 my $type = $2;
1240 die "unknown breaktype $type" unless defined $break_types{$type};
1241 $break_table[hex $1] = $break_types{$type};
1242 next;
1244 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1246 my $type = $3;
1247 die "unknown breaktype $type" unless defined $break_types{$type};
1248 foreach my $i (hex $1 .. hex $2)
1250 $break_table[$i] = $break_types{$type};
1252 next;
1254 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1256 my $type = $2;
1257 die "unknown breaktype $type" unless defined $break_types{$type};
1258 $break_table[hex $1] = $break_types{$type};
1259 next;
1261 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1263 my $type = $3;
1264 die "unknown breaktype $type" unless defined $break_types{$type};
1265 foreach my $i (hex $1 .. hex $2)
1267 $break_table[$i] = $break_types{$type};
1269 next;
1271 die "malformed line $_";
1273 close $INPUT;
1275 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1276 print "Building $filename\n";
1277 print OUTPUT "/* Unicode Line Break Properties */\n";
1278 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1279 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1280 print OUTPUT "#include \"windef.h\"\n\n";
1282 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1284 close OUTPUT;
1285 save_file($filename);
1288 my %scripts =
1290 "Unknown" => 0,
1291 "Common" => 1,
1292 "Inherited" => 2,
1293 "Arabic" => 3,
1294 "Armenian" => 4,
1295 "Avestan" => 5,
1296 "Balinese" => 6,
1297 "Bamum" => 7,
1298 "Batak" => 8,
1299 "Bengali" => 9,
1300 "Bopomofo" => 10,
1301 "Brahmi" => 11,
1302 "Braille" => 12,
1303 "Buginese" => 13,
1304 "Buhid" => 14,
1305 "Canadian_Aboriginal" => 15,
1306 "Carian" => 16,
1307 "Cham" => 17,
1308 "Cherokee" => 18,
1309 "Coptic" => 19,
1310 "Cuneiform" => 20,
1311 "Cypriot" => 21,
1312 "Cyrillic" => 22,
1313 "Deseret" => 23,
1314 "Devanagari" => 24,
1315 "Egyptian_Hieroglyphs" => 25,
1316 "Ethiopic" => 26,
1317 "Georgian" => 27,
1318 "Glagolitic" => 28,
1319 "Gothic" => 29,
1320 "Greek" => 30,
1321 "Gujarati" => 31,
1322 "Gurmukhi" => 32,
1323 "Han" => 33,
1324 "Hangul" => 34,
1325 "Hanunoo" => 35,
1326 "Hebrew" => 36,
1327 "Hiragana" => 37,
1328 "Imperial_Aramaic" => 38,
1329 "Inscriptional_Pahlavi" => 39,
1330 "Inscriptional_Parthian" => 40,
1331 "Javanese" => 41,
1332 "Kaithi" => 42,
1333 "Kannada" => 43,
1334 "Katakana" => 44,
1335 "Kayah_Li" => 45,
1336 "Kharoshthi" => 46,
1337 "Khmer" => 47,
1338 "Lao" => 48,
1339 "Latin" => 49,
1340 "Lepcha" => 50,
1341 "Limbu" => 51,
1342 "Linear_B" => 52,
1343 "Lisu" => 53,
1344 "Lycian" => 54,
1345 "Lydian" => 55,
1346 "Malayalam" => 56,
1347 "Mandaic" => 57,
1348 "Meetei_Mayek" => 58,
1349 "Mongolian" => 59,
1350 "Myanmar" => 60,
1351 "New_Tai_Lue" => 61,
1352 "Nko" => 62,
1353 "Ogham" => 63,
1354 "Ol_Chiki" => 64,
1355 "Old_Italic" => 65,
1356 "Old_Persian" => 66,
1357 "Old_South_Arabian" => 67,
1358 "Old_Turkic" => 68,
1359 "Oriya" => 69,
1360 "Osmanya" => 70,
1361 "Phags_Pa" => 71,
1362 "Phoenician" => 72,
1363 "Rejang" => 73,
1364 "Runic" => 74,
1365 "Samaritan" => 75,
1366 "Saurashtra" => 76,
1367 "Shavian" => 77,
1368 "Sinhala" => 78,
1369 "Sundanese" => 79,
1370 "Syloti_Nagri" => 80,
1371 "Syriac" => 81,
1372 "Tagalog" => 82,
1373 "Tagbanwa" => 83,
1374 "Tai_Le" => 84,
1375 "Tai_Tham" => 85,
1376 "Tai_Viet" => 86,
1377 "Tamil" => 87,
1378 "Telugu" => 88,
1379 "Thaana" => 89,
1380 "Thai" => 90,
1381 "Tibetan" => 91,
1382 "Tifinagh" => 92,
1383 "Ugaritic" => 93,
1384 "Vai" => 94,
1385 "Yi" => 95,
1386 # Win8/Win8.1
1387 "Chakma" => 96,
1388 "Meroitic_Cursive" => 97,
1389 "Meroitic_Hieroglyphs" => 98,
1390 "Miao" => 99,
1391 "Sharada" => 100,
1392 "Sora_Sompeng" => 101,
1393 "Takri" => 102,
1394 # Win10
1395 "Bassa_Vah" => 103,
1396 "Caucasian_Albanian" => 104,
1397 "Duployan" => 105,
1398 "Elbasan" => 106,
1399 "Grantha" => 107,
1400 "Khojki" => 108,
1401 "Khudawadi" => 109,
1402 "Linear_A" => 110,
1403 "Mahajani" => 111,
1404 "Manichaean" => 112,
1405 "Mende_Kikakui" => 113,
1406 "Modi" => 114,
1407 "Mro" => 115,
1408 "Nabataean" => 116,
1409 "Old_North_Arabian" => 117,
1410 "Old_Permic" => 118,
1411 "Pahawh_Hmong" => 119,
1412 "Palmyrene" => 120,
1413 "Pau_Cin_Hau" => 121,
1414 "Psalter_Pahlavi" => 122,
1415 "Siddham" => 123,
1416 "Tirhuta" => 124,
1417 "Warang_Citi" => 125,
1418 # Win10 RS1
1419 "Adlam" => 126,
1420 "Ahom" => 127,
1421 "Anatolian_Hieroglyphs" => 128,
1422 "Bhaiksuki" => 129,
1423 "Hatran" => 130,
1424 "Marchen" => 131,
1425 "Multani" => 132,
1426 "Newa" => 133,
1427 "Old_Hungarian" => 134,
1428 "Osage" => 135,
1429 "SignWriting" => 136,
1430 "Tangut" => 137,
1431 # Win10 RS4
1432 "Masaram_Gondi" => 138,
1433 "Nushu" => 139,
1434 "Soyombo" => 140,
1435 "Zanabazar_Square" => 141,
1436 # Win10 1903
1437 "Dogra" => 142,
1438 "Gunjala_Gondi" => 143,
1439 "Hanifi_Rohingya" => 144,
1440 "Makasar" => 145,
1441 "Medefaidrin" => 146,
1442 "Old_Sogdian" => 147,
1443 "Sogdian" => 148,
1444 # Win10 2004
1445 "Elymaic" => 149,
1446 "Nyiakeng_Puachue_Hmong" => 150,
1447 "Nandinagari" => 151,
1448 "Wancho" => 152,
1451 ################################################################
1452 # dump Script IDs table
1453 sub dump_scripts($)
1455 my $filename = shift;
1456 my $header = $filename;
1457 my @scripts_table;
1458 my $script_index;
1459 my $i;
1461 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1462 # Fill the table
1463 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1464 while (<$INPUT>)
1466 my $type = "";
1468 next if /^\#/; # skip comments
1469 next if /^\s*$/; # skip empty lines
1470 next if /\x1a/; # skip ^Z
1471 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1473 $type = $2;
1474 if (defined $scripts{$type})
1476 $scripts_table[hex $1] = $scripts{$type};
1478 next;
1480 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1482 $type = $3;
1483 if (defined $scripts{$type})
1485 foreach my $i (hex $1 .. hex $2)
1487 $scripts_table[$i] = $scripts{$type};
1490 next;
1494 close $INPUT;
1496 $header = "$filename.h";
1497 open OUTPUT,">$header.new" or die "Cannot create $header";
1498 print "Building $header\n";
1499 print OUTPUT "/* Unicode Script IDs */\n";
1500 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1501 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1503 print OUTPUT "enum unicode_script_id {\n";
1504 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1506 print OUTPUT " Script_$script = $scripts{$script},\n";
1508 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1509 print OUTPUT "};\n";
1511 close OUTPUT;
1512 save_file($header);
1514 $filename = "$filename.c";
1515 open OUTPUT,">$filename.new" or die "Cannot create $header";
1516 print "Building $filename\n";
1517 print OUTPUT "/* Unicode Script IDs */\n";
1518 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1519 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1520 print OUTPUT "#include \"windef.h\"\n\n";
1522 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1523 close OUTPUT;
1524 save_file($filename);
1527 ################################################################
1528 # dump the BiDi mirroring table
1529 sub dump_mirroring($)
1531 my $filename = shift;
1532 my @mirror_table = ();
1534 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1535 while (<$INPUT>)
1537 next if /^\#/; # skip comments
1538 next if /^$/; # skip empty lines
1539 next if /\x1a/; # skip ^Z
1540 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1542 $mirror_table[hex $1] = hex $2;
1543 next;
1545 die "malformed line $_";
1547 close $INPUT;
1549 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1550 print "Building $filename\n";
1551 print OUTPUT "/* Unicode BiDi mirroring */\n";
1552 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1553 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1554 print OUTPUT "#include \"windef.h\"\n\n";
1555 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1556 close OUTPUT;
1557 save_file($filename);
1560 ################################################################
1561 # dump the Bidi Brackets
1562 sub dump_bracket($)
1564 my $filename = shift;
1565 my @bracket_table;
1567 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1568 while (<$INPUT>)
1570 next if /^\#/; # skip comments
1571 next if /^\s*$/; # skip empty lines
1572 next if /\x1a/; # skip ^Z
1573 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1575 my $type = $3;
1576 die "unknown bracket $type" unless defined $bracket_types{$type};
1577 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1578 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1579 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1580 next;
1582 die "malformed line $_";
1584 close $INPUT;
1586 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1587 print "Building $filename\n";
1588 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1589 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1590 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1591 print OUTPUT "#include \"windef.h\"\n\n";
1593 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1595 close OUTPUT;
1596 save_file($filename);
1599 ################################################################
1600 # dump the Arabic shaping table
1601 sub dump_shaping($)
1603 my $filename = shift;
1604 my %groups;
1605 my $next_group = 0;
1607 $groups{"No_Joining_Group"} = $next_group++;
1609 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1610 while (<$INPUT>)
1612 next if /^\#/; # skip comments
1613 next if /^\s*$/; # skip empty lines
1614 next if /\x1a/; # skip ^Z
1615 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1617 my $type = $2;
1618 my $group = $3;
1619 $groups{$group} = $next_group++ unless defined $groups{$group};
1620 $joining_table[hex $1] = $joining_types{$type} | ($groups{$group} << 8);
1621 next;
1623 die "malformed line $_";
1625 close $INPUT;
1627 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1628 print "Building $filename\n";
1629 print OUTPUT "/* Unicode Arabic shaping */\n";
1630 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1631 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1632 print OUTPUT "#include \"windef.h\"\n\n";
1634 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1636 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1637 for (my $i = 0x600; $i <= 0x6ff; $i++)
1639 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1640 ${joining_forms{"isolated"}}[$i] || $i,
1641 ${joining_forms{"final"}}[$i] || $i,
1642 ${joining_forms{"initial"}}[$i] || $i,
1643 ${joining_forms{"medial"}}[$i] || $i;
1645 print OUTPUT "};\n";
1647 close OUTPUT;
1648 save_file($filename);
1651 ################################################################
1652 # dump the Vertical Orientation table
1653 sub dump_vertical($)
1655 my $filename = shift;
1656 my @vertical_table;
1658 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1659 while (<$INPUT>)
1661 next if /^\#/; # skip comments
1662 next if /^\s*$/; # skip empty lines
1663 next if /\x1a/; # skip ^Z
1664 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1666 my $type = $2;
1667 die "unknown vertical $type" unless defined $vertical_types{$type};
1668 if (hex $1 < 65536)
1670 $vertical_table[hex $1] = $vertical_types{$type};
1672 next;
1674 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1676 my $type = $3;
1677 die "unknown vertical $type" unless defined $vertical_types{$type};
1678 foreach my $i (hex $1 .. hex $2)
1680 $vertical_table[$i] = $vertical_types{$type};
1682 next;
1684 die "malformed line $_";
1686 close $INPUT;
1688 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1689 print "Building $filename\n";
1690 print OUTPUT "/* Unicode Vertical Orientation */\n";
1691 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1692 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1693 print OUTPUT "#include \"windef.h\"\n\n";
1695 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1697 close OUTPUT;
1698 save_file($filename);
1701 ################################################################
1702 # dump the digit folding tables
1703 sub dump_digit_folding($)
1705 my ($filename) = shift;
1706 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1707 print "Building $filename\n";
1708 print OUTPUT "/* Unicode digit folding mappings */\n";
1709 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1710 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1711 print OUTPUT "#include \"windef.h\"\n\n";
1713 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1714 close OUTPUT;
1715 save_file($filename);
1719 ################################################################
1720 # dump the case mapping tables
1721 sub dump_case_mappings($)
1723 my $filename = shift;
1724 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1725 print "Building $filename\n";
1726 print OUTPUT "/* Unicode case mappings */\n";
1727 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1728 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1729 print OUTPUT "#include \"windef.h\"\n\n";
1731 my @upper = @toupper_table;
1732 my @lower = @tolower_table;
1733 remove_linguistic_mappings( \@upper, \@lower );
1735 dump_case_table( "wine_casemap_lower", @lower );
1736 print OUTPUT "\n";
1737 dump_case_table( "wine_casemap_upper", @upper );
1738 close OUTPUT;
1739 save_file($filename);
1743 ################################################################
1744 # dump a case mapping table
1745 sub dump_case_table($@)
1747 my ($name,@table) = @_;
1749 for (my $i = 0; $i < 65536; $i++)
1751 next unless defined $table[$i];
1752 $table[$i] = ($table[$i] - $i) & 0xffff;
1755 my @array = compress_array( 256, 0, @table[0..65535] );
1757 printf OUTPUT "const WCHAR %s[%d] =\n", $name, scalar @array;
1758 printf OUTPUT "{\n /* index */\n";
1759 printf OUTPUT "%s,\n", dump_array( 16, 0, @array[0..255] );
1760 printf OUTPUT " /* data */\n";
1761 printf OUTPUT "%s", dump_array( 16, 0, @array[256..$#array] );
1762 printf OUTPUT "\n};\n";
1765 ################################################################
1766 # compress a mapping table by removing identical rows
1767 sub compress_array($$@)
1769 my $rows = shift;
1770 my $def = shift;
1771 my @table = @_;
1772 my $len = @table / $rows;
1773 my @array;
1774 my $data = "";
1776 # try to merge table rows
1777 for (my $row = 0; $row < $rows; $row++)
1779 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1780 my $pos = index $data, $rowtxt;
1781 if ($pos == -1)
1783 # check if the tail of the data can match the start of the new row
1784 my $first = substr( $rowtxt, 0, 1 );
1785 for (my $i = length($data) - 1; $i > 0; $i--)
1787 $pos = index( substr( $data, -$i ), $first );
1788 last if $pos == -1;
1789 $i -= $pos;
1790 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1791 substr( $data, -$i ) = "";
1792 last;
1794 $pos = length $data;
1795 $data .= $rowtxt;
1797 $array[$row] = $rows + $pos;
1799 return @array, unpack "U*", $data;
1802 ################################################################
1803 # dump a simple char -> 16-bit value mapping table
1804 sub dump_simple_mapping($$@)
1806 my $name = shift;
1807 my $def = shift;
1808 my @array = compress_array( 256, $def, @_[0..65535] );
1810 printf OUTPUT "const unsigned short %s[%d] =\n{\n", $name, $#array+1;
1811 printf OUTPUT " /* offsets */\n%s,\n", dump_array( 16, 0, @array[0..255] );
1812 printf OUTPUT " /* values */\n%s\n};\n", dump_array( 16, 0, @array[256..$#array] );
1815 ################################################################
1816 # dump a char -> 16-bit value mapping table using two-level tables
1817 sub dump_two_level_mapping($$@)
1819 my $name = shift;
1820 my $def = shift;
1821 my $size = shift;
1822 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1823 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1824 my @array = compress_array( 256, 0, @row_array[0..4095] );
1826 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1828 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1829 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1830 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1831 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1834 ################################################################
1835 # dump a char -> value mapping table using three-level tables
1836 sub dump_three_level_mapping($$@)
1838 my $name = shift;
1839 my $def = shift;
1840 my $size = shift;
1841 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1842 my $level3 = ($MAX_CHAR + 1) / 16;
1843 my $level2 = $level3 / 16;
1844 my $level1 = $level2 / 16;
1845 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1846 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1847 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1849 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1850 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1852 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1853 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1854 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1855 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1856 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1859 ################################################################
1860 # dump a binary case mapping table in l_intl.nls format
1861 sub dump_binary_case_table(@)
1863 my (@table) = @_;
1864 my $max_char = 0x10000;
1865 my $level1 = $max_char / 16;
1866 my $level2 = $level1 / 16;
1868 my @difftable;
1869 for (my $i = 0; $i < @table; $i++)
1871 next unless defined $table[$i];
1872 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1875 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1876 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1877 my $offset = @array - $level1;
1878 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1879 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1882 ################################################################
1883 # dump case mappings for l_intl.nls
1884 sub dump_intl_nls($)
1886 my @upper_table = @toupper_table;
1887 my @lower_table = @tolower_table;
1888 remove_linguistic_mappings( \@upper_table, \@lower_table );
1890 my $upper = dump_binary_case_table( @upper_table );
1891 my $lower = dump_binary_case_table( @lower_table );
1893 my $filename = shift;
1894 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1895 printf "Building $filename\n";
1897 binmode OUTPUT;
1898 print OUTPUT pack "S<", 1; # version
1899 print OUTPUT $upper;
1900 print OUTPUT $lower;
1901 close OUTPUT;
1902 save_file($filename);
1906 ################################################################
1907 # dump the bidi direction table
1908 sub dump_bidi_dir_table($)
1910 my $filename = shift;
1911 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1912 printf "Building $filename\n";
1913 printf OUTPUT "/* Unicode BiDi direction table */\n";
1914 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1915 printf OUTPUT "#include \"windef.h\"\n\n";
1917 my @table;
1919 for (my $i = 0; $i < 65536; $i++)
1921 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1924 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1926 close OUTPUT;
1927 save_file($filename);
1931 ################################################################
1932 # dump the ctype tables
1933 sub dump_ctype_tables($)
1935 my $filename = shift;
1936 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1937 printf "Building $filename\n";
1938 printf OUTPUT "/* Unicode ctype tables */\n";
1939 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1940 printf OUTPUT "#include \"windef.h\"\n\n";
1942 my @table = map { ($_ || 0) & 0xffff; } @category_table;
1944 # add the direction in the high 4 bits of the category
1945 for (my $i = 0; $i < 65536; $i++)
1947 $table[$i] |= $directions{$direction_table[$i]} << 12 if defined $direction_table[$i];
1950 dump_simple_mapping( "wine_wctype_table", 0, @table );
1952 close OUTPUT;
1953 save_file($filename);
1957 sub rol($$)
1959 my ($byte, $count) = @_;
1960 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1963 ################################################################
1964 # compress the character properties table
1965 sub compress_char_props_table($@)
1967 my $rows = shift;
1968 my @table = @_;
1969 my $len = @table / $rows;
1970 my $pos = 0;
1971 my @array = (0) x $rows;
1972 my %sequences;
1974 # add some predefined sequences
1975 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1977 # try to merge table rows
1978 for (my $row = 0; $row < $rows; $row++)
1980 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1981 my $rowtxt = pack "L*", @table_row;
1982 if (defined($sequences{$rowtxt}))
1984 # reuse an existing row
1985 $array[$row] = $sequences{$rowtxt};
1987 else
1989 # create a new row
1990 $sequences{$rowtxt} = $array[$row] = ++$pos;
1991 push @array, @table_row;
1994 return @array;
1997 ################################################################
1998 # dump a normalization table in binary format
1999 sub dump_norm_table($)
2001 my $filename = shift;
2003 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2004 my %decomp = ( "nfc" => \@decomp_table,
2005 "nfd" => \@decomp_table,
2006 "nfkc" => \@decomp_compat_table,
2007 "nfkd" => \@decomp_compat_table ,
2008 "idna" => \@idna_decomp_table );
2010 open OUTPUT,">$filename.new" or die "Cannot create $filename";
2011 print "Building $filename\n";
2013 my $type = $filename;
2014 $type =~ s!.*/norm(\w+)\.nls!$1!;
2016 my $compose = $forms{$type} & 1;
2017 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2019 my @version = split /\./, $UNIVERSION;
2021 # combining classes
2023 my @classes;
2024 my @class_values;
2026 foreach my $c (grep defined, @combining_class_table)
2028 $classes[$c] = 1 if $c < 0x100;
2030 for (my $i = 0; $i < @classes; $i++)
2032 next unless defined $classes[$i];
2033 $classes[$i] = @class_values;
2034 push @class_values, $i;
2036 push @class_values, 0 if (@class_values % 2);
2037 die "too many classes" if @class_values >= 0x40;
2039 # character properties
2041 my @char_props;
2042 my @decomposed;
2043 my @comp_hash_table;
2044 my $comp_hash_size = $compose ? 254 : 0;
2046 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2048 next unless defined $combining_class_table[$i];
2049 if (defined $decomp{$type}->[$i])
2051 my @dec = get_decomposition( $i, $decomp{$type} );
2052 if ($compose && (my @comp = get_composition( $i, $compat )))
2054 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2055 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2057 my $val = 0;
2058 foreach my $d (@dec)
2060 $val = $combining_class_table[$d];
2061 last if $val;
2063 $char_props[$i] = $classes[$val];
2065 else
2067 $char_props[$i] = 0xbf;
2069 @dec = compose_hangul( @dec ) if $compose;
2070 @dec = to_utf16( @dec );
2071 push @dec, 0 if @dec >= 7;
2072 $decomposed[$i] = \@dec;
2074 else
2076 if ($combining_class_table[$i] == 0x100)
2078 $char_props[$i] = 0x7f;
2080 elsif ($combining_class_table[$i])
2082 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2084 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2086 $char_props[$i] = 0xff;
2088 else
2090 $char_props[$i] = 0;
2095 if ($compose)
2097 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2099 my @comp = get_composition( $i, $compat );
2100 next unless @comp;
2101 if ($combining_class_table[$comp[1]])
2103 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2104 $char_props[$comp[1]] |= 0x40;
2106 else
2108 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2109 $char_props[$comp[1]] |= 0xc0;
2114 # surrogates
2115 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2116 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2118 # Hangul
2119 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2120 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2121 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2123 # invalid chars
2124 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2125 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2126 foreach my $i (0x00..0x10)
2128 $char_props[($i << 16) | 0xfffe] = 0xff;
2129 $char_props[($i << 16) | 0xffff] = 0xff;
2132 # decomposition hash table
2134 my @decomp_hash_table;
2135 my @decomp_hash_index;
2136 my @decomp_hash_data;
2137 my $decomp_hash_size = 944;
2139 # build string of character data, reusing substrings when possible
2140 my $decomp_char_data = "";
2141 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2143 my $str = pack "U*", @{$i};
2144 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2146 for (my $i = 0; $i < @decomposed; $i++)
2148 next unless defined $decomposed[$i];
2149 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2150 die "sequence not found" if $pos == -1;
2151 my $len = @{$decomposed[$i]};
2152 $len = 7 if $len > 7;
2153 my $hash = $i % $decomp_hash_size;
2154 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2156 for (my $i = 0; $i < $decomp_hash_size; $i++)
2158 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2159 next unless defined $decomp_hash_table[$i];
2160 if (@{$decomp_hash_table[$i]} == 1)
2162 my $entry = $decomp_hash_table[$i]->[0];
2163 if ($char_props[$entry->[0]] == 0xbf)
2165 $decomp_hash_index[$i] = $entry->[1];
2166 next;
2169 foreach my $entry (@{$decomp_hash_table[$i]})
2171 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2174 push @decomp_hash_data, 0, 0;
2176 # composition hash table
2178 my @comp_hash_index;
2179 my @comp_hash_data;
2180 if (@comp_hash_table)
2182 for (my $i = 0; $i < $comp_hash_size; $i++)
2184 $comp_hash_index[$i] = @comp_hash_data;
2185 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2187 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2188 push @comp_hash_data, 0, 0, 0;
2191 my $level1 = ($MAX_CHAR + 1) / 128;
2192 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2194 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2195 0, $decomp_hash_size, $comp_hash_size, 0 );
2196 my @tables = (0) x 8;
2198 $tables[0] = 16 + @header + @tables;
2199 $tables[1] = $tables[0] + @class_values / 2;
2200 $tables[2] = $tables[1] + $level1 / 2;
2201 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2202 $tables[4] = $tables[3] + @decomp_hash_index;
2203 $tables[5] = $tables[4] + @decomp_hash_data;
2204 $tables[6] = $tables[5] + length $decomp_char_data;
2205 $tables[7] = $tables[6] + @comp_hash_index;
2207 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2208 print OUTPUT pack "S<*", @header;
2209 print OUTPUT pack "S<*", @tables;
2210 print OUTPUT pack "C*", @class_values;
2212 print OUTPUT pack "C*", @rows[0..$level1-1];
2213 print OUTPUT pack "C*", @rows[$level1..$#rows];
2214 print OUTPUT pack "S<*", @decomp_hash_index;
2215 print OUTPUT pack "S<*", @decomp_hash_data;
2216 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2217 print OUTPUT pack "S<*", @comp_hash_index;
2218 print OUTPUT pack "S<*", @comp_hash_data;
2220 close OUTPUT;
2221 save_file($filename);
2223 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2227 ################################################################
2228 # output a codepage definition file from the global tables
2229 sub output_codepage_file($)
2231 my $codepage = shift;
2233 my $output = sprintf "nls/c_%03d.nls", $codepage;
2234 open OUTPUT,">$output.new" or die "Cannot create $output";
2236 printf "Building %s\n", $output;
2237 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2238 else { dump_binary_dbcs_table( $codepage ); }
2240 close OUTPUT;
2241 save_file($output);
2243 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2246 ################################################################
2247 # output a codepage table from a Microsoft-style mapping file
2248 sub dump_msdata_codepage($)
2250 my $filename = shift;
2252 my $state = "";
2253 my ($codepage, $width, $count);
2254 my ($lb_cur, $lb_end);
2256 @cp2uni = ();
2257 @glyph2uni = ();
2258 @lead_bytes = ();
2259 @uni2cp = ();
2260 $default_char = $DEF_CHAR;
2261 $default_wchar = $DEF_CHAR;
2263 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2265 while (<$INPUT>)
2267 next if /^;/; # skip comments
2268 next if /^\s*$/; # skip empty lines
2269 next if /\x1a/; # skip ^Z
2270 last if /^ENDCODEPAGE/;
2272 if (/^CODEPAGE\s+(\d+)/)
2274 $codepage = $1;
2275 next;
2277 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2279 $width = $1;
2280 $default_char = hex $2;
2281 $default_wchar = hex $3;
2282 next;
2284 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2286 $state = $1;
2287 $count = $2;
2288 next;
2290 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2292 if ($state eq "MBTABLE")
2294 my $cp = hex $1;
2295 my $uni = hex $2;
2296 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2297 next;
2299 if ($state eq "GLYPHTABLE")
2301 my $cp = hex $1;
2302 my $uni = hex $2;
2303 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2304 next;
2306 if ($state eq "WCTABLE")
2308 my $uni = hex $1;
2309 my $cp = hex $2;
2310 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2311 next;
2313 if ($state eq "DBCSRANGE")
2315 my $start = hex $1;
2316 my $end = hex $2;
2317 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2318 $lb_cur = $start;
2319 $lb_end = $end;
2320 next;
2322 if ($state eq "DBCSTABLE")
2324 my $mb = hex $1;
2325 my $uni = hex $2;
2326 my $cp = ($lb_cur << 8) | $mb;
2327 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2328 if (!--$count)
2330 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2332 next;
2335 die "$filename: Unrecognized line $_\n";
2337 close $INPUT;
2339 output_codepage_file( $codepage );
2342 ################################################################
2343 # align a string length
2344 sub align_string($$)
2346 my ($align, $str) = @_;
2347 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2348 return $str;
2351 ################################################################
2352 # pack a GUID string
2353 sub pack_guid($)
2355 $_ = shift;
2356 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2357 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2360 ################################################################
2361 # comparison function for compression sort
2362 sub cmp_compression
2364 return scalar @{$a} <=> scalar @{$b} ||
2365 $a->[4] <=> $b->[4] ||
2366 $a->[5] <=> $b->[5] ||
2367 $a->[6] <=> $b->[6] ||
2368 $a->[7] <=> $b->[7] ||
2369 $a->[8] <=> $b->[8] ||
2370 $a->[9] <=> $b->[9] ||
2371 $a->[10] <=> $b->[10] ||
2372 $a->[11] <=> $b->[11] ||
2373 $a->[12] <=> $b->[12];
2376 ################################################################
2377 # build a binary sort keys table
2378 sub dump_sortkey_table($$)
2380 my ($filename, $download) = @_;
2382 my @keys;
2383 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2384 my @multiple_weights;
2385 my @expansions;
2386 my @compressions;
2387 my %exceptions;
2388 my %guids;
2389 my %compr_flags;
2390 my %locales;
2391 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2392 my $jamostr = "";
2394 my $re_hex = '0x[0-9A-Fa-f]+';
2395 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2396 $guids{$default_guid} = { };
2398 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2400 my $KEYS = open_data_file( $MSDATA, $download );
2402 printf "Building $filename\n";
2404 while (<$KEYS>)
2406 s/\s*;.*$//;
2407 next if /^\s*$/; # skip empty lines
2408 if (/^\s*(SORTKEY|SORTTABLES)/)
2410 $part = $1;
2411 next;
2413 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2415 $part = $section = "";
2416 next;
2418 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2420 $section = $1;
2421 $guid = undef;
2422 next;
2424 next unless $part;
2425 if ("$part.$section" eq "SORTKEY.DEFAULT")
2427 if (/^\s*($re_hex)\s+$re_key/)
2429 $keys[hex $1] = [ split(/\s+/,$2) ];
2430 next;
2433 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2435 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2437 $version = hex $1;
2438 next;
2440 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2442 # ignore for now
2443 next;
2446 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2447 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2448 "$part.$section" eq "SORTTABLES.INVERSECASING")
2450 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2452 $guid = lc $1;
2453 $guids{$guid} = { } unless defined $guids{$guid};
2454 $guids{$guid}->{flags} |= $flags{$section};
2455 next;
2457 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2459 $locales{$1} = $guid;
2460 next;
2463 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2465 if (/^\s*(\d+)\s+(\d+)/)
2467 push @multiple_weights, $1, $2;
2468 next;
2471 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2473 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2475 my $pos = scalar @expansions / 2;
2476 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2477 push @expansions, hex $2, hex $3;
2478 next;
2481 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2483 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2485 $keys[hex $1] = $keys[hex $2];
2486 next;
2489 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2491 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2493 if ($subsection || !$guid) # start a new one
2495 $guid = lc $1;
2496 $subsection = "";
2497 $guids{$guid} = { } unless defined $guids{$guid};
2498 $guids{$guid}->{flags} |= $flags{$2} if $2;
2499 $guids{$guid}->{compr} = @compressions;
2500 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2501 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2502 push @compressions, [ ];
2504 else # merge with current one
2506 $guids{lc $1} = { } unless defined $guids{lc $1};
2507 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2508 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2509 $compr_flags{lc $1} = $compr_flags{$guid};
2511 next;
2513 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2515 $locales{$1} = $guid;
2516 next;
2518 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2520 $subsection = $1;
2521 next;
2523 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2525 my @comp = map { hex $_; } split(/\s+/,$1);
2526 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2527 # add compression flags
2528 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2529 next;
2532 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2534 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2536 $guid = lc $1;
2537 $guids{$guid} = { } unless defined $guids{lc $1};
2538 $ling_flag = ($2 ? "+" : "-");
2539 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2540 next;
2542 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2544 $locales{$1} = $guid;
2545 next;
2547 if (/^\s*($re_hex)\s+$re_key/)
2549 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2550 next;
2553 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2555 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2557 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2558 next;
2561 die "$download: $part.$section: unrecognized line $_\n";
2563 close $KEYS;
2565 # Sortkey table
2567 my $table;
2568 for (my $i = 0; $i < 0x10000; $i++)
2570 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2571 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2574 foreach my $id (sort keys %exceptions)
2576 my $pos = length($table) / 4;
2577 my @exc = @{$exceptions{$id}};
2578 my @filled;
2579 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2580 my $guid = substr( $id, 0, -1 );
2581 $guids{$guid}->{$key} = $pos;
2582 $pos += 0x100;
2583 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2584 for (my $j = 0; $j < 0x10000; $j++)
2586 next unless defined $exc[$j] || defined $flags[$j];
2587 $filled[$j >> 8] = 1;
2588 $j |= 0xff;
2590 for (my $j = 0; $j < 0x100; $j++)
2592 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2593 $pos += 0x100 if $filled[$j];
2595 for (my $j = 0; $j < 0x10000; $j++)
2597 next unless $filled[$j >> 8];
2598 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2599 $k[3] |= $flags[$j] || 0;
2600 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2604 # Case mapping tables
2606 # standard table
2607 my @casemaps;
2608 my @upper = @toupper_table;
2609 my @lower = @tolower_table;
2610 remove_linguistic_mappings( \@upper, \@lower );
2611 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2613 # linguistic table
2614 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2616 # Turkish table
2617 @upper = @toupper_table;
2618 @lower = @tolower_table;
2619 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2620 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2621 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2622 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2624 # Char type table
2626 my @table;
2627 my $types = "";
2628 my %typestr;
2629 for (my $i = 0; $i < 0x10000; $i++)
2631 my $str = pack "S<3",
2632 ($category_table[$i] || 0) & 0xffff,
2633 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2634 ($category_table[$i] || 0) >> 16;
2636 if (!defined($typestr{$str}))
2638 $typestr{$str} = length($types) / 6;
2639 $types .= $str;
2641 $table[$i] = $typestr{$str};
2644 my @rows = compress_array( 4096, 0, @table[0..65535] );
2645 my @array = compress_array( 256, 0, @rows[0..4095] );
2646 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2647 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2649 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2650 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2651 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2653 # Sort tables
2655 # guids
2656 my $sorttables = pack "L<2", $version, scalar %guids;
2657 foreach my $id (sort keys %guids)
2659 my %guid = %{$guids{$id}};
2660 my $flags = $guid{flags} || 0;
2661 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2662 $sorttables .= pack_guid($id) . pack "L<5",
2663 $flags,
2664 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2665 $guid{except} || 0,
2666 $guid{ling_except} || 0,
2667 $map / 2;
2670 # expansions
2671 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2673 # compressions
2674 $sorttables .= pack "L<", scalar @compressions;
2675 my $rowstr = "";
2676 foreach my $c (@compressions)
2678 my $pos = length($rowstr) / 2;
2679 my $min = 0xffff;
2680 my $max = 0;
2681 my @lengths = (0) x 8;
2682 foreach my $r (sort cmp_compression @{$c})
2684 my @row = @{$r};
2685 $lengths[scalar @row - 6]++;
2686 foreach my $val (@row[4..$#row])
2688 $min = $val if $min > $val;
2689 $max = $val if $max < $val;
2691 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2692 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2694 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2696 $sorttables .= $rowstr;
2698 # multiple weights
2699 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2701 # jamo sort
2702 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2704 # Locales
2706 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2707 foreach my $loc (sort keys %locales)
2709 # skip specific locales that match more general ones
2710 my @parts = split /[-_]/, $loc;
2711 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2712 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2713 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2716 # File header
2718 my @header;
2719 $header[0] = 16;
2720 $header[1] = $header[0] + length $table;
2721 $header[2] = $header[1] + length $casemaps;
2722 $header[3] = $header[2] + length $chartypes;
2724 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2725 print OUTPUT pack "L<*", @header;
2726 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2727 close OUTPUT;
2728 save_file($filename);
2732 ################################################################
2733 # build the script to create registry keys
2734 sub dump_registry_script($%)
2736 my ($filename, %keys) = @_;
2737 my $indent = 1;
2739 printf "Building %s\n", $filename;
2740 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2741 print OUTPUT "HKLM\n{\n";
2742 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2744 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2745 $indent++;
2747 foreach my $k (sort keys %keys)
2749 my @subkeys = split /\\/, $k;
2750 my ($def, @vals) = @{$keys{$k}};
2751 for (my $i = 0; $i < @subkeys; $i++)
2753 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2754 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2755 $indent++;
2757 foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2758 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2760 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2761 close OUTPUT;
2762 save_file($filename);
2766 ################################################################
2767 # save a file if modified
2768 sub save_file($)
2770 my $file = shift;
2771 if (-f $file && !system "cmp $file $file.new >/dev/null")
2773 unlink "$file.new";
2775 else
2777 rename "$file.new", "$file";
2782 ################################################################
2783 # main routine
2785 chdir ".." if -f "./make_unicode";
2786 load_data();
2787 dump_case_mappings( "libs/port/casemap.c" );
2788 dump_sortkeys( "dlls/kernelbase/collation.c" );
2789 dump_ctype_tables( "libs/port/wctype.c" );
2790 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2791 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2792 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2793 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2794 dump_mirroring( "dlls/dwrite/mirror.c" );
2795 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2796 dump_bracket( "dlls/dwrite/bracket.c" );
2797 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2798 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2799 dump_linebreak( "dlls/dwrite/linebreak.c" );
2800 dump_scripts( "dlls/dwrite/scripts" );
2801 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2802 dump_vertical( "dlls/gdi32/vertical.c" );
2803 dump_vertical( "dlls/wineps.drv/vertical.c" );
2804 dump_intl_nls("nls/l_intl.nls");
2805 dump_norm_table( "nls/normnfc.nls" );
2806 dump_norm_table( "nls/normnfd.nls" );
2807 dump_norm_table( "nls/normnfkc.nls" );
2808 dump_norm_table( "nls/normnfkd.nls" );
2809 dump_norm_table( "nls/normidna.nls" );
2810 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2811 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2812 dump_eucjp_codepage();
2813 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2815 exit 0;
2817 # Local Variables:
2818 # compile-command: "./make_unicode"
2819 # End: