dwrite/tests: Avoid "misleading indentation" warnings.
[wine.git] / tools / make_unicode
blob2bfe6f9bb32a4d1b7623868267cd033e2daef02c
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "14.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
30 my $REPORTS = "http://www.unicode.org/reports";
31 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
32 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
34 # Sort keys file
35 my $SORTKEYS = "tr10/allkeys.txt";
37 # Default char for undefined mappings
38 my $DEF_CHAR = ord '?';
40 # Last valid Unicode character
41 my $MAX_CHAR = 0x10ffff;
43 my @allfiles =
45 "CodpageFiles/037.txt",
46 "CodpageFiles/437.txt",
47 "CodpageFiles/500.txt",
48 "CodpageFiles/708.txt",
49 "CodpageFiles/720.txt",
50 "CodpageFiles/737.txt",
51 "CodpageFiles/775.txt",
52 "CodpageFiles/850.txt",
53 "CodpageFiles/852.txt",
54 "CodpageFiles/855.txt",
55 "CodpageFiles/857.txt",
56 "CodpageFiles/860.txt",
57 "CodpageFiles/861.txt",
58 "CodpageFiles/862.txt",
59 "CodpageFiles/863.txt",
60 "CodpageFiles/864.txt",
61 "CodpageFiles/865.txt",
62 "CodpageFiles/866.txt",
63 "CodpageFiles/869.txt",
64 "CodpageFiles/874.txt",
65 "CodpageFiles/875.txt",
66 "CodpageFiles/932.txt",
67 "CodpageFiles/936.txt",
68 "CodpageFiles/949.txt",
69 "CodpageFiles/950.txt",
70 "CodpageFiles/1026.txt",
71 "CodpageFiles/1250.txt",
72 "CodpageFiles/1251.txt",
73 "CodpageFiles/1252.txt",
74 "CodpageFiles/1253.txt",
75 "CodpageFiles/1254.txt",
76 "CodpageFiles/1255.txt",
77 "CodpageFiles/1256.txt",
78 "CodpageFiles/1257.txt",
79 "CodpageFiles/1258.txt",
80 "CodpageFiles/1361.txt",
81 "CodpageFiles/10000.txt",
82 "CodpageFiles/10001.txt",
83 "CodpageFiles/10002.txt",
84 "CodpageFiles/10003.txt",
85 "CodpageFiles/10004.txt",
86 "CodpageFiles/10005.txt",
87 "CodpageFiles/10006.txt",
88 "CodpageFiles/10007.txt",
89 "CodpageFiles/10008.txt",
90 "CodpageFiles/10010.txt",
91 "CodpageFiles/10017.txt",
92 "CodpageFiles/10021.txt",
93 "CodpageFiles/10029.txt",
94 "CodpageFiles/10079.txt",
95 "CodpageFiles/10081.txt",
96 "CodpageFiles/10082.txt",
97 "CodpageFiles/20127.txt",
98 "CodpageFiles/20866.txt",
99 "CodpageFiles/21866.txt",
100 "CodpageFiles/28591.txt",
101 "CodpageFiles/28592.txt",
102 "CodpageFiles/28593.txt",
103 "CodpageFiles/28594.txt",
104 "CodpageFiles/28595.txt",
105 "CodpageFiles/28596.txt",
106 "CodpageFiles/28597.txt",
107 "CodpageFiles/28598.txt",
108 "CodpageFiles/28599.txt",
109 "CodpageFiles/28603.txt",
110 "CodpageFiles/28605.txt",
114 my %ctype =
116 # CT_CTYPE1
117 "upper" => 0x0001,
118 "lower" => 0x0002,
119 "digit" => 0x0004,
120 "space" => 0x0008,
121 "punct" => 0x0010,
122 "cntrl" => 0x0020,
123 "blank" => 0x0040,
124 "xdigit" => 0x0080,
125 "alpha" => 0x0100 | 0x80000000,
126 "defin" => 0x0200,
127 # CT_CTYPE3 in high 16 bits
128 "nonspacing" => 0x00010000,
129 "diacritic" => 0x00020000,
130 "vowelmark" => 0x00040000,
131 "symbol" => 0x00080000,
132 "katakana" => 0x00100000,
133 "hiragana" => 0x00200000,
134 "halfwidth" => 0x00400000,
135 "fullwidth" => 0x00800000,
136 "ideograph" => 0x01000000,
137 "kashida" => 0x02000000,
138 "lexical" => 0x04000000,
139 "highsurrogate" => 0x08000000,
140 "lowsurrogate" => 0x10000000,
143 my %bracket_types =
145 "o" => 0x0000,
146 "c" => 0x0001,
149 my %indic_types =
151 "Other" => 0x0000,
152 "Bindu" => 0x0001,
153 "Visarga" => 0x0002,
154 "Avagraha" => 0x0003,
155 "Nukta" => 0x0004,
156 "Virama" => 0x0005,
157 "Vowel_Independent" => 0x0006,
158 "Vowel_Dependent" => 0x0007,
159 "Vowel" => 0x0008,
160 "Consonant_Placeholder" => 0x0009,
161 "Consonant" => 0x000a,
162 "Consonant_Dead" => 0x000b,
163 "Consonant_Succeeding_Repha" => 0x000c,
164 "Consonant_Subjoined" => 0x000d,
165 "Consonant_Medial" => 0x000e,
166 "Consonant_Final" => 0x000f,
167 "Consonant_Head_Letter" => 0x0010,
168 "Modifying_Letter" => 0x0011,
169 "Tone_Letter" => 0x0012,
170 "Tone_Mark" => 0x0013,
171 "Register_Shifter" => 0x0014,
172 "Consonant_Preceding_Repha" => 0x0015,
173 "Pure_Killer" => 0x0016,
174 "Invisible_Stacker" => 0x0017,
175 "Gemination_Mark" => 0x0018,
176 "Cantillation_Mark" => 0x0019,
177 "Non_Joiner" => 0x001a,
178 "Joiner" => 0x001b,
179 "Number_Joiner" => 0x001c,
180 "Number" => 0x001d,
181 "Brahmi_Joining_Number" => 0x001e,
182 "Consonant_With_Stacker" => 0x001f,
183 "Consonant_Prefixed" => 0x0020,
184 "Syllable_Modifier" => 0x0021,
185 "Consonant_Killer" => 0x0022,
186 "Consonant_Initial_Postfixed" => 0x0023,
189 my %matra_types =
191 "Right" => 0x01,
192 "Left" => 0x02,
193 "Visual_Order_Left" => 0x03,
194 "Left_And_Right" => 0x04,
195 "Top" => 0x05,
196 "Bottom" => 0x06,
197 "Top_And_Bottom" => 0x07,
198 "Top_And_Right" => 0x08,
199 "Top_And_Left" => 0x09,
200 "Top_And_Left_And_Right" => 0x0a,
201 "Bottom_And_Right" => 0x0b,
202 "Top_And_Bottom_And_Right" => 0x0c,
203 "Overstruck" => 0x0d,
204 "Invisible" => 0x0e,
205 "Bottom_And_Left" => 0x0f,
206 "Top_And_Bottom_And_Left" => 0x10,
209 my %break_types =
211 "BK" => 0x0001,
212 "CR" => 0x0002,
213 "LF" => 0x0003,
214 "CM" => 0x0004,
215 "SG" => 0x0005,
216 "GL" => 0x0006,
217 "CB" => 0x0007,
218 "SP" => 0x0008,
219 "ZW" => 0x0009,
220 "NL" => 0x000a,
221 "WJ" => 0x000b,
222 "JL" => 0x000c,
223 "JV" => 0x000d,
224 "JT" => 0x000e,
225 "H2" => 0x000f,
226 "H3" => 0x0010,
227 "XX" => 0x0011,
228 "OP" => 0x0012,
229 "CL" => 0x0013,
230 "CP" => 0x0014,
231 "QU" => 0x0015,
232 "NS" => 0x0016,
233 "EX" => 0x0017,
234 "SY" => 0x0018,
235 "IS" => 0x0019,
236 "PR" => 0x001a,
237 "PO" => 0x001b,
238 "NU" => 0x001c,
239 "AL" => 0x001d,
240 "ID" => 0x001e,
241 "IN" => 0x001f,
242 "HY" => 0x0020,
243 "BB" => 0x0021,
244 "BA" => 0x0022,
245 "SA" => 0x0023,
246 "AI" => 0x0024,
247 "B2" => 0x0025,
248 "HL" => 0x0026,
249 "CJ" => 0x0027,
250 "RI" => 0x0028,
251 "EB" => 0x0029,
252 "EM" => 0x002a,
253 "ZWJ" => 0x002b,
256 my %vertical_types =
258 "R" => 0x0000,
259 "U" => 0x0001,
260 "Tr" => 0x0002,
261 "Tu" => 0x0003,
264 my %categories =
266 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
267 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
268 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
269 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
270 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
271 "Me" => $ctype{"defin"}, # Mark, Enclosing
272 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
273 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
274 "No" => $ctype{"defin"}, # Number, Other
275 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
276 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
277 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
278 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
279 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
280 "Cs" => $ctype{"defin"}, # Other, Surrogate
281 "Co" => $ctype{"defin"}, # Other, Private Use
282 "Cn" => $ctype{"defin"}, # Other, Not Assigned
283 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
284 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
285 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
286 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
287 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
288 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
289 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
290 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
291 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
292 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
293 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
294 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
295 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
298 # a few characters need additional categories that cannot be determined automatically
299 my %special_categories =
301 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
302 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
303 "space" => [ 0x09..0x0d, 0x85 ],
304 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
305 "cntrl" => [ 0x070f, 0x200c, 0x200d,
306 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
307 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
308 0xfff9, 0xfffa, 0xfffb ],
309 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
310 0xd7, 0xf7 ],
311 "digit" => [ 0xb2, 0xb3, 0xb9 ],
312 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
313 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
314 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
315 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
316 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
317 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
318 0x02b9..0x02ba, 0x02c6..0x02cf ],
319 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
320 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
321 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
322 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
323 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
324 0x3131..0x3164 ],
325 "ideograph" => [ 0x3006..0x3007 ],
326 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
327 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
328 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
329 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
330 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
331 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
332 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
333 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
334 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
335 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
336 "kashida" => [ 0x0640 ],
339 my %directions =
341 "L" => 1, # Left-to-Right
342 "R" => 2, # Right-to-Left
343 "AL" => 12, # Right-to-Left Arabic
344 "EN" => 3, # European Number
345 "ES" => 4, # European Number Separator
346 "ET" => 5, # European Number Terminator
347 "AN" => 6, # Arabic Number
348 "CS" => 7, # Common Number Separator
349 "NSM" => 13, # Non-Spacing Mark
350 "BN" => 14, # Boundary Neutral
351 "B" => 8, # Paragraph Separator
352 "S" => 9, # Segment Separator
353 "WS" => 10, # Whitespace
354 "ON" => 11, # Other Neutrals
355 "LRE" => 15, # Left-to-Right Embedding
356 "LRO" => 15, # Left-to-Right Override
357 "RLE" => 15, # Right-to-Left Embedding
358 "RLO" => 15, # Right-to-Left Override
359 "PDF" => 15, # Pop Directional Format
360 "LRI" => 15, # Left-to-Right Isolate
361 "RLI" => 15, # Right-to-Left Isolate
362 "FSI" => 15, # First Strong Isolate
363 "PDI" => 15 # Pop Directional Isolate
366 my %c2_types =
368 "L" => 1, # C2_LEFTTORIGHT
369 "R" => 2, # C2_RIGHTTOLEFT
370 "AL" => 2, # C2_RIGHTTOLEFT
371 "EN" => 3, # C2_EUROPENUMBER
372 "ES" => 4, # C2_EUROPESEPARATOR
373 "ET" => 5, # C2_EUROPETERMINATOR
374 "AN" => 6, # C2_ARABICNUMBER
375 "CS" => 7, # C2_COMMONSEPARATOR
376 "NSM" => 11, # C2_OTHERNEUTRAL
377 "BN" => 0, # C2_NOTAPPLICABLE
378 "B" => 8, # C2_BLOCKSEPARATOR
379 "S" => 9, # C2_SEGMENTSEPARATOR
380 "WS" => 10, # C2_WHITESPACE
381 "ON" => 11, # C2_OTHERNEUTRAL
382 "LRE" => 11, # C2_OTHERNEUTRAL
383 "LRO" => 11, # C2_OTHERNEUTRAL
384 "RLE" => 11, # C2_OTHERNEUTRAL
385 "RLO" => 11, # C2_OTHERNEUTRAL
386 "PDF" => 11, # C2_OTHERNEUTRAL
387 "LRI" => 11, # C2_OTHERNEUTRAL
388 "RLI" => 11, # C2_OTHERNEUTRAL
389 "FSI" => 11, # C2_OTHERNEUTRAL
390 "PDI" => 11 # C2_OTHERNEUTRAL
393 my %bidi_types =
395 "ON" => 0, # Other Neutrals
396 "L" => 1, # Left-to-Right
397 "R" => 2, # Right-to-Left
398 "AN" => 3, # Arabic Number
399 "EN" => 4, # European Number
400 "AL" => 5, # Right-to-Left Arabic
401 "NSM" => 6, # Non-Spacing Mark
402 "CS" => 7, # Common Number Separator
403 "ES" => 8, # European Number Separator
404 "ET" => 9, # European Number Terminator
405 "BN" => 10, # Boundary Neutral
406 "S" => 11, # Segment Separator
407 "WS" => 12, # Whitespace
408 "B" => 13, # Paragraph Separator
409 "RLO" => 14, # Right-to-Left Override
410 "RLE" => 15, # Right-to-Left Embedding
411 "LRO" => 16, # Left-to-Right Override
412 "LRE" => 17, # Left-to-Right Embedding
413 "PDF" => 18, # Pop Directional Format
414 "LRI" => 19, # Left-to-Right Isolate
415 "RLI" => 20, # Right-to-Left Isolate
416 "FSI" => 21, # First Strong Isolate
417 "PDI" => 22 # Pop Directional Isolate
420 my %joining_types =
422 "U" => 0, # Non_Joining
423 "L" => 1, # Left_Joining
424 "R" => 2, # Right_Joining
425 "D" => 3, # Dual_Joining
426 "C" => 3, # Join_Causing
427 "ALAPH" => 4, # Syriac ALAPH
428 "DALATH RISH" => 5, # Syriac DALATH RISH group
429 "T" => 6, # Transparent
432 my @cp2uni = ();
433 my @glyph2uni = ();
434 my @lead_bytes = ();
435 my @uni2cp = ();
436 my @tolower_table = ();
437 my @toupper_table = ();
438 my @digitmap_table = ();
439 my @category_table = ();
440 my @initial_joining_table = ();
441 my @direction_table = ();
442 my @decomp_table = ();
443 my @combining_class_table = ();
444 my @decomp_compat_table = ();
445 my @comp_exclusions = ();
446 my @idna_decomp_table = ();
447 my @idna_disallowed = ();
448 my %registry_keys;
449 my $default_char;
450 my $default_wchar;
452 my %joining_forms =
454 "isolated" => [],
455 "final" => [],
456 "initial" => [],
457 "medial" => []
460 sub to_utf16(@)
462 my @ret;
463 foreach my $ch (@_)
465 if ($ch < 0x10000)
467 push @ret, $ch;
469 else
471 my $val = $ch - 0x10000;
472 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
475 return @ret;
478 ################################################################
479 # fetch a unicode.org file and open it
480 sub open_data_file($$)
482 my ($base, $name) = @_;
483 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
484 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
485 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
486 local *FILE;
488 if ($base =~ /.*\/([^\/]+)\.zip$/)
490 my $zip = "$1$suffix.zip";
491 unless (-f "$cache/$zip")
493 system "mkdir", "-p", $cache;
494 print "Fetching $base...\n";
495 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
497 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
499 else
501 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
502 unless (-f $dest)
504 system "mkdir", "-p", $dir;
505 print "Fetching $base/$name...\n";
506 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
508 open FILE, "<$dest" or die "cannot open $dest";
510 return *FILE;
513 ################################################################
514 # recursively get the decomposition for a character
515 sub get_decomposition($$);
516 sub get_decomposition($$)
518 my ($char, $table) = @_;
519 my @ret;
521 return $char unless defined $table->[$char];
522 foreach my $ch (@{$table->[$char]})
524 push @ret, get_decomposition( $ch, $table );
526 return @ret;
529 ################################################################
530 # get the composition that results in a given character
531 sub get_composition($$)
533 my ($ch, $compat) = @_;
534 return () unless defined $decomp_table[$ch]; # no decomposition
535 my @ret = @{$decomp_table[$ch]};
536 return () if @ret < 2; # singleton decomposition
537 return () if $comp_exclusions[$ch]; # composition exclusion
538 return () if $combining_class_table[$ch]; # non-starter
539 return () if $combining_class_table[$ret[0]]; # first char is non-starter
540 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
541 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
542 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
543 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
544 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
545 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
546 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
547 return @ret;
550 ################################################################
551 # recursively build decompositions
552 sub build_decompositions(@)
554 my @src = @_;
555 my @dst;
557 for (my $i = 0; $i < @src; $i++)
559 next unless defined $src[$i];
560 my @decomp = to_utf16( get_decomposition( $i, \@src ));
561 $dst[$i] = \@decomp;
563 return @dst;
566 ################################################################
567 # compose Hangul sequences
568 sub compose_hangul(@)
570 my $SBASE = 0xac00;
571 my $LBASE = 0x1100;
572 my $VBASE = 0x1161;
573 my $TBASE = 0x11a7;
574 my $LCOUNT = 19;
575 my $VCOUNT = 21;
576 my $TCOUNT = 28;
577 my $NCOUNT = $VCOUNT * $TCOUNT;
578 my $SCOUNT = $LCOUNT * $NCOUNT;
580 my @seq = @_;
581 my @ret;
582 my $i;
584 for ($i = 0; $i < @seq; $i++)
586 my $ch = $seq[$i];
587 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
588 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
590 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
591 $i++;
593 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
594 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
596 $ch += $seq[$i+1] - $TBASE;
597 $i++;
599 push @ret, $ch;
601 return @ret;
604 ################################################################
605 # remove linguistic-only mappings from the case table
606 sub remove_linguistic_mappings($$)
608 my ($upper, $lower) = @_;
610 # remove case mappings that don't round-trip
612 for (my $i = 0; $i < @{$upper}; $i++)
614 next unless defined ${$upper}[$i];
615 my $ch = ${$upper}[$i];
616 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
618 for (my $i = 0; $i < @{$lower}; $i++)
620 next unless defined ${$lower}[$i];
621 my $ch = ${$lower}[$i];
622 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
626 ################################################################
627 # read in the Unicode database files
628 sub load_data()
630 my $start;
632 # now build mappings from the decomposition field of the Unicode database
634 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
635 while (<$UNICODE_DATA>)
637 # Decode the fields ...
638 my ($code, $name, $cat, $comb, $bidi,
639 $decomp, $dec, $dig, $num, $mirror,
640 $oldname, $comment, $upper, $lower, $title) = split /;/;
641 my $src = hex $code;
643 die "unknown category $cat" unless defined $categories{$cat};
644 die "unknown directionality $bidi" unless defined $directions{$bidi};
646 $category_table[$src] = $categories{$cat};
647 $direction_table[$src] = $bidi;
648 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
650 $initial_joining_table[$src] = $joining_types{"T"};
652 else
654 $initial_joining_table[$src] = $joining_types{"U"};
657 if ($lower ne "")
659 $tolower_table[$src] = hex $lower;
661 if ($upper ne "")
663 $toupper_table[$src] = hex $upper;
665 if ($dec ne "")
667 $category_table[$src] |= $ctype{"digit"};
669 if ($dig ne "")
671 $digitmap_table[$src] = ord $dig;
673 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
675 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
676 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
677 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
678 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
679 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
680 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
681 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
682 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
683 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
684 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
685 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
686 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
688 # copy the category and direction for everything between First/Last pairs
689 if ($name =~ /, First>/) { $start = $src; }
690 if ($name =~ /, Last>/)
692 while ($start < $src)
694 $category_table[$start] = $category_table[$src];
695 $direction_table[$start] = $direction_table[$src];
696 $combining_class_table[$start] = $combining_class_table[$src];
697 $start++;
701 next if $decomp eq ""; # no decomposition, skip it
703 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
705 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
706 $decomp_compat_table[$src] = \@seq;
709 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
711 # decomposition of the form "<foo> 1234" -> use char if type is known
712 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
714 ${joining_forms{$1}}[hex $2] = $src;
717 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
719 # decomposition "<compat> 0020 1234" -> combining accent
721 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
723 # store decomposition
724 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
726 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
728 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
730 # Single char decomposition
731 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
735 close $UNICODE_DATA;
737 # patch the category of some special characters
739 for (my $i = 0; $i < @decomp_table; $i++)
741 next unless defined $decomp_table[$i];
742 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
744 foreach my $cat (keys %special_categories)
746 my $flag = $ctype{$cat};
747 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
749 for (my $i = 0; $i < @decomp_compat_table; $i++)
751 next unless defined $decomp_compat_table[$i];
752 next unless @{$decomp_compat_table[$i]} == 2;
753 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
756 # load the composition exclusions
758 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
759 while (<$EXCL>)
761 s/\#.*//; # remove comments
762 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
764 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
766 elsif (/^([0-9a-fA-F]+)\s*$/)
768 $comp_exclusions[hex $1] = 1;
771 close $EXCL;
773 # load the IDNA mappings
775 @idna_decomp_table = @decomp_compat_table;
776 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
777 while (<$IDNA>)
779 s/\#.*//; # remove comments
780 next if /^\s*$/;
781 my ($char, $type, $mapping) = split /;/;
782 my ($ch1, $ch2);
783 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
785 $ch1 = hex $1;
786 $ch2 = hex $2;
788 elsif ($char =~ /([0-9a-fA-F]+)/)
790 $ch1 = $ch2 = hex $1;
793 if ($type =~ /mapped/ || $type =~ /deviation/)
795 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
796 my @seq = map { hex $_; } split /\s+/, $mapping;
797 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
799 elsif ($type =~ /valid/)
802 elsif ($type =~ /ignored/)
804 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
806 elsif ($type =~ /disallowed/)
808 foreach my $i ($ch1 .. $ch2)
810 $idna_decomp_table[$i] = undef;
811 $idna_disallowed[$i] = 1;
815 close $IDNA;
819 ################################################################
820 # add a new registry key
821 sub add_registry_key($$)
823 my ($key, $defval) = @_;
824 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
827 ################################################################
828 # add a new registry value
829 sub add_registry_value($$$)
831 my ($key, $name, $value) = @_;
832 add_registry_key( $key, undef );
833 push @{$registry_keys{$key}}, "'$name' = s '$value'";
836 ################################################################
837 # define a new lead byte
838 sub add_lead_byte($)
840 my $ch = shift;
841 return if defined $cp2uni[$ch];
842 push @lead_bytes, $ch;
843 $cp2uni[$ch] = 0;
846 ################################################################
847 # define a new char mapping
848 sub add_mapping($$)
850 my ($cp, $uni) = @_;
851 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
852 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
853 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
856 ################################################################
857 # get a mapping including glyph chars for MB_USEGLYPHCHARS
858 sub get_glyphs_mapping(@)
860 my @table = @_;
862 for (my $i = 0; $i < @glyph2uni; $i++)
864 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
866 return @table;
869 ################################################################
870 # build EUC-JP table from the JIS 0208/0212 files
871 sub dump_eucjp_codepage()
873 @cp2uni = ();
874 @glyph2uni = ();
875 @lead_bytes = ();
876 @uni2cp = ();
877 $default_char = $DEF_CHAR;
878 $default_wchar = 0x30fb;
880 # ASCII chars
881 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
883 # lead bytes
884 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
886 # JIS X 0201 right plane
887 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
889 # undefined chars
890 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
891 $cp2uni[0xa0] = 0xf8f0;
892 $cp2uni[0xff] = 0xf8f3;
894 # Fix backslash conversion
895 add_mapping( 0xa1c0, 0xff3c );
897 # Add private mappings for rows undefined in JIS 0208/0212
898 my $private = 0xe000;
899 foreach my $hi (0xf5 .. 0xfe)
901 foreach my $lo (0xa1 .. 0xfe)
903 add_mapping( ($hi << 8) + $lo, $private++ );
906 foreach my $hi (0xf5 .. 0xfe)
908 foreach my $lo (0x21 .. 0x7e)
910 add_mapping( ($hi << 8) + $lo, $private++ );
914 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
915 while (<$INPUT>)
917 next if /^\#/; # skip comments
918 next if /^$/; # skip empty lines
919 next if /\x1a/; # skip ^Z
920 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
922 add_mapping( 0x8080 + hex $1, hex $2 );
923 next;
925 die "Unrecognized line $_\n";
927 close $INPUT;
929 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
930 while (<$INPUT>)
932 next if /^\#/; # skip comments
933 next if /^$/; # skip empty lines
934 next if /\x1a/; # skip ^Z
935 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
937 add_mapping( 0x8000 + hex $1, hex $2 );
938 next;
940 die "Unrecognized line $_\n";
942 close $INPUT;
944 output_codepage_file( 20932 );
947 ################################################################
948 # build Korean Wansung table from the KSX1001 file
949 sub dump_krwansung_codepage(@)
951 my @cp949 = @_;
952 @cp2uni = ();
953 @glyph2uni = ();
954 @lead_bytes = ();
955 @uni2cp = ();
956 $default_char = 0x3f;
957 $default_wchar = 0x003f;
959 # ASCII and undefined chars
960 foreach my $i (0x00 .. 0x9f) { add_mapping( $i, $i ); }
961 add_mapping( 0xa0, 0xf8e6 );
962 add_mapping( 0xad, 0xf8e7 );
963 add_mapping( 0xae, 0xf8e8 );
964 add_mapping( 0xaf, 0xf8e9 );
965 add_mapping( 0xfe, 0xf8ea );
966 add_mapping( 0xff, 0xf8eb );
968 my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
969 while (<$INPUT>)
971 next if /^\#/; # skip comments
972 next if /^$/; # skip empty lines
973 next if /\x1a/; # skip ^Z
974 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
976 add_mapping( 0x8080 + hex $1, hex $2 );
977 next;
979 die "Unrecognized line $_\n";
981 close $INPUT;
983 # get some extra mappings from cp 949
984 my @defined_lb;
985 map { $defined_lb[$_] = 1; } @lead_bytes;
986 foreach my $i (0x0000 .. 0xffff)
988 next if ($i >= 0x1100 && $i <= 0x11ff); # range not used in 20949
989 next unless defined $cp949[$i];
990 if ($cp949[$i] >= 0xff)
992 # only add chars for lead bytes that exist in 20949
993 my $hi = $cp949[$i] >> 8;
994 my $lo = $cp949[$i] & 0xff;
995 next unless $defined_lb[$hi];
996 next unless $lo >= 0xa1 && $lo <= 0xfe;
998 add_mapping( $cp949[$i], $i );
1001 output_codepage_file( 20949 );
1004 ################################################################
1005 # build the sort keys table
1006 sub dump_sortkeys($)
1008 my $filename = shift;
1009 my @sortkeys = ();
1011 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
1012 while (<$INPUT>)
1014 next if /^\#/; # skip comments
1015 next if /^$/; # skip empty lines
1016 next if /\x1a/; # skip ^Z
1017 next if /^\@version/; # skip @version header
1018 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1020 my ($uni,$variable) = (hex $1, $2);
1021 next if $uni > 65535;
1022 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
1023 next;
1025 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1027 # multiple character sequence, ignored for now
1028 next;
1030 die "$SORTKEYS: Unrecognized line $_\n";
1032 close $INPUT;
1034 # compress the keys to 32 bit:
1035 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
1037 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
1038 ${$a}[2] <=> ${$b}[2] or
1039 ${$a}[3] <=> ${$b}[3] or
1040 ${$a}[4] <=> ${$b}[4] or
1041 $a cmp $b; } @sortkeys;
1043 my ($n2, $n3) = (1, 1);
1044 my @keys = (-1, -1, -1, -1, -1 );
1045 my @flatkeys = ();
1047 for (my $i = 0; $i < @sortkeys; $i++)
1049 next unless defined $sortkeys[$i];
1050 my @current = @{$sortkeys[$i]};
1051 if ($current[1] == $keys[1])
1053 if ($current[2] == $keys[2])
1055 if ($current[3] == $keys[3])
1057 # nothing
1059 else
1061 $keys[3] = $current[3];
1062 $n3++;
1063 die if ($n3 >= 16);
1066 else
1068 $keys[2] = $current[2];
1069 $keys[3] = $current[3];
1070 $n2++;
1071 $n3 = 1;
1072 die if ($n2 >= 256);
1075 else
1077 $keys[1] = $current[1];
1078 $keys[2] = $current[2];
1079 $keys[3] = $current[3];
1080 $n2 = 1;
1081 $n3 = 1;
1084 if ($current[2]) { $current[2] = $n2; }
1085 if ($current[3]) { $current[3] = $n3; }
1086 if ($current[4]) { $current[4] = 1; }
1088 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1091 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1092 printf "Building $filename\n";
1093 printf OUTPUT "/* Unicode collation element table */\n";
1094 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1095 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1096 print OUTPUT "#include \"windef.h\"\n\n";
1098 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1100 close OUTPUT;
1101 save_file($filename);
1105 ################################################################
1106 # dump an array of integers
1107 sub dump_array($$@)
1109 my ($bit_width, $default, @array) = @_;
1110 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1111 my $i;
1112 my $ret = " ";
1113 for ($i = 0; $i < $#array; $i++)
1115 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1116 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1118 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1119 return $ret;
1123 ################################################################
1124 # dump an SBCS mapping table in binary format
1125 sub dump_binary_sbcs_table($)
1127 my $codepage = shift;
1129 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1130 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1132 print OUTPUT pack "S<*", @header;
1133 print OUTPUT pack "C12", (0) x 12;
1134 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1136 if (@glyph2uni)
1138 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1140 else
1142 print OUTPUT pack "S<*", 0;
1145 print OUTPUT pack "S<*", 0, 0;
1147 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1151 ################################################################
1152 # dump a DBCS mapping table in binary format
1153 sub dump_binary_dbcs_table($)
1155 my $codepage = shift;
1156 my @lb_ranges = get_lb_ranges();
1157 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1159 my @offsets = (0) x 256;
1160 my $pos = 0;
1161 foreach my $i (@lead_bytes)
1163 $offsets[$i] = ($pos += 256);
1164 $cp2uni[$i] = 0;
1167 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1169 print OUTPUT pack "S<*", @header;
1170 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1171 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1172 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1174 foreach my $i (@lead_bytes)
1176 my $base = $i << 8;
1177 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1180 print OUTPUT pack "S<", 4;
1181 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1185 ################################################################
1186 # get the list of defined lead byte ranges
1187 sub get_lb_ranges()
1189 my @list = ();
1190 my @ranges = ();
1192 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1193 my $on = 0;
1194 for (my $i = 0; $i < 256; $i++)
1196 if ($on)
1198 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1200 else
1202 if ($list[$i]) { push @ranges, $i; $on = 1; }
1205 if ($on) { push @ranges, 0xff; }
1206 return @ranges;
1209 ################################################################
1210 # dump the Indic Syllabic Category table
1211 sub dump_indic($)
1213 my $filename = shift;
1214 my @indic_table;
1216 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1217 while (<$INPUT>)
1219 next if /^\#/; # skip comments
1220 next if /^\s*$/; # skip empty lines
1221 next if /\x1a/; # skip ^Z
1222 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1224 my $type = $2;
1225 die "unknown indic $type" unless defined $indic_types{$type};
1226 if (hex $1 < 65536)
1228 $indic_table[hex $1] = $indic_types{$type};
1230 next;
1232 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1234 my $type = $3;
1235 die "unknown indic $type" unless defined $indic_types{$type};
1236 if (hex $1 < 65536 and hex $2 < 65536)
1238 foreach my $i (hex $1 .. hex $2)
1240 $indic_table[$i] = $indic_types{$type};
1243 next;
1245 die "malformed line $_";
1247 close $INPUT;
1249 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1250 while (<$INPUT>)
1252 next if /^\#/; # skip comments
1253 next if /^\s*$/; # skip empty lines
1254 next if /\x1a/; # skip ^Z
1255 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1257 my $type = $2;
1258 die "unknown matra $type" unless defined $matra_types{$type};
1259 $indic_table[hex $1] |= $matra_types{$type} << 8;
1260 next;
1262 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1264 my $type = $3;
1265 die "unknown matra $type" unless defined $matra_types{$type};
1266 foreach my $i (hex $1 .. hex $2)
1268 $indic_table[$i] |= $matra_types{$type} << 8;
1270 next;
1272 die "malformed line $_";
1274 close $INPUT;
1276 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1277 print "Building $filename\n";
1278 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1279 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1280 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1281 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1282 print OUTPUT "#include \"windef.h\"\n\n";
1284 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1286 close OUTPUT;
1287 save_file($filename);
1290 ################################################################
1291 # dump the Line Break Properties table
1292 sub dump_linebreak($)
1294 my $filename = shift;
1295 my @break_table;
1297 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1298 while (<$INPUT>)
1300 next if /^\#/; # skip comments
1301 next if /^\s*$/; # skip empty lines
1302 next if /\x1a/; # skip ^Z
1303 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1305 my $type = $2;
1306 die "unknown breaktype $type" unless defined $break_types{$type};
1307 $break_table[hex $1] = $break_types{$type};
1308 next;
1310 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1312 my $type = $3;
1313 die "unknown breaktype $type" unless defined $break_types{$type};
1314 foreach my $i (hex $1 .. hex $2)
1316 $break_table[$i] = $break_types{$type};
1318 next;
1320 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1322 my $type = $2;
1323 die "unknown breaktype $type" unless defined $break_types{$type};
1324 $break_table[hex $1] = $break_types{$type};
1325 next;
1327 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1329 my $type = $3;
1330 die "unknown breaktype $type" unless defined $break_types{$type};
1331 foreach my $i (hex $1 .. hex $2)
1333 $break_table[$i] = $break_types{$type};
1335 next;
1337 die "malformed line $_";
1339 close $INPUT;
1341 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1342 print "Building $filename\n";
1343 print OUTPUT "/* Unicode Line Break Properties */\n";
1344 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1345 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1346 print OUTPUT "#include \"windef.h\"\n\n";
1348 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1350 close OUTPUT;
1351 save_file($filename);
1354 my %scripts =
1356 "Unknown" => 0,
1357 "Common" => 1,
1358 "Inherited" => 2,
1359 "Arabic" => 3,
1360 "Armenian" => 4,
1361 "Avestan" => 5,
1362 "Balinese" => 6,
1363 "Bamum" => 7,
1364 "Batak" => 8,
1365 "Bengali" => 9,
1366 "Bopomofo" => 10,
1367 "Brahmi" => 11,
1368 "Braille" => 12,
1369 "Buginese" => 13,
1370 "Buhid" => 14,
1371 "Canadian_Aboriginal" => 15,
1372 "Carian" => 16,
1373 "Cham" => 17,
1374 "Cherokee" => 18,
1375 "Coptic" => 19,
1376 "Cuneiform" => 20,
1377 "Cypriot" => 21,
1378 "Cyrillic" => 22,
1379 "Deseret" => 23,
1380 "Devanagari" => 24,
1381 "Egyptian_Hieroglyphs" => 25,
1382 "Ethiopic" => 26,
1383 "Georgian" => 27,
1384 "Glagolitic" => 28,
1385 "Gothic" => 29,
1386 "Greek" => 30,
1387 "Gujarati" => 31,
1388 "Gurmukhi" => 32,
1389 "Han" => 33,
1390 "Hangul" => 34,
1391 "Hanunoo" => 35,
1392 "Hebrew" => 36,
1393 "Hiragana" => 37,
1394 "Imperial_Aramaic" => 38,
1395 "Inscriptional_Pahlavi" => 39,
1396 "Inscriptional_Parthian" => 40,
1397 "Javanese" => 41,
1398 "Kaithi" => 42,
1399 "Kannada" => 43,
1400 "Katakana" => 44,
1401 "Kayah_Li" => 45,
1402 "Kharoshthi" => 46,
1403 "Khmer" => 47,
1404 "Lao" => 48,
1405 "Latin" => 49,
1406 "Lepcha" => 50,
1407 "Limbu" => 51,
1408 "Linear_B" => 52,
1409 "Lisu" => 53,
1410 "Lycian" => 54,
1411 "Lydian" => 55,
1412 "Malayalam" => 56,
1413 "Mandaic" => 57,
1414 "Meetei_Mayek" => 58,
1415 "Mongolian" => 59,
1416 "Myanmar" => 60,
1417 "New_Tai_Lue" => 61,
1418 "Nko" => 62,
1419 "Ogham" => 63,
1420 "Ol_Chiki" => 64,
1421 "Old_Italic" => 65,
1422 "Old_Persian" => 66,
1423 "Old_South_Arabian" => 67,
1424 "Old_Turkic" => 68,
1425 "Oriya" => 69,
1426 "Osmanya" => 70,
1427 "Phags_Pa" => 71,
1428 "Phoenician" => 72,
1429 "Rejang" => 73,
1430 "Runic" => 74,
1431 "Samaritan" => 75,
1432 "Saurashtra" => 76,
1433 "Shavian" => 77,
1434 "Sinhala" => 78,
1435 "Sundanese" => 79,
1436 "Syloti_Nagri" => 80,
1437 "Syriac" => 81,
1438 "Tagalog" => 82,
1439 "Tagbanwa" => 83,
1440 "Tai_Le" => 84,
1441 "Tai_Tham" => 85,
1442 "Tai_Viet" => 86,
1443 "Tamil" => 87,
1444 "Telugu" => 88,
1445 "Thaana" => 89,
1446 "Thai" => 90,
1447 "Tibetan" => 91,
1448 "Tifinagh" => 92,
1449 "Ugaritic" => 93,
1450 "Vai" => 94,
1451 "Yi" => 95,
1452 # Win8/Win8.1
1453 "Chakma" => 96,
1454 "Meroitic_Cursive" => 97,
1455 "Meroitic_Hieroglyphs" => 98,
1456 "Miao" => 99,
1457 "Sharada" => 100,
1458 "Sora_Sompeng" => 101,
1459 "Takri" => 102,
1460 # Win10
1461 "Bassa_Vah" => 103,
1462 "Caucasian_Albanian" => 104,
1463 "Duployan" => 105,
1464 "Elbasan" => 106,
1465 "Grantha" => 107,
1466 "Khojki" => 108,
1467 "Khudawadi" => 109,
1468 "Linear_A" => 110,
1469 "Mahajani" => 111,
1470 "Manichaean" => 112,
1471 "Mende_Kikakui" => 113,
1472 "Modi" => 114,
1473 "Mro" => 115,
1474 "Nabataean" => 116,
1475 "Old_North_Arabian" => 117,
1476 "Old_Permic" => 118,
1477 "Pahawh_Hmong" => 119,
1478 "Palmyrene" => 120,
1479 "Pau_Cin_Hau" => 121,
1480 "Psalter_Pahlavi" => 122,
1481 "Siddham" => 123,
1482 "Tirhuta" => 124,
1483 "Warang_Citi" => 125,
1484 # Win10 RS1
1485 "Adlam" => 126,
1486 "Ahom" => 127,
1487 "Anatolian_Hieroglyphs" => 128,
1488 "Bhaiksuki" => 129,
1489 "Hatran" => 130,
1490 "Marchen" => 131,
1491 "Multani" => 132,
1492 "Newa" => 133,
1493 "Old_Hungarian" => 134,
1494 "Osage" => 135,
1495 "SignWriting" => 136,
1496 "Tangut" => 137,
1497 # Win10 RS4
1498 "Masaram_Gondi" => 138,
1499 "Nushu" => 139,
1500 "Soyombo" => 140,
1501 "Zanabazar_Square" => 141,
1502 # Win10 1903
1503 "Dogra" => 142,
1504 "Gunjala_Gondi" => 143,
1505 "Hanifi_Rohingya" => 144,
1506 "Makasar" => 145,
1507 "Medefaidrin" => 146,
1508 "Old_Sogdian" => 147,
1509 "Sogdian" => 148,
1510 # Win10 2004
1511 "Elymaic" => 149,
1512 "Nyiakeng_Puachue_Hmong" => 150,
1513 "Nandinagari" => 151,
1514 "Wancho" => 152,
1515 # Win11
1516 "Chorasmian" => 153,
1517 "Dives_Akuru" => 154,
1518 "Khitan_Small_Script" => 155,
1519 "Yezidi" => 156,
1522 ################################################################
1523 # dump Script IDs table
1524 sub dump_scripts($)
1526 my $filename = shift;
1527 my $header = $filename;
1528 my @scripts_table;
1529 my $script_index;
1530 my $i;
1532 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1533 # Fill the table
1534 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1535 while (<$INPUT>)
1537 my $type = "";
1539 next if /^\#/; # skip comments
1540 next if /^\s*$/; # skip empty lines
1541 next if /\x1a/; # skip ^Z
1542 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1544 $type = $2;
1545 if (defined $scripts{$type})
1547 $scripts_table[hex $1] = $scripts{$type};
1549 next;
1551 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1553 $type = $3;
1554 if (defined $scripts{$type})
1556 foreach my $i (hex $1 .. hex $2)
1558 $scripts_table[$i] = $scripts{$type};
1561 next;
1565 close $INPUT;
1567 $header = "$filename.h";
1568 open OUTPUT,">$header.new" or die "Cannot create $header";
1569 print "Building $header\n";
1570 print OUTPUT "/* Unicode Script IDs */\n";
1571 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1572 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1574 print OUTPUT "enum unicode_script_id {\n";
1575 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1577 print OUTPUT " Script_$script = $scripts{$script},\n";
1579 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1580 print OUTPUT "};\n";
1582 close OUTPUT;
1583 save_file($header);
1585 $filename = "$filename.c";
1586 open OUTPUT,">$filename.new" or die "Cannot create $header";
1587 print "Building $filename\n";
1588 print OUTPUT "/* Unicode Script IDs */\n";
1589 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1590 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1591 print OUTPUT "#include \"windef.h\"\n\n";
1593 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1594 close OUTPUT;
1595 save_file($filename);
1598 ################################################################
1599 # dump the BiDi mirroring table
1600 sub dump_mirroring($)
1602 my $filename = shift;
1603 my @mirror_table = ();
1605 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1606 while (<$INPUT>)
1608 next if /^\#/; # skip comments
1609 next if /^$/; # skip empty lines
1610 next if /\x1a/; # skip ^Z
1611 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1613 $mirror_table[hex $1] = hex $2;
1614 next;
1616 die "malformed line $_";
1618 close $INPUT;
1620 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1621 print "Building $filename\n";
1622 print OUTPUT "/* Unicode BiDi mirroring */\n";
1623 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1624 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1625 print OUTPUT "#include \"windef.h\"\n\n";
1626 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1627 close OUTPUT;
1628 save_file($filename);
1631 ################################################################
1632 # dump the Bidi Brackets
1633 sub dump_bracket($)
1635 my $filename = shift;
1636 my @bracket_table;
1638 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1639 while (<$INPUT>)
1641 next if /^\#/; # skip comments
1642 next if /^\s*$/; # skip empty lines
1643 next if /\x1a/; # skip ^Z
1644 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1646 my $type = $3;
1647 die "unknown bracket $type" unless defined $bracket_types{$type};
1648 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1649 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1650 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1651 next;
1653 die "malformed line $_";
1655 close $INPUT;
1657 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1658 print "Building $filename\n";
1659 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1660 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1661 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1662 print OUTPUT "#include \"windef.h\"\n\n";
1664 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1666 close OUTPUT;
1667 save_file($filename);
1670 ################################################################
1671 # dump the Arabic shaping table
1672 sub dump_shaping($)
1674 my $filename = shift;
1675 my @joining_table = @initial_joining_table;
1677 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1678 while (<$INPUT>)
1680 next if /^\#/; # skip comments
1681 next if /^\s*$/; # skip empty lines
1682 next if /\x1a/; # skip ^Z
1683 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1685 my $type = $2;
1686 $joining_table[hex $1] = $joining_types{$type};
1687 next;
1689 die "malformed line $_";
1691 close $INPUT;
1693 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1694 print "Building $filename\n";
1695 print OUTPUT "/* Unicode Arabic shaping */\n";
1696 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1697 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1698 print OUTPUT "#include \"windef.h\"\n\n";
1700 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1702 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1703 for (my $i = 0x600; $i <= 0x6ff; $i++)
1705 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1706 ${joining_forms{"isolated"}}[$i] || $i,
1707 ${joining_forms{"final"}}[$i] || $i,
1708 ${joining_forms{"initial"}}[$i] || $i,
1709 ${joining_forms{"medial"}}[$i] || $i;
1711 print OUTPUT "};\n";
1713 close OUTPUT;
1714 save_file($filename);
1717 ################################################################
1718 # dump the Arabic shaping table
1719 sub dump_arabic_shaping($)
1721 my $filename = shift;
1722 my @joining_table = @initial_joining_table;
1724 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1725 while (<$INPUT>)
1727 next if /^\#/; # skip comments
1728 next if /^\s*$/; # skip empty lines
1729 next if /\x1a/; # skip ^Z
1730 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1732 my $type = $2;
1733 my $group = $3;
1735 if ($group eq "ALAPH" || $group eq "DALATH RISH")
1737 $joining_table[hex $1] = $joining_types{$group};
1739 else
1741 $joining_table[hex $1] = $joining_types{$type};
1744 next;
1746 die "malformed line $_";
1748 close $INPUT;
1750 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1751 print "Building $filename\n";
1752 print OUTPUT "/* Unicode Arabic shaping */\n";
1753 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1754 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1755 print OUTPUT "#include \"windef.h\"\n\n";
1757 dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );
1759 close OUTPUT;
1760 save_file($filename);
1763 ################################################################
1764 # dump the Vertical Orientation table
1765 sub dump_vertical($$)
1767 my ($filename, $unix) = @_;
1768 my @vertical_table;
1770 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1771 while (<$INPUT>)
1773 next if /^\#/; # skip comments
1774 next if /^\s*$/; # skip empty lines
1775 next if /\x1a/; # skip ^Z
1776 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1778 my $type = $2;
1779 die "unknown vertical $type" unless defined $vertical_types{$type};
1780 if (hex $1 < 65536)
1782 $vertical_table[hex $1] = $vertical_types{$type};
1784 next;
1786 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1788 my $type = $3;
1789 die "unknown vertical $type" unless defined $vertical_types{$type};
1790 foreach my $i (hex $1 .. hex $2)
1792 $vertical_table[$i] = $vertical_types{$type};
1794 next;
1796 die "malformed line $_";
1798 close $INPUT;
1800 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1801 print "Building $filename\n";
1802 print OUTPUT "/* Unicode Vertical Orientation */\n";
1803 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1804 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1805 if ($unix)
1807 print OUTPUT "#if 0\n";
1808 print OUTPUT "#pragma makedep unix\n";
1809 print OUTPUT "#endif\n\n";
1811 print OUTPUT "#include \"windef.h\"\n\n";
1813 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1815 close OUTPUT;
1816 save_file($filename);
1819 ################################################################
1820 # dump the digit folding tables
1821 sub dump_digit_folding($)
1823 my ($filename) = shift;
1824 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1825 print "Building $filename\n";
1826 print OUTPUT "/* Unicode digit folding mappings */\n";
1827 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1828 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1829 print OUTPUT "#include \"windef.h\"\n\n";
1831 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1832 close OUTPUT;
1833 save_file($filename);
1837 ################################################################
1838 # compress a mapping table by removing identical rows
1839 sub compress_array($$@)
1841 my $rows = shift;
1842 my $def = shift;
1843 my @table = @_;
1844 my $len = @table / $rows;
1845 my @array;
1846 my $data = "";
1848 # try to merge table rows
1849 for (my $row = 0; $row < $rows; $row++)
1851 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1852 my $pos = index $data, $rowtxt;
1853 if ($pos == -1)
1855 # check if the tail of the data can match the start of the new row
1856 my $first = substr( $rowtxt, 0, 1 );
1857 for (my $i = length($data) - 1; $i > 0; $i--)
1859 $pos = index( substr( $data, -$i ), $first );
1860 last if $pos == -1;
1861 $i -= $pos;
1862 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1863 substr( $data, -$i ) = "";
1864 last;
1866 $pos = length $data;
1867 $data .= $rowtxt;
1869 $array[$row] = $rows + $pos;
1871 return @array, unpack "U*", $data;
1874 ################################################################
1875 # dump a char -> 16-bit value mapping table using two-level tables
1876 sub dump_two_level_mapping($$@)
1878 my $name = shift;
1879 my $def = shift;
1880 my $size = shift;
1881 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1882 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1883 my @array = compress_array( 256, 0, @row_array[0..4095] );
1885 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1887 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1888 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1889 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1890 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1893 ################################################################
1894 # dump a char -> value mapping table using three-level tables
1895 sub dump_three_level_mapping($$@)
1897 my $name = shift;
1898 my $def = shift;
1899 my $size = shift;
1900 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1901 my $level3 = ($MAX_CHAR + 1) / 16;
1902 my $level2 = $level3 / 16;
1903 my $level1 = $level2 / 16;
1904 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1905 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1906 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1908 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1909 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1911 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1912 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1913 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1914 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1915 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1918 ################################################################
1919 # dump a binary case mapping table in l_intl.nls format
1920 sub dump_binary_case_table(@)
1922 my (@table) = @_;
1923 my $max_char = 0x10000;
1924 my $level1 = $max_char / 16;
1925 my $level2 = $level1 / 16;
1927 my @difftable;
1928 for (my $i = 0; $i < @table; $i++)
1930 next unless defined $table[$i];
1931 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1934 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1935 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1936 my $offset = @array - $level1;
1937 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1938 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1941 ################################################################
1942 # dump case mappings for l_intl.nls
1943 sub dump_intl_nls($)
1945 my @upper_table = @toupper_table;
1946 my @lower_table = @tolower_table;
1947 remove_linguistic_mappings( \@upper_table, \@lower_table );
1949 my $upper = dump_binary_case_table( @upper_table );
1950 my $lower = dump_binary_case_table( @lower_table );
1952 my $filename = shift;
1953 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1954 printf "Building $filename\n";
1956 binmode OUTPUT;
1957 print OUTPUT pack "S<", 1; # version
1958 print OUTPUT $upper;
1959 print OUTPUT $lower;
1960 close OUTPUT;
1961 save_file($filename);
1965 ################################################################
1966 # dump the bidi direction table
1967 sub dump_bidi_dir_table($)
1969 my $filename = shift;
1970 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1971 printf "Building $filename\n";
1972 printf OUTPUT "/* Unicode BiDi direction table */\n";
1973 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1974 printf OUTPUT "#include \"windef.h\"\n\n";
1976 my @table;
1978 for (my $i = 0; $i < 65536; $i++)
1980 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1983 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1985 close OUTPUT;
1986 save_file($filename);
1990 sub rol($$)
1992 my ($byte, $count) = @_;
1993 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1996 ################################################################
1997 # compress the character properties table
1998 sub compress_char_props_table($@)
2000 my $rows = shift;
2001 my @table = @_;
2002 my $len = @table / $rows;
2003 my $pos = 0;
2004 my @array = (0) x $rows;
2005 my %sequences;
2007 # add some predefined sequences
2008 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
2010 # try to merge table rows
2011 for (my $row = 0; $row < $rows; $row++)
2013 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
2014 my $rowtxt = pack "L*", @table_row;
2015 if (defined($sequences{$rowtxt}))
2017 # reuse an existing row
2018 $array[$row] = $sequences{$rowtxt};
2020 else
2022 # create a new row
2023 $sequences{$rowtxt} = $array[$row] = ++$pos;
2024 push @array, @table_row;
2027 return @array;
2030 ################################################################
2031 # dump a normalization table in binary format
2032 sub dump_norm_table($)
2034 my $filename = shift;
2036 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2037 my %decomp = ( "nfc" => \@decomp_table,
2038 "nfd" => \@decomp_table,
2039 "nfkc" => \@decomp_compat_table,
2040 "nfkd" => \@decomp_compat_table ,
2041 "idna" => \@idna_decomp_table );
2043 open OUTPUT,">$filename.new" or die "Cannot create $filename";
2044 print "Building $filename\n";
2046 my $type = $filename;
2047 $type =~ s!.*/norm(\w+)\.nls!$1!;
2049 my $compose = $forms{$type} & 1;
2050 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2052 my @version = split /\./, $UNIVERSION;
2054 # combining classes
2056 my @classes;
2057 my @class_values;
2059 foreach my $c (grep defined, @combining_class_table)
2061 $classes[$c] = 1 if $c < 0x100;
2063 for (my $i = 0; $i < @classes; $i++)
2065 next unless defined $classes[$i];
2066 $classes[$i] = @class_values;
2067 push @class_values, $i;
2069 push @class_values, 0 if (@class_values % 2);
2070 die "too many classes" if @class_values >= 0x40;
2072 # character properties
2074 my @char_props;
2075 my @decomposed;
2076 my @comp_hash_table;
2077 my $comp_hash_size = $compose ? 254 : 0;
2079 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2081 next unless defined $combining_class_table[$i];
2082 if (defined $decomp{$type}->[$i])
2084 my @dec = get_decomposition( $i, $decomp{$type} );
2085 if ($compose && (my @comp = get_composition( $i, $compat )))
2087 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2088 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2090 my $val = 0;
2091 foreach my $d (@dec)
2093 $val = $combining_class_table[$d];
2094 last if $val;
2096 $char_props[$i] = $classes[$val];
2098 else
2100 $char_props[$i] = 0xbf;
2102 @dec = compose_hangul( @dec ) if $compose;
2103 @dec = to_utf16( @dec );
2104 push @dec, 0 if @dec >= 7;
2105 $decomposed[$i] = \@dec;
2107 else
2109 if ($combining_class_table[$i] == 0x100)
2111 $char_props[$i] = 0x7f;
2113 elsif ($combining_class_table[$i])
2115 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2117 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2119 $char_props[$i] = 0xff;
2121 else
2123 $char_props[$i] = 0;
2128 if ($compose)
2130 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2132 my @comp = get_composition( $i, $compat );
2133 next unless @comp;
2134 if ($combining_class_table[$comp[1]])
2136 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2137 $char_props[$comp[1]] |= 0x40;
2139 else
2141 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2142 $char_props[$comp[1]] |= 0xc0;
2147 # surrogates
2148 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2149 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2151 # Hangul
2152 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2153 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2154 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2156 # invalid chars
2157 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2158 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2159 foreach my $i (0x00..0x10)
2161 $char_props[($i << 16) | 0xfffe] = 0xff;
2162 $char_props[($i << 16) | 0xffff] = 0xff;
2165 # decomposition hash table
2167 my @decomp_hash_table;
2168 my @decomp_hash_index;
2169 my @decomp_hash_data;
2170 my $decomp_hash_size = 944;
2172 # build string of character data, reusing substrings when possible
2173 my $decomp_char_data = "";
2174 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2176 my $str = pack "U*", @{$i};
2177 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2179 for (my $i = 0; $i < @decomposed; $i++)
2181 next unless defined $decomposed[$i];
2182 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2183 die "sequence not found" if $pos == -1;
2184 my $len = @{$decomposed[$i]};
2185 $len = 7 if $len > 7;
2186 my $hash = $i % $decomp_hash_size;
2187 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2189 for (my $i = 0; $i < $decomp_hash_size; $i++)
2191 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2192 next unless defined $decomp_hash_table[$i];
2193 if (@{$decomp_hash_table[$i]} == 1)
2195 my $entry = $decomp_hash_table[$i]->[0];
2196 if ($char_props[$entry->[0]] == 0xbf)
2198 $decomp_hash_index[$i] = $entry->[1];
2199 next;
2202 foreach my $entry (@{$decomp_hash_table[$i]})
2204 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2207 push @decomp_hash_data, 0, 0;
2209 # composition hash table
2211 my @comp_hash_index;
2212 my @comp_hash_data;
2213 if (@comp_hash_table)
2215 for (my $i = 0; $i < $comp_hash_size; $i++)
2217 $comp_hash_index[$i] = @comp_hash_data;
2218 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2220 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2221 push @comp_hash_data, 0, 0, 0;
2224 my $level1 = ($MAX_CHAR + 1) / 128;
2225 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2227 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2228 0, $decomp_hash_size, $comp_hash_size, 0 );
2229 my @tables = (0) x 8;
2231 $tables[0] = 16 + @header + @tables;
2232 $tables[1] = $tables[0] + @class_values / 2;
2233 $tables[2] = $tables[1] + $level1 / 2;
2234 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2235 $tables[4] = $tables[3] + @decomp_hash_index;
2236 $tables[5] = $tables[4] + @decomp_hash_data;
2237 $tables[6] = $tables[5] + length $decomp_char_data;
2238 $tables[7] = $tables[6] + @comp_hash_index;
2240 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2241 print OUTPUT pack "S<*", @header;
2242 print OUTPUT pack "S<*", @tables;
2243 print OUTPUT pack "C*", @class_values;
2245 print OUTPUT pack "C*", @rows[0..$level1-1];
2246 print OUTPUT pack "C*", @rows[$level1..$#rows];
2247 print OUTPUT pack "S<*", @decomp_hash_index;
2248 print OUTPUT pack "S<*", @decomp_hash_data;
2249 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2250 print OUTPUT pack "S<*", @comp_hash_index;
2251 print OUTPUT pack "S<*", @comp_hash_data;
2253 close OUTPUT;
2254 save_file($filename);
2256 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2260 ################################################################
2261 # output a codepage definition file from the global tables
2262 sub output_codepage_file($)
2264 my $codepage = shift;
2266 my $output = sprintf "nls/c_%03d.nls", $codepage;
2267 open OUTPUT,">$output.new" or die "Cannot create $output";
2269 printf "Building %s\n", $output;
2270 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2271 else { dump_binary_dbcs_table( $codepage ); }
2273 close OUTPUT;
2274 save_file($output);
2276 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2279 ################################################################
2280 # output a codepage table from a Microsoft-style mapping file
2281 sub dump_msdata_codepage($)
2283 my $filename = shift;
2285 my $state = "";
2286 my ($codepage, $width, $count);
2287 my ($lb_cur, $lb_end);
2289 @cp2uni = ();
2290 @glyph2uni = ();
2291 @lead_bytes = ();
2292 @uni2cp = ();
2293 $default_char = $DEF_CHAR;
2294 $default_wchar = $DEF_CHAR;
2296 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2298 while (<$INPUT>)
2300 next if /^;/; # skip comments
2301 next if /^\s*$/; # skip empty lines
2302 next if /\x1a/; # skip ^Z
2303 last if /^ENDCODEPAGE/;
2305 if (/^CODEPAGE\s+(\d+)/)
2307 $codepage = $1;
2308 next;
2310 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2312 $width = $1;
2313 $default_char = hex $2;
2314 $default_wchar = hex $3;
2315 next;
2317 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2319 $state = $1;
2320 $count = $2;
2321 next;
2323 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2325 if ($state eq "MBTABLE")
2327 my $cp = hex $1;
2328 my $uni = hex $2;
2329 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2330 next;
2332 if ($state eq "GLYPHTABLE")
2334 my $cp = hex $1;
2335 my $uni = hex $2;
2336 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2337 next;
2339 if ($state eq "WCTABLE")
2341 my $uni = hex $1;
2342 my $cp = hex $2;
2343 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2344 next;
2346 if ($state eq "DBCSRANGE")
2348 my $start = hex $1;
2349 my $end = hex $2;
2350 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2351 $lb_cur = $start;
2352 $lb_end = $end;
2353 next;
2355 if ($state eq "DBCSTABLE")
2357 my $mb = hex $1;
2358 my $uni = hex $2;
2359 my $cp = ($lb_cur << 8) | $mb;
2360 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2361 if (!--$count)
2363 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2365 next;
2368 die "$filename: Unrecognized line $_\n";
2370 close $INPUT;
2372 output_codepage_file( $codepage );
2374 if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
2377 ################################################################
2378 # align a string length
2379 sub align_string($$)
2381 my ($align, $str) = @_;
2382 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2383 return $str;
2386 ################################################################
2387 # pack a GUID string
2388 sub pack_guid($)
2390 $_ = shift;
2391 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2392 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2395 ################################################################
2396 # comparison function for compression sort
2397 sub cmp_compression
2399 return scalar @{$a} <=> scalar @{$b} ||
2400 $a->[4] <=> $b->[4] ||
2401 $a->[5] <=> $b->[5] ||
2402 $a->[6] <=> $b->[6] ||
2403 $a->[7] <=> $b->[7] ||
2404 $a->[8] <=> $b->[8] ||
2405 $a->[9] <=> $b->[9] ||
2406 $a->[10] <=> $b->[10] ||
2407 $a->[11] <=> $b->[11] ||
2408 $a->[12] <=> $b->[12];
2411 ################################################################
2412 # build a binary sort keys table
2413 sub dump_sortkey_table($$)
2415 my ($filename, $download) = @_;
2417 my @keys;
2418 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2419 my @multiple_weights;
2420 my @expansions;
2421 my @compressions;
2422 my %exceptions;
2423 my %guids;
2424 my %compr_flags;
2425 my %locales;
2426 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2427 my $jamostr = "";
2429 my $re_hex = '0x[0-9A-Fa-f]+';
2430 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2431 $guids{$default_guid} = { };
2433 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2435 my $KEYS = open_data_file( $MSDATA, $download );
2437 printf "Building $filename\n";
2439 while (<$KEYS>)
2441 s/\s*;.*$//;
2442 next if /^\s*$/; # skip empty lines
2443 if (/^\s*(SORTKEY|SORTTABLES)/)
2445 $part = $1;
2446 next;
2448 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2450 $part = $section = "";
2451 next;
2453 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2455 $section = $1;
2456 $guid = undef;
2457 next;
2459 next unless $part;
2460 if ("$part.$section" eq "SORTKEY.DEFAULT")
2462 if (/^\s*($re_hex)\s+$re_key/)
2464 $keys[hex $1] = [ split(/\s+/,$2) ];
2465 next;
2468 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2470 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2472 $version = hex $1;
2473 next;
2475 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2477 # ignore for now
2478 next;
2481 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2482 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2483 "$part.$section" eq "SORTTABLES.INVERSECASING")
2485 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2487 $guid = lc $1;
2488 $guids{$guid} = { } unless defined $guids{$guid};
2489 $guids{$guid}->{flags} |= $flags{$section};
2490 next;
2492 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2494 $locales{$1} = $guid;
2495 next;
2498 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2500 if (/^\s*(\d+)\s+(\d+)/)
2502 push @multiple_weights, $1, $2;
2503 next;
2506 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2508 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2510 my $pos = scalar @expansions / 2;
2511 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2512 push @expansions, hex $2, hex $3;
2513 next;
2516 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2518 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2520 $keys[hex $1] = $keys[hex $2];
2521 next;
2524 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2526 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2528 if ($subsection || !$guid) # start a new one
2530 $guid = lc $1;
2531 $subsection = "";
2532 $guids{$guid} = { } unless defined $guids{$guid};
2533 $guids{$guid}->{flags} |= $flags{$2} if $2;
2534 $guids{$guid}->{compr} = @compressions;
2535 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2536 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2537 push @compressions, [ ];
2539 else # merge with current one
2541 $guids{lc $1} = { } unless defined $guids{lc $1};
2542 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2543 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2544 $compr_flags{lc $1} = $compr_flags{$guid};
2546 next;
2548 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2550 $locales{$1} = $guid;
2551 next;
2553 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2555 $subsection = $1;
2556 next;
2558 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2560 my @comp = map { hex $_; } split(/\s+/,$1);
2561 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2562 # add compression flags
2563 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2564 next;
2567 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2569 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2571 $guid = lc $1;
2572 $guids{$guid} = { } unless defined $guids{lc $1};
2573 $ling_flag = ($2 ? "+" : "-");
2574 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2575 next;
2577 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2579 $locales{$1} = $guid;
2580 next;
2582 if (/^\s*($re_hex)\s+$re_key/)
2584 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2585 next;
2588 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2590 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2592 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2593 next;
2596 die "$download: $part.$section: unrecognized line $_\n";
2598 close $KEYS;
2600 # Sortkey table
2602 my $table;
2603 for (my $i = 0; $i < 0x10000; $i++)
2605 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2606 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2609 foreach my $id (sort keys %exceptions)
2611 my $pos = length($table) / 4;
2612 my @exc = @{$exceptions{$id}};
2613 my @filled;
2614 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2615 my $guid = substr( $id, 0, -1 );
2616 $guids{$guid}->{$key} = $pos;
2617 $pos += 0x100;
2618 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2619 for (my $j = 0; $j < 0x10000; $j++)
2621 next unless defined $exc[$j] || defined $flags[$j];
2622 $filled[$j >> 8] = 1;
2623 $j |= 0xff;
2625 for (my $j = 0; $j < 0x100; $j++)
2627 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2628 $pos += 0x100 if $filled[$j];
2630 for (my $j = 0; $j < 0x10000; $j++)
2632 next unless $filled[$j >> 8];
2633 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2634 $k[3] |= $flags[$j] || 0;
2635 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2639 # Case mapping tables
2641 # standard table
2642 my @casemaps;
2643 my @upper = @toupper_table;
2644 my @lower = @tolower_table;
2645 remove_linguistic_mappings( \@upper, \@lower );
2646 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2648 # linguistic table
2649 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2651 # Turkish table
2652 @upper = @toupper_table;
2653 @lower = @tolower_table;
2654 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2655 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2656 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2657 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2659 # Char type table
2661 my @table;
2662 my $types = "";
2663 my %typestr;
2664 for (my $i = 0; $i < 0x10000; $i++)
2666 my $str = pack "S<3",
2667 ($category_table[$i] || 0) & 0xffff,
2668 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2669 ($category_table[$i] || 0) >> 16;
2671 if (!defined($typestr{$str}))
2673 $typestr{$str} = length($types) / 6;
2674 $types .= $str;
2676 $table[$i] = $typestr{$str};
2679 my @rows = compress_array( 4096, 0, @table[0..65535] );
2680 my @array = compress_array( 256, 0, @rows[0..4095] );
2681 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2682 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2684 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2685 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2686 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2688 # Sort tables
2690 # guids
2691 my $sorttables = pack "L<2", $version, scalar %guids;
2692 foreach my $id (sort keys %guids)
2694 my %guid = %{$guids{$id}};
2695 my $flags = $guid{flags} || 0;
2696 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2697 $sorttables .= pack_guid($id) . pack "L<5",
2698 $flags,
2699 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2700 $guid{except} || 0,
2701 $guid{ling_except} || 0,
2702 $map / 2;
2705 # expansions
2706 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2708 # compressions
2709 $sorttables .= pack "L<", scalar @compressions;
2710 my $rowstr = "";
2711 foreach my $c (@compressions)
2713 my $pos = length($rowstr) / 2;
2714 my $min = 0xffff;
2715 my $max = 0;
2716 my @lengths = (0) x 8;
2717 foreach my $r (sort cmp_compression @{$c})
2719 my @row = @{$r};
2720 $lengths[scalar @row - 6]++;
2721 foreach my $val (@row[4..$#row])
2723 $min = $val if $min > $val;
2724 $max = $val if $max < $val;
2726 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2727 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2729 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2731 $sorttables .= $rowstr;
2733 # multiple weights
2734 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2736 # jamo sort
2737 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2739 # Locales
2741 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2742 foreach my $loc (sort keys %locales)
2744 # skip specific locales that match more general ones
2745 my @parts = split /[-_]/, $loc;
2746 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2747 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2748 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2751 # File header
2753 my @header;
2754 $header[0] = 16;
2755 $header[1] = $header[0] + length $table;
2756 $header[2] = $header[1] + length $casemaps;
2757 $header[3] = $header[2] + length $chartypes;
2759 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2760 print OUTPUT pack "L<*", @header;
2761 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2762 close OUTPUT;
2763 save_file($filename);
2767 ################################################################
2768 # build the script to create registry keys
2769 sub dump_registry_script($%)
2771 my ($filename, %keys) = @_;
2772 my $indent = 1;
2774 printf "Building %s\n", $filename;
2775 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2776 print OUTPUT "HKLM\n{\n";
2777 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2779 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2780 $indent++;
2782 foreach my $k (sort keys %keys)
2784 my @subkeys = split /\\/, $k;
2785 my ($def, @vals) = @{$keys{$k}};
2786 for (my $i = 0; $i < @subkeys; $i++)
2788 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2789 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2790 $indent++;
2792 foreach my $v (sort @vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2793 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2795 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2796 close OUTPUT;
2797 save_file($filename);
2801 ################################################################
2802 # save a file if modified
2803 sub save_file($)
2805 my $file = shift;
2806 if (-f $file && !system "cmp $file $file.new >/dev/null")
2808 unlink "$file.new";
2810 else
2812 rename "$file.new", "$file";
2817 ################################################################
2818 # main routine
2820 chdir ".." if -f "./make_unicode";
2821 load_data();
2822 dump_sortkeys( "dlls/kernelbase/collation.c" );
2823 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2824 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2825 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2826 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2827 dump_mirroring( "dlls/dwrite/mirror.c" );
2828 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2829 dump_bracket( "dlls/dwrite/bracket.c" );
2830 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2831 dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
2832 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2833 dump_linebreak( "dlls/dwrite/linebreak.c" );
2834 dump_scripts( "dlls/dwrite/scripts" );
2835 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2836 dump_vertical( "dlls/win32u/vertical.c", 1 );
2837 dump_vertical( "dlls/wineps.drv/vertical.c", 0 );
2838 dump_intl_nls("nls/l_intl.nls");
2839 dump_norm_table( "nls/normnfc.nls" );
2840 dump_norm_table( "nls/normnfd.nls" );
2841 dump_norm_table( "nls/normnfkc.nls" );
2842 dump_norm_table( "nls/normnfkd.nls" );
2843 dump_norm_table( "nls/normidna.nls" );
2844 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2845 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2846 dump_eucjp_codepage();
2847 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2849 exit 0;
2851 # Local Variables:
2852 # compile-command: "./make_unicode"
2853 # End: