msvcp140/tests: Use wide-char string literals.
[wine.git] / tools / make_unicode
blob02bb0afdb6619176d056f6f1df514f607da49a02
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
24 # base URLs for www.unicode.org files
25 my $UNIVERSION = "13.0.0";
26 my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
28 my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
29 my $REPORTS = "http://www.unicode.org/reports";
30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
33 # Sort keys file
34 my $SORTKEYS = "tr10/allkeys.txt";
36 # Default char for undefined mappings
37 my $DEF_CHAR = ord '?';
39 # Last valid Unicode character
40 my $MAX_CHAR = 0x10ffff;
42 my @allfiles =
44 "CodpageFiles/037.txt",
45 "CodpageFiles/437.txt",
46 "CodpageFiles/500.txt",
47 "CodpageFiles/708.txt",
48 "CodpageFiles/737.txt",
49 "CodpageFiles/775.txt",
50 "CodpageFiles/850.txt",
51 "CodpageFiles/852.txt",
52 "CodpageFiles/855.txt",
53 "CodpageFiles/857.txt",
54 "CodpageFiles/860.txt",
55 "CodpageFiles/861.txt",
56 "CodpageFiles/862.txt",
57 "CodpageFiles/863.txt",
58 "CodpageFiles/864.txt",
59 "CodpageFiles/865.txt",
60 "CodpageFiles/866.txt",
61 "CodpageFiles/869.txt",
62 "CodpageFiles/874.txt",
63 "CodpageFiles/875.txt",
64 "CodpageFiles/932.txt",
65 "CodpageFiles/936.txt",
66 "CodpageFiles/949.txt",
67 "CodpageFiles/950.txt",
68 "CodpageFiles/1026.txt",
69 "CodpageFiles/1250.txt",
70 "CodpageFiles/1251.txt",
71 "CodpageFiles/1252.txt",
72 "CodpageFiles/1253.txt",
73 "CodpageFiles/1254.txt",
74 "CodpageFiles/1255.txt",
75 "CodpageFiles/1256.txt",
76 "CodpageFiles/1257.txt",
77 "CodpageFiles/1258.txt",
78 "CodpageFiles/1361.txt",
79 "CodpageFiles/10000.txt",
80 "CodpageFiles/10001.txt",
81 "CodpageFiles/10002.txt",
82 "CodpageFiles/10003.txt",
83 "CodpageFiles/10004.txt",
84 "CodpageFiles/10005.txt",
85 "CodpageFiles/10006.txt",
86 "CodpageFiles/10007.txt",
87 "CodpageFiles/10008.txt",
88 "CodpageFiles/10010.txt",
89 "CodpageFiles/10017.txt",
90 "CodpageFiles/10021.txt",
91 "CodpageFiles/10029.txt",
92 "CodpageFiles/10079.txt",
93 "CodpageFiles/10081.txt",
94 "CodpageFiles/10082.txt",
95 "CodpageFiles/20127.txt",
96 "CodpageFiles/20866.txt",
97 "CodpageFiles/21866.txt",
98 "CodpageFiles/28591.txt",
99 "CodpageFiles/28592.txt",
100 "CodpageFiles/28593.txt",
101 "CodpageFiles/28594.txt",
102 "CodpageFiles/28595.txt",
103 "CodpageFiles/28596.txt",
104 "CodpageFiles/28597.txt",
105 "CodpageFiles/28598.txt",
106 "CodpageFiles/28599.txt",
107 "CodpageFiles/28603.txt",
108 "CodpageFiles/28605.txt",
112 my %ctype =
114 # CT_CTYPE1
115 "upper" => 0x0001,
116 "lower" => 0x0002,
117 "digit" => 0x0004,
118 "space" => 0x0008,
119 "punct" => 0x0010,
120 "cntrl" => 0x0020,
121 "blank" => 0x0040,
122 "xdigit" => 0x0080,
123 "alpha" => 0x0100 | 0x80000000,
124 "defin" => 0x0200,
125 # CT_CTYPE3 in high 16 bits
126 "nonspacing" => 0x00010000,
127 "diacritic" => 0x00020000,
128 "vowelmark" => 0x00040000,
129 "symbol" => 0x00080000,
130 "katakana" => 0x00100000,
131 "hiragana" => 0x00200000,
132 "halfwidth" => 0x00400000,
133 "fullwidth" => 0x00800000,
134 "ideograph" => 0x01000000,
135 "kashida" => 0x02000000,
136 "lexical" => 0x04000000,
137 "highsurrogate" => 0x08000000,
138 "lowsurrogate" => 0x10000000,
141 my %bracket_types =
143 "o" => 0x0000,
144 "c" => 0x0001,
147 my %indic_types =
149 "Other" => 0x0000,
150 "Bindu" => 0x0001,
151 "Visarga" => 0x0002,
152 "Avagraha" => 0x0003,
153 "Nukta" => 0x0004,
154 "Virama" => 0x0005,
155 "Vowel_Independent" => 0x0006,
156 "Vowel_Dependent" => 0x0007,
157 "Vowel" => 0x0008,
158 "Consonant_Placeholder" => 0x0009,
159 "Consonant" => 0x000a,
160 "Consonant_Dead" => 0x000b,
161 "Consonant_Succeeding_Repha" => 0x000c,
162 "Consonant_Subjoined" => 0x000d,
163 "Consonant_Medial" => 0x000e,
164 "Consonant_Final" => 0x000f,
165 "Consonant_Head_Letter" => 0x0010,
166 "Modifying_Letter" => 0x0011,
167 "Tone_Letter" => 0x0012,
168 "Tone_Mark" => 0x0013,
169 "Register_Shifter" => 0x0014,
170 "Consonant_Preceding_Repha" => 0x0015,
171 "Pure_Killer" => 0x0016,
172 "Invisible_Stacker" => 0x0017,
173 "Gemination_Mark" => 0x0018,
174 "Cantillation_Mark" => 0x0019,
175 "Non_Joiner" => 0x001a,
176 "Joiner" => 0x001b,
177 "Number_Joiner" => 0x001c,
178 "Number" => 0x001d,
179 "Brahmi_Joining_Number" => 0x001e,
180 "Consonant_With_Stacker" => 0x001f,
181 "Consonant_Prefixed" => 0x0020,
182 "Syllable_Modifier" => 0x0021,
183 "Consonant_Killer" => 0x0022,
184 "Consonant_Initial_Postfixed" => 0x0023,
187 my %matra_types =
189 "Right" => 0x01,
190 "Left" => 0x02,
191 "Visual_Order_Left" => 0x03,
192 "Left_And_Right" => 0x04,
193 "Top" => 0x05,
194 "Bottom" => 0x06,
195 "Top_And_Bottom" => 0x07,
196 "Top_And_Right" => 0x08,
197 "Top_And_Left" => 0x09,
198 "Top_And_Left_And_Right" => 0x0a,
199 "Bottom_And_Right" => 0x0b,
200 "Top_And_Bottom_And_Right" => 0x0c,
201 "Overstruck" => 0x0d,
202 "Invisible" => 0x0e,
203 "Bottom_And_Left" => 0x0f,
204 "Top_And_Bottom_And_Left" => 0x10,
207 my %break_types =
209 "BK" => 0x0001,
210 "CR" => 0x0002,
211 "LF" => 0x0003,
212 "CM" => 0x0004,
213 "SG" => 0x0005,
214 "GL" => 0x0006,
215 "CB" => 0x0007,
216 "SP" => 0x0008,
217 "ZW" => 0x0009,
218 "NL" => 0x000a,
219 "WJ" => 0x000b,
220 "JL" => 0x000c,
221 "JV" => 0x000d,
222 "JT" => 0x000e,
223 "H2" => 0x000f,
224 "H3" => 0x0010,
225 "XX" => 0x0011,
226 "OP" => 0x0012,
227 "CL" => 0x0013,
228 "CP" => 0x0014,
229 "QU" => 0x0015,
230 "NS" => 0x0016,
231 "EX" => 0x0017,
232 "SY" => 0x0018,
233 "IS" => 0x0019,
234 "PR" => 0x001a,
235 "PO" => 0x001b,
236 "NU" => 0x001c,
237 "AL" => 0x001d,
238 "ID" => 0x001e,
239 "IN" => 0x001f,
240 "HY" => 0x0020,
241 "BB" => 0x0021,
242 "BA" => 0x0022,
243 "SA" => 0x0023,
244 "AI" => 0x0024,
245 "B2" => 0x0025,
246 "HL" => 0x0026,
247 "CJ" => 0x0027,
248 "RI" => 0x0028,
249 "EB" => 0x0029,
250 "EM" => 0x002a,
251 "ZWJ" => 0x002b,
254 my %vertical_types =
256 "R" => 0x0000,
257 "U" => 0x0001,
258 "Tr" => 0x0002,
259 "Tu" => 0x0003,
262 my %categories =
264 "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
265 "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
266 "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
267 "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
268 "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
269 "Me" => $ctype{"defin"}, # Mark, Enclosing
270 "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
271 "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
272 "No" => $ctype{"defin"}, # Number, Other
273 "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
274 "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
275 "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
276 "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
277 "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
278 "Cs" => $ctype{"defin"}, # Other, Surrogate
279 "Co" => $ctype{"defin"}, # Other, Private Use
280 "Cn" => $ctype{"defin"}, # Other, Not Assigned
281 "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
282 "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
283 "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
284 "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
285 "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
286 "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
287 "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
288 "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
289 "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
290 "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
291 "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
292 "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
293 "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
296 # a few characters need additional categories that cannot be determined automatically
297 my %special_categories =
299 "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
300 0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
301 "space" => [ 0x09..0x0d, 0x85 ],
302 "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
303 "cntrl" => [ 0x070f, 0x200c, 0x200d,
304 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
305 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
306 0xfff9, 0xfffa, 0xfffb ],
307 "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
308 0xd7, 0xf7 ],
309 "digit" => [ 0xb2, 0xb3, 0xb9 ],
310 "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
311 "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
312 0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
313 "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
314 "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
315 0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
316 0x02b9..0x02ba, 0x02c6..0x02cf ],
317 "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
318 "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
319 0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
320 0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
321 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
322 0x3131..0x3164 ],
323 "ideograph" => [ 0x3006..0x3007 ],
324 "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
325 0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
326 0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
327 0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
328 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
329 0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
330 0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
331 0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
332 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
333 0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
334 "kashida" => [ 0x0640 ],
337 my %directions =
339 "L" => 1, # Left-to-Right
340 "R" => 2, # Right-to-Left
341 "AL" => 12, # Right-to-Left Arabic
342 "EN" => 3, # European Number
343 "ES" => 4, # European Number Separator
344 "ET" => 5, # European Number Terminator
345 "AN" => 6, # Arabic Number
346 "CS" => 7, # Common Number Separator
347 "NSM" => 13, # Non-Spacing Mark
348 "BN" => 14, # Boundary Neutral
349 "B" => 8, # Paragraph Separator
350 "S" => 9, # Segment Separator
351 "WS" => 10, # Whitespace
352 "ON" => 11, # Other Neutrals
353 "LRE" => 15, # Left-to-Right Embedding
354 "LRO" => 15, # Left-to-Right Override
355 "RLE" => 15, # Right-to-Left Embedding
356 "RLO" => 15, # Right-to-Left Override
357 "PDF" => 15, # Pop Directional Format
358 "LRI" => 15, # Left-to-Right Isolate
359 "RLI" => 15, # Right-to-Left Isolate
360 "FSI" => 15, # First Strong Isolate
361 "PDI" => 15 # Pop Directional Isolate
364 my %c2_types =
366 "L" => 1, # C2_LEFTTORIGHT
367 "R" => 2, # C2_RIGHTTOLEFT
368 "AL" => 2, # C2_RIGHTTOLEFT
369 "EN" => 3, # C2_EUROPENUMBER
370 "ES" => 4, # C2_EUROPESEPARATOR
371 "ET" => 5, # C2_EUROPETERMINATOR
372 "AN" => 6, # C2_ARABICNUMBER
373 "CS" => 7, # C2_COMMONSEPARATOR
374 "NSM" => 11, # C2_OTHERNEUTRAL
375 "BN" => 0, # C2_NOTAPPLICABLE
376 "B" => 8, # C2_BLOCKSEPARATOR
377 "S" => 9, # C2_SEGMENTSEPARATOR
378 "WS" => 10, # C2_WHITESPACE
379 "ON" => 11, # C2_OTHERNEUTRAL
380 "LRE" => 11, # C2_OTHERNEUTRAL
381 "LRO" => 11, # C2_OTHERNEUTRAL
382 "RLE" => 11, # C2_OTHERNEUTRAL
383 "RLO" => 11, # C2_OTHERNEUTRAL
384 "PDF" => 11, # C2_OTHERNEUTRAL
385 "LRI" => 11, # C2_OTHERNEUTRAL
386 "RLI" => 11, # C2_OTHERNEUTRAL
387 "FSI" => 11, # C2_OTHERNEUTRAL
388 "PDI" => 11 # C2_OTHERNEUTRAL
391 my %bidi_types =
393 "ON" => 0, # Other Neutrals
394 "L" => 1, # Left-to-Right
395 "R" => 2, # Right-to-Left
396 "AN" => 3, # Arabic Number
397 "EN" => 4, # European Number
398 "AL" => 5, # Right-to-Left Arabic
399 "NSM" => 6, # Non-Spacing Mark
400 "CS" => 7, # Common Number Separator
401 "ES" => 8, # European Number Separator
402 "ET" => 9, # European Number Terminator
403 "BN" => 10, # Boundary Neutral
404 "S" => 11, # Segment Separator
405 "WS" => 12, # Whitespace
406 "B" => 13, # Paragraph Separator
407 "RLO" => 14, # Right-to-Left Override
408 "RLE" => 15, # Right-to-Left Embedding
409 "LRO" => 16, # Left-to-Right Override
410 "LRE" => 17, # Left-to-Right Embedding
411 "PDF" => 18, # Pop Directional Format
412 "LRI" => 19, # Left-to-Right Isolate
413 "RLI" => 20, # Right-to-Left Isolate
414 "FSI" => 21, # First Strong Isolate
415 "PDI" => 22 # Pop Directional Isolate
418 my %joining_types =
420 "U" => 0, # Non_Joining
421 "T" => 1, # Transparent
422 "R" => 2, # Right_Joining
423 "L" => 3, # Left_Joining
424 "D" => 4, # Dual_Joining
425 "C" => 5, # Join_Causing
428 my @cp2uni = ();
429 my @glyph2uni = ();
430 my @lead_bytes = ();
431 my @uni2cp = ();
432 my @tolower_table = ();
433 my @toupper_table = ();
434 my @digitmap_table = ();
435 my @category_table = ();
436 my @joining_table = ();
437 my @direction_table = ();
438 my @decomp_table = ();
439 my @combining_class_table = ();
440 my @decomp_compat_table = ();
441 my @comp_exclusions = ();
442 my @idna_decomp_table = ();
443 my @idna_disallowed = ();
444 my %registry_keys;
445 my $default_char;
446 my $default_wchar;
448 my %joining_forms =
450 "isolated" => [],
451 "final" => [],
452 "initial" => [],
453 "medial" => []
456 sub to_utf16(@)
458 my @ret;
459 foreach my $ch (@_)
461 if ($ch < 0x10000)
463 push @ret, $ch;
465 else
467 my $val = $ch - 0x10000;
468 push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
471 return @ret;
474 ################################################################
475 # fetch a unicode.org file and open it
476 sub open_data_file($$)
478 my ($base, $name) = @_;
479 my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
480 (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
481 my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
482 local *FILE;
484 if ($base =~ /.*\/([^\/]+)\.zip$/)
486 my $zip = "$1$suffix.zip";
487 unless (-f "$cache/$zip")
489 system "mkdir", "-p", $cache;
490 print "Fetching $base...\n";
491 !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
493 open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
495 else
497 (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
498 unless (-f $dest)
500 system "mkdir", "-p", $dir;
501 print "Fetching $base/$name...\n";
502 !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
504 open FILE, "<$dest" or die "cannot open $dest";
506 return *FILE;
509 ################################################################
510 # recursively get the decomposition for a character
511 sub get_decomposition($$);
512 sub get_decomposition($$)
514 my ($char, $table) = @_;
515 my @ret;
517 return $char unless defined $table->[$char];
518 foreach my $ch (@{$table->[$char]})
520 push @ret, get_decomposition( $ch, $table );
522 return @ret;
525 ################################################################
526 # get the composition that results in a given character
527 sub get_composition($$)
529 my ($ch, $compat) = @_;
530 return () unless defined $decomp_table[$ch]; # no decomposition
531 my @ret = @{$decomp_table[$ch]};
532 return () if @ret < 2; # singleton decomposition
533 return () if $comp_exclusions[$ch]; # composition exclusion
534 return () if $combining_class_table[$ch]; # non-starter
535 return () if $combining_class_table[$ret[0]]; # first char is non-starter
536 return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
537 defined $decomp_compat_table[$ret[0]]; # first char has compat decomposition
538 return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
539 defined $idna_decomp_table[$ret[0]]; # first char has IDNA decomposition
540 return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
541 defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]]; # first char's decomposition has IDNA decomposition
542 return () if $compat == 2 && defined $idna_decomp_table[$ret[1]]; # second char has IDNA decomposition
543 return @ret;
546 ################################################################
547 # recursively build decompositions
548 sub build_decompositions(@)
550 my @src = @_;
551 my @dst;
553 for (my $i = 0; $i < @src; $i++)
555 next unless defined $src[$i];
556 my @decomp = to_utf16( get_decomposition( $i, \@src ));
557 $dst[$i] = \@decomp;
559 return @dst;
562 ################################################################
563 # compose Hangul sequences
564 sub compose_hangul(@)
566 my $SBASE = 0xac00;
567 my $LBASE = 0x1100;
568 my $VBASE = 0x1161;
569 my $TBASE = 0x11a7;
570 my $LCOUNT = 19;
571 my $VCOUNT = 21;
572 my $TCOUNT = 28;
573 my $NCOUNT = $VCOUNT * $TCOUNT;
574 my $SCOUNT = $LCOUNT * $NCOUNT;
576 my @seq = @_;
577 my @ret;
578 my $i;
580 for ($i = 0; $i < @seq; $i++)
582 my $ch = $seq[$i];
583 if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
584 $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
586 $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
587 $i++;
589 if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
590 $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
592 $ch += $seq[$i+1] - $TBASE;
593 $i++;
595 push @ret, $ch;
597 return @ret;
600 ################################################################
601 # remove linguistic-only mappings from the case table
602 sub remove_linguistic_mappings($$)
604 my ($upper, $lower) = @_;
606 # remove case mappings that don't round-trip
608 for (my $i = 0; $i < @{$upper}; $i++)
610 next unless defined ${$upper}[$i];
611 my $ch = ${$upper}[$i];
612 ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
614 for (my $i = 0; $i < @{$lower}; $i++)
616 next unless defined ${$lower}[$i];
617 my $ch = ${$lower}[$i];
618 ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
622 ################################################################
623 # read in the Unicode database files
624 sub load_data()
626 my $start;
628 # now build mappings from the decomposition field of the Unicode database
630 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
631 while (<$UNICODE_DATA>)
633 # Decode the fields ...
634 my ($code, $name, $cat, $comb, $bidi,
635 $decomp, $dec, $dig, $num, $mirror,
636 $oldname, $comment, $upper, $lower, $title) = split /;/;
637 my $src = hex $code;
639 die "unknown category $cat" unless defined $categories{$cat};
640 die "unknown directionality $bidi" unless defined $directions{$bidi};
642 $category_table[$src] = $categories{$cat};
643 $direction_table[$src] = $bidi;
644 $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
646 if ($lower ne "")
648 $tolower_table[$src] = hex $lower;
650 if ($upper ne "")
652 $toupper_table[$src] = hex $upper;
654 if ($dec ne "")
656 $category_table[$src] |= $ctype{"digit"};
658 if ($dig ne "")
660 $digitmap_table[$src] = ord $dig;
662 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
664 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
665 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
666 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
667 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
668 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
669 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
670 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
671 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
672 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
673 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
674 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
675 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
677 # copy the category and direction for everything between First/Last pairs
678 if ($name =~ /, First>/) { $start = $src; }
679 if ($name =~ /, Last>/)
681 while ($start < $src)
683 $category_table[$start] = $category_table[$src];
684 $direction_table[$start] = $direction_table[$src];
685 $combining_class_table[$start] = $combining_class_table[$src];
686 $start++;
690 next if $decomp eq ""; # no decomposition, skip it
692 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
694 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
695 $decomp_compat_table[$src] = \@seq;
698 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
700 # decomposition of the form "<foo> 1234" -> use char if type is known
701 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
703 ${joining_forms{$1}}[hex $2] = $src;
706 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
708 # decomposition "<compat> 0020 1234" -> combining accent
710 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
712 # store decomposition
713 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
715 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
717 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
719 # Single char decomposition
720 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
724 close $UNICODE_DATA;
726 # patch the category of some special characters
728 for (my $i = 0; $i < @decomp_table; $i++)
730 next unless defined $decomp_table[$i];
731 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
733 foreach my $cat (keys %special_categories)
735 my $flag = $ctype{$cat};
736 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
738 for (my $i = 0; $i < @decomp_compat_table; $i++)
740 next unless defined $decomp_compat_table[$i];
741 next unless @{$decomp_compat_table[$i]} == 2;
742 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
745 # load the composition exclusions
747 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
748 while (<$EXCL>)
750 s/\#.*//; # remove comments
751 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
753 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
755 elsif (/^([0-9a-fA-F]+)\s*$/)
757 $comp_exclusions[hex $1] = 1;
760 close $EXCL;
762 # load the IDNA mappings
764 @idna_decomp_table = @decomp_compat_table;
765 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
766 while (<$IDNA>)
768 s/\#.*//; # remove comments
769 next if /^\s*$/;
770 my ($char, $type, $mapping) = split /;/;
771 my ($ch1, $ch2);
772 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
774 $ch1 = hex $1;
775 $ch2 = hex $2;
777 elsif ($char =~ /([0-9a-fA-F]+)/)
779 $ch1 = $ch2 = hex $1;
782 if ($type =~ /mapped/ || $type =~ /deviation/)
784 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
785 my @seq = map { hex $_; } split /\s+/, $mapping;
786 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
788 elsif ($type =~ /valid/)
791 elsif ($type =~ /ignored/)
793 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
795 elsif ($type =~ /disallowed/)
797 foreach my $i ($ch1 .. $ch2)
799 $idna_decomp_table[$i] = undef;
800 $idna_disallowed[$i] = 1;
804 close $IDNA;
808 ################################################################
809 # add a new registry key
810 sub add_registry_key($$)
812 my ($key, $defval) = @_;
813 $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
816 ################################################################
817 # add a new registry value
818 sub add_registry_value($$$)
820 my ($key, $name, $value) = @_;
821 add_registry_key( $key, undef );
822 push @{$registry_keys{$key}}, "'$name' = s '$value'";
825 ################################################################
826 # define a new lead byte
827 sub add_lead_byte($)
829 my $ch = shift;
830 return if defined $cp2uni[$ch];
831 push @lead_bytes, $ch;
832 $cp2uni[$ch] = 0;
835 ################################################################
836 # define a new char mapping
837 sub add_mapping($$)
839 my ($cp, $uni) = @_;
840 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
841 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
842 if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
845 ################################################################
846 # get a mapping including glyph chars for MB_USEGLYPHCHARS
847 sub get_glyphs_mapping(@)
849 my @table = @_;
851 for (my $i = 0; $i < @glyph2uni; $i++)
853 $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
855 return @table;
858 ################################################################
859 # build EUC-JP table from the JIS 0208/0212 files
860 sub dump_eucjp_codepage()
862 @cp2uni = ();
863 @glyph2uni = ();
864 @lead_bytes = ();
865 @uni2cp = ();
866 $default_char = $DEF_CHAR;
867 $default_wchar = 0x30fb;
869 # ASCII chars
870 foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
872 # lead bytes
873 foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
875 # JIS X 0201 right plane
876 foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
878 # undefined chars
879 foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
880 $cp2uni[0xa0] = 0xf8f0;
881 $cp2uni[0xff] = 0xf8f3;
883 # Fix backslash conversion
884 add_mapping( 0xa1c0, 0xff3c );
886 # Add private mappings for rows undefined in JIS 0208/0212
887 my $private = 0xe000;
888 foreach my $hi (0xf5 .. 0xfe)
890 foreach my $lo (0xa1 .. 0xfe)
892 add_mapping( ($hi << 8) + $lo, $private++ );
895 foreach my $hi (0xf5 .. 0xfe)
897 foreach my $lo (0x21 .. 0x7e)
899 add_mapping( ($hi << 8) + $lo, $private++ );
903 my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
904 while (<$INPUT>)
906 next if /^\#/; # skip comments
907 next if /^$/; # skip empty lines
908 next if /\x1a/; # skip ^Z
909 if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
911 add_mapping( 0x8080 + hex $1, hex $2 );
912 next;
914 die "Unrecognized line $_\n";
916 close $INPUT;
918 $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
919 while (<$INPUT>)
921 next if /^\#/; # skip comments
922 next if /^$/; # skip empty lines
923 next if /\x1a/; # skip ^Z
924 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
926 add_mapping( 0x8000 + hex $1, hex $2 );
927 next;
929 die "Unrecognized line $_\n";
931 close $INPUT;
933 output_codepage_file( 20932 );
937 ################################################################
938 # build the sort keys table
939 sub dump_sortkeys($)
941 my $filename = shift;
942 my @sortkeys = ();
944 my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
945 while (<$INPUT>)
947 next if /^\#/; # skip comments
948 next if /^$/; # skip empty lines
949 next if /\x1a/; # skip ^Z
950 next if /^\@version/; # skip @version header
951 if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
953 my ($uni,$variable) = (hex $1, $2);
954 next if $uni > 65535;
955 $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
956 next;
958 if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
960 # multiple character sequence, ignored for now
961 next;
963 die "$SORTKEYS: Unrecognized line $_\n";
965 close $INPUT;
967 # compress the keys to 32 bit:
968 # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
970 @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
971 ${$a}[2] <=> ${$b}[2] or
972 ${$a}[3] <=> ${$b}[3] or
973 ${$a}[4] <=> ${$b}[4] or
974 $a cmp $b; } @sortkeys;
976 my ($n2, $n3) = (1, 1);
977 my @keys = (-1, -1, -1, -1, -1 );
978 my @flatkeys = ();
980 for (my $i = 0; $i < @sortkeys; $i++)
982 next unless defined $sortkeys[$i];
983 my @current = @{$sortkeys[$i]};
984 if ($current[1] == $keys[1])
986 if ($current[2] == $keys[2])
988 if ($current[3] == $keys[3])
990 # nothing
992 else
994 $keys[3] = $current[3];
995 $n3++;
996 die if ($n3 >= 16);
999 else
1001 $keys[2] = $current[2];
1002 $keys[3] = $current[3];
1003 $n2++;
1004 $n3 = 1;
1005 die if ($n2 >= 256);
1008 else
1010 $keys[1] = $current[1];
1011 $keys[2] = $current[2];
1012 $keys[3] = $current[3];
1013 $n2 = 1;
1014 $n3 = 1;
1017 if ($current[2]) { $current[2] = $n2; }
1018 if ($current[3]) { $current[3] = $n3; }
1019 if ($current[4]) { $current[4] = 1; }
1021 $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1024 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1025 printf "Building $filename\n";
1026 printf OUTPUT "/* Unicode collation element table */\n";
1027 printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1028 printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1029 print OUTPUT "#include \"windef.h\"\n\n";
1031 dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1033 close OUTPUT;
1034 save_file($filename);
1038 ################################################################
1039 # dump an array of integers
1040 sub dump_array($$@)
1042 my ($bit_width, $default, @array) = @_;
1043 my $format = sprintf "0x%%0%ux", $bit_width / 4;
1044 my $i;
1045 my $ret = " ";
1046 for ($i = 0; $i < $#array; $i++)
1048 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1049 $ret .= (($i % 8) != 7) ? ", " : ",\n ";
1051 $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1052 return $ret;
1056 ################################################################
1057 # dump an SBCS mapping table in binary format
1058 sub dump_binary_sbcs_table($)
1060 my $codepage = shift;
1062 my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1063 my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1065 print OUTPUT pack "S<*", @header;
1066 print OUTPUT pack "C12", (0) x 12;
1067 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1069 if (@glyph2uni)
1071 print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1073 else
1075 print OUTPUT pack "S<*", 0;
1078 print OUTPUT pack "S<*", 0, 0;
1080 print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1084 ################################################################
1085 # dump a DBCS mapping table in binary format
1086 sub dump_binary_dbcs_table($)
1088 my $codepage = shift;
1089 my @lb_ranges = get_lb_ranges();
1090 my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1092 my @offsets = (0) x 256;
1093 my $pos = 0;
1094 foreach my $i (@lead_bytes)
1096 $offsets[$i] = ($pos += 256);
1097 $cp2uni[$i] = 0;
1100 my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1102 print OUTPUT pack "S<*", @header;
1103 print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1104 print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1105 print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1107 foreach my $i (@lead_bytes)
1109 my $base = $i << 8;
1110 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1113 print OUTPUT pack "S<", 4;
1114 print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1118 ################################################################
1119 # get the list of defined lead byte ranges
1120 sub get_lb_ranges()
1122 my @list = ();
1123 my @ranges = ();
1125 foreach my $i (@lead_bytes) { $list[$i] = 1; }
1126 my $on = 0;
1127 for (my $i = 0; $i < 256; $i++)
1129 if ($on)
1131 if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1133 else
1135 if ($list[$i]) { push @ranges, $i; $on = 1; }
1138 if ($on) { push @ranges, 0xff; }
1139 return @ranges;
1142 ################################################################
1143 # dump the Indic Syllabic Category table
1144 sub dump_indic($)
1146 my $filename = shift;
1147 my @indic_table;
1149 my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1150 while (<$INPUT>)
1152 next if /^\#/; # skip comments
1153 next if /^\s*$/; # skip empty lines
1154 next if /\x1a/; # skip ^Z
1155 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1157 my $type = $2;
1158 die "unknown indic $type" unless defined $indic_types{$type};
1159 if (hex $1 < 65536)
1161 $indic_table[hex $1] = $indic_types{$type};
1163 next;
1165 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1167 my $type = $3;
1168 die "unknown indic $type" unless defined $indic_types{$type};
1169 if (hex $1 < 65536 and hex $2 < 65536)
1171 foreach my $i (hex $1 .. hex $2)
1173 $indic_table[$i] = $indic_types{$type};
1176 next;
1178 die "malformed line $_";
1180 close $INPUT;
1182 $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1183 while (<$INPUT>)
1185 next if /^\#/; # skip comments
1186 next if /^\s*$/; # skip empty lines
1187 next if /\x1a/; # skip ^Z
1188 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1190 my $type = $2;
1191 die "unknown matra $type" unless defined $matra_types{$type};
1192 $indic_table[hex $1] |= $matra_types{$type} << 8;
1193 next;
1195 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1197 my $type = $3;
1198 die "unknown matra $type" unless defined $matra_types{$type};
1199 foreach my $i (hex $1 .. hex $2)
1201 $indic_table[$i] |= $matra_types{$type} << 8;
1203 next;
1205 die "malformed line $_";
1207 close $INPUT;
1209 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1210 print "Building $filename\n";
1211 print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1212 print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1213 print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
1214 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1215 print OUTPUT "#include \"windef.h\"\n\n";
1217 dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1219 close OUTPUT;
1220 save_file($filename);
1223 ################################################################
1224 # dump the Line Break Properties table
1225 sub dump_linebreak($)
1227 my $filename = shift;
1228 my @break_table;
1229 my $next_group = 0;
1231 my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1232 while (<$INPUT>)
1234 next if /^\#/; # skip comments
1235 next if /^\s*$/; # skip empty lines
1236 next if /\x1a/; # skip ^Z
1237 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1239 my $type = $2;
1240 die "unknown breaktype $type" unless defined $break_types{$type};
1241 $break_table[hex $1] = $break_types{$type};
1242 next;
1244 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1246 my $type = $3;
1247 die "unknown breaktype $type" unless defined $break_types{$type};
1248 foreach my $i (hex $1 .. hex $2)
1250 $break_table[$i] = $break_types{$type};
1252 next;
1254 elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1256 my $type = $2;
1257 die "unknown breaktype $type" unless defined $break_types{$type};
1258 $break_table[hex $1] = $break_types{$type};
1259 next;
1261 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1263 my $type = $3;
1264 die "unknown breaktype $type" unless defined $break_types{$type};
1265 foreach my $i (hex $1 .. hex $2)
1267 $break_table[$i] = $break_types{$type};
1269 next;
1271 die "malformed line $_";
1273 close $INPUT;
1275 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1276 print "Building $filename\n";
1277 print OUTPUT "/* Unicode Line Break Properties */\n";
1278 print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1279 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1280 print OUTPUT "#include \"windef.h\"\n\n";
1282 dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1284 close OUTPUT;
1285 save_file($filename);
1288 my %scripts =
1290 "Unknown" => 0,
1291 "Common" => 1,
1292 "Inherited" => 2,
1293 "Arabic" => 3,
1294 "Armenian" => 4,
1295 "Avestan" => 5,
1296 "Balinese" => 6,
1297 "Bamum" => 7,
1298 "Batak" => 8,
1299 "Bengali" => 9,
1300 "Bopomofo" => 10,
1301 "Brahmi" => 11,
1302 "Braille" => 12,
1303 "Buginese" => 13,
1304 "Buhid" => 14,
1305 "Canadian_Aboriginal" => 15,
1306 "Carian" => 16,
1307 "Cham" => 17,
1308 "Cherokee" => 18,
1309 "Coptic" => 19,
1310 "Cuneiform" => 20,
1311 "Cypriot" => 21,
1312 "Cyrillic" => 22,
1313 "Deseret" => 23,
1314 "Devanagari" => 24,
1315 "Egyptian_Hieroglyphs" => 25,
1316 "Ethiopic" => 26,
1317 "Georgian" => 27,
1318 "Glagolitic" => 28,
1319 "Gothic" => 29,
1320 "Greek" => 30,
1321 "Gujarati" => 31,
1322 "Gurmukhi" => 32,
1323 "Han" => 33,
1324 "Hangul" => 34,
1325 "Hanunoo" => 35,
1326 "Hebrew" => 36,
1327 "Hiragana" => 37,
1328 "Imperial_Aramaic" => 38,
1329 "Inscriptional_Pahlavi" => 39,
1330 "Inscriptional_Parthian" => 40,
1331 "Javanese" => 41,
1332 "Kaithi" => 42,
1333 "Kannada" => 43,
1334 "Katakana" => 44,
1335 "Kayah_Li" => 45,
1336 "Kharoshthi" => 46,
1337 "Khmer" => 47,
1338 "Lao" => 48,
1339 "Latin" => 49,
1340 "Lepcha" => 50,
1341 "Limbu" => 51,
1342 "Linear_B" => 52,
1343 "Lisu" => 53,
1344 "Lycian" => 54,
1345 "Lydian" => 55,
1346 "Malayalam" => 56,
1347 "Mandaic" => 57,
1348 "Meetei_Mayek" => 58,
1349 "Mongolian" => 59,
1350 "Myanmar" => 60,
1351 "New_Tai_Lue" => 61,
1352 "Nko" => 62,
1353 "Ogham" => 63,
1354 "Ol_Chiki" => 64,
1355 "Old_Italic" => 65,
1356 "Old_Persian" => 66,
1357 "Old_South_Arabian" => 67,
1358 "Old_Turkic" => 68,
1359 "Oriya" => 69,
1360 "Osmanya" => 70,
1361 "Phags_Pa" => 71,
1362 "Phoenician" => 72,
1363 "Rejang" => 73,
1364 "Runic" => 74,
1365 "Samaritan" => 75,
1366 "Saurashtra" => 76,
1367 "Shavian" => 77,
1368 "Sinhala" => 78,
1369 "Sundanese" => 79,
1370 "Syloti_Nagri" => 80,
1371 "Syriac" => 81,
1372 "Tagalog" => 82,
1373 "Tagbanwa" => 83,
1374 "Tai_Le" => 84,
1375 "Tai_Tham" => 85,
1376 "Tai_Viet" => 86,
1377 "Tamil" => 87,
1378 "Telugu" => 88,
1379 "Thaana" => 89,
1380 "Thai" => 90,
1381 "Tibetan" => 91,
1382 "Tifinagh" => 92,
1383 "Ugaritic" => 93,
1384 "Vai" => 94,
1385 "Yi" => 95,
1386 # Win8/Win8.1
1387 "Chakma" => 96,
1388 "Meroitic_Cursive" => 97,
1389 "Meroitic_Hieroglyphs" => 98,
1390 "Miao" => 99,
1391 "Sharada" => 100,
1392 "Sora_Sompeng" => 101,
1393 "Takri" => 102,
1394 # Win10
1395 "Bassa_Vah" => 103,
1396 "Caucasian_Albanian" => 104,
1397 "Duployan" => 105,
1398 "Elbasan" => 106,
1399 "Grantha" => 107,
1400 "Khojki" => 108,
1401 "Khudawadi" => 109,
1402 "Linear_A" => 110,
1403 "Mahajani" => 111,
1404 "Manichaean" => 112,
1405 "Mende_Kikakui" => 113,
1406 "Modi" => 114,
1407 "Mro" => 115,
1408 "Nabataean" => 116,
1409 "Old_North_Arabian" => 117,
1410 "Old_Permic" => 118,
1411 "Pahawh_Hmong" => 119,
1412 "Palmyrene" => 120,
1413 "Pau_Cin_Hau" => 121,
1414 "Psalter_Pahlavi" => 122,
1415 "Siddham" => 123,
1416 "Tirhuta" => 124,
1417 "Warang_Citi" => 125,
1418 # Win10 RS1
1419 "Adlam" => 126,
1420 "Ahom" => 127,
1421 "Anatolian_Hieroglyphs" => 128,
1422 "Bhaiksuki" => 129,
1423 "Hatran" => 130,
1424 "Marchen" => 131,
1425 "Multani" => 132,
1426 "Newa" => 133,
1427 "Old_Hungarian" => 134,
1428 "Osage" => 135,
1429 "SignWriting" => 136,
1430 "Tangut" => 137,
1431 # Win10 RS4
1432 "Masaram_Gondi" => 138,
1433 "Nushu" => 139,
1434 "Soyombo" => 140,
1435 "Zanabazar_Square" => 141,
1436 # Win10 1903
1437 "Dogra" => 142,
1438 "Gunjala_Gondi" => 143,
1439 "Hanifi_Rohingya" => 144,
1440 "Makasar" => 145,
1441 "Medefaidrin" => 146,
1442 "Old_Sogdian" => 147,
1443 "Sogdian" => 148,
1444 # Win10 2004
1445 "Elymaic" => 149,
1446 "Nyiakeng_Puachue_Hmong" => 150,
1447 "Nandinagari" => 151,
1448 "Wancho" => 152,
1451 ################################################################
1452 # dump Script IDs table
1453 sub dump_scripts($)
1455 my $filename = shift;
1456 my $header = $filename;
1457 my @scripts_table;
1458 my $script_index;
1459 my $i;
1461 my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1462 # Fill the table
1463 # Unknown script id is always 0, so undefined scripts are automatically treated as such
1464 while (<$INPUT>)
1466 my $type = "";
1468 next if /^\#/; # skip comments
1469 next if /^\s*$/; # skip empty lines
1470 next if /\x1a/; # skip ^Z
1471 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1473 $type = $2;
1474 if (defined $scripts{$type})
1476 $scripts_table[hex $1] = $scripts{$type};
1478 next;
1480 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1482 $type = $3;
1483 if (defined $scripts{$type})
1485 foreach my $i (hex $1 .. hex $2)
1487 $scripts_table[$i] = $scripts{$type};
1490 next;
1494 close $INPUT;
1496 $header = "$filename.h";
1497 open OUTPUT,">$header.new" or die "Cannot create $header";
1498 print "Building $header\n";
1499 print OUTPUT "/* Unicode Script IDs */\n";
1500 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1501 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1503 print OUTPUT "enum unicode_script_id {\n";
1504 foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1506 print OUTPUT " Script_$script = $scripts{$script},\n";
1508 print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1509 print OUTPUT "};\n";
1511 close OUTPUT;
1512 save_file($header);
1514 $filename = "$filename.c";
1515 open OUTPUT,">$filename.new" or die "Cannot create $header";
1516 print "Building $filename\n";
1517 print OUTPUT "/* Unicode Script IDs */\n";
1518 print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1519 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1520 print OUTPUT "#include \"windef.h\"\n\n";
1522 dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1523 close OUTPUT;
1524 save_file($filename);
1527 ################################################################
1528 # dump the BiDi mirroring table
1529 sub dump_mirroring($)
1531 my $filename = shift;
1532 my @mirror_table = ();
1534 my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1535 while (<$INPUT>)
1537 next if /^\#/; # skip comments
1538 next if /^$/; # skip empty lines
1539 next if /\x1a/; # skip ^Z
1540 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1542 $mirror_table[hex $1] = hex $2;
1543 next;
1545 die "malformed line $_";
1547 close $INPUT;
1549 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1550 print "Building $filename\n";
1551 print OUTPUT "/* Unicode BiDi mirroring */\n";
1552 print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1553 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1554 print OUTPUT "#include \"windef.h\"\n\n";
1555 dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1556 close OUTPUT;
1557 save_file($filename);
1560 ################################################################
1561 # dump the Bidi Brackets
1562 sub dump_bracket($)
1564 my $filename = shift;
1565 my @bracket_table;
1567 my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1568 while (<$INPUT>)
1570 next if /^\#/; # skip comments
1571 next if /^\s*$/; # skip empty lines
1572 next if /\x1a/; # skip ^Z
1573 if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1575 my $type = $3;
1576 die "unknown bracket $type" unless defined $bracket_types{$type};
1577 die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1578 $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1579 $bracket_table[hex $1] += $bracket_types{$type} << 8;
1580 next;
1582 die "malformed line $_";
1584 close $INPUT;
1586 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1587 print "Building $filename\n";
1588 print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1589 print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1590 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1591 print OUTPUT "#include \"windef.h\"\n\n";
1593 dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1595 close OUTPUT;
1596 save_file($filename);
1599 ################################################################
1600 # dump the Arabic shaping table
1601 sub dump_shaping($)
1603 my $filename = shift;
1604 my %groups;
1605 my $next_group = 0;
1607 $groups{"No_Joining_Group"} = $next_group++;
1609 my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1610 while (<$INPUT>)
1612 next if /^\#/; # skip comments
1613 next if /^\s*$/; # skip empty lines
1614 next if /\x1a/; # skip ^Z
1615 if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1617 my $type = $2;
1618 my $group = $3;
1619 $groups{$group} = $next_group++ unless defined $groups{$group};
1620 $joining_table[hex $1] = $joining_types{$type} | ($groups{$group} << 8);
1621 next;
1623 die "malformed line $_";
1625 close $INPUT;
1627 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1628 print "Building $filename\n";
1629 print OUTPUT "/* Unicode Arabic shaping */\n";
1630 print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1631 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1632 print OUTPUT "#include \"windef.h\"\n\n";
1634 dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1636 print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1637 for (my $i = 0x600; $i <= 0x6ff; $i++)
1639 printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1640 ${joining_forms{"isolated"}}[$i] || $i,
1641 ${joining_forms{"final"}}[$i] || $i,
1642 ${joining_forms{"initial"}}[$i] || $i,
1643 ${joining_forms{"medial"}}[$i] || $i;
1645 print OUTPUT "};\n";
1647 close OUTPUT;
1648 save_file($filename);
1651 ################################################################
1652 # dump the Vertical Orientation table
1653 sub dump_vertical($)
1655 my $filename = shift;
1656 my @vertical_table;
1658 my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1659 while (<$INPUT>)
1661 next if /^\#/; # skip comments
1662 next if /^\s*$/; # skip empty lines
1663 next if /\x1a/; # skip ^Z
1664 if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1666 my $type = $2;
1667 die "unknown vertical $type" unless defined $vertical_types{$type};
1668 if (hex $1 < 65536)
1670 $vertical_table[hex $1] = $vertical_types{$type};
1672 next;
1674 elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1676 my $type = $3;
1677 die "unknown vertical $type" unless defined $vertical_types{$type};
1678 foreach my $i (hex $1 .. hex $2)
1680 $vertical_table[$i] = $vertical_types{$type};
1682 next;
1684 die "malformed line $_";
1686 close $INPUT;
1688 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1689 print "Building $filename\n";
1690 print OUTPUT "/* Unicode Vertical Orientation */\n";
1691 print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1692 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1693 print OUTPUT "#include \"windef.h\"\n\n";
1695 dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1697 close OUTPUT;
1698 save_file($filename);
1701 ################################################################
1702 # dump the digit folding tables
1703 sub dump_digit_folding($)
1705 my ($filename) = shift;
1706 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1707 print "Building $filename\n";
1708 print OUTPUT "/* Unicode digit folding mappings */\n";
1709 print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1710 print OUTPUT "/* DO NOT EDIT!! */\n\n";
1711 print OUTPUT "#include \"windef.h\"\n\n";
1713 dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1714 close OUTPUT;
1715 save_file($filename);
1719 ################################################################
1720 # compress a mapping table by removing identical rows
1721 sub compress_array($$@)
1723 my $rows = shift;
1724 my $def = shift;
1725 my @table = @_;
1726 my $len = @table / $rows;
1727 my @array;
1728 my $data = "";
1730 # try to merge table rows
1731 for (my $row = 0; $row < $rows; $row++)
1733 my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1734 my $pos = index $data, $rowtxt;
1735 if ($pos == -1)
1737 # check if the tail of the data can match the start of the new row
1738 my $first = substr( $rowtxt, 0, 1 );
1739 for (my $i = length($data) - 1; $i > 0; $i--)
1741 $pos = index( substr( $data, -$i ), $first );
1742 last if $pos == -1;
1743 $i -= $pos;
1744 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1745 substr( $data, -$i ) = "";
1746 last;
1748 $pos = length $data;
1749 $data .= $rowtxt;
1751 $array[$row] = $rows + $pos;
1753 return @array, unpack "U*", $data;
1756 ################################################################
1757 # dump a char -> 16-bit value mapping table using two-level tables
1758 sub dump_two_level_mapping($$@)
1760 my $name = shift;
1761 my $def = shift;
1762 my $size = shift;
1763 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1764 my @row_array = compress_array( 4096, $def, @_[0..65535] );
1765 my @array = compress_array( 256, 0, @row_array[0..4095] );
1767 for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1769 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1770 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1771 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1772 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1775 ################################################################
1776 # dump a char -> value mapping table using three-level tables
1777 sub dump_three_level_mapping($$@)
1779 my $name = shift;
1780 my $def = shift;
1781 my $size = shift;
1782 my $type = $size == 16 ? "unsigned short" : "unsigned int";
1783 my $level3 = ($MAX_CHAR + 1) / 16;
1784 my $level2 = $level3 / 16;
1785 my $level1 = $level2 / 16;
1786 my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1787 my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1788 my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1790 for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1791 for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1793 printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1794 printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1795 printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1796 printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1797 printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1800 ################################################################
1801 # dump a binary case mapping table in l_intl.nls format
1802 sub dump_binary_case_table(@)
1804 my (@table) = @_;
1805 my $max_char = 0x10000;
1806 my $level1 = $max_char / 16;
1807 my $level2 = $level1 / 16;
1809 my @difftable;
1810 for (my $i = 0; $i < @table; $i++)
1812 next unless defined $table[$i];
1813 $difftable[$i] = ($table[$i] - $i) & 0xffff;
1816 my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1817 my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1818 my $offset = @array - $level1;
1819 for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1820 return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1823 ################################################################
1824 # dump case mappings for l_intl.nls
1825 sub dump_intl_nls($)
1827 my @upper_table = @toupper_table;
1828 my @lower_table = @tolower_table;
1829 remove_linguistic_mappings( \@upper_table, \@lower_table );
1831 my $upper = dump_binary_case_table( @upper_table );
1832 my $lower = dump_binary_case_table( @lower_table );
1834 my $filename = shift;
1835 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1836 printf "Building $filename\n";
1838 binmode OUTPUT;
1839 print OUTPUT pack "S<", 1; # version
1840 print OUTPUT $upper;
1841 print OUTPUT $lower;
1842 close OUTPUT;
1843 save_file($filename);
1847 ################################################################
1848 # dump the bidi direction table
1849 sub dump_bidi_dir_table($)
1851 my $filename = shift;
1852 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1853 printf "Building $filename\n";
1854 printf OUTPUT "/* Unicode BiDi direction table */\n";
1855 printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1856 printf OUTPUT "#include \"windef.h\"\n\n";
1858 my @table;
1860 for (my $i = 0; $i < 65536; $i++)
1862 $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1865 dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1867 close OUTPUT;
1868 save_file($filename);
1872 sub rol($$)
1874 my ($byte, $count) = @_;
1875 return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1878 ################################################################
1879 # compress the character properties table
1880 sub compress_char_props_table($@)
1882 my $rows = shift;
1883 my @table = @_;
1884 my $len = @table / $rows;
1885 my $pos = 0;
1886 my @array = (0) x $rows;
1887 my %sequences;
1889 # add some predefined sequences
1890 foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1892 # try to merge table rows
1893 for (my $row = 0; $row < $rows; $row++)
1895 my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1896 my $rowtxt = pack "L*", @table_row;
1897 if (defined($sequences{$rowtxt}))
1899 # reuse an existing row
1900 $array[$row] = $sequences{$rowtxt};
1902 else
1904 # create a new row
1905 $sequences{$rowtxt} = $array[$row] = ++$pos;
1906 push @array, @table_row;
1909 return @array;
1912 ################################################################
1913 # dump a normalization table in binary format
1914 sub dump_norm_table($)
1916 my $filename = shift;
1918 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
1919 my %decomp = ( "nfc" => \@decomp_table,
1920 "nfd" => \@decomp_table,
1921 "nfkc" => \@decomp_compat_table,
1922 "nfkd" => \@decomp_compat_table ,
1923 "idna" => \@idna_decomp_table );
1925 open OUTPUT,">$filename.new" or die "Cannot create $filename";
1926 print "Building $filename\n";
1928 my $type = $filename;
1929 $type =~ s!.*/norm(\w+)\.nls!$1!;
1931 my $compose = $forms{$type} & 1;
1932 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
1934 my @version = split /\./, $UNIVERSION;
1936 # combining classes
1938 my @classes;
1939 my @class_values;
1941 foreach my $c (grep defined, @combining_class_table)
1943 $classes[$c] = 1 if $c < 0x100;
1945 for (my $i = 0; $i < @classes; $i++)
1947 next unless defined $classes[$i];
1948 $classes[$i] = @class_values;
1949 push @class_values, $i;
1951 push @class_values, 0 if (@class_values % 2);
1952 die "too many classes" if @class_values >= 0x40;
1954 # character properties
1956 my @char_props;
1957 my @decomposed;
1958 my @comp_hash_table;
1959 my $comp_hash_size = $compose ? 254 : 0;
1961 for (my $i = 0; $i <= $MAX_CHAR; $i++)
1963 next unless defined $combining_class_table[$i];
1964 if (defined $decomp{$type}->[$i])
1966 my @dec = get_decomposition( $i, $decomp{$type} );
1967 if ($compose && (my @comp = get_composition( $i, $compat )))
1969 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
1970 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
1972 my $val = 0;
1973 foreach my $d (@dec)
1975 $val = $combining_class_table[$d];
1976 last if $val;
1978 $char_props[$i] = $classes[$val];
1980 else
1982 $char_props[$i] = 0xbf;
1984 @dec = compose_hangul( @dec ) if $compose;
1985 @dec = to_utf16( @dec );
1986 push @dec, 0 if @dec >= 7;
1987 $decomposed[$i] = \@dec;
1989 else
1991 if ($combining_class_table[$i] == 0x100)
1993 $char_props[$i] = 0x7f;
1995 elsif ($combining_class_table[$i])
1997 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
1999 elsif ($type eq "idna" && defined $idna_disallowed[$i])
2001 $char_props[$i] = 0xff;
2003 else
2005 $char_props[$i] = 0;
2010 if ($compose)
2012 for (my $i = 0; $i <= $MAX_CHAR; $i++)
2014 my @comp = get_composition( $i, $compat );
2015 next unless @comp;
2016 if ($combining_class_table[$comp[1]])
2018 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2019 $char_props[$comp[1]] |= 0x40;
2021 else
2023 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2024 $char_props[$comp[1]] |= 0xc0;
2029 # surrogates
2030 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2031 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2033 # Hangul
2034 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2035 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2036 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2038 # invalid chars
2039 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2040 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2041 foreach my $i (0x00..0x10)
2043 $char_props[($i << 16) | 0xfffe] = 0xff;
2044 $char_props[($i << 16) | 0xffff] = 0xff;
2047 # decomposition hash table
2049 my @decomp_hash_table;
2050 my @decomp_hash_index;
2051 my @decomp_hash_data;
2052 my $decomp_hash_size = 944;
2054 # build string of character data, reusing substrings when possible
2055 my $decomp_char_data = "";
2056 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2058 my $str = pack "U*", @{$i};
2059 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2061 for (my $i = 0; $i < @decomposed; $i++)
2063 next unless defined $decomposed[$i];
2064 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2065 die "sequence not found" if $pos == -1;
2066 my $len = @{$decomposed[$i]};
2067 $len = 7 if $len > 7;
2068 my $hash = $i % $decomp_hash_size;
2069 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2071 for (my $i = 0; $i < $decomp_hash_size; $i++)
2073 $decomp_hash_index[$i] = @decomp_hash_data / 2;
2074 next unless defined $decomp_hash_table[$i];
2075 if (@{$decomp_hash_table[$i]} == 1)
2077 my $entry = $decomp_hash_table[$i]->[0];
2078 if ($char_props[$entry->[0]] == 0xbf)
2080 $decomp_hash_index[$i] = $entry->[1];
2081 next;
2084 foreach my $entry (@{$decomp_hash_table[$i]})
2086 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2089 push @decomp_hash_data, 0, 0;
2091 # composition hash table
2093 my @comp_hash_index;
2094 my @comp_hash_data;
2095 if (@comp_hash_table)
2097 for (my $i = 0; $i < $comp_hash_size; $i++)
2099 $comp_hash_index[$i] = @comp_hash_data;
2100 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2102 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2103 push @comp_hash_data, 0, 0, 0;
2106 my $level1 = ($MAX_CHAR + 1) / 128;
2107 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2109 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2110 0, $decomp_hash_size, $comp_hash_size, 0 );
2111 my @tables = (0) x 8;
2113 $tables[0] = 16 + @header + @tables;
2114 $tables[1] = $tables[0] + @class_values / 2;
2115 $tables[2] = $tables[1] + $level1 / 2;
2116 $tables[3] = $tables[2] + (@rows - $level1) / 2;
2117 $tables[4] = $tables[3] + @decomp_hash_index;
2118 $tables[5] = $tables[4] + @decomp_hash_data;
2119 $tables[6] = $tables[5] + length $decomp_char_data;
2120 $tables[7] = $tables[6] + @comp_hash_index;
2122 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2123 print OUTPUT pack "S<*", @header;
2124 print OUTPUT pack "S<*", @tables;
2125 print OUTPUT pack "C*", @class_values;
2127 print OUTPUT pack "C*", @rows[0..$level1-1];
2128 print OUTPUT pack "C*", @rows[$level1..$#rows];
2129 print OUTPUT pack "S<*", @decomp_hash_index;
2130 print OUTPUT pack "S<*", @decomp_hash_data;
2131 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2132 print OUTPUT pack "S<*", @comp_hash_index;
2133 print OUTPUT pack "S<*", @comp_hash_data;
2135 close OUTPUT;
2136 save_file($filename);
2138 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2142 ################################################################
2143 # output a codepage definition file from the global tables
2144 sub output_codepage_file($)
2146 my $codepage = shift;
2148 my $output = sprintf "nls/c_%03d.nls", $codepage;
2149 open OUTPUT,">$output.new" or die "Cannot create $output";
2151 printf "Building %s\n", $output;
2152 if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2153 else { dump_binary_dbcs_table( $codepage ); }
2155 close OUTPUT;
2156 save_file($output);
2158 add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2161 ################################################################
2162 # output a codepage table from a Microsoft-style mapping file
2163 sub dump_msdata_codepage($)
2165 my $filename = shift;
2167 my $state = "";
2168 my ($codepage, $width, $count);
2169 my ($lb_cur, $lb_end);
2171 @cp2uni = ();
2172 @glyph2uni = ();
2173 @lead_bytes = ();
2174 @uni2cp = ();
2175 $default_char = $DEF_CHAR;
2176 $default_wchar = $DEF_CHAR;
2178 my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2180 while (<$INPUT>)
2182 next if /^;/; # skip comments
2183 next if /^\s*$/; # skip empty lines
2184 next if /\x1a/; # skip ^Z
2185 last if /^ENDCODEPAGE/;
2187 if (/^CODEPAGE\s+(\d+)/)
2189 $codepage = $1;
2190 next;
2192 if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2194 $width = $1;
2195 $default_char = hex $2;
2196 $default_wchar = hex $3;
2197 next;
2199 if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2201 $state = $1;
2202 $count = $2;
2203 next;
2205 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2207 if ($state eq "MBTABLE")
2209 my $cp = hex $1;
2210 my $uni = hex $2;
2211 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2212 next;
2214 if ($state eq "GLYPHTABLE")
2216 my $cp = hex $1;
2217 my $uni = hex $2;
2218 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2219 next;
2221 if ($state eq "WCTABLE")
2223 my $uni = hex $1;
2224 my $cp = hex $2;
2225 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2226 next;
2228 if ($state eq "DBCSRANGE")
2230 my $start = hex $1;
2231 my $end = hex $2;
2232 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2233 $lb_cur = $start;
2234 $lb_end = $end;
2235 next;
2237 if ($state eq "DBCSTABLE")
2239 my $mb = hex $1;
2240 my $uni = hex $2;
2241 my $cp = ($lb_cur << 8) | $mb;
2242 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2243 if (!--$count)
2245 if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2247 next;
2250 die "$filename: Unrecognized line $_\n";
2252 close $INPUT;
2254 output_codepage_file( $codepage );
2257 ################################################################
2258 # align a string length
2259 sub align_string($$)
2261 my ($align, $str) = @_;
2262 $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2263 return $str;
2266 ################################################################
2267 # pack a GUID string
2268 sub pack_guid($)
2270 $_ = shift;
2271 /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2272 return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2275 ################################################################
2276 # comparison function for compression sort
2277 sub cmp_compression
2279 return scalar @{$a} <=> scalar @{$b} ||
2280 $a->[4] <=> $b->[4] ||
2281 $a->[5] <=> $b->[5] ||
2282 $a->[6] <=> $b->[6] ||
2283 $a->[7] <=> $b->[7] ||
2284 $a->[8] <=> $b->[8] ||
2285 $a->[9] <=> $b->[9] ||
2286 $a->[10] <=> $b->[10] ||
2287 $a->[11] <=> $b->[11] ||
2288 $a->[12] <=> $b->[12];
2291 ################################################################
2292 # build a binary sort keys table
2293 sub dump_sortkey_table($$)
2295 my ($filename, $download) = @_;
2297 my @keys;
2298 my ($part, $section, $subsection, $guid, $version, $ling_flag);
2299 my @multiple_weights;
2300 my @expansions;
2301 my @compressions;
2302 my %exceptions;
2303 my %guids;
2304 my %compr_flags;
2305 my %locales;
2306 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2307 my $jamostr = "";
2309 my $re_hex = '0x[0-9A-Fa-f]+';
2310 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2311 $guids{$default_guid} = { };
2313 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2315 my $KEYS = open_data_file( $MSDATA, $download );
2317 printf "Building $filename\n";
2319 while (<$KEYS>)
2321 s/\s*;.*$//;
2322 next if /^\s*$/; # skip empty lines
2323 if (/^\s*(SORTKEY|SORTTABLES)/)
2325 $part = $1;
2326 next;
2328 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2330 $part = $section = "";
2331 next;
2333 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2335 $section = $1;
2336 $guid = undef;
2337 next;
2339 next unless $part;
2340 if ("$part.$section" eq "SORTKEY.DEFAULT")
2342 if (/^\s*($re_hex)\s+$re_key/)
2344 $keys[hex $1] = [ split(/\s+/,$2) ];
2345 next;
2348 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2350 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2352 $version = hex $1;
2353 next;
2355 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2357 # ignore for now
2358 next;
2361 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2362 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2363 "$part.$section" eq "SORTTABLES.INVERSECASING")
2365 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2367 $guid = lc $1;
2368 $guids{$guid} = { } unless defined $guids{$guid};
2369 $guids{$guid}->{flags} |= $flags{$section};
2370 next;
2372 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2374 $locales{$1} = $guid;
2375 next;
2378 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2380 if (/^\s*(\d+)\s+(\d+)/)
2382 push @multiple_weights, $1, $2;
2383 next;
2386 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2388 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2390 my $pos = scalar @expansions / 2;
2391 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2392 push @expansions, hex $2, hex $3;
2393 next;
2396 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2398 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2400 $keys[hex $1] = $keys[hex $2];
2401 next;
2404 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2406 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2408 if ($subsection || !$guid) # start a new one
2410 $guid = lc $1;
2411 $subsection = "";
2412 $guids{$guid} = { } unless defined $guids{$guid};
2413 $guids{$guid}->{flags} |= $flags{$2} if $2;
2414 $guids{$guid}->{compr} = @compressions;
2415 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2416 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2417 push @compressions, [ ];
2419 else # merge with current one
2421 $guids{lc $1} = { } unless defined $guids{lc $1};
2422 $guids{lc $1}->{flags} |= $flags{$2} if $2;
2423 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2424 $compr_flags{lc $1} = $compr_flags{$guid};
2426 next;
2428 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2430 $locales{$1} = $guid;
2431 next;
2433 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2435 $subsection = $1;
2436 next;
2438 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2440 my @comp = map { hex $_; } split(/\s+/,$1);
2441 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2442 # add compression flags
2443 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2444 next;
2447 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2449 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2451 $guid = lc $1;
2452 $guids{$guid} = { } unless defined $guids{lc $1};
2453 $ling_flag = ($2 ? "+" : "-");
2454 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2455 next;
2457 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2459 $locales{$1} = $guid;
2460 next;
2462 if (/^\s*($re_hex)\s+$re_key/)
2464 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2465 next;
2468 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2470 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2472 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2473 next;
2476 die "$download: $part.$section: unrecognized line $_\n";
2478 close $KEYS;
2480 # Sortkey table
2482 my $table;
2483 for (my $i = 0; $i < 0x10000; $i++)
2485 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2486 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2489 foreach my $id (sort keys %exceptions)
2491 my $pos = length($table) / 4;
2492 my @exc = @{$exceptions{$id}};
2493 my @filled;
2494 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2495 my $guid = substr( $id, 0, -1 );
2496 $guids{$guid}->{$key} = $pos;
2497 $pos += 0x100;
2498 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2499 for (my $j = 0; $j < 0x10000; $j++)
2501 next unless defined $exc[$j] || defined $flags[$j];
2502 $filled[$j >> 8] = 1;
2503 $j |= 0xff;
2505 for (my $j = 0; $j < 0x100; $j++)
2507 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2508 $pos += 0x100 if $filled[$j];
2510 for (my $j = 0; $j < 0x10000; $j++)
2512 next unless $filled[$j >> 8];
2513 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2514 $k[3] |= $flags[$j] || 0;
2515 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2519 # Case mapping tables
2521 # standard table
2522 my @casemaps;
2523 my @upper = @toupper_table;
2524 my @lower = @tolower_table;
2525 remove_linguistic_mappings( \@upper, \@lower );
2526 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2528 # linguistic table
2529 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2531 # Turkish table
2532 @upper = @toupper_table;
2533 @lower = @tolower_table;
2534 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2535 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2536 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2537 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2539 # Char type table
2541 my @table;
2542 my $types = "";
2543 my %typestr;
2544 for (my $i = 0; $i < 0x10000; $i++)
2546 my $str = pack "S<3",
2547 ($category_table[$i] || 0) & 0xffff,
2548 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2549 ($category_table[$i] || 0) >> 16;
2551 if (!defined($typestr{$str}))
2553 $typestr{$str} = length($types) / 6;
2554 $types .= $str;
2556 $table[$i] = $typestr{$str};
2559 my @rows = compress_array( 4096, 0, @table[0..65535] );
2560 my @array = compress_array( 256, 0, @rows[0..4095] );
2561 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
2562 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2564 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2565 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2566 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2568 # Sort tables
2570 # guids
2571 my $sorttables = pack "L<2", $version, scalar %guids;
2572 foreach my $id (sort keys %guids)
2574 my %guid = %{$guids{$id}};
2575 my $flags = $guid{flags} || 0;
2576 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2577 $sorttables .= pack_guid($id) . pack "L<5",
2578 $flags,
2579 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2580 $guid{except} || 0,
2581 $guid{ling_except} || 0,
2582 $map / 2;
2585 # expansions
2586 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2588 # compressions
2589 $sorttables .= pack "L<", scalar @compressions;
2590 my $rowstr = "";
2591 foreach my $c (@compressions)
2593 my $pos = length($rowstr) / 2;
2594 my $min = 0xffff;
2595 my $max = 0;
2596 my @lengths = (0) x 8;
2597 foreach my $r (sort cmp_compression @{$c})
2599 my @row = @{$r};
2600 $lengths[scalar @row - 6]++;
2601 foreach my $val (@row[4..$#row])
2603 $min = $val if $min > $val;
2604 $max = $val if $max < $val;
2606 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2607 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2609 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2611 $sorttables .= $rowstr;
2613 # multiple weights
2614 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2616 # jamo sort
2617 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2619 # Locales
2621 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2622 foreach my $loc (sort keys %locales)
2624 # skip specific locales that match more general ones
2625 my @parts = split /[-_]/, $loc;
2626 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2627 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2628 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2631 # File header
2633 my @header;
2634 $header[0] = 16;
2635 $header[1] = $header[0] + length $table;
2636 $header[2] = $header[1] + length $casemaps;
2637 $header[3] = $header[2] + length $chartypes;
2639 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2640 print OUTPUT pack "L<*", @header;
2641 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2642 close OUTPUT;
2643 save_file($filename);
2647 ################################################################
2648 # build the script to create registry keys
2649 sub dump_registry_script($%)
2651 my ($filename, %keys) = @_;
2652 my $indent = 1;
2654 printf "Building %s\n", $filename;
2655 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2656 print OUTPUT "HKLM\n{\n";
2657 foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2659 printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2660 $indent++;
2662 foreach my $k (sort keys %keys)
2664 my @subkeys = split /\\/, $k;
2665 my ($def, @vals) = @{$keys{$k}};
2666 for (my $i = 0; $i < @subkeys; $i++)
2668 printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2669 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2670 $indent++;
2672 foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2673 for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2675 while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2676 close OUTPUT;
2677 save_file($filename);
2681 ################################################################
2682 # save a file if modified
2683 sub save_file($)
2685 my $file = shift;
2686 if (-f $file && !system "cmp $file $file.new >/dev/null")
2688 unlink "$file.new";
2690 else
2692 rename "$file.new", "$file";
2697 ################################################################
2698 # main routine
2700 chdir ".." if -f "./make_unicode";
2701 load_data();
2702 dump_sortkeys( "dlls/kernelbase/collation.c" );
2703 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2704 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2705 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2706 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2707 dump_mirroring( "dlls/dwrite/mirror.c" );
2708 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2709 dump_bracket( "dlls/dwrite/bracket.c" );
2710 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2711 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2712 dump_linebreak( "dlls/dwrite/linebreak.c" );
2713 dump_scripts( "dlls/dwrite/scripts" );
2714 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2715 dump_vertical( "dlls/gdi32/vertical.c" );
2716 dump_vertical( "dlls/wineps.drv/vertical.c" );
2717 dump_intl_nls("nls/l_intl.nls");
2718 dump_norm_table( "nls/normnfc.nls" );
2719 dump_norm_table( "nls/normnfd.nls" );
2720 dump_norm_table( "nls/normnfkc.nls" );
2721 dump_norm_table( "nls/normnfkd.nls" );
2722 dump_norm_table( "nls/normidna.nls" );
2723 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2724 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2725 dump_eucjp_codepage();
2726 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2728 exit 0;
2730 # Local Variables:
2731 # compile-command: "./make_unicode"
2732 # End: