make_unicode: Generate locale.nls using the Unicode CLDR data.
[wine.git] / tools / make_unicode
bloba37af197b31a0dce5f1dc0d9374f0558969823e6
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
23 use XML::LibXML;
24 use Encode;
# Base URLs and release versions of the external data sets.
# Every source is addressed over HTTPS; $REPORTS previously used plain
# http://, which was inconsistent with the rest and gave no transport
# integrity for the file fetched from it.
my $UNIVERSION = "14.0.0";
my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
my $REPORTS = "https://www.unicode.org/reports";
my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";

# CLDR archives.  The unicode.org location of release 40 is kept below for
# reference; the active URL is the GitHub release tag for $CLDRVERSION.
#my $CLDRDATA = "https://www.unicode.org/Public/cldr/40/cldr-common-40.0.zip";
my $CLDRVERSION = "40";
my $CLDRDATA = "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip";
my $CLDR33DATA = "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip";

# ISO 639-3 code tables; the version string is the date embedded in the
# archive name.
my $ISO639VERSION = "20220120";
my $ISO639 = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip";

# Sort keys file (presumably resolved relative to $REPORTS -- confirm at the
# point of use)
my $SORTKEYS = "tr10/allkeys.txt";

# Default char for undefined mappings
my $DEF_CHAR = ord '?';

# Last valid Unicode character
my $MAX_CHAR = 0x10ffff;
# Paths of the per-code-page .txt input files.  Each number below expands to
# "CodpageFiles/<number>.txt"; the numbers stay strings so that the leading
# zero of "037" is preserved, and the order is that of the original list.
my @allfiles = map { "CodpageFiles/$_.txt" } qw(
    037 437 500 708 720 737 775
    850 852 855 857 860 861 862 863 864 865 866 869 874 875
    932 936 949 950
    1026 1250 1251 1252 1253 1254 1255 1256 1257 1258 1361
    10000 10001 10002 10003 10004 10005 10006 10007 10008
    10010 10017 10021 10029 10079 10081 10082
    20127 20866 21866
    28591 28592 28593 28594 28595 28596 28597 28598 28599 28603 28605
);
# Character-class flag bits, keyed by class name.  Each class owns a single
# bit, written here as a shift so the bit position is visible.
my %ctype =
(
    # CT_CTYPE1 classes: bits 0-9
    'upper'  => 1 << 0,
    'lower'  => 1 << 1,
    'digit'  => 1 << 2,
    'space'  => 1 << 3,
    'punct'  => 1 << 4,
    'cntrl'  => 1 << 5,
    'blank'  => 1 << 6,
    'xdigit' => 1 << 7,
    # "alpha" additionally sets bit 31 (0x80000000) on top of its own bit 8
    'alpha'  => (1 << 8) | (1 << 31),
    'defin'  => 1 << 9,

    # CT_CTYPE3 classes live in the high 16 bits: bits 16-28
    'nonspacing'    => 1 << 16,
    'diacritic'     => 1 << 17,
    'vowelmark'     => 1 << 18,
    'symbol'        => 1 << 19,
    'katakana'      => 1 << 20,
    'hiragana'      => 1 << 21,
    'halfwidth'     => 1 << 22,
    'fullwidth'     => 1 << 23,
    'ideograph'     => 1 << 24,
    'kashida'       => 1 << 25,
    'lexical'       => 1 << 26,
    'highsurrogate' => 1 << 27,
    'lowsurrogate'  => 1 << 28,
);
# Numeric codes for the bracket kinds "o" and "c" (presumably the open/close
# values of a paired-bracket property -- confirm against the parsing code).
my %bracket_types = ( 'o' => 0, 'c' => 1 );
# Indic category names mapped to consecutive codes.  The position in the
# list below IS the code, starting at 0x0000 for "Other" and ending at 0x0023,
# so new names must only ever be appended.
my %indic_types = do
{
    my $code = 0;
    map { ( $_ => $code++ ) } qw(
        Other
        Bindu
        Visarga
        Avagraha
        Nukta
        Virama
        Vowel_Independent
        Vowel_Dependent
        Vowel
        Consonant_Placeholder
        Consonant
        Consonant_Dead
        Consonant_Succeeding_Repha
        Consonant_Subjoined
        Consonant_Medial
        Consonant_Final
        Consonant_Head_Letter
        Modifying_Letter
        Tone_Letter
        Tone_Mark
        Register_Shifter
        Consonant_Preceding_Repha
        Pure_Killer
        Invisible_Stacker
        Gemination_Mark
        Cantillation_Mark
        Non_Joiner
        Joiner
        Number_Joiner
        Number
        Brahmi_Joining_Number
        Consonant_With_Stacker
        Consonant_Prefixed
        Syllable_Modifier
        Consonant_Killer
        Consonant_Initial_Postfixed
    );
};
# Matra position names mapped to consecutive codes.  Numbering starts at
# 0x01 for "Right" and follows list order up to 0x10.
my %matra_types = do
{
    my $code = 1;
    map { ( $_ => $code++ ) } qw(
        Right
        Left
        Visual_Order_Left
        Left_And_Right
        Top
        Bottom
        Top_And_Bottom
        Top_And_Right
        Top_And_Left
        Top_And_Left_And_Right
        Bottom_And_Right
        Top_And_Bottom_And_Right
        Overstruck
        Invisible
        Bottom_And_Left
        Top_And_Bottom_And_Left
    );
};
# Break class abbreviations mapped to consecutive codes.  Numbering starts
# at 0x0001 for "BK" and follows list order up to 0x002b for "ZWJ".
my %break_types = do
{
    my $code = 1;
    map { ( $_ => $code++ ) } qw(
        BK CR LF CM SG GL CB SP
        ZW NL WJ JL JV JT H2 H3
        XX OP CL CP QU NS EX SY
        IS PR PO NU AL ID IN HY
        BB BA SA AI B2 HL CJ RI
        EB EM ZWJ
    );
};
# Vertical orientation abbreviations mapped to codes 0-3, in list order.
my %vertical_types;
@vertical_types{ qw(R U Tr Tu) } = (0 .. 3);
# Flag mask for each two-letter Unicode general category, built from the
# bits in %ctype.  Every category carries "defin"; the helper ORs in whatever
# additional classes are named for it.
my %categories = do
{
    my $mask = sub
    {
        my $bits = $ctype{'defin'};
        $bits |= $ctype{$_} foreach @_;
        return $bits;
    };

    (
        # Letters
        'Lu' => $mask->('alpha', 'upper'),           # Letter, Uppercase
        'Ll' => $mask->('alpha', 'lower'),           # Letter, Lowercase
        'Lt' => $mask->('alpha', 'upper', 'lower'),  # Letter, Titlecase
        # Marks
        'Mn' => $mask->('nonspacing'),               # Mark, Non-Spacing
        'Mc' => $mask->(),                           # Mark, Spacing Combining
        'Me' => $mask->(),                           # Mark, Enclosing
        # Numbers
        'Nd' => $mask->('digit'),                    # Number, Decimal Digit
        'Nl' => $mask->('alpha'),                    # Number, Letter
        'No' => $mask->(),                           # Number, Other
        # Separators
        'Zs' => $mask->('space'),                    # Separator, Space
        'Zl' => $mask->('space'),                    # Separator, Line
        'Zp' => $mask->('space'),                    # Separator, Paragraph
        # Others
        'Cc' => $mask->('cntrl'),                    # Other, Control
        'Cf' => $mask->('cntrl'),                    # Other, Format
        'Cs' => $mask->(),                           # Other, Surrogate
        'Co' => $mask->(),                           # Other, Private Use
        'Cn' => $mask->(),                           # Other, Not Assigned
        # Remaining letters
        'Lm' => $mask->('alpha'),                    # Letter, Modifier
        'Lo' => $mask->('alpha'),                    # Letter, Other
        # Punctuation
        'Pc' => $mask->('punct'),                    # Punctuation, Connector
        'Pd' => $mask->('punct'),                    # Punctuation, Dash
        'Ps' => $mask->('punct'),                    # Punctuation, Open
        'Pe' => $mask->('punct'),                    # Punctuation, Close
        'Pi' => $mask->('punct'),                    # Punctuation, Initial quote
        'Pf' => $mask->('punct'),                    # Punctuation, Final quote
        'Po' => $mask->('punct'),                    # Punctuation, Other
        # Symbols
        'Sm' => $mask->('symbol'),                   # Symbol, Math
        'Sc' => $mask->('symbol'),                   # Symbol, Currency
        'Sk' => $mask->('symbol'),                   # Symbol, Modifier
        'So' => $mask->('symbol'),                   # Symbol, Other
    );
};
# A few characters need additional categories that cannot be determined
# automatically.  Each key is a class name and each value lists the code
# points that receive that class in addition to whatever is derived for them.
my %special_categories =
(
    # ASCII 0-9, A-F, a-f followed by the same three runs at 0xff10/0xff21/0xff41
    'xdigit' => [
        0x30..0x39, 0x41..0x46, 0x61..0x66,
        0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46,
    ],

    'space' => [ 0x09..0x0d, 0x85 ],

    'blank' => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],

    'cntrl' => [
        0x070f,
        0x200c..0x200f,
        0x202a..0x202e,
        0x206a..0x206f,
        0xfeff,
        0xfff9..0xfffb,
    ],

    'punct' => [
        0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e,
        0xa2..0xbe,
        0xd7, 0xf7,
    ],

    'digit' => [ 0xb2, 0xb3, 0xb9 ],

    'lower' => [ 0xaa, 0xba, 0x2071, 0x207f ],

    'nonspacing' => [
        0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd,
        0xe0..0xe5, 0xe7..0xef, 0xf1..0xf6, 0xf8..0xfd, 0xff,
        0x6de,
        0x1929..0x192b,
        0x302e..0x302f,
    ],

    'diacritic' => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],

    'symbol' => [
        0x09..0x0d,
        0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
        0x5b..0x60, 0x7b..0x7e,
        0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
        0x02b9..0x02ba, 0x02c6..0x02cf,
    ],

    'halfwidth' => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],

    # NOTE(review): 0x30ad appears twice below, exactly as in the original
    # list; it is kept so the array contents stay identical.
    'fullwidth' => [
        0x2018..0x2019, 0x201c..0x201d,
        0x3000..0x3002, 0x300c..0x300d,
        0x309b..0x309c,
        0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af,
        0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, 0x30bf,
        0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
        0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef,
        0x30f2..0x30f3, 0x30fb,
        0x3131..0x3164,
    ],

    'ideograph' => [ 0x3006..0x3007 ],

    'lexical' => [
        0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
        0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
        0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf,
        0x02d8..0x02dd, 0x02e0..0x02e3,
        0x037a, 0x0384..0x0385, 0x0387,
        0x0559..0x055a,
        0x0640,
        0x1fbd..0x1fc1, 0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef,
        0x1ffd..0x1ffe,
        0x2010..0x2015, 0x2032..0x2034, 0x2038, 0x2043..0x2044,
        0x207b..0x207c, 0x207f, 0x208b..0x208c,
        0x2212, 0x2215..0x2216,
        0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
        0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe,
        0xfe31..0xfe32, 0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b,
        0xff04, 0xff07, 0xff0d, 0xff0f, 0xff1d, 0xff20, 0xff3c, 0xff3e,
        0xff40, 0xff5e,
    ],

    'kashida' => [ 0x0640 ],
);
# Bidirectional class abbreviations mapped to direction codes, listed in
# ascending code order.  All explicit embedding, override and isolate
# controls share the single code 15.
my %directions =
(
    'L'   => 1,    # Left-to-Right
    'R'   => 2,    # Right-to-Left
    'EN'  => 3,    # European Number
    'ES'  => 4,    # European Number Separator
    'ET'  => 5,    # European Number Terminator
    'AN'  => 6,    # Arabic Number
    'CS'  => 7,    # Common Number Separator
    'B'   => 8,    # Paragraph Separator
    'S'   => 9,    # Segment Separator
    'WS'  => 10,   # Whitespace
    'ON'  => 11,   # Other Neutrals
    'AL'  => 12,   # Right-to-Left Arabic
    'NSM' => 13,   # Non-Spacing Mark
    'BN'  => 14,   # Boundary Neutral
    # LRE/LRO/RLE/RLO: embeddings and overrides; PDF: pop directional format;
    # LRI/RLI/FSI/PDI: isolates and pop directional isolate
    ( map { ( $_ => 15 ) } qw(LRE LRO RLE RLO PDF LRI RLI FSI PDI) ),
);
# Bidirectional class abbreviations mapped to C2_* type codes, listed in
# ascending code order.
my %c2_types =
(
    'BN'  => 0,    # C2_NOTAPPLICABLE
    'L'   => 1,    # C2_LEFTTORIGHT
    'R'   => 2,    # C2_RIGHTTOLEFT
    'AL'  => 2,    # C2_RIGHTTOLEFT
    'EN'  => 3,    # C2_EUROPENUMBER
    'ES'  => 4,    # C2_EUROPESEPARATOR
    'ET'  => 5,    # C2_EUROPETERMINATOR
    'AN'  => 6,    # C2_ARABICNUMBER
    'CS'  => 7,    # C2_COMMONSEPARATOR
    'B'   => 8,    # C2_BLOCKSEPARATOR
    'S'   => 9,    # C2_SEGMENTSEPARATOR
    'WS'  => 10,   # C2_WHITESPACE
    # Everything else is C2_OTHERNEUTRAL
    ( map { ( $_ => 11 ) } qw(NSM ON LRE LRO RLE RLO PDF LRI RLI FSI PDI) ),
);
# Bidirectional class abbreviations mapped to consecutive codes.  The
# position in the list is the code: ON = 0 (Other Neutrals) through PDI = 22
# (Pop Directional Isolate).
my %bidi_types = do
{
    my $code = 0;
    map { ( $_ => $code++ ) } qw(
        ON L R AN EN AL NSM
        CS ES ET BN S WS B
        RLO RLE LRO LRE PDF
        LRI RLI FSI PDI
    );
};
# Joining type names mapped to codes.  "D" (Dual_Joining) and "C"
# (Join_Causing) deliberately share code 3; the two Syriac groups have
# dedicated codes.
my %joining_types =
(
    'U'           => 0,   # Non_Joining
    'L'           => 1,   # Left_Joining
    'R'           => 2,   # Right_Joining
    'D'           => 3,   # Dual_Joining
    'C'           => 3,   # Join_Causing
    'ALAPH'       => 4,   # Syriac ALAPH
    'DALATH RISH' => 5,   # Syriac DALATH RISH group
    'T'           => 6,   # Transparent
);
442 my @locales =
444 { name => "", lcid => 0x0000007f, file => "root", territory => "IV", sabbrevlangname => "IVL", sopentypelang =>"dflt" },
445 { name => "aa", dir => "seed", sopentypelang => "AFR" },
446 { name => "aa-DJ", dir => "seed" },
447 { name => "aa-ER", dir => "seed" },
448 { name => "aa-ET", dir => "seed" },
449 { name => "af", lcid => 0x00000036, oemcp => 850, sabbrevlangname => "AFK", sopentypelang => "AFK" },
450 { name => "af-NA" },
451 { name => "af-ZA", lcid => 0x00000436 },
452 { name => "agq" },
453 { name => "agq-CM" },
454 { name => "ak", sopentypelang => "TWI" },
455 { name => "ak-GH" },
456 { name => "am", lcid => 0x0000005e, sabbrevlangname => "AMH" },
457 { name => "am-ET", lcid => 0x0000045e },
458 { name => "ar", lcid => 0x00000001, territory => "SA", oemcp => 720, group => 13 },
459 { name => "ar-001" },
460 { name => "ar-AE", lcid => 0x00003801, sabbrevlangname => "ARU" },
461 { name => "ar-BH", lcid => 0x00003c01, sabbrevlangname => "ARH" },
462 { name => "ar-DJ" },
463 { name => "ar-DZ", lcid => 0x00001401, sabbrevlangname => "ARG" },
464 { name => "ar-EG", lcid => 0x00000c01, sabbrevlangname => "ARE" },
465 { name => "ar-EH" },
466 { name => "ar-ER" },
467 { name => "ar-IL" },
468 { name => "ar-IQ", lcid => 0x00000801, sabbrevlangname => "ARI" },
469 { name => "ar-JO", lcid => 0x00002c01, sabbrevlangname => "ARJ" },
470 { name => "ar-KM" },
471 { name => "ar-KW", lcid => 0x00003401, sabbrevlangname => "ARK" },
472 { name => "ar-LB", lcid => 0x00003001, sabbrevlangname => "ARB" },
473 { name => "ar-LY", lcid => 0x00001001, sabbrevlangname => "ARL" },
474 { name => "ar-MA", lcid => 0x00001801, sabbrevlangname => "ARM" },
475 { name => "ar-MR" },
476 { name => "ar-OM", lcid => 0x00002001, sabbrevlangname => "ARO" },
477 { name => "ar-PS" },
478 { name => "ar-QA", lcid => 0x00004001, sabbrevlangname => "ARQ" },
479 { name => "ar-SA", lcid => 0x00000401, sabbrevlangname => "ARA" },
480 { name => "ar-SD" },
481 { name => "ar-SO" },
482 { name => "ar-SS" },
483 { name => "ar-SY", lcid => 0x00002801, sabbrevlangname => "ARS" },
484 { name => "ar-TD" },
485 { name => "ar-TN", lcid => 0x00001c01, sabbrevlangname => "ART" },
486 { name => "ar-YE", lcid => 0x00002401, sabbrevlangname => "ARY" },
487 { name => "arn", lcid => 0x0000007a, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sabbrevlangname => "MPD", sopentypelang => "MAP" },
488 { name => "arn-CL", lcid => 0x0000047a, dir => "seed" },
489 { name => "arn-Latn", alias => "arn" },
490 { name => "arn-Latn-CL", alias => "arn-CL" },
491 { name => "as", lcid => 0x0000004d, slist => ",", group => 15 },
492 { name => "as-IN", lcid => 0x0000044d },
493 { name => "asa" },
494 { name => "asa-TZ" },
495 { name => "ast" },
496 { name => "ast-ES" },
497 { name => "az", lcid => 0x0000002c, oemcp => 857, ebcdiccp => 20905, group => 2 },
498 { name => "az-Cyrl", lcid => 0x0000742c, oemcp => 866, ebcdiccp => 20880, group => 5, sabbrevlangname => "AZC" },
499 { name => "az-Cyrl-AZ", lcid => 0x0000082c },
500 { name => "az-Latn", lcid => 0x0000782c },
501 { name => "az-Latn-AZ", lcid => 0x0000042c },
502 { name => "ba", lcid => 0x0000006d, oemcp => 866, group => 5, dir => "seed", sabbrevlangname => "BAS", sopentypelang => "BSH" },
503 { name => "ba-Cyrl", alias => "ba" },
504 { name => "ba-Cyrl-RU", alias => "ba-RU" },
505 { name => "ba-RU", lcid => 0x0000046d, dir => "seed" },
506 { name => "bas" },
507 { name => "bas-CM" },
508 { name => "be", lcid => 0x00000023, oemcp => 866, ebcdiccp => 500, group => 5 },
509 { name => "be-BY", lcid => 0x00000423 },
510 { name => "bem" },
511 { name => "bem-ZM" },
512 { name => "bez" },
513 { name => "bez-TZ" },
514 { name => "bg", lcid => 0x00000002, oemcp => 866, ebcdiccp => 21025, group => 5, sabbrevlangname => "BGR", sopentypelang => "BGR" },
515 { name => "bg-BG", lcid => 0x00000402 },
516 { name => "bin", lcid => 0x00000066, oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "EDO" },
517 { name => "bin-NG", lcid => 0x00000466, file => "bin", dir => "exemplars" },
518 { name => "bm", sopentypelang => "BMB" },
519 { name => "bm-Latn", file => "bm" },
520 { name => "bm-Latn-ML", file => "bm_ML" },
521 { name => "bm-ML", alias => "bm-Latn-ML" },
522 { name => "bn", lcid => 0x00000045, slist => ",", group => 15, sabbrevlangname => "BNB" },
523 { name => "bn-BD", lcid => 0x00000845 },
524 { name => "bn-IN", lcid => 0x00000445, sabbrevlangname => "BNG" },
525 { name => "bo", lcid => 0x00000051, slist => ",", group => 15, sabbrevlangname => "BOB", sopentypelang => "TIB" },
526 { name => "bo-CN", lcid => 0x00000451 },
527 { name => "bo-IN", slist => "," },
528 { name => "bo-Tibt", alias => "bo" },
529 { name => "bo-Tibt-CN", alias => "bo-CN" },
530 { name => "bo-Tibt-IN", alias => "bo-IN" },
531 { name => "br", lcid => 0x0000007e, oemcp => 850, ebcdiccp => 20297 },
532 { name => "br-FR", lcid => 0x0000047e },
533 { name => "br-Latn", alias => "br" },
534 { name => "br-Latn-FR", alias => "br-FR" },
535 { name => "brx" },
536 { name => "brx-IN" },
537 { name => "bs", lcid => 0x0000781a, oemcp => 852, maccp => 10082, ebcdiccp => 870, group => 2, sabbrevlangname => "BSB" },
538 { name => "bs-Cyrl", lcid => 0x0000641a, oemcp => 855, group => 5, sabbrevlangname => "BSC" },
539 { name => "bs-Cyrl-BA", lcid => 0x0000201a },
540 { name => "bs-Latn", lcid => 0x0000681a },
541 { name => "bs-Latn-BA", lcid => 0x0000141a },
542 { name => "byn", dir => "seed", sopentypelang => "BIL" },
543 { name => "byn-ER", dir => "seed" },
544 { name => "ca", lcid => 0x00000003, oemcp => 850 },
545 { name => "ca-AD", maccp => 65001 },
546 { name => "ca-ES", lcid => 0x00000403 },
547 { name => "ca-ES-valencia", lcid => 0x00000803, file => "ca_ES_VALENCIA", sabbrevlangname => "VAL" },
548 { name => "ca-FR", maccp => 65001 },
549 { name => "ca-IT", maccp => 65001 },
550 { name => "ccp" },
551 { name => "ccp-BD", alias => "ccp-Cakm-BD" },
552 { name => "ccp-Cakm", file => "ccp" },
553 { name => "ccp-Cakm-BD", file => "ccp_BD" },
554 { name => "ccp-Cakm-IN", file => "ccp_IN" },
555 { name => "ccp-IN", alias => "ccp-Cakm-IN" },
556 { name => "ce" },
557 { name => "ce-RU" },
558 { name => "ceb" },
559 { name => "ceb-Latn", file => "ceb" },
560 { name => "ceb-Latn-PH", file => "ceb_PH" },
561 { name => "ceb-PH", alias => "ceb-Latn-PH" },
562 { name => "cgg" },
563 { name => "cgg-UG" },
564 { name => "chr", lcid => 0x0000005c, slist => ",", sabbrevlangname => "CRE" },
565 { name => "chr-Cher", lcid => 0x00007c5c, file => "chr" },
566 { name => "chr-Cher-US", lcid => 0x0000045c, file => "chr_US" },
567 { name => "chr-US", alias => "chr-Cher-US" },
568 { name => "ckb", alias => "ku" },
569 { name => "ckb-IQ", alias => "ku-Arab-IQ" },
570 { name => "ckb-IR", alias => "ku-Arab-IR" },
571 { name => "co", lcid => 0x00000083, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
572 { name => "co-FR", lcid => 0x00000483, dir => "seed" },
573 { name => "co-Latn", alias => "co" },
574 { name => "co-Latn-FR", alias => "co-FR" },
575 { name => "cs", lcid => 0x00000005, oemcp => 852, group => 2, sabbrevlangname => "CSY", sopentypelang => "CSY" },
576 { name => "cs-CZ", lcid => 0x00000405 },
577 { name => "cu", dir => "seed", sopentypelang => "CSL" },
578 { name => "cu-RU", dir => "seed" },
579 { name => "cy", lcid => 0x00000052, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "CYM", sopentypelang => "WEL" },
580 { name => "cy-GB", lcid => 0x00000452 },
581 { name => "da", lcid => 0x00000006, oemcp => 850, ebcdiccp => 20277 },
582 { name => "da-DK", lcid => 0x00000406 },
583 { name => "da-GL", maccp => 65001 },
584 { name => "dav" },
585 { name => "dav-KE" },
586 { name => "de", lcid => 0x00000007, oemcp => 850, ebcdiccp => 20273 },
587 { name => "de-AT", lcid => 0x00000c07, sabbrevlangname => "DEA" },
588 { name => "de-BE" },
589 { name => "de-CH", lcid => 0x00000807, sabbrevlangname => "DES" },
590 { name => "de-DE", lcid => 0x00000407 },
591 { name => "de-DE_phoneb", lcid => 0x00010407, alias => "de-DE" },
592 { name => "de-DE-u-co-phonebk", alias => "de-DE_phoneb" },
593 { name => "de-IT", oemcp => 65001 },
594 { name => "de-LI", lcid => 0x00001407, sabbrevlangname => "DEC" },
595 { name => "de-LU", lcid => 0x00001007, sabbrevlangname => "DEL" },
596 { name => "dje", sopentypelang => "DJR" },
597 { name => "dje-NE" },
598 { name => "doi" },
599 { name => "doi-IN" },
600 { name => "dsb", lcid => 0x00007c2e, sparent => "hsb", oemcp => 850, ebcdiccp => 870, sabbrevlangname => "DSB", sopentypelang => "LSB" },
601 { name => "dsb-DE", lcid => 0x0000082e },
602 { name => "dua" },
603 { name => "dua-CM" },
604 { name => "dv", lcid => 0x00000065, slist => "\x{060c}", group => 13, dir => "seed" },
605 { name => "dv-MV", lcid => 0x00000465, dir => "seed" },
606 { name => "dyo" },
607 { name => "dyo-SN" },
608 { name => "dz", sopentypelang => "DZN" },
609 { name => "dz-BT", lcid => 0x00000c51, sabbrevlangname => "ZZZ" },
610 { name => "ebu" },
611 { name => "ebu-KE" },
612 { name => "ee" },
613 { name => "ee-GH" },
614 { name => "ee-TG" },
615 { name => "el", lcid => 0x00000008, oemcp => 737, group => 4 },
616 { name => "el-CY" },
617 { name => "el-GR", lcid => 0x00000408 },
618 { name => "en", lcid => 0x00000009, oemcp => 437, slist => ",", sabbrevlangname => "ENU" },
619 { name => "en-001", oemcp => 850 },
620 { name => "en-029", lcid => 0x00002409, file => "en", oemcp => 850, sintlsymbol => "XCD", sabbrevlangname => "ENB" },
621 { name => "en-150", oemcp => 65001 },
622 { name => "en-AE", lcid => 0x00004c09, oemcp => 65001, sabbrevlangname => "ZZZ" },
623 { name => "en-AG", oemcp => 850 },
624 { name => "en-AI", oemcp => 850 },
625 { name => "en-AS", oemcp => 850 },
626 { name => "en-AT", oemcp => 65001 },
627 { name => "en-AU", lcid => 0x00000c09, oemcp => 850, sabbrevlangname => "ENA" },
628 { name => "en-BB", oemcp => 850 },
629 { name => "en-BE", oemcp => 850 },
630 { name => "en-BI", oemcp => 65001 },
631 { name => "en-BM", oemcp => 850 },
632 { name => "en-BS", oemcp => 850 },
633 { name => "en-BW", oemcp => 850 },
634 { name => "en-BZ", lcid => 0x00002809, oemcp => 850, sabbrevlangname => "ENL" },
635 { name => "en-CA", lcid => 0x00001009, oemcp => 850, ebcdiccp => 37, sabbrevlangname => "ENC" },
636 { name => "en-CC", oemcp => 850 },
637 { name => "en-CH", oemcp => 65001 },
638 { name => "en-CK", oemcp => 850 },
639 { name => "en-CM", oemcp => 850 },
640 { name => "en-CX", oemcp => 850 },
641 { name => "en-CY", oemcp => 65001 },
642 { name => "en-DE", oemcp => 65001 },
643 { name => "en-DG", oemcp => 850 },
644 { name => "en-DK", oemcp => 65001 },
645 { name => "en-DM", oemcp => 850 },
646 { name => "en-ER", oemcp => 850 },
647 { name => "en-FI", oemcp => 65001 },
648 { name => "en-FJ", oemcp => 850 },
649 { name => "en-FK", oemcp => 850 },
650 { name => "en-FM", oemcp => 850 },
651 { name => "en-GB", lcid => 0x00000809, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "ENG" },
652 { name => "en-GD", oemcp => 850 },
653 { name => "en-GG", oemcp => 850 },
654 { name => "en-GH", oemcp => 850 },
655 { name => "en-GI", oemcp => 850 },
656 { name => "en-GM", oemcp => 850 },
657 { name => "en-GU", oemcp => 850 },
658 { name => "en-GY", oemcp => 850 },
659 { name => "en-HK", lcid => 0x00003c09, oemcp => 850, sabbrevlangname => "ENH" },
660 { name => "en-ID", lcid => 0x00003809, file => "en", oemcp => 850, sabbrevlangname => "ZZZ" },
661 { name => "en-IE", lcid => 0x00001809, oemcp => 850, sabbrevlangname => "ENI" },
662 { name => "en-IL", oemcp => 65001 },
663 { name => "en-IM", oemcp => 850 },
664 { name => "en-IN", lcid => 0x00004009, sabbrevlangname => "ENN" },
665 { name => "en-IO", oemcp => 850 },
666 { name => "en-JE", oemcp => 850 },
667 { name => "en-JM", lcid => 0x00002009, oemcp => 850, sabbrevlangname => "ENJ" },
668 { name => "en-KE", oemcp => 850 },
669 { name => "en-KI", oemcp => 850 },
670 { name => "en-KN", oemcp => 850 },
671 { name => "en-KY", oemcp => 850 },
672 { name => "en-LC", oemcp => 850 },
673 { name => "en-LR", oemcp => 850 },
674 { name => "en-LS", oemcp => 850 },
675 { name => "en-MG", oemcp => 850 },
676 { name => "en-MH", oemcp => 850 },
677 { name => "en-MO", oemcp => 850 },
678 { name => "en-MP", oemcp => 850 },
679 { name => "en-MS", oemcp => 850 },
680 { name => "en-MT", oemcp => 850 },
681 { name => "en-MU", oemcp => 850 },
682 { name => "en-MW", oemcp => 850 },
683 { name => "en-MY", lcid => 0x00004409, sabbrevlangname => "ENM" },
684 { name => "en-NA", oemcp => 850 },
685 { name => "en-NF", oemcp => 850 },
686 { name => "en-NG", oemcp => 850 },
687 { name => "en-NL", oemcp => 65001 },
688 { name => "en-NR", oemcp => 850 },
689 { name => "en-NU", oemcp => 850 },
690 { name => "en-NZ", lcid => 0x00001409, oemcp => 850, sabbrevlangname => "ENZ" },
691 { name => "en-PG", oemcp => 850 },
692 { name => "en-PH", lcid => 0x00003409, ebcdiccp => 500, sabbrevlangname => "ENP" },
693 { name => "en-PK", oemcp => 850 },
694 { name => "en-PN", oemcp => 850 },
695 { name => "en-PR", oemcp => 850 },
696 { name => "en-PW", oemcp => 850 },
697 { name => "en-RW", oemcp => 850 },
698 { name => "en-SB", oemcp => 850 },
699 { name => "en-SC", oemcp => 850 },
700 { name => "en-SD", oemcp => 850 },
701 { name => "en-SE", oemcp => 65001 },
702 { name => "en-SG", lcid => 0x00004809, sabbrevlangname => "ENE" },
703 { name => "en-SH", oemcp => 850 },
704 { name => "en-SI", oemcp => 65001 },
705 { name => "en-SL", oemcp => 850 },
706 { name => "en-SS", oemcp => 850 },
707 { name => "en-SX", oemcp => 850 },
708 { name => "en-SZ", oemcp => 850 },
709 { name => "en-TC", oemcp => 850 },
710 { name => "en-TK", oemcp => 850 },
711 { name => "en-TO", oemcp => 850 },
712 { name => "en-TT", lcid => 0x00002c09, oemcp => 850, sabbrevlangname => "ENT" },
713 { name => "en-TV", oemcp => 850 },
714 { name => "en-TZ", oemcp => 850 },
715 { name => "en-UG", oemcp => 850 },
716 { name => "en-UM", oemcp => 850 },
717 { name => "en-US", lcid => 0x00000409 },
718 { name => "en-VC", oemcp => 850 },
719 { name => "en-VG", oemcp => 850 },
720 { name => "en-VI", oemcp => 850 },
721 { name => "en-VU", oemcp => 850 },
722 { name => "en-WS", oemcp => 850 },
723 { name => "en-ZA", lcid => 0x00001c09, ebcdiccp => 500, sabbrevlangname => "ENS" },
724 { name => "en-ZM", oemcp => 850 },
725 { name => "en-ZW", lcid => 0x00003009, ebcdiccp => 500, sabbrevlangname => "ENW" },
726 { name => "eo", sopentypelang => "NTO" },
727 { name => "eo-001" },
728 { name => "es", lcid => 0x0000000a, oemcp => 850, ebcdiccp => 20284, sabbrevlangname => "ESP", sopentypelang => "ESP" },
729 { name => "es-419", lcid => 0x0000580a, sabbrevlangname => "ESJ" },
730 { name => "es-AR", lcid => 0x00002c0a, sabbrevlangname => "ESS" },
731 { name => "es-BO", lcid => 0x0000400a, sabbrevlangname => "ESB" },
732 { name => "es-BR", oemcp => 65001 },
733 { name => "es-BZ", oemcp => 65001 },
734 { name => "es-CL", lcid => 0x0000340a, sabbrevlangname => "ESL" },
735 { name => "es-CO", lcid => 0x0000240a, sabbrevlangname => "ESO" },
736 { name => "es-CR", lcid => 0x0000140a, sabbrevlangname => "ESC" },
737 { name => "es-CU", lcid => 0x00005c0a, sabbrevlangname => "ESK" },
738 { name => "es-DO", lcid => 0x00001c0a, sabbrevlangname => "ESD" },
739 { name => "es-EA" },
740 { name => "es-EC", lcid => 0x0000300a, sabbrevlangname => "ESF" },
741 { name => "es-ES", lcid => 0x00000c0a, sabbrevlangname => "ESN" },
742 { name => "es-ES_tradnl", lcid => 0x0000040a, file => "es_ES" },
743 { name => "es-ES-u-co-trad", alias => "es-ES_tradnl" },
744 { name => "es-GQ" },
745 { name => "es-GT", lcid => 0x0000100a, sabbrevlangname => "ESG" },
746 { name => "es-HN", lcid => 0x0000480a, sabbrevlangname => "ESH" },
747 { name => "es-IC" },
748 { name => "es-MX", lcid => 0x0000080a, sabbrevlangname => "ESM" },
749 { name => "es-NI", lcid => 0x00004c0a, sabbrevlangname => "ESI" },
750 { name => "es-PA", lcid => 0x0000180a, sabbrevlangname => "ESA" },
751 { name => "es-PE", lcid => 0x0000280a, sabbrevlangname => "ESR" },
752 { name => "es-PH" },
753 { name => "es-PR", lcid => 0x0000500a, sabbrevlangname => "ESU" },
754 { name => "es-PY", lcid => 0x00003c0a, sabbrevlangname => "ESZ" },
755 { name => "es-SV", lcid => 0x0000440a, sabbrevlangname => "ESE" },
756 { name => "es-US", lcid => 0x0000540a, sabbrevlangname => "EST" },
757 { name => "es-UY", lcid => 0x0000380a, sabbrevlangname => "ESY" },
758 { name => "es-VE", lcid => 0x0000200a, sabbrevlangname => "ESV" },
759 { name => "et", lcid => 0x00000025, oemcp => 775, group => 3, sabbrevlangname => "ETI", sopentypelang => "ETI" },
760 { name => "et-EE", lcid => 0x00000425 },
761 { name => "eu", lcid => 0x0000002d, oemcp => 850, maccp => 65001, sabbrevlangname => "EUQ", sopentypelang => "EUQ" },
762 { name => "eu-ES", lcid => 0x0000042d },
763 { name => "ewo" },
764 { name => "ewo-CM" },
765 { name => "fa", lcid => 0x00000029, inegnumber => 3, oemcp => 720, slist => "\x{061b}", group => 13, sabbrevlangname => "FAR", sopentypelang => "FAR" },
766 { name => "fa-AF", alias => "prs-AF" },
767 { name => "fa-IR", lcid => 0x00000429 },
768 { name => "ff", lcid => 0x00000067, oemcp => 850, ebcdiccp => 20297 },
769 { name => "ff-CM", alias => "ff-Latn-CM" },
770 { name => "ff-GN", alias => "ff-Latn-GN" },
771 { name => "ff-MR", alias => "ff-Latn-MR" },
772 { name => "ff-NG", alias => "ff-Latn-NG" },
773 { name => "ff-SN", alias => "ff-Latn-SN" },
774 { name => "ff-Adlm" },
775 { name => "ff-Adlm-BF" },
776 { name => "ff-Adlm-CM" },
777 { name => "ff-Adlm-GH" },
778 { name => "ff-Adlm-GM" },
779 { name => "ff-Adlm-GN" },
780 { name => "ff-Adlm-GW" },
781 { name => "ff-Adlm-LR" },
782 { name => "ff-Adlm-MR" },
783 { name => "ff-Adlm-NE" },
784 { name => "ff-Adlm-NG" },
785 { name => "ff-Adlm-SL" },
786 { name => "ff-Adlm-SN" },
787 { name => "ff-Latn", lcid => 0x00007c67 },
788 { name => "ff-Latn-BF", oemcp => 65001 },
789 { name => "ff-Latn-CM" },
790 { name => "ff-Latn-GH", oemcp => 65001 },
791 { name => "ff-Latn-GM", oemcp => 65001 },
792 { name => "ff-Latn-GN" },
793 { name => "ff-Latn-GW", oemcp => 65001 },
794 { name => "ff-Latn-LR", oemcp => 65001 },
795 { name => "ff-Latn-MR" },
796 { name => "ff-Latn-NE", oemcp => 65001 },
797 { name => "ff-Latn-NG", lcid => 0x00000467, sabbrevlangname => "ZZZ" },
798 { name => "ff-Latn-SL", oemcp => 65001 },
799 { name => "ff-Latn-SN", lcid => 0x00000867 },
800 { name => "fi", lcid => 0x0000000b, oemcp => 850, ebcdiccp => 20278 },
801 { name => "fi-FI", lcid => 0x0000040b },
802 { name => "fil", lcid => 0x00000064, oemcp => 437, ebcdiccp => 500, sabbrevlangname => "FPO", sopentypelang => "PIL" },
803 { name => "fil-PH", lcid => 0x00000464 },
804 { name => "fil-Latn", alias => "fil" },
805 { name => "fil-Latn-PH", alias => "fil-PH" },
806 { name => "fo", lcid => 0x00000038, oemcp => 850, maccp => 10079, ebcdiccp => 20277, sabbrevlangname => "FOS", sopentypelang => "FOS" },
807 { name => "fo-DK", oemcp => 65001, maccp => 65001 },
808 { name => "fo-FO", lcid => 0x00000438 },
809 { name => "fr", lcid => 0x0000000c, oemcp => 850, ebcdiccp => 20297 },
810 { name => "fr-029", lcid => 0x00001c0c, file => "fr", sintlsymbol => "XCD", sabbrevlangname => "ZZZ" },
811 { name => "fr-BE", lcid => 0x0000080c, sabbrevlangname => "FRB" },
812 { name => "fr-BF" },
813 { name => "fr-BI" },
814 { name => "fr-BJ" },
815 { name => "fr-BL" },
816 { name => "fr-CA", lcid => 0x00000c0c, sabbrevlangname => "FRC" },
817 { name => "fr-CD", lcid => 0x0000240c, sabbrevlangname => "FRD" },
818 { name => "fr-CF" },
819 { name => "fr-CG" },
820 { name => "fr-CH", lcid => 0x0000100c, sabbrevlangname => "FRS" },
821 { name => "fr-CI", lcid => 0x0000300c, sabbrevlangname => "FRI" },
822 { name => "fr-CM", lcid => 0x00002c0c, sabbrevlangname => "FRE" },
823 { name => "fr-DJ" },
824 { name => "fr-DZ" },
825 { name => "fr-FR", lcid => 0x0000040c },
826 { name => "fr-GA" },
827 { name => "fr-GF" },
828 { name => "fr-GN" },
829 { name => "fr-GP" },
830 { name => "fr-GQ" },
831 { name => "fr-HT", lcid => 0x00003c0c, sabbrevlangname => "FRH" },
832 { name => "fr-KM" },
833 { name => "fr-LU", lcid => 0x0000140c, sabbrevlangname => "FRL" },
834 { name => "fr-MA", lcid => 0x0000380c, sabbrevlangname => "FRO" },
835 { name => "fr-MC", lcid => 0x0000180c, sabbrevlangname => "FRM" },
836 { name => "fr-MF" },
837 { name => "fr-MG" },
838 { name => "fr-ML", lcid => 0x0000340c, sabbrevlangname => "FRF" },
839 { name => "fr-MQ" },
840 { name => "fr-MR" },
841 { name => "fr-MU" },
842 { name => "fr-NC" },
843 { name => "fr-NE" },
844 { name => "fr-PF" },
845 { name => "fr-PM" },
846 { name => "fr-RE", lcid => 0x0000200c, sabbrevlangname => "FRR" },
847 { name => "fr-RW" },
848 { name => "fr-SC" },
849 { name => "fr-SN", lcid => 0x0000280c, sabbrevlangname => "FRN" },
850 { name => "fr-SY" },
851 { name => "fr-TD" },
852 { name => "fr-TG" },
853 { name => "fr-TN" },
854 { name => "fr-VU" },
855 { name => "fr-WF" },
856 { name => "fr-YT" },
857 { name => "fur", sopentypelang => "FRL" },
858 { name => "fur-IT" },
859 { name => "fy", lcid => 0x00000062, oemcp => 850, sabbrevlangname => "FYN", sopentypelang => "FRI" },
860 { name => "fy-NL", lcid => 0x00000462 },
861 { name => "ga", lcid => 0x0000003c, oemcp => 850, sabbrevlangname => "IRE", sopentypelang => "IRI" },
862 { name => "ga-GB" },
863 { name => "ga-IE", lcid => 0x0000083c },
864 { name => "gd", lcid => 0x00000091, oemcp => 850, ebcdiccp => 20285, sopentypelang => "GAE" },
865 { name => "gd-GB", lcid => 0x00000491 },
866 { name => "gd-Latn", alias => "gd" },
867 { name => "gl", lcid => 0x00000056, oemcp => 850, sabbrevlangname => "GLC", sopentypelang => "GAL" },
868 { name => "gl-ES", lcid => 0x00000456 },
869 { name => "gn", lcid => 0x00000074, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sopentypelang => "GUA" },
870 { name => "gn-PY", lcid => 0x00000474, dir => "seed" },
871 { name => "gsw", lcid => 0x00000084, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "ZZZ", sopentypelang => "ALS" },
872 { name => "gsw-CH" },
873 { name => "gsw-FR", lcid => 0x00000484, sabbrevlangname => "GSW" },
874 { name => "gsw-LI" },
875 { name => "gu", lcid => 0x00000047, slist => ",", group => 15 },
876 { name => "gu-IN", lcid => 0x00000447 },
877 { name => "guz" },
878 { name => "guz-KE" },
879 { name => "gv", sopentypelang => "MNX" },
880 { name => "gv-IM" },
881 { name => "ha", lcid => 0x00000068, oemcp => 437 },
882 { name => "ha-GH", alias => "ha-Latn-GH" },
883 { name => "ha-Latn", lcid => 0x00007c68, file => "ha" },
884 { name => "ha-Latn-GH", file => "ha_GH", ebcdiccp => 500 },
885 { name => "ha-Latn-NE", file => "ha_NE", ebcdiccp => 500 },
886 { name => "ha-Latn-NG", lcid => 0x00000468, file => "ha_NG" },
887 { name => "ha-NE", alias => "ha-Latn-NE" },
888 { name => "ha-NG", alias => "ha-Latn-NG" },
889 { name => "haw", lcid => 0x00000075, oemcp => 437 },
890 { name => "haw-Latn", alias => "haw" },
891 { name => "haw-Latn-US", alias => "haw-US" },
892 { name => "haw-US", lcid => 0x00000475 },
893 { name => "he", lcid => 0x0000000d, oemcp => 862, slist => ",", group => 12, sopentypelang => "IWR" },
894 { name => "he-IL", lcid => 0x0000040d },
895 { name => "hi", lcid => 0x00000039, slist => ",", group => 15 },
896 { name => "hi-IN", lcid => 0x00000439 },
897 { name => "hr", lcid => 0x0000001a, inegnumber => 2, oemcp => 852, maccp => 10082, group => 2 },
898 { name => "hr-BA", lcid => 0x0000101a, ebcdiccp => 870, inegnumber => 1, sabbrevlangname => "HRB" },
899 { name => "hr-HR", lcid => 0x0000041a },
900 { name => "hsb", lcid => 0x0000002e, oemcp => 850, ebcdiccp => 870, sopentypelang => "USB" },
901 { name => "hsb-DE", lcid => 0x0000042e },
902 { name => "hu", lcid => 0x0000000e, oemcp => 852, group => 2 },
903 { name => "hu-HU", lcid => 0x0000040e },
904 { name => "hu-HU_technl", lcid => 0x0001040e, alias => "hu-HU" },
905 { name => "hy", lcid => 0x0000002b, slist => ",", group => 17 },
906 { name => "hy-AM", lcid => 0x0000042b },
907 { name => "ia" },
908 { name => "ia-001" },
909 ## name => "ibb", lcid => 0x00000069 },
910 ## name => "ibb-NG", lcid => 0x00000469 },
911 { name => "id", lcid => 0x00000021, oemcp => 850 },
912 { name => "id-ID", lcid => 0x00000421 },
913 { name => "ig", lcid => 0x00000070, oemcp => 437 },
914 { name => "ig-Latn", alias => "ig" },
915 { name => "ig-Latn-NG", alias => "ig-NG" },
916 { name => "ig-NG", lcid => 0x00000470 },
917 { name => "ii", lcid => 0x00000078, group => 9, sopentypelang => "YIM" },
918 { name => "ii-CN", lcid => 0x00000478 },
919 { name => "ii-Yiii", alias => "ii" },
920 { name => "ii-Yiii-CN", alias => "ii-CN" },
921 { name => "is", lcid => 0x0000000f, oemcp => 850, maccp => 10079, ebcdiccp => 20871 },
922 { name => "is-IS", lcid => 0x0000040f },
923 { name => "it", lcid => 0x00000010, oemcp => 850, ebcdiccp => 20280 },
924 { name => "it-CH", lcid => 0x00000810, ebcdiccp => 500, sabbrevlangname => "ITS" },
925 { name => "it-IT", lcid => 0x00000410 },
926 { name => "it-SM" },
927 { name => "it-VA", oemcp => 65001 },
928 { name => "iu", lcid => 0x0000005d, oemcp => 437, slist => ",", sortlocale => "iu-Latn-CA", dir => "seed", sabbrevlangname => "IUK", sopentypelang => "INU" },
929 { name => "iu-Cans", lcid => 0x0000785d, file => "iu", oemcp => 65001, dir => "seed", sabbrevlangname => "IUS" },
930 { name => "iu-Cans-CA", lcid => 0x0000045d, file => "iu_CA", dir => "seed" },
931 { name => "iu-Latn", lcid => 0x00007c5d, dir => "seed" },
932 { name => "iu-Latn-CA", lcid => 0x0000085d, dir => "seed" },
933 { name => "ja", lcid => 0x00000011, ireadinglayout => 2, oemcp => 932, slist => ",", sscripts => "Hani Hira Jpan Kana", group => 7, sopentypelang => "JAN" },
934 { name => "ja-JP", lcid => 0x00000411 },
935 { name => "ja-JP_radstr", lcid => 0x00040411, alias => "ja-JP" },
936 { name => "ja-JP-u-co-unihan", alias => "ja-JP_radstr" },
937 { name => "jgo" },
938 { name => "jgo-CM" },
939 { name => "jmc" },
940 { name => "jmc-TZ" },
941 { name => "jv", oemcp => 850 },
942 { name => "jv-ID", alias => "jv-Latn-ID" },
943 ## name => "jv-Java" },
944 ## name => "jv-Java-ID" },
945 { name => "jv-Latn", file => "jv" },
946 { name => "jv-Latn-ID", file => "jv_ID" },
947 { name => "ka", lcid => 0x00000037, group => 16 },
948 { name => "ka-GE", lcid => 0x00000437 },
949 { name => "ka-GE_modern", lcid => 0x00010437, alias => "ka-GE" },
950 { name => "kab", sopentypelang => "KAB0" },
951 { name => "kab-DZ" },
952 { name => "kam", sopentypelang => "KMB" },
953 { name => "kam-KE" },
954 { name => "kde" },
955 { name => "kde-TZ" },
956 { name => "kea" },
957 { name => "kea-CV" },
958 { name => "kgp" },
959 { name => "kgp-BR" },
960 { name => "khq" },
961 { name => "khq-ML" },
962 { name => "ki" },
963 { name => "ki-KE" },
964 { name => "kk", lcid => 0x0000003f, group => 5, sabbrevlangname => "KKZ" },
965 { name => "kk-Cyrl", alias => "kk" },
966 { name => "kk-Cyrl-KZ", alias => "kk-KZ" },
967 { name => "kk-KZ", lcid => 0x0000043f },
968 { name => "kkj" },
969 { name => "kkj-CM" },
970 { name => "kl", lcid => 0x0000006f, oemcp => 850, ebcdiccp => 20277, sopentypelang => "GRN" },
971 { name => "kl-GL", lcid => 0x0000046f },
972 { name => "kln", sopentypelang => "KAL" },
973 { name => "kln-KE" },
974 { name => "km", lcid => 0x00000053, inegnumber => 2, slist => ",", group => 15 },
975 { name => "km-KH", lcid => 0x00000453 },
976 { name => "kn", lcid => 0x0000004b, slist => ",", group => 15, sabbrevlangname => "KDI" },
977 { name => "kn-IN", lcid => 0x0000044b },
978 { name => "ko", lcid => 0x00000012, ireadinglayout => 2, slist => ",", oemcp => 949, ebcdiccp => 20833, sscripts => "Hang Hani Kore", group => 8 },
979 { name => "ko-KP", oemcp => 65001 },
980 { name => "ko-KR", lcid => 0x00000412 },
981 { name => "kok", lcid => 0x00000057, slist => ",", group => 15, sabbrevlangname => "KNK" },
982 { name => "kok-IN", lcid => 0x00000457 },
983 { name => "kr", lcid => 0x00000071, sortlocale => "kr-Latn-NG", oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "KNR" },
984 { name => "kr-Latn", file => "kr", dir => "exemplars" },
985 { name => "kr-Latn-NG", lcid => 0x00000471, file => "kr", dir => "exemplars" },
986 { name => "kr-NG", alias => "kr-Latn-NG" },
987 { name => "ks", lcid => 0x00000060, group => 15, sabbrevlangname => "ZZZ", sopentypelang => "KSH" },
988 { name => "ks-Arab", lcid => 0x00000460 },
989 { name => "ks-Arab-IN" },
990 { name => "ks-Deva", slist => ",", dir => "seed" },
991 { name => "ks-Deva-IN", lcid => 0x00000860, dir => "seed" },
992 { name => "ks-IN", alias => "ks-Arab-IN" },
993 { name => "ksb" },
994 { name => "ksb-TZ" },
995 { name => "ksf" },
996 { name => "ksf-CM" },
997 { name => "ksh", sopentypelang => "KSH0" },
998 { name => "ksh-DE" },
999 { name => "ku", lcid => 0x00000092, file => "ckb", slist => "\x{061b}", sortlocale => "ku-Arab-IQ", oemcp => 720 },
1000 { name => "ku-Arab", lcid => 0x00007c92, file => "ckb", group => 13 },
1001 { name => "ku-Arab-IQ", lcid => 0x00000492, file => "ckb_IQ" },
1002 { name => "ku-Arab-IR", file => "ckb_IR", oemcp => 65001 },
1003 { name => "kw" },
1004 { name => "kw-GB" },
1005 { name => "ky", lcid => 0x00000040, oemcp => 866, group => 5, sabbrevlangname => "KYR" },
1006 { name => "ky-Cyrl", alias => "ky" },
1007 { name => "ky-Cyrl-KG", alias => "ky-KG" },
1008 { name => "ky-KG", lcid => 0x00000440 },
1009 { name => "la", lcid => 0x00000076, oemcp => 437, slist => ",", dir => "seed", sabbrevlangname => "ZZZ" },
1010 { name => "la-001", lcid => 0x00000476, file => "la", dir => "seed" },
1011 { name => "lag" },
1012 { name => "lag-TZ" },
1013 { name => "lb", lcid => 0x0000006e, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "LBX" },
1014 { name => "lb-LU", lcid => 0x0000046e },
1015 { name => "lg" },
1016 { name => "lg-UG" },
1017 { name => "lkt" },
1018 { name => "lkt-US" },
1019 { name => "ln" },
1020 { name => "ln-AO" },
1021 { name => "ln-CD" },
1022 { name => "ln-CF" },
1023 { name => "ln-CG" },
1024 { name => "lo", lcid => 0x00000054, group => 15 },
1025 { name => "lo-LA", lcid => 0x00000454 },
1026 { name => "lrc" },
1027 { name => "lrc-IQ" },
1028 { name => "lrc-IR" },
1029 { name => "lt", lcid => 0x00000027, oemcp => 775, group => 3, sabbrevlangname => "LTH", sopentypelang => "LTH" },
1030 { name => "lt-LT", lcid => 0x00000427 },
1031 { name => "lu" },
1032 { name => "lu-CD" },
1033 { name => "luo" },
1034 { name => "luo-KE" },
1035 { name => "luy", sopentypelang => "LUH" },
1036 { name => "luy-KE" },
1037 { name => "lv", lcid => 0x00000026, oemcp => 775, group => 3, sabbrevlangname => "LVI", sopentypelang => "LVI" },
1038 { name => "lv-LV", lcid => 0x00000426 },
1039 { name => "mai" },
1040 { name => "mai-IN" },
1041 { name => "mas" },
1042 { name => "mas-KE" },
1043 { name => "mas-TZ" },
1044 { name => "mer" },
1045 { name => "mer-KE" },
1046 { name => "mfe" },
1047 { name => "mfe-MU" },
1048 { name => "mg" },
1049 { name => "mg-MG" },
1050 { name => "mgh" },
1051 { name => "mgh-MZ" },
1052 { name => "mgo" },
1053 { name => "mgo-CM" },
1054 { name => "mi", lcid => 0x00000081, slist => "," },
1055 { name => "mi-Latn", alias => "mi" },
1056 { name => "mi-Latn-NZ", alias => "mi-NZ" },
1057 { name => "mi-NZ", lcid => 0x00000481 },
1058 { name => "mk", lcid => 0x0000002f, oemcp => 866, ebcdiccp => 500, group => 5, sabbrevlangname => "MKI" },
1059 { name => "mk-MK", lcid => 0x0000042f },
1060 { name => "ml", lcid => 0x0000004c, group => 15, sabbrevlangname => "MYM", sopentypelang => "MLR" },
1061 { name => "ml-IN", lcid => 0x0000044c },
1062 { name => "mn", lcid => 0x00000050, oemcp => 866, sopentypelang => "MNG" },
1063 { name => "mn-Cyrl", lcid => 0x00007850, file => "mn", sabbrevlangname => "MNN" },
1064 { name => "mn-Cyrl-MN", alias => "mn-MN" },
1065 { name => "mn-MN", lcid => 0x00000450, sparent => "mn-Cyrl", group => 5 },
1066 { name => "mn-Mong", lcid => 0x00007c50, oemcp => 65001, slist => ",", group => 15, dir => "seed", sabbrevlangname => "MNG" },
1067 { name => "mn-Mong-CN", lcid => 0x00000850, dir => "seed" },
1068 { name => "mn-Mong-MN", lcid => 0x00000c50, dir => "seed", sabbrevlangname => "MNM" },
1069 { name => "mni", lcid => 0x00000058, slist => ",", sabbrevlangname => "ZZZ" },
1070 { name => "mni-IN", lcid => 0x00000458, file => "mni_Beng_IN" },
1071 { name => "moh", lcid => 0x0000007c, oemcp => 850, ebcdiccp => 37, slist => ",", dir => "seed", sabbrevlangname => "MWK" },
1072 { name => "moh-CA", lcid => 0x0000047c, dir => "seed" },
1073 { name => "moh-Latn", alias => "moh" },
1074 { name => "moh-Latn-CA", alias => "moh-CA" },
1075 { name => "mr", lcid => 0x0000004e, slist => ",", group => 15 },
1076 { name => "mr-IN", lcid => 0x0000044e },
1077 { name => "ms", lcid => 0x0000003e, oemcp => 850, sabbrevlangname => "MSL", sopentypelang => "MLY" },
1078 { name => "ms-BN", lcid => 0x0000083e, sabbrevlangname => "MSB" },
1079 { name => "ms-ID" },
1080 { name => "ms-Latn", alias => "ms" },
1081 { name => "ms-Latn-BN", alias => "ms-BN" },
1082 { name => "ms-Latn-MY", alias => "ms-MY" },
1083 { name => "ms-Latn-SG", alias => "ms-SG" },
1084 { name => "ms-MY", lcid => 0x0000043e },
1085 { name => "ms-SG" },
1086 { name => "mt", lcid => 0x0000003a, sopentypelang => "MTS" },
1087 { name => "mt-MT", lcid => 0x0000043a },
1088 { name => "mua" },
1089 { name => "mua-CM" },
1090 { name => "my", lcid => 0x00000055, sopentypelang => "BRM" },
1091 { name => "my-MM", lcid => 0x00000455 },
1092 { name => "mzn" },
1093 { name => "mzn-IR" },
1094 { name => "naq" },
1095 { name => "naq-NA" },
1096 { name => "nb", lcid => 0x00007c14, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NOR", sopentypelang => "NOR" },
1097 { name => "nb-NO", lcid => 0x00000414 },
1098 { name => "nb-SJ" },
1099 { name => "nd", sopentypelang => "NDB" },
1100 { name => "nd-ZW" },
1101 { name => "nds" },
1102 { name => "nds-DE" },
1103 { name => "nds-NL" },
1104 { name => "ne", lcid => 0x00000061, slist => "," },
1105 { name => "ne-IN", lcid => 0x00000861, sabbrevlangname => "NEI" },
1106 { name => "ne-NP", lcid => 0x00000461, group => 15 },
1107 { name => "nl", lcid => 0x00000013, oemcp => 850 },
1108 { name => "nl-AW" },
1109 { name => "nl-BE", lcid => 0x00000813, sabbrevlangname => "NLB" },
1110 { name => "nl-BQ" },
1111 { name => "nl-CW" },
1112 { name => "nl-NL", lcid => 0x00000413 },
1113 { name => "nl-SR" },
1114 { name => "nl-SX" },
1115 { name => "nmg" },
1116 { name => "nmg-CM" },
1117 { name => "nn", lcid => 0x00007814, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NON", sopentypelang => "NYN" },
1118 { name => "nn-NO", lcid => 0x00000814 },
1119 { name => "nnh" },
1120 { name => "nnh-CM" },
1121 { name => "no", lcid => 0x00000014, oemcp => 850, ebcdiccp => 20277, sortlocale => "nb-NO" },
1122 { name => "nqo", idigits => 3, inegnumber => 3, slist => "\x{060c}", dir => "seed", sopentypelang => "NKO" },
1123 { name => "nqo-GN", dir => "seed" },
1124 { name => "nr", dir => "seed", sopentypelang => "NDB" },
1125 { name => "nr-ZA", dir => "seed" },
1126 { name => "nso", lcid => 0x0000006c, oemcp => 850, dir => "seed", sopentypelang => "SOT" },
1127 { name => "nso-ZA", lcid => 0x0000046c, dir => "seed" },
1128 { name => "nus" },
1129 { name => "nus-SD", alias => "nus-SS" },
1130 { name => "nus-SS" },
1131 { name => "nyn", sopentypelang => "NKL" },
1132 { name => "nyn-UG" },
1133 { name => "oc", lcid => 0x00000082, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
1134 { name => "oc-FR", lcid => 0x00000482, dir => "seed" },
1135 { name => "oc-Latn", alias => "oc" },
1136 { name => "oc-Latn-FR", alias => "oc-FR" },
1137 { name => "om", lcid => 0x00000072, sopentypelang => "ORO" },
1138 { name => "om-ET", lcid => 0x00000472 },
1139 { name => "om-KE" },
1140 { name => "or", lcid => 0x00000048, slist => ",", group => 15 },
1141 { name => "or-IN", lcid => 0x00000448 },
1142 { name => "os" },
1143 { name => "os-GE" },
1144 { name => "os-RU" },
1145 { name => "pa", lcid => 0x00000046, slist => "," },
1146 { name => "pa-Arab", lcid => 0x00007c46, slist => ";", inegnumber => 2, oemcp => 720, group => 13, sabbrevlangname => "PAP" },
1147 { name => "pa-Arab-PK", lcid => 0x00000846 },
1148 { name => "pa-Guru" },
1149 { name => "pa-Guru-IN", alias => "pa-IN" },
1150 { name => "pa-IN", lcid => 0x00000446, sparent => "pa-Guru", file => "pa_Guru_IN", group => 15 },
1151 ## name => "pap", lcid => 0x00000079 },
1152 ## name => "pap-029", lcid => 0x00000479 },
1153 { name => "pcm" },
1154 { name => "pcm-NG" },
1155 { name => "pl", lcid => 0x00000015, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "PLK", sopentypelang => "PLK" },
1156 { name => "pl-PL", lcid => 0x00000415 },
1157 ## name => "prg" },
1158 ## name => "prg-001" },
1159 { name => "prs", lcid => 0x0000008c, file => "fa", inegnumber => 3, oemcp => 720, group => 13, sopentypelang => "DRI" },
1160 { name => "prs-AF", lcid => 0x0000048c, file => "fa_AF" },
1161 { name => "prs-Arab", alias => "prs" },
1162 { name => "prs-Arab-AF", alias => "prs-AF" },
1163 { name => "ps", lcid => 0x00000063, group => 13, sabbrevlangname => "PAS", sopentypelang => "PAS" },
1164 { name => "ps-AF", lcid => 0x00000463 },
1165 { name => "ps-PK" },
1166 { name => "pt", lcid => 0x00000016, oemcp => 850, sabbrevlangname => "PTB", sopentypelang => "PTG" },
1167 { name => "pt-AO" },
1168 { name => "pt-BR", lcid => 0x00000416 },
1169 { name => "pt-CH", oemcp => 65001 },
1170 { name => "pt-CV" },
1171 { name => "pt-GQ", oemcp => 65001 },
1172 { name => "pt-GW" },
1173 { name => "pt-LU", oemcp => 65001 },
1174 { name => "pt-MO" },
1175 { name => "pt-MZ" },
1176 { name => "pt-PT", lcid => 0x00000816, sabbrevlangname => "PTG" },
1177 { name => "pt-ST" },
1178 { name => "pt-TL" },
1179 ## name => "qps-Latn-x-sh", lcid => 0x80000901 },
1180 ## name => "qps-ploc", lcid => 0x80000501 },
1181 ## name => "qps-ploca", lcid => 0x800005fe },
1182 ## name => "qps-plocm", lcid => 0x800009ff },
1183 { name => "qu", alias => "quz" },
1184 { name => "qu-BO", alias => "quz-BO" },
1185 { name => "qu-EC", alias => "quz-EC" },
1186 { name => "qu-PE", alias => "quz-PE" },
1187 { name => "quc", lcid => 0x00000086, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed" },
1188 { name => "quc-Latn", lcid => 0x00007c86, file => "quc", dir => "seed" },
1189 { name => "quc-Latn-GT", lcid => 0x00000486, file => "quc_GT", dir => "seed" },
1190 { name => "qut", alias => "quc" },
1191 { name => "qut-GT", alias => "quc-Latn-GT" },
1192 { name => "quz", lcid => 0x0000006b, file => "qu", territory => "BO", oemcp => 850, ebcdiccp => 20284, slist => "," },
1193 { name => "quz-BO", lcid => 0x0000046b, file => "qu_BO" },
1194 { name => "quz-EC", lcid => 0x0000086b, file => "qu_EC" },
1195 { name => "quz-Latn", alias => "quz" },
1196 { name => "quz-Latn-BO", alias => "quz-BO" },
1197 { name => "quz-Latn-EC", alias => "quz-EC" },
1198 { name => "quz-Latn-PE", alias => "quz-PE" },
1199 { name => "quz-PE", lcid => 0x00000c6b, file => "qu_PE" },
1200 { name => "rm", lcid => 0x00000017, oemcp => 850, ebcdiccp => 20273, sabbrevlangname => "RMC", sopentypelang => "RMS" },
1201 { name => "rm-CH", lcid => 0x00000417 },
1202 { name => "rn" },
1203 { name => "rn-BI" },
1204 { name => "ro", lcid => 0x00000018, oemcp => 852, ebcdiccp => 20880, sabbrevlangname => "ROM", sopentypelang => "ROM" },
1205 { name => "ro-MD", lcid => 0x00000818, maccp => 65001, sabbrevlangname => "ROD" },
1206 { name => "ro-RO", lcid => 0x00000418, group => 2 },
1207 { name => "rof" },
1208 { name => "rof-TZ" },
1209 { name => "ru", lcid => 0x00000019, oemcp => 866 },
1210 { name => "ru-BY", maccp => 65001 },
1211 { name => "ru-KG", maccp => 65001 },
1212 { name => "ru-KZ", maccp => 65001 },
1213 { name => "ru-MD", lcid => 0x00000819, maccp => 65001, sabbrevlangname => "RUM" },
1214 { name => "ru-RU", lcid => 0x00000419, group => 5 },
1215 { name => "ru-UA", maccp => 65001 },
1216 { name => "rw", lcid => 0x00000087, oemcp => 437, sopentypelang => "RUA" },
1217 { name => "rw-RW", lcid => 0x00000487 },
1218 { name => "rwk" },
1219 { name => "rwk-TZ" },
1220 { name => "sa", lcid => 0x0000004f, slist => ",", group => 15 },
1221 { name => "sa-Deva", alias => "sa" },
1222 { name => "sa-Deva-IN", alias => "sa-IN" },
1223 { name => "sa-IN", lcid => 0x0000044f },
1224 { name => "sah", lcid => 0x00000085, oemcp => 866, group => 5, sopentypelang => "YAK" },
1225 { name => "sah-Cyrl", alias => "sah" },
1226 { name => "sah-Cyrl-RU", alias => "sah-RU" },
1227 { name => "sah-RU", lcid => 0x00000485 },
1228 { name => "saq" },
1229 { name => "saq-KE" },
1230 { name => "sat" },
1231 { name => "sat-Olck" },
1232 { name => "sat-Olck-IN" },
1233 { name => "sbp" },
1234 { name => "sbp-TZ" },
1235 { name => "sc" },
1236 { name => "sc-IT" },
1237 { name => "sd", lcid => 0x00000059, inegnumber => 3, oemcp => 720, sabbrevlangname => "SIP" },
1238 { name => "sd-Arab", lcid => 0x00007c59, group => 13 },
1239 { name => "sd-Arab-PK", lcid => 0x00000859 },
1240 { name => "sd-Deva", inegnumber => 1, slist => ",", oemcp => 65001, group => 15 },
1241 { name => "sd-Deva-IN", lcid => 0x00000459, sabbrevlangname => "ZZZ" },
1242 { name => "sd-PK", alias => "sd-Arab-PK" },
1243 { name => "se", lcid => 0x0000003b, oemcp => 850, ebcdiccp => 20277, sopentypelang => "NSM" },
1244 { name => "se-FI", lcid => 0x00000c3b, ebcdiccp => 20278, sabbrevlangname => "SMG" },
1245 { name => "se-NO", lcid => 0x0000043b },
1246 { name => "se-SE", lcid => 0x0000083b, ebcdiccp => 20278, sabbrevlangname => "SMF" },
1247 { name => "se-Latn", alias => "se" },
1248 { name => "se-Latn-FI", alias => "se-FI" },
1249 { name => "se-Latn-NO", alias => "se-NO" },
1250 { name => "se-Latn-SE", alias => "se-SE" },
1251 { name => "seh" },
1252 { name => "seh-MZ" },
1253 { name => "ses" },
1254 { name => "ses-ML" },
1255 { name => "sg", sopentypelang => "SGO" },
1256 { name => "sg-CF" },
1257 { name => "shi" },
1258 { name => "shi-Latn" },
1259 { name => "shi-Latn-MA" },
1260 { name => "shi-Tfng" },
1261 { name => "shi-Tfng-MA" },
1262 { name => "si", lcid => 0x0000005b, group => 15, sopentypelang => "SNH" },
1263 { name => "si-LK", lcid => 0x0000045b },
1264 { name => "sk", lcid => 0x0000001b, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "SKY", sopentypelang => "SKY" },
1265 { name => "sk-SK", lcid => 0x0000041b },
1266 { name => "sl", lcid => 0x00000024, oemcp => 852, ebcdiccp => 20880, group => 2 },
1267 { name => "sl-SI", lcid => 0x00000424 },
1268 { name => "sma", lcid => 0x0000783b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMB", sopentypelang => "SSM" },
1269 { name => "sma-Latn", alias => "sma" },
1270 { name => "sma-Latn-NO", alias => "sma-NO" },
1271 { name => "sma-Latn-SE", alias => "sma-SE" },
1272 { name => "sma-NO", lcid => 0x0000183b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMA" },
1273 { name => "sma-SE", lcid => 0x00001c3b, dir => "seed" },
1274 { name => "smj", lcid => 0x00007c3b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMK", sopentypelang => "LSM" },
1275 { name => "smj-Latn", alias => "smj" },
1276 { name => "smj-Latn-NO", alias => "smj-NO" },
1277 { name => "smj-Latn-SE", alias => "smj-SE" },
1278 { name => "smj-NO", lcid => 0x0000103b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMJ" },
1279 { name => "smj-SE", lcid => 0x0000143b, dir => "seed" },
1280 { name => "smn", lcid => 0x0000703b, sparent => "se", ebcdiccp => 20278, sopentypelang => "ISM" },
1281 { name => "smn-FI", lcid => 0x0000243b },
1282 { name => "smn-Latn", alias => "smn" },
1283 { name => "smn-Latn-FI", alias => "smn-FI" },
1284 { name => "sms", lcid => 0x0000743b, sparent => "se", ebcdiccp => 20278, dir => "seed", sopentypelang => "SKS" },
1285 { name => "sms-FI", lcid => 0x0000203b, dir => "seed" },
1286 { name => "sms-Latn", alias => "sms" },
1287 { name => "sms-Latn-FI", alias => "sms-FI" },
1288 { name => "sn", sopentypelang => "SNA0" },
1289 { name => "sn-Latn", file => "sn" },
1290 { name => "sn-Latn-ZW", file => "sn_ZW" },
1291 { name => "sn-ZW", alias => "sn-Latn-ZW" },
1292 { name => "so", lcid => 0x00000077, sopentypelang => "SML" },
1293 { name => "so-DJ" },
1294 { name => "so-ET" },
1295 { name => "so-KE" },
1296 { name => "so-SO", lcid => 0x00000477 },
1297 { name => "sq", lcid => 0x0000001c, oemcp => 852, ebcdiccp => 20880, group => 2 },
1298 { name => "sq-AL", lcid => 0x0000041c },
1299 { name => "sq-MK" },
1300 { name => "sq-XK" },
1301 { name => "sr", lcid => 0x00007c1a, sortlocale => "sr-Latn-RS", oemcp => 852, group => 2, sabbrevlangname => "SRB", sopentypelang => "SRB" },
1302 { name => "sr-Cyrl", lcid => 0x00006c1a, oemcp => 855, ebcdiccp => 21025, group => 5, sabbrevlangname => "SRO" },
1303 { name => "sr-Cyrl-BA", lcid => 0x00001c1a, sabbrevlangname => "SRN" },
1304 { name => "sr-Cyrl-ME", lcid => 0x0000301a, sabbrevlangname => "SRQ" },
1305 { name => "sr-Cyrl-RS", lcid => 0x0000281a },
1306 { name => "sr-Cyrl-XK" },
1307 { name => "sr-Latn", lcid => 0x0000701a, sabbrevlangname => "SRM" },
1308 { name => "sr-Latn-BA", lcid => 0x0000181a, maccp => 10082, ebcdiccp => 870, sabbrevlangname => "SRS" },
1309 { name => "sr-Latn-ME", lcid => 0x00002c1a, sabbrevlangname => "SRP" },
1310 { name => "sr-Latn-RS", lcid => 0x0000241a, sabbrevlangname => "SRM" },
1311 { name => "sr-Latn-XK" },
1312 ## name => "sr-Cyrl-CS", lcid => 0x00000c1a },
1313 ## name => "sr-Latn-CS", lcid => 0x0000081a },
1314 { name => "ss", dir => "seed", sopentypelang => "SWZ" },
1315 { name => "ss-SZ", dir => "seed" },
1316 { name => "ss-ZA", dir => "seed" },
1317 { name => "ssy", dir => "seed" },
1318 { name => "ssy-ER", dir => "seed" },
1319 { name => "st", lcid => 0x00000030, dir => "seed" },
1320 { name => "st-LS", dir => "seed" },
1321 { name => "st-ZA", lcid => 0x00000430, dir => "seed" },
1322 { name => "su" },
1323 { name => "su-Latn" },
1324 { name => "su-Latn-ID" },
1325 { name => "sv", lcid => 0x0000001d, oemcp => 850, ebcdiccp => 20278, sabbrevlangname => "SVE", sopentypelang => "SVE" },
1326 { name => "sv-AX" },
1327 { name => "sv-FI", lcid => 0x0000081d, sabbrevlangname => "SVF" },
1328 { name => "sv-SE", lcid => 0x0000041d, sabbrevlangname => "SVE" },
1329 { name => "sw", lcid => 0x00000041, territory => "KE", oemcp => 437, ebcdiccp => 500, sabbrevlangname => "SWK", sopentypelang => "SWK" },
1330 { name => "sw-CD" },
1331 { name => "sw-KE", lcid => 0x00000441 },
1332 { name => "sw-TZ" },
1333 { name => "sw-UG" },
1334 { name => "swc-CD", alias => "sw-CD" },
1335 { name => "syr", lcid => 0x0000005a, slist => ",", group => 13, dir => "seed" },
1336 { name => "syr-SY", lcid => 0x0000045a, dir => "seed" },
1337 { name => "syr-Syrc", alias => "syr" },
1338 { name => "syr-Syrc-SY", alias => "syr-SY" },
1339 { name => "ta", lcid => 0x00000049, slist => ",", group => 15, sabbrevlangname => "TAI" },
1340 { name => "ta-IN", lcid => 0x00000449 },
1341 { name => "ta-LK", lcid => 0x00000849, sabbrevlangname => "TAM" },
1342 { name => "ta-MY" },
1343 { name => "ta-SG" },
1344 { name => "te", lcid => 0x0000004a, group => 15 },
1345 { name => "te-IN", lcid => 0x0000044a },
1346 { name => "teo" },
1347 { name => "teo-KE" },
1348 { name => "teo-UG" },
1349 { name => "tg", lcid => 0x00000028, oemcp => 866, group => 5, sabbrevlangname => "TAJ", sopentypelang => "TAJ" },
1350 { name => "tg-Cyrl", lcid => 0x00007c28, file => "tg" },
1351 { name => "tg-Cyrl-TJ", lcid => 0x00000428, file => "tg_TJ" },
1352 { name => "tg-TJ", alias => "tg-Cyrl-TJ" },
1353 { name => "th", lcid => 0x0000001e, oemcp => 874, ebcdiccp => 20838, slist => ",", group => 11 },
1354 { name => "th-TH", lcid => 0x0000041e },
1355 { name => "ti", lcid => 0x00000073, territory => "ER", sopentypelang => "TGY" },
1356 { name => "ti-ER", lcid => 0x00000873 },
1357 { name => "ti-ET", lcid => 0x00000473, sabbrevlangname => "TIE" },
1358 { name => "tig", dir => "seed", sopentypelang => "TGR" },
1359 { name => "tig-ER", dir => "seed" },
1360 { name => "tk", lcid => 0x00000042, oemcp => 852, ebcdiccp => 20880, group => 2, sopentypelang => "TKM" },
1361 { name => "tk-Latn", alias => "tk" },
1362 { name => "tk-Latn-TM", alias => "tk-TM" },
1363 { name => "tk-TM", lcid => 0x00000442 },
1364 { name => "tn", lcid => 0x00000032, oemcp => 850, dir => "seed", sopentypelang => "TNA" },
1365 { name => "tn-BW", lcid => 0x00000832, dir => "seed", sabbrevlangname => "TSB" },
1366 { name => "tn-ZA", lcid => 0x00000432, dir => "seed" },
1367 { name => "to", sopentypelang => "TGN" },
1368 { name => "to-TO" },
1369 { name => "tr", lcid => 0x0000001f, oemcp => 857, ebcdiccp => 20905, group => 6, sabbrevlangname => "TRK", sopentypelang => "TRK" },
1370 { name => "tr-CY" },
1371 { name => "tr-TR", lcid => 0x0000041f },
1372 { name => "ts", lcid => 0x00000031, dir => "seed", sopentypelang => "TSG" },
1373 { name => "ts-ZA", lcid => 0x00000431, dir => "seed" },
1374 { name => "tt", lcid => 0x00000044, oemcp => 866, group => 5, sabbrevlangname => "TTT" },
1375 { name => "tt-Cyrl", alias => "tt" },
1376 { name => "tt-Cyrl-RU", alias => "tt-RU" },
1377 { name => "tt-RU", lcid => 0x00000444 },
1378 { name => "twq" },
1379 { name => "twq-NE" },
1380 { name => "tzm", lcid => 0x0000005f, sortlocale => "tzm-Latn-DZ", oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "TZA" },
1381 { name => "tzm-Latn", lcid => 0x00007c5f, territory => "DZ", file => "tzm" },
1382 { name => "tzm-Latn-MA", file => "tzm_MA", oemcp => 65001 },
1383 { name => "tzm-Latn-DZ", lcid => 0x0000085f, file => "tzm" },
1384 { name => "tzm-MA", alias => "tzm-Latn-MA" },
1385 ## name => "tzm-Arab", group => 13 },
1386 ## name => "tzm-Arab-MA", lcid => 0x0000045f },
1387 ## name => "tzm-Tfng", lcid => 0x0000785f },
1388 ## name => "tzm-Tfng-MA", lcid => 0x0000105f },
1389 { name => "ug", lcid => 0x00000080, oemcp => 720, slist => ",", group => 13, sopentypelang => "UYG" },
1390 { name => "ug-Arab", alias => "ug" },
1391 { name => "ug-Arab-CN", alias => "ug-CN" },
1392 { name => "ug-CN", lcid => 0x00000480 },
1393 { name => "uk", lcid => 0x00000022, oemcp => 866, maccp => 10017, ebcdiccp => 500, group => 5 },
1394 { name => "uk-UA", lcid => 0x00000422 },
1395 { name => "ur", lcid => 0x00000020, oemcp => 720 },
1396 { name => "ur-IN", lcid => 0x00000820, maccp => 65001, sabbrevlangname => "URI" },
1397 { name => "ur-PK", lcid => 0x00000420, group => 13 },
1398 { name => "uz", lcid => 0x00000043, oemcp => 857, maccp => 10029, group => 2 },
1399 { name => "uz-Arab", oemcp => 65001, maccp => 65001 },
1400 { name => "uz-Arab-AF" },
1401 { name => "uz-Cyrl", lcid => 0x00007843, oemcp => 866, maccp => 10007, group => 5, sabbrevlangname => "UZC" },
1402 { name => "uz-Cyrl-UZ", lcid => 0x00000843 },
1403 { name => "uz-Latn", lcid => 0x00007c43 },
1404 { name => "uz-Latn-UZ", lcid => 0x00000443 },
1405 { name => "vai" },
1406 { name => "vai-Latn" },
1407 { name => "vai-Latn-LR" },
1408 { name => "vai-Vaii" },
1409 { name => "vai-Vaii-LR" },
1410 { name => "ve", lcid => 0x00000033, dir => "seed", sabbrevlangname => "ZZZ" },
1411 { name => "ve-ZA", lcid => 0x00000433, dir => "seed" },
1412 { name => "vi", lcid => 0x0000002a, oemcp => 1258, slist => ",", group => 14, sabbrevlangname => "VIT", sopentypelang => "VIT" },
1413 { name => "vi-VN", lcid => 0x0000042a },
1414 { name => "vo", dir => "seed" },
1415 { name => "vo-001", dir => "seed" },
1416 { name => "vun" },
1417 { name => "vun-TZ" },
1418 { name => "wae" },
1419 { name => "wae-CH" },
1420 { name => "wal", dir => "seed" },
1421 { name => "wal-ET", dir => "seed" },
1422 { name => "wo", lcid => 0x00000088, oemcp => 850, ebcdiccp => 20297, sopentypelang => "WLF" },
1423 { name => "wo-Latn", alias => "wo" },
1424 { name => "wo-Latn-SN", alias => "wo-SN" },
1425 { name => "wo-SN", lcid => 0x00000488 },
1426 { name => "x-IV_mathan", lcid => 0x0001007f, alias => "" },
1427 { name => "xh", lcid => 0x00000034, oemcp => 850, sopentypelang => "XHS" },
1428 { name => "xh-ZA", lcid => 0x00000434 },
1429 { name => "xog" },
1430 { name => "xog-UG" },
1431 { name => "yav" },
1432 { name => "yav-CM" },
1433 { name => "yi", lcid => 0x0000003d, sabbrevlangname => "ZZZ", sopentypelang => "JII" },
1434 { name => "yi-001", lcid => 0x0000043d },
1435 { name => "yo", lcid => 0x0000006a, oemcp => 437, sopentypelang => "YBA" },
1436 { name => "yo-BJ", ebcdiccp => 500 },
1437 { name => "yo-Latn", alias => "yo" },
1438 { name => "yo-Latn-NG", alias => "yo-NG" },
1439 { name => "yo-NG", lcid => 0x0000046a },
1440 { name => "yrl" },
1441 { name => "yrl-BR" },
1442 { name => "yrl-CO" },
1443 { name => "yrl-VE" },
1444 { name => "yue" },
1445 { name => "yue-Hans" },
1446 { name => "yue-Hans-CN" },
1447 { name => "yue-Hant" },
1448 { name => "yue-Hant-HK" },
1449 { name => "zgh" },
1450 { name => "zgh-MA", alias => "zgh-Tfng-MA" },
1451 { name => "zgh-Tfng", file => "zgh" },
1452 { name => "zgh-Tfng-MA", file => "zgh_MA" },
1453 { name => "zh", lcid => 0x00007804, ireadinglayout => 2, oemcp => 936, slist => ",", sscripts => "Hani Hans", sabbrevlangname => "CHS", sopentypelang => "ZHS" },
1454 { name => "zh-CN", lcid => 0x00000804, file => "zh_Hans_CN", sparent => "zh-Hans" },
1455 { name => "zh-CN_phoneb", lcid => 0x00050804, alias => "zh-CN" },
1456 { name => "zh-CN_stroke", lcid => 0x00020804, alias => "zh-CN" },
1457 { name => "zh-Hans", lcid => 0x00000004, group => 10 },
1458 { name => "zh-Hans-CN", alias => "zh-CN" },
1459 { name => "zh-Hans-CN-u-co-phonebk", alias => "zh-CN_phoneb" },
1460 { name => "zh-Hans-CN-u-co-stroke", alias => "zh-CN_stroke" },
1461 { name => "zh-Hans-HK", slist => ";" },
1462 { name => "zh-Hans-MO", slist => ";" },
1463 { name => "zh-Hans-SG", alias => "zh-SG" },
1464 { name => "zh-Hans-SG-u-co-phonebk", alias => "zh-SG_phoneb" },
1465 { name => "zh-Hans-SG-u-co-stroke", alias => "zh-SG_stroke" },
1466 { name => "zh-Hant", lcid => 0x00007c04, sortlocale => "zh-HK", ireadinglayout => 2, oemcp => 950, slist => ",", sscripts => "Hani Hant", group => 9, sabbrevlangname => "CHT", sopentypelang => "ZHH" },
1467 { name => "zh-Hant-HK", alias => "zh-HK" },
1468 { name => "zh-Hant-HK-u-co-unihan", alias => "zh-HK_radstr" },
1469 { name => "zh-Hant-MO", alias => "zh-MO" },
1470 { name => "zh-Hant-MO-u-co-stroke", alias => "zh-MO_stroke" },
1471 { name => "zh-Hant-MO-u-co-unihan", alias => "zh-MO_radstr" },
1472 { name => "zh-Hant-TW", alias => "zh-TW" },
1473 { name => "zh-Hant-TW-u-co-phonetic", alias => "zh-TW_pronun" },
1474 { name => "zh-Hant-TW-u-co-unihan", alias => "zh-TW_radstr" },
1475 { name => "zh-HK", lcid => 0x00000c04, file => "zh_Hant_HK", sparent => "zh-Hant", sabbrevlangname => "ZHH" },
1476 { name => "zh-HK_radstr", lcid => 0x00040c04, alias => "zh-HK" },
1477 { name => "zh-MO", lcid => 0x00001404, file => "zh_Hant_MO", sparent => "zh-Hant", sabbrevlangname => "ZHM", sopentypelang => "ZHT" },
1478 { name => "zh-MO_radstr", lcid => 0x00041404, alias => "zh-MO" },
1479 { name => "zh-MO_stroke", lcid => 0x00021404, alias => "zh-MO" },
1480 { name => "zh-SG", lcid => 0x00001004, file => "zh_Hans_SG", sparent => "zh-Hans", sabbrevlangname => "ZHI" },
1481 { name => "zh-SG_phoneb", lcid => 0x00051004, alias => "zh-SG" },
1482 { name => "zh-SG_stroke", lcid => 0x00021004, alias => "zh-SG" },
1483 { name => "zh-TW", lcid => 0x00000404, file => "zh_Hant_TW", sparent => "zh-Hant", sopentypelang => "ZHT" },
1484 { name => "zh-TW_pronun", lcid => 0x00030404, alias => "zh-TW" },
1485 { name => "zh-TW_radstr", lcid => 0x00040404, alias => "zh-TW" },
1486 { name => "zu", lcid => 0x00000035, oemcp => 850 },
1487 { name => "zu-ZA", lcid => 0x00000435 },
# Code page working tables (reset and filled by the dump_*_codepage helpers)
my (@cp2uni, @glyph2uni, @lead_bytes, @uni2cp);

# Per-character tables filled in by load_data()
my (@tolower_table, @toupper_table, @digitmap_table);
my (@category_table, @initial_joining_table, @direction_table);
my (@decomp_table, @combining_class_table, @decomp_compat_table);
my (@comp_exclusions, @idna_decomp_table, @idna_disallowed);

# Registry keys collected through add_registry_key()/add_registry_value()
my %registry_keys;

# Default characters of the code page currently being generated
my ($default_char, $default_wchar);

# One (initially empty) table per joining form
my %joining_forms = map { $_ => [] } qw(isolated final initial medial);
# convert a list of code points to UTF-16 code units
# (values of 0x10000 and above become a surrogate pair)
sub to_utf16(@)
{
    return map {
        my $ch = $_;
        my $val = $ch - 0x10000;
        $ch < 0x10000 ? $ch : (0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff));
    } @_;
}
################################################################
# fetch a unicode.org file and open it
#
# $base is the download URL (either a .zip archive or a directory URL),
# $name the file to read (a member of the archive, or a path below $base).
# Downloads are cached in $XDG_CACHE_HOME/wine (or ~/.cache/wine); cached
# copies of versioned URLs get a "-$UNIVERSION" suffix.
# Returns a glob holding a read handle on the file contents; dies if the
# file cannot be fetched or opened.
sub open_data_file($$)
{
    my ($base, $name) = @_;
    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
    (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
    my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
    local *FILE;

    if ($base =~ /.*\/([^\/]+)\.zip$/)
    {
        # zip archive: cache the whole archive and stream the requested member
        my $zip = "$1$suffix.zip";
        unless (-f "$cache/$zip")
        {
            system "mkdir", "-p", $cache;
            print "Fetching $base...\n";
            if (system( "wget", "-q", "-O", "$cache/$zip", $base ))
            {
                # wget -O creates the output file even when the download fails;
                # remove it so that a later run does not mistake it for a valid cache entry
                unlink "$cache/$zip";
                die "cannot fetch $base";
            }
        }
        open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
    }
    else
    {
        # plain file: cache it under the same relative path
        (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
        unless (-f $dest)
        {
            system "mkdir", "-p", $dir;
            print "Fetching $base/$name...\n";
            if (system( "wget", "-q", "-O", $dest, "$base/$name" ))
            {
                unlink $dest;
                die "cannot fetch $base/$name";
            }
        }
        open FILE, "<", $dest or die "cannot open $dest: $!";
    }
    return *FILE;
}
################################################################
# load a unicode.org file as XML data
#
# Takes the same ($base, $name) arguments as open_data_file() and
# returns the document parsed by XML::LibXML.
sub load_xml_data_file($$)
{
    my ($base, $name) = @_;
    my $FILE = open_data_file( $base, $name );
    my $xml = XML::LibXML->load_xml( IO => $FILE );
    # close the handle we were given, not the unrelated bareword FILE
    close $FILE;
    return $xml;
}
################################################################
# recursively get the decomposition for a character
#
# $table is a reference to an array mapping a character to a reference
# to the list of characters it decomposes to.  A character without an
# entry is returned as is.
sub get_decomposition($$);
sub get_decomposition($$)
{
    my ($char, $table) = @_;
    my $entry = $table->[$char];

    return $char unless defined $entry;
    return map { get_decomposition( $_, $table ) } @{$entry};
}
################################################################
# get the composition that results in a given character
#
# Returns the decomposition of $ch when it may be used as a composition,
# an empty list otherwise.  $compat == 1 additionally applies the checks
# against the compatibility table, $compat == 2 the checks against the
# IDNA table.
sub get_composition($$)
{
    my ($ch, $compat) = @_;
    my $decomp = $decomp_table[$ch];

    return () unless defined $decomp;   # no decomposition
    my @ret = @{$decomp};
    return () if @ret < 2;              # singleton decomposition

    my ($first, $second) = @ret[0, 1];
    return () if $comp_exclusions[$ch];             # composition exclusion
    return () if $combining_class_table[$ch];       # non-starter
    return () if $combining_class_table[$first];    # first char is non-starter

    if ($compat == 1) {
        # first char has compat decomposition
        return () if !defined $decomp_table[$first] && defined $decomp_compat_table[$first];
    }
    elsif ($compat == 2) {
        # first char has IDNA decomposition
        return () if !defined $decomp_table[$first] && defined $idna_decomp_table[$first];
        # first char's decomposition has IDNA decomposition
        return () if defined $idna_decomp_table[$first] &&
                     defined $idna_decomp_table[$idna_decomp_table[$first]->[0]];
        # second char has IDNA decomposition
        return () if defined $idna_decomp_table[$second];
    }
    return @ret;
}
################################################################
# recursively build decompositions
#
# Takes a decomposition table and returns a table where every defined
# entry is replaced by its full decomposition, expressed in UTF-16.
sub build_decompositions(@)
{
    my @src = @_;
    my @dst;

    foreach my $i (grep { defined $src[$_] } 0 .. $#src) {
        $dst[$i] = [ to_utf16( get_decomposition( $i, \@src )) ];
    }
    return @dst;
}
################################################################
# compose Hangul sequences
#
# Scans the sequence once: a leading consonant followed by a vowel is
# merged into a syllable, and a syllable without trailing consonant
# followed by a trailing consonant is merged as well.  Everything else
# is copied unchanged.
sub compose_hangul(@)
{
    my ($SBASE, $LBASE, $VBASE, $TBASE) = (0xac00, 0x1100, 0x1161, 0x11a7);
    my ($LCOUNT, $VCOUNT, $TCOUNT) = (19, 21, 28);
    my $NCOUNT = $VCOUNT * $TCOUNT;
    my $SCOUNT = $LCOUNT * $NCOUNT;

    my @pending = @_;
    my @ret;

    while (@pending) {
        my $ch = shift @pending;

        # leading consonant + vowel
        if (@pending &&
            $ch >= $LBASE && $ch < $LBASE + $LCOUNT &&
            $pending[0] >= $VBASE && $pending[0] < $VBASE + $VCOUNT) {
            my $vowel = shift @pending;
            $ch = $SBASE + (($ch - $LBASE) * $VCOUNT + ($vowel - $VBASE)) * $TCOUNT;
        }

        # syllable without trailing consonant + trailing consonant
        if (@pending &&
            $ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) &&
            $pending[0] > $TBASE && $pending[0] < $TBASE + $TCOUNT) {
            $ch += shift(@pending) - $TBASE;
        }

        push @ret, $ch;
    }
    return @ret;
}
################################################################
# remove linguistic-only mappings from the case table
#
# $upper and $lower are references to the upper and lower case tables.
# Every entry that does not map back to its own index through the other
# table is reset to undef; the upper table is processed first.
sub remove_linguistic_mappings($$)
{
    my ($upper, $lower) = @_;

    # remove case mappings that don't round-trip

    foreach my $i (0 .. $#{$upper}) {
        my $ch = $upper->[$i];
        next unless defined $ch;
        my $back = $lower->[$ch];
        $upper->[$i] = undef unless defined $back && $back == $i;
    }
    foreach my $i (0 .. $#{$lower}) {
        my $ch = $lower->[$i];
        next unless defined $ch;
        my $back = $upper->[$ch];
        $lower->[$i] = undef unless defined $back && $back == $i;
    }
}
################################################################
# read in the Unicode database files
#
# Fills the file-level character tables from UnicodeData.txt,
# CompositionExclusions.txt and IdnaMappingTable.txt.
sub load_data()
{
    my $start;

    # now build mappings from the decomposition field of the Unicode database

    my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
    while (<$UNICODE_DATA>) {
        # Decode the fields ...
        my ($code, $name, $cat, $comb, $bidi,
            $decomp, $dec, $dig, $num, $mirror,
            $oldname, $comment, $upper, $lower, $title) = split /;/;
        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $bidi;

        my $joining = ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf") ? "T" : "U";
        $initial_joining_table[$src] = $joining_types{$joining};

        $tolower_table[$src] = hex $lower if $lower ne "";
        $toupper_table[$src] = hex $upper if $upper ne "";
        $category_table[$src] |= $ctype{"digit"} if $dec ne "";
        $digitmap_table[$src] = ord $dig if $dig ne "";
        $combining_class_table[$src] = ($cat eq "Co") ? 0x100 : $comb;  # 0x100 for Private Use

        # extra flags derived from the bidi class and the character name
        $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
        $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
        $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
        $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
        $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
        $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
        $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
        $category_table[$src] |= $ctype{"ideograph"}
            if $name =~ /^<CJK Ideograph/ || $name =~ /^CJK COMPATIBILITY IDEOGRAPH/ || $name =~ /^HANGZHOU/;
        $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
        $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;

        # copy the category and direction for everything between First/Last pairs
        $start = $src if $name =~ /, First>/;
        if ($name =~ /, Last>/) {
            for (; $start < $src; $start++) {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $combining_class_table[$start] = $combining_class_table[$src];
            }
        }

        next if $decomp eq "";  # no decomposition, skip it

        # tagged decomposition: remember the characters following the tag
        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/) {
            my $chars = (split /\s+/, $decomp, 2)[1];
            $decomp_compat_table[$src] = [ map { hex $_; } (split /\s+/, $chars) ];
        }

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/) {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            my ($form, $base) = ($1, $2);
            if (grep { $form eq $_ } "isolated", "final", "initial", "medial") {
                $joining_forms{$form}[hex $base] = $src;
            }
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/) {
            # decomposition "<compat> 0020 1234" -> combining accent
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/) {
            # store decomposition
            $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)$/) {
            # Single char decomposition
            $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
        }
    }
    close $UNICODE_DATA;

    # patch the category of some special characters

    foreach my $i (grep { defined $decomp_table[$_] } 0 .. $#decomp_table) {
        $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
    }
    foreach my $cat (keys %special_categories) {
        my $flag = $ctype{$cat};
        $category_table[$_] |= $flag foreach @{$special_categories{$cat}};
    }
    foreach my $i (0 .. $#decomp_compat_table) {
        my $seq = $decomp_compat_table[$i];
        next unless defined $seq;
        next unless @{$seq} == 2;
        $category_table[$i] |= $category_table[$seq->[1]] & $ctype{"diacritic"};
    }

    # load the composition exclusions

    my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
    while (<$EXCL>) {
        s/\#.*//;  # remove comments
        if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/) {
            $comp_exclusions[$_] = 1 foreach hex $1 .. hex $2;
        }
        elsif (/^([0-9a-fA-F]+)\s*$/) {
            $comp_exclusions[hex $1] = 1;
        }
    }
    close $EXCL;

    # load the IDNA mappings

    @idna_decomp_table = @decomp_compat_table;
    my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
    while (<$IDNA>) {
        s/\#.*//;  # remove comments
        next if /^\s*$/;
        my ($char, $type, $mapping) = split /;/;
        my ($ch1, $ch2);
        if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/) {
            ($ch1, $ch2) = (hex $1, hex $2);
        }
        elsif ($char =~ /([0-9a-fA-F]+)/) {
            $ch1 = $ch2 = hex $1;
        }

        # the order of the tests matters, the status names overlap
        if ($type =~ /mapped/ || $type =~ /deviation/) {
            $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
            my @seq = map { hex $_; } split /\s+/, $mapping;
            foreach my $i ($ch1 .. $ch2) {
                $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ];
            }
        }
        elsif ($type =~ /valid/) {
            # nothing to do
        }
        elsif ($type =~ /ignored/) {
            foreach my $i ($ch1 .. $ch2) {
                $idna_decomp_table[$i] = [ 0 ];
            }
        }
        elsif ($type =~ /disallowed/) {
            foreach my $i ($ch1 .. $ch2) {
                $idna_decomp_table[$i] = undef;
                $idna_disallowed[$i] = 1;
            }
        }
    }
    close $IDNA;
}
################################################################
# add a new registry key
#
# Creates the entry for $key, with $defval as its first element,
# unless the key has already been added.
sub add_registry_key($$)
{
    my ($key, $defval) = @_;
    unless (defined $registry_keys{$key}) {
        $registry_keys{$key} = [ $defval ];
    }
}
1896 ################################################################
1897 # add a new registry value
1898 sub add_registry_value($$$)
1900 my ($key, $name, $value) = @_;
1901 add_registry_key( $key, undef );
1902 push @{$registry_keys{$key}}, "'$name' = s '$value'";
################################################################
# define a new lead byte
#
# Does nothing if the byte already has an entry in @cp2uni.
sub add_lead_byte($)
{
    my ($byte) = @_;
    return if defined $cp2uni[$byte];
    $cp2uni[$byte] = 0;
    push @lead_bytes, $byte;
}
################################################################
# define a new char mapping
#
# The first mapping defined for a given code page char or Unicode char
# wins; a code page char above 0xff also registers its lead byte.
sub add_mapping($$)
{
    my ($cp, $uni) = @_;
    $cp2uni[$cp] //= $uni;
    $uni2cp[$uni] //= $cp;
    add_lead_byte( $cp >> 8 ) if $cp > 0xff;
}
################################################################
# get a mapping including glyph chars for MB_USEGLYPHCHARS
#
# Returns a copy of the given table where every index that has a
# defined entry in @glyph2uni is overridden by that entry.
sub get_glyphs_mapping(@)
{
    my @table = @_;

    foreach my $i (grep { defined $glyph2uni[$_] } 0 .. $#glyph2uni) {
        $table[$i] = $glyph2uni[$i];
    }
    return @table;
}
################################################################
# build EUC-JP table from the JIS 0208/0212 files
#
# Resets the code page tables, fills them and writes code page 20932.
# Mappings are added in a fixed order since the first one defined wins.
sub dump_eucjp_codepage()
{
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = 0x30fb;

    # ASCII chars
    add_mapping( $_, $_ ) foreach 0x00 .. 0x7f;

    # lead bytes
    add_lead_byte( $_ ) foreach 0x8e, 0xa1 .. 0xfe;

    # JIS X 0201 right plane
    add_mapping( 0x8e00 + $_, 0xfec0 + $_ ) foreach 0xa1 .. 0xdf;

    # undefined chars
    $cp2uni[$_] = $_ foreach 0x80 .. 0x8d, 0x8f .. 0x9f;
    $cp2uni[0xa0] = 0xf8f0;
    $cp2uni[0xff] = 0xf8f3;

    # Fix backslash conversion
    add_mapping( 0xa1c0, 0xff3c );

    # Add private mappings for rows undefined in JIS 0208/0212
    my $private = 0xe000;
    foreach my $trail ([ 0xa1, 0xfe ], [ 0x21, 0x7e ]) {
        my ($lo_first, $lo_last) = @{$trail};
        foreach my $hi (0xf5 .. 0xfe) {
            foreach my $lo ($lo_first .. $lo_last) {
                add_mapping( ($hi << 8) + $lo, $private++ );
            }
        }
    }

    my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
    while (<$INPUT>) {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        /^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/
            or die "Unrecognized line $_\n";
        add_mapping( 0x8080 + hex $1, hex $2 );
    }
    close $INPUT;

    $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
    while (<$INPUT>) {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/
            or die "Unrecognized line $_\n";
        add_mapping( 0x8000 + hex $1, hex $2 );
    }
    close $INPUT;

    output_codepage_file( 20932 );
}
################################################################
# build Korean Wansung table from the KSX1001 file
#
# @cp949 is a table indexed by Unicode char giving the matching
# code page 949 char; it supplies extra mappings for code page 20949.
sub dump_krwansung_codepage(@)
{
    my @cp949 = @_;
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = 0x3f;
    $default_wchar = 0x003f;

    # ASCII and undefined chars
    add_mapping( $_, $_ ) foreach 0x00 .. 0x9f;
    my @undefined_chars = ([ 0xa0, 0xf8e6 ], [ 0xad, 0xf8e7 ], [ 0xae, 0xf8e8 ],
                           [ 0xaf, 0xf8e9 ], [ 0xfe, 0xf8ea ], [ 0xff, 0xf8eb ]);
    add_mapping( $_->[0], $_->[1] ) foreach @undefined_chars;

    my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
    while (<$INPUT>) {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/
            or die "Unrecognized line $_\n";
        add_mapping( 0x8080 + hex $1, hex $2 );
    }
    close $INPUT;

    # get some extra mappings from cp 949
    my @defined_lb;
    $defined_lb[$_] = 1 foreach @lead_bytes;
    foreach my $uni (0x0000 .. 0xffff) {
        next if ($uni >= 0x1100 && $uni <= 0x11ff);  # range not used in 20949
        my $cp = $cp949[$uni];
        next unless defined $cp;
        if ($cp >= 0xff) {
            # only add chars for lead bytes that exist in 20949
            my ($hi, $lo) = ($cp >> 8, $cp & 0xff);
            next unless $defined_lb[$hi];
            next unless $lo >= 0xa1 && $lo <= 0xfe;
        }
        add_mapping( $cp, $uni );
    }

    output_codepage_file( 20949 );
}
################################################################
# build the sort keys table
#
# Reads the collation keys file, renumbers the secondary and tertiary
# weights so that each key fits in 32 bits, and writes the resulting
# table as C source to $filename.
sub dump_sortkeys($)
{
    my $filename = shift;
    my @sortkeys = ();

    my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
    while (<$INPUT>) {
        # skip comments, empty lines, ^Z and the @version header
        next if /^\#/ || /^$/ || /\x1a/ || /^\@version/;
        if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/) {
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        # multiple character sequence, ignored for now
        next if /^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/;
        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close $INPUT;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
                       ${$a}[2] <=> ${$b}[2] or
                       ${$a}[3] <=> ${$b}[3] or
                       ${$a}[4] <=> ${$b}[4] or
                       $a cmp $b; } @sortkeys;

    my ($n2, $n3) = (1, 1);
    my @keys = (-1) x 5;
    my @flatkeys = ();

    foreach my $entry (@sortkeys) {
        next unless defined $entry;
        my @current = @{$entry};

        if ($current[1] != $keys[1]) {
            # new primary weight: restart both counters
            @keys[1, 2, 3] = @current[1, 2, 3];
            ($n2, $n3) = (1, 1);
        }
        elsif ($current[2] != $keys[2]) {
            # new secondary weight within the same primary
            @keys[2, 3] = @current[2, 3];
            $n2++;
            $n3 = 1;
            die if ($n2 >= 256);
        }
        elsif ($current[3] != $keys[3]) {
            # new tertiary weight within the same secondary
            $keys[3] = $current[3];
            $n3++;
            die if ($n3 >= 16);
        }

        # zero weights stay zero, the others are replaced by their rank
        $current[2] = $n2 if $current[2];
        $current[3] = $n3 if $current[3];
        $current[4] = 1 if $current[4];

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    printf OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump an array of integers
#
# Returns the values formatted as zero-padded hex numbers of
# $bit_width bits, eight per line; undefined values are replaced by
# $default.  An empty array yields a single $default entry.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;
    my $last = @array ? $#array : 0;
    my $ret = " ";

    foreach my $i (0 .. $last) {
        $ret .= sprintf($format, $array[$i] // $default);
        next if $i == $last;
        # start a new line after every eighth value
        $ret .= ($i % 8 == 7) ? ",\n " : ", ";
    }
    return $ret;
}
################################################################
# dump an SBCS mapping table in binary format
#
# Writes to OUTPUT, in order: the header, the first 256 entries of
# @cp2uni, the glyph table (when @glyph2uni is not empty) and one byte
# per Unicode char from @uni2cp.  Multi-byte values are little-endian.
sub dump_binary_sbcs_table($)
{
    my ($codepage) = @_;
    my $has_glyphs = @glyph2uni ? 1 : 0;
    my $wc_offset = 256 + 3 + ($has_glyphs ? 256 : 0);

    # header
    print OUTPUT pack "S<*", 13, $codepage, 1, $default_char, $default_wchar,
                             $cp2uni[$default_char], $uni2cp[$default_wchar];
    print OUTPUT pack "C12", (0) x 12;

    # code page -> Unicode table, preceded by the offset of the reverse table
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];

    # optional glyph table
    my $glyphs = $has_glyphs ? pack( "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]) )
                             : pack( "S<*", 0 );
    print OUTPUT $glyphs;

    print OUTPUT pack "S<*", 0, 0;

    # Unicode -> code page table
    print OUTPUT pack "C*", map { $_ // $default_char; } @uni2cp[0 .. 65535];
}
################################################################
# dump a DBCS mapping table in binary format
#
# Writes to OUTPUT, in order: the header, the lead byte ranges padded
# to 12 bytes, the first 256 entries of @cp2uni, the per lead byte
# offsets, one 256-entry table per lead byte and one 16-bit value per
# Unicode char from @uni2cp.  Multi-byte values are little-endian.
# Note that the @cp2uni entry of every lead byte is reset to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own 256-entry table, in @lead_bytes order
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # pad with a list of zeros; "0 x 12" would be the string "000000000000"
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
################################################################
# get the list of defined lead byte ranges
#
# Returns a flat list of (first, last) pairs covering the runs of
# consecutive values found in @lead_bytes.
sub get_lb_ranges()
{
    my @is_lead;
    my @ranges;
    my $in_range = 0;

    $is_lead[$_] = 1 foreach @lead_bytes;

    foreach my $byte (0 .. 255) {
        if ($in_range && !defined $is_lead[$byte]) {
            # the previous byte was the last one of the range
            push @ranges, $byte - 1;
            $in_range = 0;
        }
        elsif (!$in_range && $is_lead[$byte]) {
            push @ranges, $byte;
            $in_range = 1;
        }
    }
    push @ranges, 0xff if $in_range;
    return @ranges;
}
################################################################
# dump the Indic Syllabic Category table
#
# Builds a table holding the syllabic category in the low bits and the
# positional category shifted left by 8, and writes it as C source to
# $filename.
sub dump_indic($)
{
    my $filename = shift;
    my @indic_table;

    # parse a "char ; type #" or "first..last ; type #" line
    # returns (first, last, type), or an empty list for a malformed line
    my $parse_line = sub {
        my ($line) = @_;
        return (hex $1, hex $1, $2)
            if $line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/;
        return (hex $1, hex $2, $3)
            if $line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/;
        return;
    };

    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (<$INPUT>) {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;
        my ($first, $last, $type) = $parse_line->( $_ ) or die "malformed line $_";
        die "unknown indic $type" unless defined $indic_types{$type};
        # entries reaching beyond 16 bits are ignored
        next unless $first < 65536 && $last < 65536;
        foreach my $i ($first .. $last) {
            $indic_table[$i] = $indic_types{$type};
        }
    }
    close $INPUT;

    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (<$INPUT>) {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;
        my ($first, $last, $type) = $parse_line->( $_ ) or die "malformed line $_";
        die "unknown matra $type" unless defined $matra_types{$type};
        foreach my $i ($first .. $last) {
            $indic_table[$i] |= $matra_types{$type} << 8;
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n";
    print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
    print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Line Break Properties table
#
# Reads LineBreak.txt into a table indexed by character and writes it
# as C source to $filename.
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (<$INPUT>) {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        # three-letter types are tried before two-letter ones,
        # single chars before ranges
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/) {
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/) {
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/) {
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/) {
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        else {
            die "malformed line $_";
        }

        die "unknown breaktype $type" unless defined $break_types{$type};
        foreach my $i ($first .. $last) {
            $break_table[$i] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n";
    print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
# Script name (as spelled in Scripts.txt) => numeric script id.
# The ids are dense, starting at 0 for "Unknown"; the section comments
# are the ones carried by the original list.
my %scripts =
(
    Unknown                => 0,
    Common                 => 1,
    Inherited              => 2,
    Arabic                 => 3,
    Armenian               => 4,
    Avestan                => 5,
    Balinese               => 6,
    Bamum                  => 7,
    Batak                  => 8,
    Bengali                => 9,
    Bopomofo               => 10,
    Brahmi                 => 11,
    Braille                => 12,
    Buginese               => 13,
    Buhid                  => 14,
    Canadian_Aboriginal    => 15,
    Carian                 => 16,
    Cham                   => 17,
    Cherokee               => 18,
    Coptic                 => 19,
    Cuneiform              => 20,
    Cypriot                => 21,
    Cyrillic               => 22,
    Deseret                => 23,
    Devanagari             => 24,
    Egyptian_Hieroglyphs   => 25,
    Ethiopic               => 26,
    Georgian               => 27,
    Glagolitic             => 28,
    Gothic                 => 29,
    Greek                  => 30,
    Gujarati               => 31,
    Gurmukhi               => 32,
    Han                    => 33,
    Hangul                 => 34,
    Hanunoo                => 35,
    Hebrew                 => 36,
    Hiragana               => 37,
    Imperial_Aramaic       => 38,
    Inscriptional_Pahlavi  => 39,
    Inscriptional_Parthian => 40,
    Javanese               => 41,
    Kaithi                 => 42,
    Kannada                => 43,
    Katakana               => 44,
    Kayah_Li               => 45,
    Kharoshthi             => 46,
    Khmer                  => 47,
    Lao                    => 48,
    Latin                  => 49,
    Lepcha                 => 50,
    Limbu                  => 51,
    Linear_B               => 52,
    Lisu                   => 53,
    Lycian                 => 54,
    Lydian                 => 55,
    Malayalam              => 56,
    Mandaic                => 57,
    Meetei_Mayek           => 58,
    Mongolian              => 59,
    Myanmar                => 60,
    New_Tai_Lue            => 61,
    Nko                    => 62,
    Ogham                  => 63,
    Ol_Chiki               => 64,
    Old_Italic             => 65,
    Old_Persian            => 66,
    Old_South_Arabian      => 67,
    Old_Turkic             => 68,
    Oriya                  => 69,
    Osmanya                => 70,
    Phags_Pa               => 71,
    Phoenician             => 72,
    Rejang                 => 73,
    Runic                  => 74,
    Samaritan              => 75,
    Saurashtra             => 76,
    Shavian                => 77,
    Sinhala                => 78,
    Sundanese              => 79,
    Syloti_Nagri           => 80,
    Syriac                 => 81,
    Tagalog                => 82,
    Tagbanwa               => 83,
    Tai_Le                 => 84,
    Tai_Tham               => 85,
    Tai_Viet               => 86,
    Tamil                  => 87,
    Telugu                 => 88,
    Thaana                 => 89,
    Thai                   => 90,
    Tibetan                => 91,
    Tifinagh               => 92,
    Ugaritic               => 93,
    Vai                    => 94,
    Yi                     => 95,
    # Win8/Win8.1
    Chakma                 => 96,
    Meroitic_Cursive       => 97,
    Meroitic_Hieroglyphs   => 98,
    Miao                   => 99,
    Sharada                => 100,
    Sora_Sompeng           => 101,
    Takri                  => 102,
    # Win10
    Bassa_Vah              => 103,
    Caucasian_Albanian     => 104,
    Duployan               => 105,
    Elbasan                => 106,
    Grantha                => 107,
    Khojki                 => 108,
    Khudawadi              => 109,
    Linear_A               => 110,
    Mahajani               => 111,
    Manichaean             => 112,
    Mende_Kikakui          => 113,
    Modi                   => 114,
    Mro                    => 115,
    Nabataean              => 116,
    Old_North_Arabian      => 117,
    Old_Permic             => 118,
    Pahawh_Hmong           => 119,
    Palmyrene              => 120,
    Pau_Cin_Hau            => 121,
    Psalter_Pahlavi        => 122,
    Siddham                => 123,
    Tirhuta                => 124,
    Warang_Citi            => 125,
    # Win10 RS1
    Adlam                  => 126,
    Ahom                   => 127,
    Anatolian_Hieroglyphs  => 128,
    Bhaiksuki              => 129,
    Hatran                 => 130,
    Marchen                => 131,
    Multani                => 132,
    Newa                   => 133,
    Old_Hungarian          => 134,
    Osage                  => 135,
    SignWriting            => 136,
    Tangut                 => 137,
    # Win10 RS4
    Masaram_Gondi          => 138,
    Nushu                  => 139,
    Soyombo                => 140,
    Zanabazar_Square       => 141,
    # Win10 1903
    Dogra                  => 142,
    Gunjala_Gondi          => 143,
    Hanifi_Rohingya        => 144,
    Makasar                => 145,
    Medefaidrin            => 146,
    Old_Sogdian            => 147,
    Sogdian                => 148,
    # Win10 2004
    Elymaic                => 149,
    Nyiakeng_Puachue_Hmong => 150,
    Nandinagari            => 151,
    Wancho                 => 152,
    # Win11
    Chorasmian             => 153,
    Dives_Akuru            => 154,
    Khitan_Small_Script    => 155,
    Yezidi                 => 156,
);
################################################################
# dump Script IDs table
#
# Parses Scripts.txt from the UCD archive and writes two files:
#   "$filename.h" - enum unicode_script_id built from %scripts
#   "$filename.c" - the two-level lookup table "wine_scripts_table"
# Scripts missing from %scripts, and lines matching neither entry form,
# are ignored; such code points keep id 0 (Unknown).
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/; # skip comments
        next if /^\s*$/; # skip empty lines
        next if /\x1a/; # skip ^Z

        # "XXXX ; Script" or "XXXX..YYYY ; Script"
        next unless /^\s*([0-9a-fA-F]+)(?:\.\.\s*([0-9a-fA-F]+))?\s*;\s*([a-zA-Z_]+)/;
        my $first = hex( $1 );
        my $last = defined( $2 ) ? hex( $2 ) : $first;
        my $type = $3;
        next unless defined $scripts{$type};
        foreach my $i ($first .. $last)
        {
            $scripts_table[$i] = $scripts{$type};
        }
    }
    close $INPUT;

    my $header = "$filename.h";
    open OUTPUT, ">", "$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT " Script_$script = $scripts{$script},\n";
    }
    # ids are dense and start at 0, so the last id is the count minus one
    my $last_id = keys( %scripts ) - 1;
    print OUTPUT " Script_LastId = ", $last_id, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    $filename = "$filename.c";
    # report the file actually being created (the message used to name the header)
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the BiDi mirroring table
#
# Parses BidiMirroring.txt from the UCD archive and writes the C source
# file $filename containing the two-level table "wine_mirror_map", which
# maps a character to its mirrored counterpart (0 when there is none).
# Dies on a line it cannot parse.
sub dump_mirroring($)
{
    my $filename = shift;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        next if /^\#/; # skip comments
        # same blank-line test as the other parsers, so that a line holding
        # only whitespace (or a stray CR) is not reported as malformed
        next if /^\s*$/; # skip empty lines
        next if /\x1a/; # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
        {
            $mirror_table[hex $1] = hex $2;
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Bidi Brackets
#
# Parses BidiBrackets.txt from the UCD archive and writes the C source
# file $filename containing the two-level table "bidi_bracket_table".
# Each entry holds the bracket type (from %bracket_types) in the high
# byte and the distance to the paired bracket in the low byte.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^\s*$/ || /\x1a/; # comments, empty lines, ^Z

        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/;
        my ($char, $pair, $kind) = ($1, $2, $3);

        die "unknown bracket $kind" unless defined $bracket_types{$kind};
        my $delta = hex($pair) - hex($char);
        die "characters too distant $char and $pair" if abs($delta) >= 128;

        # NOTE(review): "% 255" stores a negative distance as 255 - |d|
        # (-1 becomes 0xfe), which is not the two's-complement byte 0xff.
        # Confirm how the consumers of this table decode the low byte.
        $bracket_table[hex $char] = ($delta % 255) + ($bracket_types{$kind} << 8);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n",
                 "/* generated from $UNIDATA:BidiBrackets.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table
#
# Parses ArabicShaping.txt from the UCD archive and writes the C source
# file $filename containing:
#   wine_shaping_table - two-level table of joining types, seeded from
#                        @initial_joining_table
#   wine_shaping_forms - for U+0600..U+06FF the isolated, final, initial
#                        and medial forms from %joining_forms; a missing
#                        form falls back to the character itself
sub dump_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^\s*$/ || /\x1a/; # comments, empty lines, ^Z

        # code point ; name ; joining type ; joining group
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        my ($char, $type) = (hex $1, $2);
        $joining_table[$char] = $joining_types{$type};
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $UNIDATA:ArabicShaping.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        my @forms = map { ${joining_forms{$_}}[$ch] || $ch } ("isolated", "final", "initial", "medial");
        printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table with joining groups
#
# Parses ArabicShaping.txt from the UCD archive and writes the C source
# file $filename containing the two-level table "arabic_shaping_table".
# Entries are joining types from %joining_types, seeded from
# @initial_joining_table, except that characters in the joining groups
# "ALAPH" and "DALATH RISH" get the value registered for the group.
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/; # skip comments
        next if /^\s*$/; # skip empty lines
        next if /\x1a/; # skip ^Z
        # The joining group may consist of several words ("DALATH RISH"),
        # so capture space-separated words; a bare \w+ stopped at the first
        # space and the "DALATH RISH" test below could never succeed.
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?: \w+)*)/)
        {
            my $type = $2;
            my $group = $3;

            if ($group eq "ALAPH" || $group eq "DALATH RISH")
            {
                $joining_table[hex $1] = $joining_types{$group};
            }
            else
            {
                $joining_table[hex $1] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Vertical Orientation table
#
# Parses VerticalOrientation.txt from the UCD archive and writes the C
# source file $filename containing the two-level table
# "vertical_orientation_table" (default value $vertical_types{'R'}).
#   $filename - path of the C file to generate
#   $unix     - when true, emit the "#pragma makedep unix" marker
# Dies on a line it cannot parse or on an unknown orientation value.
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        next if /^\#/; # skip comments
        next if /^\s*$/; # skip empty lines
        next if /\x1a/; # skip ^Z

        # "XXXX ; V" or "XXXX..YYYY ; V"
        if (/^\s*([0-9a-fA-F]+)(?:\.\.\s*([0-9a-fA-F]+))?\s*;\s*([a-zA-Z_]+)/)
        {
            my $first = hex( $1 );
            my $last = defined( $2 ) ? hex( $2 ) : $first;
            my $type = $3;
            die "unknown vertical $type" unless defined $vertical_types{$type};

            # Only the first 65536 entries are consumed by
            # dump_two_level_mapping, so do not fill in anything above
            # U+FFFF; the range is empty when it starts beyond that.
            $last = 0xffff if $last > 0xffff;
            foreach my $i ($first .. $last)
            {
                $vertical_table[$i] = $vertical_types{$type};
            }
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n";
    print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
        print OUTPUT "#if 0\n";
        print OUTPUT "#pragma makedep unix\n";
        print OUTPUT "#endif\n\n";
    }
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the digit folding tables
#
# Writes the C source file $filename containing the two-level table
# "wine_digitmap" built from the global @digitmap_table.
sub dump_digit_folding($)
{
    my $filename = $_[0];

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode digit folding mappings */\n",
                 "/* generated from $UNIDATA:UnicodeData.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# compress a mapping table by removing identical rows
#
#   $rows  - number of rows the table is split into
#   $def   - value substituted for undefined cells
#   @table - the cells, row after row
# Returns a list made of one offset per row followed by the pooled cell
# data.  Each offset is $rows plus the row's position in the pool, i.e.
# an index into the returned list.  A row already present in the pool is
# shared, and a new row may overlap the tail of the pool.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $width = @table / $rows;
    my @offsets;
    my $pool = "";

    foreach my $row (0 .. $rows - 1)
    {
        my $start = $row * $width;
        my @cells = map { defined($_) ? $_ : $def } @table[$start .. $start + $width - 1];
        my $rowtxt = pack "U*", @cells;

        my $pos = index( $pool, $rowtxt );
        if ($pos == -1)
        {
            # look for the longest tail of the pool that is also a prefix
            # of the new row, and drop it so the row overlaps the pool
            my $first = substr( $rowtxt, 0, 1 );
            my $overlap = length($pool) - 1;
            while ($overlap > 0)
            {
                my $found = index( substr( $pool, -$overlap ), $first );
                last if $found == -1;
                $overlap -= $found;
                if (substr( $pool, -$overlap ) eq substr( $rowtxt, 0, $overlap ))
                {
                    substr( $pool, -$overlap ) = "";
                    last;
                }
                $overlap--;
            }
            $pos = length $pool;
            $pool .= $rowtxt;
        }
        push @offsets, $rows + $pos;
    }
    return @offsets, unpack( "U*", $pool );
}
################################################################
# dump a char -> value mapping table using two-level tables
#
#   $name - name of the generated C array
#   $def  - value used for characters without an entry
#   $size - 16 selects "unsigned short" elements, anything else "unsigned int"
#   rest  - the values, indexed by character; only the first 65536 are used
# Prints the array to the global OUTPUT handle: 256 level 1 offsets,
# the level 2 offsets, then the 16-entry value rows.
sub dump_two_level_mapping($$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    my @row_array = compress_array( 4096, $def, @_[0..65535] );
    my @array = compress_array( 256, 0, @row_array[0..4095] );

    # the 4096 row offsets are replaced by @array in the output, so shift
    # the level 2 offsets by the difference in size
    my $shift = @array - 4096;
    $_ += $shift foreach @array[256..$#array];

    my $total = @array + @row_array - 4096;
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
}
################################################################
# dump a char -> value mapping table using three-level tables
#
#   $name - name of the generated C array
#   $def  - value used for characters without an entry
#   $size - 16 selects "unsigned short" elements, anything else "unsigned int"
#   rest  - the values, indexed by character, for 0..$MAX_CHAR
# Prints the array to the global OUTPUT handle: level 1, level 2 and
# level 3 offsets, then the 16-entry value rows.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    # every level splits the one below it into groups of 16
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;

    my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # rebase the offsets onto the layout of the final, concatenated array
    my $shift2 = @array1 + @array2 - $level2 - $level3;
    my $shift1 = @array1 - $level2;
    $_ += $shift2 foreach @array2[$level2..$#array2];
    $_ += $shift1 foreach @array1[$level1..$#array1];

    my $total = @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
################################################################
# dump a binary case mapping table in l_intl.nls format
#
# Takes a list indexed by character holding the mapped character (or
# undef) and returns a string of little-endian 16-bit words: a size word
# followed by a two-level table of (mapped - char) differences for the
# first 0x10000 characters.
sub dump_binary_case_table(@)
{
    my @table = @_;
    my $max_char = 0x10000;
    my $level1 = $max_char / 16;
    my $level2 = $level1 / 16;

    # store differences modulo 0x10000, leaving unmapped characters undefined
    my @difftable;
    foreach my $ch (0 .. $#table)
    {
        next unless defined $table[$ch];
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffff;
    }

    my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
    my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );

    # the row offsets are replaced by @array in the output
    my $offset = @array - $level1;
    $_ += $offset foreach @array[$level2..$#array];

    my $size = 1 + $offset + @row_array;
    return pack "S<*", $size, @array, @row_array[$level1..$#row_array];
}
################################################################
# dump case mappings for l_intl.nls
#
# Writes the binary file $filename: a 16-bit version word (1) followed by
# the upper-case and the lower-case tables.  The tables are copies of
# @toupper_table / @tolower_table after remove_linguistic_mappings() has
# been applied to them.
sub dump_intl_nls($)
{
    my $filename = shift;

    # work on copies so that the global tables stay untouched
    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    # print, not printf: the file name must not be used as a format string
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1; # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the bidi direction table
#
# Writes the C source file $filename containing the two-level table
# "bidi_direction_table".  Each of the first 65536 characters with an
# entry in @direction_table is mapped through %bidi_types; the others
# get the default $bidi_types{"L"}.
sub dump_bidi_dir_table($)
{
    my $filename = shift;
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    # print, not printf: the file name must not be used as a format string
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
# rotate the 8-bit value $byte left by $count bits
sub rol($$)
{
    my ($byte, $count) = @_;
    my $high = $byte << $count;
    my $low = $byte >> (8 - $count);
    return ($high | $low) & 0xff;
}
################################################################
# compress the character properties table
#
#   $rows  - number of rows the table is split into
#   @table - the property values, row after row (undef counts as 0x7f)
# Returns one row id per row followed by the cells of every newly stored
# row.  New rows are numbered from 1 in order of appearance.  Rows made
# of a single repeated value rol($i,5), for $i in 0 and 0xfb..0xff, are
# not stored and get the id $i instead.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my @array = (0) x $rows;
    my %sequences;
    my $pos = 0;

    # add some predefined sequences
    foreach my $i (0, 0xfb .. 0xff)
    {
        my $uniform = pack "L*", (rol( $i, 5 )) x $len;
        $sequences{$uniform} = $i;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my $start = $row * $len;
        my @table_row = map { defined $_ ? $_ : 0x7f } @table[$start .. $start + $len - 1];
        my $key = pack "L*", @table_row;

        unless (defined $sequences{$key})
        {
            # first occurrence: give the row the next id and store its cells
            $sequences{$key} = ++$pos;
            push @array, @table_row;
        }
        $array[$row] = $sequences{$key};
    }
    return @array;
}
3099 ################################################################
3100 # dump a normalization table in binary format
# Writes the binary normalization table $filename.  The normalization
# form is taken from the file name (".../norm<form>.nls") and selects the
# decomposition table, the header values and the registry entry.
# NOTE(review): in this copy the brace-only lines and the indentation are
# missing and every line carries a leading listing number; the comments
# added here describe the visible statements only.
3101 sub dump_norm_table($)
3103 my $filename = shift;
# numeric id of each form; stored in the header and used (in hex) as the
# name of the registry value added at the end
3105 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
# decomposition table used for each form
3106 my %decomp = ( "nfc" => \@decomp_table,
3107 "nfd" => \@decomp_table,
3108 "nfkc" => \@decomp_compat_table,
3109 "nfkd" => \@decomp_compat_table ,
3110 "idna" => \@idna_decomp_table );
3112 open OUTPUT,">$filename.new" or die "Cannot create $filename";
3113 print "Building $filename\n";
# reduce ".../norm<form>.nls" to "<form>"
3115 my $type = $filename;
3116 $type =~ s!.*/norm(\w+)\.nls!$1!;
# $compose: bit 0 of the form id (set for nfc, nfkc and idna)
# $compat: 0 for nfc/nfd, 1 for nfkc/nfkd, 2 for idna
3118 my $compose = $forms{$type} & 1;
3119 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
3121 my @version = split /\./, $UNIVERSION;
3123 # combining classes
3125 my @classes;
3126 my @class_values;
# mark every combining class value below 0x100 that occurs in the data
3128 foreach my $c (grep defined, @combining_class_table)
3130 $classes[$c] = 1 if $c < 0x100;
# replace each mark by the index of that class in @class_values
3132 for (my $i = 0; $i < @classes; $i++)
3134 next unless defined $classes[$i];
3135 $classes[$i] = @class_values;
3136 push @class_values, $i;
# pad to an even number of class values
3138 push @class_values, 0 if (@class_values % 2);
3139 die "too many classes" if @class_values >= 0x40;
3141 # character properties
3143 my @char_props;
3144 my @decomposed;
3145 my @comp_hash_table;
3146 my $comp_hash_size = $compose ? 254 : 0;
# characters without an entry in @combining_class_table are skipped
3148 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3150 next unless defined $combining_class_table[$i];
3151 if (defined $decomp{$type}->[$i])
3153 my @dec = get_decomposition( $i, $decomp{$type} );
3154 if ($compose && (my @comp = get_composition( $i, $compat )))
# composition pairs are hashed on their two characters
3156 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
3157 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
# use the first non-zero combining class found in the decomposition
3159 my $val = 0;
3160 foreach my $d (@dec)
3162 $val = $combining_class_table[$d];
3163 last if $val;
3165 $char_props[$i] = $classes[$val];
3167 else
3169 $char_props[$i] = 0xbf;
# to_utf16 is defined elsewhere; presumably it yields UTF-16 code units.
# A sequence of 7 or more units gets a trailing 0.
3171 @dec = compose_hangul( @dec ) if $compose;
3172 @dec = to_utf16( @dec );
3173 push @dec, 0 if @dec >= 7;
3174 $decomposed[$i] = \@dec;
3176 else
3178 if ($combining_class_table[$i] == 0x100)
3180 $char_props[$i] = 0x7f;
3182 elsif ($combining_class_table[$i])
3184 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
3186 elsif ($type eq "idna" && defined $idna_disallowed[$i])
3188 $char_props[$i] = 0xff;
3190 else
3192 $char_props[$i] = 0;
# for composing forms, flag both members of every composition pair; the
# flags depend on whether the second member has a combining class
3197 if ($compose)
3199 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3201 my @comp = get_composition( $i, $compat );
3202 next unless @comp;
3203 if ($combining_class_table[$comp[1]])
3205 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
3206 $char_props[$comp[1]] |= 0x40;
3208 else
3210 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
3211 $char_props[$comp[1]] |= 0xc0;
3216 # surrogates
3217 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
3218 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
3220 # Hangul
3221 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
3222 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
3223 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
3225 # invalid chars
3226 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
3227 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
# the last two code points of each of the 17 planes
3228 foreach my $i (0x00..0x10)
3230 $char_props[($i << 16) | 0xfffe] = 0xff;
3231 $char_props[($i << 16) | 0xffff] = 0xff;
3234 # decomposition hash table
3236 my @decomp_hash_table;
3237 my @decomp_hash_index;
3238 my @decomp_hash_data;
3239 my $decomp_hash_size = 944;
3241 # build string of character data, reusing substrings when possible
# longest sequences first, so that a shorter one can be found inside the
# data already collected
3242 my $decomp_char_data = "";
3243 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
3245 my $str = pack "U*", @{$i};
3246 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
# bucket = character modulo the hash size; each entry is
# [ character, (length << 13) | position in the character data ],
# with the length capped at 7
3248 for (my $i = 0; $i < @decomposed; $i++)
3250 next unless defined $decomposed[$i];
3251 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
3252 die "sequence not found" if $pos == -1;
3253 my $len = @{$decomposed[$i]};
3254 $len = 7 if $len > 7;
3255 my $hash = $i % $decomp_hash_size;
3256 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
# the index holds the entry number of the bucket's first pair in the data
# table; a bucket with a single entry whose property is 0xbf stores the
# entry's value in the index directly and adds nothing to the data
3258 for (my $i = 0; $i < $decomp_hash_size; $i++)
3260 $decomp_hash_index[$i] = @decomp_hash_data / 2;
3261 next unless defined $decomp_hash_table[$i];
3262 if (@{$decomp_hash_table[$i]} == 1)
3264 my $entry = $decomp_hash_table[$i]->[0];
3265 if ($char_props[$entry->[0]] == 0xbf)
3267 $decomp_hash_index[$i] = $entry->[1];
3268 next;
3271 foreach my $entry (@{$decomp_hash_table[$i]})
3273 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
3276 push @decomp_hash_data, 0, 0;
3278 # composition hash table
3280 my @comp_hash_index;
3281 my @comp_hash_data;
3282 if (@comp_hash_table)
3284 for (my $i = 0; $i < $comp_hash_size; $i++)
3286 $comp_hash_index[$i] = @comp_hash_data;
3287 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
3289 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
3290 push @comp_hash_data, 0, 0, 0;
# property rows cover 128 characters each
3293 my $level1 = ($MAX_CHAR + 1) / 128;
3294 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
3296 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
3297 0, $decomp_hash_size, $comp_hash_size, 0 );
3298 my @tables = (0) x 8;
# start of each table; presumably counted in 16-bit words, since the
# sizes of the tables packed as bytes ("C*") are halved - TODO confirm
3300 $tables[0] = 16 + @header + @tables;
3301 $tables[1] = $tables[0] + @class_values / 2;
3302 $tables[2] = $tables[1] + $level1 / 2;
3303 $tables[3] = $tables[2] + (@rows - $level1) / 2;
3304 $tables[4] = $tables[3] + @decomp_hash_index;
3305 $tables[5] = $tables[4] + @decomp_hash_data;
3306 $tables[6] = $tables[5] + length $decomp_char_data;
3307 $tables[7] = $tables[6] + @comp_hash_index;
# file layout: 16-word name, header, table offsets, then the tables
3309 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
3310 print OUTPUT pack "S<*", @header;
3311 print OUTPUT pack "S<*", @tables;
3312 print OUTPUT pack "C*", @class_values;
3314 print OUTPUT pack "C*", @rows[0..$level1-1];
3315 print OUTPUT pack "C*", @rows[$level1..$#rows];
3316 print OUTPUT pack "S<*", @decomp_hash_index;
3317 print OUTPUT pack "S<*", @decomp_hash_data;
3318 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
3319 print OUTPUT pack "S<*", @comp_hash_index;
3320 print OUTPUT pack "S<*", @comp_hash_data;
3322 close OUTPUT;
3323 save_file($filename);
3325 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
################################################################
# output a codepage definition file from the global tables
#
# Writes "nls/c_NNN.nls" for the codepage number $codepage, as a double
# byte table when @lead_bytes is non-empty and a single byte table
# otherwise, then registers the file under the "Codepage" registry key.
sub output_codepage_file($)
{
    my $codepage = shift;
    my $basename = sprintf( "c_%03d.nls", $codepage );
    my $output = "nls/$basename";

    open OUTPUT,">$output.new" or die "Cannot create $output";
    print "Building $output\n";

    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), $basename );
}
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Parses $filename from the Microsoft code page archive into the global
# tables @cp2uni, @glyph2uni, @uni2cp and the lead byte list, writes the
# codepage file with output_codepage_file(), and for codepage 949 also
# calls dump_krwansung_codepage().
# Dies on an unrecognized line or when the file has no CODEPAGE line.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";
    my ($codepage, $width, $count);
    my ($lb_cur, $lb_end);

    # start from empty global tables and the default characters
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/; # skip comments
        next if /^\s*$/; # skip empty lines
        next if /\x1a/; # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # the first hex class used to read [0-9a-fA-f], which also accepted
        # G-Z and some punctuation
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1; # parsed but not used below
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        # section header: the following "0x.. 0x.." lines belong to it
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # in every table the first mapping seen for a key wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # after the last entry of a table move on to the next lead
                # byte; past the end of the range expect a new DBCSRANGE
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    # without a codepage number the output would be written as c_000.nls
    die "$filename: missing CODEPAGE line\n" unless defined $codepage;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
################################################################
# align a string length
#
# Returns $str padded with NUL bytes up to the next multiple of $align;
# a string whose length is already a multiple is returned unchanged.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . ("\0" x ($align - $excess));
}
################################################################
# pack a GUID string
#
# Converts "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" (hex digits, either
# case) into its 16-byte binary form: a little-endian 32-bit field, two
# little-endian 16-bit fields and eight single bytes.
# Dies when the argument does not contain a GUID.
sub pack_guid($)
{
    # use a lexical rather than $_, so the caller's $_ is left alone
    my $guid = shift;
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
    # a failed match used to pack whatever an earlier match had captured
    die "malformed GUID $guid" unless @fields == 11;
    return pack "L<S<2C8", map { hex $_ } @fields;
}
################################################################
# comparison function for compression sort
#
# Sort comparator working on the package variables $a and $b, which are
# array references: shorter arrays come first, arrays of equal length
# are ordered numerically on elements 4 to 12, in that order.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    return $order if $order;

    foreach my $idx (4 .. 12)
    {
        $order = $a->[$idx] <=> $b->[$idx];
        return $order if $order;
    }
    return $order;
}
3480 ################################################################
3481 # build a binary sort keys table
3482 sub dump_sortkey_table($$)
3484 my ($filename, $download) = @_;
3486 my @keys;
3487 my ($part, $section, $subsection, $guid, $version, $ling_flag);
3488 my @multiple_weights;
3489 my @expansions;
3490 my @compressions;
3491 my %exceptions;
3492 my %guids;
3493 my %compr_flags;
3494 my %locales;
3495 my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
3496 my $jamostr = "";
3498 my $re_hex = '0x[0-9A-Fa-f]+';
3499 my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
3500 $guids{$default_guid} = { };
3502 my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
3504 my $KEYS = open_data_file( $MSDATA, $download );
3506 printf "Building $filename\n";
3508 while (<$KEYS>)
3510 s/\s*;.*$//;
3511 next if /^\s*$/; # skip empty lines
3512 if (/^\s*(SORTKEY|SORTTABLES)/)
3514 $part = $1;
3515 next;
3517 if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
3519 $part = $section = "";
3520 next;
3522 if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
3524 $section = $1;
3525 $guid = undef;
3526 next;
3528 next unless $part;
3529 if ("$part.$section" eq "SORTKEY.DEFAULT")
3531 if (/^\s*($re_hex)\s+$re_key/)
3533 $keys[hex $1] = [ split(/\s+/,$2) ];
3534 next;
3537 elsif ("$part.$section" eq "SORTTABLES.RELEASE")
3539 if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
3541 $version = hex $1;
3542 next;
3544 if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
3546 # ignore for now
3547 next;
3550 elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
3551 "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
3552 "$part.$section" eq "SORTTABLES.INVERSECASING")
3554 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
3556 $guid = lc $1;
3557 $guids{$guid} = { } unless defined $guids{$guid};
3558 $guids{$guid}->{flags} |= $flags{$section};
3559 next;
3561 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
3563 $locales{$1} = $guid;
3564 next;
3567 elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
3569 if (/^\s*(\d+)\s+(\d+)/)
3571 push @multiple_weights, $1, $2;
3572 next;
3575 elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
3577 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
3579 my $pos = scalar @expansions / 2;
3580 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
3581 push @expansions, hex $2, hex $3;
3582 next;
3585 elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
3587 if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
3589 $keys[hex $1] = $keys[hex $2];
3590 next;
3593 elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
3595 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
3597 if ($subsection || !$guid) # start a new one
3599 $guid = lc $1;
3600 $subsection = "";
3601 $guids{$guid} = { } unless defined $guids{$guid};
3602 $guids{$guid}->{flags} |= $flags{$2} if $2;
3603 $guids{$guid}->{compr} = @compressions;
3604 $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
3605 $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
3606 push @compressions, [ ];
3608 else # merge with current one
3610 $guids{lc $1} = { } unless defined $guids{lc $1};
3611 $guids{lc $1}->{flags} |= $flags{$2} if $2;
3612 $guids{lc $1}->{compr} = $guids{$guid}->{compr};
3613 $compr_flags{lc $1} = $compr_flags{$guid};
3615 next;
3617 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
3619 $locales{$1} = $guid;
3620 next;
3622 if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
3624 $subsection = $1;
3625 next;
3627 if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
3629 my @comp = map { hex $_; } split(/\s+/,$1);
3630 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
3631 # add compression flags
3632 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
3633 next;
3636 elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
3638 if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
3640 $guid = lc $1;
3641 $guids{$guid} = { } unless defined $guids{lc $1};
3642 $ling_flag = ($2 ? "+" : "-");
3643 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
3644 next;
3646 if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
3648 $locales{$1} = $guid;
3649 next;
3651 if (/^\s*($re_hex)\s+$re_key/)
3653 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
3654 next;
3657 elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
3659 if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
3661 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
3662 next;
3665 die "$download: $part.$section: unrecognized line $_\n";
3667 close $KEYS;
3669 # Sortkey table
3671 my $table;
3672 for (my $i = 0; $i < 0x10000; $i++)
3674 my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
3675 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
3678 foreach my $id (sort keys %exceptions)
3680 my $pos = length($table) / 4;
3681 my @exc = @{$exceptions{$id}};
3682 my @filled;
3683 my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
3684 my $guid = substr( $id, 0, -1 );
3685 $guids{$guid}->{$key} = $pos;
3686 $pos += 0x100;
3687 my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
3688 for (my $j = 0; $j < 0x10000; $j++)
3690 next unless defined $exc[$j] || defined $flags[$j];
3691 $filled[$j >> 8] = 1;
3692 $j |= 0xff;
3694 for (my $j = 0; $j < 0x100; $j++)
3696 $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
3697 $pos += 0x100 if $filled[$j];
3699 for (my $j = 0; $j < 0x10000; $j++)
3701 next unless $filled[$j >> 8];
3702 my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
3703 $k[3] |= $flags[$j] || 0;
3704 $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
3708 # Case mapping tables
3710 # standard table
3711 my @casemaps;
3712 my @upper = @toupper_table;
3713 my @lower = @tolower_table;
3714 remove_linguistic_mappings( \@upper, \@lower );
3715 $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
3717 # linguistic table
3718 $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
3720 # Turkish table
3721 @upper = @toupper_table;
3722 @lower = @tolower_table;
3723 $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
3724 $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
3725 $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
3726 my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
3728 # Char type table
3730 my @table;
3731 my $types = "";
3732 my %typestr;
3733 for (my $i = 0; $i < 0x10000; $i++)
3735 my $str = pack "S<3",
3736 ($category_table[$i] || 0) & 0xffff,
3737 defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
3738 ($category_table[$i] || 0) >> 16;
3740 if (!defined($typestr{$str}))
3742 $typestr{$str} = length($types) / 6;
3743 $types .= $str;
3745 $table[$i] = $typestr{$str};
3748 my @rows = compress_array( 4096, 0, @table[0..65535] );
3749 my @array = compress_array( 256, 0, @rows[0..4095] );
3750 for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; } # we need byte offsets
3751 for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
3753 my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
3754 my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
3755 $chartypes = align_string( 8, $chartypes . $types . $arraystr );
3757 # Sort tables
3759 # guids
3760 my $sorttables = pack "L<2", $version, scalar %guids;
3761 foreach my $id (sort keys %guids)
3763 my %guid = %{$guids{$id}};
3764 my $flags = $guid{flags} || 0;
3765 my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
3766 $sorttables .= pack_guid($id) . pack "L<5",
3767 $flags,
3768 defined($guid{compr}) ? $guid{compr} : 0xffffffff,
3769 $guid{except} || 0,
3770 $guid{ling_except} || 0,
3771 $map / 2;
3774 # expansions
3775 $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
3777 # compressions
3778 $sorttables .= pack "L<", scalar @compressions;
3779 my $rowstr = "";
3780 foreach my $c (@compressions)
3782 my $pos = length($rowstr) / 2;
3783 my $min = 0xffff;
3784 my $max = 0;
3785 my @lengths = (0) x 8;
3786 foreach my $r (sort cmp_compression @{$c})
3788 my @row = @{$r};
3789 $lengths[scalar @row - 6]++;
3790 foreach my $val (@row[4..$#row])
3792 $min = $val if $min > $val;
3793 $max = $val if $max < $val;
3795 $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
3796 $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
3798 $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
3800 $sorttables .= $rowstr;
3802 # multiple weights
3803 $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
3805 # jamo sort
3806 $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
3808 # Locales
3810 add_registry_key( "Sorting\\Ids", "{$default_guid}" );
3811 foreach my $loc (sort keys %locales)
3813 # skip specific locales that match more general ones
3814 my @parts = split /[-_]/, $loc;
3815 next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
3816 next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
3817 add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
3820 # File header
3822 my @header;
3823 $header[0] = 16;
3824 $header[1] = $header[0] + length $table;
3825 $header[2] = $header[1] + length $casemaps;
3826 $header[3] = $header[2] + length $chartypes;
3828 open OUTPUT, ">$filename.new" or die "Cannot create $filename";
3829 print OUTPUT pack "L<*", @header;
3830 print OUTPUT $table, $casemaps, $chartypes, $sorttables;
3831 close OUTPUT;
3832 save_file($filename);
3833 return $chartypes;
# Locale descriptions indexed by locale name (filled in by build_locale_data).
my %lcnames;

# Return the name of the locale that the given locale name falls back to.
#
# Lookup order: an explicit "sparent" entry, then an explicit "parent" entry,
# then the name with its last "-xxx" component removed.  A name without any
# dash yields "" and an empty/undefined name yields undef, so that callers
# can walk up the chain until the result is no longer defined.
sub locale_parent($)
{
    my $name = shift;

    return undef unless $name;

    my $entry = $lcnames{$name};
    if (defined $entry)
    {
        # explicit overrides take precedence over the name-based rule
        foreach my $key (qw(sparent parent))
        {
            return $entry->{$key} if defined $entry->{$key};
        }
    }
    return $1 if $name =~ /(.*)-[0-9A-Za-z]+/;
    return "";
}
# Sort comparator for locale names: compares $a and $b after mapping
# upper-case ASCII letters to lower case and "_" to "-".
sub compare_locales
{
    my @keys = ($a, $b);
    tr/A-Z_/a-z-/ foreach @keys;  # normalize the copies, not $a/$b themselves
    return $keys[0] cmp $keys[1];
}
# Query an xml key.
#
# Runs $query through the find() method of $xml and returns the text content
# of the first result.  Returns undef when find() yields nothing; when there
# is more than one result a diagnostic is printed on STDERR and the first
# one is still used.
sub xml_query($$)
{
    my ($xml, $query) = @_;

    my $nodes = $xml->find( $query );
    if (!$nodes) { return undef; }

    if (@{$nodes} > 1)
    {
        printf STDERR "multiple entries for %s\n", $query;
    }
    return $nodes->[0]->textContent;
}
# Query an xml key for a locale, with fallback to the parents.
#
# Starting at the locale's own name and following locale_parent(), the query
# is run against the {xml} document of every locale known in %lcnames.  The
# first result whose text is not the "↑↑↑" marker is returned; undef is
# returned once the parent chain is exhausted.
sub loc_query($$)
{
    my ($loc, $query) = @_;
    my $inherit_marker = "\x{2191}\x{2191}\x{2191}";  # "↑↑↑"

    # fallback to "en-US" for root locale
    if (!$loc->{name}) { $loc = $lcnames{"en-US"}; }

    my $cur = $loc->{name};
    while (defined $cur)
    {
        my $entry = $lcnames{$cur};
        next unless defined $entry;

        my $nodes = $entry->{xml}->find( $query );
        next unless $nodes;

        if (@{$nodes} > 1)
        {
            printf STDERR "%s: multiple entries for %s\n", $cur, $query;
        }
        my $text = $nodes->[0]->textContent;
        return $text unless $text eq $inherit_marker;
    }
    continue
    {
        # also reached through "next": move on to the parent locale
        $cur = locale_parent( $cur );
    }
    return undef;
}
# Retrieve a locale field entry by going up the parents tree.
#
# Lookup order:
#   1. the locale itself;
#   2. "en-US" if the locale has no name (root locale);
#   3. every locale along the {alias} chain;
#   4. every ancestor, using {sparent} when set and otherwise the name with
#      its last "-xxx" component removed.
# $def is returned when none of them defines $field.
sub locale_entry($$$)
{
    my ($loc, $field, $def) = @_;

    my $own = $loc->{$field};
    return $own if defined $own;

    if (!$loc->{name})  # fallback to "en-US" for root locale
    {
        $loc = $lcnames{"en-US"};
        return $loc->{$field} if defined $loc->{$field};
    }

    # resolve aliases
    while (defined( my $target = $loc->{alias} ))
    {
        $loc = $lcnames{$target};
        return $loc->{$field} if defined $loc->{$field};
    }

    # walk up the parents
    my $cur = $loc->{name};
    while ($cur)
    {
        my $entry = $lcnames{$cur};
        my $up;
        if (defined $entry && defined $entry->{sparent}) { $up = $entry->{sparent}; }
        elsif ($cur =~ /(.*)-[0-9A-Za-z]+/) { $up = $1; }
        else { last; }  # no parent left

        $cur = $up;
        my $parent = $lcnames{$cur};
        return $parent->{$field} if defined $parent && defined $parent->{$field};
    }
    return $def;
}
# Pool of packed string data; (re)initialized by build_locale_data.
my $string_data;

# Add a packed blob to the string pool and return its offset in 16-bit units.
#
# If the blob is already present in the pool the existing copy is reused.
# Only matches starting at an even byte offset are accepted: index() works on
# bytes and may find the blob straddling two 16-bit units, in which case the
# returned offset would be fractional and get truncated when callers pack it.
sub add_str_data($)
{
    my $txt = shift;

    $string_data = "" unless defined $string_data;

    my $ret = index( $string_data, $txt );
    while ($ret != -1 && $ret % 2)
    {
        # misaligned match: keep searching after it
        $ret = index( $string_data, $txt, $ret + 1 );
    }
    if ($ret == -1)
    {
        $ret = length($string_data);
        $string_data .= $txt;
    }
    return $ret / 2;
}
# Add a string to the string pool and return the value of add_str_data().
#
# The string is stored as a 16-bit length (in UTF-16 code units), the
# UTF-16LE text and a 16-bit zero terminator.  Undefined and empty strings
# are not stored; 0 is returned for them.
sub add_string($)
{
    my $str = shift;

    if (!defined($str) || $str eq "") { return 0; }

    my $chars = encode( "UTF16LE", $str );
    my $count = pack "S<", length($chars) / 2;
    my $terminator = pack "S", 0;
    return add_str_data( $count . $chars . $terminator );
}
# Add a list of 32-bit values to the string pool and return the value of
# add_str_data().  The list is preceded by a 16-bit count equal to twice the
# number of values (presumably its size in 16-bit units -- confirm against
# the reader of this data).
sub add_fontsig(@)
{
    my @values = @_;
    my $count = 2 * @values;
    return add_str_data( pack "S<L<*", $count, @values );
}
# Add an array of strings to the string pool and return the value of
# add_str_data().  Each string is added with add_string(); the array itself
# is stored as a 16-bit element count followed by the 32-bit results of
# those calls.  An empty list is not stored; 0 is returned for it.
sub add_strarray(@)
{
    my @strings = @_;

    return 0 if !@strings;

    my @offsets = map { add_string($_) } @strings;
    return add_str_data( pack "S<L<*", scalar @offsets, @offsets );
}
# Derive a digit grouping string from a number pattern.
#
# The result has one character per group size, the rightmost group first:
#   "#,##0"     -> chr(3)
#   "#,##,##0"  -> chr(3) . chr(2)
# Patterns without a recognizable group yield chr(3).
sub format_to_grouping($)
{
    my $format = shift;

    # two different group sizes
    if (my ($secondary, $primary) = $format =~ /#,(#+),(#+0)/)
    {
        return chr(length($primary)) . chr(length($secondary));
    }
    # a single group size
    if (my ($group) = $format =~ /#,(#+0)/)
    {
        return chr(length($group));
    }
    # unknown format (not reported)
    return chr(3);
}
# Map a currency pattern of the form "positive[;negative]" to a pair of
# (positive, negative) layout indices.
#
# Each index is the position of the first matching entry in the pattern
# tables below (the comments show the layout, with "$" standing for the
# currency sign \xa4 and " " for the no-break space \xa0).  An unrecognized
# sub-pattern maps to 0.  When there is no negative sub-pattern the negative
# index is derived from the positive one (0 => 1, 1 => 5, 2 => 9, 3 => 8).
# $name is only used by the (disabled) diagnostics.
sub parse_currency_format($$)
{
    my ($name, $format) = @_;
    my ($posfmt, $negfmt) = split /;/, $format;

    my @pospatterns = ( "\xa4[^\xa0]*#",        # $1.1
                        "00[^\xa0]*\xa4",       # 1.1$
                        "\xa4.*\xa0.*#",        # $ 1.1
                        "00.*\xa0.*\xa4" );     # 1.1 $
    my @negpatterns = ( "\\(\xa4[^\xa0]*#",     # ($1.1)
                        "-\xa4[^\xa0]*#",       # -$1.1
                        "\xa4[^\xa0]*-#",       # $-1.1
                        "\xa4[^\xa0]*#.*00-",   # $1.1-
                        "00[^\xa0]*\xa4\\)",    # (1.1$)
                        "-#.*00[^\xa0]*\xa4",   # -1.1$
                        "00-[^\xa0]*\xa4",      # 1.1-$
                        "00[^\xa0]*\xa4-",      # 1.1$-
                        "-#.*00.*\xa0.*\xa4",   # -1.1 $
                        "-\xa4.*\xa0.*#",       # -$ 1.1
                        "00.*\xa0.*\xa4-",      # 1.1 $-
                        "\xa4.*\xa0.*#.*00-",   # $ 1.1-
                        "\xa4.*\xa0.*-#",       # $ -1.1
                        "00-.*\xa0.*\xa4",      # 1.1- $
                        "\\(\xa4.*\xa0.*#",     # ($ 1.1)
                        "00.*\xa0.*\xa4\\)" );  # (1.1 $)
    my @default_neg = ( 1, 5, 9, 8 );

    # index of the first pattern matching the text, 0 if there is none
    my $first_match = sub
    {
        my ($text, @candidates) = @_;
        my $index = 0;
        foreach my $re (@candidates)
        {
            return $index if $text =~ /$re/;
            $index++;
        }
        return 0;
    };

    my $pos = $first_match->( $posfmt, @pospatterns );
    my $neg = defined $negfmt ? $first_match->( $negfmt, @negpatterns )
                              : $default_neg[$pos];
    return ($pos, $neg);
}
# Map a percent pattern to a pair of (positive, negative) layout indices.
#
# The positive index is the position of the first matching entry in the
# pattern table below; the negative index is the same value, except that
# positive layout 3 maps to negative layout 7.  An unrecognized (or
# undefined) pattern is reported on STDERR and maps to (0, 0), like
# parse_currency_format does, instead of returning an index past the end
# of the table.
sub parse_percent_format($)
{
    my $fmt = shift;
    my @patterns = ( "0.+%",    # 1 %
                     "0%",      # 1%
                     "%#",      # %1
                     "%.+#" );  # % 1
    my @negative = ( 0, 1, 2, 7 );

    $fmt = "" unless defined $fmt;

    my $pos = 0;
    foreach my $re (@patterns)
    {
        return ($pos, $negative[$pos]) if $fmt =~ /$re/;
        $pos++;
    }
    printf STDERR "unknown format '%s'\n", $fmt;
    return (0, 0);
}
# Rewrite the field letters of a date pattern and return the result.
#
# Only the first occurrence of each field is rewritten:
#   G           -> g
#   LLLL / LLL  -> MMMM / MMM
#   E... / ccc... -> dddd
#   a lone y    -> yyyy  (runs such as "yy" are left alone)
sub convert_date_format($)
{
    my ($fmt) = @_;

    for ($fmt)  # apply the substitutions to $fmt through $_
    {
        s/G/g/;
        s/LLLL/MMMM/;
        s/LLL/MMM/;
        s/E+/dddd/;
        s/ccc+/dddd/;
        # a single "y" in the middle, at the start or at the end
        s/([^y])y([^y])/$1yyyy$2/;
        s/^y([^y])/yyyy$1/;
        s/([^y])y$/$1yyyy/;
    }
    return $fmt;
}
# Rewrite the day period field of a time pattern and return the result:
# the first run of "a" and the first run of "B" are each replaced by "tt".
sub convert_time_format($)
{
    my ($fmt) = @_;

    foreach ($fmt)
    {
        s/a+/tt/;
        s/B+/tt/;
    }
    return $fmt;
}
# Load the ISO 639-3 code table and return a hash mapping two-letter codes
# to three-letter codes.
#
# Only rows starting with three 3-letter codes followed by a 2-letter code
# are used; the key is the 2-letter code (4th field) and the value is the
# 3-letter code from the 3rd field (presumably Part1 => Part2T -- confirm
# against the table layout).
sub load_iso639()
{
    my %codes;
    my $table = open_data_file( $ISO639, "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3.tab" );
    while (<$table>)
    {
        next unless /^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/;
        $codes{$2} = $1;
    }
    close $table;
    return %codes;
}
4067 ################################################################
4068 # build the locale table for locale.nls
4069 sub build_locale_data()
4071 my $base = "cldr-release-$CLDRVERSION";
4072 my $suppl = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/supplementalData.xml" );
4073 my $subtags = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/likelySubtags.xml" );
4074 my $numbers = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/numberingSystems.xml" );
4075 # obsolete phone data from CLDR version 33
4076 my $phone = load_xml_data_file( $CLDR33DATA, "common/supplemental/telephoneCodeData.xml" );
4077 my %iso639 = load_iso639();
4078 $string_data = pack "S2", 0, 0; # offset 0 == empty string
4080 %lcnames = map { $_->{name} => $_ } @locales;
4082 my %lcids;
4083 foreach my $loc (@locales) { $lcids{$loc->{lcid}} = $loc if defined $loc->{lcid}; }
4085 my %days = ( "sun" => 0, "mon" => 1, "tue" => 2, "wed" => 3, "thu" => 4, "fri" => 5, "sat" => 6 );
4087 # assign locale parents
4089 foreach my $loc (@locales)
4091 next if $loc->{name} eq "";
4092 next if defined $loc->{parent};
4093 (my $unix_name = $loc->{name}) =~ s/-/_/g;
4094 my $parent = xml_query( $suppl, "/supplementalData/parentLocales/parentLocale[contains(concat(' ',\@locales,' '),' $unix_name ')]/\@parent" );
4095 if ($parent)
4097 $parent =~ s/_/-/g;
4098 $parent = "" if $parent eq "root";
4100 elsif ($loc->{name} =~ /(.*)-[0-9A-Za-z]+/) { $parent = $1; }
4101 $loc->{parent} = $parent || "";
4104 # load per-locale XML files
4106 foreach my $loc (@locales)
4108 next if defined $loc->{alias};
4109 (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
4110 $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
4111 my $xml = load_xml_data_file( $CLDRDATA, $file );
4112 $loc->{xml} = $xml;
4113 $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
4114 $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
4115 $loc->{script} = xml_query( $xml, "/ldml/identity/script/\@type" );
4116 if (!defined($loc->{territory}) && $loc->{name} =~ /-([A-Z]{2}|[0-9]{3})$/) { $loc->{territory} = $1; }
4117 if (!defined($loc->{script}) && $loc->{name} =~ /-([A-Z][a-z]{3})(-[A-Z]{2})?$/) { $loc->{script} = $1; }
4120 # assign a default territory and sort locale
4122 foreach my $loc (@locales)
4124 next if defined $loc->{alias};
4125 next if defined $loc->{territory};
4126 my $id = $loc->{sortlocale};
4127 if (defined $id && ($id =~ /[-_]([A-Z0-9]+)$/))
4129 $loc->{territory} = $1;
4130 next;
4132 my @children = grep /^$loc->{name}-[A-Z0-9]+$/ && !defined $lcnames{$_}->{alias}, keys %lcnames;
4133 if (@children == 1)
4135 $id = $children[0];
4137 else
4139 my $name = $loc->{file} || $loc->{name};
4140 $name =~ s/-(Arab|Cyrl|Deva|Guru|Hans|Hant|Latn|Tfng|Vaii)$//;
4141 $name =~ s/-/_/g;
4142 $id = xml_query( $subtags, "/supplementalData/likelySubtags/likelySubtag[\@from='$name']/\@to" );
4143 $id =~ s/_/-/g if $id;
4145 if ($id =~ /[-_]([A-Z0-9]+)$/)
4147 $loc->{territory} = $1;
4148 next if defined $loc->{sortlocale};
4149 next unless $id =~ /^$loc->{name}/;
4150 while (defined $lcnames{$id} && defined $lcnames{$id}->{alias}) { $id = $lcnames{$id}->{alias}; }
4151 $loc->{sortlocale} = $id if defined $lcnames{$id};
4152 next;
4154 print STDERR "no territory found for $loc->{name}\n";
4157 # assign default lcid to aliases
4159 foreach my $loc (@locales)
4161 next unless defined $loc->{alias};
4162 next if defined $loc->{lcid};
4163 my $alias = $loc->{alias};
4164 my $lcid = $lcnames{$alias}->{lcid} || 0x1000;
4165 $loc->{lcid} = $lcid | 0x80000000;
4168 # assign sort aliases to parent locale
4170 foreach my $loc (@locales)
4172 next unless $loc->{name} =~ /_/;
4173 next unless defined $loc->{alias};
4174 my $alias = $loc->{alias};
4175 my $parent = $lcnames{$alias};
4176 my $basename = $parent->{name};
4177 while (1)
4179 @{$parent->{sortnames}}[($loc->{lcid} >> 16) - 1] = $loc->{name};
4180 $alias = locale_parent( $alias );
4181 last unless $alias && defined $lcnames{$alias};
4182 $parent = $lcnames{$alias};
4183 last if defined $parent->{sortbase} && $parent->{sortbase} ne $basename;
4184 $parent->{sortbase} = $basename;
4188 # assign an array index to all locales
4190 my $idx = 0;
4191 foreach my $loc (@locales)
4193 next if defined $loc->{alias};
4194 $loc->{idx} = $idx++;
4196 foreach my $loc (@locales)
4198 my $alias = $loc->{alias};
4199 next unless defined $alias;
4200 while (defined $lcnames{$alias}->{alias}) { $alias = $lcnames{$alias}->{alias}; }
4201 $loc->{idx} = $lcnames{$alias}->{idx};
4204 # output lcids table
4206 my $lcid_data = "";
4207 foreach my $id (sort { $a <=> $b } keys %lcids)
4209 my $loc = $lcids{$id};
4210 $lcid_data .= pack "L<S<2", $id, $loc->{idx}, add_string($loc->{name});
4213 # output lcnames table
4215 my $lcname_data = "";
4216 foreach my $name (sort compare_locales keys %lcnames)
4218 my $loc = $lcnames{$name};
4219 $lcname_data .= pack "S<2L<", add_string($name), $loc->{idx}, $loc->{lcid} || 0x1000;
4222 # output locales array
4224 my $locale_data = "";
4225 my $default_lcid = 0x8001;
4226 foreach my $loc (@locales)
4228 next if defined $loc->{alias};
4229 my $sname = $loc->{name};
4230 my $language = $loc->{language};
4231 my $territory = $loc->{territory};
4232 my $script = $loc->{script};
4233 my $neutral = ($sname && $sname !~ /-$territory/);
4234 my $sparent = $loc->{sparent} || (($sname =~ /(.*)-[0-9A-Za-z]+/) ? $1 : $loc->{parent});
4235 my $unique_lcid = $loc->{lcid};
4236 unless (defined $unique_lcid) { $unique_lcid = $default_lcid++; }
4237 my $territory_match = "contains(concat(' ',normalize-space(\@territories),' '),' $territory ')";
4239 # languages and scripts
4241 my $ssortlocale = $loc->{sortlocale} || ($neutral ? "$sname-$territory" : $sname);
4242 my $idefaultlanguage = defined $lcnames{$ssortlocale} ? $lcnames{$ssortlocale}->{lcid} : undef;
4243 $idefaultlanguage = $lcnames{"en-US"}->{lcid} unless $ssortlocale;
4244 (my $siso639langname = $sname) =~ s/-.*$//;
4245 my $siso639langname2 = $iso639{$siso639langname} || $siso639langname;
4246 my $sopentypelang = sprintf "%-4s", locale_entry( $loc, "sopentypelang", uc $siso639langname2 );
4247 my $sabbrevlangname = defined $loc->{lcid} ? locale_entry( $loc, "sabbrevlangname", uc $siso639langname2 ) : "ZZZ";
4248 my $siso3166ctryname2 = $territory =~ /^\d+$/ ? $territory : xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$territory']/\@alpha3");
4249 my $senglanguage = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" ) || "";
4250 my $sengcountry = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" ) || "";
4251 my $snativelangname = loc_query( $loc, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" );
4252 my $snativectryname = loc_query( $loc, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" );
4253 $sengcountry =~ s/South Korea/Korea/;
4254 $snativelangname ||= $senglanguage;
4255 $snativectryname ||= $sengcountry;
4256 if ($script)
4258 my $engscript = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4259 my $nativescript = loc_query( $loc, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4260 $senglanguage .= " ($engscript)" if $engscript;
4261 $snativelangname .= " ($nativescript)" if $nativescript;
4263 my $sengdisplayname = $neutral ? $senglanguage : "$senglanguage ($sengcountry)";
4264 my $snativedisplayname = $neutral ? $snativelangname : "$snativelangname ($snativectryname)";
4265 $sengdisplayname =~ s/\) \(/, /;
4266 $snativedisplayname =~ s/\) \(/, /;
4267 my $sscripts = locale_entry( $loc, "sscripts", $script ) || xml_query( $suppl, "/supplementalData/languageData/language[\@type='$language' and not(\@alt)]/\@scripts" );
4268 $sscripts = (join ";", (sort split / /, ($sscripts || "Latn"))) . ";";
4269 my $ireadinglayout = locale_entry( $loc, "ireadinglayout", 0 );
4270 my $charlayout = loc_query( $loc, "/ldml/layout/orientation/characterOrder" );
4271 if ($charlayout eq "right-to-left")
4273 $ireadinglayout = 1;
4275 elsif ($charlayout eq "top-to-bottom")
4277 my $linelayout = loc_query( $loc, "/ldml/layout/orientation/lineOrder" );
4278 $ireadinglayout = $linelayout eq "right-to-left" ? 2 : 3;
4280 my $icountry = xml_query( $phone, "(/supplementalData/telephoneCodeData/codesByTerritory[\@territory='$territory']/telephoneCountryCode)[1]/\@code" ) || 1;
4282 # numbers
4284 my $sdecimal = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/decimal" );
4285 my $slist = locale_entry( $loc, "slist", ";" );
4286 my $smondecimalsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyDecimal" ) || $sdecimal;
4287 my $sthousand = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/group" );
4288 $sthousand =~ s/\x{202f}/\x{00a0}/;
4289 my $smonthousandsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyGroup" ) || $sthousand;
4290 my $spositivesign = "";
4291 my $snegativesign = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/minusSign" );
4292 my $spercent = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/percentSign" );
4293 my $snan = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/nan" );
4294 my $sposinfinity = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/infinity" );
4295 my $sneginfinity = $sposinfinity ? "-$sposinfinity" : "";
4296 my $sgrouping = format_to_grouping( loc_query( $loc, "/ldml/numbers/decimalFormats[\@numberSystem='latn']/decimalFormatLength[not(\@type)]/decimalFormat/pattern" ));
4297 my $percentformat = loc_query( $loc, "/ldml/numbers/percentFormats[\@numberSystem='latn']/percentFormatLength[not(\@type)]/percentFormat/pattern" );
4298 my $currencyformat = loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='accounting']/pattern" ) ||
4299 loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='standard']/pattern" );
4300 my $smongrouping = format_to_grouping( $currencyformat );
4301 my ($icurrency, $inegcurr) = parse_currency_format( $sname, $currencyformat );
4302 my ($ipospercent, $inegpercent) = parse_percent_format( $percentformat );
4303 my $native_numbering = loc_query( $loc, "/ldml/numbers/otherNumberingSystems/native" );
4304 my @snativedigits = split //, xml_query( $numbers, "/supplementalData/numberingSystems/numberingSystem[\@id='$native_numbering']/\@digits" );
4305 my $digitsubstitution = !(ord($snativedigits[0]) >= 0x600 && ord($snativedigits[0]) <= 0x6ff);
4306 my $measure = defined xml_query( $suppl, "/supplementalData/measurementData/measurementSystem[\@type='US' and $territory_match]" );
4307 my $papersize = defined xml_query( $suppl, "/supplementalData/measurementData/paperSize[\@type='US-Letter' and $territory_match]" );
4309 # currencies
4311 my $sintlsymbol = $loc->{sintlsymbol} || xml_query( $suppl, "(/supplementalData/currencyData/region[\@iso3166='$territory']/currency[not(\@to)])[1]/\@iso4217") || "XDR";
4312 my $scurrency = $loc->{scurrency} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[\@alt='narrow']" );
4313 $scurrency ||= loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[not(\@alt)]" );
4314 $scurrency ||= $sintlsymbol;
4315 my $sengcurrname = $loc->{sengcurrname} || loc_query( $lcnames{en}, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" );
4316 my $snativecurrname = $loc->{sengcurrname} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" ) || $sengcurrname;
4317 my $icurrdigits = xml_query( $suppl, "/supplementalData/currencyData/fractions/info[\@iso4217='$sintlsymbol']/\@digits" );
4318 $icurrdigits = 2 unless defined $icurrdigits;
4320 # calendars
4322 my $firstday = xml_query( $suppl, "/supplementalData/weekData/firstDay[not(\@alt) and $territory_match]/\@day" );
4323 my $ifirstdayofweek = $firstday ? $days{$firstday} : 1;
4324 my $firstweekofyear = (xml_query( $suppl, "/supplementalData/weekData/minDays[$territory_match]/\@count" ) || 0) == 4 ? 2 : 0;
4325 my $serastring = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/eras/eraAbbr/era[\@type='1' and not(\@alt)]" );
4326 my (@sdayname, @sabbrevdayname, @sshortestdayname);
4327 foreach my $d (sort { $days{$a} <=> $days{$b} } keys %days)
4329 my $n = $days{$d};
4330 my %name;
4331 foreach my $type (qw(wide abbreviated short))
4333 $name{$type} = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/days/dayContext[\@type='format']/dayWidth[\@type='$type']/day[\@type='$d' and not(\@alt)]" );
4335 push @sdayname, $name{wide};
4336 push @sabbrevdayname, $name{abbreviated} || $name{wide};
4337 push @sshortestdayname, $name{short} || $name{abbreviated} || $name{wide};
4339 my (@smonthname, @sabbrevmonthname, @sgenitivemonth, @sabbrevgenitivemonth);
4340 foreach my $n (1..13)
4342 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='wide']/month[\@type='$n']" );
4343 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4344 my $genitive = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n']" );
4345 my $abbrevgen = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4346 push @smonthname, $name || $genitive || "";
4347 push @sabbrevmonthname, $abbrev || $abbrevgen || $name || $genitive || "";
4348 push @sgenitivemonth, $genitive || "";
4349 push @sabbrevgenitivemonth, $abbrevgen || $genitive || "";
4351 @sgenitivemonth = () if join("|",@smonthname) eq join("|",@sgenitivemonth);
4352 @sabbrevgenitivemonth = () if join("|",@sabbrevmonthname) eq join("|",@sabbrevgenitivemonth);
4353 my %caltypes = ( "gregorian" => 1, "japanese" => 3, "chinese" => 4, "dangi" => 5, "islamic" => 6, "buddhist" => 7, "hebrew" => 8,
4354 "persian" => 22, "islamic-civil" => 23, "islamic-umalqura" => 23 );
4355 my $calpref = xml_query( $suppl, "/supplementalData/calendarPreferenceData/calendarPreference[$territory_match]/\@ordering" ) || "gregorian";
4356 my $icalendartype;
4357 my @scalnames;
4358 foreach my $c (split /\s+/, $calpref)
4360 next unless defined $caltypes{$c};
4361 $icalendartype .= chr($caltypes{$c});
4362 $scalnames[$caltypes{$c} - 1] = loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$c']" );
4365 # date/time formats
4367 my $s1159 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='am' and not(\@alt)]" );
4368 my $s2359 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='pm' and not (\@alt)]" );
4369 my $sshortestam = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='am' and not(\@alt)]" );
4370 my $sshortestpm = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='pm' and not (\@alt)]" );
4371 my @stimeformat = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='medium']/timeFormat/pattern[not(\@alt)]" ));
4372 push @stimeformat, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4373 pop @stimeformat if $stimeformat[0] eq $stimeformat[1];
4374 @stimeformat = map convert_time_format($_), @stimeformat;
4375 my @sshorttime = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='short']/timeFormat/pattern[not(\@alt)]" ));
4376 push @sshorttime, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hm' and not(\@alt)]" );
4377 pop @sshorttime if $sshorttime[0] eq $sshorttime[1];
4378 @sshorttime = map convert_time_format($_), @sshorttime;
4379 my @sshortdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4380 push @sshortdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
4381 @sshortdate = map convert_date_format($_), @sshortdate;
4382 my @slongdate = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" ));
4383 push @slongdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
4384 @slongdate = map convert_date_format($_), @slongdate;
4385 my @smonthday = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMd' and not(\@alt)]" ));
4386 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Md' and not(\@alt)]" );
4387 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMd' and not(\@alt)]" );
4388 @smonthday = map convert_date_format($_), @smonthday;
4389 my @syearmonth = map convert_date_format($_), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMM' and not(\@alt)]" );
4390 my @sduration = map convert_time_format( lc $_ ), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4391 my $srelativelongdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMEd' and not(\@alt)]" ) ||
4392 loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMEd' and not(\@alt)]" );
4393 $srelativelongdate = convert_date_format( $srelativelongdate );
4395 # codepages
4397 my %ansicpmap = ( 437 => 1252, 720 => 1256, 737 => 1253, 775 => 1257, 850 => 1252,
4398 852 => 1250, 855 => 1251, 866 => 1251, 857 => 1254, 862 => 1255 );
4399 my %maccpmap = ( 437 => 10000, 720 => 10004, 737 => 10006, 775 => 10029, 850 => 10000,
4400 852 => 10029, 855 => 10007, 857 => 10081, 862 => 10005, 866 => 10007,
4401 874 => 10021, 932 => 10001, 936 => 10008, 949 => 10003, 950 => 10002,
4402 1258 => 10000 );
4403 my %ebcdiccpmap = ( 437 => 37, 720 => 20420, 737 => 20273, 866 => 20880, 932 => 20290 );
4404 my %codepagemasks = ( 874 => [ 0x01000000, 0x00000000, 0x00000000, 0, 0x00010000, 0x00000000, 0x00010000, 0x00000000 ],
4405 932 => [ 0x00000000, 0x28c70000, 0x00000010, 0, 0x00020000, 0x00000000, 0x00020000, 0x00000000 ],
4406 936 => [ 0x00000000, 0x28010000, 0x00000002, 0, 0x00040000, 0x00000000, 0x00040000, 0x00000000 ],
4407 949 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00080000, 0x00000000, 0x00080000, 0x00000000 ],
4408 950 => [ 0x00000000, 0x28c10000, 0x00000012, 0, 0x00100000, 0x00000000, 0x00100000, 0x00000000 ],
4409 1258 => [ 0x2000000f, 0x00000000, 0x00000000, 0, 0x00000100, 0x00008000, 0x00000100, 0x00008000 ],
4410 866 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x00020000, 0x00000004, 0x02020000 ],
4411 862 => [ 0x00000800, 0x40000000, 0x00000000, 0, 0x00000020, 0x00200000, 0x00000020, 0x00200000 ],
4412 857 => [ 0x0000001f, 0x00000000, 0x00000000, 0, 0x00000010, 0x01000000, 0x00000010, 0x01000000 ],
4413 855 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x02000000, 0x00000004, 0x02000000 ],
4414 852 => [ 0x00000027, 0x00000000, 0x00000000, 0, 0x00000002, 0x04000000, 0x00000002, 0x04000000 ],
4415 775 => [ 0x00000007, 0x00000000, 0x00000000, 0, 0x00000080, 0x08000000, 0x00000080, 0x08000000 ],
4416 737 => [ 0x00000080, 0x00000000, 0x00000000, 0, 0x00000008, 0x10000000, 0x00000008, 0x10010000 ],
4417 720 => [ 0x00002000, 0x00000000, 0x00000000, 0, 0x00000040, 0x20000000, 0x00000040, 0x20080000 ],
4418 850 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x40000000, 0x0000019f, 0xdfd70000 ],
4419 437 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x80000000, 0x0000019f, 0xdfd70000 ],
4420 65001 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00000000, 0x00000000, 0x0000019f, 0xdfd70000 ] );
4421 my $oemcp = locale_entry( $loc, "oemcp", 65001 );
4422 my $maccp = locale_entry( $loc, "maccp", undef ) || $maccpmap{$oemcp} || 65001;
4423 my $ebcdiccp = locale_entry( $loc, "ebcdiccp", undef ) || $ebcdiccpmap{$oemcp} || 500;
4424 $ebcdiccp = 500 if (defined $loc->{oemcp} && $loc->{oemcp} == 65001) || (defined $loc->{maccp} && $loc->{maccp} == 65001);
4425 my $ansicp = $ansicpmap{$oemcp} || $oemcp;
4426 my @fontsig = (0) x 8;
4427 my $sig = locale_entry( $loc, "fontsig", [] );
4428 foreach my $i (0..7) { $fontsig[$i] |= $codepagemasks{$oemcp}->[$i]; }
4429 foreach my $i (0..$#{$sig}) { $fontsig[$i] |= $sig->[$i]; }
4430 $fontsig[3] |= 1 << 31;
4431 $fontsig[3] |= 1 << 27 if $ireadinglayout == 1;
4432 $fontsig[3] |= 1 << 28 if $ireadinglayout == 3;
4434 # special cases for invariant locale
4436 unless ($loc->{name})
4438 $siso639langname = "iv";
4439 $siso639langname2 = "ivl";
4440 $senglanguage = $snativelangname = "Invariant Language";
4441 $sengcountry = $snativectryname = "Invariant Country";
4442 $sengdisplayname = "Invariant Language (Invariant Country)";
4443 $snativedisplayname = "Invariant Language (Invariant Region)";
4444 $sengcurrname = $snativecurrname = "International Monetary Fund";
4445 $scurrency = "\x{00a4}";
4446 $ifirstdayofweek = 0;
4447 @stimeformat = ("HH:mm:ss");
4448 @sshortdate = ("MM/dd/yyyy", "yyyy-MM-dd");
4449 @slongdate = ("dddd, dd MMMM yyyy");
4450 @syearmonth = ("yyyy MMMM");
4451 @smonthday = ("MMMM dd", "MMMM d", "M/d", "MMM d");
4452 @sshorttime = ("HH:mm", "hh:mm tt", "H:mm", "h:mm tt");
4453 $srelativelongdate = "dddd, MMMM dd";
4454 $sposinfinity = "Infinity";
4455 $sneginfinity = "-Infinity";
4456 $spositivesign = "+";
4457 $ipospercent = $inegpercent = 0;
4460 # output data
4462 $locale_data .= pack "L<2",
4463 add_string( $sname ), # name
4464 add_string( $sopentypelang ); # LOCALE_SOPENTYPELANGUAGETAG
4466 $locale_data .= pack "S<14",
4467 $loc->{lcid} || 0x1000, # LOCALE_ILANGUAGE
4468 $unique_lcid, # unique_lcid
4469 locale_entry( $loc, "idigits", 2 ), # LOCALE_IDIGITS
4470 locale_entry( $loc, "inegnumber", 1 ), # LOCALE_INEGNUMBER
4471 $icurrdigits, # LOCALE_ICURRDIGITS
4472 $icurrency, # LOCALE_ICURRENCY
4473 $inegcurr, # LOCALE_INEGCURR
4474 locale_entry( $loc, "ilzero", 1 ), # LOCALE_ILZERO
4475 !$neutral, # LOCALE_INEUTRAL
4476 $ifirstdayofweek, # LOCALE_IFIRSTDAYOFWEEK
4477 $firstweekofyear, # LOCALE_IFIRSTWEEKOFYEAR
4478 $icountry, # LOCALE_ICOUNTRY,
4479 $measure, # LOCALE_IMEASURE
4480 $digitsubstitution; # LOCALE_IDIGITSUBSTITUTION
4482 $locale_data .= pack "L<18",
4483 add_string( $sgrouping ), # LOCALE_SGROUPING
4484 add_string( $smongrouping ), # LOCALE_SMONGROUPING
4485 add_string( $slist ), # LOCALE_SLIST
4486 add_string( $sdecimal ), # LOCALE_SDECIMAL
4487 add_string( $sthousand ), # LOCALE_STHOUSAND
4488 add_string( $scurrency ), # LOCALE_SCURRENCY
4489 add_string( $smondecimalsep ), # LOCALE_SMONDECIMALSEP
4490 add_string( $smonthousandsep ), # LOCALE_SMONTHOUSANDSEP
4491 add_string( $spositivesign ), # LOCALE_SPOSITIVESIGN
4492 add_string( $snegativesign ), # LOCALE_SNEGATIVESIGN
4493 add_string( $s1159 ), # LOCALE_S1159
4494 add_string( $s2359 ), # LOCALE_S2359
4495 add_strarray( @snativedigits ), # LOCALE_SNATIVEDIGITS
4496 add_strarray( @stimeformat ), # LOCALE_STIMEFORMAT
4497 add_strarray( @sshortdate ), # LOCALE_SSHORTDATE
4498 add_strarray( @slongdate ), # LOCALE_SLONGDATE
4499 add_strarray( @syearmonth ), # LOCALE_SYEARMONTH
4500 add_strarray( @sduration ); # LOCALE_SDURATION
4502 $locale_data .= pack "S<8",
4503 $idefaultlanguage || 0x1000, # LOCALE_IDEFAULTLANGUAGE
4504 $ansicp, # LOCALE_IDEFAULTANSICODEPAGE
4505 $oemcp, # LOCALE_IDEFAULTCODEPAGE
4506 $maccp, # LOCALE_IDEFAULTMACCODEPAGE
4507 $ebcdiccp, # LOCALE_IDEFAULTEBCDICCODEPAGE
4508 0, # FIXME # LOCALE_IGEOID
4509 $papersize ? 1 : 9, # LOCALE_IPAPERSIZE
4510 0; # FIXME # islamic_cal
4512 $locale_data .= pack "L<24",
4513 add_string( $icalendartype ), # LOCALE_ICALENDARTYPE
4514 add_string( $sabbrevlangname ), # LOCALE_SABBREVLANGNAME
4515 add_string( $siso639langname ), # LOCALE_SISO639LANGNAME
4516 add_string( $senglanguage ), # LOCALE_SENGLANGUAGE
4517 add_string( $snativelangname ), # LOCALE_SNATIVELANGNAME
4518 add_string( $sengcountry ), # LOCALE_SENGCOUNTRY
4519 add_string( $snativectryname ), # LOCALE_SNATIVECTRYNAME
4520 add_string( $siso3166ctryname2 ), # LOCALE_SABBREVCTRYNAME
4521 add_string( $territory ), # LOCALE_SISO3166CTRYNAME
4522 add_string( $sintlsymbol ), # LOCALE_SINTLSYMBOL
4523 add_string( $sengcurrname ), # LOCALE_SENGCURRNAME
4524 add_string( $snativecurrname ), # LOCALE_SNATIVECURRNAME
4525 add_fontsig( @fontsig ), # LOCALE_FONTSIGNATURE
4526 add_string( $siso639langname2 ), # LOCALE_SISO639LANGNAME2
4527 add_string( $siso3166ctryname2 ), # LOCALE_SISO3166CTRYNAME2
4528 add_string( $sparent ), # LOCALE_SPARENT
4529 add_strarray( @sdayname ), # LOCALE_SDAYNAME
4530 add_strarray( @sabbrevdayname ), # LOCALE_SABBREVDAYNAME
4531 add_strarray( @smonthname ), # LOCALE_SMONTHNAME
4532 add_strarray( @sabbrevmonthname ), # LOCALE_SABBREVMONTHNAME
4533 add_strarray( @sgenitivemonth ), # LOCALE_SGENITIVEMONTH
4534 add_strarray( @sabbrevgenitivemonth ), # LOCALE_SABBREVGENITIVEMONTH
4535 add_strarray( @scalnames ), # LOCALE_SCALNAMES
4536 add_strarray( @{$loc->{sortnames}} ); # LOCALE_SSORTNAMES
4538 $locale_data .= pack "S<6",
4539 $inegpercent, # LOCALE_INEGATIVEPERCENT
4540 $ipospercent, # LOCALE_IPOSITIVEPERCENT
4541 0, # unknown
4542 $ireadinglayout, # LOCALE_IREADINGLAYOUT
4543 0x2a, # unknown
4544 0x2a; # unknown
4546 $locale_data .= pack "L<24",
4547 0, # unknown
4548 add_string( $sengdisplayname ), # LOCALE_SENGLISHDISPLAYNAME
4549 add_string( $snativedisplayname ), # LOCALE_SNATIVEDISPLAYNAME
4550 add_string( $spercent ), # LOCALE_SPERCENT
4551 add_string( $snan ), # LOCALE_SNAN
4552 add_string( $sposinfinity ), # LOCALE_SPOSINFINITY
4553 add_string( $sneginfinity ), # LOCALE_SNEGINFINITY
4554 0, # unknown
4555 add_string( $serastring ), # CAL_SERASTRING
4556 add_string( $serastring ), # CAL_SABBREVERASTRING
4557 0, # unknown
4558 add_string( $ssortlocale ), # LOCALE_SCONSOLEFALLBACKNAME
4559 add_strarray( @sshorttime ), # LOCALE_SSHORTTIME
4560 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
4561 0, # unknown
4562 add_string( $ssortlocale ), # LOCALE_SSORTLOCALE
4563 add_string( "0409:00000409" ), # FIXME # LOCALE_SKEYBOARDSTOINSTALL
4564 add_string( $sscripts ), # LOCALE_SSCRIPTS
4565 add_string( $srelativelongdate ), # LOCALE_SRELATIVELONGDATE
4566 0, # FIXME # LOCALE_IGEOID2
4567 add_string( $sshortestam || "a" ), # LOCALE_SSHORTESTAM
4568 add_string( $sshortestpm || "p" ), # LOCALE_SSHORTESTPM
4569 add_strarray( @smonthday ), # LOCALE_SMONTHDAY
4570 add_string( "k0-windows-us" ) # FIXME # keyboard_layout
4573 my $nb_lcids = scalar keys %lcids;
4574 my $nb_locales = scalar grep { !defined $_->{alias} } @locales;
4575 my $nb_lcnames = scalar keys %lcnames;
4576 my $locale_size = length($locale_data) / $nb_locales;
4577 my $nb_calendars = 0;
4578 my $lcids_offset = 19 * 4; # size of header
4579 my $lcnames_offset = $lcids_offset + length $lcid_data;
4580 my $locales_offset = $lcnames_offset + length $lcname_data;
4581 my $calendar_offset = $locales_offset + length $locale_data;
4582 my $strings_offset = $calendar_offset;
4584 my $locale_header = pack "L<7S<4L<S<2L<3S<2L<4",
4585 8, # offset
4587 7, # version
4588 0x5344534e, # magic
4589 0, 0, 0,
4591 $nb_lcids,
4592 $nb_locales,
4593 $locale_size,
4594 $locales_offset,
4595 $nb_lcnames,
4597 $lcids_offset,
4598 $lcnames_offset,
4600 $nb_calendars,
4602 $calendar_offset,
4603 $strings_offset,
4604 0, 0;
4606 return align_string( 4, $locale_header . $lcid_data . $lcname_data . $locale_data . $string_data );
################################################################
# build a binary locale table
#
# Writes "$filename.new" and hands it over to save_file(). The file starts
# with a header of eight little-endian 32-bit values holding section offsets:
# [0] chartypes, [4] locales, [5] charmaps, [6] geoids, [7] scripts; entries
# 1 to 3 are left at zero. The sections follow the header in that order.
# The charmaps, geoids and scripts sections are currently empty (FIXME).
#
#   $filename   path of the file to generate
#   $chartypes  binary chartypes section, stored right after the header
#
# Dies if the output file cannot be created or written.
sub dump_locales($$)
{
    my ($filename, $chartypes) = @_;

    # the file name is data, never a format string
    printf "Building %s\n", $filename;

    my $locale_data = build_locale_data();
    my $charmaps_data = "";  # FIXME
    my $geoids_data = "";  # FIXME
    my $scripts_data = "";  # FIXME

    my @header = ( 0 ) x 8;
    $header[0] = 4 * scalar @header;  # chartypes offset
    $header[4] = $header[0] + length $chartypes;  # locales offset
    $header[5] = $header[4] + length $locale_data;  # charmaps offset
    $header[6] = $header[5] + length $charmaps_data;  # geoids offset
    $header[7] = $header[6] + length $geoids_data;  # scripts offset

    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    # the content is packed binary data: disable any newline or encoding layer
    binmode $output;
    print $output pack( "L<*", @header );
    print $output $chartypes, $locale_data, $charmaps_data, $geoids_data, $scripts_data;
    # buffered write errors only show up at close time; do not let save_file()
    # install a truncated file
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# build the script to create registry keys
#
# Writes "$filename.new" and hands it over to save_file(). The output is a
# nested "name { ... }" script rooted at HKLM, with the fixed path
# SYSTEM\CurrentControlSet\Control\Nls emitted as NoRemove keys.
#
#   $filename  path of the registry script to generate
#   %keys      maps a backslash-separated key path (emitted below the Nls
#              key) to an array ref: the first element is the default value
#              of the innermost key (skipped when false), the remaining
#              elements are emitted, sorted, as "val" lines inside that key
#
# Dies if the output file cannot be created or written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print $output "HKLM\n{\n";
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf $output "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }
    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            # only the innermost subkey carries the default value
            printf $output "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # pass the value as an argument: it is data and may contain '%'
        foreach my $v (sort @vals) { printf $output "%*sval %s\n", 4 * $indent, "", $v; }
        for (my $i = 0; $i < @subkeys; $i++) { printf $output "%*s}\n", 4 * --$indent, ""; }
    }
    # close the Nls path and the HKLM block
    while ($indent) { printf $output "%*s}\n", 4 * --$indent, ""; }
    # buffered write errors only show up at close time
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# save a file if modified
#
# Expects the freshly generated content in "$file.new". When $file already
# exists with identical content the .new file is discarded and $file is left
# untouched; otherwise the .new file is renamed over $file.
#
#   $file  path of the target file (without the ".new" suffix)
#
# Dies if the two files cannot be compared or the rename fails.
sub save_file($)
{
    my $file = shift;
    my $new = "$file.new";

    # compare in-process rather than running "cmp" through the shell, so that
    # names containing spaces or shell metacharacters are handled correctly
    require File::Compare;
    my $differs = -f $file ? File::Compare::compare( $file, $new ) : 1;
    die "Cannot compare $file and $new: $!" if $differs < 0;

    if (!$differs)
    {
        # a leftover .new file is harmless, so only warn about it
        unlink $new or warn "Cannot remove $new: $!\n";
    }
    else
    {
        rename $new, $file or die "Cannot rename $new to $file: $!";
    }
}
################################################################
# main routine

# move up one directory when started from the directory holding make_unicode,
# since all output paths below are relative to its parent
chdir ".." if -f "./make_unicode";

# directories that each receive their own copy of the shared tables
my @uniscribe_dwrite_dirs = ( "dlls/gdi32/uniscribe", "dlls/dwrite" );

load_data();
dump_sortkeys( "dlls/kernelbase/collation.c" );
foreach my $dir (@uniscribe_dwrite_dirs) { dump_bidi_dir_table( "$dir/direction.c" ); }
dump_digit_folding( "dlls/kernelbase/digitmap.c" );
foreach my $dir (@uniscribe_dwrite_dirs) { dump_mirroring( "$dir/mirror.c" ); }
foreach my $dir (@uniscribe_dwrite_dirs) { dump_bracket( "$dir/bracket.c" ); }
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $dir (@uniscribe_dwrite_dirs) { dump_linebreak( "$dir/linebreak.c" ); }
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/win32u/vertical.c", 1 );
dump_vertical( "dlls/wineps.drv/vertical.c", 0 );
dump_intl_nls("nls/l_intl.nls");
foreach my $form (qw(nfc nfd nfkc nfkd idna)) { dump_norm_table( "nls/norm$form.nls" ); }

# the sort key table hands back the chartypes data that locale.nls embeds
my $chartypes = dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
dump_locales( "nls/locale.nls", $chartypes );
foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
dump_eucjp_codepage();
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
4723 # Local Variables:
4724 # compile-command: "./make_unicode"
4725 # End: