midimap: Handle MIDI running status.
[wine.git] / tools / make_unicode
blob40b64ddca714c7c9b11c2582630366c3d5bd7983
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
23 use XML::LibXML;
24 use Encode;
# Base URLs and version numbers for the upstream data files.
# All download locations use https, including $REPORTS, which previously
# was the only one fetched over plain http.
my $UNIVERSION = "14.0.0";
my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
my $UNIHAN = "https://www.unicode.org/Public/$UNIVERSION/ucd/Unihan.zip";
my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
my $REPORTS = "https://www.unicode.org/reports";
my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";

# CLDR archives: the current release plus the older 33.0 "common" archive
my $CLDRVERSION = "41";
my $CLDRDATA = "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip";
my $CLDR33DATA = "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip";

# ISO 639-3 code tables, identified by their release date
my $ISO639VERSION = "20220120";
my $ISO639 = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip";

# Sort keys file (presumably a path below $REPORTS -- confirm against the code that loads it)
my $SORTKEYS = "tr10/allkeys.txt";

# Default char for undefined mappings
my $DEF_CHAR = ord '?';

# Last valid Unicode character
my $MAX_CHAR = 0x10ffff;
# Code page description files, one "CodpageFiles/<number>.txt" entry per
# code page.  The numbers stay strings so the leading zero of "037" is kept.
my @allfiles = map { "CodpageFiles/$_.txt" } qw(
    037 437 500 708 720 737 775
    850 852 855 857 860 861 862 863 864 865 866 869 874 875
    932 936 949 950
    1026 1250 1251 1252 1253 1254 1255 1256 1257 1258 1361
    10000 10001 10002 10003 10004 10005 10006 10007 10008
    10010 10017 10021 10029 10079 10081 10082
    20127 20866 21866
    28591 28592 28593 28594 28595 28596 28597 28598 28599 28603 28605
);
# Character type flags, written as bit positions.  The CT_CTYPE1 flags sit
# in the low bits and the CT_CTYPE3 flags in the high 16 bits.
my %ctype =
(
    # CT_CTYPE1
    "upper"  => 1 << 0,
    "lower"  => 1 << 1,
    "digit"  => 1 << 2,
    "space"  => 1 << 3,
    "punct"  => 1 << 4,
    "cntrl"  => 1 << 5,
    "blank"  => 1 << 6,
    "xdigit" => 1 << 7,
    "alpha"  => (1 << 8) | 0x80000000,  # also carries the top bit
    "defin"  => 1 << 9,
    # CT_CTYPE3 in high 16 bits
    "nonspacing"    => 1 << 16,
    "diacritic"     => 1 << 17,
    "vowelmark"     => 1 << 18,
    "symbol"        => 1 << 19,
    "katakana"      => 1 << 20,
    "hiragana"      => 1 << 21,
    "halfwidth"     => 1 << 22,
    "fullwidth"     => 1 << 23,
    "ideograph"     => 1 << 24,
    "kashida"       => 1 << 25,
    "lexical"       => 1 << 26,
    "highsurrogate" => 1 << 27,
    "lowsurrogate"  => 1 << 28,
);
# Codes for the two bracket type letters ("o" and "c"; presumably
# open/close -- confirm against the code that reads this table).
my %bracket_types = ( o => 0x0000, c => 0x0001 );
# Numeric codes for the Indic category names below.  The codes are
# consecutive, starting at 0x0000 for "Other", so the position in the list
# determines the value: do not reorder, only append.
my %indic_types = do {
    my $next_code = 0x0000;
    map { ( $_ => $next_code++ ) } qw(
        Other
        Bindu
        Visarga
        Avagraha
        Nukta
        Virama
        Vowel_Independent
        Vowel_Dependent
        Vowel
        Consonant_Placeholder
        Consonant
        Consonant_Dead
        Consonant_Succeeding_Repha
        Consonant_Subjoined
        Consonant_Medial
        Consonant_Final
        Consonant_Head_Letter
        Modifying_Letter
        Tone_Letter
        Tone_Mark
        Register_Shifter
        Consonant_Preceding_Repha
        Pure_Killer
        Invisible_Stacker
        Gemination_Mark
        Cantillation_Mark
        Non_Joiner
        Joiner
        Number_Joiner
        Number
        Brahmi_Joining_Number
        Consonant_With_Stacker
        Consonant_Prefixed
        Syllable_Modifier
        Consonant_Killer
        Consonant_Initial_Postfixed
    );
};
# Numeric codes for the matra position names below.  The codes are
# consecutive and start at 0x01, so list order determines the value.
my %matra_types = do {
    my $next_code = 0x01;
    map { ( $_ => $next_code++ ) } qw(
        Right
        Left
        Visual_Order_Left
        Left_And_Right
        Top
        Bottom
        Top_And_Bottom
        Top_And_Right
        Top_And_Left
        Top_And_Left_And_Right
        Bottom_And_Right
        Top_And_Bottom_And_Right
        Overstruck
        Invisible
        Bottom_And_Left
        Top_And_Bottom_And_Left
    );
};
# Numeric codes for the break class abbreviations below.  The codes are
# consecutive and start at 0x0001 for "BK", so list order determines the value.
my %break_types = do {
    my $next_code = 0x0001;
    map { ( $_ => $next_code++ ) } qw(
        BK CR LF CM SG GL CB SP
        ZW NL WJ JL JV JT H2 H3
        XX OP CL CP QU NS EX SY
        IS PR PO NU AL ID IN HY
        BB BA SA AI B2 HL CJ RI
        EB EM ZWJ
    );
};
# Numeric codes for the vertical type abbreviations R, U, Tr and Tu
# (assigned 0 through 3 in that order).
my %vertical_types;
@vertical_types{ qw(R U Tr Tu) } = ( 0x0000 .. 0x0003 );
# Character type flags for each Unicode general category.  Every category
# gets the "defin" flag; the arguments to $with_defin name the extra %ctype
# flags that are ORed on top of it.
my %categories = do {
    my $with_defin = sub {
        my $bits = $ctype{"defin"};
        $bits |= $ctype{$_} foreach @_;
        return $bits;
    };
    (
        "Lu" => $with_defin->("alpha", "upper"),          # Letter, Uppercase
        "Ll" => $with_defin->("alpha", "lower"),          # Letter, Lowercase
        "Lt" => $with_defin->("alpha", "upper", "lower"), # Letter, Titlecase
        "Mn" => $with_defin->("nonspacing"),              # Mark, Non-Spacing
        "Mc" => $with_defin->(),                          # Mark, Spacing Combining
        "Me" => $with_defin->(),                          # Mark, Enclosing
        "Nd" => $with_defin->("digit"),                   # Number, Decimal Digit
        "Nl" => $with_defin->("alpha"),                   # Number, Letter
        "No" => $with_defin->(),                          # Number, Other
        "Zs" => $with_defin->("space"),                   # Separator, Space
        "Zl" => $with_defin->("space"),                   # Separator, Line
        "Zp" => $with_defin->("space"),                   # Separator, Paragraph
        "Cc" => $with_defin->("cntrl"),                   # Other, Control
        "Cf" => $with_defin->("cntrl"),                   # Other, Format
        "Cs" => $with_defin->(),                          # Other, Surrogate
        "Co" => $with_defin->(),                          # Other, Private Use
        "Cn" => $with_defin->(),                          # Other, Not Assigned
        "Lm" => $with_defin->("alpha"),                   # Letter, Modifier
        "Lo" => $with_defin->("alpha"),                   # Letter, Other
        "Pc" => $with_defin->("punct"),                   # Punctuation, Connector
        "Pd" => $with_defin->("punct"),                   # Punctuation, Dash
        "Ps" => $with_defin->("punct"),                   # Punctuation, Open
        "Pe" => $with_defin->("punct"),                   # Punctuation, Close
        "Pi" => $with_defin->("punct"),                   # Punctuation, Initial quote
        "Pf" => $with_defin->("punct"),                   # Punctuation, Final quote
        "Po" => $with_defin->("punct"),                   # Punctuation, Other
        "Sm" => $with_defin->("symbol"),                  # Symbol, Math
        "Sc" => $with_defin->("symbol"),                  # Symbol, Currency
        "Sk" => $with_defin->("symbol"),                  # Symbol, Modifier
        "So" => $with_defin->("symbol"),                  # Symbol, Other
    );
};
# A few characters need additional categories that cannot be determined
# automatically.  Each key is a %ctype flag name and each value lists the
# code points that get that flag in addition to their computed ones.
# A code point only needs to appear once per list (the "fullwidth" list
# used to contain 0x30ad twice).
my %special_categories =
(
    "xdigit" => [ ord('0')..ord('9'), ord('A')..ord('F'), ord('a')..ord('f'),
                  0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space" => [ 0x09..0x0d, 0x85 ],
    "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl" => [ 0x070f, 0x200c, 0x200d,
                 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                 0xfff9, 0xfffa, 0xfffb ],
    "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
                 0xd7, 0xf7 ],
    "digit" => [ 0xb2, 0xb3, 0xb9 ],
    "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
    "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
                      0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
    "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
    "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
                  0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
                  0x02b9..0x02ba, 0x02c6..0x02cf ],
    "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
    "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
                     0x30a1..0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
                     0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
                     0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
                     0x3131..0x3164 ],
    "ideograph" => [ 0x3006..0x3007 ],
    "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
                   0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
                   0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
                   0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
                   0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
                   0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
                   0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
                   0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
                   0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
                   0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
    "kashida" => [ 0x0640 ],
);
# Direction codes keyed by bidi class abbreviation, listed in numeric order.
my %directions =
(
    "L"   => 1,   # Left-to-Right
    "R"   => 2,   # Right-to-Left
    "EN"  => 3,   # European Number
    "ES"  => 4,   # European Number Separator
    "ET"  => 5,   # European Number Terminator
    "AN"  => 6,   # Arabic Number
    "CS"  => 7,   # Common Number Separator
    "B"   => 8,   # Paragraph Separator
    "S"   => 9,   # Segment Separator
    "WS"  => 10,  # Whitespace
    "ON"  => 11,  # Other Neutrals
    "AL"  => 12,  # Right-to-Left Arabic
    "NSM" => 13,  # Non-Spacing Mark
    "BN"  => 14,  # Boundary Neutral
    # The embedding (LRE, RLE), override (LRO, RLO), pop (PDF, PDI) and
    # isolate (LRI, RLI, FSI) classes all share the value 15.
    ( map { ( $_ => 15 ) } qw(LRE LRO RLE RLO PDF LRI RLI FSI PDI) ),
);
# C2_* type codes keyed by bidi class abbreviation, listed in numeric order.
my %c2_types =
(
    "BN"  => 0,   # C2_NOTAPPLICABLE
    "L"   => 1,   # C2_LEFTTORIGHT
    "R"   => 2,   # C2_RIGHTTOLEFT
    "AL"  => 2,   # C2_RIGHTTOLEFT
    "EN"  => 3,   # C2_EUROPENUMBER
    "ES"  => 4,   # C2_EUROPESEPARATOR
    "ET"  => 5,   # C2_EUROPETERMINATOR
    "AN"  => 6,   # C2_ARABICNUMBER
    "CS"  => 7,   # C2_COMMONSEPARATOR
    "B"   => 8,   # C2_BLOCKSEPARATOR
    "S"   => 9,   # C2_SEGMENTSEPARATOR
    "WS"  => 10,  # C2_WHITESPACE
    # every remaining class is C2_OTHERNEUTRAL
    ( map { ( $_ => 11 ) } qw(NSM ON LRE LRO RLE RLO PDF LRI RLI FSI PDI) ),
);
# Bidi type codes keyed by bidi class abbreviation.  The codes are
# consecutive, starting at 0 for "ON", so list order determines the value.
my %bidi_types = do {
    my $next_code = 0;
    map { ( $_ => $next_code++ ) } (
        "ON",   # Other Neutrals
        "L",    # Left-to-Right
        "R",    # Right-to-Left
        "AN",   # Arabic Number
        "EN",   # European Number
        "AL",   # Right-to-Left Arabic
        "NSM",  # Non-Spacing Mark
        "CS",   # Common Number Separator
        "ES",   # European Number Separator
        "ET",   # European Number Terminator
        "BN",   # Boundary Neutral
        "S",    # Segment Separator
        "WS",   # Whitespace
        "B",    # Paragraph Separator
        "RLO",  # Right-to-Left Override
        "RLE",  # Right-to-Left Embedding
        "LRO",  # Left-to-Right Override
        "LRE",  # Left-to-Right Embedding
        "PDF",  # Pop Directional Format
        "LRI",  # Left-to-Right Isolate
        "RLI",  # Right-to-Left Isolate
        "FSI",  # First Strong Isolate
        "PDI",  # Pop Directional Isolate
    );
};
# Joining type codes.  Keys and values are paired by position:
#   U = Non_Joining, L = Left_Joining, R = Right_Joining, D = Dual_Joining,
#   C = Join_Causing (same code as D), ALAPH = Syriac ALAPH,
#   DALATH RISH = Syriac DALATH RISH group, T = Transparent
my %joining_types;
@joining_types{ "U", "L", "R", "D", "C", "ALAPH", "DALATH RISH", "T" } =
              (  0,   1,   2,   3,   3,   4,       5,             6  );
442 my @locales =
444 { name => "", lcid => 0x0000007f, file => "root", territory => "IV", sabbrevlangname => "IVL", sopentypelang =>"dflt" },
445 { name => "aa", dir => "seed", sopentypelang => "AFR" },
446 { name => "aa-DJ", dir => "seed" },
447 { name => "aa-ER", dir => "seed" },
448 { name => "aa-ET", dir => "seed" },
449 { name => "af", lcid => 0x00000036, oemcp => 850, sabbrevlangname => "AFK", sopentypelang => "AFK" },
450 { name => "af-NA" },
451 { name => "af-ZA", lcid => 0x00000436 },
452 { name => "agq" },
453 { name => "agq-CM" },
454 { name => "ak", sopentypelang => "TWI" },
455 { name => "ak-GH" },
456 { name => "am", lcid => 0x0000005e, sabbrevlangname => "AMH" },
457 { name => "am-ET", lcid => 0x0000045e },
458 { name => "ar", lcid => 0x00000001, territory => "SA", oemcp => 720, group => 13 },
459 { name => "ar-001" },
460 { name => "ar-AE", lcid => 0x00003801, sabbrevlangname => "ARU" },
461 { name => "ar-BH", lcid => 0x00003c01, sabbrevlangname => "ARH" },
462 { name => "ar-DJ" },
463 { name => "ar-DZ", lcid => 0x00001401, sabbrevlangname => "ARG" },
464 { name => "ar-EG", lcid => 0x00000c01, sabbrevlangname => "ARE" },
465 { name => "ar-EH" },
466 { name => "ar-ER" },
467 { name => "ar-IL" },
468 { name => "ar-IQ", lcid => 0x00000801, sabbrevlangname => "ARI" },
469 { name => "ar-JO", lcid => 0x00002c01, sabbrevlangname => "ARJ" },
470 { name => "ar-KM" },
471 { name => "ar-KW", lcid => 0x00003401, sabbrevlangname => "ARK" },
472 { name => "ar-LB", lcid => 0x00003001, sabbrevlangname => "ARB" },
473 { name => "ar-LY", lcid => 0x00001001, sabbrevlangname => "ARL" },
474 { name => "ar-MA", lcid => 0x00001801, sabbrevlangname => "ARM" },
475 { name => "ar-MR" },
476 { name => "ar-OM", lcid => 0x00002001, sabbrevlangname => "ARO" },
477 { name => "ar-PS" },
478 { name => "ar-QA", lcid => 0x00004001, sabbrevlangname => "ARQ" },
479 { name => "ar-SA", lcid => 0x00000401, sabbrevlangname => "ARA" },
480 { name => "ar-SD" },
481 { name => "ar-SO" },
482 { name => "ar-SS" },
483 { name => "ar-SY", lcid => 0x00002801, sabbrevlangname => "ARS" },
484 { name => "ar-TD" },
485 { name => "ar-TN", lcid => 0x00001c01, sabbrevlangname => "ART" },
486 { name => "ar-YE", lcid => 0x00002401, sabbrevlangname => "ARY" },
487 { name => "arn", lcid => 0x0000007a, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sabbrevlangname => "MPD", sopentypelang => "MAP" },
488 { name => "arn-CL", lcid => 0x0000047a, dir => "seed" },
489 { name => "arn-Latn", alias => "arn" },
490 { name => "arn-Latn-CL", alias => "arn-CL" },
491 { name => "as", lcid => 0x0000004d, slist => ",", group => 15 },
492 { name => "as-IN", lcid => 0x0000044d },
493 { name => "asa" },
494 { name => "asa-TZ" },
495 { name => "ast" },
496 { name => "ast-ES" },
497 { name => "az", lcid => 0x0000002c, oemcp => 857, ebcdiccp => 20905, group => 2 },
498 { name => "az-Cyrl", lcid => 0x0000742c, oemcp => 866, ebcdiccp => 20880, group => 5, sabbrevlangname => "AZC" },
499 { name => "az-Cyrl-AZ", lcid => 0x0000082c },
500 { name => "az-Latn", lcid => 0x0000782c },
501 { name => "az-Latn-AZ", lcid => 0x0000042c },
502 { name => "ba", lcid => 0x0000006d, oemcp => 866, group => 5, dir => "seed", sabbrevlangname => "BAS", sopentypelang => "BSH" },
503 { name => "ba-Cyrl", alias => "ba" },
504 { name => "ba-Cyrl-RU", alias => "ba-RU" },
505 { name => "ba-RU", lcid => 0x0000046d, dir => "seed" },
506 { name => "bas" },
507 { name => "bas-CM" },
508 { name => "be", lcid => 0x00000023, oemcp => 866, ebcdiccp => 500, group => 5 },
509 { name => "be-BY", lcid => 0x00000423 },
510 { name => "bem" },
511 { name => "bem-ZM" },
512 { name => "bez" },
513 { name => "bez-TZ" },
514 { name => "bg", lcid => 0x00000002, oemcp => 866, ebcdiccp => 21025, group => 5, sabbrevlangname => "BGR", sopentypelang => "BGR" },
515 { name => "bg-BG", lcid => 0x00000402 },
516 { name => "bin", lcid => 0x00000066, oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "EDO" },
517 { name => "bin-NG", lcid => 0x00000466, file => "bin", dir => "exemplars" },
518 { name => "bm", sopentypelang => "BMB" },
519 { name => "bm-Latn", file => "bm" },
520 { name => "bm-Latn-ML", file => "bm_ML" },
521 { name => "bm-ML", alias => "bm-Latn-ML" },
522 { name => "bn", lcid => 0x00000045, slist => ",", group => 15, sabbrevlangname => "BNB" },
523 { name => "bn-BD", lcid => 0x00000845 },
524 { name => "bn-IN", lcid => 0x00000445, sabbrevlangname => "BNG" },
525 { name => "bo", lcid => 0x00000051, slist => ",", group => 15, sabbrevlangname => "BOB", sopentypelang => "TIB" },
526 { name => "bo-CN", lcid => 0x00000451 },
527 { name => "bo-IN", slist => "," },
528 { name => "bo-Tibt", alias => "bo" },
529 { name => "bo-Tibt-CN", alias => "bo-CN" },
530 { name => "bo-Tibt-IN", alias => "bo-IN" },
531 { name => "br", lcid => 0x0000007e, oemcp => 850, ebcdiccp => 20297 },
532 { name => "br-FR", lcid => 0x0000047e },
533 { name => "br-Latn", alias => "br" },
534 { name => "br-Latn-FR", alias => "br-FR" },
535 { name => "brx" },
536 { name => "brx-IN" },
537 { name => "bs", lcid => 0x0000781a, oemcp => 852, maccp => 10082, ebcdiccp => 870, group => 2, sabbrevlangname => "BSB" },
538 { name => "bs-Cyrl", lcid => 0x0000641a, oemcp => 855, group => 5, sabbrevlangname => "BSC" },
539 { name => "bs-Cyrl-BA", lcid => 0x0000201a },
540 { name => "bs-Latn", lcid => 0x0000681a },
541 { name => "bs-Latn-BA", lcid => 0x0000141a },
542 { name => "byn", dir => "seed", sopentypelang => "BIL" },
543 { name => "byn-ER", dir => "seed" },
544 { name => "ca", lcid => 0x00000003, oemcp => 850 },
545 { name => "ca-AD", maccp => 65001 },
546 { name => "ca-ES", lcid => 0x00000403 },
547 { name => "ca-ES-valencia", lcid => 0x00000803, file => "ca_ES_VALENCIA", sabbrevlangname => "VAL" },
548 { name => "ca-FR", maccp => 65001 },
549 { name => "ca-IT", maccp => 65001 },
550 { name => "ccp" },
551 { name => "ccp-BD", alias => "ccp-Cakm-BD" },
552 { name => "ccp-Cakm", file => "ccp" },
553 { name => "ccp-Cakm-BD", file => "ccp_BD" },
554 { name => "ccp-Cakm-IN", file => "ccp_IN" },
555 { name => "ccp-IN", alias => "ccp-Cakm-IN" },
556 { name => "ce" },
557 { name => "ce-RU" },
558 { name => "ceb" },
559 { name => "ceb-Latn", file => "ceb" },
560 { name => "ceb-Latn-PH", file => "ceb_PH" },
561 { name => "ceb-PH", alias => "ceb-Latn-PH" },
562 { name => "cgg" },
563 { name => "cgg-UG" },
564 { name => "chr", lcid => 0x0000005c, slist => ",", sabbrevlangname => "CRE" },
565 { name => "chr-Cher", lcid => 0x00007c5c, file => "chr" },
566 { name => "chr-Cher-US", lcid => 0x0000045c, file => "chr_US" },
567 { name => "chr-US", alias => "chr-Cher-US" },
568 { name => "ckb", alias => "ku" },
569 { name => "ckb-IQ", alias => "ku-Arab-IQ" },
570 { name => "ckb-IR", alias => "ku-Arab-IR" },
571 { name => "co", lcid => 0x00000083, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
572 { name => "co-FR", lcid => 0x00000483, dir => "seed" },
573 { name => "co-Latn", alias => "co" },
574 { name => "co-Latn-FR", alias => "co-FR" },
575 { name => "cs", lcid => 0x00000005, oemcp => 852, group => 2, sabbrevlangname => "CSY", sopentypelang => "CSY" },
576 { name => "cs-CZ", lcid => 0x00000405 },
577 { name => "cu", dir => "seed", sopentypelang => "CSL" },
578 { name => "cu-RU", dir => "seed" },
579 { name => "cy", lcid => 0x00000052, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "CYM", sopentypelang => "WEL" },
580 { name => "cy-GB", lcid => 0x00000452 },
581 { name => "da", lcid => 0x00000006, oemcp => 850, ebcdiccp => 20277 },
582 { name => "da-DK", lcid => 0x00000406 },
583 { name => "da-GL", maccp => 65001 },
584 { name => "dav" },
585 { name => "dav-KE" },
586 { name => "de", lcid => 0x00000007, oemcp => 850, ebcdiccp => 20273 },
587 { name => "de-AT", lcid => 0x00000c07, sabbrevlangname => "DEA" },
588 { name => "de-BE" },
589 { name => "de-CH", lcid => 0x00000807, sabbrevlangname => "DES" },
590 { name => "de-DE", lcid => 0x00000407 },
591 { name => "de-DE_phoneb", lcid => 0x00010407, alias => "de-DE" },
592 { name => "de-DE-u-co-phonebk", alias => "de-DE_phoneb" },
593 { name => "de-IT", oemcp => 65001 },
594 { name => "de-LI", lcid => 0x00001407, sabbrevlangname => "DEC" },
595 { name => "de-LU", lcid => 0x00001007, sabbrevlangname => "DEL" },
596 { name => "dje", sopentypelang => "DJR" },
597 { name => "dje-NE" },
598 { name => "doi" },
599 { name => "doi-IN" },
600 { name => "dsb", lcid => 0x00007c2e, sparent => "hsb", oemcp => 850, ebcdiccp => 870, sabbrevlangname => "DSB", sopentypelang => "LSB" },
601 { name => "dsb-DE", lcid => 0x0000082e },
602 { name => "dua" },
603 { name => "dua-CM" },
604 { name => "dv", lcid => 0x00000065, slist => "\x{060c}", group => 13, dir => "seed" },
605 { name => "dv-MV", lcid => 0x00000465, dir => "seed" },
606 { name => "dyo" },
607 { name => "dyo-SN" },
608 { name => "dz", sopentypelang => "DZN" },
609 { name => "dz-BT", lcid => 0x00000c51, sabbrevlangname => "ZZZ" },
610 { name => "ebu" },
611 { name => "ebu-KE" },
612 { name => "ee" },
613 { name => "ee-GH" },
614 { name => "ee-TG" },
615 { name => "el", lcid => 0x00000008, oemcp => 737, group => 4 },
616 { name => "el-CY" },
617 { name => "el-GR", lcid => 0x00000408 },
618 { name => "en", lcid => 0x00000009, oemcp => 437, slist => ",", sabbrevlangname => "ENU" },
619 { name => "en-001", oemcp => 850 },
620 { name => "en-029", lcid => 0x00002409, file => "en", oemcp => 850, sabbrevlangname => "ENB" },
621 { name => "en-150", oemcp => 65001 },
622 { name => "en-AE", lcid => 0x00004c09, oemcp => 65001, sabbrevlangname => "ZZZ" },
623 { name => "en-AG", oemcp => 850 },
624 { name => "en-AI", oemcp => 850 },
625 { name => "en-AS", oemcp => 850 },
626 { name => "en-AT", oemcp => 65001 },
627 { name => "en-AU", lcid => 0x00000c09, oemcp => 850, sabbrevlangname => "ENA" },
628 { name => "en-BB", oemcp => 850 },
629 { name => "en-BE", oemcp => 850 },
630 { name => "en-BI", oemcp => 65001 },
631 { name => "en-BM", oemcp => 850 },
632 { name => "en-BS", oemcp => 850 },
633 { name => "en-BW", oemcp => 850 },
634 { name => "en-BZ", lcid => 0x00002809, oemcp => 850, sabbrevlangname => "ENL" },
635 { name => "en-CA", lcid => 0x00001009, oemcp => 850, ebcdiccp => 37, sabbrevlangname => "ENC" },
636 { name => "en-CC", oemcp => 850 },
637 { name => "en-CH", oemcp => 65001 },
638 { name => "en-CK", oemcp => 850 },
639 { name => "en-CM", oemcp => 850 },
640 { name => "en-CX", oemcp => 850 },
641 { name => "en-CY", oemcp => 65001 },
642 { name => "en-DE", oemcp => 65001 },
643 { name => "en-DG", oemcp => 850 },
644 { name => "en-DK", oemcp => 65001 },
645 { name => "en-DM", oemcp => 850 },
646 { name => "en-ER", oemcp => 850 },
647 { name => "en-FI", oemcp => 65001 },
648 { name => "en-FJ", oemcp => 850 },
649 { name => "en-FK", oemcp => 850 },
650 { name => "en-FM", oemcp => 850 },
651 { name => "en-GB", lcid => 0x00000809, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "ENG" },
652 { name => "en-GD", oemcp => 850 },
653 { name => "en-GG", oemcp => 850 },
654 { name => "en-GH", oemcp => 850 },
655 { name => "en-GI", oemcp => 850 },
656 { name => "en-GM", oemcp => 850 },
657 { name => "en-GU", oemcp => 850 },
658 { name => "en-GY", oemcp => 850 },
659 { name => "en-HK", lcid => 0x00003c09, oemcp => 850, sabbrevlangname => "ENH" },
660 { name => "en-ID", lcid => 0x00003809, file => "en", oemcp => 850, sabbrevlangname => "ZZZ" },
661 { name => "en-IE", lcid => 0x00001809, oemcp => 850, sabbrevlangname => "ENI" },
662 { name => "en-IL", oemcp => 65001 },
663 { name => "en-IM", oemcp => 850 },
664 { name => "en-IN", lcid => 0x00004009, sabbrevlangname => "ENN" },
665 { name => "en-IO", oemcp => 850 },
666 { name => "en-JE", oemcp => 850 },
667 { name => "en-JM", lcid => 0x00002009, oemcp => 850, sabbrevlangname => "ENJ" },
668 { name => "en-KE", oemcp => 850 },
669 { name => "en-KI", oemcp => 850 },
670 { name => "en-KN", oemcp => 850 },
671 { name => "en-KY", oemcp => 850 },
672 { name => "en-LC", oemcp => 850 },
673 { name => "en-LR", oemcp => 850 },
674 { name => "en-LS", oemcp => 850 },
675 { name => "en-MG", oemcp => 850 },
676 { name => "en-MH", oemcp => 850 },
677 { name => "en-MO", oemcp => 850 },
678 { name => "en-MP", oemcp => 850 },
679 { name => "en-MS", oemcp => 850 },
680 { name => "en-MT", oemcp => 850 },
681 { name => "en-MU", oemcp => 850 },
682 { name => "en-MW", oemcp => 850 },
683 { name => "en-MY", lcid => 0x00004409, sabbrevlangname => "ENM" },
684 { name => "en-NA", oemcp => 850 },
685 { name => "en-NF", oemcp => 850 },
686 { name => "en-NG", oemcp => 850 },
687 { name => "en-NL", oemcp => 65001 },
688 { name => "en-NR", oemcp => 850 },
689 { name => "en-NU", oemcp => 850 },
690 { name => "en-NZ", lcid => 0x00001409, oemcp => 850, sabbrevlangname => "ENZ" },
691 { name => "en-PG", oemcp => 850 },
692 { name => "en-PH", lcid => 0x00003409, ebcdiccp => 500, sabbrevlangname => "ENP" },
693 { name => "en-PK", oemcp => 850 },
694 { name => "en-PN", oemcp => 850 },
695 { name => "en-PR", oemcp => 850 },
696 { name => "en-PW", oemcp => 850 },
697 { name => "en-RW", oemcp => 850 },
698 { name => "en-SB", oemcp => 850 },
699 { name => "en-SC", oemcp => 850 },
700 { name => "en-SD", oemcp => 850 },
701 { name => "en-SE", oemcp => 65001 },
702 { name => "en-SG", lcid => 0x00004809, sabbrevlangname => "ENE" },
703 { name => "en-SH", oemcp => 850 },
704 { name => "en-SI", oemcp => 65001 },
705 { name => "en-SL", oemcp => 850 },
706 { name => "en-SS", oemcp => 850 },
707 { name => "en-SX", oemcp => 850 },
708 { name => "en-SZ", oemcp => 850 },
709 { name => "en-TC", oemcp => 850 },
710 { name => "en-TK", oemcp => 850 },
711 { name => "en-TO", oemcp => 850 },
712 { name => "en-TT", lcid => 0x00002c09, oemcp => 850, sabbrevlangname => "ENT" },
713 { name => "en-TV", oemcp => 850 },
714 { name => "en-TZ", oemcp => 850 },
715 { name => "en-UG", oemcp => 850 },
716 { name => "en-UM", oemcp => 850 },
717 { name => "en-US", lcid => 0x00000409 },
718 { name => "en-VC", oemcp => 850 },
719 { name => "en-VG", oemcp => 850 },
720 { name => "en-VI", oemcp => 850 },
721 { name => "en-VU", oemcp => 850 },
722 { name => "en-WS", oemcp => 850 },
723 { name => "en-ZA", lcid => 0x00001c09, ebcdiccp => 500, sabbrevlangname => "ENS" },
724 { name => "en-ZM", oemcp => 850 },
725 { name => "en-ZW", lcid => 0x00003009, ebcdiccp => 500, sabbrevlangname => "ENW" },
726 { name => "eo", sopentypelang => "NTO" },
727 { name => "eo-001" },
728 { name => "es", lcid => 0x0000000a, oemcp => 850, ebcdiccp => 20284, sabbrevlangname => "ESP", sopentypelang => "ESP" },
729 { name => "es-419", lcid => 0x0000580a, sabbrevlangname => "ESJ" },
730 { name => "es-AR", lcid => 0x00002c0a, sabbrevlangname => "ESS" },
731 { name => "es-BO", lcid => 0x0000400a, sabbrevlangname => "ESB" },
732 { name => "es-BR", oemcp => 65001 },
733 { name => "es-BZ", oemcp => 65001 },
734 { name => "es-CL", lcid => 0x0000340a, sabbrevlangname => "ESL" },
735 { name => "es-CO", lcid => 0x0000240a, sabbrevlangname => "ESO" },
736 { name => "es-CR", lcid => 0x0000140a, sabbrevlangname => "ESC" },
737 { name => "es-CU", lcid => 0x00005c0a, sabbrevlangname => "ESK" },
738 { name => "es-DO", lcid => 0x00001c0a, sabbrevlangname => "ESD" },
739 { name => "es-EA" },
740 { name => "es-EC", lcid => 0x0000300a, sabbrevlangname => "ESF" },
741 { name => "es-ES", lcid => 0x00000c0a, sabbrevlangname => "ESN" },
742 { name => "es-ES_tradnl", lcid => 0x0000040a, file => "es_ES" },
743 { name => "es-ES-u-co-trad", alias => "es-ES_tradnl" },
744 { name => "es-GQ" },
745 { name => "es-GT", lcid => 0x0000100a, sabbrevlangname => "ESG" },
746 { name => "es-HN", lcid => 0x0000480a, sabbrevlangname => "ESH" },
747 { name => "es-IC" },
748 { name => "es-MX", lcid => 0x0000080a, sabbrevlangname => "ESM" },
749 { name => "es-NI", lcid => 0x00004c0a, sabbrevlangname => "ESI" },
750 { name => "es-PA", lcid => 0x0000180a, sabbrevlangname => "ESA" },
751 { name => "es-PE", lcid => 0x0000280a, sabbrevlangname => "ESR" },
752 { name => "es-PH" },
753 { name => "es-PR", lcid => 0x0000500a, sabbrevlangname => "ESU" },
754 { name => "es-PY", lcid => 0x00003c0a, sabbrevlangname => "ESZ" },
755 { name => "es-SV", lcid => 0x0000440a, sabbrevlangname => "ESE" },
756 { name => "es-US", lcid => 0x0000540a, sabbrevlangname => "EST" },
757 { name => "es-UY", lcid => 0x0000380a, sabbrevlangname => "ESY" },
758 { name => "es-VE", lcid => 0x0000200a, sabbrevlangname => "ESV" },
759 { name => "et", lcid => 0x00000025, oemcp => 775, group => 3, sabbrevlangname => "ETI", sopentypelang => "ETI" },
760 { name => "et-EE", lcid => 0x00000425 },
761 { name => "eu", lcid => 0x0000002d, oemcp => 850, maccp => 65001, sabbrevlangname => "EUQ", sopentypelang => "EUQ" },
762 { name => "eu-ES", lcid => 0x0000042d },
763 { name => "ewo" },
764 { name => "ewo-CM" },
765 { name => "fa", lcid => 0x00000029, inegnumber => 3, oemcp => 720, slist => "\x{061b}", group => 13, sabbrevlangname => "FAR", sopentypelang => "FAR" },
766 { name => "fa-AF", alias => "prs-AF" },
767 { name => "fa-IR", lcid => 0x00000429 },
768 { name => "ff", lcid => 0x00000067, oemcp => 850, ebcdiccp => 20297 },
769 { name => "ff-CM", alias => "ff-Latn-CM" },
770 { name => "ff-GN", alias => "ff-Latn-GN" },
771 { name => "ff-MR", alias => "ff-Latn-MR" },
772 { name => "ff-NG", alias => "ff-Latn-NG" },
773 { name => "ff-SN", alias => "ff-Latn-SN" },
774 { name => "ff-Adlm" },
775 { name => "ff-Adlm-BF" },
776 { name => "ff-Adlm-CM" },
777 { name => "ff-Adlm-GH" },
778 { name => "ff-Adlm-GM" },
779 { name => "ff-Adlm-GN" },
780 { name => "ff-Adlm-GW" },
781 { name => "ff-Adlm-LR" },
782 { name => "ff-Adlm-MR" },
783 { name => "ff-Adlm-NE" },
784 { name => "ff-Adlm-NG" },
785 { name => "ff-Adlm-SL" },
786 { name => "ff-Adlm-SN" },
787 { name => "ff-Latn", lcid => 0x00007c67 },
788 { name => "ff-Latn-BF", oemcp => 65001 },
789 { name => "ff-Latn-CM" },
790 { name => "ff-Latn-GH", oemcp => 65001 },
791 { name => "ff-Latn-GM", oemcp => 65001 },
792 { name => "ff-Latn-GN" },
793 { name => "ff-Latn-GW", oemcp => 65001 },
794 { name => "ff-Latn-LR", oemcp => 65001 },
795 { name => "ff-Latn-MR" },
796 { name => "ff-Latn-NE", oemcp => 65001 },
797 { name => "ff-Latn-NG", lcid => 0x00000467, sabbrevlangname => "ZZZ" },
798 { name => "ff-Latn-SL", oemcp => 65001 },
799 { name => "ff-Latn-SN", lcid => 0x00000867 },
800 { name => "fi", lcid => 0x0000000b, oemcp => 850, ebcdiccp => 20278 },
801 { name => "fi-FI", lcid => 0x0000040b },
802 { name => "fil", lcid => 0x00000064, oemcp => 437, ebcdiccp => 500, sabbrevlangname => "FPO", sopentypelang => "PIL" },
803 { name => "fil-PH", lcid => 0x00000464 },
804 { name => "fil-Latn", alias => "fil" },
805 { name => "fil-Latn-PH", alias => "fil-PH" },
806 { name => "fo", lcid => 0x00000038, oemcp => 850, maccp => 10079, ebcdiccp => 20277, sabbrevlangname => "FOS", sopentypelang => "FOS" },
807 { name => "fo-DK", oemcp => 65001, maccp => 65001 },
808 { name => "fo-FO", lcid => 0x00000438 },
809 { name => "fr", lcid => 0x0000000c, oemcp => 850, ebcdiccp => 20297 },
810 { name => "fr-029", lcid => 0x00001c0c, file => "fr", sabbrevlangname => "ZZZ" },
811 { name => "fr-BE", lcid => 0x0000080c, sabbrevlangname => "FRB" },
812 { name => "fr-BF" },
813 { name => "fr-BI" },
814 { name => "fr-BJ" },
815 { name => "fr-BL" },
816 { name => "fr-CA", lcid => 0x00000c0c, sabbrevlangname => "FRC" },
817 { name => "fr-CD", lcid => 0x0000240c, sabbrevlangname => "FRD" },
818 { name => "fr-CF" },
819 { name => "fr-CG" },
820 { name => "fr-CH", lcid => 0x0000100c, sabbrevlangname => "FRS" },
821 { name => "fr-CI", lcid => 0x0000300c, sabbrevlangname => "FRI" },
822 { name => "fr-CM", lcid => 0x00002c0c, sabbrevlangname => "FRE" },
823 { name => "fr-DJ" },
824 { name => "fr-DZ" },
825 { name => "fr-FR", lcid => 0x0000040c },
826 { name => "fr-GA" },
827 { name => "fr-GF" },
828 { name => "fr-GN" },
829 { name => "fr-GP" },
830 { name => "fr-GQ" },
831 { name => "fr-HT", lcid => 0x00003c0c, sabbrevlangname => "FRH" },
832 { name => "fr-KM" },
833 { name => "fr-LU", lcid => 0x0000140c, sabbrevlangname => "FRL" },
834 { name => "fr-MA", lcid => 0x0000380c, sabbrevlangname => "FRO" },
835 { name => "fr-MC", lcid => 0x0000180c, sabbrevlangname => "FRM" },
836 { name => "fr-MF" },
837 { name => "fr-MG" },
838 { name => "fr-ML", lcid => 0x0000340c, sabbrevlangname => "FRF" },
839 { name => "fr-MQ" },
840 { name => "fr-MR" },
841 { name => "fr-MU" },
842 { name => "fr-NC" },
843 { name => "fr-NE" },
844 { name => "fr-PF" },
845 { name => "fr-PM" },
846 { name => "fr-RE", lcid => 0x0000200c, sabbrevlangname => "FRR" },
847 { name => "fr-RW" },
848 { name => "fr-SC" },
849 { name => "fr-SN", lcid => 0x0000280c, sabbrevlangname => "FRN" },
850 { name => "fr-SY" },
851 { name => "fr-TD" },
852 { name => "fr-TG" },
853 { name => "fr-TN" },
854 { name => "fr-VU" },
855 { name => "fr-WF" },
856 { name => "fr-YT" },
857 { name => "fur", sopentypelang => "FRL" },
858 { name => "fur-IT" },
859 { name => "fy", lcid => 0x00000062, oemcp => 850, sabbrevlangname => "FYN", sopentypelang => "FRI" },
860 { name => "fy-NL", lcid => 0x00000462 },
861 { name => "ga", lcid => 0x0000003c, oemcp => 850, sabbrevlangname => "IRE", sopentypelang => "IRI" },
862 { name => "ga-GB" },
863 { name => "ga-IE", lcid => 0x0000083c },
864 { name => "gd", lcid => 0x00000091, oemcp => 850, ebcdiccp => 20285, sopentypelang => "GAE" },
865 { name => "gd-GB", lcid => 0x00000491 },
866 { name => "gd-Latn", alias => "gd" },
867 { name => "gl", lcid => 0x00000056, oemcp => 850, sabbrevlangname => "GLC", sopentypelang => "GAL" },
868 { name => "gl-ES", lcid => 0x00000456 },
869 { name => "gn", lcid => 0x00000074, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sopentypelang => "GUA" },
870 { name => "gn-PY", lcid => 0x00000474, dir => "seed" },
871 { name => "gsw", lcid => 0x00000084, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "ZZZ", sopentypelang => "ALS" },
872 { name => "gsw-CH" },
873 { name => "gsw-FR", lcid => 0x00000484, sabbrevlangname => "GSW" },
874 { name => "gsw-LI" },
875 { name => "gu", lcid => 0x00000047, slist => ",", group => 15 },
876 { name => "gu-IN", lcid => 0x00000447 },
877 { name => "guz" },
878 { name => "guz-KE" },
879 { name => "gv", sopentypelang => "MNX" },
880 { name => "gv-IM" },
881 { name => "ha", lcid => 0x00000068, oemcp => 437 },
882 { name => "ha-GH", alias => "ha-Latn-GH" },
883 { name => "ha-Latn", lcid => 0x00007c68, file => "ha" },
884 { name => "ha-Latn-GH", file => "ha_GH", ebcdiccp => 500 },
885 { name => "ha-Latn-NE", file => "ha_NE", ebcdiccp => 500 },
886 { name => "ha-Latn-NG", lcid => 0x00000468, file => "ha_NG" },
887 { name => "ha-NE", alias => "ha-Latn-NE" },
888 { name => "ha-NG", alias => "ha-Latn-NG" },
889 { name => "haw", lcid => 0x00000075, oemcp => 437 },
890 { name => "haw-Latn", alias => "haw" },
891 { name => "haw-Latn-US", alias => "haw-US" },
892 { name => "haw-US", lcid => 0x00000475 },
893 { name => "he", lcid => 0x0000000d, oemcp => 862, slist => ",", group => 12, sopentypelang => "IWR" },
894 { name => "he-IL", lcid => 0x0000040d },
895 { name => "hi", lcid => 0x00000039, slist => ",", group => 15 },
896 { name => "hi-IN", lcid => 0x00000439 },
897 { name => "hr", lcid => 0x0000001a, inegnumber => 2, oemcp => 852, maccp => 10082, group => 2 },
898 { name => "hr-BA", lcid => 0x0000101a, ebcdiccp => 870, inegnumber => 1, sabbrevlangname => "HRB" },
899 { name => "hr-HR", lcid => 0x0000041a },
900 { name => "hsb", lcid => 0x0000002e, oemcp => 850, ebcdiccp => 870, sopentypelang => "USB" },
901 { name => "hsb-DE", lcid => 0x0000042e },
902 { name => "hu", lcid => 0x0000000e, oemcp => 852, group => 2 },
903 { name => "hu-HU", lcid => 0x0000040e },
904 { name => "hu-HU_technl", lcid => 0x0001040e, alias => "hu-HU" },
905 { name => "hy", lcid => 0x0000002b, slist => ",", group => 17 },
906 { name => "hy-AM", lcid => 0x0000042b },
907 { name => "ia" },
908 { name => "ia-001" },
909 ## name => "ibb", lcid => 0x00000069 },
910 ## name => "ibb-NG", lcid => 0x00000469 },
911 { name => "id", lcid => 0x00000021, oemcp => 850 },
912 { name => "id-ID", lcid => 0x00000421 },
913 { name => "ig", lcid => 0x00000070, oemcp => 437 },
914 { name => "ig-Latn", alias => "ig" },
915 { name => "ig-Latn-NG", alias => "ig-NG" },
916 { name => "ig-NG", lcid => 0x00000470 },
917 { name => "ii", lcid => 0x00000078, group => 9, sopentypelang => "YIM" },
918 { name => "ii-CN", lcid => 0x00000478 },
919 { name => "ii-Yiii", alias => "ii" },
920 { name => "ii-Yiii-CN", alias => "ii-CN" },
921 { name => "is", lcid => 0x0000000f, oemcp => 850, maccp => 10079, ebcdiccp => 20871 },
922 { name => "is-IS", lcid => 0x0000040f },
923 { name => "it", lcid => 0x00000010, oemcp => 850, ebcdiccp => 20280 },
924 { name => "it-CH", lcid => 0x00000810, ebcdiccp => 500, sabbrevlangname => "ITS" },
925 { name => "it-IT", lcid => 0x00000410 },
926 { name => "it-SM" },
927 { name => "it-VA", oemcp => 65001 },
928 { name => "iu", lcid => 0x0000005d, oemcp => 437, slist => ",", sortlocale => "iu-Latn-CA", dir => "seed", sabbrevlangname => "IUK", sopentypelang => "INU" },
929 { name => "iu-Cans", lcid => 0x0000785d, file => "iu", oemcp => 65001, dir => "seed", sabbrevlangname => "IUS" },
930 { name => "iu-Cans-CA", lcid => 0x0000045d, file => "iu_CA", dir => "seed" },
931 { name => "iu-Latn", lcid => 0x00007c5d, dir => "seed" },
932 { name => "iu-Latn-CA", lcid => 0x0000085d, dir => "seed" },
933 { name => "ja", lcid => 0x00000011, ireadinglayout => 2, oemcp => 932, slist => ",", sscripts => "Hani Hira Jpan Kana", group => 7, sopentypelang => "JAN" },
934 { name => "ja-JP", lcid => 0x00000411 },
935 { name => "ja-JP_radstr", lcid => 0x00040411, alias => "ja-JP" },
936 { name => "ja-JP-u-co-unihan", alias => "ja-JP_radstr" },
937 { name => "jgo" },
938 { name => "jgo-CM" },
939 { name => "jmc" },
940 { name => "jmc-TZ" },
941 { name => "jv", oemcp => 850 },
942 { name => "jv-ID", alias => "jv-Latn-ID" },
943 ## name => "jv-Java" },
944 ## name => "jv-Java-ID" },
945 { name => "jv-Latn", file => "jv" },
946 { name => "jv-Latn-ID", file => "jv_ID" },
947 { name => "ka", lcid => 0x00000037, group => 16 },
948 { name => "ka-GE", lcid => 0x00000437 },
949 { name => "ka-GE_modern", lcid => 0x00010437, alias => "ka-GE" },
950 { name => "kab", sopentypelang => "KAB0" },
951 { name => "kab-DZ" },
952 { name => "kam", sopentypelang => "KMB" },
953 { name => "kam-KE" },
954 { name => "kde" },
955 { name => "kde-TZ" },
956 { name => "kea" },
957 { name => "kea-CV" },
958 { name => "kgp" },
959 { name => "kgp-BR" },
960 { name => "khq" },
961 { name => "khq-ML" },
962 { name => "ki" },
963 { name => "ki-KE" },
964 { name => "kk", lcid => 0x0000003f, group => 5, sabbrevlangname => "KKZ" },
965 { name => "kk-Cyrl", alias => "kk" },
966 { name => "kk-Cyrl-KZ", alias => "kk-KZ" },
967 { name => "kk-KZ", lcid => 0x0000043f },
968 { name => "kkj" },
969 { name => "kkj-CM" },
970 { name => "kl", lcid => 0x0000006f, oemcp => 850, ebcdiccp => 20277, sopentypelang => "GRN" },
971 { name => "kl-GL", lcid => 0x0000046f },
972 { name => "kln", sopentypelang => "KAL" },
973 { name => "kln-KE" },
974 { name => "km", lcid => 0x00000053, inegnumber => 2, slist => ",", group => 15 },
975 { name => "km-KH", lcid => 0x00000453 },
976 { name => "kn", lcid => 0x0000004b, slist => ",", group => 15, sabbrevlangname => "KDI" },
977 { name => "kn-IN", lcid => 0x0000044b },
978 { name => "ko", lcid => 0x00000012, ireadinglayout => 2, slist => ",", oemcp => 949, ebcdiccp => 20833, sscripts => "Hang Hani Kore", group => 8 },
979 { name => "ko-KP", oemcp => 65001 },
980 { name => "ko-KR", lcid => 0x00000412 },
981 { name => "kok", lcid => 0x00000057, slist => ",", group => 15, sabbrevlangname => "KNK" },
982 { name => "kok-IN", lcid => 0x00000457 },
983 { name => "kr", lcid => 0x00000071, sortlocale => "kr-Latn-NG", oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "KNR" },
984 { name => "kr-Latn", file => "kr", dir => "exemplars" },
985 { name => "kr-Latn-NG", lcid => 0x00000471, file => "kr", dir => "exemplars" },
986 { name => "kr-NG", alias => "kr-Latn-NG" },
987 { name => "ks", lcid => 0x00000060, group => 15, sabbrevlangname => "ZZZ", sopentypelang => "KSH" },
988 { name => "ks-Arab", lcid => 0x00000460 },
989 { name => "ks-Arab-IN" },
990 { name => "ks-Deva", slist => "," },
991 { name => "ks-Deva-IN", lcid => 0x00000860 },
992 { name => "ks-IN", alias => "ks-Arab-IN" },
993 { name => "ksb" },
994 { name => "ksb-TZ" },
995 { name => "ksf" },
996 { name => "ksf-CM" },
997 { name => "ksh", sopentypelang => "KSH0" },
998 { name => "ksh-DE" },
999 { name => "ku", lcid => 0x00000092, file => "ckb", slist => "\x{061b}", sortlocale => "ku-Arab-IQ", oemcp => 720 },
1000 { name => "ku-Arab", lcid => 0x00007c92, file => "ckb", group => 13 },
1001 { name => "ku-Arab-IQ", lcid => 0x00000492, file => "ckb_IQ" },
1002 { name => "ku-Arab-IR", file => "ckb_IR", oemcp => 65001 },
1003 { name => "kw" },
1004 { name => "kw-GB" },
1005 { name => "ky", lcid => 0x00000040, oemcp => 866, group => 5, sabbrevlangname => "KYR" },
1006 { name => "ky-Cyrl", alias => "ky" },
1007 { name => "ky-Cyrl-KG", alias => "ky-KG" },
1008 { name => "ky-KG", lcid => 0x00000440 },
1009 { name => "la", lcid => 0x00000076, oemcp => 437, slist => ",", dir => "seed", sabbrevlangname => "ZZZ" },
1010 { name => "la-001", lcid => 0x00000476, file => "la", dir => "seed" },
1011 { name => "lag" },
1012 { name => "lag-TZ" },
1013 { name => "lb", lcid => 0x0000006e, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "LBX" },
1014 { name => "lb-LU", lcid => 0x0000046e },
1015 { name => "lg" },
1016 { name => "lg-UG" },
1017 { name => "lkt" },
1018 { name => "lkt-US" },
1019 { name => "ln" },
1020 { name => "ln-AO" },
1021 { name => "ln-CD" },
1022 { name => "ln-CF" },
1023 { name => "ln-CG" },
1024 { name => "lo", lcid => 0x00000054, group => 15 },
1025 { name => "lo-LA", lcid => 0x00000454 },
1026 { name => "lrc" },
1027 { name => "lrc-IQ" },
1028 { name => "lrc-IR" },
1029 { name => "lt", lcid => 0x00000027, oemcp => 775, group => 3, sabbrevlangname => "LTH", sopentypelang => "LTH" },
1030 { name => "lt-LT", lcid => 0x00000427 },
1031 { name => "lu" },
1032 { name => "lu-CD" },
1033 { name => "luo" },
1034 { name => "luo-KE" },
1035 { name => "luy", sopentypelang => "LUH" },
1036 { name => "luy-KE" },
1037 { name => "lv", lcid => 0x00000026, oemcp => 775, group => 3, sabbrevlangname => "LVI", sopentypelang => "LVI" },
1038 { name => "lv-LV", lcid => 0x00000426 },
1039 { name => "mai" },
1040 { name => "mai-IN" },
1041 { name => "mas" },
1042 { name => "mas-KE" },
1043 { name => "mas-TZ" },
1044 { name => "mer" },
1045 { name => "mer-KE" },
1046 { name => "mfe" },
1047 { name => "mfe-MU" },
1048 { name => "mg" },
1049 { name => "mg-MG" },
1050 { name => "mgh" },
1051 { name => "mgh-MZ" },
1052 { name => "mgo" },
1053 { name => "mgo-CM" },
1054 { name => "mi", lcid => 0x00000081, slist => "," },
1055 { name => "mi-Latn", alias => "mi" },
1056 { name => "mi-Latn-NZ", alias => "mi-NZ" },
1057 { name => "mi-NZ", lcid => 0x00000481 },
1058 { name => "mk", lcid => 0x0000002f, oemcp => 866, ebcdiccp => 500, group => 5, sabbrevlangname => "MKI" },
1059 { name => "mk-MK", lcid => 0x0000042f },
1060 { name => "ml", lcid => 0x0000004c, group => 15, sabbrevlangname => "MYM", sopentypelang => "MLR" },
1061 { name => "ml-IN", lcid => 0x0000044c },
1062 { name => "mn", lcid => 0x00000050, oemcp => 866, sopentypelang => "MNG" },
1063 { name => "mn-Cyrl", lcid => 0x00007850, file => "mn", sabbrevlangname => "MNN" },
1064 { name => "mn-Cyrl-MN", alias => "mn-MN" },
1065 { name => "mn-MN", lcid => 0x00000450, sparent => "mn-Cyrl", group => 5 },
1066 { name => "mn-Mong", lcid => 0x00007c50, oemcp => 65001, slist => ",", group => 15, dir => "seed", sabbrevlangname => "MNG" },
1067 { name => "mn-Mong-CN", lcid => 0x00000850, dir => "seed" },
1068 { name => "mn-Mong-MN", lcid => 0x00000c50, dir => "seed", sabbrevlangname => "MNM" },
1069 { name => "mni", lcid => 0x00000058, slist => ",", sabbrevlangname => "ZZZ" },
1070 { name => "mni-IN", lcid => 0x00000458, file => "mni_Beng_IN" },
1071 { name => "moh", lcid => 0x0000007c, oemcp => 850, ebcdiccp => 37, slist => ",", dir => "seed", sabbrevlangname => "MWK" },
1072 { name => "moh-CA", lcid => 0x0000047c, dir => "seed" },
1073 { name => "moh-Latn", alias => "moh" },
1074 { name => "moh-Latn-CA", alias => "moh-CA" },
1075 { name => "mr", lcid => 0x0000004e, slist => ",", group => 15 },
1076 { name => "mr-IN", lcid => 0x0000044e },
1077 { name => "ms", lcid => 0x0000003e, oemcp => 850, sabbrevlangname => "MSL", sopentypelang => "MLY" },
1078 { name => "ms-BN", lcid => 0x0000083e, sabbrevlangname => "MSB" },
1079 { name => "ms-ID" },
1080 { name => "ms-Latn", alias => "ms" },
1081 { name => "ms-Latn-BN", alias => "ms-BN" },
1082 { name => "ms-Latn-MY", alias => "ms-MY" },
1083 { name => "ms-Latn-SG", alias => "ms-SG" },
1084 { name => "ms-MY", lcid => 0x0000043e },
1085 { name => "ms-SG" },
1086 { name => "mt", lcid => 0x0000003a, sopentypelang => "MTS" },
1087 { name => "mt-MT", lcid => 0x0000043a },
1088 { name => "mua" },
1089 { name => "mua-CM" },
1090 { name => "my", lcid => 0x00000055, sopentypelang => "BRM" },
1091 { name => "my-MM", lcid => 0x00000455 },
1092 { name => "mzn" },
1093 { name => "mzn-IR" },
1094 { name => "naq" },
1095 { name => "naq-NA" },
1096 { name => "nb", lcid => 0x00007c14, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NOR", sopentypelang => "NOR" },
1097 { name => "nb-NO", lcid => 0x00000414 },
1098 { name => "nb-SJ" },
1099 { name => "nd", sopentypelang => "NDB" },
1100 { name => "nd-ZW" },
1101 { name => "nds" },
1102 { name => "nds-DE" },
1103 { name => "nds-NL" },
1104 { name => "ne", lcid => 0x00000061, slist => "," },
1105 { name => "ne-IN", lcid => 0x00000861, sabbrevlangname => "NEI" },
1106 { name => "ne-NP", lcid => 0x00000461, group => 15 },
1107 { name => "nl", lcid => 0x00000013, oemcp => 850 },
1108 { name => "nl-AW" },
1109 { name => "nl-BE", lcid => 0x00000813, sabbrevlangname => "NLB" },
1110 { name => "nl-BQ" },
1111 { name => "nl-CW" },
1112 { name => "nl-NL", lcid => 0x00000413 },
1113 { name => "nl-SR" },
1114 { name => "nl-SX" },
1115 { name => "nmg" },
1116 { name => "nmg-CM" },
1117 { name => "nn", lcid => 0x00007814, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NON", sopentypelang => "NYN" },
1118 { name => "nn-NO", lcid => 0x00000814 },
1119 { name => "nnh" },
1120 { name => "nnh-CM" },
1121 { name => "no", lcid => 0x00000014, oemcp => 850, ebcdiccp => 20277, sortlocale => "nb-NO" },
1122 { name => "nqo", idigits => 3, inegnumber => 3, slist => "\x{060c}", dir => "seed", sopentypelang => "NKO" },
1123 { name => "nqo-GN", dir => "seed" },
1124 { name => "nr", dir => "seed", sopentypelang => "NDB" },
1125 { name => "nr-ZA", dir => "seed" },
1126 { name => "nso", lcid => 0x0000006c, oemcp => 850, dir => "seed", sopentypelang => "SOT" },
1127 { name => "nso-ZA", lcid => 0x0000046c, dir => "seed" },
1128 { name => "nus" },
1129 { name => "nus-SD", alias => "nus-SS" },
1130 { name => "nus-SS" },
1131 { name => "nyn", sopentypelang => "NKL" },
1132 { name => "nyn-UG" },
1133 { name => "oc", lcid => 0x00000082, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
1134 { name => "oc-FR", lcid => 0x00000482, dir => "seed" },
1135 { name => "oc-Latn", alias => "oc" },
1136 { name => "oc-Latn-FR", alias => "oc-FR" },
1137 { name => "om", lcid => 0x00000072, sopentypelang => "ORO" },
1138 { name => "om-ET", lcid => 0x00000472 },
1139 { name => "om-KE" },
1140 { name => "or", lcid => 0x00000048, slist => ",", group => 15 },
1141 { name => "or-IN", lcid => 0x00000448 },
1142 { name => "os" },
1143 { name => "os-GE" },
1144 { name => "os-RU" },
1145 { name => "pa", lcid => 0x00000046, slist => "," },
1146 { name => "pa-Arab", lcid => 0x00007c46, slist => ";", inegnumber => 2, oemcp => 720, group => 13, sabbrevlangname => "PAP" },
1147 { name => "pa-Arab-PK", lcid => 0x00000846 },
1148 { name => "pa-Guru" },
1149 { name => "pa-Guru-IN", alias => "pa-IN" },
1150 { name => "pa-IN", lcid => 0x00000446, sparent => "pa-Guru", file => "pa_Guru_IN", group => 15 },
1151 ## name => "pap", lcid => 0x00000079 },
1152 ## name => "pap-029", lcid => 0x00000479 },
1153 { name => "pcm" },
1154 { name => "pcm-NG" },
1155 { name => "pl", lcid => 0x00000015, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "PLK", sopentypelang => "PLK" },
1156 { name => "pl-PL", lcid => 0x00000415 },
1157 ## name => "prg" },
1158 ## name => "prg-001" },
1159 { name => "prs", lcid => 0x0000008c, file => "fa", inegnumber => 3, oemcp => 720, group => 13, sopentypelang => "DRI" },
1160 { name => "prs-AF", lcid => 0x0000048c, file => "fa_AF" },
1161 { name => "prs-Arab", alias => "prs" },
1162 { name => "prs-Arab-AF", alias => "prs-AF" },
1163 { name => "ps", lcid => 0x00000063, group => 13, sabbrevlangname => "PAS", sopentypelang => "PAS" },
1164 { name => "ps-AF", lcid => 0x00000463 },
1165 { name => "ps-PK" },
1166 { name => "pt", lcid => 0x00000016, oemcp => 850, sabbrevlangname => "PTB", sopentypelang => "PTG" },
1167 { name => "pt-AO" },
1168 { name => "pt-BR", lcid => 0x00000416 },
1169 { name => "pt-CH", oemcp => 65001 },
1170 { name => "pt-CV" },
1171 { name => "pt-GQ", oemcp => 65001 },
1172 { name => "pt-GW" },
1173 { name => "pt-LU", oemcp => 65001 },
1174 { name => "pt-MO" },
1175 { name => "pt-MZ" },
1176 { name => "pt-PT", lcid => 0x00000816, sabbrevlangname => "PTG" },
1177 { name => "pt-ST" },
1178 { name => "pt-TL" },
1179 ## name => "qps-Latn-x-sh", lcid => 0x80000901 },
1180 ## name => "qps-ploc", lcid => 0x80000501 },
1181 ## name => "qps-ploca", lcid => 0x800005fe },
1182 ## name => "qps-plocm", lcid => 0x800009ff },
1183 { name => "qu", alias => "quz" },
1184 { name => "qu-BO", alias => "quz-BO" },
1185 { name => "qu-EC", alias => "quz-EC" },
1186 { name => "qu-PE", alias => "quz-PE" },
1187 { name => "quc", lcid => 0x00000086, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed" },
1188 { name => "quc-Latn", lcid => 0x00007c86, file => "quc", dir => "seed" },
1189 { name => "quc-Latn-GT", lcid => 0x00000486, file => "quc_GT", dir => "seed" },
1190 { name => "qut", alias => "quc" },
1191 { name => "qut-GT", alias => "quc-Latn-GT" },
1192 { name => "quz", lcid => 0x0000006b, file => "qu", territory => "BO", oemcp => 850, ebcdiccp => 20284, slist => "," },
1193 { name => "quz-BO", lcid => 0x0000046b, file => "qu_BO" },
1194 { name => "quz-EC", lcid => 0x0000086b, file => "qu_EC" },
1195 { name => "quz-Latn", alias => "quz" },
1196 { name => "quz-Latn-BO", alias => "quz-BO" },
1197 { name => "quz-Latn-EC", alias => "quz-EC" },
1198 { name => "quz-Latn-PE", alias => "quz-PE" },
1199 { name => "quz-PE", lcid => 0x00000c6b, file => "qu_PE" },
1200 { name => "rm", lcid => 0x00000017, oemcp => 850, ebcdiccp => 20273, sabbrevlangname => "RMC", sopentypelang => "RMS" },
1201 { name => "rm-CH", lcid => 0x00000417 },
1202 { name => "rn" },
1203 { name => "rn-BI" },
1204 { name => "ro", lcid => 0x00000018, oemcp => 852, ebcdiccp => 20880, sabbrevlangname => "ROM", sopentypelang => "ROM" },
1205 { name => "ro-MD", lcid => 0x00000818, maccp => 65001, sabbrevlangname => "ROD" },
1206 { name => "ro-RO", lcid => 0x00000418, group => 2 },
1207 { name => "rof" },
1208 { name => "rof-TZ" },
1209 { name => "ru", lcid => 0x00000019, oemcp => 866 },
1210 { name => "ru-BY", maccp => 65001 },
1211 { name => "ru-KG", maccp => 65001 },
1212 { name => "ru-KZ", maccp => 65001 },
1213 { name => "ru-MD", lcid => 0x00000819, maccp => 65001, sabbrevlangname => "RUM" },
1214 { name => "ru-RU", lcid => 0x00000419, group => 5 },
1215 { name => "ru-UA", maccp => 65001 },
1216 { name => "rw", lcid => 0x00000087, oemcp => 437, sopentypelang => "RUA" },
1217 { name => "rw-RW", lcid => 0x00000487 },
1218 { name => "rwk" },
1219 { name => "rwk-TZ" },
1220 { name => "sa", lcid => 0x0000004f, slist => ",", group => 15 },
1221 { name => "sa-Deva", alias => "sa" },
1222 { name => "sa-Deva-IN", alias => "sa-IN" },
1223 { name => "sa-IN", lcid => 0x0000044f },
1224 { name => "sah", lcid => 0x00000085, oemcp => 866, group => 5, sopentypelang => "YAK" },
1225 { name => "sah-Cyrl", alias => "sah" },
1226 { name => "sah-Cyrl-RU", alias => "sah-RU" },
1227 { name => "sah-RU", lcid => 0x00000485 },
1228 { name => "saq" },
1229 { name => "saq-KE" },
1230 { name => "sat" },
1231 { name => "sat-Olck" },
1232 { name => "sat-Olck-IN" },
1233 { name => "sbp" },
1234 { name => "sbp-TZ" },
1235 { name => "sc" },
1236 { name => "sc-IT" },
1237 { name => "sd", lcid => 0x00000059, inegnumber => 3, oemcp => 720, sabbrevlangname => "SIP" },
1238 { name => "sd-Arab", lcid => 0x00007c59, group => 13 },
1239 { name => "sd-Arab-PK", lcid => 0x00000859 },
1240 { name => "sd-Deva", inegnumber => 1, slist => ",", oemcp => 65001, group => 15 },
1241 { name => "sd-Deva-IN", lcid => 0x00000459, sabbrevlangname => "ZZZ" },
1242 { name => "sd-PK", alias => "sd-Arab-PK" },
1243 { name => "se", lcid => 0x0000003b, oemcp => 850, ebcdiccp => 20277, sopentypelang => "NSM" },
1244 { name => "se-FI", lcid => 0x00000c3b, ebcdiccp => 20278, sabbrevlangname => "SMG" },
1245 { name => "se-NO", lcid => 0x0000043b },
1246 { name => "se-SE", lcid => 0x0000083b, ebcdiccp => 20278, sabbrevlangname => "SMF" },
1247 { name => "se-Latn", alias => "se" },
1248 { name => "se-Latn-FI", alias => "se-FI" },
1249 { name => "se-Latn-NO", alias => "se-NO" },
1250 { name => "se-Latn-SE", alias => "se-SE" },
1251 { name => "seh" },
1252 { name => "seh-MZ" },
1253 { name => "ses" },
1254 { name => "ses-ML" },
1255 { name => "sg", sopentypelang => "SGO" },
1256 { name => "sg-CF" },
1257 { name => "shi" },
1258 { name => "shi-Latn" },
1259 { name => "shi-Latn-MA" },
1260 { name => "shi-Tfng" },
1261 { name => "shi-Tfng-MA" },
1262 { name => "si", lcid => 0x0000005b, group => 15, sopentypelang => "SNH" },
1263 { name => "si-LK", lcid => 0x0000045b },
1264 { name => "sk", lcid => 0x0000001b, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "SKY", sopentypelang => "SKY" },
1265 { name => "sk-SK", lcid => 0x0000041b },
1266 { name => "sl", lcid => 0x00000024, oemcp => 852, ebcdiccp => 20880, group => 2 },
1267 { name => "sl-SI", lcid => 0x00000424 },
1268 { name => "sma", lcid => 0x0000783b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMB", sopentypelang => "SSM" },
1269 { name => "sma-Latn", alias => "sma" },
1270 { name => "sma-Latn-NO", alias => "sma-NO" },
1271 { name => "sma-Latn-SE", alias => "sma-SE" },
1272 { name => "sma-NO", lcid => 0x0000183b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMA" },
1273 { name => "sma-SE", lcid => 0x00001c3b, dir => "seed" },
1274 { name => "smj", lcid => 0x00007c3b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMK", sopentypelang => "LSM" },
1275 { name => "smj-Latn", alias => "smj" },
1276 { name => "smj-Latn-NO", alias => "smj-NO" },
1277 { name => "smj-Latn-SE", alias => "smj-SE" },
1278 { name => "smj-NO", lcid => 0x0000103b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMJ" },
1279 { name => "smj-SE", lcid => 0x0000143b, dir => "seed" },
1280 { name => "smn", lcid => 0x0000703b, sparent => "se", ebcdiccp => 20278, sopentypelang => "ISM" },
1281 { name => "smn-FI", lcid => 0x0000243b },
1282 { name => "smn-Latn", alias => "smn" },
1283 { name => "smn-Latn-FI", alias => "smn-FI" },
1284 { name => "sms", lcid => 0x0000743b, sparent => "se", ebcdiccp => 20278, dir => "seed", sopentypelang => "SKS" },
1285 { name => "sms-FI", lcid => 0x0000203b, dir => "seed" },
1286 { name => "sms-Latn", alias => "sms" },
1287 { name => "sms-Latn-FI", alias => "sms-FI" },
1288 { name => "sn", sopentypelang => "SNA0" },
1289 { name => "sn-Latn", file => "sn" },
1290 { name => "sn-Latn-ZW", file => "sn_ZW" },
1291 { name => "sn-ZW", alias => "sn-Latn-ZW" },
1292 { name => "so", lcid => 0x00000077, sopentypelang => "SML" },
1293 { name => "so-DJ" },
1294 { name => "so-ET" },
1295 { name => "so-KE" },
1296 { name => "so-SO", lcid => 0x00000477 },
1297 { name => "sq", lcid => 0x0000001c, oemcp => 852, ebcdiccp => 20880, group => 2 },
1298 { name => "sq-AL", lcid => 0x0000041c },
1299 { name => "sq-MK" },
1300 { name => "sq-XK" },
1301 { name => "sr", lcid => 0x00007c1a, sortlocale => "sr-Latn-RS", oemcp => 852, group => 2, sabbrevlangname => "SRB", sopentypelang => "SRB" },
1302 { name => "sr-Cyrl", lcid => 0x00006c1a, oemcp => 855, ebcdiccp => 21025, group => 5, sabbrevlangname => "SRO" },
1303 { name => "sr-Cyrl-BA", lcid => 0x00001c1a, sabbrevlangname => "SRN" },
1304 { name => "sr-Cyrl-ME", lcid => 0x0000301a, sabbrevlangname => "SRQ" },
1305 { name => "sr-Cyrl-RS", lcid => 0x0000281a },
1306 { name => "sr-Cyrl-XK" },
1307 { name => "sr-Latn", lcid => 0x0000701a, sabbrevlangname => "SRM" },
1308 { name => "sr-Latn-BA", lcid => 0x0000181a, maccp => 10082, ebcdiccp => 870, sabbrevlangname => "SRS" },
1309 { name => "sr-Latn-ME", lcid => 0x00002c1a, sabbrevlangname => "SRP" },
1310 { name => "sr-Latn-RS", lcid => 0x0000241a, sabbrevlangname => "SRM" },
1311 { name => "sr-Latn-XK" },
1312 ## name => "sr-Cyrl-CS", lcid => 0x00000c1a },
1313 ## name => "sr-Latn-CS", lcid => 0x0000081a },
1314 { name => "ss", dir => "seed", sopentypelang => "SWZ" },
1315 { name => "ss-SZ", dir => "seed" },
1316 { name => "ss-ZA", dir => "seed" },
1317 { name => "ssy", dir => "seed" },
1318 { name => "ssy-ER", dir => "seed" },
1319 { name => "st", lcid => 0x00000030, dir => "seed" },
1320 { name => "st-LS", dir => "seed" },
1321 { name => "st-ZA", lcid => 0x00000430, dir => "seed" },
1322 { name => "su" },
1323 { name => "su-Latn" },
1324 { name => "su-Latn-ID" },
1325 { name => "sv", lcid => 0x0000001d, oemcp => 850, ebcdiccp => 20278, sabbrevlangname => "SVE", sopentypelang => "SVE" },
1326 { name => "sv-AX" },
1327 { name => "sv-FI", lcid => 0x0000081d, sabbrevlangname => "SVF" },
1328 { name => "sv-SE", lcid => 0x0000041d, sabbrevlangname => "SVE" },
1329 { name => "sw", lcid => 0x00000041, territory => "KE", oemcp => 437, ebcdiccp => 500, sabbrevlangname => "SWK", sopentypelang => "SWK" },
1330 { name => "sw-CD" },
1331 { name => "sw-KE", lcid => 0x00000441 },
1332 { name => "sw-TZ" },
1333 { name => "sw-UG" },
1334 { name => "swc-CD", alias => "sw-CD" },
1335 { name => "syr", lcid => 0x0000005a, slist => ",", group => 13, dir => "seed" },
1336 { name => "syr-SY", lcid => 0x0000045a, dir => "seed" },
1337 { name => "syr-Syrc", alias => "syr" },
1338 { name => "syr-Syrc-SY", alias => "syr-SY" },
1339 { name => "ta", lcid => 0x00000049, slist => ",", group => 15, sabbrevlangname => "TAI" },
1340 { name => "ta-IN", lcid => 0x00000449 },
1341 { name => "ta-LK", lcid => 0x00000849, sabbrevlangname => "TAM" },
1342 { name => "ta-MY" },
1343 { name => "ta-SG" },
1344 { name => "te", lcid => 0x0000004a, group => 15 },
1345 { name => "te-IN", lcid => 0x0000044a },
1346 { name => "teo" },
1347 { name => "teo-KE" },
1348 { name => "teo-UG" },
1349 { name => "tg", lcid => 0x00000028, oemcp => 866, group => 5, sabbrevlangname => "TAJ", sopentypelang => "TAJ" },
1350 { name => "tg-Cyrl", lcid => 0x00007c28, file => "tg" },
1351 { name => "tg-Cyrl-TJ", lcid => 0x00000428, file => "tg_TJ" },
1352 { name => "tg-TJ", alias => "tg-Cyrl-TJ" },
1353 { name => "th", lcid => 0x0000001e, oemcp => 874, ebcdiccp => 20838, slist => ",", group => 11 },
1354 { name => "th-TH", lcid => 0x0000041e },
1355 { name => "ti", lcid => 0x00000073, territory => "ER", sopentypelang => "TGY" },
1356 { name => "ti-ER", lcid => 0x00000873 },
1357 { name => "ti-ET", lcid => 0x00000473, sabbrevlangname => "TIE" },
1358 { name => "tig", dir => "seed", sopentypelang => "TGR" },
1359 { name => "tig-ER", dir => "seed" },
1360 { name => "tk", lcid => 0x00000042, oemcp => 852, ebcdiccp => 20880, group => 2, sopentypelang => "TKM" },
1361 { name => "tk-Latn", alias => "tk" },
1362 { name => "tk-Latn-TM", alias => "tk-TM" },
1363 { name => "tk-TM", lcid => 0x00000442 },
1364 { name => "tn", lcid => 0x00000032, oemcp => 850, dir => "seed", sopentypelang => "TNA" },
1365 { name => "tn-BW", lcid => 0x00000832, dir => "seed", sabbrevlangname => "TSB" },
1366 { name => "tn-ZA", lcid => 0x00000432, dir => "seed" },
1367 { name => "to", sopentypelang => "TGN" },
1368 { name => "to-TO" },
1369 { name => "tr", lcid => 0x0000001f, oemcp => 857, ebcdiccp => 20905, group => 6, sabbrevlangname => "TRK", sopentypelang => "TRK" },
1370 { name => "tr-CY" },
1371 { name => "tr-TR", lcid => 0x0000041f },
1372 { name => "ts", lcid => 0x00000031, dir => "seed", sopentypelang => "TSG" },
1373 { name => "ts-ZA", lcid => 0x00000431, dir => "seed" },
1374 { name => "tt", lcid => 0x00000044, oemcp => 866, group => 5, sabbrevlangname => "TTT" },
1375 { name => "tt-Cyrl", alias => "tt" },
1376 { name => "tt-Cyrl-RU", alias => "tt-RU" },
1377 { name => "tt-RU", lcid => 0x00000444 },
1378 { name => "twq" },
1379 { name => "twq-NE" },
1380 { name => "tzm", lcid => 0x0000005f, sortlocale => "tzm-Latn-DZ", oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "TZA" },
1381 { name => "tzm-Latn", lcid => 0x00007c5f, territory => "DZ", file => "tzm" },
1382 { name => "tzm-Latn-MA", file => "tzm_MA", oemcp => 65001 },
1383 { name => "tzm-Latn-DZ", lcid => 0x0000085f, file => "tzm" },
1384 { name => "tzm-MA", alias => "tzm-Latn-MA" },
1385 ## name => "tzm-Arab", group => 13 },
1386 ## name => "tzm-Arab-MA", lcid => 0x0000045f },
1387 ## name => "tzm-Tfng", lcid => 0x0000785f },
1388 ## name => "tzm-Tfng-MA", lcid => 0x0000105f },
1389 { name => "ug", lcid => 0x00000080, oemcp => 720, slist => ",", group => 13, sopentypelang => "UYG" },
1390 { name => "ug-Arab", alias => "ug" },
1391 { name => "ug-Arab-CN", alias => "ug-CN" },
1392 { name => "ug-CN", lcid => 0x00000480 },
1393 { name => "uk", lcid => 0x00000022, oemcp => 866, maccp => 10017, ebcdiccp => 500, group => 5 },
1394 { name => "uk-UA", lcid => 0x00000422 },
1395 { name => "ur", lcid => 0x00000020, oemcp => 720 },
1396 { name => "ur-IN", lcid => 0x00000820, maccp => 65001, sabbrevlangname => "URI" },
1397 { name => "ur-PK", lcid => 0x00000420, group => 13 },
1398 { name => "uz", lcid => 0x00000043, oemcp => 857, maccp => 10029, group => 2 },
1399 { name => "uz-Arab", oemcp => 65001, maccp => 65001 },
1400 { name => "uz-Arab-AF" },
1401 { name => "uz-Cyrl", lcid => 0x00007843, oemcp => 866, maccp => 10007, group => 5, sabbrevlangname => "UZC" },
1402 { name => "uz-Cyrl-UZ", lcid => 0x00000843 },
1403 { name => "uz-Latn", lcid => 0x00007c43 },
1404 { name => "uz-Latn-UZ", lcid => 0x00000443 },
1405 { name => "vai" },
1406 { name => "vai-Latn" },
1407 { name => "vai-Latn-LR" },
1408 { name => "vai-Vaii" },
1409 { name => "vai-Vaii-LR" },
1410 { name => "ve", lcid => 0x00000033, dir => "seed", sabbrevlangname => "ZZZ" },
1411 { name => "ve-ZA", lcid => 0x00000433, dir => "seed" },
1412 { name => "vi", lcid => 0x0000002a, oemcp => 1258, slist => ",", group => 14, sabbrevlangname => "VIT", sopentypelang => "VIT" },
1413 { name => "vi-VN", lcid => 0x0000042a },
1414 { name => "vo", dir => "seed" },
1415 { name => "vo-001", dir => "seed" },
1416 { name => "vun" },
1417 { name => "vun-TZ" },
1418 { name => "wae" },
1419 { name => "wae-CH" },
1420 { name => "wal", dir => "seed" },
1421 { name => "wal-ET", dir => "seed" },
1422 { name => "wo", lcid => 0x00000088, oemcp => 850, ebcdiccp => 20297, sopentypelang => "WLF" },
1423 { name => "wo-Latn", alias => "wo" },
1424 { name => "wo-Latn-SN", alias => "wo-SN" },
1425 { name => "wo-SN", lcid => 0x00000488 },
1426 { name => "x-IV_mathan", lcid => 0x0001007f, alias => "" },
1427 { name => "xh", lcid => 0x00000034, oemcp => 850, sopentypelang => "XHS" },
1428 { name => "xh-ZA", lcid => 0x00000434 },
1429 { name => "xog" },
1430 { name => "xog-UG" },
1431 { name => "yav" },
1432 { name => "yav-CM" },
1433 { name => "yi", lcid => 0x0000003d, sabbrevlangname => "ZZZ", sopentypelang => "JII" },
1434 { name => "yi-001", lcid => 0x0000043d },
1435 { name => "yo", lcid => 0x0000006a, oemcp => 437, sopentypelang => "YBA" },
1436 { name => "yo-BJ", ebcdiccp => 500 },
1437 { name => "yo-Latn", alias => "yo" },
1438 { name => "yo-Latn-NG", alias => "yo-NG" },
1439 { name => "yo-NG", lcid => 0x0000046a },
1440 { name => "yrl" },
1441 { name => "yrl-BR" },
1442 { name => "yrl-CO" },
1443 { name => "yrl-VE" },
1444 { name => "yue" },
1445 { name => "yue-Hans" },
1446 { name => "yue-Hans-CN" },
1447 { name => "yue-Hant" },
1448 { name => "yue-Hant-HK" },
1449 { name => "zgh" },
1450 { name => "zgh-MA", alias => "zgh-Tfng-MA" },
1451 { name => "zgh-Tfng", file => "zgh" },
1452 { name => "zgh-Tfng-MA", file => "zgh_MA" },
1453 { name => "zh", lcid => 0x00007804, ireadinglayout => 2, oemcp => 936, slist => ",", sscripts => "Hani Hans", sabbrevlangname => "CHS", sopentypelang => "ZHS" },
1454 { name => "zh-CN", lcid => 0x00000804, file => "zh_Hans_CN", sparent => "zh-Hans" },
1455 { name => "zh-CN_phoneb", lcid => 0x00050804, alias => "zh-CN" },
1456 { name => "zh-CN_stroke", lcid => 0x00020804, alias => "zh-CN" },
1457 { name => "zh-Hans", lcid => 0x00000004, group => 10 },
1458 { name => "zh-Hans-CN", alias => "zh-CN" },
1459 { name => "zh-Hans-CN-u-co-phonebk", alias => "zh-CN_phoneb" },
1460 { name => "zh-Hans-CN-u-co-stroke", alias => "zh-CN_stroke" },
1461 { name => "zh-Hans-HK", slist => ";" },
1462 { name => "zh-Hans-MO", slist => ";" },
1463 { name => "zh-Hans-SG", alias => "zh-SG" },
1464 { name => "zh-Hans-SG-u-co-phonebk", alias => "zh-SG_phoneb" },
1465 { name => "zh-Hans-SG-u-co-stroke", alias => "zh-SG_stroke" },
1466 { name => "zh-Hant", lcid => 0x00007c04, sortlocale => "zh-HK", ireadinglayout => 2, oemcp => 950, slist => ",", sscripts => "Hani Hant", group => 9, sabbrevlangname => "CHT", sopentypelang => "ZHH" },
1467 { name => "zh-Hant-HK", alias => "zh-HK" },
1468 { name => "zh-Hant-HK-u-co-unihan", alias => "zh-HK_radstr" },
1469 { name => "zh-Hant-MO", alias => "zh-MO" },
1470 { name => "zh-Hant-MO-u-co-stroke", alias => "zh-MO_stroke" },
1471 { name => "zh-Hant-MO-u-co-unihan", alias => "zh-MO_radstr" },
1472 { name => "zh-Hant-TW", alias => "zh-TW" },
1473 { name => "zh-Hant-TW-u-co-phonetic", alias => "zh-TW_pronun" },
1474 { name => "zh-Hant-TW-u-co-unihan", alias => "zh-TW_radstr" },
1475 { name => "zh-HK", lcid => 0x00000c04, file => "zh_Hant_HK", sparent => "zh-Hant", sabbrevlangname => "ZHH" },
1476 { name => "zh-HK_radstr", lcid => 0x00040c04, alias => "zh-HK" },
1477 { name => "zh-MO", lcid => 0x00001404, file => "zh_Hant_MO", sparent => "zh-Hant", sabbrevlangname => "ZHM", sopentypelang => "ZHT" },
1478 { name => "zh-MO_radstr", lcid => 0x00041404, alias => "zh-MO" },
1479 { name => "zh-MO_stroke", lcid => 0x00021404, alias => "zh-MO" },
1480 { name => "zh-SG", lcid => 0x00001004, file => "zh_Hans_SG", sparent => "zh-Hans", sabbrevlangname => "ZHI" },
1481 { name => "zh-SG_phoneb", lcid => 0x00051004, alias => "zh-SG" },
1482 { name => "zh-SG_stroke", lcid => 0x00021004, alias => "zh-SG" },
1483 { name => "zh-TW", lcid => 0x00000404, file => "zh_Hant_TW", sparent => "zh-Hant", sopentypelang => "ZHT" },
1484 { name => "zh-TW_pronun", lcid => 0x00030404, alias => "zh-TW" },
1485 { name => "zh-TW_radstr", lcid => 0x00040404, alias => "zh-TW" },
1486 { name => "zu", lcid => 0x00000035, oemcp => 850 },
1487 { name => "zu-ZA", lcid => 0x00000435 },
# Calendar table.  Every entry has a numeric "id"; the remaining keys vary:
#   name             - literal calendar name (entries without a type/locale pair,
#                      plus the Julian entry which has name and locale)
#   type, locale     - a calendar type string and a locale name
#   eras             - optional list of era numbers
#   itwodigityearmax - optional upper bound year for two-digit year expansion
# NOTE(review): the ids look like Windows CAL_* identifiers and type/locale
# like CLDR lookup keys -- confirm against the code that consumes this table.
1490 my @calendars =
1492 { id => 1, name => "Gregorian", itwodigityearmax => 2049 },
1493 { id => 2, type => "gregorian", locale => "en-US", itwodigityearmax => 2049 },
1494 { id => 3, type => "japanese", locale => "ja-JP", eras => [ 232..236 ] },
1495 { id => 4, type => "roc", locale => "zh-TW", eras => [ 1 ] },
1496 { id => 5, type => "dangi", locale => "ko-KR", eras => [ 0 ] },
1497 { id => 6, type => "islamic", locale => "ar-SA", itwodigityearmax => 1451 },
1498 { id => 7, type => "buddhist", locale => "th-TH", eras => [ 0 ] },
1499 { id => 8, type => "hebrew", locale => "he-IL", itwodigityearmax => 5810 },
1500 { id => 9, type => "gregorian", locale => "fr-FR", itwodigityearmax => 2049 },
1501 { id => 10, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1502 { id => 11, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1503 { id => 12, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1504 { id => 13, name => "Julian", locale => "en-US", itwodigityearmax => 2049 },
1505 { id => 14, name => "Japanese Lunisolar" },
1506 { id => 15, name => "Chinese Lunisolar" },
1507 { id => 16, name => "Saka" },
1508 { id => 17, name => "Lunar ETO Chinese" },
1509 { id => 18, name => "Lunar ETO Korean" },
1510 { id => 19, name => "Lunar ETO Rokuyou" },
1511 { id => 20, name => "Korean Lunisolar" },
1512 { id => 21, name => "Taiwan Lunisolar" },
1513 { id => 22, type => "persian", locale => "prs-AF", itwodigityearmax => 1429 },
1514 { id => 23, type => "islamic-umalqura", locale => "ar-SA", itwodigityearmax => 1451 },
# Geographic location table.  Every entry has a numeric "id"; most also have
# a "name" that is either a two-letter region code or a three-digit area code.
# Optional keys seen below:
#   parent      - name of the enclosing region/area (some entries have only
#                 a parent and no name of their own)
#   region      - flag set on the former Netherlands Antilles entry
#   sintlsymbol - currency symbol override (only on the Caribbean entry)
# The trailing comment on each line is the human-readable name of the location.
# NOTE(review): the ids are presumably Windows GEOID values -- confirm against
# the code that consumes this table.
1517 my @geoids =
1519 { id => 2, name => "AG" }, # Antigua and Barbuda
1520 { id => 3, name => "AF" }, # Afghanistan
1521 { id => 4, name => "DZ" }, # Algeria
1522 { id => 5, name => "AZ" }, # Azerbaijan
1523 { id => 6, name => "AL" }, # Albania
1524 { id => 7, name => "AM" }, # Armenia
1525 { id => 8, name => "AD" }, # Andorra
1526 { id => 9, name => "AO" }, # Angola
1527 { id => 10, name => "AS" }, # American Samoa
1528 { id => 11, name => "AR" }, # Argentina
1529 { id => 12, name => "AU" }, # Australia
1530 { id => 14, name => "AT" }, # Austria
1531 { id => 17, name => "BH" }, # Bahrain
1532 { id => 18, name => "BB" }, # Barbados
1533 { id => 19, name => "BW" }, # Botswana
1534 { id => 20, name => "BM" }, # Bermuda
1535 { id => 21, name => "BE" }, # Belgium
1536 { id => 22, name => "BS" }, # Bahamas, The
1537 { id => 23, name => "BD" }, # Bangladesh
1538 { id => 24, name => "BZ" }, # Belize
1539 { id => 25, name => "BA" }, # Bosnia and Herzegovina
1540 { id => 26, name => "BO" }, # Bolivia
1541 { id => 27, name => "MM" }, # Myanmar
1542 { id => 28, name => "BJ" }, # Benin
1543 { id => 29, name => "BY" }, # Belarus
1544 { id => 30, name => "SB" }, # Solomon Islands
1545 { id => 32, name => "BR" }, # Brazil
1546 { id => 34, name => "BT" }, # Bhutan
1547 { id => 35, name => "BG" }, # Bulgaria
1548 { id => 37, name => "BN" }, # Brunei
1549 { id => 38, name => "BI" }, # Burundi
1550 { id => 39, name => "CA" }, # Canada
1551 { id => 40, name => "KH" }, # Cambodia
1552 { id => 41, name => "TD" }, # Chad
1553 { id => 42, name => "LK" }, # Sri Lanka
1554 { id => 43, name => "CG" }, # Congo
1555 { id => 44, name => "CD" }, # Congo (DRC)
1556 { id => 45, name => "CN" }, # China
1557 { id => 46, name => "CL" }, # Chile
1558 { id => 49, name => "CM" }, # Cameroon
1559 { id => 50, name => "KM" }, # Comoros
1560 { id => 51, name => "CO" }, # Colombia
1561 { id => 54, name => "CR" }, # Costa Rica
1562 { id => 55, name => "CF" }, # Central African Republic
1563 { id => 56, name => "CU" }, # Cuba
1564 { id => 57, name => "CV" }, # Cape Verde
1565 { id => 59, name => "CY" }, # Cyprus
1566 { id => 61, name => "DK" }, # Denmark
1567 { id => 62, name => "DJ" }, # Djibouti
1568 { id => 63, name => "DM" }, # Dominica
1569 { id => 65, name => "DO" }, # Dominican Republic
1570 { id => 66, name => "EC" }, # Ecuador
1571 { id => 67, name => "EG" }, # Egypt
1572 { id => 68, name => "IE" }, # Ireland
1573 { id => 69, name => "GQ" }, # Equatorial Guinea
1574 { id => 70, name => "EE" }, # Estonia
1575 { id => 71, name => "ER" }, # Eritrea
1576 { id => 72, name => "SV" }, # El Salvador
1577 { id => 73, name => "ET" }, # Ethiopia
1578 { id => 75, name => "CZ" }, # Czech Republic
1579 { id => 77, name => "FI" }, # Finland
1580 { id => 78, name => "FJ" }, # Fiji Islands
1581 { id => 80, name => "FM" }, # Micronesia
1582 { id => 81, name => "FO" }, # Faroe Islands
1583 { id => 84, name => "FR" }, # France
1584 { id => 86, name => "GM" }, # Gambia, The
1585 { id => 87, name => "GA" }, # Gabon
1586 { id => 88, name => "GE" }, # Georgia
1587 { id => 89, name => "GH" }, # Ghana
1588 { id => 90, name => "GI" }, # Gibraltar
1589 { id => 91, name => "GD" }, # Grenada
1590 { id => 93, name => "GL" }, # Greenland
1591 { id => 94, name => "DE" }, # Germany
1592 { id => 98, name => "GR" }, # Greece
1593 { id => 99, name => "GT" }, # Guatemala
1594 { id => 100, name => "GN" }, # Guinea
1595 { id => 101, name => "GY" }, # Guyana
1596 { id => 103, name => "HT" }, # Haiti
1597 { id => 104, name => "HK" }, # Hong Kong S.A.R.
1598 { id => 106, name => "HN" }, # Honduras
1599 { id => 108, name => "HR" }, # Croatia
1600 { id => 109, name => "HU" }, # Hungary
1601 { id => 110, name => "IS" }, # Iceland
1602 { id => 111, name => "ID" }, # Indonesia
1603 { id => 113, name => "IN" }, # India
1604 { id => 114, name => "IO" }, # British Indian Ocean Territory
1605 { id => 116, name => "IR" }, # Iran
1606 { id => 117, name => "IL" }, # Israel
1607 { id => 118, name => "IT" }, # Italy
1608 { id => 119, name => "CI" }, # Côte d'Ivoire
1609 { id => 121, name => "IQ" }, # Iraq
1610 { id => 122, name => "JP" }, # Japan
1611 { id => 124, name => "JM" }, # Jamaica
1612 { id => 125, name => "SJ" }, # Jan Mayen
1613 { id => 126, name => "JO" }, # Jordan
1614 { id => 127, parent => "UM" }, # Johnston Atoll
1615 { id => 129, name => "KE" }, # Kenya
1616 { id => 130, name => "KG" }, # Kyrgyzstan
1617 { id => 131, name => "KP" }, # North Korea
1618 { id => 133, name => "KI" }, # Kiribati
1619 { id => 134, name => "KR" }, # Korea
1620 { id => 136, name => "KW" }, # Kuwait
1621 { id => 137, name => "KZ" }, # Kazakhstan
1622 { id => 138, name => "LA" }, # Laos
1623 { id => 139, name => "LB" }, # Lebanon
1624 { id => 140, name => "LV" }, # Latvia
1625 { id => 141, name => "LT" }, # Lithuania
1626 { id => 142, name => "LR" }, # Liberia
1627 { id => 143, name => "SK" }, # Slovakia
1628 { id => 145, name => "LI" }, # Liechtenstein
1629 { id => 146, name => "LS" }, # Lesotho
1630 { id => 147, name => "LU" }, # Luxembourg
1631 { id => 148, name => "LY" }, # Libya
1632 { id => 149, name => "MG" }, # Madagascar
1633 { id => 151, name => "MO" }, # Macao S.A.R.
1634 { id => 152, name => "MD" }, # Moldova
1635 { id => 154, name => "MN" }, # Mongolia
1636 { id => 156, name => "MW" }, # Malawi
1637 { id => 157, name => "ML" }, # Mali
1638 { id => 158, name => "MC" }, # Monaco
1639 { id => 159, name => "MA" }, # Morocco
1640 { id => 160, name => "MU" }, # Mauritius
1641 { id => 162, name => "MR" }, # Mauritania
1642 { id => 163, name => "MT" }, # Malta
1643 { id => 164, name => "OM" }, # Oman
1644 { id => 165, name => "MV" }, # Maldives
1645 { id => 166, name => "MX" }, # Mexico
1646 { id => 167, name => "MY" }, # Malaysia
1647 { id => 168, name => "MZ" }, # Mozambique
1648 { id => 173, name => "NE" }, # Niger
1649 { id => 174, name => "VU" }, # Vanuatu
1650 { id => 175, name => "NG" }, # Nigeria
1651 { id => 176, name => "NL" }, # Netherlands
1652 { id => 177, name => "NO" }, # Norway
1653 { id => 178, name => "NP" }, # Nepal
1654 { id => 180, name => "NR" }, # Nauru
1655 { id => 181, name => "SR" }, # Suriname
1656 { id => 182, name => "NI" }, # Nicaragua
1657 { id => 183, name => "NZ" }, # New Zealand
1658 { id => 184, name => "PS" }, # Palestinian Authority
1659 { id => 185, name => "PY" }, # Paraguay
1660 { id => 187, name => "PE" }, # Peru
1661 { id => 190, name => "PK" }, # Pakistan
1662 { id => 191, name => "PL" }, # Poland
1663 { id => 192, name => "PA" }, # Panama
1664 { id => 193, name => "PT" }, # Portugal
1665 { id => 194, name => "PG" }, # Papua New Guinea
1666 { id => 195, name => "PW" }, # Palau
1667 { id => 196, name => "GW" }, # Guinea-Bissau
1668 { id => 197, name => "QA" }, # Qatar
1669 { id => 198, name => "RE" }, # Reunion
1670 { id => 199, name => "MH" }, # Marshall Islands
1671 { id => 200, name => "RO" }, # Romania
1672 { id => 201, name => "PH" }, # Philippines
1673 { id => 202, name => "PR" }, # Puerto Rico
1674 { id => 203, name => "RU" }, # Russia
1675 { id => 204, name => "RW" }, # Rwanda
1676 { id => 205, name => "SA" }, # Saudi Arabia
1677 { id => 206, name => "PM" }, # St. Pierre and Miquelon
1678 { id => 207, name => "KN" }, # St. Kitts and Nevis
1679 { id => 208, name => "SC" }, # Seychelles
1680 { id => 209, name => "ZA" }, # South Africa
1681 { id => 210, name => "SN" }, # Senegal
1682 { id => 212, name => "SI" }, # Slovenia
1683 { id => 213, name => "SL" }, # Sierra Leone
1684 { id => 214, name => "SM" }, # San Marino
1685 { id => 215, name => "SG" }, # Singapore
1686 { id => 216, name => "SO" }, # Somalia
1687 { id => 217, name => "ES" }, # Spain
1688 { id => 218, name => "LC" }, # St. Lucia
1689 { id => 219, name => "SD" }, # Sudan
1690 { id => 220, name => "SJ" }, # Svalbard
1691 { id => 221, name => "SE" }, # Sweden
1692 { id => 222, name => "SY" }, # Syria
1693 { id => 223, name => "CH" }, # Switzerland
1694 { id => 224, name => "AE" }, # United Arab Emirates
1695 { id => 225, name => "TT" }, # Trinidad and Tobago
1696 { id => 227, name => "TH" }, # Thailand
1697 { id => 228, name => "TJ" }, # Tajikistan
1698 { id => 231, name => "TO" }, # Tonga
1699 { id => 232, name => "TG" }, # Togo
1700 { id => 233, name => "ST" }, # São Tomé and Príncipe
1701 { id => 234, name => "TN" }, # Tunisia
1702 { id => 235, name => "TR" }, # Turkey
1703 { id => 236, name => "TV" }, # Tuvalu
1704 { id => 237, name => "TW" }, # Taiwan
1705 { id => 238, name => "TM" }, # Turkmenistan
1706 { id => 239, name => "TZ" }, # Tanzania
1707 { id => 240, name => "UG" }, # Uganda
1708 { id => 241, name => "UA" }, # Ukraine
1709 { id => 242, name => "GB" }, # United Kingdom
1710 { id => 244, name => "US" }, # United States
1711 { id => 245, name => "BF" }, # Burkina Faso
1712 { id => 246, name => "UY" }, # Uruguay
1713 { id => 247, name => "UZ" }, # Uzbekistan
1714 { id => 248, name => "VC" }, # St. Vincent and the Grenadines
1715 { id => 249, name => "VE" }, # Bolivarian Republic of Venezuela
1716 { id => 251, name => "VN" }, # Vietnam
1717 { id => 252, name => "VI" }, # Virgin Islands
1718 { id => 253, name => "VA" }, # Vatican City
1719 { id => 254, name => "NA" }, # Namibia
1720 { id => 257, name => "EH" }, # Western Sahara (disputed)
1721 { id => 258, parent => "UM" }, # Wake Island
1722 { id => 259, name => "WS" }, # Samoa
1723 { id => 260, name => "SZ" }, # Swaziland
1724 { id => 261, name => "YE" }, # Yemen
1725 { id => 263, name => "ZM" }, # Zambia
1726 { id => 264, name => "ZW" }, # Zimbabwe
1727 { id => 269, name => "CS" }, # Serbia and Montenegro (Former)
1728 { id => 270, name => "ME" }, # Montenegro
1729 { id => 271, name => "RS" }, # Serbia
1730 { id => 273, name => "CW" }, # Curaçao
1731 { id => 276, name => "SS" }, # South Sudan
1732 { id => 300, name => "AI" }, # Anguilla
1733 { id => 301, name => "AQ" }, # Antarctica
1734 { id => 302, name => "AW" }, # Aruba
1735 { id => 303, parent => "SH" }, # Ascension Island
1736 { id => 304, parent => "053" }, # Ashmore and Cartier Islands
1737 { id => 305, parent => "UM" }, # Baker Island
1738 { id => 306, name => "BV" }, # Bouvet Island
1739 { id => 307, name => "KY" }, # Cayman Islands
1740 { id => 308, name => "830", parent => "155" }, # Channel Islands
1741 { id => 309, name => "CX" }, # Christmas Island
1742 { id => 310, parent => "009" }, # Clipperton Island
1743 { id => 311, name => "CC" }, # Cocos (Keeling) Islands
1744 { id => 312, name => "CK" }, # Cook Islands
1745 { id => 313, parent => "053" }, # Coral Sea Islands
1746 { id => 314, parent => "IO" }, # Diego Garcia
1747 { id => 315, name => "FK" }, # Falkland Islands (Islas Malvinas)
1748 { id => 317, name => "GF" }, # French Guiana
1749 { id => 318, name => "PF" }, # French Polynesia
1750 { id => 319, name => "TF" }, # French Southern and Antarctic Lands
1751 { id => 321, name => "GP" }, # Guadeloupe
1752 { id => 322, name => "GU" }, # Guam
1753 { id => 323 }, # Guantanamo Bay
1754 { id => 324, name => "GG" }, # Guernsey
1755 { id => 325, name => "HM" }, # Heard Island and McDonald Islands
1756 { id => 326, parent => "UM" }, # Howland Island
1757 { id => 327, parent => "UM" }, # Jarvis Island
1758 { id => 328, name => "JE" }, # Jersey
1759 { id => 329, parent => "UM" }, # Kingman Reef
1760 { id => 330, name => "MQ" }, # Martinique
1761 { id => 331, name => "YT" }, # Mayotte
1762 { id => 332, name => "MS" }, # Montserrat
1763 { id => 333, name => "AN", region => 1 }, # Netherlands Antilles (Former)
1764 { id => 334, name => "NC" }, # New Caledonia
1765 { id => 335, name => "NU" }, # Niue
1766 { id => 336, name => "NF" }, # Norfolk Island
1767 { id => 337, name => "MP" }, # Northern Mariana Islands
1768 { id => 338, parent => "UM" }, # Palmyra Atoll
1769 { id => 339, name => "PN" }, # Pitcairn Islands
1770 { id => 340, parent => "MP" }, # Rota Island
1771 { id => 341, parent => "MP" }, # Saipan
1772 { id => 342, name => "GS" }, # South Georgia and the South Sandwich Islands
1773 { id => 343, name => "SH" }, # St. Helena
1774 { id => 346, parent => "MP" }, # Tinian Island
1775 { id => 347, name => "TK" }, # Tokelau
1776 { id => 348, parent => "SH" }, # Tristan da Cunha
1777 { id => 349, name => "TC" }, # Turks and Caicos Islands
1778 { id => 351, name => "VG" }, # Virgin Islands, British
1779 { id => 352, name => "WF" }, # Wallis and Futuna
1780 { id => 742, name => "002" }, # Africa
1781 { id => 2129, name => "142" }, # Asia
1782 { id => 10541, name => "150" }, # Europe
1783 { id => 15126, name => "IM" }, # Man, Isle of
1784 { id => 19618, name => "MK" }, # Macedonia, Former Yugoslav Republic of
1785 { id => 20900, name => "054" }, # Melanesia
1786 { id => 21206, name => "057" }, # Micronesia
1787 { id => 21242, parent => "UM" }, # Midway Islands
1788 { id => 23581, name => "021" }, # Northern America
1789 { id => 26286, name => "061" }, # Polynesia
1790 { id => 27082, name => "013" }, # Central America
1791 { id => 27114, name => "009" }, # Oceania
1792 { id => 30967, name => "SX" }, # Sint Maarten (Dutch part)
1793 { id => 31396, name => "005" }, # South America
1794 { id => 31706, name => "MF" }, # Saint Martin (French part)
1795 { id => 39070, name => "001" }, # World
1796 { id => 42483, name => "011" }, # Western Africa
1797 { id => 42484, name => "017" }, # Middle Africa
1798 { id => 42487, name => "015" }, # Northern Africa
1799 { id => 47590, name => "143" }, # Central Asia
1800 { id => 47599, name => "035" }, # South-Eastern Asia
1801 { id => 47600, name => "030" }, # Eastern Asia
1802 { id => 47603, name => "014" }, # Eastern Africa
1803 { id => 47609, name => "151" }, # Eastern Europe
1804 { id => 47610, name => "039" }, # Southern Europe
1805 { id => 47611, name => "145" }, # Middle East
1806 { id => 47614, name => "034" }, # Southern Asia
1807 { id => 7299303, name => "TL" }, # Democratic Republic of Timor-Leste
1808 { id => 9914689, name => "XK" }, # Kosovo
1809 { id => 10026358, name => "019" }, # Americas
1810 { id => 10028789, name => "AX" }, # Åland Islands
1811 { id => 10039880, name => "029", sintlsymbol => "XCD" }, # Caribbean
1812 { id => 10039882, name => "154" }, # Northern Europe
1813 { id => 10039883, name => "018" }, # Southern Africa
1814 { id => 10210824, name => "155" }, # Western Europe
1815 { id => 10210825, name => "053" }, # Australia and New Zealand
1816 { id => 161832015, name => "BL" }, # Saint Barthélemy
1817 { id => 161832256, name => "UM" }, # U.S. Minor Outlying Islands
1818 { id => 161832257, name => "419", parent => "019" }, # Latin America and the Caribbean
1819 { id => 161832258, name => "BQ" }, # Bonaire, Sint Eustatius and Saba
# Global working tables shared by the subs in this file.
# The case, digit, width, CJK-compat, category, joining, direction,
# decomposition, combining-class and composition-exclusion tables are indexed
# by Unicode code point and filled in by load_data().
# NOTE(review): @cp2uni, @glyph2uni, @lead_bytes, @uni2cp, the two chinese
# tables, @idna_disallowed, %registry_keys and the default chars are populated
# outside the code visible here -- see their users for the exact layout.
1822 my @cp2uni = ();
1823 my @glyph2uni = ();
1824 my @lead_bytes = ();
1825 my @uni2cp = ();
1826 my @tolower_table = ();
1827 my @toupper_table = ();
1828 my @digitmap_table = ();
1829 my @halfwidth_table = ();
1830 my @fullwidth_table = ();
1831 my @cjk_compat_table = ();
1832 my @chinese_traditional_table = ();
1833 my @chinese_simplified_table = ();
1834 my @category_table = ();
1835 my @initial_joining_table = ();
1836 my @direction_table = ();
1837 my @decomp_table = ();
1838 my @combining_class_table = ();
1839 my @decomp_compat_table = ();
1840 my @comp_exclusions = ();
1841 my @idna_decomp_table = ();
1842 my @idna_disallowed = ();
1843 my %registry_keys;
1844 my $default_char;
1845 my $default_wchar;
# One array per joining form.  load_data() stores, at the index of the
# decomposed code point, the code point whose decomposition carries the
# matching <isolated>/<final>/<initial>/<medial> tag.
1847 my %joining_forms =
1849 "isolated" => [],
1850 "final" => [],
1851 "initial" => [],
1852 "medial" => []
################################################################
# convert a list of code points into UTF-16 code units
#
# Code points below 0x10000 are returned unchanged; every other code point
# is replaced by its high/low surrogate pair.  Returns the resulting list.
sub to_utf16(@)
{
    return map {
        my $offset = $_ - 0x10000;
        $offset < 0
            ? $_
            : (0xd800 | ($offset >> 10), 0xdc00 | ($offset & 0x3ff));
    } @_;
}
################################################################
# fetch a unicode.org file and open it
#
# $base is either the URL of a .zip archive that contains $name, or the URL
# of a directory from which $name is downloaded directly.  Downloads are
# kept under $XDG_CACHE_HOME/wine (or $HOME/.cache/wine) and reused on later
# runs; anything fetched from a URL containing "/$UNIVERSION" gets a
# "-$UNIVERSION" suffix in the cache.
#
# Returns a read handle on the file contents (a pipe from unzip for
# archives).  Dies if the file cannot be fetched or opened.
sub open_data_file($$)
{
    my ($base, $name) = @_;
    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
    (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
    my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
    my $fh;

    if ($base =~ /.*\/([^\/]+)\.zip$/)
    {
        my $zip = "$1$suffix.zip";
        unless (-f "$cache/$zip")
        {
            system "mkdir", "-p", $cache;
            print "Fetching $base...\n";
            if (system( "wget", "-q", "-O", "$cache/$zip", $base ) != 0)
            {
                # wget -O creates the output file before the transfer starts;
                # remove the leftover so that the next run does not mistake an
                # empty or truncated file for a valid cache entry
                unlink "$cache/$zip";
                die "cannot fetch $base";
            }
        }
        open $fh, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
    }
    else
    {
        (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
        unless (-f $dest)
        {
            system "mkdir", "-p", $dir;
            print "Fetching $base/$name...\n";
            if (system( "wget", "-q", "-O", $dest, "$base/$name" ) != 0)
            {
                unlink $dest;  # same as above: drop the partial download
                die "cannot fetch $base/$name";
            }
        }
        # three-argument open so that the path can never be taken as a mode
        open $fh, "<", $dest or die "cannot open $dest";
    }
    return $fh;
}
################################################################
# load a unicode.org file as XML data
#
# Takes the same ($base, $name) pair as open_data_file() and returns the
# document that XML::LibXML->load_xml builds from that file's contents.
sub load_xml_data_file($$)
{
    my ($base, $name) = @_;
    my $FILE = open_data_file( $base, $name );
    my $xml = XML::LibXML->load_xml( IO => $FILE );
    # close the handle we were actually given: the bareword FILE is
    # localized inside open_data_file() and is not open in this scope
    close $FILE;
    return $xml;
}
################################################################
# recursively get the decomposition for a character
#
# $table is a reference to an array of decompositions (array references)
# indexed by code point.  A character without an entry decomposes to
# itself; otherwise each element of its entry is expanded in turn.
sub get_decomposition($$);
sub get_decomposition($$)
{
    my ($char, $table) = @_;
    my $entry = $table->[$char];

    return $char unless defined $entry;
    return map { get_decomposition( $_, $table ) } @{$entry};
}
################################################################
# get the composition that results in a given character
#
# Returns the elements of $ch's entry in @decomp_table, or an empty list
# when $ch must not be produced by composition.  $compat selects extra
# filtering: 1 also consults @decomp_compat_table, 2 consults
# @idna_decomp_table; any other value applies only the common checks.
sub get_composition($$)
{
    my ($ch, $compat) = @_;

    my $entry = $decomp_table[$ch];
    return () unless defined $entry;              # no decomposition
    my @pair = @{$entry};
    return () if @pair < 2;                       # singleton decomposition
    return () if $comp_exclusions[$ch];           # composition exclusion
    return () if $combining_class_table[$ch];     # non-starter

    my ($first, $second) = @pair;
    return () if $combining_class_table[$first];  # first char is non-starter

    if ($compat == 1)
    {
        # first char has compat decomposition
        return () if !defined $decomp_table[$first] && defined $decomp_compat_table[$first];
    }
    elsif ($compat == 2)
    {
        my $idna_first = $idna_decomp_table[$first];

        # first char has IDNA decomposition
        return () if !defined $decomp_table[$first] && defined $idna_first;
        # first char's decomposition has IDNA decomposition
        return () if defined $idna_first && defined $idna_decomp_table[$idna_first->[0]];
        # second char has IDNA decomposition
        return () if defined $idna_decomp_table[$second];
    }
    return @pair;
}
################################################################
# recursively build decompositions
#
# Takes a decomposition table (array of array references indexed by code
# point) and returns a table of the same shape in which every defined
# entry is replaced by its full recursive decomposition, converted to
# UTF-16 code units.
sub build_decompositions(@)
{
    my @src = @_;
    my @dst;

    foreach my $code (grep { defined $src[$_] } 0 .. $#src)
    {
        $dst[$code] = [ to_utf16( get_decomposition( $code, \@src )) ];
    }
    return @dst;
}
################################################################
# compose Hangul sequences
#
# Walks the given list of code points and merges a leading consonant
# followed by a vowel into an LV syllable, and an LV syllable followed by
# a trailing consonant into an LVT syllable.  Everything else is copied
# through unchanged.  Returns the composed list.
sub compose_hangul(@)
{
    my @input = @_;

    my ($s_base, $l_base, $v_base, $t_base) = (0xac00, 0x1100, 0x1161, 0x11a7);
    my ($l_count, $v_count, $t_count) = (19, 21, 28);
    my $s_count = $l_count * $v_count * $t_count;

    my @composed;
    my $pos = 0;

    while ($pos < @input)
    {
        my $ch = $input[$pos++];

        # L + V -> LV
        if ($ch >= $l_base && $ch < $l_base + $l_count && $pos < @input)
        {
            my $vowel = $input[$pos];
            if ($vowel >= $v_base && $vowel < $v_base + $v_count)
            {
                $ch = $s_base + (($ch - $l_base) * $v_count + ($vowel - $v_base)) * $t_count;
                $pos++;
            }
        }

        # LV + T -> LVT; $t_base itself is not a trailing consonant
        if ($ch >= $s_base && $ch < $s_base + $s_count && ($ch - $s_base) % $t_count == 0 && $pos < @input)
        {
            my $trail = $input[$pos];
            if ($trail > $t_base && $trail < $t_base + $t_count)
            {
                $ch += $trail - $t_base;
                $pos++;
            }
        }

        push @composed, $ch;
    }
    return @composed;
}
################################################################
# remove linguistic-only mappings from the case table
#
# $upper and $lower are references to the upper- and lower-case mapping
# arrays, indexed by code point.  Both arrays are modified in place: an
# entry is reset to undef when mapping it through the other table does
# not lead back to the original code point.
sub remove_linguistic_mappings($$)
{
    my ($upper, $lower) = @_;

    # drop every entry of $from that does not round-trip through $to
    my $prune = sub
    {
        my ($from, $to) = @_;

        foreach my $code (0 .. $#{$from})
        {
            my $mapped = $from->[$code];
            next unless defined $mapped;
            my $back = $to->[$mapped];
            $from->[$code] = undef unless defined $back && $back == $code;
        }
    };

    # the upper table is pruned first, so the second pass already sees
    # the reduced upper table
    $prune->( $upper, $lower );
    $prune->( $lower, $upper );
}
2032 ################################################################
2033 # read in the Unicode database files
2034 sub load_data()
2036 my $start;
2038 # now build mappings from the decomposition field of the Unicode database
2040 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
2041 while (<$UNICODE_DATA>)
2043 # Decode the fields ...
2044 my ($code, $name, $cat, $comb, $bidi,
2045 $decomp, $dec, $dig, $num, $mirror,
2046 $oldname, $comment, $upper, $lower, $title) = split /;/;
2047 my $src = hex $code;
2049 die "unknown category $cat" unless defined $categories{$cat};
2050 die "unknown directionality $bidi" unless defined $directions{$bidi};
2052 $category_table[$src] = $categories{$cat};
2053 $direction_table[$src] = $bidi;
2054 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
2056 $initial_joining_table[$src] = $joining_types{"T"};
2058 else
2060 $initial_joining_table[$src] = $joining_types{"U"};
2063 if ($lower ne "")
2065 $tolower_table[$src] = hex $lower;
2067 if ($upper ne "")
2069 $toupper_table[$src] = hex $upper;
2071 if ($dec ne "")
2073 $category_table[$src] |= $ctype{"digit"};
2075 if ($dig ne "")
2077 $digitmap_table[$src] = ord $dig;
2079 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
2081 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
2082 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
2083 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
2084 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
2085 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
2086 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
2087 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
2088 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
2089 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
2090 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
2091 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
2092 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
2094 # copy the category and direction for everything between First/Last pairs
2095 if ($name =~ /, First>/) { $start = $src; }
2096 if ($name =~ /, Last>/)
2098 while ($start < $src)
2100 $category_table[$start] = $category_table[$src];
2101 $direction_table[$start] = $direction_table[$src];
2102 $combining_class_table[$start] = $combining_class_table[$src];
2103 $start++;
2107 next if $decomp eq ""; # no decomposition, skip it
2109 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
2111 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
2112 $decomp_compat_table[$src] = \@seq;
2115 if ($decomp =~ /^<narrow>\s+([0-9a-fA-F]+)$/)
2117 $halfwidth_table[hex $1] = $src;
2118 $fullwidth_table[$src] = hex $1;
2120 elsif ($decomp =~ /^<wide>\s+([0-9a-fA-F]+)$/)
2122 next if hex $1 == 0x5c; # don't remap backslash
2123 $fullwidth_table[hex $1] = $src;
2124 $halfwidth_table[$src] = hex $1;
2126 elsif ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
2128 # decomposition of the form "<foo> 1234" -> use char if type is known
2129 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
2131 ${joining_forms{$1}}[hex $2] = $src;
2134 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
2136 # decomposition "<compat> 0020 1234" -> combining accent
2138 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
2140 # store decomposition
2141 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
2143 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
2145 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
2147 my $dst = hex $1;
2148 # Single char decomposition
2149 $decomp_table[$src] = $decomp_compat_table[$src] = [ $dst ];
2150 $cjk_compat_table[$src] = $dst if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
2154 close $UNICODE_DATA;
2156 # patch the category of some special characters
2158 for (my $i = 0; $i < @decomp_table; $i++)
2160 next unless defined $decomp_table[$i];
2161 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
2163 foreach my $cat (keys %special_categories)
2165 my $flag = $ctype{$cat};
2166 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
2168 for (my $i = 0; $i < @decomp_compat_table; $i++)
2170 next unless defined $decomp_compat_table[$i];
2171 next unless @{$decomp_compat_table[$i]} == 2;
2172 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
2175 # load the composition exclusions
2177 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
2178 while (<$EXCL>)
2180 s/\#.*//; # remove comments
2181 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
2183 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
2185 elsif (/^([0-9a-fA-F]+)\s*$/)
2187 $comp_exclusions[hex $1] = 1;
2190 close $EXCL;
2192 # load the IDNA mappings
2194 @idna_decomp_table = @decomp_compat_table;
2195 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
2196 while (<$IDNA>)
2198 s/\#.*//; # remove comments
2199 next if /^\s*$/;
2200 my ($char, $type, $mapping) = split /;/;
2201 my ($ch1, $ch2);
2202 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
2204 $ch1 = hex $1;
2205 $ch2 = hex $2;
2207 elsif ($char =~ /([0-9a-fA-F]+)/)
2209 $ch1 = $ch2 = hex $1;
2212 if ($type =~ /mapped/ || $type =~ /deviation/)
2214 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
2215 my @seq = map { hex $_; } split /\s+/, $mapping;
2216 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
2218 elsif ($type =~ /valid/)
2221 elsif ($type =~ /ignored/)
2223 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
2225 elsif ($type =~ /disallowed/)
2227 foreach my $i ($ch1 .. $ch2)
2229 $idna_decomp_table[$i] = undef;
2230 $idna_disallowed[$i] = 1;
2234 close $IDNA;
2236 # load the Unihan mappings
2238 my $UNIHAN = open_data_file( $UNIHAN, "Unihan_Variants.txt" );
2239 while (<$UNIHAN>)
2241 s/\#.*//; # remove comments
2242 next if /^\s*$/;
2243 if (/^U\+([0-9a-fA-F]+)\s+kTraditionalVariant\s+U\+([0-9a-fA-F]+)/)
2245 $chinese_traditional_table[hex $1] = hex $2;
2247 elsif (/^U\+([0-9a-fA-F]+)\s+kSimplifiedVariant\s+U\+([0-9a-fA-F]+)/)
2249 $chinese_simplified_table[hex $1] = hex $2;
2252 close $UNIHAN;
################################################################
# add a new registry key
#
# Records $key in %registry_keys with a one-element list holding
# $defval.  A key that already has an entry is left untouched.
sub add_registry_key($$)
{
    my ($key, $defval) = @_;
    $registry_keys{$key} //= [ $defval ];
}
################################################################
# add a new registry value
#
# Makes sure $key has an entry (created with an undefined default
# value when it is new) and appends the text "'$name' = s '$value'"
# to that entry's list.
sub add_registry_value($$$)
{
    my ($key, $name, $value) = @_;
    my $line = "'$name' = s '$value'";

    add_registry_key( $key, undef );
    push @{$registry_keys{$key}}, $line;
}
################################################################
# define a new lead byte
#
# Appends $byte to @lead_bytes and stores 0 in its @cp2uni slot.
# Nothing happens when that slot already holds a defined value.
sub add_lead_byte($)
{
    my ($byte) = @_;

    unless (defined $cp2uni[$byte])
    {
        push @lead_bytes, $byte;
        $cp2uni[$byte] = 0;
    }
}
################################################################
# define a new char mapping
#
# Records the pair in both directions (@cp2uni and @uni2cp).  The
# first mapping stored for a given index wins; later ones for the
# same index are ignored.  A code page value above 0xff also
# registers its high byte as a lead byte.
sub add_mapping($$)
{
    my ($cp, $uni) = @_;

    $cp2uni[$cp]  //= $uni;
    $uni2cp[$uni] //= $cp;
    add_lead_byte( $cp >> 8 ) if $cp > 0xff;
}
################################################################
# get a mapping including glyph chars for MB_USEGLYPHCHARS
#
# Returns a copy of the table passed in, with every slot that has a
# defined entry in @glyph2uni replaced by that entry.
sub get_glyphs_mapping(@)
{
    my @table = @_;

    foreach my $idx (grep { defined $glyph2uni[$_] } 0 .. $#glyph2uni)
    {
        $table[$idx] = $glyph2uni[$idx];
    }
    return @table;
}
################################################################
# build EUC-JP table from the JIS 0208/0212 files
#
# Resets the global mapping tables, fills them with the fixed
# mappings below plus the contents of JIS0208.TXT and JIS0212.TXT,
# then writes the result as code page 20932.  The order of the
# add_mapping calls matters: the first mapping stored for a value
# is the one that is kept.
sub dump_eucjp_codepage()
{
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = 0x30fb;

    # ASCII chars
    add_mapping( $_, $_ ) for 0x00 .. 0x7f;

    # lead bytes
    add_lead_byte( $_ ) for 0x8e, 0xa1 .. 0xfe;

    # JIS X 0201 right plane
    add_mapping( 0x8e00 + $_, 0xfec0 + $_ ) for 0xa1 .. 0xdf;

    # undefined chars
    $cp2uni[$_] = $_ for 0x80 .. 0x8d, 0x8f .. 0x9f;
    $cp2uni[0xa0] = 0xf8f0;
    $cp2uni[0xff] = 0xf8f3;

    # Fix backslash conversion
    add_mapping( 0xa1c0, 0xff3c );

    # Add private mappings for rows undefined in JIS 0208/0212:
    # first the 0xa1..0xfe trail bytes of every row, then the
    # 0x21..0x7e ones, numbering the private code points consecutively
    my $private = 0xe000;
    foreach my $trail ([ 0xa1, 0xfe ], [ 0x21, 0x7e ])
    {
        foreach my $hi (0xf5 .. 0xfe)
        {
            foreach my $lo ($trail->[0] .. $trail->[1])
            {
                add_mapping( ($hi << 8) + $lo, $private++ );
            }
        }
    }

    my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "Unrecognized line $_\n"
            unless /^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex $1, hex $2 );
    }
    close $INPUT;

    $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "Unrecognized line $_\n"
            unless /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8000 + hex $1, hex $2 );
    }
    close $INPUT;

    output_codepage_file( 20932 );
}
################################################################
# build Korean Wansung table from the KSX1001 file
#
# Takes the code page 949 unicode -> code page table as argument.
# Resets the global mapping tables, loads KSX1001.TXT, borrows the
# cp949 mappings that fit the lead bytes defined so far, and writes
# the result as code page 20949.
sub dump_krwansung_codepage(@)
{
    my @cp949 = @_;
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = 0x3f;
    $default_wchar = 0x003f;

    # ASCII and undefined chars
    add_mapping( $_, $_ ) for 0x00 .. 0x9f;
    my @fixed =
    (
        [ 0xa0, 0xf8e6 ],
        [ 0xad, 0xf8e7 ],
        [ 0xae, 0xf8e8 ],
        [ 0xaf, 0xf8e9 ],
        [ 0xfe, 0xf8ea ],
        [ 0xff, 0xf8eb ],
    );
    add_mapping( $_->[0], $_->[1] ) for @fixed;

    my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "Unrecognized line $_\n"
            unless /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex $1, hex $2 );
    }
    close $INPUT;

    # get some extra mappings from cp 949; the set of lead bytes is
    # snapshotted here, before any of the extra mappings is added
    my %known_lead = map { $_ => 1 } @lead_bytes;
    foreach my $uni (0x0000 .. 0xffff)
    {
        next if $uni >= 0x1100 && $uni <= 0x11ff;  # range not used in 20949
        my $cp = $cp949[$uni];
        next unless defined $cp;
        if ($cp >= 0xff)
        {
            # only add chars for lead bytes that exist in 20949
            my ($hi, $lo) = ($cp >> 8, $cp & 0xff);
            next unless $known_lead{$hi};
            next unless $lo >= 0xa1 && $lo <= 0xfe;
        }
        add_mapping( $cp, $uni );
    }

    output_codepage_file( 20949 );
}
################################################################
# build the sort keys table
#
# Reads the collation data file $SORTKEYS, keeps the four weights of
# every single-character entry up to 0xffff, renumbers the weights so
# that they fit in 32 bits, and writes the resulting table to
# $filename as a two-level C mapping.
sub dump_sortkeys($)
{
    my $filename = shift;
    my @sortkeys = ();

    my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
    while (<$INPUT>)
    {
        # skip comments, empty lines, ^Z and the @version header
        next if /^\#/ || /^$/ || /\x1a/ || /^\@version/;

        if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ] unless $uni > 65535;
            next;
        }

        # multiple character sequence, ignored for now
        next if /^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/;

        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close $INPUT;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    @sortkeys = sort { $a->[1] <=> $b->[1] or
                       $a->[2] <=> $b->[2] or
                       $a->[3] <=> $b->[3] or
                       $a->[4] <=> $b->[4] or
                       $a cmp $b; } @sortkeys;

    # $n2/$n3 count the distinct second/third weights seen so far under
    # the current first (resp. first and second) weight; @keys remembers
    # the weights of the previous entry
    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    foreach my $entry (@sortkeys)
    {
        next unless defined $entry;
        my @current = @{$entry};

        if ($current[1] != $keys[1])
        {
            # new primary weight: restart both counters
            @keys[1, 2, 3] = @current[1, 2, 3];
            $n2 = 1;
            $n3 = 1;
        }
        elsif ($current[2] != $keys[2])
        {
            # new secondary weight under the same primary
            @keys[2, 3] = @current[2, 3];
            $n2++;
            $n3 = 1;
            die if ($n2 >= 256);
        }
        elsif ($current[3] != $keys[3])
        {
            # new tertiary weight under the same primary and secondary
            $keys[3] = $current[3];
            $n3++;
            die if ($n3 >= 16);
        }

        # zero weights stay zero, the others get their renumbered value
        $current[2] = $n2 if $current[2];
        $current[3] = $n3 if $current[3];
        $current[4] = 1 if $current[4];

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    printf OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump an array of integers
#
# Formats the values as zero-padded hex literals of $bit_width / 4
# digits, separated by commas with a line break after every eighth
# value, and returns the text.  Undefined entries are printed as
# $default.  An empty list yields a single $default entry.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;

    # an empty list is formatted like a list holding one undefined entry
    my @values = @array ? @array : (undef);
    my $text = " ";

    foreach my $idx (0 .. $#values)
    {
        $text .= sprintf($format, $values[$idx] // $default);
        next if $idx == $#values;   # no separator after the last value
        $text .= ($idx % 8 == 7) ? ",\n " : ", ";
    }
    return $text;
}
################################################################
# dump an SBCS mapping table in binary format
#
# Writes to the OUTPUT handle, in this order: seven little-endian
# 16-bit header words, twelve zero bytes, the 256-entry code page ->
# unicode table preceded by the offset word, the optional glyph
# table, two zero words, and one byte per unicode value 0..65535
# giving its code page char ($default_char when unmapped).
sub dump_binary_sbcs_table($)
{
    my $codepage = shift;

    my $has_glyphs = @glyph2uni ? 1 : 0;
    my $wc_offset = 256 + 3 + ($has_glyphs ? 256 : 0);
    my @header =
    (
        13, $codepage, 1,
        $default_char, $default_wchar,
        $cp2uni[$default_char], $uni2cp[$default_wchar],
    );
    my @cp_table = map { $_ || 0; } @cp2uni[0 .. 255];
    my @glyph_section = $has_glyphs
        ? ( 256, get_glyphs_mapping(@cp2uni[0 .. 255]) )
        : ( 0 );
    my @wc_table = map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];

    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "C12", (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, @cp_table;
    print OUTPUT pack "S<*", @glyph_section;
    print OUTPUT pack "S<*", 0, 0;
    print OUTPUT pack "C*", @wc_table;
}
################################################################
# dump a DBCS mapping table in binary format
#
# Writes to the OUTPUT handle, in this order: seven little-endian
# 16-bit header words, twelve bytes of lead byte ranges padded with
# zeros, the 256-entry single byte table preceded by the offset
# word, a word 0, the number of lead byte ranges and the 256 per-byte
# offsets, one 256-entry table per lead byte, a word 4, and one word
# per unicode value 0..65535 giving its code page char.
#
# As a side effect the @cp2uni slot of every lead byte is set to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets a 256-entry table; tables are laid out in
    # @lead_bytes order, the first one at offset 256
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # pad the range list with a real list of zeros: "0 x 12" would be
    # string repetition and yield the single value "000000000000"
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
################################################################
# get the list of defined lead byte ranges
#
# Returns a flat list of (first, last) pairs, one pair per run of
# consecutive byte values 0..255 that appear in @lead_bytes.
sub get_lb_ranges()
{
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @ranges;
    my $in_range = 0;

    foreach my $byte (0 .. 255)
    {
        if ($in_range && !$is_lead{$byte})
        {
            # the run ended on the previous byte
            push @ranges, $byte - 1;
            $in_range = 0;
        }
        elsif (!$in_range && $is_lead{$byte})
        {
            push @ranges, $byte;
            $in_range = 1;
        }
    }
    # close a run that extends to the last byte value
    push @ranges, 0xff if $in_range;
    return @ranges;
}
################################################################
# dump the Indic Syllabic Category table
#
# Builds one table from two data files: the low bits of each entry
# hold the %indic_types value from IndicSyllabicCategory.txt, and the
# %matra_types value from IndicPositionalCategory.txt is or'ed in
# shifted left by 8.  The table is written to $filename as a
# two-level C mapping.
sub dump_indic($)
{
    my $filename = shift;
    my @indic_table;

    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown indic $type" unless defined $indic_types{$type};

        # entries touching code points above 0xffff are dropped entirely
        next unless $first < 65536 and $last < 65536;
        foreach my $cp ($first .. $last)
        {
            $indic_table[$cp] = $indic_types{$type};
        }
    }
    close $INPUT;

    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown matra $type" unless defined $matra_types{$type};

        foreach my $cp ($first .. $last)
        {
            $indic_table[$cp] |= $matra_types{$type} << 8;
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n";
    print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
    print OUTPUT "/* and from $UNIDATA:IndicPositionalCategory.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Line Break Properties table
#
# Reads LineBreak.txt, maps every listed code point or code point
# range to its %break_types value, and writes the table to $filename
# as a two-level C mapping with 'XX' as the default class.
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        # three-character class names are tried before two-character ones,
        # single code points before ranges
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown breaktype $type" unless defined $break_types{$type};

        foreach my $cp ($first .. $last)
        {
            $break_table[$cp] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n";
    print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
# Script names and the numeric id assigned to each of them.  The ids
# are consecutive, starting at 0 for "Unknown"; the group comments
# are carried over from the original table.
my %scripts =
(
    Unknown                => 0,
    Common                 => 1,
    Inherited              => 2,
    Arabic                 => 3,
    Armenian               => 4,
    Avestan                => 5,
    Balinese               => 6,
    Bamum                  => 7,
    Batak                  => 8,
    Bengali                => 9,
    Bopomofo               => 10,
    Brahmi                 => 11,
    Braille                => 12,
    Buginese               => 13,
    Buhid                  => 14,
    Canadian_Aboriginal    => 15,
    Carian                 => 16,
    Cham                   => 17,
    Cherokee               => 18,
    Coptic                 => 19,
    Cuneiform              => 20,
    Cypriot                => 21,
    Cyrillic               => 22,
    Deseret                => 23,
    Devanagari             => 24,
    Egyptian_Hieroglyphs   => 25,
    Ethiopic               => 26,
    Georgian               => 27,
    Glagolitic             => 28,
    Gothic                 => 29,
    Greek                  => 30,
    Gujarati               => 31,
    Gurmukhi               => 32,
    Han                    => 33,
    Hangul                 => 34,
    Hanunoo                => 35,
    Hebrew                 => 36,
    Hiragana               => 37,
    Imperial_Aramaic       => 38,
    Inscriptional_Pahlavi  => 39,
    Inscriptional_Parthian => 40,
    Javanese               => 41,
    Kaithi                 => 42,
    Kannada                => 43,
    Katakana               => 44,
    Kayah_Li               => 45,
    Kharoshthi             => 46,
    Khmer                  => 47,
    Lao                    => 48,
    Latin                  => 49,
    Lepcha                 => 50,
    Limbu                  => 51,
    Linear_B               => 52,
    Lisu                   => 53,
    Lycian                 => 54,
    Lydian                 => 55,
    Malayalam              => 56,
    Mandaic                => 57,
    Meetei_Mayek           => 58,
    Mongolian              => 59,
    Myanmar                => 60,
    New_Tai_Lue            => 61,
    Nko                    => 62,
    Ogham                  => 63,
    Ol_Chiki               => 64,
    Old_Italic             => 65,
    Old_Persian            => 66,
    Old_South_Arabian      => 67,
    Old_Turkic             => 68,
    Oriya                  => 69,
    Osmanya                => 70,
    Phags_Pa               => 71,
    Phoenician             => 72,
    Rejang                 => 73,
    Runic                  => 74,
    Samaritan              => 75,
    Saurashtra             => 76,
    Shavian                => 77,
    Sinhala                => 78,
    Sundanese              => 79,
    Syloti_Nagri           => 80,
    Syriac                 => 81,
    Tagalog                => 82,
    Tagbanwa               => 83,
    Tai_Le                 => 84,
    Tai_Tham               => 85,
    Tai_Viet               => 86,
    Tamil                  => 87,
    Telugu                 => 88,
    Thaana                 => 89,
    Thai                   => 90,
    Tibetan                => 91,
    Tifinagh               => 92,
    Ugaritic               => 93,
    Vai                    => 94,
    Yi                     => 95,
    # Win8/Win8.1
    Chakma                 => 96,
    Meroitic_Cursive       => 97,
    Meroitic_Hieroglyphs   => 98,
    Miao                   => 99,
    Sharada                => 100,
    Sora_Sompeng           => 101,
    Takri                  => 102,
    # Win10
    Bassa_Vah              => 103,
    Caucasian_Albanian     => 104,
    Duployan               => 105,
    Elbasan                => 106,
    Grantha                => 107,
    Khojki                 => 108,
    Khudawadi              => 109,
    Linear_A               => 110,
    Mahajani               => 111,
    Manichaean             => 112,
    Mende_Kikakui          => 113,
    Modi                   => 114,
    Mro                    => 115,
    Nabataean              => 116,
    Old_North_Arabian      => 117,
    Old_Permic             => 118,
    Pahawh_Hmong           => 119,
    Palmyrene              => 120,
    Pau_Cin_Hau            => 121,
    Psalter_Pahlavi        => 122,
    Siddham                => 123,
    Tirhuta                => 124,
    Warang_Citi            => 125,
    # Win10 RS1
    Adlam                  => 126,
    Ahom                   => 127,
    Anatolian_Hieroglyphs  => 128,
    Bhaiksuki              => 129,
    Hatran                 => 130,
    Marchen                => 131,
    Multani                => 132,
    Newa                   => 133,
    Old_Hungarian          => 134,
    Osage                  => 135,
    SignWriting            => 136,
    Tangut                 => 137,
    # Win10 RS4
    Masaram_Gondi          => 138,
    Nushu                  => 139,
    Soyombo                => 140,
    Zanabazar_Square       => 141,
    # Win10 1903
    Dogra                  => 142,
    Gunjala_Gondi          => 143,
    Hanifi_Rohingya        => 144,
    Makasar                => 145,
    Medefaidrin            => 146,
    Old_Sogdian            => 147,
    Sogdian                => 148,
    # Win10 2004
    Elymaic                => 149,
    Nyiakeng_Puachue_Hmong => 150,
    Nandinagari            => 151,
    Wancho                 => 152,
    # Win11
    Chorasmian             => 153,
    Dives_Akuru            => 154,
    Khitan_Small_Script    => 155,
    Yezidi                 => 156,
);
################################################################
# dump Script IDs table
#
# Reads Scripts.txt and writes two files: "$filename.h" with the
# unicode_script_id enum generated from %scripts, and "$filename.c"
# with the code point -> script id table as a two-level C mapping.
# Lines naming a script that is not in %scripts, and lines that do
# not match either pattern, are silently ignored.
sub dump_scripts($)
{
    my $filename = shift;
    my $header = $filename;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        my $type = "";

        next if /^\#/;      # skip comments
        next if /^\s*$/;    # skip empty lines
        next if /\x1a/;     # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            $type = $2;
            if (defined $scripts{$type})
            {
                $scripts_table[hex $1] = $scripts{$type};
            }
            next;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            $type = $3;
            if (defined $scripts{$type})
            {
                foreach my $i (hex $1 .. hex $2)
                {
                    $scripts_table[$i] = $scripts{$type};
                }
            }
            next;
        }
    }

    close $INPUT;

    $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT " Script_$script = $scripts{$script},\n";
    }
    print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    $filename = "$filename.c";
    # report the file that actually failed to open, not the header
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the BiDi mirroring table
#
# Reads the "code point; mirrored code point" pairs of
# BidiMirroring.txt and writes them to $filename as a two-level C
# mapping whose default value is 0.
sub dump_mirroring($)
{
    my $filename = shift;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/;
        my ($char, $mirror) = (hex($1), hex($2));
        $mirror_table[$char] = $mirror;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Bidi Brackets
#
# Reads BidiBrackets.txt and writes a two-level C mapping to
# $filename.  Each entry stores the distance to the paired bracket
# in its low bits and the %bracket_types value shifted left by 8.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/;

        my ($char, $pair, $type) = (hex($1), hex($2), $3);
        die "unknown bracket $type" unless defined $bracket_types{$type};
        die "characters too distant $1 and $2" if abs($pair - $char) >= 128;

        # NOTE(review): "% 255" maps a distance of -1 to 254, not to the
        # two's complement byte 255 that "% 256" would give; confirm how
        # the consumer of this table decodes negative distances.
        my $distance = ($pair - $char) % 255;
        $bracket_table[$char] = $distance + ($bracket_types{$type} << 8);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table
#
# Starts from @initial_joining_table, overrides it with the joining
# type of every character listed in ArabicShaping.txt, and writes
# the result to $filename as a two-level C mapping.  Also emits
# wine_shaping_forms, giving for each of 0x600..0x6ff its isolated,
# final, initial and medial form, or the character itself when
# %joining_forms has no such form.
sub dump_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        my ($char, $type) = (hex($1), $2);
        $joining_table[$char] = $joining_types{$type};
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $char (0x600 .. 0x6ff)
    {
        my @forms = map { ${joining_forms{$_}}[$char] || $char; } ("isolated", "final", "initial", "medial");
        printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table
#
# Starts from @initial_joining_table and overrides it from
# ArabicShaping.txt.  Characters whose joining group is "ALAPH" or
# "DALATH RISH" get the %joining_types value of that group; all
# others get the value of their joining type.  The result is written
# to $filename as a two-level C mapping.
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;      # skip comments
        next if /^\s*$/;    # skip empty lines
        next if /\x1a/;     # skip ^Z
        # the joining group may consist of several words ("DALATH RISH"),
        # so capture all of them instead of only the first one
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?:\s+\w+)*)/)
        {
            my $code = hex $1;
            my $type = $2;
            my $group = $3;

            if ($group eq "ALAPH" || $group eq "DALATH RISH")
            {
                $joining_table[$code] = $joining_types{$group};
            }
            else
            {
                $joining_table[$code] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Vertical Orientation table
#
# Reads VerticalOrientation.txt and writes the %vertical_types value
# of every code point to $filename as a two-level C mapping with 'R'
# as the default.  When $unix is true the output starts with a
# "#pragma makedep unix" marker wrapped in "#if 0".
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            my ($char, $type) = (hex($1), $2);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            # single code points above 0xffff are not stored
            $vertical_table[$char] = $vertical_types{$type} if $char < 65536;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            foreach my $cp ($first .. $last)
            {
                $vertical_table[$cp] = $vertical_types{$type};
            }
        }
        else
        {
            die "malformed line $_";
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n";
    print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
        print OUTPUT "#if 0\n";
        print OUTPUT "#pragma makedep unix\n";
        print OUTPUT "#endif\n\n";
    }
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# compress a mapping table by removing identical rows
#
# Splits the table into $rows rows of equal length, replacing
# undefined entries by $def, and stores the rows in one shared data
# area.  A row already present anywhere in the data is reused, and a
# new row may overlap the tail of the data when that tail matches
# the start of the row.
#
# Returns the $rows row offsets (each one is $rows plus the row's
# position in the data) followed by the data values.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $len = @table / $rows;
    my @offsets;
    my $data = "";

    # each row is packed into a string with one character per value,
    # so that rows can be located with plain substring searches
    foreach my $row (0 .. $rows - 1)
    {
        my @cells = map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "U*", @cells;
        my $pos = index $data, $rowtxt;
        if ($pos == -1)
        {
            # check if the tail of the data can match the start of the new row
            my $first = substr( $rowtxt, 0, 1 );
            my $tail = length($data) - 1;
            while ($tail > 0)
            {
                # jump to the next place in the tail where the row could start
                my $hit = index( substr( $data, -$tail ), $first );
                last if $hit == -1;
                $tail -= $hit;
                if (substr( $data, -$tail ) eq substr( $rowtxt, 0, $tail ))
                {
                    # drop the overlapping tail, the row is appended in full below
                    substr( $data, -$tail ) = "";
                    last;
                }
                $tail--;
            }
            $pos = length $data;
            $data .= $rowtxt;
        }
        $offsets[$row] = $rows + $pos;
    }
    return @offsets, unpack "U*", $data;
}
################################################################
# dump a char -> 16-bit value mapping table using two-level tables
#
# Arguments: the C array name, the default value for undefined
# entries, the element size in bits (16 selects "unsigned short",
# anything else "unsigned int"), then the table itself, of which
# entries 0..65535 are used.  The table is compressed in 4096 rows,
# the row offsets again in 256 rows, and the three sections are
# printed to OUTPUT as one C array.
sub dump_two_level_mapping($$@)
{
    my $name = shift;
    my $def = shift;
    my $size = shift;
    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    my @values  = compress_array( 4096, $def, @_[0..65535] );
    my @offsets = compress_array( 256, 0, @values[0..4095] );

    # the level 2 offsets are relative to the start of @values; adjust
    # them for the position of the values in the final array
    my $shift = @offsets - 4096;
    foreach my $i (256 .. $#offsets) { $offsets[$i] += $shift; }

    my $total = @offsets + @values - 4096;
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @offsets[0..255] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @offsets[256..$#offsets] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @values[4096..$#values] );
}
################################################################
# dump a char -> value mapping table using three-level tables
#
# Same arguments as the two-level variant: the C array name, the
# default value, the element size in bits, then the table, of which
# entries 0..$MAX_CHAR are used.  Each level groups 16 entries of
# the level below it; the four sections are printed to OUTPUT as one
# C array.
sub dump_three_level_mapping($$@)
{
    my $name = shift;
    my $def = shift;
    my $size = shift;
    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;
    my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # rebase the offsets of both lower levels on the final array layout
    my $shift2 = @array1 + @array2 - $level2 - $level3;
    for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += $shift2; }
    my $shift1 = @array1 - $level2;
    for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += $shift1; }

    my $total = @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
3337 ################################################################
3338 # dump a binary case mapping table in l_intl.nls format
# Build the binary form of a case mapping table and return it as a string of
# little-endian 16-bit words: a leading size word, the offset tables, and the
# rows of differences (mapped char - char, modulo 0x10000) for chars 0..0xffff.
sub dump_binary_case_table(@)
{
    my @mapping = @_;
    my $char_count = 0x10000;
    my $row_count = $char_count / 16;
    my $block_count = $row_count / 16;

    # store each mapping as a 16-bit difference from the source character
    my @deltas;
    foreach my $ch (0 .. $#mapping)
    {
        my $target = $mapping[$ch];
        next unless defined $target;
        $deltas[$ch] = ($target - $ch) & 0xffff;
    }

    my @rows = compress_array( $row_count, 0, @deltas[0..$char_count-1] );
    my @index = compress_array( $block_count, 0, @rows[0..$row_count-1] );

    # the row index entries of @rows are dropped from the output, so rebase
    # the second-level offsets accordingly
    my $rebase = @index - $row_count;
    $_ += $rebase foreach @index[$block_count..$#index];

    my $size_word = 1 + $rebase + @rows;
    return pack "S<*", $size_word, @index, @rows[$row_count..$#rows];
}
3360 ################################################################
3361 # dump case mappings for l_intl.nls
# Write the case mapping file (l_intl.nls format) to "$filename.new" and hand
# it to save_file().  The file holds a version word followed by the upper and
# lower case tables, built from the global tables after
# remove_linguistic_mappings() has been applied to private copies.
sub dump_intl_nls($)
{
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    # OUTPUT is the handle shared by the dump helpers of this script
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot write $filename.new: $!";
    save_file($filename);
}
3384 ################################################################
3385 # dump the bidi direction table
# Write the BiDi direction table as C source to "$filename.new" and hand it to
# save_file().  Characters 0..65535 with a known direction are mapped through
# %bidi_types; the "L" type is passed to dump_two_level_mapping() as the
# value for the remaining entries.
sub dump_bidi_dir_table($)
{
    my $filename = shift;

    # OUTPUT is the handle that dump_two_level_mapping() prints to
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot write $filename.new: $!";
    save_file($filename);
}
# Rotate an 8-bit value left by $count bits and return the resulting byte.
sub rol($$)
{
    my ($byte, $count) = @_;
    my $shifted_out = $byte >> (8 - $count);   # bits that wrap around
    my $shifted_up = $byte << $count;
    return ($shifted_up | $shifted_out) & 0xff;
}
3415 ################################################################
3416 # compress the character properties table
# Compress the character properties table by sharing identical rows.
#   $rows - number of rows the table is split into
#   rest  - the table values; undefined entries are stored as 0x7f
# Returns the list of per-row indexes ($rows entries) followed by the contents
# of every newly created row.  Rows consisting entirely of one of the
# predefined values reuse the fixed indexes 0 and 0xfb..0xff instead.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $row_len = @table / $rows;
    my $next_index = 0;
    my @result = (0) x $rows;
    my %known_rows;

    # add some predefined sequences
    foreach my $id (0, 0xfb .. 0xff)
    {
        my $uniform_row = pack "L*", (rol($id,5)) x $row_len;
        $known_rows{$uniform_row} = $id;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my $start = $row * $row_len;
        my @values = map { defined($_) ? $_ : 0x7f } @table[$start..($start + $row_len - 1)];
        my $packed = pack "L*", @values;

        unless (defined $known_rows{$packed})
        {
            # create a new row
            $next_index++;
            $known_rows{$packed} = $next_index;
            push @result, @values;
        }
        # point this row at the (possibly just created) shared row
        $result[$row] = $known_rows{$packed};
    }
    return @result;
}
3449 ################################################################
3450 # dump a normalization table in binary format
# Build the binary normalization table for one normalization form and write it
# to "$filename.new" through the OUTPUT handle, then call save_file() and
# register the file under the "Normalization" registry key.
# The form is derived from the file name, which is expected to look like
# ".../norm<type>.nls" with <type> being one of the keys of %forms.
3451 sub dump_norm_table($)
3453 my $filename = shift;
# form name -> numeric id; bit 0 of the id is tested below as "compose" and
# bit 2 as "compat"
3455 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
# decomposition table used for each form
3456 my %decomp = ( "nfc" => \@decomp_table,
3457 "nfd" => \@decomp_table,
3458 "nfkc" => \@decomp_compat_table,
3459 "nfkd" => \@decomp_compat_table ,
3460 "idna" => \@idna_decomp_table );
3462 open OUTPUT,">$filename.new" or die "Cannot create $filename";
3463 print "Building $filename\n";
# extract the form name from the file name
3465 my $type = $filename;
3466 $type =~ s!.*/norm(\w+)\.nls!$1!;
# $compat is 0 for nfc/nfd, 1 for nfkc/nfkd and 2 for idna
3468 my $compose = $forms{$type} & 1;
3469 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
3471 my @version = split /\./, $UNIVERSION;
3473 # combining classes
# @classes maps a combining class value (below 0x100) to its index in
# @class_values, which lists the distinct values in increasing order
3475 my @classes;
3476 my @class_values;
3478 foreach my $c (grep defined, @combining_class_table)
3480 $classes[$c] = 1 if $c < 0x100;
3482 for (my $i = 0; $i < @classes; $i++)
3484 next unless defined $classes[$i];
3485 $classes[$i] = @class_values;
3486 push @class_values, $i;
# keep an even number of byte entries; the table offsets computed below use
# @class_values / 2
3488 push @class_values, 0 if (@class_values % 2);
3489 die "too many classes" if @class_values >= 0x40;
3491 # character properties
# @char_props holds one property byte per character, @decomposed the UTF-16
# decomposition of each decomposable character, and @comp_hash_table the
# composition triplets grouped by hash bucket
3493 my @char_props;
3494 my @decomposed;
3495 my @comp_hash_table;
3496 my $comp_hash_size = $compose ? 254 : 0;
3498 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3500 next unless defined $combining_class_table[$i];
3501 if (defined $decomp{$type}->[$i])
3503 my @dec = get_decomposition( $i, $decomp{$type} );
3504 if ($compose && (my @comp = get_composition( $i, $compat )))
# the composition bucket is selected from the two source characters
3506 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
3507 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
# use the class index of the first decomposed character that has a non-zero
# combining class (index of class 0 if there is none)
3509 my $val = 0;
3510 foreach my $d (@dec)
3512 $val = $combining_class_table[$d];
3513 last if $val;
3515 $char_props[$i] = $classes[$val];
3517 else
3519 $char_props[$i] = 0xbf;
3521 @dec = compose_hangul( @dec ) if $compose;
3522 @dec = to_utf16( @dec );
# sequences of 7 or more units get a terminating 0; their stored length is
# capped at 7 further down
3523 push @dec, 0 if @dec >= 7;
3524 $decomposed[$i] = \@dec;
3526 else
3528 if ($combining_class_table[$i] == 0x100)
3530 $char_props[$i] = 0x7f;
3532 elsif ($combining_class_table[$i])
3534 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
3536 elsif ($type eq "idna" && defined $idna_disallowed[$i])
3538 $char_props[$i] = 0xff;
3540 else
3542 $char_props[$i] = 0;
# for composing forms, flag the characters that take part in a composition;
# which bits are set depends on whether the second character has a non-zero
# combining class
3547 if ($compose)
3549 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3551 my @comp = get_composition( $i, $compat );
3552 next unless @comp;
3553 if ($combining_class_table[$comp[1]])
3555 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
3556 $char_props[$comp[1]] |= 0x40;
3558 else
3560 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
3561 $char_props[$comp[1]] |= 0xc0;
# the fixed ranges below override whatever was computed above
3566 # surrogates
3567 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
3568 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
3570 # Hangul
3571 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
3572 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
3573 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
3575 # invalid chars
3576 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
3577 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
# the last two code points of each of the 17 planes
3578 foreach my $i (0x00..0x10)
3580 $char_props[($i << 16) | 0xfffe] = 0xff;
3581 $char_props[($i << 16) | 0xffff] = 0xff;
3584 # decomposition hash table
3586 my @decomp_hash_table;
3587 my @decomp_hash_index;
3588 my @decomp_hash_data;
3589 my $decomp_hash_size = 944;
3591 # build string of character data, reusing substrings when possible
# longest sequences are added first so that shorter ones can be found inside them
3592 my $decomp_char_data = "";
3593 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
3595 my $str = pack "U*", @{$i};
3596 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
# each hash entry is [ character, (length << 13) | position in the char data ],
# with the length capped at 7; the bucket is the character modulo the hash size
3598 for (my $i = 0; $i < @decomposed; $i++)
3600 next unless defined $decomposed[$i];
3601 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
3602 die "sequence not found" if $pos == -1;
3603 my $len = @{$decomposed[$i]};
3604 $len = 7 if $len > 7;
3605 my $hash = $i % $decomp_hash_size;
3606 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
# the index normally holds the position of the bucket's first pair in
# @decomp_hash_data; a bucket with a single entry whose character has the
# property 0xbf stores the packed length/position value in the index instead
3608 for (my $i = 0; $i < $decomp_hash_size; $i++)
3610 $decomp_hash_index[$i] = @decomp_hash_data / 2;
3611 next unless defined $decomp_hash_table[$i];
3612 if (@{$decomp_hash_table[$i]} == 1)
3614 my $entry = $decomp_hash_table[$i]->[0];
3615 if ($char_props[$entry->[0]] == 0xbf)
3617 $decomp_hash_index[$i] = $entry->[1];
3618 next;
3621 foreach my $entry (@{$decomp_hash_table[$i]})
3623 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
# terminating entry
3626 push @decomp_hash_data, 0, 0;
3628 # composition hash table
# the index has one extra entry marking the end of the last bucket
3630 my @comp_hash_index;
3631 my @comp_hash_data;
3632 if (@comp_hash_table)
3634 for (my $i = 0; $i < $comp_hash_size; $i++)
3636 $comp_hash_index[$i] = @comp_hash_data;
3637 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
3639 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
3640 push @comp_hash_data, 0, 0, 0;
# compress the property bytes into rows of 128 characters
3643 my $level1 = ($MAX_CHAR + 1) / 128;
3644 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
3646 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
3647 0, $decomp_hash_size, $comp_hash_size, 0 );
3648 my @tables = (0) x 8;
# table offsets; byte-sized tables contribute half their element count, so the
# offsets appear to be counted in 16-bit words (the 16 covers the name field
# written first) -- NOTE(review): confirm against the reader of this format
3650 $tables[0] = 16 + @header + @tables;
3651 $tables[1] = $tables[0] + @class_values / 2;
3652 $tables[2] = $tables[1] + $level1 / 2;
3653 $tables[3] = $tables[2] + (@rows - $level1) / 2;
3654 $tables[4] = $tables[3] + @decomp_hash_index;
3655 $tables[5] = $tables[4] + @decomp_hash_data;
3656 $tables[6] = $tables[5] + length $decomp_char_data;
3657 $tables[7] = $tables[6] + @comp_hash_index;
# write the file: name, header, offsets, then the tables in offset order
3659 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
3660 print OUTPUT pack "S<*", @header;
3661 print OUTPUT pack "S<*", @tables;
3662 print OUTPUT pack "C*", @class_values;
3664 print OUTPUT pack "C*", @rows[0..$level1-1];
3665 print OUTPUT pack "C*", @rows[$level1..$#rows];
3666 print OUTPUT pack "S<*", @decomp_hash_index;
3667 print OUTPUT pack "S<*", @decomp_hash_data;
3668 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
3669 print OUTPUT pack "S<*", @comp_hash_index;
3670 print OUTPUT pack "S<*", @comp_hash_data;
3672 close OUTPUT;
3673 save_file($filename);
3675 add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
3679 ################################################################
3680 # output a codepage definition file from the global tables
# Write the binary definition file nls/c_NNN.nls for $codepage from the global
# tables and register it under the "Codepage" registry key.  A single-byte
# table is written when no lead bytes are defined, a double-byte one otherwise.
sub output_codepage_file($)
{
    my ($codepage) = @_;
    my $nls_name = sprintf( "c_%03d.nls", $codepage );
    my $output = "nls/$nls_name";

    open OUTPUT,">$output.new" or die "Cannot create $output";
    printf "Building %s\n", $output;

    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), $nls_name );
}
3698 ################################################################
3699 # output a codepage table from a Microsoft-style mapping file
# Read a Microsoft-style code page description ($filename inside the
# $MSCODEPAGES archive), fill the global mapping tables from it and write the
# resulting code page file with output_codepage_file().
# The global tables (@cp2uni, @glyph2uni, @lead_bytes, @uni2cp) and default
# characters are reset first.  Dies on an unrecognized line or when the file
# contains no CODEPAGE line.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";                 # name of the table currently being read
    my ($codepage, $width, $count);
    my ($lb_cur, $lb_end);          # current and last lead byte of the DBCS range

    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # both values are hexadecimal; the first class used to read "A-f",
        # which also accepted G-Z and some punctuation
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # in every table the first mapping seen for a value wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # once the announced number of entries has been read, move on
                # to the next lead byte; after the last one expect a new range
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    die "$filename: no CODEPAGE line found\n" unless defined $codepage;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
3796 ################################################################
3797 # align a string length
# Return $str padded with NUL bytes up to the next multiple of $align bytes
# (unchanged if its length is already a multiple).
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . ("\0" x ($align - $excess));
}
3805 ################################################################
3806 # pad a string with zeros
# Return $str extended with NUL bytes to a length of at least $pad bytes;
# longer strings are returned unchanged.
sub pad_string($$)
{
    my ($pad, $str) = @_;
    my $missing = $pad - length($str);
    return $missing > 0 ? $str . ("\0" x $missing) : $str;
}
3814 ################################################################
3815 # pack a GUID string
# Convert a GUID string of the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx into
# its 16-byte binary form: one little-endian 32-bit value, two little-endian
# 16-bit values and eight single bytes.
# Dies if the argument does not contain a GUID.  The global $_ is left alone.
sub pack_guid($)
{
    my $guid = shift;
    my $byte = '([0-9A-Fa-f]{2})';

    # a list assignment in scalar context yields the number of captures, so a
    # failed match is detected instead of packing stale or undefined values
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-$byte$byte-$byte$byte$byte$byte$byte$byte/
        or die "invalid GUID '$guid'\n";

    return pack "L<S<2C8", map { hex $_ } @fields;
}
3823 ################################################################
3824 # comparison function for compression sort
# Sort comparison for compression entries (array refs in $a and $b): shorter
# entries sort first, entries of equal length are ordered by the numeric
# values of elements 4 through 12, compared in turn.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    return $order if $order;

    foreach my $idx (4 .. 12)
    {
        $order = $a->[$idx] <=> $b->[$idx];
        return $order if $order;
    }
    return $order;
}
3839 ################################################################
3840 # build a binary sort keys table
# Build the binary sort keys file.
#   $filename - output file; written as "$filename.new" and passed to save_file()
#   $download - name of the sorting description inside the $MSDATA download
# The description is parsed into the default sort keys, per-GUID exceptions,
# expansions, compressions, multiple weights and jamo data.  The file written
# consists of a four-entry offset header followed by the sort key table, the
# case mapping tables, the character type table and the sort tables.
# Locale -> sort GUID associations are stored under the "Sorting\Ids" key.
# Returns the character type table as a string.
sub dump_sortkey_table($$)
{
    my ($filename, $download) = @_;

    my @keys;
    my ($part, $section, $subsection, $guid, $version, $ling_flag);
    my @multiple_weights;
    my @expansions;
    my @compressions;
    my %exceptions;
    my %guids;
    my %compr_flags;
    my %locales;
    my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
    my $jamostr = "";

    my $re_hex = '0x[0-9A-Fa-f]+';
    my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
    $guids{$default_guid} = { };

    my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );

    my $KEYS = open_data_file( $MSDATA, $download );

    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    while (<$KEYS>)
    {
        s/\s*;.*$//;
        next if /^\s*$/;  # skip empty lines
        if (/^\s*(SORTKEY|SORTTABLES)/)
        {
            $part = $1;
            next;
        }
        if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
        {
            $part = $section = "";
            next;
        }
        if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
        {
            $section = $1;
            $guid = undef;
            next;
        }
        next unless $part;
        if ("$part.$section" eq "SORTKEY.DEFAULT")
        {
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $keys[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.RELEASE")
        {
            if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                $version = hex $1;
                next;
            }
            if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                # ignore for now
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
               "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
               "$part.$section" eq "SORTTABLES.INVERSECASING")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $guids{$guid}->{flags} |= $flags{$section};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
        {
            if (/^\s*(\d+)\s+(\d+)/)
            {
                push @multiple_weights, $1, $2;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                # the key of an expanded char holds the index of its expansion pair
                my $pos = scalar(@expansions) / 2;
                $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
                push @expansions, hex $2, hex $3;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                $keys[hex $1] = $keys[hex $2];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
            {
                if ($subsection || !$guid)  # start a new one
                {
                    $guid = lc $1;
                    $subsection = "";
                    $guids{$guid} = { } unless defined $guids{$guid};
                    $guids{$guid}->{flags} |= $flags{$2} if $2;
                    $guids{$guid}->{compr} = scalar @compressions;
                    $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
                    $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
                    push @compressions, [ ];
                }
                else  # merge with current one
                {
                    $guids{lc $1} = { } unless defined $guids{lc $1};
                    $guids{lc $1}->{flags} |= $flags{$2} if $2;
                    $guids{lc $1}->{compr} = $guids{$guid}->{compr};
                    $compr_flags{lc $1} = $compr_flags{$guid};
                }
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
            {
                $subsection = $1;
                next;
            }
            if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
            {
                # stored as the four key values followed by the compressed chars
                my @comp = map { hex $_; } split(/\s+/,$1);
                push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
                # add compression flags
                $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{lc $1};
                $ling_flag = ($2 ? "+" : "-");
                $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
        {
            if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
            {
                $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
                next;
            }
        }
        die "$download: $part.$section: unrecognized line $_\n";
    }
    close $KEYS;

    # Sortkey table

    my $table = "";
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
        $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
    }

    # exception tables: an index of 0x100 entries followed by one block of
    # 0x100 keys for each 256-char range that has exceptions or compression flags
    foreach my $id (sort keys %exceptions)
    {
        my $pos = length($table) / 4;
        my @exc = @{$exceptions{$id}};
        my @filled;
        my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
        my $guid = substr( $id, 0, -1 );
        $guids{$guid}->{$key} = $pos;
        $pos += 0x100;
        # unconditional declaration: "my ... if ..." has undefined behaviour
        my @flags = defined $compr_flags{$guid} ? @{$compr_flags{$guid}} : ();
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless defined $exc[$j] || defined $flags[$j];
            $filled[$j >> 8] = 1;
            $j |= 0xff;  # skip the rest of this 256-char range
        }
        for (my $j = 0; $j < 0x100; $j++)
        {
            # ranges without a block of their own point into the default table
            $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
            $pos += 0x100 if $filled[$j];
        }
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless $filled[$j >> 8];
            my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
            $k[3] |= $flags[$j] || 0;
            $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
        }
    }

    # Case mapping tables

    # standard table
    my @casemaps;
    my @upper = @toupper_table;
    my @lower = @tolower_table;
    remove_linguistic_mappings( \@upper, \@lower );
    $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );

    # linguistic table
    $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );

    # Turkish table
    @upper = @toupper_table;
    @lower = @tolower_table;
    $upper[ord 'i'] = 0x130;  # LATIN CAPITAL LETTER I WITH DOT ABOVE
    $lower[ord 'I'] = 0x131;  # LATIN SMALL LETTER DOTLESS I
    $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
    my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );

    # Char type table

    my @table;
    my $types = "";
    my %typestr;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my $str = pack "S<3",
                       ($category_table[$i] || 0) & 0xffff,
                       defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
                       ($category_table[$i] || 0) >> 16;

        # each distinct 6-byte type triplet is stored once and referenced by index
        if (!defined($typestr{$str}))
        {
            $typestr{$str} = length($types) / 6;
            $types .= $str;
        }
        $table[$i] = $typestr{$str};
    }

    my @rows = compress_array( 4096, 0, @table[0..65535] );
    my @array = compress_array( 256, 0, @rows[0..4095] );
    for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
    for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }

    my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
    my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
    $chartypes = align_string( 8, $chartypes . $types . $arraystr );

    # Sort tables

    # guids
    # "scalar keys" gives the number of GUIDs on every perl version; a bare
    # hash in scalar context only does so starting with perl 5.26
    my $sorttables = pack "L<2", $version, scalar keys %guids;
    foreach my $id (sort keys %guids)
    {
        my %guid = %{$guids{$id}};
        my $flags = $guid{flags} || 0;
        my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
        $sorttables .= pack_guid($id) . pack "L<5",
                                             $flags,
                                             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
                                             $guid{except} || 0,
                                             $guid{ling_except} || 0,
                                             $map / 2;
    }

    # expansions
    $sorttables .= pack "L<S<*", scalar(@expansions) / 2, @expansions;

    # compressions
    $sorttables .= pack "L<", scalar @compressions;
    my $rowstr = "";
    foreach my $c (@compressions)
    {
        my $pos = length($rowstr) / 2;
        my $min = 0xffff;
        my $max = 0;
        my @lengths = (0) x 8;
        foreach my $r (sort cmp_compression @{$c})
        {
            my @row = @{$r};
            # a row is four key values plus 2..8 chars; count rows per char count
            $lengths[scalar(@row) - 6]++;
            foreach my $val (@row[4..$#row])
            {
                $min = $val if $min > $val;
                $max = $val if $max < $val;
            }
            $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
            $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
        }
        $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
    }
    $sorttables .= $rowstr;

    # multiple weights
    $sorttables .= align_string( 4, pack "L<C*", scalar(@multiple_weights) / 2, @multiple_weights );

    # jamo sort
    $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;

    # Locales

    add_registry_key( "Sorting\\Ids", "{$default_guid}" );
    foreach my $loc (sort keys %locales)
    {
        # skip specific locales that match more general ones
        my @parts = split /[-_]/, $loc;
        next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
        next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
        add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
    }

    # File header

    my @header;
    $header[0] = 16;
    $header[1] = $header[0] + length $table;
    $header[2] = $header[1] + length $casemaps;
    $header[3] = $header[2] + length $chartypes;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print OUTPUT pack "L<*", @header;
    print OUTPUT $table, $casemaps, $chartypes, $sorttables;
    close OUTPUT;
    save_file($filename);
    return $chartypes;
}
# Locale entries indexed by locale name; assigned from @locales in
# build_locale_data() and consulted by the locale lookup helpers below.
4196 my %lcnames;
# Return the name of the parent of locale $loc: its "sparent" entry if set,
# else its "parent" entry, else the name with its last "-xxx" component
# removed, else "".  Returns undef for an empty or undefined name.
sub locale_parent($)
{
    my $loc = shift;

    return undef unless $loc;

    if (defined $lcnames{$loc})
    {
        foreach my $field ("sparent", "parent")
        {
            my $value = $lcnames{$loc}->{$field};
            return $value if defined $value;
        }
    }
    return $1 if $loc =~ /(.*)-[0-9A-Za-z]+/;
    return "";
}
# Sort comparison for locale names: compares $a and $b as strings after
# mapping upper case ASCII letters to lower case and "_" to "-".
sub compare_locales
{
    my ($left, $right) = ($a, $b);
    tr/A-Z_/a-z-/ foreach ($left, $right);
    return $left cmp $right;
}
4216 # query an xml key
# Run the query $query on $xml with its find() method and return the text
# content of the first result, or undef if find() returns a false value.
# A note is printed on STDERR when there is more than one result.
sub xml_query($$)
{
    my ($xml, $query) = @_;
    my $nodes = $xml->find( $query );

    if ($nodes)
    {
        my $count = @{$nodes};
        printf STDERR "multiple entries for %s\n", $query if $count > 1;
        return $nodes->[0]->textContent;
    }
    return undef;
}
4226 # query an xml key for a locale, with fallback to the parents
# Run the query $query for the locale entry $loc, walking up the chain given
# by locale_parent() until a locale yields a result.  Locales that are not in
# %lcnames, queries without a result, and results consisting of the three
# upward arrows marker are skipped.  The root locale (empty name) is replaced
# by "en-US".  Returns the text of the first usable result, or undef.
sub loc_query($$)
{
    my ($loc, $query) = @_;

    $loc = $lcnames{"en-US"} unless $loc->{name};  # fallback to "en-US" for root locale

    my $cur = $loc->{name};
    while (defined $cur)
    {
        next unless defined $lcnames{$cur};
        my $nodes = $lcnames{$cur}->{xml}->find( $query );
        next unless $nodes;
        printf STDERR "%s: multiple entries for %s\n", $cur, $query if (@{$nodes} > 1);
        my $text = $nodes->[0]->textContent;
        next if $text eq "\x{2191}\x{2191}\x{2191}";  # "↑↑↑"
        return $text;
    }
    continue
    {
        # also reached through "next": move on to the parent locale
        $cur = locale_parent( $cur );
    }
    return undef;
}
4246 # retrieve a locale field entry by going up the parents tree
# Return the value of $field for the locale entry $loc, or $def if it cannot
# be found.  The entry itself is checked first, then "en-US" if the entry is
# the root locale (empty name), then the chain of aliases, and finally the
# parents, found through "sparent" entries or by stripping the last "-xxx"
# component of the name.
sub locale_entry($$$)
{
    my ($loc, $field, $def) = @_;

    return $loc->{$field} if defined $loc->{$field};

    if (!$loc->{name})  # fallback to "en-US" for root locale
    {
        $loc = $lcnames{"en-US"};
        return $loc->{$field} if defined $loc->{$field};
    }
    while (defined $loc->{alias})  # resolve aliases
    {
        $loc = $lcnames{$loc->{alias}};
        return $loc->{$field} if defined $loc->{$field};
    }

    # go up the parents tree
    my $cur = $loc->{name};
    while ($cur)
    {
        my $entry = $lcnames{$cur};
        if (defined $entry && defined $entry->{sparent})
        {
            $cur = $entry->{sparent};
        }
        else
        {
            return $def unless $cur =~ /(.*)-[0-9A-Za-z]+/;
            $cur = $1;
        }
        my $parent = $lcnames{$cur};
        return $parent->{$field} if defined $parent && defined $parent->{$field};
    }
    return $def;
}
# Buffer of packed string data; initialized in build_locale_data() and
# extended by add_str_data(), which returns offsets into it divided by two.
4283 my $string_data;
# Add the packed data $txt to $string_data, reusing an existing identical
# byte sequence when there is one, and return its offset in 16-bit units.
sub add_str_data($)
{
    my $txt = shift;
    my $ret = index( $string_data, $txt );

    # the returned offset counts 16-bit units and the visible callers only
    # append whole units, so a match at an odd byte offset is a false
    # positive; keep searching from the next byte
    while ($ret != -1 && ($ret & 1))
    {
        $ret = index( $string_data, $txt, $ret + 1 );
    }
    if ($ret == -1)
    {
        $ret = length($string_data);
        $string_data .= $txt;
    }
    return $ret / 2;
}
# Add a string to the string data and return its offset; undefined or empty
# strings yield offset 0.  The string is stored as UTF-16LE, preceded by its
# length in 16-bit units and followed by a terminating zero word.
sub add_string($)
{
    my $str = shift;
    return 0 if !defined($str) || $str eq "";

    my $utf = encode( "UTF16LE", $str );
    my $length_word = pack "S<", length($utf) / 2;
    my $terminator = pack "S", 0;
    return add_str_data( $length_word . $utf . $terminator );
}
# Add a list of 32-bit values to the string data, preceded by a 16-bit word
# holding twice the number of values, and return its offset.
sub add_fontsig(@)
{
    my $word_count = scalar(@_) * 2;
    my $packed = pack "S<L<*", $word_count, @_;
    return add_str_data( $packed );
}
# Add an array of strings to the string data and return its offset (0 for an
# empty list).  Each string is added with add_string(); the array is stored
# as a 16-bit count followed by the 32-bit offsets of the strings.
sub add_strarray(@)
{
    return 0 unless @_;

    my @offsets;
    foreach my $str (@_)
    {
        push @offsets, add_string( $str );
    }
    return add_str_data( pack "S<L<*", scalar(@_), @offsets );
}
# Convert a number format pattern to a grouping string, one character per
# group size: "#,##,##0" style patterns give the size of the last group
# followed by that of the one before it, "#,##0" style patterns a single
# size, and anything else the default size of 3.
sub format_to_grouping($)
{
    my $format = shift;
    my @sizes = (3);

    if (my ($secondary, $primary) = $format =~ /#,(#+),(#+0)/)
    {
        @sizes = ( length($primary), length($secondary) );
    }
    elsif (my ($group) = $format =~ /#,(#+0)/)
    {
        @sizes = ( length($group) );
    }
    # unknown formats silently fall back to the default
    return join "", map { chr($_) } @sizes;
}
# Determine the positive and negative currency format indexes for a currency
# pattern of the form "positive[;negative]".
#   $name   - locale name (only of use for diagnostics)
#   $format - the pattern; \xa4 stands for the currency symbol and \xa0 for a
#             non-breaking space
# Returns the pair (positive index, negative index).  Each index is that of
# the first matching pattern in the corresponding list, or 0 if none matches;
# without a negative part, the negative index is derived from the positive one.
sub parse_currency_format($$)
{
    my ($name, $format) = @_;
    my ($posfmt, $negfmt) = split /;/, $format;
    my @pospatterns = ( "\xa4[^\xa0]*#", # $1.1
                        "00[^\xa0]*\xa4", # 1.1$
                        "\xa4.*\xa0.*#", # $ 1.1
                        "00.*\xa0.*\xa4" ); # 1.1 $
    my @negpatterns = ( "\\(\xa4[^\xa0]*#", # ($1.1)
                        "-\xa4[^\xa0]*#", # -$1.1
                        "\xa4[^\xa0]*-#", # $-1.1
                        "\xa4[^\xa0]*#.*00-", # $1.1-
                        "00[^\xa0]*\xa4\\)", # (1.1$)
                        "-#.*00[^\xa0]*\xa4", # -1.1$
                        "00-[^\xa0]*\xa4", # 1.1-$
                        "00[^\xa0]*\xa4-", # 1.1$-
                        "-#.*00.*\xa0.*\xa4", # -1.1 $
                        "-\xa4.*\xa0.*#", # -$ 1.1
                        "00.*\xa0.*\xa4-", # 1.1 $-
                        "\xa4.*\xa0.*#.*00-", # $ 1.1-
                        "\xa4.*\xa0.*-#", # $ -1.1
                        "00-.*\xa0.*\xa4", # 1.1- $
                        "\\(\xa4.*\xa0.*#", # ($ 1.1)
                        "00.*\xa0.*\xa4\\)"); # (1.1 $)
    # negative format that goes with each positive format by default
    my @default_neg = ( 1, 5, 9, 8 );

    # index of the first pattern matching the text, 0 for unknown formats
    my $first_match = sub
    {
        my ($text, @patterns) = @_;
        foreach my $idx (0 .. $#patterns)
        {
            return $idx if $text =~ /$patterns[$idx]/;
        }
        return 0;
    };

    my $pos = $first_match->( $posfmt, @pospatterns );
    my $neg = defined $negfmt ? $first_match->( $negfmt, @negpatterns ) : $default_neg[$pos];

    return ($pos, $neg);
}
# Determine the percent format indexes for a percent pattern.  Returns a pair
# whose first element is the index of the first matching pattern (the number
# of patterns if none matches, in which case a message is printed on STDERR)
# and whose second element is the same index with 3 replaced by 7.
sub parse_percent_format($)
{
    my $fmt = shift;
    my @patterns = ( "0.+%", # 1 %
                     "0%", # 1%
                     "%#", # %1
                     "%.+#" ); # % 1

    my $pos = 0;
    foreach my $pattern (@patterns)
    {
        last if $fmt =~ /$pattern/;
        $pos++;
    }
    printf STDERR "unknown format '%s'\n", $fmt if ($pos == @patterns);

    my $second = $pos;
    $second = 7 if $pos == 3;
    return ($pos, $second);
}
# Convert a date pattern: the first occurrence of each of the era (G),
# stand-alone month (LLLL, LLL) and weekday (E, ccc) fields is replaced, and
# a lone "y" is expanded to "yyyy".  Returns the converted pattern.
sub convert_date_format($)
{
    my $fmt = shift;

    # field renames, applied in order to the first match only
    my @renames = ( [ qr/G+/, "gg" ],
                    [ qr/LLLL/, "MMMM" ],
                    [ qr/LLL/, "MMM" ],
                    [ qr/E+/, "dddd" ],
                    [ qr/ccc+/, "dddd" ] );
    foreach my $rename (@renames)
    {
        my ($pattern, $replacement) = @{$rename};
        $fmt =~ s/$pattern/$replacement/;
    }

    # single "y" in the middle, at the start and at the end of the pattern
    $fmt =~ s{([^gy])y([^y])}{$1yyyy$2};
    $fmt =~ s{^y([^y])}{yyyy$1};
    $fmt =~ s{([^gy])y$}{$1yyyy};
    return $fmt;
}
# Convert a time pattern: the first run of "a" and the first run of "B" are
# each replaced by "tt".  Returns the converted pattern.
sub convert_time_format($)
{
    my $fmt = shift;
    foreach my $field (qr/a+/, qr/B+/)
    {
        $fmt =~ s/$field/tt/;
    }
    return $fmt;
}
# Load the ISO 639-3 code table from the $ISO639 download and return a hash
# mapping the two-letter code found in the fourth column of each matching
# line to the three-letter code found in the third column.
sub load_iso639()
{
    my %iso639;
    my $DATA = open_data_file( $ISO639, "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3.tab" );
    while (<$DATA>)
    {
        # only lines whose first four columns are all filled in are of interest
        next unless /^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/;
        $iso639{$2} = $1;
    }
    close $DATA;
    return %iso639;
}
4426 ################################################################
4427 # build the locale table for locale.nls
4428 sub build_locale_data()
4430 my $base = "cldr-release-$CLDRVERSION";
4431 my $suppl = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/supplementalData.xml" );
4432 my $subtags = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/likelySubtags.xml" );
4433 my $numbers = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/numberingSystems.xml" );
4434 # obsolete phone data from CLDR version 33
4435 my $phone = load_xml_data_file( $CLDR33DATA, "common/supplemental/telephoneCodeData.xml" );
4436 my %iso639 = load_iso639();
4437 $string_data = pack "S2", 0, 0; # offset 0 == empty string
4439 %lcnames = map { $_->{name} => $_ } @locales;
4441 my %lcids;
4442 foreach my $loc (@locales) { $lcids{$loc->{lcid}} = $loc if defined $loc->{lcid}; }
4444 my %days = ( "sun" => 0, "mon" => 1, "tue" => 2, "wed" => 3, "thu" => 4, "fri" => 5, "sat" => 6 );
4446 # assign locale parents
4448 foreach my $loc (@locales)
4450 next if $loc->{name} eq "";
4451 next if defined $loc->{parent};
4452 (my $unix_name = $loc->{name}) =~ s/-/_/g;
4453 my $parent = xml_query( $suppl, "/supplementalData/parentLocales/parentLocale[contains(concat(' ',\@locales,' '),' $unix_name ')]/\@parent" );
4454 if ($parent)
4456 $parent =~ s/_/-/g;
4457 $parent = "" if $parent eq "root";
4459 elsif ($loc->{name} =~ /(.*)-[0-9A-Za-z]+/) { $parent = $1; }
4460 $loc->{parent} = $parent || "";
4463 # load per-locale XML files
4465 foreach my $loc (@locales)
4467 next if defined $loc->{alias};
4468 (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
4469 $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
4470 my $xml = load_xml_data_file( $CLDRDATA, $file );
4471 $loc->{xml} = $xml;
4472 $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
4473 $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
4474 $loc->{script} = xml_query( $xml, "/ldml/identity/script/\@type" );
4475 if (!defined($loc->{territory}) && $loc->{name} =~ /-([A-Z]{2}|[0-9]{3})$/) { $loc->{territory} = $1; }
4476 if (!defined($loc->{script}) && $loc->{name} =~ /-([A-Z][a-z]{3})(-[A-Z]{2})?$/) { $loc->{script} = $1; }
4479 # assign a default territory and sort locale
4481 foreach my $loc (@locales)
4483 next if defined $loc->{alias};
4484 next if defined $loc->{territory};
4485 my $id = $loc->{sortlocale};
4486 if (defined $id && ($id =~ /[-_]([A-Z0-9]+)$/))
4488 $loc->{territory} = $1;
4489 next;
4491 my @children = grep /^$loc->{name}-[A-Z0-9]+$/ && !defined $lcnames{$_}->{alias}, keys %lcnames;
4492 if (@children == 1)
4494 $id = $children[0];
4496 else
4498 my $name = $loc->{file} || $loc->{name};
4499 $name =~ s/-(Arab|Cyrl|Deva|Guru|Hans|Hant|Latn|Tfng|Vaii)$//;
4500 $name =~ s/-/_/g;
4501 $id = xml_query( $subtags, "/supplementalData/likelySubtags/likelySubtag[\@from='$name']/\@to" );
4502 $id =~ s/_/-/g if $id;
4504 if ($id =~ /[-_]([A-Z0-9]+)$/)
4506 $loc->{territory} = $1;
4507 next if defined $loc->{sortlocale};
4508 next unless $id =~ /^$loc->{name}/;
4509 while (defined $lcnames{$id} && defined $lcnames{$id}->{alias}) { $id = $lcnames{$id}->{alias}; }
4510 $loc->{sortlocale} = $id if defined $lcnames{$id};
4511 next;
4513 print STDERR "no territory found for $loc->{name}\n";
4516 # fill geoid table
4518 my %geotable;
4519 foreach my $geo (@geoids)
4521 my $name = $geo->{name};
4522 next unless defined $name;
4523 $geo->{alias} = $geotable{$name} if defined $geotable{$name};
4524 $geotable{$name} ||= $geo;
4526 foreach my $loc (@locales)
4528 next if defined $loc->{alias};
4529 my $territory = $loc->{territory};
4530 $geotable{$territory} ||= { name => $territory };
4532 foreach my $name (keys %geotable)
4534 my $geo = $geotable{$name};
4535 $geo->{dialcode} = xml_query( $phone, "(/supplementalData/telephoneCodeData/codesByTerritory[\@territory='$name']/telephoneCountryCode)[1]/\@code" );
4536 if ($name =~ /\d+/)
4538 $geo->{uncode} = $name;
4539 next;
4541 $geo->{iso2} = $name;
4542 $geo->{iso3} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@alpha3");
4543 $geo->{uncode} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@numeric");
4544 $geo->{sintlsymbol} ||= xml_query( $suppl, "(/supplementalData/currencyData/region[\@iso3166='$name']/currency[not(\@to)])[1]/\@iso4217") || "XXX";
4545 $geo->{sintlsymbol} =~ s/XXX/XDR/;
4547 foreach my $geo (@geoids)
4549 $geo->{parentid} = $geotable{$geo->{parent}}->{id} if defined $geo->{parent};
4550 next if defined $geo->{iso2};
4551 next if defined $geo->{alias};
4552 next unless defined $geo->{uncode};
4553 my @contains;
4554 my $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and not(\@status)]/\@contains");
4555 push @contains, split /\s+/, $list if defined $list;
4556 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and \@status='deprecated']/\@contains");
4557 push @contains, split /\s+/, $list if defined $list;
4558 while (@contains)
4560 my $territory = pop @contains;
4561 if (defined $geotable{$territory})
4563 $geotable{$territory}->{parentid} ||= $geo->{id};
4565 elsif ($territory =~ /\d+/)
4567 # expand region recursively
4568 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$territory' and not(\@status)]/\@contains" );
4569 push @contains, split /\s+/, $list if defined $list;
4574 # assign calendars to their locale
4576 foreach my $cal (@calendars)
4578 next unless defined $cal->{locale};
4579 my $loc = $lcnames{$cal->{locale}};
4580 $loc->{calendar} = [ ] unless defined $loc->{calendar};
4581 push @{$loc->{calendar}}, $cal;
4584 # assign default lcid to aliases
4586 foreach my $loc (@locales)
4588 next unless defined $loc->{alias};
4589 next if defined $loc->{lcid};
4590 my $alias = $loc->{alias};
4591 my $lcid = $lcnames{$alias}->{lcid} || 0x1000;
4592 $loc->{lcid} = $lcid | 0x80000000;
4595 # assign sort aliases to parent locale
4597 foreach my $loc (@locales)
4599 next unless $loc->{name} =~ /_/;
4600 next unless defined $loc->{alias};
4601 my $alias = $loc->{alias};
4602 my $parent = $lcnames{$alias};
4603 my $basename = $parent->{name};
4604 while (1)
4606 @{$parent->{sortnames}}[($loc->{lcid} >> 16) - 1] = $loc->{name};
4607 $alias = locale_parent( $alias );
4608 last unless $alias && defined $lcnames{$alias};
4609 $parent = $lcnames{$alias};
4610 last if defined $parent->{sortbase} && $parent->{sortbase} ne $basename;
4611 $parent->{sortbase} = $basename;
4615 # assign an array index to all locales
4617 my $idx = 0;
4618 foreach my $loc (@locales)
4620 next if defined $loc->{alias};
4621 $loc->{idx} = $idx++;
4623 foreach my $loc (@locales)
4625 my $alias = $loc->{alias};
4626 next unless defined $alias;
4627 while (defined $lcnames{$alias}->{alias}) { $alias = $lcnames{$alias}->{alias}; }
4628 $loc->{idx} = $lcnames{$alias}->{idx};
4631 # output lcids table
4633 my $lcid_data = "";
4634 foreach my $id (sort { $a <=> $b } keys %lcids)
4636 my $loc = $lcids{$id};
4637 $lcid_data .= pack "L<S<2", $id, $loc->{idx}, add_string($loc->{name});
4640 # output lcnames table
4642 my $lcname_data = "";
4643 foreach my $name (sort compare_locales keys %lcnames)
4645 my $loc = $lcnames{$name};
4646 $lcname_data .= pack "S<2L<", add_string($name), $loc->{idx}, $loc->{lcid} || 0x1000;
4649 # output locales array
4651 my $locale_data = "";
4652 my $default_lcid = 0x8001;
4653 foreach my $loc (@locales)
4655 next if defined $loc->{alias};
4656 my $sname = $loc->{name};
4657 my $language = $loc->{language};
4658 my $territory = $loc->{territory};
4659 my $script = $loc->{script};
4660 my $neutral = ($sname && $sname !~ /-$territory/);
4661 my $sparent = $loc->{sparent} || (($sname =~ /(.*)-[0-9A-Za-z]+/) ? $1 : $loc->{parent});
4662 my $unique_lcid = $loc->{lcid};
4663 unless (defined $unique_lcid) { $unique_lcid = $default_lcid++; }
4664 my $geo = $geotable{$territory};
4665 my $territory_match = "contains(concat(' ',normalize-space(\@territories),' '),' $territory ')";
4667 # languages and scripts
4669 my $ssortlocale = $loc->{sortlocale} || ($neutral ? "$sname-$territory" : $sname);
4670 my $idefaultlanguage = defined $lcnames{$ssortlocale} ? $lcnames{$ssortlocale}->{lcid} : undef;
4671 $idefaultlanguage = $lcnames{"en-US"}->{lcid} unless $ssortlocale;
4672 (my $siso639langname = $sname) =~ s/-.*$//;
4673 my $siso639langname2 = $iso639{$siso639langname} || $siso639langname;
4674 my $sopentypelang = sprintf "%-4s", locale_entry( $loc, "sopentypelang", uc $siso639langname2 );
4675 my $sabbrevlangname = defined $loc->{lcid} ? locale_entry( $loc, "sabbrevlangname", uc $siso639langname2 ) : "ZZZ";
4676 my $siso3166ctryname2 = $geo->{iso3} || $geo->{uncode};
4677 my $senglanguage = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" ) || "";
4678 my $sengcountry = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" ) || "";
4679 my $snativelangname = loc_query( $loc, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" );
4680 my $snativectryname = loc_query( $loc, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" );
4681 $sengcountry =~ s/South Korea/Korea/;
4682 $snativelangname ||= $senglanguage;
4683 $snativectryname ||= $sengcountry;
4684 if ($script)
4686 my $engscript = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4687 my $nativescript = loc_query( $loc, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4688 $senglanguage .= " ($engscript)" if $engscript;
4689 $snativelangname .= " ($nativescript)" if $nativescript;
4691 my $sengdisplayname = $neutral ? $senglanguage : "$senglanguage ($sengcountry)";
4692 my $snativedisplayname = $neutral ? $snativelangname : "$snativelangname ($snativectryname)";
4693 $sengdisplayname =~ s/\) \(/, /;
4694 $snativedisplayname =~ s/\) \(/, /;
4695 my $sscripts = locale_entry( $loc, "sscripts", $script ) || xml_query( $suppl, "/supplementalData/languageData/language[\@type='$language' and not(\@alt)]/\@scripts" );
4696 $sscripts = (join ";", (sort split / /, ($sscripts || "Latn"))) . ";";
4697 my $ireadinglayout = locale_entry( $loc, "ireadinglayout", 0 );
4698 my $charlayout = loc_query( $loc, "/ldml/layout/orientation/characterOrder" );
4699 if ($charlayout eq "right-to-left")
4701 $ireadinglayout = 1;
4703 elsif ($charlayout eq "top-to-bottom")
4705 my $linelayout = loc_query( $loc, "/ldml/layout/orientation/lineOrder" );
4706 $ireadinglayout = $linelayout eq "right-to-left" ? 2 : 3;
4708 my $igeoid = $geo->{id} || 0;
4710 # numbers
4712 my $sdecimal = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/decimal" );
4713 my $slist = locale_entry( $loc, "slist", ";" );
4714 my $smondecimalsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyDecimal" ) || $sdecimal;
4715 my $sthousand = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/group" );
4716 $sthousand =~ s/\x{202f}/\x{00a0}/;
4717 my $smonthousandsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyGroup" ) || $sthousand;
4718 my $spositivesign = "";
4719 my $snegativesign = "-";
4720 my $spercent = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/percentSign" );
4721 my $snan = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/nan" );
4722 my $sposinfinity = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/infinity" );
4723 my $sneginfinity = $sposinfinity ? "-$sposinfinity" : "";
4724 my $sgrouping = format_to_grouping( loc_query( $loc, "/ldml/numbers/decimalFormats[\@numberSystem='latn']/decimalFormatLength[not(\@type)]/decimalFormat/pattern" ));
4725 my $percentformat = loc_query( $loc, "/ldml/numbers/percentFormats[\@numberSystem='latn']/percentFormatLength[not(\@type)]/percentFormat/pattern" );
4726 my $currencyformat = loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='accounting']/pattern" ) ||
4727 loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='standard']/pattern" );
4728 my $smongrouping = format_to_grouping( $currencyformat );
4729 my ($icurrency, $inegcurr) = parse_currency_format( $sname, $currencyformat );
4730 my ($ipospercent, $inegpercent) = parse_percent_format( $percentformat );
4731 my $native_numbering = loc_query( $loc, "/ldml/numbers/otherNumberingSystems/native" );
4732 my @snativedigits = split //, xml_query( $numbers, "/supplementalData/numberingSystems/numberingSystem[\@id='$native_numbering']/\@digits" );
4733 my $digitsubstitution = !(ord($snativedigits[0]) >= 0x600 && ord($snativedigits[0]) <= 0x6ff);
4734 my $measure = defined xml_query( $suppl, "/supplementalData/measurementData/measurementSystem[\@type='US' and $territory_match]" );
4735 my $papersize = defined xml_query( $suppl, "/supplementalData/measurementData/paperSize[\@type='US-Letter' and $territory_match]" );
4737 # currencies
4739 my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
4740 my $scurrency = $geo->{scurrency} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[\@alt='narrow']" );
4741 $scurrency ||= loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[not(\@alt)]" );
4742 $geo->{scurrency} = $scurrency if $scurrency;
4743 $scurrency ||= $sintlsymbol;
4744 my $sengcurrname = $loc->{sengcurrname} || loc_query( $lcnames{en}, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" );
4745 my $snativecurrname = $loc->{sengcurrname} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" ) || $sengcurrname;
4746 my $icurrdigits = xml_query( $suppl, "/supplementalData/currencyData/fractions/info[\@iso4217='$sintlsymbol']/\@digits" );
4747 $icurrdigits = 2 unless defined $icurrdigits;
4749 # calendars
4751 my $firstday = xml_query( $suppl, "/supplementalData/weekData/firstDay[not(\@alt) and $territory_match]/\@day" );
4752 my $ifirstdayofweek = $firstday ? $days{$firstday} : 1;
4753 my $firstweekofyear = (xml_query( $suppl, "/supplementalData/weekData/minDays[$territory_match]/\@count" ) || 0) == 4 ? 2 : 0;
4754 my $serastring = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/eras/eraAbbr/era[\@type='1' and not(\@alt)]" );
4755 my (@sdayname, @sabbrevdayname, @sshortestdayname);
4756 foreach my $d (sort { $days{$a} <=> $days{$b} } keys %days)
4758 my $n = $days{$d};
4759 my %name;
4760 foreach my $type (qw(wide abbreviated short))
4762 $name{$type} = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/days/dayContext[\@type='format']/dayWidth[\@type='$type']/day[\@type='$d' and not(\@alt)]" );
4764 push @sdayname, $name{wide};
4765 push @sabbrevdayname, $name{abbreviated} || $name{wide};
4766 push @sshortestdayname, $name{short} || $name{abbreviated} || $name{wide};
4768 my (@smonthname, @sabbrevmonthname, @sgenitivemonth, @sabbrevgenitivemonth);
4769 foreach my $n (1..13)
4771 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='wide']/month[\@type='$n']" );
4772 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4773 my $genitive = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n']" );
4774 my $abbrevgen = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4775 push @smonthname, $name || $genitive || "";
4776 push @sabbrevmonthname, $abbrev || $abbrevgen || $name || $genitive || "";
4777 push @sgenitivemonth, $genitive || "";
4778 push @sabbrevgenitivemonth, $abbrevgen || $genitive || "";
4780 @sgenitivemonth = () if join("|",@smonthname) eq join("|",@sgenitivemonth);
4781 @sabbrevgenitivemonth = () if join("|",@sabbrevmonthname) eq join("|",@sabbrevgenitivemonth);
4782 my %caltypes = ( "gregorian" => 1, "japanese" => 3, "chinese" => 4, "dangi" => 5, "islamic" => 6, "buddhist" => 7, "hebrew" => 8,
4783 "persian" => 22, "islamic-civil" => 23, "islamic-umalqura" => 23 );
4784 my $calpref = xml_query( $suppl, "/supplementalData/calendarPreferenceData/calendarPreference[$territory_match]/\@ordering" ) || "gregorian";
4785 my $icalendartype;
4786 my @scalnames;
4787 foreach my $c (split /\s+/, $calpref)
4789 next unless defined $caltypes{$c};
4790 $icalendartype .= chr($caltypes{$c});
4791 $scalnames[$caltypes{$c} - 1] = loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$c']" );
4794 # date/time formats
4796 my $s1159 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='am' and not(\@alt)]" );
4797 my $s2359 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='pm' and not (\@alt)]" );
4798 my $sshortestam = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='am' and not(\@alt)]" );
4799 my $sshortestpm = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='pm' and not (\@alt)]" );
4800 my @stimeformat = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='medium']/timeFormat/pattern[not(\@alt)]" ));
4801 push @stimeformat, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4802 pop @stimeformat if $stimeformat[0] eq $stimeformat[1];
4803 @stimeformat = map convert_time_format($_), @stimeformat;
4804 my @sshorttime = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='short']/timeFormat/pattern[not(\@alt)]" ));
4805 push @sshorttime, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hm' and not(\@alt)]" );
4806 pop @sshorttime if $sshorttime[0] eq $sshorttime[1];
4807 @sshorttime = map convert_time_format($_), @sshorttime;
4808 my @sshortdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4809 push @sshortdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
4810 @sshortdate = map convert_date_format($_), @sshortdate;
4811 my @slongdate = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" ));
4812 push @slongdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
4813 @slongdate = map convert_date_format($_), @slongdate;
4814 my @smonthday = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMd' and not(\@alt)]" ));
4815 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Md' and not(\@alt)]" );
4816 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMd' and not(\@alt)]" );
4817 @smonthday = map convert_date_format($_), @smonthday;
4818 my @syearmonth = map convert_date_format($_), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMM' and not(\@alt)]" );
4819 my @sduration = map convert_time_format( lc $_ ), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4820 my $srelativelongdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMEd' and not(\@alt)]" ) ||
4821 loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMEd' and not(\@alt)]" );
4822 $srelativelongdate = convert_date_format( $srelativelongdate );
4824 if (defined $loc->{calendar})
4826 foreach my $cal (@{$loc->{calendar}})
4828 $cal->{sshortdate} = \@sshortdate;
4829 $cal->{syearmonth} = \@syearmonth;
4830 $cal->{slongdate} = \@slongdate;
4831 $cal->{serastring} = [ $serastring ];
4832 $cal->{sdayname} = \@sdayname;
4833 $cal->{sabbrevdayname} = \@sabbrevdayname;
4834 $cal->{smonthname} = \@smonthname;
4835 $cal->{sabbrevmonthname} = \@sabbrevmonthname;
4836 $cal->{scalname} = $scalnames[$cal->{id}];
4837 $cal->{smonthday} = \@smonthday;
4838 $cal->{sshortestdayname} = \@sshortestdayname;
4839 $cal->{sabbreverastring} = [ $serastring ];
4840 $cal->{sshortestdayname} = \@sshortestdayname;
4841 $cal->{srelativelongdate} = $srelativelongdate;
4845 # codepages
4847 my %ansicpmap = ( 437 => 1252, 720 => 1256, 737 => 1253, 775 => 1257, 850 => 1252,
4848 852 => 1250, 855 => 1251, 866 => 1251, 857 => 1254, 862 => 1255 );
4849 my %maccpmap = ( 437 => 10000, 720 => 10004, 737 => 10006, 775 => 10029, 850 => 10000,
4850 852 => 10029, 855 => 10007, 857 => 10081, 862 => 10005, 866 => 10007,
4851 874 => 10021, 932 => 10001, 936 => 10008, 949 => 10003, 950 => 10002,
4852 1258 => 10000 );
4853 my %ebcdiccpmap = ( 437 => 37, 720 => 20420, 737 => 20273, 866 => 20880, 932 => 20290 );
4854 my %codepagemasks = ( 874 => [ 0x01000000, 0x00000000, 0x00000000, 0, 0x00010000, 0x00000000, 0x00010000, 0x00000000 ],
4855 932 => [ 0x00000000, 0x28c70000, 0x00000010, 0, 0x00020000, 0x00000000, 0x00020000, 0x00000000 ],
4856 936 => [ 0x00000000, 0x28010000, 0x00000002, 0, 0x00040000, 0x00000000, 0x00040000, 0x00000000 ],
4857 949 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00080000, 0x00000000, 0x00080000, 0x00000000 ],
4858 950 => [ 0x00000000, 0x28c10000, 0x00000012, 0, 0x00100000, 0x00000000, 0x00100000, 0x00000000 ],
4859 1258 => [ 0x2000000f, 0x00000000, 0x00000000, 0, 0x00000100, 0x00008000, 0x00000100, 0x00008000 ],
4860 866 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x00020000, 0x00000004, 0x02020000 ],
4861 862 => [ 0x00000800, 0x40000000, 0x00000000, 0, 0x00000020, 0x00200000, 0x00000020, 0x00200000 ],
4862 857 => [ 0x0000001f, 0x00000000, 0x00000000, 0, 0x00000010, 0x01000000, 0x00000010, 0x01000000 ],
4863 855 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x02000000, 0x00000004, 0x02000000 ],
4864 852 => [ 0x00000027, 0x00000000, 0x00000000, 0, 0x00000002, 0x04000000, 0x00000002, 0x04000000 ],
4865 775 => [ 0x00000007, 0x00000000, 0x00000000, 0, 0x00000080, 0x08000000, 0x00000080, 0x08000000 ],
4866 737 => [ 0x00000080, 0x00000000, 0x00000000, 0, 0x00000008, 0x10000000, 0x00000008, 0x10010000 ],
4867 720 => [ 0x00002000, 0x00000000, 0x00000000, 0, 0x00000040, 0x20000000, 0x00000040, 0x20080000 ],
4868 850 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x40000000, 0x0000019f, 0xdfd70000 ],
4869 437 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x80000000, 0x0000019f, 0xdfd70000 ],
4870 65001 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00000000, 0x00000000, 0x0000019f, 0xdfd70000 ] );
4871 my $oemcp = locale_entry( $loc, "oemcp", 65001 );
4872 my $maccp = locale_entry( $loc, "maccp", undef ) || $maccpmap{$oemcp} || 65001;
4873 my $ebcdiccp = locale_entry( $loc, "ebcdiccp", undef ) || $ebcdiccpmap{$oemcp} || 500;
4874 $ebcdiccp = 500 if (defined $loc->{oemcp} && $loc->{oemcp} == 65001) || (defined $loc->{maccp} && $loc->{maccp} == 65001);
4875 my $ansicp = $ansicpmap{$oemcp} || $oemcp;
4876 my @fontsig = (0) x 8;
4877 my $sig = locale_entry( $loc, "fontsig", [] );
4878 foreach my $i (0..7) { $fontsig[$i] |= $codepagemasks{$oemcp}->[$i]; }
4879 foreach my $i (0..$#{$sig}) { $fontsig[$i] |= $sig->[$i]; }
4880 $fontsig[3] |= 1 << 31;
4881 $fontsig[3] |= 1 << 27 if $ireadinglayout == 1;
4882 $fontsig[3] |= 1 << 28 if $ireadinglayout == 3;
4884 # special cases for invariant locale
4886 unless ($loc->{name})
4888 $siso639langname = "iv";
4889 $siso639langname2 = "ivl";
4890 $senglanguage = $snativelangname = "Invariant Language";
4891 $sengcountry = $snativectryname = "Invariant Country";
4892 $sengdisplayname = "Invariant Language (Invariant Country)";
4893 $snativedisplayname = "Invariant Language (Invariant Region)";
4894 $sengcurrname = $snativecurrname = "International Monetary Fund";
4895 $scurrency = "\x{00a4}";
4896 $ifirstdayofweek = 0;
4897 $igeoid = $geotable{"US"}->{id};
4898 @stimeformat = ("HH:mm:ss");
4899 @sshortdate = ("MM/dd/yyyy", "yyyy-MM-dd");
4900 @slongdate = ("dddd, dd MMMM yyyy");
4901 @syearmonth = ("yyyy MMMM");
4902 @smonthday = ("MMMM dd", "MMMM d", "M/d", "MMM d");
4903 @sshorttime = ("HH:mm", "hh:mm tt", "H:mm", "h:mm tt");
4904 $srelativelongdate = "dddd, MMMM dd";
4905 $sposinfinity = "Infinity";
4906 $sneginfinity = "-Infinity";
4907 $spositivesign = "+";
4908 $ipospercent = $inegpercent = 0;
4911 # output data
4913 $locale_data .= pack "L<2",
4914 add_string( $sname ), # name
4915 add_string( $sopentypelang ); # LOCALE_SOPENTYPELANGUAGETAG
4917 $locale_data .= pack "S<14",
4918 $loc->{lcid} || 0x1000, # LOCALE_ILANGUAGE
4919 $unique_lcid, # unique_lcid
4920 locale_entry( $loc, "idigits", 2 ), # LOCALE_IDIGITS
4921 locale_entry( $loc, "inegnumber", 1 ), # LOCALE_INEGNUMBER
4922 $icurrdigits, # LOCALE_ICURRDIGITS
4923 $icurrency, # LOCALE_ICURRENCY
4924 $inegcurr, # LOCALE_INEGCURR
4925 locale_entry( $loc, "ilzero", 1 ), # LOCALE_ILZERO
4926 !$neutral, # LOCALE_INEUTRAL
4927 $ifirstdayofweek, # LOCALE_IFIRSTDAYOFWEEK
4928 $firstweekofyear, # LOCALE_IFIRSTWEEKOFYEAR
4929 $geo->{dialcode} || 1 , # LOCALE_ICOUNTRY,
4930 $measure, # LOCALE_IMEASURE
4931 $digitsubstitution; # LOCALE_IDIGITSUBSTITUTION
4933 $locale_data .= pack "L<18",
4934 add_string( $sgrouping ), # LOCALE_SGROUPING
4935 add_string( $smongrouping ), # LOCALE_SMONGROUPING
4936 add_string( $slist ), # LOCALE_SLIST
4937 add_string( $sdecimal ), # LOCALE_SDECIMAL
4938 add_string( $sthousand ), # LOCALE_STHOUSAND
4939 add_string( $scurrency ), # LOCALE_SCURRENCY
4940 add_string( $smondecimalsep ), # LOCALE_SMONDECIMALSEP
4941 add_string( $smonthousandsep ), # LOCALE_SMONTHOUSANDSEP
4942 add_string( $spositivesign ), # LOCALE_SPOSITIVESIGN
4943 add_string( $snegativesign ), # LOCALE_SNEGATIVESIGN
4944 add_string( $s1159 ), # LOCALE_S1159
4945 add_string( $s2359 ), # LOCALE_S2359
4946 add_strarray( @snativedigits ), # LOCALE_SNATIVEDIGITS
4947 add_strarray( @stimeformat ), # LOCALE_STIMEFORMAT
4948 add_strarray( @sshortdate ), # LOCALE_SSHORTDATE
4949 add_strarray( @slongdate ), # LOCALE_SLONGDATE
4950 add_strarray( @syearmonth ), # LOCALE_SYEARMONTH
4951 add_strarray( @sduration ); # LOCALE_SDURATION
4953 $locale_data .= pack "S<8",
4954 $idefaultlanguage || 0x1000, # LOCALE_IDEFAULTLANGUAGE
4955 $ansicp, # LOCALE_IDEFAULTANSICODEPAGE
4956 $oemcp, # LOCALE_IDEFAULTCODEPAGE
4957 $maccp, # LOCALE_IDEFAULTMACCODEPAGE
4958 $ebcdiccp, # LOCALE_IDEFAULTEBCDICCODEPAGE
4959 $igeoid < 65536 ? $igeoid : 39070, # old_geoid
4960 $papersize ? 1 : 9, # LOCALE_IPAPERSIZE
4961 0; # FIXME # islamic_cal
4963 $locale_data .= pack "L<24",
4964 add_string( $icalendartype ), # LOCALE_ICALENDARTYPE
4965 add_string( $sabbrevlangname ), # LOCALE_SABBREVLANGNAME
4966 add_string( $siso639langname ), # LOCALE_SISO639LANGNAME
4967 add_string( $senglanguage ), # LOCALE_SENGLANGUAGE
4968 add_string( $snativelangname ), # LOCALE_SNATIVELANGNAME
4969 add_string( $sengcountry ), # LOCALE_SENGCOUNTRY
4970 add_string( $snativectryname ), # LOCALE_SNATIVECTRYNAME
4971 add_string( $siso3166ctryname2 ), # LOCALE_SABBREVCTRYNAME
4972 add_string( $territory ), # LOCALE_SISO3166CTRYNAME
4973 add_string( $sintlsymbol ), # LOCALE_SINTLSYMBOL
4974 add_string( $sengcurrname ), # LOCALE_SENGCURRNAME
4975 add_string( $snativecurrname ), # LOCALE_SNATIVECURRNAME
4976 add_fontsig( @fontsig ), # LOCALE_FONTSIGNATURE
4977 add_string( $siso639langname2 ), # LOCALE_SISO639LANGNAME2
4978 add_string( $siso3166ctryname2 ), # LOCALE_SISO3166CTRYNAME2
4979 add_string( $sparent ), # LOCALE_SPARENT
4980 add_strarray( @sdayname ), # LOCALE_SDAYNAME
4981 add_strarray( @sabbrevdayname ), # LOCALE_SABBREVDAYNAME
4982 add_strarray( @smonthname ), # LOCALE_SMONTHNAME
4983 add_strarray( @sabbrevmonthname ), # LOCALE_SABBREVMONTHNAME
4984 add_strarray( @sgenitivemonth ), # LOCALE_SGENITIVEMONTH
4985 add_strarray( @sabbrevgenitivemonth ), # LOCALE_SABBREVGENITIVEMONTH
4986 add_strarray( @scalnames ), # LOCALE_SCALNAMES
4987 add_strarray( @{$loc->{sortnames}} ); # LOCALE_SSORTNAMES
4989 $locale_data .= pack "S<6",
4990 $inegpercent, # LOCALE_INEGATIVEPERCENT
4991 $ipospercent, # LOCALE_IPOSITIVEPERCENT
4992 0, # unknown
4993 $ireadinglayout, # LOCALE_IREADINGLAYOUT
4994 0x2a, # unknown
4995 0x2a; # unknown
4997 $locale_data .= pack "L<24",
4998 0, # unknown
4999 add_string( $sengdisplayname ), # LOCALE_SENGLISHDISPLAYNAME
5000 add_string( $snativedisplayname ), # LOCALE_SNATIVEDISPLAYNAME
5001 add_string( $spercent ), # LOCALE_SPERCENT
5002 add_string( $snan ), # LOCALE_SNAN
5003 add_string( $sposinfinity ), # LOCALE_SPOSINFINITY
5004 add_string( $sneginfinity ), # LOCALE_SNEGINFINITY
5005 0, # unknown
5006 add_string( $serastring ), # CAL_SERASTRING
5007 add_string( $serastring ), # CAL_SABBREVERASTRING
5008 0, # unknown
5009 add_string( $ssortlocale ), # LOCALE_SCONSOLEFALLBACKNAME
5010 add_strarray( @sshorttime ), # LOCALE_SSHORTTIME
5011 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5012 0, # unknown
5013 add_string( $ssortlocale ), # LOCALE_SSORTLOCALE
5014 add_string( "0409:00000409" ), # FIXME # LOCALE_SKEYBOARDSTOINSTALL
5015 add_string( $sscripts ), # LOCALE_SSCRIPTS
5016 add_string( $srelativelongdate ), # LOCALE_SRELATIVELONGDATE
5017 $igeoid, # LOCALE_IGEOID
5018 add_string( $sshortestam || "a" ), # LOCALE_SSHORTESTAM
5019 add_string( $sshortestpm || "p" ), # LOCALE_SSHORTESTPM
5020 add_strarray( @smonthday ), # LOCALE_SMONTHDAY
5021 add_string( "k0-windows-us" ) # FIXME # keyboard_layout
5024 # output language groups
5026 my %groups;
5027 add_registry_key( "Locale", "00000409" );
5028 foreach my $loc (@locales)
5030 next unless defined $loc->{lcid};
5031 next if ($loc->{lcid} & 0x80000000);
5032 next if !defined($loc->{alias}) && $loc->{name} !~ /-$loc->{territory}/; # skip neutral locales
5033 my $group = locale_entry( $loc, "group", 1 );
5034 my $name = sprintf( "%08x", $loc->{lcid} );
5035 my $val = sprintf( "%x", $group );
5036 add_registry_value( "Locale", $name, $val ) unless ($loc->{lcid} & 0x000f0000);
5037 add_registry_value( "Locale\\Alternate Sorts", $name, $val ) if $loc->{name} =~ /_/;
5038 $groups{$val} = 1;
5040 foreach my $group (keys %groups) { add_registry_value( "Language Groups", $group, "1" ); }
5042 # output calendar data
5044 my $calendar_data = "";
5045 foreach my $cal (@calendars)
5047 my $scalname = $cal->{name};
5048 my $iyearoffsetrange = 0;
5049 my $itwodigityearmax = $cal->{itwodigityearmax};
5050 my @sshortdate;
5051 my @syearmonth;
5052 my @slongdate;
5053 my @serastring;
5054 my @sdayname;
5055 my @sabbrevdayname;
5056 my @smonthname;
5057 my @sabbrevmonthname;
5058 my @smonthday;
5059 my @sabbreverastring;
5060 my @sshortestdayname;
5062 my $type = $cal->{type};
5063 if (defined $cal->{locale} && defined $type)
5065 my $loc = $lcnames{$cal->{locale}};
5066 my $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
5067 push @sshortdate, $fmt if $fmt;
5068 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMd' and not(\@alt)]" );
5069 push @sshortdate, $fmt if $fmt;
5070 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
5071 push @sshortdate, $fmt if $fmt;
5072 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMMMd' and not(\@alt)]" );
5073 push @sshortdate, $fmt if $fmt;
5074 @sshortdate = map convert_date_format($_), @sshortdate;
5075 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" );
5076 push @slongdate, $fmt if $fmt;
5077 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
5078 push @slongdate, $fmt if $fmt;
5079 @slongdate = map convert_date_format($_), @slongdate;
5081 foreach my $n (1..13)
5083 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n' and not(\@yeartype)]" );
5084 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n' and not(\@yeartype)]" );
5085 push @smonthname, $name || "";
5086 push @sabbrevmonthname, $abbrev || $name || "";
5089 $scalname ||= loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$type']" );
5090 if (defined $cal->{eras})
5092 my @eras;
5093 my $idx = 1;
5094 foreach my $era (@{$cal->{eras}})
5096 my $start = xml_query( $suppl, "/supplementalData/calendarData/calendar[\@type='$type']/eras/era[\@type='$era']/\@start" );
5097 next unless $start =~ /^(-?\d+)-(\d+)-(\d+)/;
5098 my ($year, $mon, $day, $zero, $first) = ($1, $2, $3, $1 - 1, 1);
5099 if ($zero < 0)
5101 $first -= $zero;
5102 $year = 1;
5103 $itwodigityearmax = 2049 - $zero;
5105 unshift @eras, pack( "S<8", 6, $idx++, $year, $mon, $day, $zero, $first, 0 );
5106 push @serastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraAbbr/era[\@type='$era']" );
5107 push @sabbreverastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraNarrow/era[\@type='$era']" );
5109 $iyearoffsetrange = add_str_data( pack "S<L<*", scalar @eras, map { add_str_data($_); } @eras );
5113 @sshortdate = @{$cal->{sshortdate}} if defined $cal->{sshortdate} && !@sshortdate;
5114 @syearmonth = @{$cal->{syearmonth}} if defined $cal->{syearmonth};
5115 @slongdate = @{$cal->{slongdate}} if defined $cal->{slongdate} && !@slongdate;
5116 @serastring = @{$cal->{serastring}} if defined $cal->{serastring} && !@serastring;
5117 @sdayname = @{$cal->{sdayname}} if defined $cal->{sdayname};
5118 @sabbrevdayname = @{$cal->{sabbrevdayname}} if defined $cal->{sabbrevdayname};
5119 @smonthname = @{$cal->{smonthname}} if defined $cal->{smonthname} && !join("",@smonthname);
5120 @sabbrevmonthname = @{$cal->{sabbrevmonthname}} if defined $cal->{sabbrevmonthname} && !join("",@sabbrevmonthname);
5121 @smonthday = @{$cal->{smonthday}} if defined $cal->{smonthday};
5122 @sabbreverastring = @{$cal->{sabbreverastring}} if defined $cal->{sabbreverastring} && !@sabbreverastring;
5123 @sshortestdayname = @{$cal->{sshortestdayname}} if defined $cal->{sshortestdayname};
5124 my $srelativelongdate = $cal->{srelativelongdate};
5126 @serastring = ("A.D.") unless @serastring;
5127 @sabbreverastring = ("AD") unless @sabbreverastring;
5129 if ($cal->{id} != 1) # calendar 1 is a placeholder, information is fetched from locale instead
5131 @sshortdate = ("") unless @sshortdate;
5132 @syearmonth = ("") unless @syearmonth;
5133 @slongdate = ("") unless @slongdate;
5134 @sdayname = ("") x 7 unless @sdayname;
5135 @sabbrevdayname = ("") x 7 unless @sabbrevdayname;
5136 @sshortestdayname = ("") x 7 unless @sshortestdayname;
5137 @smonthname = ("") x 13 unless @smonthname;
5138 @sabbrevmonthname = ("") x 13 unless @sabbrevmonthname;
5139 @smonthday = ("") unless @smonthday;
5142 $calendar_data .= pack "S<2L<17",
5143 $cal->{id}, # CAL_ICALINTVALUE
5144 $itwodigityearmax || 99, # CAL_ITWODIGITYEARMAX
5145 add_strarray( @sshortdate ), # CAL_SSHORTDATE
5146 add_strarray( @syearmonth ), # CAL_SYEARMONTH
5147 add_strarray( @slongdate ), # CAL_SLONGDATE
5148 add_strarray( @serastring ), # CAL_SERASTRING
5149 $iyearoffsetrange, # CAL_IYEAROFFSETRANGE
5150 add_strarray( @sdayname ), # CAL_SDAYNAME
5151 add_strarray( @sabbrevdayname ), # CAL_SABBREVDAYNAME
5152 add_strarray( @smonthname ), # CAL_SMONTHNAME
5153 add_strarray( @sabbrevmonthname ), # CAL_SABBREVMONTHNAME
5154 add_string( $scalname ), # CAL_SCALNAME
5155 add_strarray( @smonthday ), # CAL_SMONTHDAY
5156 add_strarray( @sabbreverastring ), # CAL_SABBREVERASTRING
5157 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5158 add_string( $srelativelongdate ); # CAL_SRELATIVELONGDATE
5161 # output locale header
5163 my $nb_lcids = scalar keys %lcids;
5164 my $nb_locales = scalar grep { !defined $_->{alias} } @locales;
5165 my $nb_lcnames = scalar keys %lcnames;
5166 my $locale_size = length($locale_data) / $nb_locales;
5167 my $nb_calendars = scalar @calendars;
5168 my $calendar_size = length($calendar_data) / $nb_calendars;
5169 my $lcids_offset = 19 * 4; # size of header
5170 my $lcnames_offset = $lcids_offset + length $lcid_data;
5171 my $locales_offset = $lcnames_offset + length $lcname_data;
5172 my $calendar_offset = $locales_offset + length $locale_data;
5173 my $strings_offset = $calendar_offset + length $calendar_data;
5175 my $locale_header = pack "L<7S<4L<S<2L<3S<2L<4",
5176 8, # offset
5178 7, # version
5179 0x5344534e, # magic
5180 0, 0, 0,
5182 $nb_lcids,
5183 $nb_locales,
5184 $locale_size,
5185 $locales_offset,
5186 $nb_lcnames,
5188 $lcids_offset,
5189 $lcnames_offset,
5191 $nb_calendars,
5192 $calendar_size,
5193 $calendar_offset,
5194 $strings_offset,
5195 0, 0;
5197 return align_string( 4, $locale_header . $lcid_data . $lcname_data . $locale_data . $calendar_data . $string_data );
5201 ################################################################
5202 # build the charmaps table for locale.nls
# Returns the concatenation of the binary case tables for, in this order:
# MAP_FOLDDIGITS, the CJK compatibility map, LCMAP_HIRAGANA, LCMAP_KATAKANA,
# LCMAP_HALFWIDTH, LCMAP_FULLWIDTH, LCMAP_TRADITIONAL_CHINESE and
# LCMAP_SIMPLIFIED_CHINESE.
# The global halfwidth and fullwidth tables are patched in place before
# being dumped.
sub build_charmaps_data()
{
    # LCMAP_HIRAGANA/KATAKANA: the two tables are inverse of each other,
    # the mapped code points being 0x60 apart
    my @low_chars = (0x3041..0x3096, 0x309d..0x309e);
    my @high_chars = map { $_ + 0x60 } @low_chars;
    my (@to_hiragana, @to_katakana);
    @to_hiragana[@high_chars] = @low_chars;
    @to_katakana[@low_chars] = @high_chars;

    # LCMAP_HALFWIDTH/FULLWIDTH: extra entries stored into the global tables
    @halfwidth_table[0x2018, 0x2019] = (0x0027, 0x0027);
    @halfwidth_table[0x201c, 0x201d] = (0x0022, 0x0022);
    @halfwidth_table[0x309b, 0x309c] = (0xff9e, 0xff9f);
    @fullwidth_table[0x309b, 0x309c] = (0x3099, 0x309a);

    # the tables are dumped one after the other in exactly this order
    my @tables =
    (
        \@digitmap_table,             # MAP_FOLDDIGITS
        \@cjk_compat_table,           # CJK compatibility map
        \@to_hiragana,                # LCMAP_HIRAGANA
        \@to_katakana,                # LCMAP_KATAKANA
        \@halfwidth_table,            # LCMAP_HALFWIDTH
        \@fullwidth_table,            # LCMAP_FULLWIDTH
        \@chinese_traditional_table,  # LCMAP_TRADITIONAL_CHINESE
        \@chinese_simplified_table,   # LCMAP_SIMPLIFIED_CHINESE
    );

    my $result = "";
    foreach my $table (@tables)
    {
        $result .= dump_binary_case_table( @{$table} );
    }

    # FIXME: some more unknown tables here

    return $result;
}
5242 ################################################################
5243 # build the geoids table for locale.nls
# Returns a binary blob made of three parts:
#   - a header of 7 little-endian 32-bit words: two signature words (they
#     spell "geo" in UTF-16LE), the total size, the size of the header
#     (i.e. the offset of the entries), the number of entries, the offset
#     of the name index and the number of names in the index
#   - one record per element of @geoids
#   - the name index, sorted by name, mapping each name to a record number
# The global @geoids array is only read, never modified.
sub build_geoids_data()
{
    my $data = "";
    my %index;
    my $idx = 0;
    my @geo_header = (0x00650067, 0x0000006f, 0, 4 * 7, scalar @geoids, 0, 0);

    foreach my $entry (@geoids)
    {
        # An alias entry keeps its own id but takes every other field from
        # the entry it points to.  The target is stored in a separate
        # variable: the loop variable aliases the array element, so
        # assigning to it would overwrite the entry inside @geoids.
        my $id = $entry->{id};
        my $geo = defined $entry->{alias} ? $entry->{alias} : $entry;
        my $lat = "0.000";
        my $long = "0.000";
        my $iso2 = $geo->{iso2} || "XX";
        my $iso3 = $geo->{iso3} || "XX";
        # entries flagged as region, or having a UN code but no ISO2 code,
        # are written with type 14, all others with type 16
        my $isregion = $geo->{region} || (defined $geo->{uncode} && !defined $geo->{iso2});
        my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
        my $scurrency = $geo->{scurrency} || "\x{00a4}";

        $data .= pack( "L<", $id );
        $data .= pad_string( 24, encode( "UTF16LE", $lat ));
        $data .= pad_string( 24, encode( "UTF16LE", $long ));
        $data .= pack( "L<2", $isregion ? 14 : 16, $geo->{parentid} || 39070 );
        $data .= pad_string( 8, encode( "UTF16LE", $iso2 ));
        $data .= pad_string( 8, encode( "UTF16LE", $iso3 ));
        $data .= pack( "S<2", $geo->{uncode} || 0, $geo->{dialcode} || 0 );
        $data .= pad_string( 8, encode( "UTF16LE", $sintlsymbol ));
        $data .= pad_string( 16, encode( "UTF16LE", $scurrency ));
        $index{$geo->{name}} = $idx if $geo->{name};
        $idx++;
    }
    # "XX" shares the record of "001"
    $index{"XX"} = $index{"001"};

    # the name index starts right after the records
    $geo_header[5] = $geo_header[3] + length $data;
    $geo_header[6] = scalar keys %index;

    foreach my $name (sort keys %index)
    {
        $data .= pad_string( 8, encode( "UTF16LE", $name ));
        $data .= pack "L<", $index{$name};
    }

    # total size, header included
    $geo_header[2] = $geo_header[3] + length $data;
    return pack( "L<7", @geo_header ) . $data;
}
5291 ################################################################
5292 # build a binary locale table
# Parameters:
#   $filename   path of the file to generate
#   $chartypes  binary data stored first, right after the header
# The file starts with 8 little-endian 32-bit words: word 0 is the offset
# of the chartypes data, words 4 to 7 are the offsets of the locales,
# charmaps, geoids and scripts data, words 1 to 3 are left at 0.
# The data is written to "$filename.new", then handed over to save_file().
# Dies if the temporary file cannot be created or written.
sub dump_locales($$)
{
    my ($filename, $chartypes) = @_;

    # the file name is passed as an argument so that a '%' in it is not
    # interpreted as a format directive
    printf "Building %s\n", $filename;

    my $locale_data = build_locale_data();
    my $charmaps_data = build_charmaps_data();
    my $geoids_data = build_geoids_data();
    my $scripts_data = "";  # FIXME

    my @header = ( 0 ) x 8;
    $header[0] = 4 * scalar @header;                  # chartypes offset
    $header[4] = $header[0] + length $chartypes;      # locales offset
    $header[5] = $header[4] + length $locale_data;    # charmaps offset
    $header[6] = $header[5] + length $charmaps_data;  # geoids offset
    $header[7] = $header[6] + length $geoids_data;    # scripts offset

    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print {$output} pack( "L<*", @header );
    print {$output} $chartypes, $locale_data, $charmaps_data, $geoids_data, $scripts_data;
    # buffered write errors are only reported by close; do not install a
    # truncated file
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
5319 ################################################################
5320 # build the script to create registry keys
# Parameters:
#   $filename  path of the script to generate
#   %keys      maps a key path relative to
#              HKLM\SYSTEM\CurrentControlSet\Control\Nls (components
#              separated by backslashes) to an array reference holding the
#              default value of the key followed by its "val" lines
# The script is written to "$filename.new", then handed over to save_file().
# Dies if the temporary file cannot be created or written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print {$output} "HKLM\n{\n";

    # common parent keys, opened once for the whole script
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf {$output} "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }

    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        foreach my $i (0..$#subkeys)
        {
            # names containing whitespace are quoted; only the innermost
            # key gets the default value
            printf {$output} "%*s%s%s\n%*s{\n", 4 * $indent, "",
                $subkeys[$i] =~ /\s/ ? "'$subkeys[$i]'" : $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # the value text is passed as an argument so that a '%' in it is
        # not interpreted as a format directive
        foreach my $v (sort @vals) { printf {$output} "%*sval %s\n", 4 * $indent, "", $v; }
        foreach my $subkey (@subkeys) { printf {$output} "%*s}\n", 4 * --$indent, ""; }
    }

    # close the common parent keys and the HKLM block
    while ($indent) { printf {$output} "%*s}\n", 4 * --$indent, ""; }
    # buffered write errors are only reported by close; do not install a
    # truncated file
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
5354 ################################################################
5355 # save a file if modified
# Parameter:
#   $file  path of the target file; the new contents are expected in
#          "$file.new"
# If $file already exists with the same contents as "$file.new", the new
# file is deleted and $file is left untouched.  Otherwise "$file.new" is
# renamed to $file.  Dies if the rename fails.
sub save_file($)
{
    my $file = shift;
    my $new_file = "$file.new";

    # The files are compared with File::Compare instead of running cmp
    # through the shell, so that names containing spaces or shell
    # metacharacters are handled properly.  compare() returns 0 only when
    # both files could be read and are identical.
    require File::Compare;
    if (-f $file && File::Compare::compare( $file, $new_file ) == 0)
    {
        unlink $new_file;
    }
    else
    {
        rename $new_file, $file or die "Cannot rename $new_file to $file: $!";
    }
}
5370 ################################################################
5371 # main routine
# when the script is started from its own directory, all the paths below
# are relative to the parent directory
if (-f "./make_unicode")
{
    chdir "..";
}

load_data();
dump_sortkeys( "dlls/kernelbase/collation.c" );

# tables generated for both uniscribe and dwrite
my @text_dirs = ( "dlls/gdi32/uniscribe", "dlls/dwrite" );
foreach my $dir (@text_dirs) { dump_bidi_dir_table( "$dir/direction.c" ); }
foreach my $dir (@text_dirs) { dump_mirroring( "$dir/mirror.c" ); }
foreach my $dir (@text_dirs) { dump_bracket( "$dir/bracket.c" ); }
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $dir (@text_dirs) { dump_linebreak( "$dir/linebreak.c" ); }
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );

dump_vertical( "dlls/win32u/vertical.c", 1 );
dump_vertical( "dlls/wineps.drv/vertical.c", 0 );

# nls files
dump_intl_nls( "nls/l_intl.nls" );
foreach my $form (qw(nfc nfd nfkc nfkd idna))
{
    dump_norm_table( "nls/norm$form.nls" );
}
# the data returned for the sort table is embedded in locale.nls
my $chartypes = dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
dump_locales( "nls/locale.nls", $chartypes );
foreach my $codepage_file (@allfiles)
{
    dump_msdata_codepage( $codepage_file );
}
dump_eucjp_codepage();
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
5404 # Local Variables:
5405 # compile-command: "./make_unicode"
5406 # End: