d3dx10: Add support for ID3DX10ThreadPump parameter in D3DX10CreateTextureFromMemory.
[wine.git] / tools / make_unicode
blob33253104eb65827094e6225d6da0b6a3c886a81a
#!/usr/bin/perl -w
#
# Generate code page .c files from ftp.unicode.org descriptions
#
# Copyright 2000 Alexandre Julliard
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
22 use strict;
23 use XML::LibXML;
24 use Encode;
# Pinned versions and download locations of the upstream data sets.
# Every URL that embeds a *VERSION variable is interpolated at the point
# of assignment, so the version variable must be set before the URL.

# base URLs for www.unicode.org files
my $UNIVERSION = "14.0.0";
my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
my $UNIHAN = "https://www.unicode.org/Public/$UNIVERSION/ucd/Unihan.zip";
my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
# The embedded spaces are part of the archive name and are kept verbatim.
my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";

my $CLDRVERSION = "41";
my $CLDRDATA = "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip";
# Fixed CLDR 33 archive, independent of $CLDRVERSION.
my $CLDR33DATA = "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip";

my $ISO639VERSION = "20220120";
my $ISO639 = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip";

# Default char for undefined mappings
my $DEF_CHAR = ord '?';

# Last valid Unicode character
my $MAX_CHAR = 0x10ffff;
# Relative paths of the per-code-page description files, one per code page,
# in the order listed below.
#
# The entries are code page identifiers kept as strings (qw), so the leading
# zero of "037" is preserved.  The directory is spelled "CodpageFiles" on
# purpose: it is a runtime path and must not be "corrected"
# (presumably it matches the directory name inside the downloaded archive).
my @allfiles = map { "CodpageFiles/$_.txt" } qw(
    037 437 500 708 720 737 775
    850 852 855 857
    860 861 862 863 864 865 866 869
    874 875
    932 936 949 950
    1026
    1250 1251 1252 1253 1254 1255 1256 1257 1258
    1361
    10000 10001 10002 10003 10004 10005 10006 10007 10008
    10010 10017 10021 10029 10079 10081 10082
    20127 20866 21866
    28591 28592 28593 28594 28595 28596 28597 28598 28599
    28603 28605
);
# Character-type flag bits, keyed by flag name.
# The CT_CTYPE1 flags occupy the low 16 bits and the CT_CTYPE3 flags are
# stored shifted into the high 16 bits, so both sets fit in one integer.
my %ctype =
(
    # CT_CTYPE1
    "upper"  => 0x0001,
    "lower"  => 0x0002,
    "digit"  => 0x0004,
    "space"  => 0x0008,
    "punct"  => 0x0010,
    "cntrl"  => 0x0020,
    "blank"  => 0x0040,
    "xdigit" => 0x0080,
    # "alpha" additionally sets bit 31, which lies in the CT_CTYPE3 half
    "alpha"  => 0x0100 | 0x80000000,
    "defin"  => 0x0200,
    # CT_CTYPE3 in high 16 bits
    "nonspacing"    => 0x00010000,
    "diacritic"     => 0x00020000,
    "vowelmark"     => 0x00040000,
    "symbol"        => 0x00080000,
    "katakana"      => 0x00100000,
    "hiragana"      => 0x00200000,
    "halfwidth"     => 0x00400000,
    "fullwidth"     => 0x00800000,
    "ideograph"     => 0x01000000,
    "kashida"       => 0x02000000,
    "lexical"       => 0x04000000,
    "highsurrogate" => 0x08000000,
    "lowsurrogate"  => 0x10000000,
);
# Numeric codes for the two bracket type letters.
# NOTE(review): "o"/"c" presumably stand for opening/closing paired
# brackets - confirm against the code that reads this table.
my %bracket_types =
(
    "o" => 0x0000,
    "c" => 0x0001,
);
# Numeric codes for syllabic category names; values run consecutively
# from 0x00 ("Other") to 0x23.
# NOTE(review): the keys look like Unicode Indic_Syllabic_Category values -
# confirm against the data file this table is matched with.
my %indic_types =
(
    "Other"                       => 0x0000,
    "Bindu"                       => 0x0001,
    "Visarga"                     => 0x0002,
    "Avagraha"                    => 0x0003,
    "Nukta"                       => 0x0004,
    "Virama"                      => 0x0005,
    "Vowel_Independent"           => 0x0006,
    "Vowel_Dependent"             => 0x0007,
    "Vowel"                       => 0x0008,
    "Consonant_Placeholder"       => 0x0009,
    "Consonant"                   => 0x000a,
    "Consonant_Dead"              => 0x000b,
    "Consonant_Succeeding_Repha"  => 0x000c,
    "Consonant_Subjoined"         => 0x000d,
    "Consonant_Medial"            => 0x000e,
    "Consonant_Final"             => 0x000f,
    "Consonant_Head_Letter"       => 0x0010,
    "Modifying_Letter"            => 0x0011,
    "Tone_Letter"                 => 0x0012,
    "Tone_Mark"                   => 0x0013,
    "Register_Shifter"            => 0x0014,
    "Consonant_Preceding_Repha"   => 0x0015,
    "Pure_Killer"                 => 0x0016,
    "Invisible_Stacker"           => 0x0017,
    "Gemination_Mark"             => 0x0018,
    "Cantillation_Mark"           => 0x0019,
    "Non_Joiner"                  => 0x001a,
    "Joiner"                      => 0x001b,
    "Number_Joiner"               => 0x001c,
    "Number"                      => 0x001d,
    "Brahmi_Joining_Number"       => 0x001e,
    "Consonant_With_Stacker"      => 0x001f,
    "Consonant_Prefixed"          => 0x0020,
    "Syllable_Modifier"           => 0x0021,
    "Consonant_Killer"            => 0x0022,
    "Consonant_Initial_Postfixed" => 0x0023,
);
# Numeric codes for mark placement names; values run consecutively from
# 0x01 to 0x10 (0x00 is not assigned to any name in this table).
# NOTE(review): the keys look like Unicode Indic_Positional_Category
# values - confirm against the data file this table is matched with.
my %matra_types =
(
    "Right"                    => 0x01,
    "Left"                     => 0x02,
    "Visual_Order_Left"        => 0x03,
    "Left_And_Right"           => 0x04,
    "Top"                      => 0x05,
    "Bottom"                   => 0x06,
    "Top_And_Bottom"           => 0x07,
    "Top_And_Right"            => 0x08,
    "Top_And_Left"             => 0x09,
    "Top_And_Left_And_Right"   => 0x0a,
    "Bottom_And_Right"         => 0x0b,
    "Top_And_Bottom_And_Right" => 0x0c,
    "Overstruck"               => 0x0d,
    "Invisible"                => 0x0e,
    "Bottom_And_Left"          => 0x0f,
    "Top_And_Bottom_And_Left"  => 0x10,
);
# Numeric codes for break class abbreviations; values run consecutively
# from 0x01 ("BK") to 0x2b ("ZWJ").
# NOTE(review): the keys look like UAX #14 line-break class abbreviations -
# confirm against the data file this table is matched with.
my %break_types =
(
    "BK"  => 0x0001,
    "CR"  => 0x0002,
    "LF"  => 0x0003,
    "CM"  => 0x0004,
    "SG"  => 0x0005,
    "GL"  => 0x0006,
    "CB"  => 0x0007,
    "SP"  => 0x0008,
    "ZW"  => 0x0009,
    "NL"  => 0x000a,
    "WJ"  => 0x000b,
    "JL"  => 0x000c,
    "JV"  => 0x000d,
    "JT"  => 0x000e,
    "H2"  => 0x000f,
    "H3"  => 0x0010,
    "XX"  => 0x0011,
    "OP"  => 0x0012,
    "CL"  => 0x0013,
    "CP"  => 0x0014,
    "QU"  => 0x0015,
    "NS"  => 0x0016,
    "EX"  => 0x0017,
    "SY"  => 0x0018,
    "IS"  => 0x0019,
    "PR"  => 0x001a,
    "PO"  => 0x001b,
    "NU"  => 0x001c,
    "AL"  => 0x001d,
    "ID"  => 0x001e,
    "IN"  => 0x001f,
    "HY"  => 0x0020,
    "BB"  => 0x0021,
    "BA"  => 0x0022,
    "SA"  => 0x0023,
    "AI"  => 0x0024,
    "B2"  => 0x0025,
    "HL"  => 0x0026,
    "CJ"  => 0x0027,
    "RI"  => 0x0028,
    "EB"  => 0x0029,
    "EM"  => 0x002a,
    "ZWJ" => 0x002b,
);
# Numeric codes for the four orientation abbreviations.
# NOTE(review): R/U/Tr/Tu look like the Unicode Vertical_Orientation
# property values (UAX #50) - confirm against the consuming code.
my %vertical_types =
(
    "R"  => 0x0000,
    "U"  => 0x0001,
    "Tr" => 0x0002,
    "Tu" => 0x0003,
);
# Character-type flags for each two-letter general category code.
# Every category carries "defin"; the remaining bits are OR-ed in from
# %ctype, so %ctype must already be populated when this hash is built.
my %categories =
(
    "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
    "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
    "Mc" => $ctype{"defin"}, # Mark, Spacing Combining
    "Me" => $ctype{"defin"}, # Mark, Enclosing
    "Nd" => $ctype{"defin"}|$ctype{"digit"}, # Number, Decimal Digit
    "Nl" => $ctype{"defin"}|$ctype{"alpha"}, # Number, Letter
    "No" => $ctype{"defin"}, # Number, Other
    "Zs" => $ctype{"defin"}|$ctype{"space"}, # Separator, Space
    "Zl" => $ctype{"defin"}|$ctype{"space"}, # Separator, Line
    "Zp" => $ctype{"defin"}|$ctype{"space"}, # Separator, Paragraph
    "Cc" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Control
    "Cf" => $ctype{"defin"}|$ctype{"cntrl"}, # Other, Format
    "Cs" => $ctype{"defin"}, # Other, Surrogate
    "Co" => $ctype{"defin"}, # Other, Private Use
    "Cn" => $ctype{"defin"}, # Other, Not Assigned
    "Lm" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Modifier
    "Lo" => $ctype{"defin"}|$ctype{"alpha"}, # Letter, Other
    "Pc" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Connector
    "Pd" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Dash
    "Ps" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Open
    "Pe" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Close
    "Pi" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Initial quote
    "Pf" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Final quote
    "Po" => $ctype{"defin"}|$ctype{"punct"}, # Punctuation, Other
    "Sm" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Math
    "Sc" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Currency
    "Sk" => $ctype{"defin"}|$ctype{"symbol"}, # Symbol, Modifier
    "So" => $ctype{"defin"}|$ctype{"symbol"} # Symbol, Other
);
# a few characters need additional categories that cannot be determined automatically
#
# Each key is a %ctype flag name; each value is a reference to the list of
# code points that receive that flag in addition to whatever their general
# category gives them.
my %special_categories =
(
    "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
                  0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space" => [ 0x09..0x0d, 0x85 ],
    "blank" => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl" => [ 0x070f, 0x200c, 0x200d,
                 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                 0xfff9, 0xfffa, 0xfffb ],
    "punct" => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
                 0xd7, 0xf7 ],
    "digit" => [ 0xb2, 0xb3, 0xb9 ],
    "lower" => [ 0xaa, 0xba, 0x2071, 0x207f ],
    "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
                      0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
    "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
    "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
                  0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
                  0x02b9..0x02ba, 0x02c6..0x02cf ],
    "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
    # NOTE(review): 0x30ad is listed twice below; kept as-is because the
    # consumer of this list is not visible here - presumably harmless.
    "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
                     0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
                     0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
                     0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
                     0x3131..0x3164 ],
    "ideograph" => [ 0x3006..0x3007 ],
    "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
                   0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
                   0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
                   0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
                   0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
                   0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
                   0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
                   0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
                   0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
                   0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
    "kashida" => [ 0x0640 ],
);
# Numeric direction code for each bidirectional class abbreviation.
# All embedding, override and isolate controls share the single code 15.
my %directions =
(
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AL"  => 12,   # Right-to-Left Arabic
    "EN"  => 3,    # European Number
    "ES"  => 4,    # European Number Separator
    "ET"  => 5,    # European Number Terminator
    "AN"  => 6,    # Arabic Number
    "CS"  => 7,    # Common Number Separator
    "NSM" => 13,   # Non-Spacing Mark
    "BN"  => 14,   # Boundary Neutral
    "B"   => 8,    # Paragraph Separator
    "S"   => 9,    # Segment Separator
    "WS"  => 10,   # Whitespace
    "ON"  => 11,   # Other Neutrals
    "LRE" => 15,   # Left-to-Right Embedding
    "LRO" => 15,   # Left-to-Right Override
    "RLE" => 15,   # Right-to-Left Embedding
    "RLO" => 15,   # Right-to-Left Override
    "PDF" => 15,   # Pop Directional Format
    "LRI" => 15,   # Left-to-Right Isolate
    "RLI" => 15,   # Right-to-Left Isolate
    "FSI" => 15,   # First Strong Isolate
    "PDI" => 15    # Pop Directional Isolate
);
# C2_* type value for each bidirectional class abbreviation.
# Several classes collapse onto one value: "AL" shares C2_RIGHTTOLEFT with
# "R", "BN" maps to C2_NOTAPPLICABLE, and "NSM" plus all embedding, override
# and isolate controls map to C2_OTHERNEUTRAL.
my %c2_types =
(
    "L"   => 1,    # C2_LEFTTORIGHT
    "R"   => 2,    # C2_RIGHTTOLEFT
    "AL"  => 2,    # C2_RIGHTTOLEFT
    "EN"  => 3,    # C2_EUROPENUMBER
    "ES"  => 4,    # C2_EUROPESEPARATOR
    "ET"  => 5,    # C2_EUROPETERMINATOR
    "AN"  => 6,    # C2_ARABICNUMBER
    "CS"  => 7,    # C2_COMMONSEPARATOR
    "NSM" => 11,   # C2_OTHERNEUTRAL
    "BN"  => 0,    # C2_NOTAPPLICABLE
    "B"   => 8,    # C2_BLOCKSEPARATOR
    "S"   => 9,    # C2_SEGMENTSEPARATOR
    "WS"  => 10,   # C2_WHITESPACE
    "ON"  => 11,   # C2_OTHERNEUTRAL
    "LRE" => 11,   # C2_OTHERNEUTRAL
    "LRO" => 11,   # C2_OTHERNEUTRAL
    "RLE" => 11,   # C2_OTHERNEUTRAL
    "RLO" => 11,   # C2_OTHERNEUTRAL
    "PDF" => 11,   # C2_OTHERNEUTRAL
    "LRI" => 11,   # C2_OTHERNEUTRAL
    "RLI" => 11,   # C2_OTHERNEUTRAL
    "FSI" => 11,   # C2_OTHERNEUTRAL
    "PDI" => 11    # C2_OTHERNEUTRAL
);
# Numeric code for each bidirectional class abbreviation, using a distinct
# value (0..22) per class - including each embedding, override and isolate
# control.
my %bidi_types =
(
    "ON"  => 0,    # Other Neutrals
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AN"  => 3,    # Arabic Number
    "EN"  => 4,    # European Number
    "AL"  => 5,    # Right-to-Left Arabic
    "NSM" => 6,    # Non-Spacing Mark
    "CS"  => 7,    # Common Number Separator
    "ES"  => 8,    # European Number Separator
    "ET"  => 9,    # European Number Terminator
    "BN"  => 10,   # Boundary Neutral
    "S"   => 11,   # Segment Separator
    "WS"  => 12,   # Whitespace
    "B"   => 13,   # Paragraph Separator
    "RLO" => 14,   # Right-to-Left Override
    "RLE" => 15,   # Right-to-Left Embedding
    "LRO" => 16,   # Left-to-Right Override
    "LRE" => 17,   # Left-to-Right Embedding
    "PDF" => 18,   # Pop Directional Format
    "LRI" => 19,   # Left-to-Right Isolate
    "RLI" => 20,   # Right-to-Left Isolate
    "FSI" => 21,   # First Strong Isolate
    "PDI" => 22    # Pop Directional Isolate
);
# Numeric code for each joining type or joining group name.
# "D" and "C" intentionally share the value 3; the two Syriac entries are
# keyed by their full names, and "DALATH RISH" contains a space.
my %joining_types =
(
    "U"           => 0,   # Non_Joining
    "L"           => 1,   # Left_Joining
    "R"           => 2,   # Right_Joining
    "D"           => 3,   # Dual_Joining
    "C"           => 3,   # Join_Causing
    "ALAPH"       => 4,   # Syriac ALAPH
    "DALATH RISH" => 5,   # Syriac DALATH RISH group
    "T"           => 6,   # Transparent
);
438 my @locales =
440 { name => "", lcid => 0x0000007f, file => "root", territory => "IV", sabbrevlangname => "IVL", sopentypelang =>"dflt" },
441 { name => "aa", dir => "seed", sopentypelang => "AFR" },
442 { name => "aa-DJ", dir => "seed" },
443 { name => "aa-ER", dir => "seed" },
444 { name => "aa-ET", dir => "seed" },
445 { name => "af", lcid => 0x00000036, oemcp => 850, sabbrevlangname => "AFK", sopentypelang => "AFK" },
446 { name => "af-NA" },
447 { name => "af-ZA", lcid => 0x00000436 },
448 { name => "agq" },
449 { name => "agq-CM" },
450 { name => "ak", sopentypelang => "TWI" },
451 { name => "ak-GH" },
452 { name => "am", lcid => 0x0000005e, sabbrevlangname => "AMH" },
453 { name => "am-ET", lcid => 0x0000045e },
454 { name => "ar", lcid => 0x00000001, territory => "SA", oemcp => 720, group => 13 },
455 { name => "ar-001" },
456 { name => "ar-AE", lcid => 0x00003801, sabbrevlangname => "ARU" },
457 { name => "ar-BH", lcid => 0x00003c01, sabbrevlangname => "ARH" },
458 { name => "ar-DJ" },
459 { name => "ar-DZ", lcid => 0x00001401, sabbrevlangname => "ARG" },
460 { name => "ar-EG", lcid => 0x00000c01, sabbrevlangname => "ARE" },
461 { name => "ar-EH" },
462 { name => "ar-ER" },
463 { name => "ar-IL" },
464 { name => "ar-IQ", lcid => 0x00000801, sabbrevlangname => "ARI" },
465 { name => "ar-JO", lcid => 0x00002c01, sabbrevlangname => "ARJ" },
466 { name => "ar-KM" },
467 { name => "ar-KW", lcid => 0x00003401, sabbrevlangname => "ARK" },
468 { name => "ar-LB", lcid => 0x00003001, sabbrevlangname => "ARB" },
469 { name => "ar-LY", lcid => 0x00001001, sabbrevlangname => "ARL" },
470 { name => "ar-MA", lcid => 0x00001801, sabbrevlangname => "ARM" },
471 { name => "ar-MR" },
472 { name => "ar-OM", lcid => 0x00002001, sabbrevlangname => "ARO" },
473 { name => "ar-PS" },
474 { name => "ar-QA", lcid => 0x00004001, sabbrevlangname => "ARQ" },
475 { name => "ar-SA", lcid => 0x00000401, sabbrevlangname => "ARA" },
476 { name => "ar-SD" },
477 { name => "ar-SO" },
478 { name => "ar-SS" },
479 { name => "ar-SY", lcid => 0x00002801, sabbrevlangname => "ARS" },
480 { name => "ar-TD" },
481 { name => "ar-TN", lcid => 0x00001c01, sabbrevlangname => "ART" },
482 { name => "ar-YE", lcid => 0x00002401, sabbrevlangname => "ARY" },
483 { name => "arn", lcid => 0x0000007a, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sabbrevlangname => "MPD", sopentypelang => "MAP" },
484 { name => "arn-CL", lcid => 0x0000047a, dir => "seed" },
485 { name => "arn-Latn", alias => "arn" },
486 { name => "arn-Latn-CL", alias => "arn-CL" },
487 { name => "as", lcid => 0x0000004d, slist => ",", group => 15 },
488 { name => "as-IN", lcid => 0x0000044d },
489 { name => "asa" },
490 { name => "asa-TZ" },
491 { name => "ast" },
492 { name => "ast-ES" },
493 { name => "az", lcid => 0x0000002c, oemcp => 857, ebcdiccp => 20905, group => 2 },
494 { name => "az-Cyrl", lcid => 0x0000742c, oemcp => 866, ebcdiccp => 20880, group => 5, sabbrevlangname => "AZC" },
495 { name => "az-Cyrl-AZ", lcid => 0x0000082c },
496 { name => "az-Latn", lcid => 0x0000782c },
497 { name => "az-Latn-AZ", lcid => 0x0000042c },
498 { name => "ba", lcid => 0x0000006d, oemcp => 866, group => 5, dir => "seed", sabbrevlangname => "BAS", sopentypelang => "BSH" },
499 { name => "ba-Cyrl", alias => "ba" },
500 { name => "ba-Cyrl-RU", alias => "ba-RU" },
501 { name => "ba-RU", lcid => 0x0000046d, dir => "seed" },
502 { name => "bas" },
503 { name => "bas-CM" },
504 { name => "be", lcid => 0x00000023, oemcp => 866, ebcdiccp => 500, group => 5 },
505 { name => "be-BY", lcid => 0x00000423 },
506 { name => "bem" },
507 { name => "bem-ZM" },
508 { name => "bez" },
509 { name => "bez-TZ" },
510 { name => "bg", lcid => 0x00000002, oemcp => 866, ebcdiccp => 21025, group => 5, sabbrevlangname => "BGR", sopentypelang => "BGR" },
511 { name => "bg-BG", lcid => 0x00000402 },
512 { name => "bin", lcid => 0x00000066, oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "EDO" },
513 { name => "bin-NG", lcid => 0x00000466, file => "bin", dir => "exemplars" },
514 { name => "bm", sopentypelang => "BMB" },
515 { name => "bm-Latn", file => "bm" },
516 { name => "bm-Latn-ML", file => "bm_ML" },
517 { name => "bm-ML", alias => "bm-Latn-ML" },
518 { name => "bn", lcid => 0x00000045, slist => ",", group => 15, sabbrevlangname => "BNB" },
519 { name => "bn-BD", lcid => 0x00000845 },
520 { name => "bn-IN", lcid => 0x00000445, sabbrevlangname => "BNG" },
521 { name => "bo", lcid => 0x00000051, slist => ",", group => 15, sabbrevlangname => "BOB", sopentypelang => "TIB" },
522 { name => "bo-CN", lcid => 0x00000451 },
523 { name => "bo-IN", slist => "," },
524 { name => "bo-Tibt", alias => "bo" },
525 { name => "bo-Tibt-CN", alias => "bo-CN" },
526 { name => "bo-Tibt-IN", alias => "bo-IN" },
527 { name => "br", lcid => 0x0000007e, oemcp => 850, ebcdiccp => 20297 },
528 { name => "br-FR", lcid => 0x0000047e },
529 { name => "br-Latn", alias => "br" },
530 { name => "br-Latn-FR", alias => "br-FR" },
531 { name => "brx" },
532 { name => "brx-IN" },
533 { name => "bs", lcid => 0x0000781a, oemcp => 852, maccp => 10082, ebcdiccp => 870, group => 2, sabbrevlangname => "BSB" },
534 { name => "bs-Cyrl", lcid => 0x0000641a, oemcp => 855, group => 5, sabbrevlangname => "BSC" },
535 { name => "bs-Cyrl-BA", lcid => 0x0000201a },
536 { name => "bs-Latn", lcid => 0x0000681a },
537 { name => "bs-Latn-BA", lcid => 0x0000141a },
538 { name => "byn", dir => "seed", sopentypelang => "BIL" },
539 { name => "byn-ER", dir => "seed" },
540 { name => "ca", lcid => 0x00000003, oemcp => 850 },
541 { name => "ca-AD", maccp => 65001 },
542 { name => "ca-ES", lcid => 0x00000403 },
543 { name => "ca-ES-valencia", lcid => 0x00000803, file => "ca_ES_VALENCIA", sabbrevlangname => "VAL" },
544 { name => "ca-FR", maccp => 65001 },
545 { name => "ca-IT", maccp => 65001 },
546 { name => "ccp" },
547 { name => "ccp-BD", alias => "ccp-Cakm-BD" },
548 { name => "ccp-Cakm", file => "ccp" },
549 { name => "ccp-Cakm-BD", file => "ccp_BD" },
550 { name => "ccp-Cakm-IN", file => "ccp_IN" },
551 { name => "ccp-IN", alias => "ccp-Cakm-IN" },
552 { name => "ce" },
553 { name => "ce-RU" },
554 { name => "ceb" },
555 { name => "ceb-Latn", file => "ceb" },
556 { name => "ceb-Latn-PH", file => "ceb_PH" },
557 { name => "ceb-PH", alias => "ceb-Latn-PH" },
558 { name => "cgg" },
559 { name => "cgg-UG" },
560 { name => "chr", lcid => 0x0000005c, slist => ",", sabbrevlangname => "CRE" },
561 { name => "chr-Cher", lcid => 0x00007c5c, file => "chr" },
562 { name => "chr-Cher-US", lcid => 0x0000045c, file => "chr_US" },
563 { name => "chr-US", alias => "chr-Cher-US" },
564 { name => "ckb", alias => "ku" },
565 { name => "ckb-IQ", alias => "ku-Arab-IQ" },
566 { name => "ckb-IR", alias => "ku-Arab-IR" },
567 { name => "co", lcid => 0x00000083, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
568 { name => "co-FR", lcid => 0x00000483, dir => "seed" },
569 { name => "co-Latn", alias => "co" },
570 { name => "co-Latn-FR", alias => "co-FR" },
571 { name => "cs", lcid => 0x00000005, oemcp => 852, group => 2, sabbrevlangname => "CSY", sopentypelang => "CSY" },
572 { name => "cs-CZ", lcid => 0x00000405 },
573 { name => "cu", dir => "seed", sopentypelang => "CSL" },
574 { name => "cu-RU", dir => "seed" },
575 { name => "cy", lcid => 0x00000052, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "CYM", sopentypelang => "WEL" },
576 { name => "cy-GB", lcid => 0x00000452 },
577 { name => "da", lcid => 0x00000006, oemcp => 850, ebcdiccp => 20277 },
578 { name => "da-DK", lcid => 0x00000406 },
579 { name => "da-GL", maccp => 65001 },
580 { name => "dav" },
581 { name => "dav-KE" },
582 { name => "de", lcid => 0x00000007, oemcp => 850, ebcdiccp => 20273 },
583 { name => "de-AT", lcid => 0x00000c07, sabbrevlangname => "DEA" },
584 { name => "de-BE" },
585 { name => "de-CH", lcid => 0x00000807, sabbrevlangname => "DES" },
586 { name => "de-DE", lcid => 0x00000407 },
587 { name => "de-DE_phoneb", lcid => 0x00010407, alias => "de-DE" },
588 { name => "de-DE-u-co-phonebk", alias => "de-DE_phoneb" },
589 { name => "de-IT", oemcp => 65001 },
590 { name => "de-LI", lcid => 0x00001407, sabbrevlangname => "DEC" },
591 { name => "de-LU", lcid => 0x00001007, sabbrevlangname => "DEL" },
592 { name => "dje", sopentypelang => "DJR" },
593 { name => "dje-NE" },
594 { name => "doi" },
595 { name => "doi-IN" },
596 { name => "dsb", lcid => 0x00007c2e, sparent => "hsb", oemcp => 850, ebcdiccp => 870, sabbrevlangname => "DSB", sopentypelang => "LSB" },
597 { name => "dsb-DE", lcid => 0x0000082e },
598 { name => "dua" },
599 { name => "dua-CM" },
600 { name => "dv", lcid => 0x00000065, slist => "\x{060c}", group => 13, dir => "seed" },
601 { name => "dv-MV", lcid => 0x00000465, dir => "seed" },
602 { name => "dyo" },
603 { name => "dyo-SN" },
604 { name => "dz", sopentypelang => "DZN" },
605 { name => "dz-BT", lcid => 0x00000c51, sabbrevlangname => "ZZZ" },
606 { name => "ebu" },
607 { name => "ebu-KE" },
608 { name => "ee" },
609 { name => "ee-GH" },
610 { name => "ee-TG" },
611 { name => "el", lcid => 0x00000008, oemcp => 737, group => 4 },
612 { name => "el-CY" },
613 { name => "el-GR", lcid => 0x00000408 },
614 { name => "en", lcid => 0x00000009, oemcp => 437, slist => ",", sabbrevlangname => "ENU" },
615 { name => "en-001", oemcp => 850 },
616 { name => "en-029", lcid => 0x00002409, file => "en", oemcp => 850, sabbrevlangname => "ENB" },
617 { name => "en-150", oemcp => 65001 },
618 { name => "en-AE", lcid => 0x00004c09, oemcp => 65001, sabbrevlangname => "ZZZ" },
619 { name => "en-AG", oemcp => 850 },
620 { name => "en-AI", oemcp => 850 },
621 { name => "en-AS", oemcp => 850 },
622 { name => "en-AT", oemcp => 65001 },
623 { name => "en-AU", lcid => 0x00000c09, oemcp => 850, sabbrevlangname => "ENA" },
624 { name => "en-BB", oemcp => 850 },
625 { name => "en-BE", oemcp => 850 },
626 { name => "en-BI", oemcp => 65001 },
627 { name => "en-BM", oemcp => 850 },
628 { name => "en-BS", oemcp => 850 },
629 { name => "en-BW", oemcp => 850 },
630 { name => "en-BZ", lcid => 0x00002809, oemcp => 850, sabbrevlangname => "ENL" },
631 { name => "en-CA", lcid => 0x00001009, oemcp => 850, ebcdiccp => 37, sabbrevlangname => "ENC" },
632 { name => "en-CC", oemcp => 850 },
633 { name => "en-CH", oemcp => 65001 },
634 { name => "en-CK", oemcp => 850 },
635 { name => "en-CM", oemcp => 850 },
636 { name => "en-CX", oemcp => 850 },
637 { name => "en-CY", oemcp => 65001 },
638 { name => "en-DE", oemcp => 65001 },
639 { name => "en-DG", oemcp => 850 },
640 { name => "en-DK", oemcp => 65001 },
641 { name => "en-DM", oemcp => 850 },
642 { name => "en-ER", oemcp => 850 },
643 { name => "en-FI", oemcp => 65001 },
644 { name => "en-FJ", oemcp => 850 },
645 { name => "en-FK", oemcp => 850 },
646 { name => "en-FM", oemcp => 850 },
647 { name => "en-GB", lcid => 0x00000809, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "ENG" },
648 { name => "en-GD", oemcp => 850 },
649 { name => "en-GG", oemcp => 850 },
650 { name => "en-GH", oemcp => 850 },
651 { name => "en-GI", oemcp => 850 },
652 { name => "en-GM", oemcp => 850 },
653 { name => "en-GU", oemcp => 850 },
654 { name => "en-GY", oemcp => 850 },
655 { name => "en-HK", lcid => 0x00003c09, oemcp => 850, sabbrevlangname => "ENH" },
656 { name => "en-ID", lcid => 0x00003809, file => "en", oemcp => 850, sabbrevlangname => "ZZZ" },
657 { name => "en-IE", lcid => 0x00001809, oemcp => 850, sabbrevlangname => "ENI" },
658 { name => "en-IL", oemcp => 65001 },
659 { name => "en-IM", oemcp => 850 },
660 { name => "en-IN", lcid => 0x00004009, sabbrevlangname => "ENN" },
661 { name => "en-IO", oemcp => 850 },
662 { name => "en-JE", oemcp => 850 },
663 { name => "en-JM", lcid => 0x00002009, oemcp => 850, sabbrevlangname => "ENJ" },
664 { name => "en-KE", oemcp => 850 },
665 { name => "en-KI", oemcp => 850 },
666 { name => "en-KN", oemcp => 850 },
667 { name => "en-KY", oemcp => 850 },
668 { name => "en-LC", oemcp => 850 },
669 { name => "en-LR", oemcp => 850 },
670 { name => "en-LS", oemcp => 850 },
671 { name => "en-MG", oemcp => 850 },
672 { name => "en-MH", oemcp => 850 },
673 { name => "en-MO", oemcp => 850 },
674 { name => "en-MP", oemcp => 850 },
675 { name => "en-MS", oemcp => 850 },
676 { name => "en-MT", oemcp => 850 },
677 { name => "en-MU", oemcp => 850 },
678 { name => "en-MW", oemcp => 850 },
679 { name => "en-MY", lcid => 0x00004409, sabbrevlangname => "ENM" },
680 { name => "en-NA", oemcp => 850 },
681 { name => "en-NF", oemcp => 850 },
682 { name => "en-NG", oemcp => 850 },
683 { name => "en-NL", oemcp => 65001 },
684 { name => "en-NR", oemcp => 850 },
685 { name => "en-NU", oemcp => 850 },
686 { name => "en-NZ", lcid => 0x00001409, oemcp => 850, sabbrevlangname => "ENZ" },
687 { name => "en-PG", oemcp => 850 },
688 { name => "en-PH", lcid => 0x00003409, ebcdiccp => 500, sabbrevlangname => "ENP" },
689 { name => "en-PK", oemcp => 850 },
690 { name => "en-PN", oemcp => 850 },
691 { name => "en-PR", oemcp => 850 },
692 { name => "en-PW", oemcp => 850 },
693 { name => "en-RW", oemcp => 850 },
694 { name => "en-SB", oemcp => 850 },
695 { name => "en-SC", oemcp => 850 },
696 { name => "en-SD", oemcp => 850 },
697 { name => "en-SE", oemcp => 65001 },
698 { name => "en-SG", lcid => 0x00004809, sabbrevlangname => "ENE" },
699 { name => "en-SH", oemcp => 850 },
700 { name => "en-SI", oemcp => 65001 },
701 { name => "en-SL", oemcp => 850 },
702 { name => "en-SS", oemcp => 850 },
703 { name => "en-SX", oemcp => 850 },
704 { name => "en-SZ", oemcp => 850 },
705 { name => "en-TC", oemcp => 850 },
706 { name => "en-TK", oemcp => 850 },
707 { name => "en-TO", oemcp => 850 },
708 { name => "en-TT", lcid => 0x00002c09, oemcp => 850, sabbrevlangname => "ENT" },
709 { name => "en-TV", oemcp => 850 },
710 { name => "en-TZ", oemcp => 850 },
711 { name => "en-UG", oemcp => 850 },
712 { name => "en-UM", oemcp => 850 },
713 { name => "en-US", lcid => 0x00000409 },
714 { name => "en-VC", oemcp => 850 },
715 { name => "en-VG", oemcp => 850 },
716 { name => "en-VI", oemcp => 850 },
717 { name => "en-VU", oemcp => 850 },
718 { name => "en-WS", oemcp => 850 },
719 { name => "en-ZA", lcid => 0x00001c09, ebcdiccp => 500, sabbrevlangname => "ENS" },
720 { name => "en-ZM", oemcp => 850 },
721 { name => "en-ZW", lcid => 0x00003009, ebcdiccp => 500, sabbrevlangname => "ENW" },
722 { name => "eo", sopentypelang => "NTO" },
723 { name => "eo-001" },
724 { name => "es", lcid => 0x0000000a, oemcp => 850, ebcdiccp => 20284, sabbrevlangname => "ESP", sopentypelang => "ESP" },
725 { name => "es-419", lcid => 0x0000580a, sabbrevlangname => "ESJ" },
726 { name => "es-AR", lcid => 0x00002c0a, sabbrevlangname => "ESS" },
727 { name => "es-BO", lcid => 0x0000400a, sabbrevlangname => "ESB" },
728 { name => "es-BR", oemcp => 65001 },
729 { name => "es-BZ", oemcp => 65001 },
730 { name => "es-CL", lcid => 0x0000340a, sabbrevlangname => "ESL" },
731 { name => "es-CO", lcid => 0x0000240a, sabbrevlangname => "ESO" },
732 { name => "es-CR", lcid => 0x0000140a, sabbrevlangname => "ESC" },
733 { name => "es-CU", lcid => 0x00005c0a, sabbrevlangname => "ESK" },
734 { name => "es-DO", lcid => 0x00001c0a, sabbrevlangname => "ESD" },
735 { name => "es-EA" },
736 { name => "es-EC", lcid => 0x0000300a, sabbrevlangname => "ESF" },
737 { name => "es-ES", lcid => 0x00000c0a, sabbrevlangname => "ESN" },
738 { name => "es-ES_tradnl", lcid => 0x0000040a, file => "es_ES" },
739 { name => "es-ES-u-co-trad", alias => "es-ES_tradnl" },
740 { name => "es-GQ" },
741 { name => "es-GT", lcid => 0x0000100a, sabbrevlangname => "ESG" },
742 { name => "es-HN", lcid => 0x0000480a, sabbrevlangname => "ESH" },
743 { name => "es-IC" },
744 { name => "es-MX", lcid => 0x0000080a, sabbrevlangname => "ESM" },
745 { name => "es-NI", lcid => 0x00004c0a, sabbrevlangname => "ESI" },
746 { name => "es-PA", lcid => 0x0000180a, sabbrevlangname => "ESA" },
747 { name => "es-PE", lcid => 0x0000280a, sabbrevlangname => "ESR" },
748 { name => "es-PH" },
749 { name => "es-PR", lcid => 0x0000500a, sabbrevlangname => "ESU" },
750 { name => "es-PY", lcid => 0x00003c0a, sabbrevlangname => "ESZ" },
751 { name => "es-SV", lcid => 0x0000440a, sabbrevlangname => "ESE" },
752 { name => "es-US", lcid => 0x0000540a, sabbrevlangname => "EST" },
753 { name => "es-UY", lcid => 0x0000380a, sabbrevlangname => "ESY" },
754 { name => "es-VE", lcid => 0x0000200a, sabbrevlangname => "ESV" },
755 { name => "et", lcid => 0x00000025, oemcp => 775, group => 3, sabbrevlangname => "ETI", sopentypelang => "ETI" },
756 { name => "et-EE", lcid => 0x00000425 },
757 { name => "eu", lcid => 0x0000002d, oemcp => 850, maccp => 65001, sabbrevlangname => "EUQ", sopentypelang => "EUQ" },
758 { name => "eu-ES", lcid => 0x0000042d },
759 { name => "ewo" },
760 { name => "ewo-CM" },
761 { name => "fa", lcid => 0x00000029, inegnumber => 3, oemcp => 720, slist => "\x{061b}", group => 13, sabbrevlangname => "FAR", sopentypelang => "FAR" },
762 { name => "fa-AF", alias => "prs-AF" },
763 { name => "fa-IR", lcid => 0x00000429 },
764 { name => "ff", lcid => 0x00000067, oemcp => 850, ebcdiccp => 20297 },
765 { name => "ff-CM", alias => "ff-Latn-CM" },
766 { name => "ff-GN", alias => "ff-Latn-GN" },
767 { name => "ff-MR", alias => "ff-Latn-MR" },
768 { name => "ff-NG", alias => "ff-Latn-NG" },
769 { name => "ff-SN", alias => "ff-Latn-SN" },
770 { name => "ff-Adlm" },
771 { name => "ff-Adlm-BF" },
772 { name => "ff-Adlm-CM" },
773 { name => "ff-Adlm-GH" },
774 { name => "ff-Adlm-GM" },
775 { name => "ff-Adlm-GN" },
776 { name => "ff-Adlm-GW" },
777 { name => "ff-Adlm-LR" },
778 { name => "ff-Adlm-MR" },
779 { name => "ff-Adlm-NE" },
780 { name => "ff-Adlm-NG" },
781 { name => "ff-Adlm-SL" },
782 { name => "ff-Adlm-SN" },
783 { name => "ff-Latn", lcid => 0x00007c67 },
784 { name => "ff-Latn-BF", oemcp => 65001 },
785 { name => "ff-Latn-CM" },
786 { name => "ff-Latn-GH", oemcp => 65001 },
787 { name => "ff-Latn-GM", oemcp => 65001 },
788 { name => "ff-Latn-GN" },
789 { name => "ff-Latn-GW", oemcp => 65001 },
790 { name => "ff-Latn-LR", oemcp => 65001 },
791 { name => "ff-Latn-MR" },
792 { name => "ff-Latn-NE", oemcp => 65001 },
793 { name => "ff-Latn-NG", lcid => 0x00000467, sabbrevlangname => "ZZZ" },
794 { name => "ff-Latn-SL", oemcp => 65001 },
795 { name => "ff-Latn-SN", lcid => 0x00000867 },
796 { name => "fi", lcid => 0x0000000b, oemcp => 850, ebcdiccp => 20278 },
797 { name => "fi-FI", lcid => 0x0000040b },
798 { name => "fil", lcid => 0x00000064, oemcp => 437, ebcdiccp => 500, sabbrevlangname => "FPO", sopentypelang => "PIL" },
799 { name => "fil-PH", lcid => 0x00000464 },
800 { name => "fil-Latn", alias => "fil" },
801 { name => "fil-Latn-PH", alias => "fil-PH" },
802 { name => "fo", lcid => 0x00000038, oemcp => 850, maccp => 10079, ebcdiccp => 20277, sabbrevlangname => "FOS", sopentypelang => "FOS" },
803 { name => "fo-DK", oemcp => 65001, maccp => 65001 },
804 { name => "fo-FO", lcid => 0x00000438 },
805 { name => "fr", lcid => 0x0000000c, oemcp => 850, ebcdiccp => 20297 },
806 { name => "fr-029", lcid => 0x00001c0c, file => "fr", sabbrevlangname => "ZZZ" },
807 { name => "fr-BE", lcid => 0x0000080c, sabbrevlangname => "FRB" },
808 { name => "fr-BF" },
809 { name => "fr-BI" },
810 { name => "fr-BJ" },
811 { name => "fr-BL" },
812 { name => "fr-CA", lcid => 0x00000c0c, sabbrevlangname => "FRC" },
813 { name => "fr-CD", lcid => 0x0000240c, sabbrevlangname => "FRD" },
814 { name => "fr-CF" },
815 { name => "fr-CG" },
816 { name => "fr-CH", lcid => 0x0000100c, sabbrevlangname => "FRS" },
817 { name => "fr-CI", lcid => 0x0000300c, sabbrevlangname => "FRI" },
818 { name => "fr-CM", lcid => 0x00002c0c, sabbrevlangname => "FRE" },
819 { name => "fr-DJ" },
820 { name => "fr-DZ" },
821 { name => "fr-FR", lcid => 0x0000040c },
822 { name => "fr-GA" },
823 { name => "fr-GF" },
824 { name => "fr-GN" },
825 { name => "fr-GP" },
826 { name => "fr-GQ" },
827 { name => "fr-HT", lcid => 0x00003c0c, sabbrevlangname => "FRH" },
828 { name => "fr-KM" },
829 { name => "fr-LU", lcid => 0x0000140c, sabbrevlangname => "FRL" },
830 { name => "fr-MA", lcid => 0x0000380c, sabbrevlangname => "FRO" },
831 { name => "fr-MC", lcid => 0x0000180c, sabbrevlangname => "FRM" },
832 { name => "fr-MF" },
833 { name => "fr-MG" },
834 { name => "fr-ML", lcid => 0x0000340c, sabbrevlangname => "FRF" },
835 { name => "fr-MQ" },
836 { name => "fr-MR" },
837 { name => "fr-MU" },
838 { name => "fr-NC" },
839 { name => "fr-NE" },
840 { name => "fr-PF" },
841 { name => "fr-PM" },
842 { name => "fr-RE", lcid => 0x0000200c, sabbrevlangname => "FRR" },
843 { name => "fr-RW" },
844 { name => "fr-SC" },
845 { name => "fr-SN", lcid => 0x0000280c, sabbrevlangname => "FRN" },
846 { name => "fr-SY" },
847 { name => "fr-TD" },
848 { name => "fr-TG" },
849 { name => "fr-TN" },
850 { name => "fr-VU" },
851 { name => "fr-WF" },
852 { name => "fr-YT" },
853 { name => "fur", sopentypelang => "FRL" },
854 { name => "fur-IT" },
855 { name => "fy", lcid => 0x00000062, oemcp => 850, sabbrevlangname => "FYN", sopentypelang => "FRI" },
856 { name => "fy-NL", lcid => 0x00000462 },
857 { name => "ga", lcid => 0x0000003c, oemcp => 850, sabbrevlangname => "IRE", sopentypelang => "IRI" },
858 { name => "ga-GB" },
859 { name => "ga-IE", lcid => 0x0000083c },
860 { name => "gd", lcid => 0x00000091, oemcp => 850, ebcdiccp => 20285, sopentypelang => "GAE" },
861 { name => "gd-GB", lcid => 0x00000491 },
862 { name => "gd-Latn", alias => "gd" },
863 { name => "gl", lcid => 0x00000056, oemcp => 850, sabbrevlangname => "GLC", sopentypelang => "GAL" },
864 { name => "gl-ES", lcid => 0x00000456 },
865 { name => "gn", lcid => 0x00000074, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sopentypelang => "GUA" },
866 { name => "gn-PY", lcid => 0x00000474, dir => "seed" },
867 { name => "gsw", lcid => 0x00000084, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "ZZZ", sopentypelang => "ALS" },
868 { name => "gsw-CH" },
869 { name => "gsw-FR", lcid => 0x00000484, sabbrevlangname => "GSW" },
870 { name => "gsw-LI" },
871 { name => "gu", lcid => 0x00000047, slist => ",", group => 15 },
872 { name => "gu-IN", lcid => 0x00000447 },
873 { name => "guz" },
874 { name => "guz-KE" },
875 { name => "gv", sopentypelang => "MNX" },
876 { name => "gv-IM" },
877 { name => "ha", lcid => 0x00000068, oemcp => 437 },
878 { name => "ha-GH", alias => "ha-Latn-GH" },
879 { name => "ha-Latn", lcid => 0x00007c68, file => "ha" },
880 { name => "ha-Latn-GH", file => "ha_GH", ebcdiccp => 500 },
881 { name => "ha-Latn-NE", file => "ha_NE", ebcdiccp => 500 },
882 { name => "ha-Latn-NG", lcid => 0x00000468, file => "ha_NG" },
883 { name => "ha-NE", alias => "ha-Latn-NE" },
884 { name => "ha-NG", alias => "ha-Latn-NG" },
885 { name => "haw", lcid => 0x00000075, oemcp => 437 },
886 { name => "haw-Latn", alias => "haw" },
887 { name => "haw-Latn-US", alias => "haw-US" },
888 { name => "haw-US", lcid => 0x00000475 },
889 { name => "he", lcid => 0x0000000d, oemcp => 862, slist => ",", group => 12, sopentypelang => "IWR" },
890 { name => "he-IL", lcid => 0x0000040d },
891 { name => "hi", lcid => 0x00000039, slist => ",", group => 15 },
892 { name => "hi-IN", lcid => 0x00000439 },
893 { name => "hr", lcid => 0x0000001a, inegnumber => 2, oemcp => 852, maccp => 10082, group => 2 },
894 { name => "hr-BA", lcid => 0x0000101a, ebcdiccp => 870, inegnumber => 1, sabbrevlangname => "HRB" },
895 { name => "hr-HR", lcid => 0x0000041a },
896 { name => "hsb", lcid => 0x0000002e, oemcp => 850, ebcdiccp => 870, sopentypelang => "USB" },
897 { name => "hsb-DE", lcid => 0x0000042e },
898 { name => "hu", lcid => 0x0000000e, oemcp => 852, group => 2 },
899 { name => "hu-HU", lcid => 0x0000040e },
900 { name => "hu-HU_technl", lcid => 0x0001040e, alias => "hu-HU" },
901 { name => "hy", lcid => 0x0000002b, slist => ",", group => 17 },
902 { name => "hy-AM", lcid => 0x0000042b },
903 { name => "ia" },
904 { name => "ia-001" },
905 ## name => "ibb", lcid => 0x00000069 },
906 ## name => "ibb-NG", lcid => 0x00000469 },
907 { name => "id", lcid => 0x00000021, oemcp => 850 },
908 { name => "id-ID", lcid => 0x00000421 },
909 { name => "ig", lcid => 0x00000070, oemcp => 437 },
910 { name => "ig-Latn", alias => "ig" },
911 { name => "ig-Latn-NG", alias => "ig-NG" },
912 { name => "ig-NG", lcid => 0x00000470 },
913 { name => "ii", lcid => 0x00000078, group => 9, sopentypelang => "YIM" },
914 { name => "ii-CN", lcid => 0x00000478 },
915 { name => "ii-Yiii", alias => "ii" },
916 { name => "ii-Yiii-CN", alias => "ii-CN" },
917 { name => "is", lcid => 0x0000000f, oemcp => 850, maccp => 10079, ebcdiccp => 20871 },
918 { name => "is-IS", lcid => 0x0000040f },
919 { name => "it", lcid => 0x00000010, oemcp => 850, ebcdiccp => 20280 },
920 { name => "it-CH", lcid => 0x00000810, ebcdiccp => 500, sabbrevlangname => "ITS" },
921 { name => "it-IT", lcid => 0x00000410 },
922 { name => "it-SM" },
923 { name => "it-VA", oemcp => 65001 },
924 { name => "iu", lcid => 0x0000005d, oemcp => 437, slist => ",", sortlocale => "iu-Latn-CA", dir => "seed", sabbrevlangname => "IUK", sopentypelang => "INU" },
925 { name => "iu-Cans", lcid => 0x0000785d, file => "iu", oemcp => 65001, dir => "seed", sabbrevlangname => "IUS" },
926 { name => "iu-Cans-CA", lcid => 0x0000045d, file => "iu_CA", dir => "seed" },
927 { name => "iu-Latn", lcid => 0x00007c5d, dir => "seed" },
928 { name => "iu-Latn-CA", lcid => 0x0000085d, dir => "seed" },
929 { name => "ja", lcid => 0x00000011, ireadinglayout => 2, oemcp => 932, slist => ",", sscripts => "Hani Hira Jpan Kana", group => 7, sopentypelang => "JAN" },
930 { name => "ja-JP", lcid => 0x00000411 },
931 { name => "ja-JP_radstr", lcid => 0x00040411, alias => "ja-JP" },
932 { name => "ja-JP-u-co-unihan", alias => "ja-JP_radstr" },
933 { name => "jgo" },
934 { name => "jgo-CM" },
935 { name => "jmc" },
936 { name => "jmc-TZ" },
937 { name => "jv", oemcp => 850 },
938 { name => "jv-ID", alias => "jv-Latn-ID" },
939 ## name => "jv-Java" },
940 ## name => "jv-Java-ID" },
941 { name => "jv-Latn", file => "jv" },
942 { name => "jv-Latn-ID", file => "jv_ID" },
943 { name => "ka", lcid => 0x00000037, group => 16 },
944 { name => "ka-GE", lcid => 0x00000437 },
945 { name => "ka-GE_modern", lcid => 0x00010437, alias => "ka-GE" },
946 { name => "kab", sopentypelang => "KAB0" },
947 { name => "kab-DZ" },
948 { name => "kam", sopentypelang => "KMB" },
949 { name => "kam-KE" },
950 { name => "kde" },
951 { name => "kde-TZ" },
952 { name => "kea" },
953 { name => "kea-CV" },
954 { name => "kgp" },
955 { name => "kgp-BR" },
956 { name => "khq" },
957 { name => "khq-ML" },
958 { name => "ki" },
959 { name => "ki-KE" },
960 { name => "kk", lcid => 0x0000003f, group => 5, sabbrevlangname => "KKZ" },
961 { name => "kk-Cyrl", alias => "kk" },
962 { name => "kk-Cyrl-KZ", alias => "kk-KZ" },
963 { name => "kk-KZ", lcid => 0x0000043f },
964 { name => "kkj" },
965 { name => "kkj-CM" },
966 { name => "kl", lcid => 0x0000006f, oemcp => 850, ebcdiccp => 20277, sopentypelang => "GRN" },
967 { name => "kl-GL", lcid => 0x0000046f },
968 { name => "kln", sopentypelang => "KAL" },
969 { name => "kln-KE" },
970 { name => "km", lcid => 0x00000053, inegnumber => 2, slist => ",", group => 15 },
971 { name => "km-KH", lcid => 0x00000453 },
972 { name => "kn", lcid => 0x0000004b, slist => ",", group => 15, sabbrevlangname => "KDI" },
973 { name => "kn-IN", lcid => 0x0000044b },
974 { name => "ko", lcid => 0x00000012, ireadinglayout => 2, slist => ",", oemcp => 949, ebcdiccp => 20833, sscripts => "Hang Hani Kore", group => 8 },
975 { name => "ko-KP", oemcp => 65001 },
976 { name => "ko-KR", lcid => 0x00000412 },
977 { name => "kok", lcid => 0x00000057, slist => ",", group => 15, sabbrevlangname => "KNK" },
978 { name => "kok-IN", lcid => 0x00000457 },
979 { name => "kr", lcid => 0x00000071, sortlocale => "kr-Latn-NG", oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "KNR" },
980 { name => "kr-Latn", file => "kr", dir => "exemplars" },
981 { name => "kr-Latn-NG", lcid => 0x00000471, file => "kr", dir => "exemplars" },
982 { name => "kr-NG", alias => "kr-Latn-NG" },
983 { name => "ks", lcid => 0x00000060, group => 15, sabbrevlangname => "ZZZ", sopentypelang => "KSH" },
984 { name => "ks-Arab", lcid => 0x00000460 },
985 { name => "ks-Arab-IN" },
986 { name => "ks-Deva", slist => "," },
987 { name => "ks-Deva-IN", lcid => 0x00000860 },
988 { name => "ks-IN", alias => "ks-Arab-IN" },
989 { name => "ksb" },
990 { name => "ksb-TZ" },
991 { name => "ksf" },
992 { name => "ksf-CM" },
993 { name => "ksh", sopentypelang => "KSH0" },
994 { name => "ksh-DE" },
995 { name => "ku", lcid => 0x00000092, file => "ckb", slist => "\x{061b}", sortlocale => "ku-Arab-IQ", oemcp => 720 },
996 { name => "ku-Arab", lcid => 0x00007c92, file => "ckb", group => 13 },
997 { name => "ku-Arab-IQ", lcid => 0x00000492, file => "ckb_IQ" },
998 { name => "ku-Arab-IR", file => "ckb_IR", oemcp => 65001 },
999 { name => "kw" },
1000 { name => "kw-GB" },
1001 { name => "ky", lcid => 0x00000040, oemcp => 866, group => 5, sabbrevlangname => "KYR" },
1002 { name => "ky-Cyrl", alias => "ky" },
1003 { name => "ky-Cyrl-KG", alias => "ky-KG" },
1004 { name => "ky-KG", lcid => 0x00000440 },
1005 { name => "la", lcid => 0x00000076, oemcp => 437, slist => ",", dir => "seed", sabbrevlangname => "ZZZ" },
1006 { name => "la-001", lcid => 0x00000476, file => "la", dir => "seed" },
1007 { name => "lag" },
1008 { name => "lag-TZ" },
1009 { name => "lb", lcid => 0x0000006e, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "LBX" },
1010 { name => "lb-LU", lcid => 0x0000046e },
1011 { name => "lg" },
1012 { name => "lg-UG" },
1013 { name => "lkt" },
1014 { name => "lkt-US" },
1015 { name => "ln" },
1016 { name => "ln-AO" },
1017 { name => "ln-CD" },
1018 { name => "ln-CF" },
1019 { name => "ln-CG" },
1020 { name => "lo", lcid => 0x00000054, group => 15 },
1021 { name => "lo-LA", lcid => 0x00000454 },
1022 { name => "lrc" },
1023 { name => "lrc-IQ" },
1024 { name => "lrc-IR" },
1025 { name => "lt", lcid => 0x00000027, oemcp => 775, group => 3, sabbrevlangname => "LTH", sopentypelang => "LTH" },
1026 { name => "lt-LT", lcid => 0x00000427 },
1027 { name => "lu" },
1028 { name => "lu-CD" },
1029 { name => "luo" },
1030 { name => "luo-KE" },
1031 { name => "luy", sopentypelang => "LUH" },
1032 { name => "luy-KE" },
1033 { name => "lv", lcid => 0x00000026, oemcp => 775, group => 3, sabbrevlangname => "LVI", sopentypelang => "LVI" },
1034 { name => "lv-LV", lcid => 0x00000426 },
1035 { name => "mai" },
1036 { name => "mai-IN" },
1037 { name => "mas" },
1038 { name => "mas-KE" },
1039 { name => "mas-TZ" },
1040 { name => "mer" },
1041 { name => "mer-KE" },
1042 { name => "mfe" },
1043 { name => "mfe-MU" },
1044 { name => "mg" },
1045 { name => "mg-MG" },
1046 { name => "mgh" },
1047 { name => "mgh-MZ" },
1048 { name => "mgo" },
1049 { name => "mgo-CM" },
1050 { name => "mi", lcid => 0x00000081, slist => "," },
1051 { name => "mi-Latn", alias => "mi" },
1052 { name => "mi-Latn-NZ", alias => "mi-NZ" },
1053 { name => "mi-NZ", lcid => 0x00000481 },
1054 { name => "mk", lcid => 0x0000002f, oemcp => 866, ebcdiccp => 500, group => 5, sabbrevlangname => "MKI" },
1055 { name => "mk-MK", lcid => 0x0000042f },
1056 { name => "ml", lcid => 0x0000004c, group => 15, sabbrevlangname => "MYM", sopentypelang => "MLR" },
1057 { name => "ml-IN", lcid => 0x0000044c },
1058 { name => "mn", lcid => 0x00000050, oemcp => 866, sopentypelang => "MNG" },
1059 { name => "mn-Cyrl", lcid => 0x00007850, file => "mn", sabbrevlangname => "MNN" },
1060 { name => "mn-Cyrl-MN", alias => "mn-MN" },
1061 { name => "mn-MN", lcid => 0x00000450, sparent => "mn-Cyrl", group => 5 },
1062 { name => "mn-Mong", lcid => 0x00007c50, oemcp => 65001, slist => ",", group => 15, dir => "seed", sabbrevlangname => "MNG" },
1063 { name => "mn-Mong-CN", lcid => 0x00000850, dir => "seed" },
1064 { name => "mn-Mong-MN", lcid => 0x00000c50, dir => "seed", sabbrevlangname => "MNM" },
1065 { name => "mni", lcid => 0x00000058, slist => ",", sabbrevlangname => "ZZZ" },
1066 { name => "mni-IN", lcid => 0x00000458, file => "mni_Beng_IN" },
1067 { name => "moh", lcid => 0x0000007c, oemcp => 850, ebcdiccp => 37, slist => ",", dir => "seed", sabbrevlangname => "MWK" },
1068 { name => "moh-CA", lcid => 0x0000047c, dir => "seed" },
1069 { name => "moh-Latn", alias => "moh" },
1070 { name => "moh-Latn-CA", alias => "moh-CA" },
1071 { name => "mr", lcid => 0x0000004e, slist => ",", group => 15 },
1072 { name => "mr-IN", lcid => 0x0000044e },
1073 { name => "ms", lcid => 0x0000003e, oemcp => 850, sabbrevlangname => "MSL", sopentypelang => "MLY" },
1074 { name => "ms-BN", lcid => 0x0000083e, sabbrevlangname => "MSB" },
1075 { name => "ms-ID" },
1076 { name => "ms-Latn", alias => "ms" },
1077 { name => "ms-Latn-BN", alias => "ms-BN" },
1078 { name => "ms-Latn-MY", alias => "ms-MY" },
1079 { name => "ms-Latn-SG", alias => "ms-SG" },
1080 { name => "ms-MY", lcid => 0x0000043e },
1081 { name => "ms-SG" },
1082 { name => "mt", lcid => 0x0000003a, sopentypelang => "MTS" },
1083 { name => "mt-MT", lcid => 0x0000043a },
1084 { name => "mua" },
1085 { name => "mua-CM" },
1086 { name => "my", lcid => 0x00000055, sopentypelang => "BRM" },
1087 { name => "my-MM", lcid => 0x00000455 },
1088 { name => "mzn" },
1089 { name => "mzn-IR" },
1090 { name => "naq" },
1091 { name => "naq-NA" },
1092 { name => "nb", lcid => 0x00007c14, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NOR", sopentypelang => "NOR" },
1093 { name => "nb-NO", lcid => 0x00000414 },
1094 { name => "nb-SJ" },
1095 { name => "nd", sopentypelang => "NDB" },
1096 { name => "nd-ZW" },
1097 { name => "nds" },
1098 { name => "nds-DE" },
1099 { name => "nds-NL" },
1100 { name => "ne", lcid => 0x00000061, slist => "," },
1101 { name => "ne-IN", lcid => 0x00000861, sabbrevlangname => "NEI" },
1102 { name => "ne-NP", lcid => 0x00000461, group => 15 },
1103 { name => "nl", lcid => 0x00000013, oemcp => 850 },
1104 { name => "nl-AW" },
1105 { name => "nl-BE", lcid => 0x00000813, sabbrevlangname => "NLB" },
1106 { name => "nl-BQ" },
1107 { name => "nl-CW" },
1108 { name => "nl-NL", lcid => 0x00000413 },
1109 { name => "nl-SR" },
1110 { name => "nl-SX" },
1111 { name => "nmg" },
1112 { name => "nmg-CM" },
1113 { name => "nn", lcid => 0x00007814, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NON", sopentypelang => "NYN" },
1114 { name => "nn-NO", lcid => 0x00000814 },
1115 { name => "nnh" },
1116 { name => "nnh-CM" },
1117 { name => "no", lcid => 0x00000014, oemcp => 850, ebcdiccp => 20277, sortlocale => "nb-NO" },
1118 { name => "nqo", idigits => 3, inegnumber => 3, slist => "\x{060c}", dir => "seed", sopentypelang => "NKO" },
1119 { name => "nqo-GN", dir => "seed" },
1120 { name => "nr", dir => "seed", sopentypelang => "NDB" },
1121 { name => "nr-ZA", dir => "seed" },
1122 { name => "nso", lcid => 0x0000006c, oemcp => 850, dir => "seed", sopentypelang => "SOT" },
1123 { name => "nso-ZA", lcid => 0x0000046c, dir => "seed" },
1124 { name => "nus" },
1125 { name => "nus-SD", alias => "nus-SS" },
1126 { name => "nus-SS" },
1127 { name => "nyn", sopentypelang => "NKL" },
1128 { name => "nyn-UG" },
1129 { name => "oc", lcid => 0x00000082, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
1130 { name => "oc-FR", lcid => 0x00000482, dir => "seed" },
1131 { name => "oc-Latn", alias => "oc" },
1132 { name => "oc-Latn-FR", alias => "oc-FR" },
1133 { name => "om", lcid => 0x00000072, sopentypelang => "ORO" },
1134 { name => "om-ET", lcid => 0x00000472 },
1135 { name => "om-KE" },
1136 { name => "or", lcid => 0x00000048, slist => ",", group => 15 },
1137 { name => "or-IN", lcid => 0x00000448 },
1138 { name => "os" },
1139 { name => "os-GE" },
1140 { name => "os-RU" },
1141 { name => "pa", lcid => 0x00000046, slist => "," },
1142 { name => "pa-Arab", lcid => 0x00007c46, slist => ";", inegnumber => 2, oemcp => 720, group => 13, sabbrevlangname => "PAP" },
1143 { name => "pa-Arab-PK", lcid => 0x00000846 },
1144 { name => "pa-Guru" },
1145 { name => "pa-Guru-IN", alias => "pa-IN" },
1146 { name => "pa-IN", lcid => 0x00000446, sparent => "pa-Guru", file => "pa_Guru_IN", group => 15 },
1147 ## name => "pap", lcid => 0x00000079 },
1148 ## name => "pap-029", lcid => 0x00000479 },
1149 { name => "pcm" },
1150 { name => "pcm-NG" },
1151 { name => "pl", lcid => 0x00000015, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "PLK", sopentypelang => "PLK" },
1152 { name => "pl-PL", lcid => 0x00000415 },
1153 ## name => "prg" },
1154 ## name => "prg-001" },
1155 { name => "prs", lcid => 0x0000008c, file => "fa", inegnumber => 3, oemcp => 720, group => 13, sopentypelang => "DRI" },
1156 { name => "prs-AF", lcid => 0x0000048c, file => "fa_AF" },
1157 { name => "prs-Arab", alias => "prs" },
1158 { name => "prs-Arab-AF", alias => "prs-AF" },
1159 { name => "ps", lcid => 0x00000063, group => 13, sabbrevlangname => "PAS", sopentypelang => "PAS" },
1160 { name => "ps-AF", lcid => 0x00000463 },
1161 { name => "ps-PK" },
1162 { name => "pt", lcid => 0x00000016, oemcp => 850, sabbrevlangname => "PTB", sopentypelang => "PTG" },
1163 { name => "pt-AO" },
1164 { name => "pt-BR", lcid => 0x00000416 },
1165 { name => "pt-CH", oemcp => 65001 },
1166 { name => "pt-CV" },
1167 { name => "pt-GQ", oemcp => 65001 },
1168 { name => "pt-GW" },
1169 { name => "pt-LU", oemcp => 65001 },
1170 { name => "pt-MO" },
1171 { name => "pt-MZ" },
1172 { name => "pt-PT", lcid => 0x00000816, sabbrevlangname => "PTG" },
1173 { name => "pt-ST" },
1174 { name => "pt-TL" },
1175 ## name => "qps-Latn-x-sh", lcid => 0x80000901 },
1176 ## name => "qps-ploc", lcid => 0x80000501 },
1177 ## name => "qps-ploca", lcid => 0x800005fe },
1178 ## name => "qps-plocm", lcid => 0x800009ff },
1179 { name => "qu", alias => "quz" },
1180 { name => "qu-BO", alias => "quz-BO" },
1181 { name => "qu-EC", alias => "quz-EC" },
1182 { name => "qu-PE", alias => "quz-PE" },
1183 { name => "quc", lcid => 0x00000086, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed" },
1184 { name => "quc-Latn", lcid => 0x00007c86, file => "quc", dir => "seed" },
1185 { name => "quc-Latn-GT", lcid => 0x00000486, file => "quc_GT", dir => "seed" },
1186 { name => "qut", alias => "quc" },
1187 { name => "qut-GT", alias => "quc-Latn-GT" },
1188 { name => "quz", lcid => 0x0000006b, file => "qu", territory => "BO", oemcp => 850, ebcdiccp => 20284, slist => "," },
1189 { name => "quz-BO", lcid => 0x0000046b, file => "qu_BO" },
1190 { name => "quz-EC", lcid => 0x0000086b, file => "qu_EC" },
1191 { name => "quz-Latn", alias => "quz" },
1192 { name => "quz-Latn-BO", alias => "quz-BO" },
1193 { name => "quz-Latn-EC", alias => "quz-EC" },
1194 { name => "quz-Latn-PE", alias => "quz-PE" },
1195 { name => "quz-PE", lcid => 0x00000c6b, file => "qu_PE" },
1196 { name => "rm", lcid => 0x00000017, oemcp => 850, ebcdiccp => 20273, sabbrevlangname => "RMC", sopentypelang => "RMS" },
1197 { name => "rm-CH", lcid => 0x00000417 },
1198 { name => "rn" },
1199 { name => "rn-BI" },
1200 { name => "ro", lcid => 0x00000018, oemcp => 852, ebcdiccp => 20880, sabbrevlangname => "ROM", sopentypelang => "ROM" },
1201 { name => "ro-MD", lcid => 0x00000818, maccp => 65001, sabbrevlangname => "ROD" },
1202 { name => "ro-RO", lcid => 0x00000418, group => 2 },
1203 { name => "rof" },
1204 { name => "rof-TZ" },
1205 { name => "ru", lcid => 0x00000019, oemcp => 866 },
1206 { name => "ru-BY", maccp => 65001 },
1207 { name => "ru-KG", maccp => 65001 },
1208 { name => "ru-KZ", maccp => 65001 },
1209 { name => "ru-MD", lcid => 0x00000819, maccp => 65001, sabbrevlangname => "RUM" },
1210 { name => "ru-RU", lcid => 0x00000419, group => 5 },
1211 { name => "ru-UA", maccp => 65001 },
1212 { name => "rw", lcid => 0x00000087, oemcp => 437, sopentypelang => "RUA" },
1213 { name => "rw-RW", lcid => 0x00000487 },
1214 { name => "rwk" },
1215 { name => "rwk-TZ" },
1216 { name => "sa", lcid => 0x0000004f, slist => ",", group => 15 },
1217 { name => "sa-Deva", alias => "sa" },
1218 { name => "sa-Deva-IN", alias => "sa-IN" },
1219 { name => "sa-IN", lcid => 0x0000044f },
1220 { name => "sah", lcid => 0x00000085, oemcp => 866, group => 5, sopentypelang => "YAK" },
1221 { name => "sah-Cyrl", alias => "sah" },
1222 { name => "sah-Cyrl-RU", alias => "sah-RU" },
1223 { name => "sah-RU", lcid => 0x00000485 },
1224 { name => "saq" },
1225 { name => "saq-KE" },
1226 { name => "sat" },
1227 { name => "sat-Olck" },
1228 { name => "sat-Olck-IN" },
1229 { name => "sbp" },
1230 { name => "sbp-TZ" },
1231 { name => "sc" },
1232 { name => "sc-IT" },
1233 { name => "sd", lcid => 0x00000059, inegnumber => 3, oemcp => 720, sabbrevlangname => "SIP" },
1234 { name => "sd-Arab", lcid => 0x00007c59, group => 13 },
1235 { name => "sd-Arab-PK", lcid => 0x00000859 },
1236 { name => "sd-Deva", inegnumber => 1, slist => ",", oemcp => 65001, group => 15 },
1237 { name => "sd-Deva-IN", lcid => 0x00000459, sabbrevlangname => "ZZZ" },
1238 { name => "sd-PK", alias => "sd-Arab-PK" },
1239 { name => "se", lcid => 0x0000003b, oemcp => 850, ebcdiccp => 20277, sopentypelang => "NSM" },
1240 { name => "se-FI", lcid => 0x00000c3b, ebcdiccp => 20278, sabbrevlangname => "SMG" },
1241 { name => "se-NO", lcid => 0x0000043b },
1242 { name => "se-SE", lcid => 0x0000083b, ebcdiccp => 20278, sabbrevlangname => "SMF" },
1243 { name => "se-Latn", alias => "se" },
1244 { name => "se-Latn-FI", alias => "se-FI" },
1245 { name => "se-Latn-NO", alias => "se-NO" },
1246 { name => "se-Latn-SE", alias => "se-SE" },
1247 { name => "seh" },
1248 { name => "seh-MZ" },
1249 { name => "ses" },
1250 { name => "ses-ML" },
1251 { name => "sg", sopentypelang => "SGO" },
1252 { name => "sg-CF" },
1253 { name => "shi" },
1254 { name => "shi-Latn" },
1255 { name => "shi-Latn-MA" },
1256 { name => "shi-Tfng" },
1257 { name => "shi-Tfng-MA" },
1258 { name => "si", lcid => 0x0000005b, group => 15, sopentypelang => "SNH" },
1259 { name => "si-LK", lcid => 0x0000045b },
1260 { name => "sk", lcid => 0x0000001b, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "SKY", sopentypelang => "SKY" },
1261 { name => "sk-SK", lcid => 0x0000041b },
1262 { name => "sl", lcid => 0x00000024, oemcp => 852, ebcdiccp => 20880, group => 2 },
1263 { name => "sl-SI", lcid => 0x00000424 },
1264 { name => "sma", lcid => 0x0000783b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMB", sopentypelang => "SSM" },
1265 { name => "sma-Latn", alias => "sma" },
1266 { name => "sma-Latn-NO", alias => "sma-NO" },
1267 { name => "sma-Latn-SE", alias => "sma-SE" },
1268 { name => "sma-NO", lcid => 0x0000183b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMA" },
1269 { name => "sma-SE", lcid => 0x00001c3b, dir => "seed" },
1270 { name => "smj", lcid => 0x00007c3b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMK", sopentypelang => "LSM" },
1271 { name => "smj-Latn", alias => "smj" },
1272 { name => "smj-Latn-NO", alias => "smj-NO" },
1273 { name => "smj-Latn-SE", alias => "smj-SE" },
1274 { name => "smj-NO", lcid => 0x0000103b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMJ" },
1275 { name => "smj-SE", lcid => 0x0000143b, dir => "seed" },
1276 { name => "smn", lcid => 0x0000703b, sparent => "se", ebcdiccp => 20278, sopentypelang => "ISM" },
1277 { name => "smn-FI", lcid => 0x0000243b },
1278 { name => "smn-Latn", alias => "smn" },
1279 { name => "smn-Latn-FI", alias => "smn-FI" },
1280 { name => "sms", lcid => 0x0000743b, sparent => "se", ebcdiccp => 20278, dir => "seed", sopentypelang => "SKS" },
1281 { name => "sms-FI", lcid => 0x0000203b, dir => "seed" },
1282 { name => "sms-Latn", alias => "sms" },
1283 { name => "sms-Latn-FI", alias => "sms-FI" },
1284 { name => "sn", sopentypelang => "SNA0" },
1285 { name => "sn-Latn", file => "sn" },
1286 { name => "sn-Latn-ZW", file => "sn_ZW" },
1287 { name => "sn-ZW", alias => "sn-Latn-ZW" },
1288 { name => "so", lcid => 0x00000077, sopentypelang => "SML" },
1289 { name => "so-DJ" },
1290 { name => "so-ET" },
1291 { name => "so-KE" },
1292 { name => "so-SO", lcid => 0x00000477 },
1293 { name => "sq", lcid => 0x0000001c, oemcp => 852, ebcdiccp => 20880, group => 2 },
1294 { name => "sq-AL", lcid => 0x0000041c },
1295 { name => "sq-MK" },
1296 { name => "sq-XK" },
1297 { name => "sr", lcid => 0x00007c1a, sortlocale => "sr-Latn-RS", oemcp => 852, group => 2, sabbrevlangname => "SRB", sopentypelang => "SRB" },
1298 { name => "sr-Cyrl", lcid => 0x00006c1a, oemcp => 855, ebcdiccp => 21025, group => 5, sabbrevlangname => "SRO" },
1299 { name => "sr-Cyrl-BA", lcid => 0x00001c1a, sabbrevlangname => "SRN" },
1300 { name => "sr-Cyrl-ME", lcid => 0x0000301a, sabbrevlangname => "SRQ" },
1301 { name => "sr-Cyrl-RS", lcid => 0x0000281a },
1302 { name => "sr-Cyrl-XK" },
1303 { name => "sr-Latn", lcid => 0x0000701a, sabbrevlangname => "SRM" },
1304 { name => "sr-Latn-BA", lcid => 0x0000181a, maccp => 10082, ebcdiccp => 870, sabbrevlangname => "SRS" },
1305 { name => "sr-Latn-ME", lcid => 0x00002c1a, sabbrevlangname => "SRP" },
1306 { name => "sr-Latn-RS", lcid => 0x0000241a, sabbrevlangname => "SRM" },
1307 { name => "sr-Latn-XK" },
1308 ## name => "sr-Cyrl-CS", lcid => 0x00000c1a },
1309 ## name => "sr-Latn-CS", lcid => 0x0000081a },
1310 { name => "ss", dir => "seed", sopentypelang => "SWZ" },
1311 { name => "ss-SZ", dir => "seed" },
1312 { name => "ss-ZA", dir => "seed" },
1313 { name => "ssy", dir => "seed" },
1314 { name => "ssy-ER", dir => "seed" },
1315 { name => "st", lcid => 0x00000030, dir => "seed" },
1316 { name => "st-LS", dir => "seed" },
1317 { name => "st-ZA", lcid => 0x00000430, dir => "seed" },
1318 { name => "su" },
1319 { name => "su-Latn" },
1320 { name => "su-Latn-ID" },
1321 { name => "sv", lcid => 0x0000001d, oemcp => 850, ebcdiccp => 20278, sabbrevlangname => "SVE", sopentypelang => "SVE" },
1322 { name => "sv-AX" },
1323 { name => "sv-FI", lcid => 0x0000081d, sabbrevlangname => "SVF" },
1324 { name => "sv-SE", lcid => 0x0000041d, sabbrevlangname => "SVE" },
1325 { name => "sw", lcid => 0x00000041, territory => "KE", oemcp => 437, ebcdiccp => 500, sabbrevlangname => "SWK", sopentypelang => "SWK" },
1326 { name => "sw-CD" },
1327 { name => "sw-KE", lcid => 0x00000441 },
1328 { name => "sw-TZ" },
1329 { name => "sw-UG" },
1330 { name => "swc-CD", alias => "sw-CD" },
1331 { name => "syr", lcid => 0x0000005a, slist => ",", group => 13, dir => "seed" },
1332 { name => "syr-SY", lcid => 0x0000045a, dir => "seed" },
1333 { name => "syr-Syrc", alias => "syr" },
1334 { name => "syr-Syrc-SY", alias => "syr-SY" },
1335 { name => "ta", lcid => 0x00000049, slist => ",", group => 15, sabbrevlangname => "TAI" },
1336 { name => "ta-IN", lcid => 0x00000449 },
1337 { name => "ta-LK", lcid => 0x00000849, sabbrevlangname => "TAM" },
1338 { name => "ta-MY" },
1339 { name => "ta-SG" },
1340 { name => "te", lcid => 0x0000004a, group => 15 },
1341 { name => "te-IN", lcid => 0x0000044a },
1342 { name => "teo" },
1343 { name => "teo-KE" },
1344 { name => "teo-UG" },
1345 { name => "tg", lcid => 0x00000028, oemcp => 866, group => 5, sabbrevlangname => "TAJ", sopentypelang => "TAJ" },
1346 { name => "tg-Cyrl", lcid => 0x00007c28, file => "tg" },
1347 { name => "tg-Cyrl-TJ", lcid => 0x00000428, file => "tg_TJ" },
1348 { name => "tg-TJ", alias => "tg-Cyrl-TJ" },
1349 { name => "th", lcid => 0x0000001e, oemcp => 874, ebcdiccp => 20838, slist => ",", group => 11 },
1350 { name => "th-TH", lcid => 0x0000041e },
1351 { name => "ti", lcid => 0x00000073, territory => "ER", sopentypelang => "TGY" },
1352 { name => "ti-ER", lcid => 0x00000873 },
1353 { name => "ti-ET", lcid => 0x00000473, sabbrevlangname => "TIE" },
1354 { name => "tig", dir => "seed", sopentypelang => "TGR" },
1355 { name => "tig-ER", dir => "seed" },
1356 { name => "tk", lcid => 0x00000042, oemcp => 852, ebcdiccp => 20880, group => 2, sopentypelang => "TKM" },
1357 { name => "tk-Latn", alias => "tk" },
1358 { name => "tk-Latn-TM", alias => "tk-TM" },
1359 { name => "tk-TM", lcid => 0x00000442 },
1360 { name => "tn", lcid => 0x00000032, oemcp => 850, dir => "seed", sopentypelang => "TNA" },
1361 { name => "tn-BW", lcid => 0x00000832, dir => "seed", sabbrevlangname => "TSB" },
1362 { name => "tn-ZA", lcid => 0x00000432, dir => "seed" },
1363 { name => "to", sopentypelang => "TGN" },
1364 { name => "to-TO" },
1365 { name => "tr", lcid => 0x0000001f, oemcp => 857, ebcdiccp => 20905, group => 6, sabbrevlangname => "TRK", sopentypelang => "TRK" },
1366 { name => "tr-CY" },
1367 { name => "tr-TR", lcid => 0x0000041f },
1368 { name => "ts", lcid => 0x00000031, dir => "seed", sopentypelang => "TSG" },
1369 { name => "ts-ZA", lcid => 0x00000431, dir => "seed" },
1370 { name => "tt", lcid => 0x00000044, oemcp => 866, group => 5, sabbrevlangname => "TTT" },
1371 { name => "tt-Cyrl", alias => "tt" },
1372 { name => "tt-Cyrl-RU", alias => "tt-RU" },
1373 { name => "tt-RU", lcid => 0x00000444 },
1374 { name => "twq" },
1375 { name => "twq-NE" },
1376 { name => "tzm", lcid => 0x0000005f, sortlocale => "tzm-Latn-DZ", oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "TZA" },
1377 { name => "tzm-Latn", lcid => 0x00007c5f, territory => "DZ", file => "tzm" },
1378 { name => "tzm-Latn-MA", file => "tzm_MA", oemcp => 65001 },
1379 { name => "tzm-Latn-DZ", lcid => 0x0000085f, file => "tzm" },
1380 { name => "tzm-MA", alias => "tzm-Latn-MA" },
1381 ## name => "tzm-Arab", group => 13 },
1382 ## name => "tzm-Arab-MA", lcid => 0x0000045f },
1383 ## name => "tzm-Tfng", lcid => 0x0000785f },
1384 ## name => "tzm-Tfng-MA", lcid => 0x0000105f },
1385 { name => "ug", lcid => 0x00000080, oemcp => 720, slist => ",", group => 13, sopentypelang => "UYG" },
1386 { name => "ug-Arab", alias => "ug" },
1387 { name => "ug-Arab-CN", alias => "ug-CN" },
1388 { name => "ug-CN", lcid => 0x00000480 },
1389 { name => "uk", lcid => 0x00000022, oemcp => 866, maccp => 10017, ebcdiccp => 500, group => 5 },
1390 { name => "uk-UA", lcid => 0x00000422 },
1391 { name => "ur", lcid => 0x00000020, oemcp => 720 },
1392 { name => "ur-IN", lcid => 0x00000820, maccp => 65001, sabbrevlangname => "URI" },
1393 { name => "ur-PK", lcid => 0x00000420, group => 13 },
1394 { name => "uz", lcid => 0x00000043, oemcp => 857, maccp => 10029, group => 2 },
1395 { name => "uz-Arab", oemcp => 65001, maccp => 65001 },
1396 { name => "uz-Arab-AF" },
1397 { name => "uz-Cyrl", lcid => 0x00007843, oemcp => 866, maccp => 10007, group => 5, sabbrevlangname => "UZC" },
1398 { name => "uz-Cyrl-UZ", lcid => 0x00000843 },
1399 { name => "uz-Latn", lcid => 0x00007c43 },
1400 { name => "uz-Latn-UZ", lcid => 0x00000443 },
1401 { name => "vai" },
1402 { name => "vai-Latn" },
1403 { name => "vai-Latn-LR" },
1404 { name => "vai-Vaii" },
1405 { name => "vai-Vaii-LR" },
1406 { name => "ve", lcid => 0x00000033, dir => "seed", sabbrevlangname => "ZZZ" },
1407 { name => "ve-ZA", lcid => 0x00000433, dir => "seed" },
1408 { name => "vi", lcid => 0x0000002a, oemcp => 1258, slist => ",", group => 14, sabbrevlangname => "VIT", sopentypelang => "VIT" },
1409 { name => "vi-VN", lcid => 0x0000042a },
1410 { name => "vo", dir => "seed" },
1411 { name => "vo-001", dir => "seed" },
1412 { name => "vun" },
1413 { name => "vun-TZ" },
1414 { name => "wae" },
1415 { name => "wae-CH" },
1416 { name => "wal", dir => "seed" },
1417 { name => "wal-ET", dir => "seed" },
1418 { name => "wo", lcid => 0x00000088, oemcp => 850, ebcdiccp => 20297, sopentypelang => "WLF" },
1419 { name => "wo-Latn", alias => "wo" },
1420 { name => "wo-Latn-SN", alias => "wo-SN" },
1421 { name => "wo-SN", lcid => 0x00000488 },
1422 { name => "x-IV_mathan", lcid => 0x0001007f, alias => "" },
1423 { name => "xh", lcid => 0x00000034, oemcp => 850, sopentypelang => "XHS" },
1424 { name => "xh-ZA", lcid => 0x00000434 },
1425 { name => "xog" },
1426 { name => "xog-UG" },
1427 { name => "yav" },
1428 { name => "yav-CM" },
1429 { name => "yi", lcid => 0x0000003d, sabbrevlangname => "ZZZ", sopentypelang => "JII" },
1430 { name => "yi-001", lcid => 0x0000043d },
1431 { name => "yo", lcid => 0x0000006a, oemcp => 437, sopentypelang => "YBA" },
1432 { name => "yo-BJ", ebcdiccp => 500 },
1433 { name => "yo-Latn", alias => "yo" },
1434 { name => "yo-Latn-NG", alias => "yo-NG" },
1435 { name => "yo-NG", lcid => 0x0000046a },
1436 { name => "yrl" },
1437 { name => "yrl-BR" },
1438 { name => "yrl-CO" },
1439 { name => "yrl-VE" },
1440 { name => "yue" },
1441 { name => "yue-Hans" },
1442 { name => "yue-Hans-CN" },
1443 { name => "yue-Hant" },
1444 { name => "yue-Hant-HK" },
1445 { name => "zgh" },
1446 { name => "zgh-MA", alias => "zgh-Tfng-MA" },
1447 { name => "zgh-Tfng", file => "zgh" },
1448 { name => "zgh-Tfng-MA", file => "zgh_MA" },
1449 { name => "zh", lcid => 0x00007804, ireadinglayout => 2, oemcp => 936, slist => ",", sscripts => "Hani Hans", sabbrevlangname => "CHS", sopentypelang => "ZHS" },
1450 { name => "zh-CN", lcid => 0x00000804, file => "zh_Hans_CN", sparent => "zh-Hans" },
1451 { name => "zh-CN_phoneb", lcid => 0x00050804, alias => "zh-CN" },
1452 { name => "zh-CN_stroke", lcid => 0x00020804, alias => "zh-CN" },
1453 { name => "zh-Hans", lcid => 0x00000004, group => 10 },
1454 { name => "zh-Hans-CN", alias => "zh-CN" },
1455 { name => "zh-Hans-CN-u-co-phonebk", alias => "zh-CN_phoneb" },
1456 { name => "zh-Hans-CN-u-co-stroke", alias => "zh-CN_stroke" },
1457 { name => "zh-Hans-HK", slist => ";" },
1458 { name => "zh-Hans-MO", slist => ";" },
1459 { name => "zh-Hans-SG", alias => "zh-SG" },
1460 { name => "zh-Hans-SG-u-co-phonebk", alias => "zh-SG_phoneb" },
1461 { name => "zh-Hans-SG-u-co-stroke", alias => "zh-SG_stroke" },
1462 { name => "zh-Hant", lcid => 0x00007c04, sortlocale => "zh-HK", ireadinglayout => 2, oemcp => 950, slist => ",", sscripts => "Hani Hant", group => 9, sabbrevlangname => "CHT", sopentypelang => "ZHH" },
1463 { name => "zh-Hant-HK", alias => "zh-HK" },
1464 { name => "zh-Hant-HK-u-co-unihan", alias => "zh-HK_radstr" },
1465 { name => "zh-Hant-MO", alias => "zh-MO" },
1466 { name => "zh-Hant-MO-u-co-stroke", alias => "zh-MO_stroke" },
1467 { name => "zh-Hant-MO-u-co-unihan", alias => "zh-MO_radstr" },
1468 { name => "zh-Hant-TW", alias => "zh-TW" },
1469 { name => "zh-Hant-TW-u-co-phonetic", alias => "zh-TW_pronun" },
1470 { name => "zh-Hant-TW-u-co-unihan", alias => "zh-TW_radstr" },
1471 { name => "zh-HK", lcid => 0x00000c04, file => "zh_Hant_HK", sparent => "zh-Hant", sabbrevlangname => "ZHH" },
1472 { name => "zh-HK_radstr", lcid => 0x00040c04, alias => "zh-HK" },
1473 { name => "zh-MO", lcid => 0x00001404, file => "zh_Hant_MO", sparent => "zh-Hant", sabbrevlangname => "ZHM", sopentypelang => "ZHT" },
1474 { name => "zh-MO_radstr", lcid => 0x00041404, alias => "zh-MO" },
1475 { name => "zh-MO_stroke", lcid => 0x00021404, alias => "zh-MO" },
1476 { name => "zh-SG", lcid => 0x00001004, file => "zh_Hans_SG", sparent => "zh-Hans", sabbrevlangname => "ZHI" },
1477 { name => "zh-SG_phoneb", lcid => 0x00051004, alias => "zh-SG" },
1478 { name => "zh-SG_stroke", lcid => 0x00021004, alias => "zh-SG" },
1479 { name => "zh-TW", lcid => 0x00000404, file => "zh_Hant_TW", sparent => "zh-Hant", sopentypelang => "ZHT" },
1480 { name => "zh-TW_pronun", lcid => 0x00030404, alias => "zh-TW" },
1481 { name => "zh-TW_radstr", lcid => 0x00040404, alias => "zh-TW" },
1482 { name => "zu", lcid => 0x00000035, oemcp => 850 },
1483 { name => "zu-ZA", lcid => 0x00000435 },
# Calendar table, one entry per calendar id.  An entry either names a
# calendar "type" together with the locale to take its data from, or only
# carries a plain display name.  Optional fields: "eras" (list of era
# indices) and "itwodigityearmax".
# NOTE(review): the type strings look like CLDR calendar identifiers -- confirm
# against the code that consumes this table.
my @calendars =
(
    { id =>  1, name => "Gregorian", itwodigityearmax => 2049 },
    { id =>  2, type => "gregorian", locale => "en-US", itwodigityearmax => 2049 },
    { id =>  3, type => "japanese", locale => "ja-JP", eras => [ 232..236 ] },
    { id =>  4, type => "roc", locale => "zh-TW", eras => [ 1 ] },
    { id =>  5, type => "dangi", locale => "ko-KR", eras => [ 0 ] },
    { id =>  6, type => "islamic", locale => "ar-SA", itwodigityearmax => 1451 },
    { id =>  7, type => "buddhist", locale => "th-TH", eras => [ 0 ] },
    { id =>  8, type => "hebrew", locale => "he-IL", itwodigityearmax => 5810 },
    { id =>  9, type => "gregorian", locale => "fr-FR", itwodigityearmax => 2049 },
    { id => 10, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
    { id => 11, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
    { id => 12, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
    { id => 13, name => "Julian", locale => "en-US", itwodigityearmax => 2049 },
    { id => 14, name => "Japanese Lunisolar" },
    { id => 15, name => "Chinese Lunisolar" },
    { id => 16, name => "Saka" },
    { id => 17, name => "Lunar ETO Chinese" },
    { id => 18, name => "Lunar ETO Korean" },
    { id => 19, name => "Lunar ETO Rokuyou" },
    { id => 20, name => "Korean Lunisolar" },
    { id => 21, name => "Taiwan Lunisolar" },
    { id => 22, type => "persian", locale => "prs-AF", itwodigityearmax => 1429 },
    { id => 23, type => "islamic-umalqura", locale => "ar-SA", itwodigityearmax => 1451 },
);
# Geographic location table, one entry per numeric location id.  "name" is
# the code used for the location, "parent" explicitly names the enclosing
# location; the trailing comment gives the human-readable name.
# NOTE(review): names appear to be ISO 3166 alpha-2 codes or UN M.49 numeric
# region codes -- confirm against the code that consumes this table.
my @geoids =
(
    { id => 2, name => "AG" }, # Antigua and Barbuda
    { id => 3, name => "AF" }, # Afghanistan
    { id => 4, name => "DZ" }, # Algeria
    { id => 5, name => "AZ" }, # Azerbaijan
    { id => 6, name => "AL" }, # Albania
    { id => 7, name => "AM" }, # Armenia
    { id => 8, name => "AD" }, # Andorra
    { id => 9, name => "AO" }, # Angola
    { id => 10, name => "AS" }, # American Samoa
    { id => 11, name => "AR" }, # Argentina
    { id => 12, name => "AU" }, # Australia
    { id => 14, name => "AT" }, # Austria
    { id => 17, name => "BH" }, # Bahrain
    { id => 18, name => "BB" }, # Barbados
    { id => 19, name => "BW" }, # Botswana
    { id => 20, name => "BM" }, # Bermuda
    { id => 21, name => "BE" }, # Belgium
    { id => 22, name => "BS" }, # Bahamas, The
    { id => 23, name => "BD" }, # Bangladesh
    { id => 24, name => "BZ" }, # Belize
    { id => 25, name => "BA" }, # Bosnia and Herzegovina
    { id => 26, name => "BO" }, # Bolivia
    { id => 27, name => "MM" }, # Myanmar
    { id => 28, name => "BJ" }, # Benin
    { id => 29, name => "BY" }, # Belarus
    { id => 30, name => "SB" }, # Solomon Islands
    { id => 32, name => "BR" }, # Brazil
    { id => 34, name => "BT" }, # Bhutan
    { id => 35, name => "BG" }, # Bulgaria
    { id => 37, name => "BN" }, # Brunei
    { id => 38, name => "BI" }, # Burundi
    { id => 39, name => "CA" }, # Canada
    { id => 40, name => "KH" }, # Cambodia
    { id => 41, name => "TD" }, # Chad
    { id => 42, name => "LK" }, # Sri Lanka
    { id => 43, name => "CG" }, # Congo
    { id => 44, name => "CD" }, # Congo (DRC)
    { id => 45, name => "CN" }, # China
    { id => 46, name => "CL" }, # Chile
    { id => 49, name => "CM" }, # Cameroon
    { id => 50, name => "KM" }, # Comoros
    { id => 51, name => "CO" }, # Colombia
    { id => 54, name => "CR" }, # Costa Rica
    { id => 55, name => "CF" }, # Central African Republic
    { id => 56, name => "CU" }, # Cuba
    { id => 57, name => "CV" }, # Cape Verde
    { id => 59, name => "CY" }, # Cyprus
    { id => 61, name => "DK" }, # Denmark
    { id => 62, name => "DJ" }, # Djibouti
    { id => 63, name => "DM" }, # Dominica
    { id => 65, name => "DO" }, # Dominican Republic
    { id => 66, name => "EC" }, # Ecuador
    { id => 67, name => "EG" }, # Egypt
    { id => 68, name => "IE" }, # Ireland
    { id => 69, name => "GQ" }, # Equatorial Guinea
    { id => 70, name => "EE" }, # Estonia
    { id => 71, name => "ER" }, # Eritrea
    { id => 72, name => "SV" }, # El Salvador
    { id => 73, name => "ET" }, # Ethiopia
    { id => 75, name => "CZ" }, # Czech Republic
    { id => 77, name => "FI" }, # Finland
    { id => 78, name => "FJ" }, # Fiji Islands
    { id => 80, name => "FM" }, # Micronesia
    { id => 81, name => "FO" }, # Faroe Islands
    { id => 84, name => "FR" }, # France
    { id => 86, name => "GM" }, # Gambia, The
    { id => 87, name => "GA" }, # Gabon
    { id => 88, name => "GE" }, # Georgia
    { id => 89, name => "GH" }, # Ghana
    { id => 90, name => "GI" }, # Gibraltar
    { id => 91, name => "GD" }, # Grenada
    { id => 93, name => "GL" }, # Greenland
    { id => 94, name => "DE" }, # Germany
    { id => 98, name => "GR" }, # Greece
    { id => 99, name => "GT" }, # Guatemala
    { id => 100, name => "GN" }, # Guinea
    { id => 101, name => "GY" }, # Guyana
    { id => 103, name => "HT" }, # Haiti
    { id => 104, name => "HK" }, # Hong Kong S.A.R.
    { id => 106, name => "HN" }, # Honduras
    { id => 108, name => "HR" }, # Croatia
    { id => 109, name => "HU" }, # Hungary
    { id => 110, name => "IS" }, # Iceland
    { id => 111, name => "ID" }, # Indonesia
    { id => 113, name => "IN" }, # India
    { id => 114, name => "IO" }, # British Indian Ocean Territory
    { id => 116, name => "IR" }, # Iran
    { id => 117, name => "IL" }, # Israel
    { id => 118, name => "IT" }, # Italy
    { id => 119, name => "CI" }, # Côte d'Ivoire
    { id => 121, name => "IQ" }, # Iraq
    { id => 122, name => "JP" }, # Japan
    { id => 124, name => "JM" }, # Jamaica
    { id => 125, name => "SJ" }, # Jan Mayen
    { id => 126, name => "JO" }, # Jordan
    { id => 127, parent => "UM" }, # Johnston Atoll
    { id => 129, name => "KE" }, # Kenya
    { id => 130, name => "KG" }, # Kyrgyzstan
    { id => 131, name => "KP" }, # North Korea
    { id => 133, name => "KI" }, # Kiribati
    { id => 134, name => "KR" }, # Korea
    { id => 136, name => "KW" }, # Kuwait
    { id => 137, name => "KZ" }, # Kazakhstan
    { id => 138, name => "LA" }, # Laos
    { id => 139, name => "LB" }, # Lebanon
    { id => 140, name => "LV" }, # Latvia
    { id => 141, name => "LT" }, # Lithuania
    { id => 142, name => "LR" }, # Liberia
    { id => 143, name => "SK" }, # Slovakia
    { id => 145, name => "LI" }, # Liechtenstein
    { id => 146, name => "LS" }, # Lesotho
    { id => 147, name => "LU" }, # Luxembourg
    { id => 148, name => "LY" }, # Libya
    { id => 149, name => "MG" }, # Madagascar
    { id => 151, name => "MO" }, # Macao S.A.R.
    { id => 152, name => "MD" }, # Moldova
    { id => 154, name => "MN" }, # Mongolia
    { id => 156, name => "MW" }, # Malawi
    { id => 157, name => "ML" }, # Mali
    { id => 158, name => "MC" }, # Monaco
    { id => 159, name => "MA" }, # Morocco
    { id => 160, name => "MU" }, # Mauritius
    { id => 162, name => "MR" }, # Mauritania
    { id => 163, name => "MT" }, # Malta
    { id => 164, name => "OM" }, # Oman
    { id => 165, name => "MV" }, # Maldives
    { id => 166, name => "MX" }, # Mexico
    { id => 167, name => "MY" }, # Malaysia
    { id => 168, name => "MZ" }, # Mozambique
    { id => 173, name => "NE" }, # Niger
    { id => 174, name => "VU" }, # Vanuatu
    { id => 175, name => "NG" }, # Nigeria
    { id => 176, name => "NL" }, # Netherlands
    { id => 177, name => "NO" }, # Norway
    { id => 178, name => "NP" }, # Nepal
    { id => 180, name => "NR" }, # Nauru
    { id => 181, name => "SR" }, # Suriname
    { id => 182, name => "NI" }, # Nicaragua
    { id => 183, name => "NZ" }, # New Zealand
    { id => 184, name => "PS" }, # Palestinian Authority
    { id => 185, name => "PY" }, # Paraguay
    { id => 187, name => "PE" }, # Peru
    { id => 190, name => "PK" }, # Pakistan
    { id => 191, name => "PL" }, # Poland
    { id => 192, name => "PA" }, # Panama
    { id => 193, name => "PT" }, # Portugal
    { id => 194, name => "PG" }, # Papua New Guinea
    { id => 195, name => "PW" }, # Palau
    { id => 196, name => "GW" }, # Guinea-Bissau
    { id => 197, name => "QA" }, # Qatar
    { id => 198, name => "RE" }, # Reunion
    { id => 199, name => "MH" }, # Marshall Islands
    { id => 200, name => "RO" }, # Romania
    { id => 201, name => "PH" }, # Philippines
    { id => 202, name => "PR" }, # Puerto Rico
    { id => 203, name => "RU" }, # Russia
    { id => 204, name => "RW" }, # Rwanda
    { id => 205, name => "SA" }, # Saudi Arabia
    { id => 206, name => "PM" }, # St. Pierre and Miquelon
    { id => 207, name => "KN" }, # St. Kitts and Nevis
    { id => 208, name => "SC" }, # Seychelles
    { id => 209, name => "ZA" }, # South Africa
    { id => 210, name => "SN" }, # Senegal
    { id => 212, name => "SI" }, # Slovenia
    { id => 213, name => "SL" }, # Sierra Leone
    { id => 214, name => "SM" }, # San Marino
    { id => 215, name => "SG" }, # Singapore
    { id => 216, name => "SO" }, # Somalia
    { id => 217, name => "ES" }, # Spain
    { id => 218, name => "LC" }, # St. Lucia
    { id => 219, name => "SD" }, # Sudan
    { id => 220, name => "SJ" }, # Svalbard
    { id => 221, name => "SE" }, # Sweden
    { id => 222, name => "SY" }, # Syria
    { id => 223, name => "CH" }, # Switzerland
    { id => 224, name => "AE" }, # United Arab Emirates
    { id => 225, name => "TT" }, # Trinidad and Tobago
    { id => 227, name => "TH" }, # Thailand
    { id => 228, name => "TJ" }, # Tajikistan
    { id => 231, name => "TO" }, # Tonga
    { id => 232, name => "TG" }, # Togo
    { id => 233, name => "ST" }, # São Tomé and Príncipe
    { id => 234, name => "TN" }, # Tunisia
    { id => 235, name => "TR" }, # Turkey
    { id => 236, name => "TV" }, # Tuvalu
    { id => 237, name => "TW" }, # Taiwan
    { id => 238, name => "TM" }, # Turkmenistan
    { id => 239, name => "TZ" }, # Tanzania
    { id => 240, name => "UG" }, # Uganda
    { id => 241, name => "UA" }, # Ukraine
    { id => 242, name => "GB" }, # United Kingdom
    { id => 244, name => "US" }, # United States
    { id => 245, name => "BF" }, # Burkina Faso
    { id => 246, name => "UY" }, # Uruguay
    { id => 247, name => "UZ" }, # Uzbekistan
    { id => 248, name => "VC" }, # St. Vincent and the Grenadines
    { id => 249, name => "VE" }, # Bolivarian Republic of Venezuela
    { id => 251, name => "VN" }, # Vietnam
    { id => 252, name => "VI" }, # Virgin Islands
    { id => 253, name => "VA" }, # Vatican City
    { id => 254, name => "NA" }, # Namibia
    { id => 257, name => "EH" }, # Western Sahara (disputed)
    { id => 258, parent => "UM" }, # Wake Island
    { id => 259, name => "WS" }, # Samoa
    { id => 260, name => "SZ" }, # Swaziland
    { id => 261, name => "YE" }, # Yemen
    { id => 263, name => "ZM" }, # Zambia
    { id => 264, name => "ZW" }, # Zimbabwe
    { id => 269, name => "CS" }, # Serbia and Montenegro (Former)
    { id => 270, name => "ME" }, # Montenegro
    { id => 271, name => "RS" }, # Serbia
    { id => 273, name => "CW" }, # Curaçao
    { id => 276, name => "SS" }, # South Sudan
    { id => 300, name => "AI" }, # Anguilla
    { id => 301, name => "AQ" }, # Antarctica
    { id => 302, name => "AW" }, # Aruba
    { id => 303, parent => "SH" }, # Ascension Island
    { id => 304, parent => "053" }, # Ashmore and Cartier Islands
    { id => 305, parent => "UM" }, # Baker Island
    { id => 306, name => "BV" }, # Bouvet Island
    { id => 307, name => "KY" }, # Cayman Islands
    { id => 308, name => "830", parent => "155" }, # Channel Islands
    { id => 309, name => "CX" }, # Christmas Island
    { id => 310, parent => "009" }, # Clipperton Island
    { id => 311, name => "CC" }, # Cocos (Keeling) Islands
    { id => 312, name => "CK" }, # Cook Islands
    { id => 313, parent => "053" }, # Coral Sea Islands
    { id => 314, parent => "IO" }, # Diego Garcia
    { id => 315, name => "FK" }, # Falkland Islands (Islas Malvinas)
    { id => 317, name => "GF" }, # French Guiana
    { id => 318, name => "PF" }, # French Polynesia
    { id => 319, name => "TF" }, # French Southern and Antarctic Lands
    { id => 321, name => "GP" }, # Guadeloupe
    { id => 322, name => "GU" }, # Guam
    { id => 323 }, # Guantanamo Bay
    { id => 324, name => "GG" }, # Guernsey
    { id => 325, name => "HM" }, # Heard Island and McDonald Islands
    { id => 326, parent => "UM" }, # Howland Island
    { id => 327, parent => "UM" }, # Jarvis Island
    { id => 328, name => "JE" }, # Jersey
    { id => 329, parent => "UM" }, # Kingman Reef
    { id => 330, name => "MQ" }, # Martinique
    { id => 331, name => "YT" }, # Mayotte
    { id => 332, name => "MS" }, # Montserrat
    { id => 333, name => "AN", region => 1 }, # Netherlands Antilles (Former)
    { id => 334, name => "NC" }, # New Caledonia
    { id => 335, name => "NU" }, # Niue
    { id => 336, name => "NF" }, # Norfolk Island
    { id => 337, name => "MP" }, # Northern Mariana Islands
    { id => 338, parent => "UM" }, # Palmyra Atoll
    { id => 339, name => "PN" }, # Pitcairn Islands
    { id => 340, parent => "MP" }, # Rota Island
    { id => 341, parent => "MP" }, # Saipan
    { id => 342, name => "GS" }, # South Georgia and the South Sandwich Islands
    { id => 343, name => "SH" }, # St. Helena
    { id => 346, parent => "MP" }, # Tinian Island
    { id => 347, name => "TK" }, # Tokelau
    { id => 348, parent => "SH" }, # Tristan da Cunha
    { id => 349, name => "TC" }, # Turks and Caicos Islands
    { id => 351, name => "VG" }, # Virgin Islands, British
    { id => 352, name => "WF" }, # Wallis and Futuna
    { id => 742, name => "002" }, # Africa
    { id => 2129, name => "142" }, # Asia
    { id => 10541, name => "150" }, # Europe
    { id => 15126, name => "IM" }, # Man, Isle of
    { id => 19618, name => "MK" }, # Macedonia, Former Yugoslav Republic of
    { id => 20900, name => "054" }, # Melanesia
    { id => 21206, name => "057" }, # Micronesia
    { id => 21242, parent => "UM" }, # Midway Islands
    { id => 23581, name => "021" }, # Northern America
    { id => 26286, name => "061" }, # Polynesia
    { id => 27082, name => "013" }, # Central America
    { id => 27114, name => "009" }, # Oceania
    { id => 30967, name => "SX" }, # Sint Maarten (Dutch part)
    { id => 31396, name => "005" }, # South America
    { id => 31706, name => "MF" }, # Saint Martin (French part)
    { id => 39070, name => "001" }, # World
    { id => 42483, name => "011" }, # Western Africa
    { id => 42484, name => "017" }, # Middle Africa
    { id => 42487, name => "015" }, # Northern Africa
    { id => 47590, name => "143" }, # Central Asia
    { id => 47599, name => "035" }, # South-Eastern Asia
    { id => 47600, name => "030" }, # Eastern Asia
    { id => 47603, name => "014" }, # Eastern Africa
    { id => 47609, name => "151" }, # Eastern Europe
    { id => 47610, name => "039" }, # Southern Europe
    { id => 47611, name => "145" }, # Middle East
    { id => 47614, name => "034" }, # Southern Asia
    { id => 7299303, name => "TL" }, # Democratic Republic of Timor-Leste
    { id => 9914689, name => "XK" }, # Kosovo
    { id => 10026358, name => "019" }, # Americas
    { id => 10028789, name => "AX" }, # Åland Islands
    { id => 10039880, name => "029", sintlsymbol => "XCD" }, # Caribbean
    { id => 10039882, name => "154" }, # Northern Europe
    { id => 10039883, name => "018" }, # Southern Africa
    { id => 10210824, name => "155" }, # Western Europe
    { id => 10210825, name => "053" }, # Australia and New Zealand
    { id => 161832015, name => "BL" }, # Saint Barthélemy
    { id => 161832256, name => "UM" }, # U.S. Minor Outlying Islands
    { id => 161832257, name => "419", parent => "019" }, # Latin America and the Caribbean
    { id => 161832258, name => "BQ" }, # Bonaire, Sint Eustatius and Saba
);
# code page conversion state
my @cp2uni;
my @glyph2uni;
my @lead_bytes;
my @uni2cp;

# per-character mapping tables, indexed by code point
my @tolower_table;
my @toupper_table;
my @digitmap_table;
my @halfwidth_table;
my @fullwidth_table;
my @cjk_compat_table;
my @chinese_traditional_table;
my @chinese_simplified_table;

# per-character property tables, indexed by code point
my @category_table;
my @initial_joining_table;
my @direction_table;

# normalization data: each decomposition entry is a reference to the list
# of code points a character decomposes to
my @decomp_table;
my @combining_class_table;
my @decomp_compat_table;
my @comp_exclusions;
my @idna_decomp_table;
my @idna_disallowed;

my %registry_keys;
my $default_char;
my $default_wchar;
# presentation forms, one table per joining form; each table is indexed by
# the base character and holds the character whose decomposition is
# "<form> base"
my %joining_forms = map { $_ => [] } ( "isolated", "final", "initial", "medial" );
# convert a list of code points to UTF-16: values below 0x10000 are passed
# through unchanged, anything else becomes a high/low surrogate pair
sub to_utf16(@)
{
    return map
    {
        my $offset = $_ - 0x10000;
        $offset < 0 ? ($_) : (0xd800 | ($offset >> 10), 0xdc00 | ($offset & 0x3ff));
    } @_;
}
################################################################
# fetch a unicode.org file and open it
#
# $base is either the URL of a .zip archive, in which case $name is the
# archive member to extract (through "unzip -p"), or a base URL below
# which $name is fetched directly.  Downloads are cached in
# $XDG_CACHE_HOME/wine (falling back to $HOME/.cache/wine); when the URL
# contains the Unicode version, "-$UNIVERSION" is added to the cached
# file name.  Returns a file handle open for reading, dies on failure.
sub open_data_file($$)
{
    my ($base, $name) = @_;
    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
    my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
    my $fh;

    if ($base =~ /.*\/([^\/]+)\.zip$/)
    {
        my $zip = "$1$suffix.zip";
        unless (-f "$cache/$zip")
        {
            system( "mkdir", "-p", $cache ) == 0 or die "cannot create $cache";
            print "Fetching $base...\n";
            if (system( "wget", "-q", "-O", "$cache/$zip", $base ) != 0)
            {
                # wget -O creates the output file even when the download
                # fails; remove it so that it isn't mistaken for a valid
                # cached copy on the next run
                unlink "$cache/$zip";
                die "cannot fetch $base";
            }
        }
        open $fh, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
    }
    else
    {
        (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
        # insert the version suffix in front of the file extension
        (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
        unless (-f $dest)
        {
            system( "mkdir", "-p", $dir ) == 0 or die "cannot create $dir";
            print "Fetching $base/$name...\n";
            if (system( "wget", "-q", "-O", $dest, "$base/$name" ) != 0)
            {
                unlink $dest;  # don't leave a truncated file in the cache
                die "cannot fetch $base/$name";
            }
        }
        open $fh, "<", $dest or die "cannot open $dest";
    }
    return $fh;
}
################################################################
# load a unicode.org file as XML data
#
# Fetches/opens the file through open_data_file() and returns the document
# parsed by XML::LibXML.
sub load_xml_data_file($$)
{
    my ($base, $name) = @_;
    my $FILE = open_data_file( $base, $name );
    my $xml = XML::LibXML->load_xml( IO => $FILE );
    # close the handle that was actually returned to us; the bareword FILE
    # refers to an unrelated global handle that is never opened
    close $FILE;
    return $xml;
}
################################################################
# recursively get the decomposition for a character
#
# $table is a reference to an array indexed by character, whose defined
# entries are references to the list of characters it decomposes to.
# A character without an entry decomposes to itself.
sub get_decomposition($$);
sub get_decomposition($$)
{
    my ($char, $table) = @_;
    my $mapping = $table->[$char];

    return $char unless defined $mapping;
    return map { get_decomposition( $_, $table ) } @{$mapping};
}
################################################################
# get the composition that results in a given character
#
# Returns the canonical decomposition of $ch when it can be recomposed,
# and an empty list otherwise.  $compat selects extra restrictions:
# 1 takes the compatibility decompositions into account, 2 the IDNA ones.
sub get_composition($$)
{
    my ($ch, $compat) = @_;
    my $decomp = $decomp_table[$ch];

    return () unless defined $decomp;                # no decomposition
    my @ret = @{$decomp};
    return () if @ret < 2;                           # singleton decomposition
    return () if $comp_exclusions[$ch];              # composition exclusion
    return () if $combining_class_table[$ch];        # non-starter

    my ($first, $second) = @ret;
    return () if $combining_class_table[$first];     # first char is non-starter

    if ($compat == 1)
    {
        # first char has compat decomposition
        return () if !defined $decomp_table[$first] && defined $decomp_compat_table[$first];
    }
    elsif ($compat == 2)
    {
        my $idna = $idna_decomp_table[$first];
        if (defined $idna)
        {
            # first char has IDNA decomposition
            return () unless defined $decomp_table[$first];
            # first char's decomposition has IDNA decomposition
            return () if defined $idna_decomp_table[$idna->[0]];
        }
        # second char has IDNA decomposition
        return () if defined $idna_decomp_table[$second];
    }
    return @ret;
}
################################################################
# recursively build decompositions
#
# Takes a decomposition table and returns a table with the same defined
# entries, each one replaced by its full decomposition in UTF-16.
sub build_decompositions(@)
{
    my @src = @_;
    my @dst;

    foreach my $ch (0 .. $#src)
    {
        next unless defined $src[$ch];
        $dst[$ch] = [ to_utf16( get_decomposition( $ch, \@src )) ];
    }
    return @dst;
}
################################################################
# compose Hangul sequences
#
# Walks the sequence and replaces a leading consonant + vowel by the LV
# syllable, and an LV syllable + trailing consonant by the LVT syllable.
# Everything else is copied unchanged.
sub compose_hangul(@)
{
    my ($SBASE, $LBASE, $VBASE, $TBASE) = (0xac00, 0x1100, 0x1161, 0x11a7);
    my ($LCOUNT, $VCOUNT, $TCOUNT) = (19, 21, 28);
    my $SCOUNT = $LCOUNT * $VCOUNT * $TCOUNT;

    my @seq = @_;
    my @ret;

    while (@seq)
    {
        my $ch = shift @seq;

        # leading consonant followed by a vowel
        if (@seq && $ch >= $LBASE && $ch < $LBASE + $LCOUNT)
        {
            my $vowel = $seq[0] - $VBASE;
            if ($vowel >= 0 && $vowel < $VCOUNT)
            {
                $ch = $SBASE + (($ch - $LBASE) * $VCOUNT + $vowel) * $TCOUNT;
                shift @seq;
            }
        }

        # LV syllable followed by a trailing consonant; $TBASE itself is
        # one below the first trailing consonant, hence the strict test
        if (@seq && $ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT))
        {
            my $trail = $seq[0] - $TBASE;
            if ($trail > 0 && $trail < $TCOUNT)
            {
                $ch += $trail;
                shift @seq;
            }
        }
        push @ret, $ch;
    }
    return @ret;
}
################################################################
# remove linguistic-only mappings from the case table
#
# $upper and $lower are references to the upper-case and lower-case
# mapping tables; both are modified in place.  A mapping is dropped
# unless the opposite table maps its target back to the original
# character.
sub remove_linguistic_mappings($$)
{
    my ($upper, $lower) = @_;

    # remove case mappings that don't round-trip; the order matters, the
    # lower-case pass is checked against the already pruned upper-case table

    foreach my $pass ([ $upper, $lower ], [ $lower, $upper ])
    {
        my ($table, $reverse) = @{$pass};
        foreach my $i (0 .. $#{$table})
        {
            my $ch = $table->[$i];
            next unless defined $ch;
            next if defined $reverse->[$ch] && $reverse->[$ch] == $i;
            $table->[$i] = undef;
        }
    }
}
2028 ################################################################
2029 # read in the Unicode database files
2030 sub load_data()
2032 my $start;
2034 # now build mappings from the decomposition field of the Unicode database
2036 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
2037 while (<$UNICODE_DATA>)
2039 # Decode the fields ...
2040 my ($code, $name, $cat, $comb, $bidi,
2041 $decomp, $dec, $dig, $num, $mirror,
2042 $oldname, $comment, $upper, $lower, $title) = split /;/;
2043 my $src = hex $code;
2045 die "unknown category $cat" unless defined $categories{$cat};
2046 die "unknown directionality $bidi" unless defined $directions{$bidi};
2048 $category_table[$src] = $categories{$cat};
2049 $direction_table[$src] = $bidi;
2050 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
2052 $initial_joining_table[$src] = $joining_types{"T"};
2054 else
2056 $initial_joining_table[$src] = $joining_types{"U"};
2059 if ($lower ne "")
2061 $tolower_table[$src] = hex $lower;
2063 if ($upper ne "")
2065 $toupper_table[$src] = hex $upper;
2067 if ($dec ne "")
2069 $category_table[$src] |= $ctype{"digit"};
2071 if ($dig ne "")
2073 $digitmap_table[$src] = ord $dig;
2075 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
2077 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
2078 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
2079 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
2080 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
2081 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
2082 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
2083 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
2084 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
2085 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
2086 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
2087 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
2088 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
2090 # copy the category and direction for everything between First/Last pairs
2091 if ($name =~ /, First>/) { $start = $src; }
2092 if ($name =~ /, Last>/)
2094 while ($start < $src)
2096 $category_table[$start] = $category_table[$src];
2097 $direction_table[$start] = $direction_table[$src];
2098 $combining_class_table[$start] = $combining_class_table[$src];
2099 $start++;
2103 next if $decomp eq ""; # no decomposition, skip it
2105 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
2107 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
2108 $decomp_compat_table[$src] = \@seq;
2111 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
2113 # decomposition of the form "<foo> 1234" -> use char if type is known
2114 my $dst = hex $2;
2115 if ($1 eq "narrow")
2117 $halfwidth_table[$dst] = $src;
2118 $fullwidth_table[$src] = $dst;
2120 elsif ($1 eq "wide")
2122 next if $dst == 0x5c; # don't remap backslash
2123 $fullwidth_table[$dst] = $src;
2124 $halfwidth_table[$src] = $dst;
2126 elsif ($1 eq "font" || $1 eq "square" || $1 eq "circle")
2128 $fullwidth_table[$src] = $dst if $src >= 0x10000;
2130 elsif ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
2132 ${joining_forms{$1}}[$dst] = $src;
2135 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
2137 # decomposition "<compat> 0020 1234" -> combining accent
2139 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
2141 # store decomposition
2142 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
2144 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
2146 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
2148 my $dst = hex $1;
2149 # Single char decomposition
2150 $decomp_table[$src] = $decomp_compat_table[$src] = [ $dst ];
2151 if ($name =~ /^CJK COMPATIBILITY IDEOGRAPH/)
2153 $cjk_compat_table[$src] = $dst;
2154 $fullwidth_table[$src] = $dst if $src >= 0x10000;
2159 close $UNICODE_DATA;
2161 # patch the category of some special characters
2163 for (my $i = 0; $i < @decomp_table; $i++)
2165 next unless defined $decomp_table[$i];
2166 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
2168 foreach my $cat (keys %special_categories)
2170 my $flag = $ctype{$cat};
2171 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
2173 for (my $i = 0; $i < @decomp_compat_table; $i++)
2175 next unless defined $decomp_compat_table[$i];
2176 next unless @{$decomp_compat_table[$i]} == 2;
2177 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
2180 # load the composition exclusions
2182 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
2183 while (<$EXCL>)
2185 s/\#.*//; # remove comments
2186 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
2188 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
2190 elsif (/^([0-9a-fA-F]+)\s*$/)
2192 $comp_exclusions[hex $1] = 1;
2195 close $EXCL;
2197 # load the IDNA mappings
2199 @idna_decomp_table = @decomp_compat_table;
2200 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
2201 while (<$IDNA>)
2203 s/\#.*//; # remove comments
2204 next if /^\s*$/;
2205 my ($char, $type, $mapping) = split /;/;
2206 my ($ch1, $ch2);
2207 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
2209 $ch1 = hex $1;
2210 $ch2 = hex $2;
2212 elsif ($char =~ /([0-9a-fA-F]+)/)
2214 $ch1 = $ch2 = hex $1;
2217 if ($type =~ /mapped/ || $type =~ /deviation/)
2219 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
2220 my @seq = map { hex $_; } split /\s+/, $mapping;
2221 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
2223 elsif ($type =~ /valid/)
2226 elsif ($type =~ /ignored/)
2228 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
2230 elsif ($type =~ /disallowed/)
2232 foreach my $i ($ch1 .. $ch2)
2234 $idna_decomp_table[$i] = undef;
2235 $idna_disallowed[$i] = 1;
2239 close $IDNA;
2241 # load the Unihan mappings
2243 my $UNIHAN = open_data_file( $UNIHAN, "Unihan_Variants.txt" );
2244 while (<$UNIHAN>)
2246 s/\#.*//; # remove comments
2247 next if /^\s*$/;
2248 if (/^U\+([0-9a-fA-F]{4})\s+kTraditionalVariant\s+U\+([0-9a-fA-F]{4})$/)
2250 next if hex $1 < 0x4dc0; # skip extension A
2251 $chinese_traditional_table[hex $1] = hex $2;
2253 elsif (/^U\+([0-9a-fA-F]{4})\s+kSimplifiedVariant\s+U\+([0-9a-fA-F]{4})$/)
2255 next if hex $1 < 0x4dc0; # skip extension A
2256 $chinese_simplified_table[hex $1] = hex $2;
2259 close $UNIHAN;
2260 foreach my $i (0xf900..0xfaff)
2262 next unless defined $cjk_compat_table[$i];
2263 next if defined $chinese_simplified_table[$cjk_compat_table[$i]];
2264 $chinese_simplified_table[$i] = $cjk_compat_table[$i];
################################################################
# register a key in %registry_keys
# The entry is created as a one-element list holding the default value;
# a key that is already present is left untouched.
sub add_registry_key($$)
{
    my ($key, $defval) = @_;
    $registry_keys{$key} //= [ $defval ];
}
################################################################
# append a string value to a registry key
# The key is created first (with an undefined default) when it is missing.
sub add_registry_value($$$)
{
    my ($key, $name, $value) = @_;
    add_registry_key( $key, undef );
    my $entries = $registry_keys{$key};
    push @$entries, "'$name' = s '$value'";
}
################################################################
# declare a byte as a DBCS lead byte
# Nothing happens when the byte already has an entry in @cp2uni;
# otherwise it is recorded in @lead_bytes and its @cp2uni slot is set to 0.
sub add_lead_byte($)
{
    my ($byte) = @_;
    unless (defined $cp2uni[$byte])
    {
        push @lead_bytes, $byte;
        $cp2uni[$byte] = 0;
    }
}
################################################################
# record a codepage char <-> Unicode char mapping
# The first mapping seen for a given index wins in each direction.
# Two-byte codepage values also register their high byte as a lead byte.
sub add_mapping($$)
{
    my ($cp, $uni) = @_;
    $cp2uni[$cp] //= $uni;
    $uni2cp[$uni] //= $cp;
    add_lead_byte( $cp >> 8 ) if $cp > 0xff;
}
################################################################
# build the mapping used for MB_USEGLYPHCHARS
# Returns a copy of the given table in which every index that has a
# defined entry in @glyph2uni is replaced by that entry.
sub get_glyphs_mapping(@)
{
    my @table = @_;

    foreach my $idx (grep { defined $glyph2uni[$_] } 0 .. $#glyph2uni)
    {
        $table[$idx] = $glyph2uni[$idx];
    }
    return @table;
}
################################################################
# build the EUC-JP table (codepage 20932) from the JIS 0208/0212 files
# Mappings are added in a fixed order because add_mapping() keeps the
# first value seen for each slot.
sub dump_eucjp_codepage()
{
    # start from empty tables
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = 0x30fb;

    # ASCII chars map to themselves
    add_mapping( $_, $_ ) foreach 0x00 .. 0x7f;

    # lead bytes
    add_lead_byte( $_ ) foreach 0x8e, 0xa1 .. 0xfe;

    # JIS X 0201 right plane
    add_mapping( 0x8e00 + $_, 0xfec0 + $_ ) foreach 0xa1 .. 0xdf;

    # undefined chars
    $cp2uni[$_] = $_ foreach 0x80 .. 0x8d, 0x8f .. 0x9f;
    $cp2uni[0xa0] = 0xf8f0;
    $cp2uni[0xff] = 0xf8f3;

    # Fix backslash conversion
    add_mapping( 0xa1c0, 0xff3c );

    # Add private mappings for rows undefined in JIS 0208/0212:
    # all rows with trail bytes 0xa1..0xfe first, then all rows with 0x21..0x7e
    my $private = 0xe000;
    foreach my $trail ([ 0xa1, 0xfe ], [ 0x21, 0x7e ])
    {
        foreach my $hi (0xf5 .. 0xfe)
        {
            foreach my $lo ($trail->[0] .. $trail->[1])
            {
                add_mapping( ($hi << 8) + $lo, $private++ );
            }
        }
    }

    # each entry: data file, offset added to the code, line pattern
    # capturing the code in $1 and the Unicode value in $2
    my @sources =
    (
        [ "JIS0208.TXT", 0x8080, qr/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/ ],
        [ "JIS0212.TXT", 0x8000, qr/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/ ],
    );
    foreach my $source (@sources)
    {
        my ($file, $offset, $pattern) = @$source;
        my $INPUT = open_data_file( $JISDATA, $file );
        while (<$INPUT>)
        {
            # skip comments, empty lines and ^Z
            next if /^\#/ || /^$/ || /\x1a/;
            die "Unrecognized line $_\n" unless $_ =~ $pattern;
            add_mapping( $offset + hex($1), hex($2) );
        }
        close $INPUT;
    }

    output_codepage_file( 20932 );
}
################################################################
# build the Korean Wansung table (codepage 20949) from the KSX1001 file
# The argument is a Unicode -> cp949 table used to pick up extra mappings.
sub dump_krwansung_codepage(@)
{
    my @cp949 = @_;

    # start from empty tables
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = 0x3f;
    $default_wchar = 0x003f;

    # ASCII and undefined chars
    add_mapping( $_, $_ ) foreach 0x00 .. 0x9f;
    foreach my $pair ([ 0xa0, 0xf8e6 ], [ 0xad, 0xf8e7 ], [ 0xae, 0xf8e8 ],
                      [ 0xaf, 0xf8e9 ], [ 0xfe, 0xf8ea ], [ 0xff, 0xf8eb ])
    {
        add_mapping( $pair->[0], $pair->[1] );
    }

    my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "Unrecognized line $_\n" unless /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex($1), hex($2) );
    }
    close $INPUT;

    # get some extra mappings from cp 949
    # (lead bytes are snapshotted here, before the extra mappings are added)
    my %known_lead_byte = map { $_ => 1 } @lead_bytes;
    foreach my $uni (0x0000 .. 0xffff)
    {
        next if $uni >= 0x1100 && $uni <= 0x11ff;  # range not used in 20949
        my $cp = $cp949[$uni];
        next unless defined $cp;
        if ($cp >= 0xff)
        {
            # only add chars for lead bytes that exist in 20949
            my ($hi, $lo) = ($cp >> 8, $cp & 0xff);
            next unless $known_lead_byte{$hi};
            next if $lo < 0xa1 || $lo > 0xfe;
        }
        add_mapping( $cp, $uni );
    }

    output_codepage_file( 20949 );
}
################################################################
# format an array of integers as comma-separated hex constants
# $bit_width selects the number of hex digits (bit_width / 4), undefined
# entries are printed as $default, and a line break is inserted after
# every 8th value.  An empty array still yields one $default value.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;
    my $last = @array ? $#array : 0;
    my $text = " ";

    foreach my $idx (0 .. $last)
    {
        my $value = defined $array[$idx] ? $array[$idx] : $default;
        $text .= sprintf( $format, $value );
        next if $idx == $last;  # no separator after the final value
        $text .= ($idx % 8 == 7) ? ",\n " : ", ";
    }
    return $text;
}
################################################################
# write an SBCS mapping table to OUTPUT in binary format
# Layout written here: 16-bit header words, 12 zero bytes, the offset of
# the Unicode->codepage section followed by the 256-entry codepage->Unicode
# table, an optional 256-entry glyph table, two zero words, and finally
# one byte per Unicode value 0..65535.
sub dump_binary_sbcs_table($)
{
    my ($codepage) = @_;
    my $has_glyphs = @glyph2uni ? 1 : 0;

    my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
    my $wc_offset = 256 + 3 + ($has_glyphs ? 256 : 0);

    print OUTPUT pack( "S<*", @header );
    print OUTPUT pack( "C12", (0) x 12 );
    print OUTPUT pack( "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255] );

    # glyph section: a count word, then the table when glyph chars exist
    my @glyph_section = $has_glyphs ? (256, get_glyphs_mapping(@cp2uni[0 .. 255])) : (0);
    print OUTPUT pack( "S<*", @glyph_section );

    print OUTPUT pack( "S<*", 0, 0 );

    # unmapped Unicode chars fall back to the default char
    print OUTPUT pack( "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535] );
}
################################################################
# write a DBCS mapping table to OUTPUT in binary format
# Layout written here: 16-bit header words, the lead byte ranges padded
# with zeros to 12 bytes, the offset of the Unicode->codepage section
# followed by the 256-entry single byte table, the lead byte offsets,
# one 256-entry table per lead byte, and finally one 16-bit value per
# Unicode char 0..65535.
# Side effect: the @cp2uni entry of every lead byte is reset to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own 256-entry table, in @lead_bytes order
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # (0) x 12 is list repetition: it supplies explicit zero padding.
    # The previous "0 x 12" built the single string "000000000000" and
    # relied on pack() zero-filling the arguments that were missing.
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
################################################################
# get the list of defined lead byte ranges
# Returns a flat list (first1, last1, first2, last2, ...) covering every
# run of consecutive byte values found in @lead_bytes.
sub get_lb_ranges()
{
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @ranges;
    my $in_range = 0;

    foreach my $byte (0 .. 255)
    {
        my $present = $is_lead{$byte} ? 1 : 0;
        next if $present == $in_range;
        # a run starts at this byte, or ended on the previous one
        push @ranges, $present ? $byte : $byte - 1;
        $in_range = $present;
    }
    push @ranges, 0xff if $in_range;
    return @ranges;
}
################################################################
# dump the Indic Syllabic Category table
# The low byte of each entry comes from IndicSyllabicCategory.txt and the
# matra type from IndicPositionalCategory.txt is or'ed in, shifted left
# by 8.  The result is written to $filename as a two-level table.
sub dump_indic($)
{
    my ($filename) = @_;
    my @indic_table;

    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown indic $type" unless defined $indic_types{$type};

        # entries are only stored when both ends are below 65536
        next unless $first < 65536 && $last < 65536;
        foreach my $ch ($first .. $last)
        {
            $indic_table[$ch] = $indic_types{$type};
        }
    }
    close $INPUT;

    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown matra $type" unless defined $matra_types{$type};

        foreach my $ch ($first .. $last)
        {
            $indic_table[$ch] |= $matra_types{$type} << 8;
        }
    }
    close $INPUT;

    open( OUTPUT, ">$filename.new" ) or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n",
                 "/* and from $UNIDATA:IndicPositionalCategory.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Line Break Properties table
# Reads LineBreak.txt and writes the break class of each char to
# $filename as a two-level table.
sub dump_linebreak($)
{
    my ($filename) = @_;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        # three-character classes are tried before two-character ones,
        # for both single chars and ranges
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown breaktype $type" unless defined $break_types{$type};

        foreach my $ch ($first .. $last)
        {
            $break_table[$ch] = $break_types{$type};
        }
    }
    close $INPUT;

    open( OUTPUT, ">$filename.new" ) or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n",
                 "/* generated from $UNIDATA:LineBreak.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
# Script name (as spelled in Scripts.txt) -> numeric script id.
# Ids are consecutive; "Unknown" must stay 0 because dump_scripts()
# relies on undefined entries being treated as Unknown.
my %scripts =
(
    Unknown                => 0,
    Common                 => 1,
    Inherited              => 2,
    Arabic                 => 3,
    Armenian               => 4,
    Avestan                => 5,
    Balinese               => 6,
    Bamum                  => 7,
    Batak                  => 8,
    Bengali                => 9,
    Bopomofo               => 10,
    Brahmi                 => 11,
    Braille                => 12,
    Buginese               => 13,
    Buhid                  => 14,
    Canadian_Aboriginal    => 15,
    Carian                 => 16,
    Cham                   => 17,
    Cherokee               => 18,
    Coptic                 => 19,
    Cuneiform              => 20,
    Cypriot                => 21,
    Cyrillic               => 22,
    Deseret                => 23,
    Devanagari             => 24,
    Egyptian_Hieroglyphs   => 25,
    Ethiopic               => 26,
    Georgian               => 27,
    Glagolitic             => 28,
    Gothic                 => 29,
    Greek                  => 30,
    Gujarati               => 31,
    Gurmukhi               => 32,
    Han                    => 33,
    Hangul                 => 34,
    Hanunoo                => 35,
    Hebrew                 => 36,
    Hiragana               => 37,
    Imperial_Aramaic       => 38,
    Inscriptional_Pahlavi  => 39,
    Inscriptional_Parthian => 40,
    Javanese               => 41,
    Kaithi                 => 42,
    Kannada                => 43,
    Katakana               => 44,
    Kayah_Li               => 45,
    Kharoshthi             => 46,
    Khmer                  => 47,
    Lao                    => 48,
    Latin                  => 49,
    Lepcha                 => 50,
    Limbu                  => 51,
    Linear_B               => 52,
    Lisu                   => 53,
    Lycian                 => 54,
    Lydian                 => 55,
    Malayalam              => 56,
    Mandaic                => 57,
    Meetei_Mayek           => 58,
    Mongolian              => 59,
    Myanmar                => 60,
    New_Tai_Lue            => 61,
    Nko                    => 62,
    Ogham                  => 63,
    Ol_Chiki               => 64,
    Old_Italic             => 65,
    Old_Persian            => 66,
    Old_South_Arabian      => 67,
    Old_Turkic             => 68,
    Oriya                  => 69,
    Osmanya                => 70,
    Phags_Pa               => 71,
    Phoenician             => 72,
    Rejang                 => 73,
    Runic                  => 74,
    Samaritan              => 75,
    Saurashtra             => 76,
    Shavian                => 77,
    Sinhala                => 78,
    Sundanese              => 79,
    Syloti_Nagri           => 80,
    Syriac                 => 81,
    Tagalog                => 82,
    Tagbanwa               => 83,
    Tai_Le                 => 84,
    Tai_Tham               => 85,
    Tai_Viet               => 86,
    Tamil                  => 87,
    Telugu                 => 88,
    Thaana                 => 89,
    Thai                   => 90,
    Tibetan                => 91,
    Tifinagh               => 92,
    Ugaritic               => 93,
    Vai                    => 94,
    Yi                     => 95,
    # Win8/Win8.1
    Chakma                 => 96,
    Meroitic_Cursive       => 97,
    Meroitic_Hieroglyphs   => 98,
    Miao                   => 99,
    Sharada                => 100,
    Sora_Sompeng           => 101,
    Takri                  => 102,
    # Win10
    Bassa_Vah              => 103,
    Caucasian_Albanian     => 104,
    Duployan               => 105,
    Elbasan                => 106,
    Grantha                => 107,
    Khojki                 => 108,
    Khudawadi              => 109,
    Linear_A               => 110,
    Mahajani               => 111,
    Manichaean             => 112,
    Mende_Kikakui          => 113,
    Modi                   => 114,
    Mro                    => 115,
    Nabataean              => 116,
    Old_North_Arabian      => 117,
    Old_Permic             => 118,
    Pahawh_Hmong           => 119,
    Palmyrene              => 120,
    Pau_Cin_Hau            => 121,
    Psalter_Pahlavi        => 122,
    Siddham                => 123,
    Tirhuta                => 124,
    Warang_Citi            => 125,
    # Win10 RS1
    Adlam                  => 126,
    Ahom                   => 127,
    Anatolian_Hieroglyphs  => 128,
    Bhaiksuki              => 129,
    Hatran                 => 130,
    Marchen                => 131,
    Multani                => 132,
    Newa                   => 133,
    Old_Hungarian          => 134,
    Osage                  => 135,
    SignWriting            => 136,
    Tangut                 => 137,
    # Win10 RS4
    Masaram_Gondi          => 138,
    Nushu                  => 139,
    Soyombo                => 140,
    Zanabazar_Square       => 141,
    # Win10 1903
    Dogra                  => 142,
    Gunjala_Gondi          => 143,
    Hanifi_Rohingya        => 144,
    Makasar                => 145,
    Medefaidrin            => 146,
    Old_Sogdian            => 147,
    Sogdian                => 148,
    # Win10 2004
    Elymaic                => 149,
    Nyiakeng_Puachue_Hmong => 150,
    Nandinagari            => 151,
    Wancho                 => 152,
    # Win11
    Chorasmian             => 153,
    Dives_Akuru            => 154,
    Khitan_Small_Script    => 155,
    Yezidi                 => 156,
);
################################################################
# dump Script IDs table
# Writes two files derived from the $filename prefix:
#   $filename.h - the unicode_script_id enum built from %scripts
#   $filename.c - the char -> script id two-level table from Scripts.txt
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/;     # skip comments
        next if /^\s*$/;   # skip empty lines
        next if /\x1a/;    # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            my $type = $2;
            # scripts missing from %scripts are silently left as Unknown
            if (defined $scripts{$type})
            {
                $scripts_table[hex $1] = $scripts{$type};
            }
            next;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            my $type = $3;
            if (defined $scripts{$type})
            {
                foreach my $i (hex $1 .. hex $2)
                {
                    $scripts_table[$i] = $scripts{$type};
                }
            }
            next;
        }
        # lines matching neither form are ignored
    }
    close $INPUT;

    my $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT " Script_$script = $scripts{$script},\n";
    }
    print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    $filename = "$filename.c";
    # report the .c file name on failure (this used to name the header)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the BiDi mirroring table
# Reads BidiMirroring.txt and writes the char -> mirrored char mapping
# to $filename as a two-level table.
sub dump_mirroring($)
{
    my ($filename) = @_;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/;
        $mirror_table[hex($1)] = hex($2);
    }
    close $INPUT;

    open( OUTPUT, ">$filename.new" ) or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $UNIDATA:BidiMirroring.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Bidi Brackets
# Reads BidiBrackets.txt and writes a two-level table to $filename.
# Each entry holds the signed distance to the paired bracket in its low
# byte (two's complement) and the bracket type shifted left by 8.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;     # skip comments
        next if /^\s*$/;   # skip empty lines
        next if /\x1a/;    # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my $type = $3;
            my $offset = hex($2) - hex($1);
            die "unknown bracket $type" unless defined $bracket_types{$type};
            # the offset has to fit in a signed byte
            die "characters too distant $1 and $2" if abs($offset) >= 128;
            # Fold the offset into 0..255.  The modulus must be 256:
            # with 255 every negative offset came out one too low
            # (-1 became 254, which is -2 as a signed byte).
            $bracket_table[hex $1] = $offset % 256;
            $bracket_table[hex $1] += $bracket_types{$type} << 8;
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table
# Writes the joining type of each char (from ArabicShaping.txt, on top of
# @initial_joining_table) as a two-level table, followed by the
# isolated/final/initial/medial forms of chars 0x600..0x6ff.
sub dump_shaping($)
{
    my ($filename) = @_;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        $joining_table[hex($1)] = $joining_types{$2};
    }
    close $INPUT;

    open( OUTPUT, ">$filename.new" ) or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $UNIDATA:ArabicShaping.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        # a char without a recorded form stands for itself
        my @forms = map { $joining_forms{$_}[$ch] || $ch } ("isolated", "final", "initial", "medial");
        printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table
# Like dump_shaping(), but chars in the ALAPH and DALATH RISH joining
# groups get the joining type registered under the group name instead of
# the one for their R/L/D/C/U/T letter.  Only the two-level table is
# written.
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;     # skip comments
        next if /^\s*$/;   # skip empty lines
        next if /\x1a/;    # skip ^Z
        # The joining group may consist of several words ("DALATH RISH");
        # capturing only \w+ stopped at the first space, so the
        # DALATH RISH test below could never succeed.
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?:\s+\w+)*)/)
        {
            my $type = $2;
            my $group = $3;

            if ($group eq "ALAPH" || $group eq "DALATH RISH")
            {
                $joining_table[hex $1] = $joining_types{$group};
            }
            else
            {
                $joining_table[hex $1] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Vertical Orientation table
# Reads VerticalOrientation.txt and writes a two-level table to $filename.
# When $unix is true a "#pragma makedep unix" marker is emitted first.
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type, $is_range);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type, $is_range) = (hex($1), hex($1), $2, 0);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            ($first, $last, $type, $is_range) = (hex($1), hex($2), $3, 1);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown vertical $type" unless defined $vertical_types{$type};

        # single chars at 65536 and above are dropped; ranges are stored as given
        next if !$is_range && $first >= 65536;
        foreach my $ch ($first .. $last)
        {
            $vertical_table[$ch] = $vertical_types{$type};
        }
    }
    close $INPUT;

    open( OUTPUT, ">$filename.new" ) or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n",
                 "/* generated from $UNIDATA:VerticalOrientation.txt */\n",
                 "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
        print OUTPUT "#if 0\n",
                     "#pragma makedep unix\n",
                     "#endif\n\n";
    }
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# compress a mapping table by removing identical rows
# The table is cut into $rows rows of equal length; undefined cells count
# as $def.  Rows already present in the packed data are shared, and a row
# may overlap the tail of the data.  Returns the $rows row offsets (each
# biased by $rows) followed by the packed data.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $len = @table / $rows;
    my @offsets;
    my $packed = "";

    foreach my $row (0 .. $rows - 1)
    {
        # one character per cell, so string search works on whole cells
        my @cells = map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "U*", @cells;
        my $pos = index $packed, $rowtxt;
        if ($pos < 0)
        {
            # check if the tail of the data can match the start of the new row
            my $first = substr( $rowtxt, 0, 1 );
            for (my $tail = length($packed) - 1; $tail > 0; $tail--)
            {
                my $hit = index( substr( $packed, -$tail ), $first );
                last if $hit == -1;
                $tail -= $hit;  # jump to the next candidate start
                if (substr( $packed, -$tail ) eq substr( $rowtxt, 0, $tail ))
                {
                    # drop the overlapping tail, the row re-adds it below
                    substr( $packed, -$tail ) = "";
                    last;
                }
            }
            $pos = length $packed;
            $packed .= $rowtxt;
        }
        $offsets[$row] = $rows + $pos;
    }
    return @offsets, unpack "U*", $packed;
}
################################################################
# dump a char -> 16-bit value mapping table using two-level tables
# Arguments: C array name, default value for undefined chars, element
# size in bits (16 selects "unsigned short", anything else "unsigned int"),
# then the values for chars 0..65535.  The array is printed to OUTPUT.
sub dump_two_level_mapping($$$@)
{
    my ($name, $def, $size, @values) = @_;
    my $type = ($size == 16) ? "unsigned short" : "unsigned int";

    # first pass: 4096 rows of chars; second pass: 256 rows of row offsets
    my @data = compress_array( 4096, $def, @values[0..65535] );
    my @row_array = splice @data, 0, 4096;
    my @row_data = compress_array( 256, 0, @row_array );
    my @array = splice @row_data, 0, 256;

    # rebase the level 2 offsets onto the final array layout
    my $rebase = @row_data + 256 - 4096;
    $_ += $rebase foreach @row_data;

    my $total = @array + @row_data + @data;
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @row_data );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @data );
}
################################################################
# dump a char -> value mapping table using three-level tables
# Arguments: C array name, default value for undefined chars, element
# size in bits (16 selects "unsigned short", anything else "unsigned int"),
# then the values for chars 0..$MAX_CHAR.  The array is printed to OUTPUT.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size, @values) = @_;
    my $type = ($size == 16) ? "unsigned short" : "unsigned int";

    # every level groups 16 entries of the level below it
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;
    my @array3 = compress_array( $level3, $def, @values[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # rebase the packed offsets onto the final array layout
    my $rebase2 = @array1 + @array2 - $level2 - $level3;
    $_ += $rebase2 foreach @array2[$level2..$#array2];
    my $rebase1 = @array1 - $level2;
    $_ += $rebase1 foreach @array1[$level1..$#array1];

    my $total = @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
################################################################
# dump a binary case mapping table in l_intl.nls format
# Takes a char -> mapped char table and returns the packed table as a
# string.  Entries are stored as differences (mapped - char); chars above
# 0xffff, when present, are stored as 32-bit values after the 16-bit part.
sub dump_binary_case_table(@)
{
    my (@table) = @_;
    my @difftable;
    my @res;

    foreach my $ch (0 .. $#table)
    {
        next unless defined $table[$ch];
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffffffff;
    }

    # two-level compression of the first 65536 chars
    my @low_data = compress_array( 4096, 0, @difftable[0..65535] );
    my @low_array2 = splice @low_data, 0, 4096;
    my @low_row_data = compress_array( 256, 0, @low_array2 );
    my @low_array1 = splice @low_row_data, 0, 256;

    unless (scalar @table > 0x10000)
    {
        @res = @low_array1;
        my $low_row_base = @res + @low_row_data - 4096;
        push @res, map { $_ + $low_row_base; } @low_row_data;
        push @res, @low_data;
        return pack "S<*", 1 + scalar(@res), @res;
    }

    # same compression for the chars above 0xffff
    my @high_data = compress_array( 32768, 0, @difftable[65536..$MAX_CHAR] );
    my @high_array2 = splice @high_data, 0, 32768;
    my @high_row_data = compress_array( 1024, 0, @high_array2 );
    my @high_array1 = splice @high_row_data, 0, 1024;

    # each base is computed from the size of @res before the matching push
    @res = map { $_ + 1024; } @low_array1;
    my $high_base = @res + @low_row_data + @low_data;
    push @res, map { $_ + $high_base; } @high_array1;
    my $low_row_base = @res + @low_row_data - 4096;
    push @res, map { $_ + $low_row_base; } @low_row_data;
    push @res, @low_data;
    my $high_row_base = @res + @high_row_data;
    push @res, map { 2 * ($_ - 32768) + $high_row_base; } @high_row_data;

    my $count = 1 + scalar(@res) + 2 * scalar(@high_data);
    return pack( "S<*", $count, @res ) . pack( "L<*", @high_data );
}
################################################################
# dump case mappings for l_intl.nls
#
# Writes a version word followed by the upper and lower case tables of
# the first 65536 characters, after remove_linguistic_mappings() has
# been applied to copies of the global case tables. The data is written
# to "$filename.new" and then handed to save_file().
sub dump_intl_nls($)
{
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table[0..65535] );
    my $lower = dump_binary_case_table( @lower_table[0..65535] );

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    # plain print: the file name must not be interpreted as a format string
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot write $filename: $!";
    save_file($filename);
}
################################################################
# dump the bidi direction table
#
# Generates a C source file holding the BiDi type of the first 65536
# characters, using "L" as the default value of the mapping.
sub dump_bidi_dir_table($)
{
    my $filename = shift;

    # OUTPUT is kept as a bareword handle: dump_two_level_mapping() is
    # called without a handle argument (presumably it prints to OUTPUT)
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    # plain print: the file name must not be interpreted as a format string
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot write $filename: $!";
    save_file($filename);
}
# rotate an 8-bit value left by the given number of bits
sub rol($$)
{
    my ($value, $bits) = @_;
    my $shifted_up = $value << $bits;
    my $wrapped = $value >> (8 - $bits);
    return ($shifted_up | $wrapped) & 0xff;
}
################################################################
# compress the character properties table
#
# The table is split into the given number of equally sized rows.
# Returns a list made of one index per row, followed by the contents of
# every distinct row in order of first appearance. Undefined entries
# count as 0x7f. Rows filled entirely with rol($i,5), for $i in 0 and
# 0xfb..0xff, are predefined: they get index $i and no row data.
sub compress_char_props_table($@)
{
    my ($row_count, @props) = @_;
    my $row_len = @props / $row_count;
    my @result = (0) x $row_count;
    my %row_index;
    my $last_index = 0;

    # add some predefined sequences
    foreach my $id (0, 0xfb .. 0xff)
    {
        my $fill = rol( $id, 5 );
        $row_index{pack "L*", ($fill) x $row_len} = $id;
    }

    # try to merge table rows
    foreach my $row (0 .. $row_count - 1)
    {
        my @values = map { defined $_ ? $_ : 0x7f }
                     @props[($row * $row_len) .. (($row + 1) * $row_len - 1)];
        my $key = pack "L*", @values;

        unless (defined $row_index{$key})
        {
            # first time this row is seen: give it the next index and keep its data
            $row_index{$key} = ++$last_index;
            push @result, @values;
        }
        $result[$row] = $row_index{$key};
    }
    return @result;
}
################################################################
# dump a normalization table in binary format
#
# The normalization form is taken from the file name, which must end in
# norm<form>.nls with <form> one of nfc, nfd, nfkc, nfkd or idna. The
# file holds, in this order: a 16-word name, the header, the table
# offsets, the combining class values, the compressed character
# properties, the decomposition hash index and data, the decomposed
# character data and the composition hash index and data. The file is
# also registered through add_registry_value().
sub dump_norm_table($)
{
    my $filename = shift;

    my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
    my %decomp = ( "nfc" => \@decomp_table,
                   "nfd" => \@decomp_table,
                   "nfkc" => \@decomp_compat_table,
                   "nfkd" => \@decomp_compat_table ,
                   "idna" => \@idna_decomp_table );

    my $type = $filename;
    $type =~ s!.*/norm(\w+)\.nls!$1!;

    # an unknown form would only produce undef arithmetic and a corrupt
    # table, so refuse it before creating any file
    die "$filename: unknown normalization form\n" unless defined $forms{$type};

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    binmode OUTPUT;  # the table is binary data
    print "Building $filename\n";

    my $compose = $forms{$type} & 1;
    my $compat = !!($forms{$type} & 4) + ($type eq "idna");

    my @version = split /\./, $UNIVERSION;

    # combining classes

    my @classes;
    my @class_values;

    foreach my $c (grep defined, @combining_class_table)
    {
        $classes[$c] = 1 if $c < 0x100;
    }
    # replace each used class by its index in the list of class values
    for (my $i = 0; $i < @classes; $i++)
    {
        next unless defined $classes[$i];
        $classes[$i] = @class_values;
        push @class_values, $i;
    }
    push @class_values, 0 if (@class_values % 2);
    die "too many classes" if @class_values >= 0x40;

    # character properties

    my @char_props;
    my @decomposed;
    my @comp_hash_table;
    my $comp_hash_size = $compose ? 254 : 0;

    for (my $i = 0; $i <= $MAX_CHAR; $i++)
    {
        next unless defined $combining_class_table[$i];
        if (defined $decomp{$type}->[$i])
        {
            my @dec = get_decomposition( $i, $decomp{$type} );
            if ($compose && (my @comp = get_composition( $i, $compat )))
            {
                my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
                push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );

                # use the first non-zero class of the decomposition
                my $val = 0;
                foreach my $d (@dec)
                {
                    $val = $combining_class_table[$d];
                    last if $val;
                }
                $char_props[$i] = $classes[$val];
            }
            else
            {
                $char_props[$i] = 0xbf;
            }
            @dec = compose_hangul( @dec ) if $compose;
            @dec = to_utf16( @dec );
            push @dec, 0 if @dec >= 7;
            $decomposed[$i] = \@dec;
        }
        else
        {
            if ($combining_class_table[$i] == 0x100)
            {
                $char_props[$i] = 0x7f;
            }
            elsif ($combining_class_table[$i])
            {
                $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
            }
            elsif ($type eq "idna" && defined $idna_disallowed[$i])
            {
                $char_props[$i] = 0xff;
            }
            else
            {
                $char_props[$i] = 0;
            }
        }
    }

    if ($compose)
    {
        for (my $i = 0; $i <= $MAX_CHAR; $i++)
        {
            my @comp = get_composition( $i, $compat );
            next unless @comp;
            if ($combining_class_table[$comp[1]])
            {
                $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
                $char_props[$comp[1]] |= 0x40;
            }
            else
            {
                $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
                $char_props[$comp[1]] |= 0xc0;
            }
        }
    }

    # surrogates
    foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
    foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }

    # Hangul
    if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
    elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
    foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }

    # invalid chars
    if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
    foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
    foreach my $i (0x00..0x10)
    {
        $char_props[($i << 16) | 0xfffe] = 0xff;
        $char_props[($i << 16) | 0xffff] = 0xff;
    }

    # decomposition hash table

    my @decomp_hash_table;
    my @decomp_hash_index;
    my @decomp_hash_data;
    my $decomp_hash_size = 944;

    # build string of character data, reusing substrings when possible
    # (longest sequences first, so that shorter ones can be found inside them)
    my $decomp_char_data = "";
    foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
    {
        my $str = pack "U*", @{$i};
        $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
    }
    for (my $i = 0; $i < @decomposed; $i++)
    {
        next unless defined $decomposed[$i];
        my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
        die "sequence not found" if $pos == -1;
        my $len = @{$decomposed[$i]};
        $len = 7 if $len > 7;
        my $hash = $i % $decomp_hash_size;
        # length in the top 3 bits, position in the character data below
        push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
    }
    for (my $i = 0; $i < $decomp_hash_size; $i++)
    {
        $decomp_hash_index[$i] = @decomp_hash_data / 2;
        next unless defined $decomp_hash_table[$i];
        if (@{$decomp_hash_table[$i]} == 1)
        {
            my $entry = $decomp_hash_table[$i]->[0];
            if ($char_props[$entry->[0]] == 0xbf)
            {
                # single entry with property 0xbf: store it directly in the index
                $decomp_hash_index[$i] = $entry->[1];
                next;
            }
        }
        foreach my $entry (@{$decomp_hash_table[$i]})
        {
            push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
        }
    }
    push @decomp_hash_data, 0, 0;

    # composition hash table

    my @comp_hash_index;
    my @comp_hash_data;
    if (@comp_hash_table)
    {
        for (my $i = 0; $i < $comp_hash_size; $i++)
        {
            $comp_hash_index[$i] = @comp_hash_data;
            push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
        }
        $comp_hash_index[$comp_hash_size] = @comp_hash_data;
        push @comp_hash_data, 0, 0, 0;
    }

    my $level1 = ($MAX_CHAR + 1) / 128;
    my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );

    my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
                   0, $decomp_hash_size, $comp_hash_size, 0 );
    my @tables = (0) x 8;

    # table offsets are counted in 16-bit words, so byte tables count for half
    $tables[0] = 16 + @header + @tables;
    $tables[1] = $tables[0] + @class_values / 2;
    $tables[2] = $tables[1] + $level1 / 2;
    $tables[3] = $tables[2] + (@rows - $level1) / 2;
    $tables[4] = $tables[3] + @decomp_hash_index;
    $tables[5] = $tables[4] + @decomp_hash_data;
    $tables[6] = $tables[5] + length $decomp_char_data;
    $tables[7] = $tables[6] + @comp_hash_index;

    print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "S<*", @tables;
    print OUTPUT pack "C*", @class_values;

    print OUTPUT pack "C*", @rows[0..$level1-1];
    print OUTPUT pack "C*", @rows[$level1..$#rows];
    print OUTPUT pack "S<*", @decomp_hash_index;
    print OUTPUT pack "S<*", @decomp_hash_data;
    print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
    print OUTPUT pack "S<*", @comp_hash_index;
    print OUTPUT pack "S<*", @comp_hash_data;

    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot write $filename: $!";
    save_file($filename);

    add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
}
################################################################
# output a codepage definition file from the global tables
#
# Writes nls/c_<codepage>.nls, as a double-byte table when lead bytes
# have been registered in @lead_bytes and as a single-byte table
# otherwise, then adds the file to the "Codepage" registry values.
sub output_codepage_file($)
{
    my $cp = shift;
    my $path = sprintf "nls/c_%03d.nls", $cp;

    open OUTPUT,">$path.new" or die "Cannot create $path";
    printf "Building %s\n", $path;

    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $cp );
    }
    else
    {
        dump_binary_sbcs_table( $cp );
    }

    close OUTPUT;
    save_file($path);

    my $value_name = sprintf( "%d", $cp );
    my $value_data = sprintf( "c_%03d.nls", $cp );
    add_registry_value( "Codepage", $value_name, $value_data );
}
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Resets the global codepage tables, fills them from the CODEPAGE,
# CPINFO, MBTABLE, GLYPHTABLE, WCTABLE, DBCSRANGE and DBCSTABLE entries
# of the file, then writes the result with output_codepage_file().
# For every table the first mapping seen for a given index wins.
# Dies on a line it does not recognize or when no CODEPAGE entry is found.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";
    my ($codepage, $count);
    my ($lb_cur, $lb_end);

    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # the first CPINFO field is matched but not needed
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                if (!--$count)
                {
                    # table complete: move to the next lead byte, and expect
                    # a new range once the current one is exhausted
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    die "$filename: no CODEPAGE entry found\n" unless defined $codepage;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
################################################################
# align a string length
#
# Returns the string padded with zero bytes up to the next multiple of
# the alignment; a string that is already aligned is returned as is.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . ("\0" x ($align - $excess));
}
################################################################
# pad a string with zeros
#
# Returns the string extended with zero bytes to the requested length;
# a string that is already long enough is returned unchanged.
sub pad_string($$)
{
    my ($pad, $str) = @_;
    my $missing = $pad - length($str);
    $str .= "\0" x $missing if $missing > 0;
    return $str;
}
################################################################
# pack a GUID string
#
# Converts a GUID in xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx form into its
# 16-byte binary form: a little-endian 32-bit value, two little-endian
# 16-bit values and eight bytes. Dies if the string does not contain a
# GUID. The global $_ is left untouched.
sub pack_guid($)
{
    my $guid = shift;
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
    # without this check a malformed string would be packed from stale captures
    die "invalid GUID '$guid'\n" unless @fields == 11;
    return pack "L<S<2C8", map { hex $_ } @fields;
}
################################################################
# comparison function for compression sort
#
# Sort callback working on $a and $b, which are array references:
# shorter rows sort first, rows of equal length are ordered by their
# elements 4 to 12, compared numerically one after the other.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    for (my $i = 4; !$order && $i <= 12; $i++)
    {
        $order = $a->[$i] <=> $b->[$i];
    }
    return $order;
}
################################################################
# build a binary sort keys table
#
# Parses the sort description file $download (fetched below $MSDATA) and
# writes $filename with a four-offset header followed by the sort key
# table (including the per-GUID exception tables), the case mapping
# tables, the character type table and the sort tables (GUIDs,
# expansions, compressions, multiple weights, jamo sort).
# The locale names found in the file are registered under "Sorting\Ids".
# Returns the packed character type table.
sub dump_sortkey_table($$)
{
    my ($filename, $download) = @_;

    my @keys;
    my ($part, $section, $subsection, $guid, $version, $ling_flag);
    my @multiple_weights;
    my @expansions;
    my @compressions;
    my %exceptions;
    my %guids;
    my %compr_flags;
    my %locales;
    my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
    my $jamostr = "";

    my $re_hex = '0x[0-9A-Fa-f]+';
    my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
    $guids{$default_guid} = { };

    my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );

    my $KEYS = open_data_file( $MSDATA, $download );

    # plain print: the file name must not be interpreted as a format string
    print "Building $filename\n";

    while (<$KEYS>)
    {
        s/\s*;.*$//;
        next if /^\s*$/;  # skip empty lines
        if (/^\s*(SORTKEY|SORTTABLES)/)
        {
            $part = $1;
            next;
        }
        if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
        {
            $part = $section = "";
            next;
        }
        if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
        {
            $section = $1;
            $guid = undef;
            next;
        }
        next unless $part;
        if ("$part.$section" eq "SORTKEY.DEFAULT")
        {
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $keys[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.RELEASE")
        {
            if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                $version = hex $1;
                next;
            }
            if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                # ignore for now
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
               "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
               "$part.$section" eq "SORTTABLES.INVERSECASING")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $guids{$guid}->{flags} |= $flags{$section};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
        {
            if (/^\s*(\d+)\s+(\d+)/)
            {
                push @multiple_weights, $1, $2;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                # the key of an expanded character holds the expansion index
                my $pos = scalar @expansions / 2;
                $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
                push @expansions, hex $2, hex $3;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                $keys[hex $1] = $keys[hex $2];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
            {
                if ($subsection || !$guid)  # start a new one
                {
                    $guid = lc $1;
                    $subsection = "";
                    $guids{$guid} = { } unless defined $guids{$guid};
                    $guids{$guid}->{flags} |= $flags{$2} if $2;
                    $guids{$guid}->{compr} = @compressions;
                    $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
                    $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
                    push @compressions, [ ];
                }
                else  # merge with current one
                {
                    $guids{lc $1} = { } unless defined $guids{lc $1};
                    $guids{lc $1}->{flags} |= $flags{$2} if $2;
                    $guids{lc $1}->{compr} = $guids{$guid}->{compr};
                    $compr_flags{lc $1} = $compr_flags{$guid};
                }
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
            {
                $subsection = $1;
                next;
            }
            if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
            {
                # a compression row is the four key weights followed by the characters
                my @comp = map { hex $_; } split(/\s+/,$1);
                push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
                # add compression flags
                $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $ling_flag = ($2 ? "+" : "-");
                $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
        {
            if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
            {
                $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
                next;
            }
        }
        die "$download: $part.$section: unrecognized line $_\n";
    }
    close $KEYS;

    # Sortkey table

    my $table;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
        $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
    }

    foreach my $id (sort keys %exceptions)
    {
        my $pos = length($table) / 4;
        my @exc = @{$exceptions{$id}};
        my @filled;
        my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
        my $guid = substr( $id, 0, -1 );
        $guids{$guid}->{$key} = $pos;
        $pos += 0x100;
        # always run the declaration: "my ... if COND" has undefined behaviour
        my @flags = defined $compr_flags{$guid} ? @{$compr_flags{$guid}} : ();
        # find the 256-character pages that need their own copy
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless defined $exc[$j] || defined $flags[$j];
            $filled[$j >> 8] = 1;
            $j |= 0xff;  # skip to the end of the page
        }
        # page index: a private page, or the page of the default table
        for (my $j = 0; $j < 0x100; $j++)
        {
            $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
            $pos += 0x100 if $filled[$j];
        }
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless $filled[$j >> 8];
            my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
            $k[3] |= $flags[$j] || 0;
            $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
        }
    }

    # Case mapping tables

    # standard table
    my @casemaps;
    my @upper = @toupper_table;
    my @lower = @tolower_table;
    remove_linguistic_mappings( \@upper, \@lower );
    $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );

    # linguistic table
    $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );

    # Turkish table
    @upper = @toupper_table;
    @lower = @tolower_table;
    $upper[ord 'i'] = 0x130;  # LATIN CAPITAL LETTER I WITH DOT ABOVE
    $lower[ord 'I'] = 0x131;  # LATIN SMALL LETTER DOTLESS I
    $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
    my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );

    # Char type table

    my @table;
    my $types = "";
    my %typestr;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my $str = pack "S<3",
                       ($category_table[$i] || 0) & 0xffff,
                       defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
                       ($category_table[$i] || 0) >> 16;

        # each distinct 6-byte type entry is stored only once
        if (!defined($typestr{$str}))
        {
            $typestr{$str} = length($types) / 6;
            $types .= $str;
        }
        $table[$i] = $typestr{$str};
    }

    my (@rows, @array, @data, @row_data);
    (@rows[0..4095], @data) = compress_array( 4096, 0, @table[0..65535] );
    (@array[0..255], @row_data) = compress_array( 256, 0, @rows );
    for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
    for (my $i = 0; $i < @row_data; $i++) { $row_data[$i] += 2 * @row_data + 512 - 4096; }

    my $arraystr = pack("S<*", @array, @row_data) . pack("C*", @data);
    my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
    $chartypes = align_string( 8, $chartypes . $types . $arraystr );

    # Sort tables

    # guids
    # "scalar keys": scalar(%hash) only returns the key count since Perl 5.26
    my $sorttables = pack "L<2", $version, scalar keys %guids;
    foreach my $id (sort keys %guids)
    {
        my %guid = %{$guids{$id}};
        my $flags = $guid{flags} || 0;
        my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
        $sorttables .= pack_guid($id) . pack "L<5",
                                             $flags,
                                             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
                                             $guid{except} || 0,
                                             $guid{ling_except} || 0,
                                             $map / 2;
    }

    # expansions
    $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;

    # compressions
    $sorttables .= pack "L<", scalar @compressions;
    my $rowstr = "";
    foreach my $c (@compressions)
    {
        my $pos = length($rowstr) / 2;
        my $min = 0xffff;
        my $max = 0;
        my @lengths = (0) x 8;
        foreach my $r (sort cmp_compression @{$c})
        {
            my @row = @{$r};
            # rows have 4 key weights plus at least 2 characters
            $lengths[scalar @row - 6]++;
            foreach my $val (@row[4..$#row])
            {
                $min = $val if $min > $val;
                $max = $val if $max < $val;
            }
            $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
            $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
        }
        $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
    }
    $sorttables .= $rowstr;

    # multiple weights
    $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );

    # jamo sort
    $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;

    # Locales

    add_registry_key( "Sorting\\Ids", "{$default_guid}" );
    foreach my $loc (sort keys %locales)
    {
        # skip specific locales that match more general ones
        my @parts = split /[-_]/, $loc;
        next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
        next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
        add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
    }

    # File header

    my @header;
    $header[0] = 16;
    $header[1] = $header[0] + length $table;
    $header[2] = $header[1] + length $casemaps;
    $header[3] = $header[2] + length $chartypes;

    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    binmode OUTPUT;  # the tables are binary data
    print OUTPUT pack "L<*", @header;
    print OUTPUT $table, $casemaps, $chartypes, $sorttables;
    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot write $filename: $!";
    save_file($filename);
    return $chartypes;
}
# locale entries indexed by locale name
my %lcnames;

# return the name of the parent of a locale
#
# Uses the "sparent" field of the locale entry if there is one, then its
# "parent" field, and otherwise strips the last "-xxx" component from
# the name. Returns "" when nothing can be stripped and undef for an
# empty or undefined name.
sub locale_parent($)
{
    my $name = shift;

    return undef unless $name;

    if (defined( my $info = $lcnames{$name} ))
    {
        foreach my $field ("sparent", "parent")
        {
            return $info->{$field} if defined $info->{$field};
        }
    }
    return $name =~ /(.*)-[0-9A-Za-z]+/ ? $1 : "";
}
# sort callback comparing the locale names in $a and $b, ignoring the
# case of ASCII letters and treating "_" like "-"
sub compare_locales
{
    my ($left, $right) = ($a, $b);
    tr/A-Z_/a-z-/ foreach $left, $right;
    return $left cmp $right;
}
# query an xml key
#
# Returns the text content of the first node found by the query, or
# undef when the query result is false. A message is printed on STDERR
# when the query finds more than one node.
sub xml_query($$)
{
    my ($xml, $query) = @_;
    my $nodes = $xml->find( $query );

    return undef unless $nodes;

    if (@{$nodes} > 1)
    {
        printf STDERR "multiple entries for %s\n", $query;
    }
    return $nodes->[0]->textContent;
}
# query an xml key for a locale, with fallback to the parents
#
# Walks from the locale up through locale_parent() and returns the text
# of the first match found in a known locale, skipping results whose
# text is the "↑↑↑" marker. Returns undef when no locale in the chain
# provides a value.
sub loc_query($$)
{
    my ($loc, $query) = @_;

    $loc = $lcnames{"en-US"} unless $loc->{name};  # fallback to "en-US" for root locale

    my $cur = $loc->{name};
    while (defined $cur)
    {
        if (defined $lcnames{$cur})
        {
            my $nodes = $lcnames{$cur}->{xml}->find( $query );
            if ($nodes)
            {
                printf STDERR "%s: multiple entries for %s\n", $cur, $query if (@{$nodes} > 1);
                my $text = $nodes->[0]->textContent;
                return $text unless $text eq "\x{2191}\x{2191}\x{2191}";  # "↑↑↑"
            }
        }
        $cur = locale_parent( $cur );
    }
    return undef;
}
# retrieve a locale field entry by going up the parents tree
#
# Lookup order: the locale itself, "en-US" when the locale has no name,
# the chain of aliases, then the parents reached through the "sparent"
# field or by stripping the last "-xxx" component of the name.
# Returns $def when no locale in the chain defines the field.
sub locale_entry($$$)
{
    my ($loc, $field, $def) = @_;

    return $loc->{$field} if defined $loc->{$field};

    if (!$loc->{name})  # fallback to "en-US" for root locale
    {
        $loc = $lcnames{"en-US"};
        return $loc->{$field} if defined $loc->{$field};
    }

    while (defined( my $alias = $loc->{alias} ))  # resolve aliases
    {
        $loc = $lcnames{$alias};
        return $loc->{$field} if defined $loc->{$field};
    }

    my $cur = $loc->{name};
    while ($cur)
    {
        my $info = $lcnames{$cur};
        if (defined $info && defined $info->{sparent})
        {
            $cur = $info->{sparent};
        }
        elsif ($cur =~ /(.*)-[0-9A-Za-z]+/)
        {
            $cur = $1;
        }
        else
        {
            last;  # no more parents
        }
        my $parent = $lcnames{$cur};
        return $parent->{$field} if defined $parent && defined $parent->{$field};
    }
    return $def;
}
# packed string data shared by the add_* helpers below
my $string_data = "";

# add a chunk of packed data to the string data, reusing an identical
# chunk that is already present
#
# Returns the position of the chunk counted in 16-bit units. Only
# matches starting on a 16-bit boundary are reused: a match at an odd
# byte offset would give a fractional position pointing into the middle
# of a unit.
sub add_str_data($)
{
    my $txt = shift;
    my $pos = index( $string_data, $txt );
    # skip matches that are not aligned on a 16-bit unit
    $pos = index( $string_data, $txt, $pos + 1 ) while $pos != -1 && $pos % 2;
    if ($pos == -1)
    {
        $pos = length($string_data);
        $string_data .= $txt;
    }
    return $pos / 2;
}
# add a string to the string data and return its position
#
# The string is stored as UTF-16LE, preceded by its length in 16-bit
# units and followed by a zero word. An undefined or empty string is
# not stored and yields position 0.
sub add_string($)
{
    my $str = shift;

    if (!defined($str) || $str eq "") { return 0; }

    my $utf16 = encode( "UTF16LE", $str );
    my $count = pack "S<", length($utf16) / 2;
    my $terminator = pack "S", 0;
    return add_str_data( $count . $utf16 . $terminator );
}
# add a list of 32-bit values to the string data, preceded by a 16-bit
# word holding twice the number of values, and return its position
sub add_fontsig(@)
{
    my @values = @_;
    my $count = pack "S<", scalar(@values) * 2;
    return add_str_data( $count . pack( "L<*", @values ) );
}
# add an array of strings to the string data and return its position
#
# Every string is added with add_string(); the array itself is stored
# as a 16-bit count followed by the 32-bit positions of the strings.
# An empty list is not stored and yields position 0.
sub add_strarray(@)
{
    return 0 unless @_;

    my @positions = map { add_string($_) } @_;
    return add_str_data( pack "S<L<*", scalar @positions, @positions );
}
# convert a number format pattern to a grouping string
#
# The result has one character per group, whose code is the group size:
# the group ending in "0" comes first, followed by the group before it
# when the pattern has two. Patterns without a recognized group give a
# single group of 3.
sub format_to_grouping($)
{
    my $format = shift;

    if (my ($secondary, $primary) = $format =~ /#,(#+),(#+0)/)
    {
        return chr(length($primary)) . chr(length($secondary));
    }
    if (my ($primary) = $format =~ /#,(#+0)/)
    {
        return chr(length($primary));
    }
    return chr(3);
}
# convert a currency format pattern to positive and negative format indices
#
# The pattern is "positive" or "positive;negative". Each part is matched
# against the lists below and the index of the first matching entry is
# used, or 0 when nothing matches. Without a negative part the negative
# index is derived from the positive one. $name is only useful for
# debugging. Returns the list (positive index, negative index).
sub parse_currency_format($$)
{
    my $name = shift;
    my ($posfmt, $negfmt) = split /;/, shift;
    my @pospatterns = ( "\xa4[^\xa0]*#",        # $1.1
                        "00[^\xa0]*\xa4",       # 1.1$
                        "\xa4.*\xa0.*#",        # $ 1.1
                        "00.*\xa0.*\xa4" );     # 1.1 $
    my @negpatterns = ( "\\(\xa4[^\xa0]*#",     # ($1.1)
                        "-\xa4[^\xa0]*#",       # -$1.1
                        "\xa4[^\xa0]*-#",       # $-1.1
                        "\xa4[^\xa0]*#.*00-",   # $1.1-
                        "00[^\xa0]*\xa4\\)",    # (1.1$)
                        "-#.*00[^\xa0]*\xa4",   # -1.1$
                        "00-[^\xa0]*\xa4",      # 1.1-$
                        "00[^\xa0]*\xa4-",      # 1.1$-
                        "-#.*00.*\xa0.*\xa4",   # -1.1 $
                        "-\xa4.*\xa0.*#",       # -$ 1.1
                        "00.*\xa0.*\xa4-",      # 1.1 $-
                        "\xa4.*\xa0.*#.*00-",   # $ 1.1-
                        "\xa4.*\xa0.*-#",       # $ -1.1
                        "00-.*\xa0.*\xa4",      # 1.1- $
                        "\\(\xa4.*\xa0.*#",     # ($ 1.1)
                        "00.*\xa0.*\xa4\\)");   # (1.1 $)

    # index of the first pattern matching the format, 0 if there is none
    my $first_match = sub
    {
        my ($fmt, @patterns) = @_;
        foreach my $idx (0 .. $#patterns)
        {
            return $idx if ($fmt =~ /$patterns[$idx]/);
        }
        return 0;
    };

    my $pos = $first_match->( $posfmt, @pospatterns );

    # default negative formats matching each of the positive ones
    my @default_neg = ( 1, 5, 9, 8 );
    my $neg = defined $negfmt ? $first_match->( $negfmt, @negpatterns ) : $default_neg[$pos];

    return ($pos, $neg);
}
# convert a percent format pattern to positive and negative format indices
#
# The positive index is the position of the first matching entry of the
# list below; the negative index is the same value, except that 3 maps
# to 7. When nothing matches a message is printed on STDERR and the
# size of the list is returned for both.
sub parse_percent_format($)
{
    my $fmt = shift;
    my @patterns = ( "0.+%",    # 1 %
                     "0%",      # 1%
                     "%#",      # %1
                     "%.+#" );  # % 1

    my $pos = 0;
    foreach my $pattern (@patterns)
    {
        last if ($fmt =~ /$pattern/);
        $pos++;
    }
    if ($pos == @patterns)
    {
        printf STDERR "unknown format '%s'\n", $fmt;
    }
    my $neg = ($pos == 3) ? 7 : $pos;
    return ($pos, $neg);
}
# convert a date format pattern
#
# Replaces, once each and in this order: a run of "G" by "gg", "LLLL"
# by "MMMM", "LLL" by "MMM", a run of "E" by "dddd" and a run of three
# or more "c" by "dddd". A single "y" that is not next to another "y"
# or preceded by "g" is then widened to "yyyy".
sub convert_date_format($)
{
    my $fmt = shift;

    # plain replacements, applied once each in this order
    my @rules = ( [ qr/G+/, "gg" ],
                  [ qr/LLLL/, "MMMM" ],
                  [ qr/LLL/, "MMM" ],
                  [ qr/E+/, "dddd" ],
                  [ qr/ccc+/, "dddd" ] );
    foreach my $rule (@rules)
    {
        my ($pattern, $replacement) = @{$rule};
        $fmt =~ s/$pattern/$replacement/;
    }

    # lone "y": in the middle, at the start and at the end of the format
    $fmt =~ s/([^gy])y([^y])/$1yyyy$2/;
    $fmt =~ s/^y([^y])/yyyy$1/;
    $fmt =~ s/([^gy])y$/$1yyyy/;
    return $fmt;
}
# convert a time format pattern
#
# Replaces the first run of "a" and then the first run of "B" by "tt".
sub convert_time_format($)
{
    my $fmt = shift;
    foreach my $marker (qr/a+/, qr/B+/)
    {
        $fmt =~ s/$marker/tt/;
    }
    return $fmt;
}
# load the ISO 639 code table
#
# Returns a hash mapping the two-letter code found in the fourth column
# of the table to the three-letter code of the third column. Lines that
# do not start with three three-letter codes followed by a two-letter
# code are ignored.
sub load_iso639()
{
    my %codes;
    my $DATA = open_data_file( $ISO639, "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3.tab" );
    while (<$DATA>)
    {
        my ($three_letter, $two_letter) = /^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/
            or next;
        $codes{$two_letter} = $three_letter;
    }
    close $DATA;
    return %codes;
}
4358 ################################################################
4359 # build the locale table for locale.nls
4360 sub build_locale_data()
4362 my $base = "cldr-release-$CLDRVERSION";
4363 my $suppl = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/supplementalData.xml" );
4364 my $subtags = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/likelySubtags.xml" );
4365 my $numbers = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/numberingSystems.xml" );
4366 # obsolete phone data from CLDR version 33
4367 my $phone = load_xml_data_file( $CLDR33DATA, "common/supplemental/telephoneCodeData.xml" );
4368 my %iso639 = load_iso639();
4369 $string_data = pack "S2", 0, 0; # offset 0 == empty string
4371 %lcnames = map { $_->{name} => $_ } @locales;
4373 my %lcids;
4374 foreach my $loc (@locales) { $lcids{$loc->{lcid}} = $loc if defined $loc->{lcid}; }
4376 my %days = ( "sun" => 0, "mon" => 1, "tue" => 2, "wed" => 3, "thu" => 4, "fri" => 5, "sat" => 6 );
4378 # assign locale parents
4380 foreach my $loc (@locales)
4382 next if $loc->{name} eq "";
4383 next if defined $loc->{parent};
4384 (my $unix_name = $loc->{name}) =~ s/-/_/g;
4385 my $parent = xml_query( $suppl, "/supplementalData/parentLocales/parentLocale[contains(concat(' ',\@locales,' '),' $unix_name ')]/\@parent" );
4386 if ($parent)
4388 $parent =~ s/_/-/g;
4389 $parent = "" if $parent eq "root";
4391 elsif ($loc->{name} =~ /(.*)-[0-9A-Za-z]+/) { $parent = $1; }
4392 $loc->{parent} = $parent || "";
4395 # load per-locale XML files
4397 foreach my $loc (@locales)
4399 next if defined $loc->{alias};
4400 (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
4401 $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
4402 my $xml = load_xml_data_file( $CLDRDATA, $file );
4403 $loc->{xml} = $xml;
4404 $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
4405 $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
4406 $loc->{script} = xml_query( $xml, "/ldml/identity/script/\@type" );
4407 if (!defined($loc->{territory}) && $loc->{name} =~ /-([A-Z]{2}|[0-9]{3})$/) { $loc->{territory} = $1; }
4408 if (!defined($loc->{script}) && $loc->{name} =~ /-([A-Z][a-z]{3})(-[A-Z]{2})?$/) { $loc->{script} = $1; }
4411 # assign a default territory and sort locale
4413 foreach my $loc (@locales)
4415 next if defined $loc->{alias};
4416 next if defined $loc->{territory};
4417 my $id = $loc->{sortlocale};
4418 if (defined $id && ($id =~ /[-_]([A-Z0-9]+)$/))
4420 $loc->{territory} = $1;
4421 next;
4423 my @children = grep /^$loc->{name}-[A-Z0-9]+$/ && !defined $lcnames{$_}->{alias}, keys %lcnames;
4424 if (@children == 1)
4426 $id = $children[0];
4428 else
4430 my $name = $loc->{file} || $loc->{name};
4431 $name =~ s/-(Arab|Cyrl|Deva|Guru|Hans|Hant|Latn|Tfng|Vaii)$//;
4432 $name =~ s/-/_/g;
4433 $id = xml_query( $subtags, "/supplementalData/likelySubtags/likelySubtag[\@from='$name']/\@to" );
4434 $id =~ s/_/-/g if $id;
4436 if ($id =~ /[-_]([A-Z0-9]+)$/)
4438 $loc->{territory} = $1;
4439 next if defined $loc->{sortlocale};
4440 next unless $id =~ /^$loc->{name}/;
4441 while (defined $lcnames{$id} && defined $lcnames{$id}->{alias}) { $id = $lcnames{$id}->{alias}; }
4442 $loc->{sortlocale} = $id if defined $lcnames{$id};
4443 next;
4445 print STDERR "no territory found for $loc->{name}\n";
4448 # fill geoid table
4450 my %geotable;
4451 foreach my $geo (@geoids)
4453 my $name = $geo->{name};
4454 next unless defined $name;
4455 $geo->{alias} = $geotable{$name} if defined $geotable{$name};
4456 $geotable{$name} ||= $geo;
4458 foreach my $loc (@locales)
4460 next if defined $loc->{alias};
4461 my $territory = $loc->{territory};
4462 $geotable{$territory} ||= { name => $territory };
4464 foreach my $name (keys %geotable)
4466 my $geo = $geotable{$name};
4467 $geo->{dialcode} = xml_query( $phone, "(/supplementalData/telephoneCodeData/codesByTerritory[\@territory='$name']/telephoneCountryCode)[1]/\@code" );
4468 if ($name =~ /\d+/)
4470 $geo->{uncode} = $name;
4471 next;
4473 $geo->{iso2} = $name;
4474 $geo->{iso3} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@alpha3");
4475 $geo->{uncode} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@numeric");
4476 $geo->{sintlsymbol} ||= xml_query( $suppl, "(/supplementalData/currencyData/region[\@iso3166='$name']/currency[not(\@to)])[1]/\@iso4217") || "XXX";
4477 $geo->{sintlsymbol} =~ s/XXX/XDR/;
4479 foreach my $geo (@geoids)
4481 $geo->{parentid} = $geotable{$geo->{parent}}->{id} if defined $geo->{parent};
4482 next if defined $geo->{iso2};
4483 next if defined $geo->{alias};
4484 next unless defined $geo->{uncode};
4485 my @contains;
4486 my $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and not(\@status)]/\@contains");
4487 push @contains, split /\s+/, $list if defined $list;
4488 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and \@status='deprecated']/\@contains");
4489 push @contains, split /\s+/, $list if defined $list;
4490 while (@contains)
4492 my $territory = pop @contains;
4493 if (defined $geotable{$territory})
4495 $geotable{$territory}->{parentid} ||= $geo->{id};
4497 elsif ($territory =~ /\d+/)
4499 # expand region recursively
4500 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$territory' and not(\@status)]/\@contains" );
4501 push @contains, split /\s+/, $list if defined $list;
4506 # assign calendars to their locale
4508 foreach my $cal (@calendars)
4510 next unless defined $cal->{locale};
4511 my $loc = $lcnames{$cal->{locale}};
4512 $loc->{calendar} = [ ] unless defined $loc->{calendar};
4513 push @{$loc->{calendar}}, $cal;
4516 # assign default lcid to aliases
4518 foreach my $loc (@locales)
4520 next unless defined $loc->{alias};
4521 next if defined $loc->{lcid};
4522 my $alias = $loc->{alias};
4523 my $lcid = $lcnames{$alias}->{lcid} || 0x1000;
4524 $loc->{lcid} = $lcid | 0x80000000;
4527 # assign sort aliases to parent locale
4529 foreach my $loc (@locales)
4531 next unless $loc->{name} =~ /_/;
4532 next unless defined $loc->{alias};
4533 my $alias = $loc->{alias};
4534 my $parent = $lcnames{$alias};
4535 my $basename = $parent->{name};
4536 while (1)
4538 @{$parent->{sortnames}}[($loc->{lcid} >> 16) - 1] = $loc->{name};
4539 $alias = locale_parent( $alias );
4540 last unless $alias && defined $lcnames{$alias};
4541 $parent = $lcnames{$alias};
4542 last if defined $parent->{sortbase} && $parent->{sortbase} ne $basename;
4543 $parent->{sortbase} = $basename;
4547 # assign an array index to all locales
4549 my $idx = 0;
4550 foreach my $loc (@locales)
4552 next if defined $loc->{alias};
4553 $loc->{idx} = $idx++;
4555 foreach my $loc (@locales)
4557 my $alias = $loc->{alias};
4558 next unless defined $alias;
4559 while (defined $lcnames{$alias}->{alias}) { $alias = $lcnames{$alias}->{alias}; }
4560 $loc->{idx} = $lcnames{$alias}->{idx};
4563 # output lcids table
4565 my $lcid_data = "";
4566 foreach my $id (sort { $a <=> $b } keys %lcids)
4568 my $loc = $lcids{$id};
4569 $lcid_data .= pack "L<S<2", $id, $loc->{idx}, add_string($loc->{name});
4572 # output lcnames table
4574 my $lcname_data = "";
4575 foreach my $name (sort compare_locales keys %lcnames)
4577 my $loc = $lcnames{$name};
4578 $lcname_data .= pack "S<2L<", add_string($name), $loc->{idx}, $loc->{lcid} || 0x1000;
4581 # output locales array
4583 my $locale_data = "";
4584 my $default_lcid = 0x8001;
4585 foreach my $loc (@locales)
4587 next if defined $loc->{alias};
4588 my $sname = $loc->{name};
4589 my $language = $loc->{language};
4590 my $territory = $loc->{territory};
4591 my $script = $loc->{script};
4592 my $neutral = ($sname && $sname !~ /-$territory/);
4593 my $sparent = $loc->{sparent} || (($sname =~ /(.*)-[0-9A-Za-z]+/) ? $1 : $loc->{parent});
4594 my $unique_lcid = $loc->{lcid};
4595 unless (defined $unique_lcid) { $unique_lcid = $default_lcid++; }
4596 my $geo = $geotable{$territory};
4597 my $territory_match = "contains(concat(' ',normalize-space(\@territories),' '),' $territory ')";
4599 # languages and scripts
4601 my $ssortlocale = $loc->{sortlocale} || ($neutral ? "$sname-$territory" : $sname);
4602 my $idefaultlanguage = defined $lcnames{$ssortlocale} ? $lcnames{$ssortlocale}->{lcid} : undef;
4603 $idefaultlanguage = $lcnames{"en-US"}->{lcid} unless $ssortlocale;
4604 (my $siso639langname = $sname) =~ s/-.*$//;
4605 my $siso639langname2 = $iso639{$siso639langname} || $siso639langname;
4606 my $sopentypelang = sprintf "%-4s", locale_entry( $loc, "sopentypelang", uc $siso639langname2 );
4607 my $sabbrevlangname = defined $loc->{lcid} ? locale_entry( $loc, "sabbrevlangname", uc $siso639langname2 ) : "ZZZ";
4608 my $siso3166ctryname2 = $geo->{iso3} || $geo->{uncode};
4609 my $senglanguage = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" ) || "";
4610 my $sengcountry = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" ) || "";
4611 my $snativelangname = loc_query( $loc, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" );
4612 my $snativectryname = loc_query( $loc, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" );
4613 $sengcountry =~ s/South Korea/Korea/;
4614 $snativelangname ||= $senglanguage;
4615 $snativectryname ||= $sengcountry;
4616 if ($script)
4618 my $engscript = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4619 my $nativescript = loc_query( $loc, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4620 $senglanguage .= " ($engscript)" if $engscript;
4621 $snativelangname .= " ($nativescript)" if $nativescript;
4623 my $sengdisplayname = $neutral ? $senglanguage : "$senglanguage ($sengcountry)";
4624 my $snativedisplayname = $neutral ? $snativelangname : "$snativelangname ($snativectryname)";
4625 $sengdisplayname =~ s/\) \(/, /;
4626 $snativedisplayname =~ s/\) \(/, /;
4627 my $sscripts = locale_entry( $loc, "sscripts", $script ) || xml_query( $suppl, "/supplementalData/languageData/language[\@type='$language' and not(\@alt)]/\@scripts" );
4628 $sscripts = (join ";", (sort split / /, ($sscripts || "Latn"))) . ";";
4629 my $ireadinglayout = locale_entry( $loc, "ireadinglayout", 0 );
4630 my $charlayout = loc_query( $loc, "/ldml/layout/orientation/characterOrder" );
4631 if ($charlayout eq "right-to-left")
4633 $ireadinglayout = 1;
4635 elsif ($charlayout eq "top-to-bottom")
4637 my $linelayout = loc_query( $loc, "/ldml/layout/orientation/lineOrder" );
4638 $ireadinglayout = $linelayout eq "right-to-left" ? 2 : 3;
4640 my $igeoid = $geo->{id} || 0;
4642 # numbers
4644 my $sdecimal = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/decimal" );
4645 my $slist = locale_entry( $loc, "slist", ";" );
4646 my $smondecimalsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyDecimal" ) || $sdecimal;
4647 my $sthousand = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/group" );
4648 $sthousand =~ s/\x{202f}/\x{00a0}/;
4649 my $smonthousandsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyGroup" ) || $sthousand;
4650 my $spositivesign = "";
4651 my $snegativesign = "-";
4652 my $spercent = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/percentSign" );
4653 my $snan = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/nan" );
4654 my $sposinfinity = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/infinity" );
4655 my $sneginfinity = $sposinfinity ? "-$sposinfinity" : "";
4656 my $sgrouping = format_to_grouping( loc_query( $loc, "/ldml/numbers/decimalFormats[\@numberSystem='latn']/decimalFormatLength[not(\@type)]/decimalFormat/pattern" ));
4657 my $percentformat = loc_query( $loc, "/ldml/numbers/percentFormats[\@numberSystem='latn']/percentFormatLength[not(\@type)]/percentFormat/pattern" );
4658 my $currencyformat = loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='accounting']/pattern" ) ||
4659 loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='standard']/pattern" );
4660 my $smongrouping = format_to_grouping( $currencyformat );
4661 my ($icurrency, $inegcurr) = parse_currency_format( $sname, $currencyformat );
4662 my ($ipospercent, $inegpercent) = parse_percent_format( $percentformat );
4663 my $native_numbering = loc_query( $loc, "/ldml/numbers/otherNumberingSystems/native" );
4664 my @snativedigits = split //, xml_query( $numbers, "/supplementalData/numberingSystems/numberingSystem[\@id='$native_numbering']/\@digits" );
4665 my $digitsubstitution = !(ord($snativedigits[0]) >= 0x600 && ord($snativedigits[0]) <= 0x6ff);
4666 my $measure = defined xml_query( $suppl, "/supplementalData/measurementData/measurementSystem[\@type='US' and $territory_match]" );
4667 my $papersize = defined xml_query( $suppl, "/supplementalData/measurementData/paperSize[\@type='US-Letter' and $territory_match]" );
4669 # currencies
4671 my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
4672 my $scurrency = $geo->{scurrency} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[\@alt='narrow']" );
4673 $scurrency ||= loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[not(\@alt)]" );
4674 $geo->{scurrency} = $scurrency if $scurrency;
4675 $scurrency ||= $sintlsymbol;
4676 my $sengcurrname = $loc->{sengcurrname} || loc_query( $lcnames{en}, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" );
4677 my $snativecurrname = $loc->{sengcurrname} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" ) || $sengcurrname;
4678 my $icurrdigits = xml_query( $suppl, "/supplementalData/currencyData/fractions/info[\@iso4217='$sintlsymbol']/\@digits" );
4679 $icurrdigits = 2 unless defined $icurrdigits;
4681 # calendars
4683 my $firstday = xml_query( $suppl, "/supplementalData/weekData/firstDay[not(\@alt) and $territory_match]/\@day" );
4684 my $ifirstdayofweek = $firstday ? $days{$firstday} : 1;
4685 my $firstweekofyear = (xml_query( $suppl, "/supplementalData/weekData/minDays[$territory_match]/\@count" ) || 0) == 4 ? 2 : 0;
4686 my $serastring = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/eras/eraAbbr/era[\@type='1' and not(\@alt)]" );
4687 my (@sdayname, @sabbrevdayname, @sshortestdayname);
4688 foreach my $d (sort { $days{$a} <=> $days{$b} } keys %days)
4690 my $n = $days{$d};
4691 my %name;
4692 foreach my $type (qw(wide abbreviated short))
4694 $name{$type} = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/days/dayContext[\@type='format']/dayWidth[\@type='$type']/day[\@type='$d' and not(\@alt)]" );
4696 push @sdayname, $name{wide};
4697 push @sabbrevdayname, $name{abbreviated} || $name{wide};
4698 push @sshortestdayname, $name{short} || $name{abbreviated} || $name{wide};
4700 my (@smonthname, @sabbrevmonthname, @sgenitivemonth, @sabbrevgenitivemonth);
4701 foreach my $n (1..13)
4703 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='wide']/month[\@type='$n']" );
4704 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4705 my $genitive = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n']" );
4706 my $abbrevgen = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4707 push @smonthname, $name || $genitive || "";
4708 push @sabbrevmonthname, $abbrev || $abbrevgen || $name || $genitive || "";
4709 push @sgenitivemonth, $genitive || "";
4710 push @sabbrevgenitivemonth, $abbrevgen || $genitive || "";
4712 @sgenitivemonth = () if join("|",@smonthname) eq join("|",@sgenitivemonth);
4713 @sabbrevgenitivemonth = () if join("|",@sabbrevmonthname) eq join("|",@sabbrevgenitivemonth);
4714 my %caltypes = ( "gregorian" => 1, "japanese" => 3, "chinese" => 4, "dangi" => 5, "islamic" => 6, "buddhist" => 7, "hebrew" => 8,
4715 "persian" => 22, "islamic-civil" => 23, "islamic-umalqura" => 23 );
4716 my $calpref = xml_query( $suppl, "/supplementalData/calendarPreferenceData/calendarPreference[$territory_match]/\@ordering" ) || "gregorian";
4717 my $icalendartype;
4718 my @scalnames;
4719 foreach my $c (split /\s+/, $calpref)
4721 next unless defined $caltypes{$c};
4722 $icalendartype .= chr($caltypes{$c});
4723 $scalnames[$caltypes{$c} - 1] = loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$c']" );
4726 # date/time formats
4728 my $s1159 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='am' and not(\@alt)]" );
4729 my $s2359 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='pm' and not (\@alt)]" );
4730 my $sshortestam = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='am' and not(\@alt)]" );
4731 my $sshortestpm = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='pm' and not (\@alt)]" );
4732 my @stimeformat = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='medium']/timeFormat/pattern[not(\@alt)]" ));
4733 push @stimeformat, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4734 pop @stimeformat if $stimeformat[0] eq $stimeformat[1];
4735 @stimeformat = map convert_time_format($_), @stimeformat;
4736 my @sshorttime = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='short']/timeFormat/pattern[not(\@alt)]" ));
4737 push @sshorttime, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hm' and not(\@alt)]" );
4738 pop @sshorttime if $sshorttime[0] eq $sshorttime[1];
4739 @sshorttime = map convert_time_format($_), @sshorttime;
4740 my @sshortdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4741 push @sshortdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
4742 @sshortdate = map convert_date_format($_), @sshortdate;
4743 my @slongdate = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" ));
4744 push @slongdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
4745 @slongdate = map convert_date_format($_), @slongdate;
4746 my @smonthday = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMd' and not(\@alt)]" ));
4747 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Md' and not(\@alt)]" );
4748 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMd' and not(\@alt)]" );
4749 @smonthday = map convert_date_format($_), @smonthday;
4750 my @syearmonth = map convert_date_format($_), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMM' and not(\@alt)]" );
4751 my @sduration = map convert_time_format( lc $_ ), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4752 my $srelativelongdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMEd' and not(\@alt)]" ) ||
4753 loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMEd' and not(\@alt)]" );
4754 $srelativelongdate = convert_date_format( $srelativelongdate );
4756 if (defined $loc->{calendar})
4758 foreach my $cal (@{$loc->{calendar}})
4760 $cal->{sshortdate} = \@sshortdate;
4761 $cal->{syearmonth} = \@syearmonth;
4762 $cal->{slongdate} = \@slongdate;
4763 $cal->{serastring} = [ $serastring ];
4764 $cal->{sdayname} = \@sdayname;
4765 $cal->{sabbrevdayname} = \@sabbrevdayname;
4766 $cal->{smonthname} = \@smonthname;
4767 $cal->{sabbrevmonthname} = \@sabbrevmonthname;
4768 $cal->{scalname} = $scalnames[$cal->{id}];
4769 $cal->{smonthday} = \@smonthday;
4770 $cal->{sshortestdayname} = \@sshortestdayname;
4771 $cal->{sabbreverastring} = [ $serastring ];
4772 $cal->{sshortestdayname} = \@sshortestdayname;
4773 $cal->{srelativelongdate} = $srelativelongdate;
4777 # codepages
4779 my %ansicpmap = ( 437 => 1252, 720 => 1256, 737 => 1253, 775 => 1257, 850 => 1252,
4780 852 => 1250, 855 => 1251, 866 => 1251, 857 => 1254, 862 => 1255 );
4781 my %maccpmap = ( 437 => 10000, 720 => 10004, 737 => 10006, 775 => 10029, 850 => 10000,
4782 852 => 10029, 855 => 10007, 857 => 10081, 862 => 10005, 866 => 10007,
4783 874 => 10021, 932 => 10001, 936 => 10008, 949 => 10003, 950 => 10002,
4784 1258 => 10000 );
4785 my %ebcdiccpmap = ( 437 => 37, 720 => 20420, 737 => 20273, 866 => 20880, 932 => 20290 );
4786 my %codepagemasks = ( 874 => [ 0x01000000, 0x00000000, 0x00000000, 0, 0x00010000, 0x00000000, 0x00010000, 0x00000000 ],
4787 932 => [ 0x00000000, 0x28c70000, 0x00000010, 0, 0x00020000, 0x00000000, 0x00020000, 0x00000000 ],
4788 936 => [ 0x00000000, 0x28010000, 0x00000002, 0, 0x00040000, 0x00000000, 0x00040000, 0x00000000 ],
4789 949 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00080000, 0x00000000, 0x00080000, 0x00000000 ],
4790 950 => [ 0x00000000, 0x28c10000, 0x00000012, 0, 0x00100000, 0x00000000, 0x00100000, 0x00000000 ],
4791 1258 => [ 0x2000000f, 0x00000000, 0x00000000, 0, 0x00000100, 0x00008000, 0x00000100, 0x00008000 ],
4792 866 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x00020000, 0x00000004, 0x02020000 ],
4793 862 => [ 0x00000800, 0x40000000, 0x00000000, 0, 0x00000020, 0x00200000, 0x00000020, 0x00200000 ],
4794 857 => [ 0x0000001f, 0x00000000, 0x00000000, 0, 0x00000010, 0x01000000, 0x00000010, 0x01000000 ],
4795 855 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x02000000, 0x00000004, 0x02000000 ],
4796 852 => [ 0x00000027, 0x00000000, 0x00000000, 0, 0x00000002, 0x04000000, 0x00000002, 0x04000000 ],
4797 775 => [ 0x00000007, 0x00000000, 0x00000000, 0, 0x00000080, 0x08000000, 0x00000080, 0x08000000 ],
4798 737 => [ 0x00000080, 0x00000000, 0x00000000, 0, 0x00000008, 0x10000000, 0x00000008, 0x10010000 ],
4799 720 => [ 0x00002000, 0x00000000, 0x00000000, 0, 0x00000040, 0x20000000, 0x00000040, 0x20080000 ],
4800 850 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x40000000, 0x0000019f, 0xdfd70000 ],
4801 437 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x80000000, 0x0000019f, 0xdfd70000 ],
4802 65001 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00000000, 0x00000000, 0x0000019f, 0xdfd70000 ] );
4803 my $oemcp = locale_entry( $loc, "oemcp", 65001 );
4804 my $maccp = locale_entry( $loc, "maccp", undef ) || $maccpmap{$oemcp} || 65001;
4805 my $ebcdiccp = locale_entry( $loc, "ebcdiccp", undef ) || $ebcdiccpmap{$oemcp} || 500;
4806 $ebcdiccp = 500 if (defined $loc->{oemcp} && $loc->{oemcp} == 65001) || (defined $loc->{maccp} && $loc->{maccp} == 65001);
4807 my $ansicp = $ansicpmap{$oemcp} || $oemcp;
4808 my @fontsig = (0) x 8;
4809 my $sig = locale_entry( $loc, "fontsig", [] );
4810 foreach my $i (0..7) { $fontsig[$i] |= $codepagemasks{$oemcp}->[$i]; }
4811 foreach my $i (0..$#{$sig}) { $fontsig[$i] |= $sig->[$i]; }
4812 $fontsig[3] |= 1 << 31;
4813 $fontsig[3] |= 1 << 27 if $ireadinglayout == 1;
4814 $fontsig[3] |= 1 << 28 if $ireadinglayout == 3;
4816 # special cases for invariant locale
4818 unless ($loc->{name})
4820 $siso639langname = "iv";
4821 $siso639langname2 = "ivl";
4822 $senglanguage = $snativelangname = "Invariant Language";
4823 $sengcountry = $snativectryname = "Invariant Country";
4824 $sengdisplayname = "Invariant Language (Invariant Country)";
4825 $snativedisplayname = "Invariant Language (Invariant Region)";
4826 $sengcurrname = $snativecurrname = "International Monetary Fund";
4827 $scurrency = "\x{00a4}";
4828 $ifirstdayofweek = 0;
4829 $igeoid = $geotable{"US"}->{id};
4830 @stimeformat = ("HH:mm:ss");
4831 @sshortdate = ("MM/dd/yyyy", "yyyy-MM-dd");
4832 @slongdate = ("dddd, dd MMMM yyyy");
4833 @syearmonth = ("yyyy MMMM");
4834 @smonthday = ("MMMM dd", "MMMM d", "M/d", "MMM d");
4835 @sshorttime = ("HH:mm", "hh:mm tt", "H:mm", "h:mm tt");
4836 $srelativelongdate = "dddd, MMMM dd";
4837 $sposinfinity = "Infinity";
4838 $sneginfinity = "-Infinity";
4839 $spositivesign = "+";
4840 $ipospercent = $inegpercent = 0;
4843 # output data
4845 $locale_data .= pack "L<2",
4846 add_string( $sname ), # name
4847 add_string( $sopentypelang ); # LOCALE_SOPENTYPELANGUAGETAG
4849 $locale_data .= pack "S<14",
4850 $loc->{lcid} || 0x1000, # LOCALE_ILANGUAGE
4851 $unique_lcid, # unique_lcid
4852 locale_entry( $loc, "idigits", 2 ), # LOCALE_IDIGITS
4853 locale_entry( $loc, "inegnumber", 1 ), # LOCALE_INEGNUMBER
4854 $icurrdigits, # LOCALE_ICURRDIGITS
4855 $icurrency, # LOCALE_ICURRENCY
4856 $inegcurr, # LOCALE_INEGCURR
4857 locale_entry( $loc, "ilzero", 1 ), # LOCALE_ILZERO
4858 !$neutral, # LOCALE_INEUTRAL
4859 $ifirstdayofweek, # LOCALE_IFIRSTDAYOFWEEK
4860 $firstweekofyear, # LOCALE_IFIRSTWEEKOFYEAR
4861 $geo->{dialcode} || 1 , # LOCALE_ICOUNTRY,
4862 $measure, # LOCALE_IMEASURE
4863 $digitsubstitution; # LOCALE_IDIGITSUBSTITUTION
4865 $locale_data .= pack "L<18",
4866 add_string( $sgrouping ), # LOCALE_SGROUPING
4867 add_string( $smongrouping ), # LOCALE_SMONGROUPING
4868 add_string( $slist ), # LOCALE_SLIST
4869 add_string( $sdecimal ), # LOCALE_SDECIMAL
4870 add_string( $sthousand ), # LOCALE_STHOUSAND
4871 add_string( $scurrency ), # LOCALE_SCURRENCY
4872 add_string( $smondecimalsep ), # LOCALE_SMONDECIMALSEP
4873 add_string( $smonthousandsep ), # LOCALE_SMONTHOUSANDSEP
4874 add_string( $spositivesign ), # LOCALE_SPOSITIVESIGN
4875 add_string( $snegativesign ), # LOCALE_SNEGATIVESIGN
4876 add_string( $s1159 ), # LOCALE_S1159
4877 add_string( $s2359 ), # LOCALE_S2359
4878 add_strarray( @snativedigits ), # LOCALE_SNATIVEDIGITS
4879 add_strarray( @stimeformat ), # LOCALE_STIMEFORMAT
4880 add_strarray( @sshortdate ), # LOCALE_SSHORTDATE
4881 add_strarray( @slongdate ), # LOCALE_SLONGDATE
4882 add_strarray( @syearmonth ), # LOCALE_SYEARMONTH
4883 add_strarray( @sduration ); # LOCALE_SDURATION
4885 $locale_data .= pack "S<8",
4886 $idefaultlanguage || 0x1000, # LOCALE_IDEFAULTLANGUAGE
4887 $ansicp, # LOCALE_IDEFAULTANSICODEPAGE
4888 $oemcp, # LOCALE_IDEFAULTCODEPAGE
4889 $maccp, # LOCALE_IDEFAULTMACCODEPAGE
4890 $ebcdiccp, # LOCALE_IDEFAULTEBCDICCODEPAGE
4891 $igeoid < 65536 ? $igeoid : 39070, # old_geoid
4892 $papersize ? 1 : 9, # LOCALE_IPAPERSIZE
4893 0; # FIXME # islamic_cal
4895 $locale_data .= pack "L<24",
4896 add_string( $icalendartype ), # LOCALE_ICALENDARTYPE
4897 add_string( $sabbrevlangname ), # LOCALE_SABBREVLANGNAME
4898 add_string( $siso639langname ), # LOCALE_SISO639LANGNAME
4899 add_string( $senglanguage ), # LOCALE_SENGLANGUAGE
4900 add_string( $snativelangname ), # LOCALE_SNATIVELANGNAME
4901 add_string( $sengcountry ), # LOCALE_SENGCOUNTRY
4902 add_string( $snativectryname ), # LOCALE_SNATIVECTRYNAME
4903 add_string( $siso3166ctryname2 ), # LOCALE_SABBREVCTRYNAME
4904 add_string( $territory ), # LOCALE_SISO3166CTRYNAME
4905 add_string( $sintlsymbol ), # LOCALE_SINTLSYMBOL
4906 add_string( $sengcurrname ), # LOCALE_SENGCURRNAME
4907 add_string( $snativecurrname ), # LOCALE_SNATIVECURRNAME
4908 add_fontsig( @fontsig ), # LOCALE_FONTSIGNATURE
4909 add_string( $siso639langname2 ), # LOCALE_SISO639LANGNAME2
4910 add_string( $siso3166ctryname2 ), # LOCALE_SISO3166CTRYNAME2
4911 add_string( $sparent ), # LOCALE_SPARENT
4912 add_strarray( @sdayname ), # LOCALE_SDAYNAME
4913 add_strarray( @sabbrevdayname ), # LOCALE_SABBREVDAYNAME
4914 add_strarray( @smonthname ), # LOCALE_SMONTHNAME
4915 add_strarray( @sabbrevmonthname ), # LOCALE_SABBREVMONTHNAME
4916 add_strarray( @sgenitivemonth ), # LOCALE_SGENITIVEMONTH
4917 add_strarray( @sabbrevgenitivemonth ), # LOCALE_SABBREVGENITIVEMONTH
4918 add_strarray( @scalnames ), # LOCALE_SCALNAMES
4919 add_strarray( @{$loc->{sortnames}} ); # LOCALE_SSORTNAMES
4921 $locale_data .= pack "S<6",
4922 $inegpercent, # LOCALE_INEGATIVEPERCENT
4923 $ipospercent, # LOCALE_IPOSITIVEPERCENT
4924 0, # unknown
4925 $ireadinglayout, # LOCALE_IREADINGLAYOUT
4926 0x2a, # unknown
4927 0x2a; # unknown
4929 $locale_data .= pack "L<24",
4930 0, # unknown
4931 add_string( $sengdisplayname ), # LOCALE_SENGLISHDISPLAYNAME
4932 add_string( $snativedisplayname ), # LOCALE_SNATIVEDISPLAYNAME
4933 add_string( $spercent ), # LOCALE_SPERCENT
4934 add_string( $snan ), # LOCALE_SNAN
4935 add_string( $sposinfinity ), # LOCALE_SPOSINFINITY
4936 add_string( $sneginfinity ), # LOCALE_SNEGINFINITY
4937 0, # unknown
4938 add_string( $serastring ), # CAL_SERASTRING
4939 add_string( $serastring ), # CAL_SABBREVERASTRING
4940 0, # unknown
4941 add_string( $ssortlocale ), # LOCALE_SCONSOLEFALLBACKNAME
4942 add_strarray( @sshorttime ), # LOCALE_SSHORTTIME
4943 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
4944 0, # unknown
4945 add_string( $ssortlocale ), # LOCALE_SSORTLOCALE
4946 add_string( "0409:00000409" ), # FIXME # LOCALE_SKEYBOARDSTOINSTALL
4947 add_string( $sscripts ), # LOCALE_SSCRIPTS
4948 add_string( $srelativelongdate ), # LOCALE_SRELATIVELONGDATE
4949 $igeoid, # LOCALE_IGEOID
4950 add_string( $sshortestam || "a" ), # LOCALE_SSHORTESTAM
4951 add_string( $sshortestpm || "p" ), # LOCALE_SSHORTESTPM
4952 add_strarray( @smonthday ), # LOCALE_SMONTHDAY
4953 add_string( "k0-windows-us" ) # FIXME # keyboard_layout
4956 # output language groups
4958 my %groups;
4959 add_registry_key( "Locale", "00000409" );
4960 foreach my $loc (@locales)
4962 next unless defined $loc->{lcid};
4963 next if ($loc->{lcid} & 0x80000000);
4964 next if !defined($loc->{alias}) && $loc->{name} !~ /-$loc->{territory}/; # skip neutral locales
4965 my $group = locale_entry( $loc, "group", 1 );
4966 my $name = sprintf( "%08x", $loc->{lcid} );
4967 my $val = sprintf( "%x", $group );
4968 add_registry_value( "Locale", $name, $val ) unless ($loc->{lcid} & 0x000f0000);
4969 add_registry_value( "Locale\\Alternate Sorts", $name, $val ) if $loc->{name} =~ /_/;
4970 $groups{$val} = 1;
4972 foreach my $group (keys %groups) { add_registry_value( "Language Groups", $group, "1" ); }
4974 # output calendar data
4976 my $calendar_data = "";
4977 foreach my $cal (@calendars)
4979 my $scalname = $cal->{name};
4980 my $iyearoffsetrange = 0;
4981 my $itwodigityearmax = $cal->{itwodigityearmax};
4982 my @sshortdate;
4983 my @syearmonth;
4984 my @slongdate;
4985 my @serastring;
4986 my @sdayname;
4987 my @sabbrevdayname;
4988 my @smonthname;
4989 my @sabbrevmonthname;
4990 my @smonthday;
4991 my @sabbreverastring;
4992 my @sshortestdayname;
4994 my $type = $cal->{type};
4995 if (defined $cal->{locale} && defined $type)
4997 my $loc = $lcnames{$cal->{locale}};
4998 my $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4999 push @sshortdate, $fmt if $fmt;
5000 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMd' and not(\@alt)]" );
5001 push @sshortdate, $fmt if $fmt;
5002 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
5003 push @sshortdate, $fmt if $fmt;
5004 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMMMd' and not(\@alt)]" );
5005 push @sshortdate, $fmt if $fmt;
5006 @sshortdate = map convert_date_format($_), @sshortdate;
5007 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" );
5008 push @slongdate, $fmt if $fmt;
5009 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
5010 push @slongdate, $fmt if $fmt;
5011 @slongdate = map convert_date_format($_), @slongdate;
5013 foreach my $n (1..13)
5015 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n' and not(\@yeartype)]" );
5016 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n' and not(\@yeartype)]" );
5017 push @smonthname, $name || "";
5018 push @sabbrevmonthname, $abbrev || $name || "";
5021 $scalname ||= loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$type']" );
5022 if (defined $cal->{eras})
5024 my @eras;
5025 my $idx = 1;
5026 foreach my $era (@{$cal->{eras}})
5028 my $start = xml_query( $suppl, "/supplementalData/calendarData/calendar[\@type='$type']/eras/era[\@type='$era']/\@start" );
5029 next unless $start =~ /^(-?\d+)-(\d+)-(\d+)/;
5030 my ($year, $mon, $day, $zero, $first) = ($1, $2, $3, $1 - 1, 1);
5031 if ($zero < 0)
5033 $first -= $zero;
5034 $year = 1;
5035 $itwodigityearmax = 2049 - $zero;
5037 unshift @eras, pack( "S<8", 6, $idx++, $year, $mon, $day, $zero, $first, 0 );
5038 push @serastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraAbbr/era[\@type='$era']" );
5039 push @sabbreverastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraNarrow/era[\@type='$era']" );
5041 $iyearoffsetrange = add_str_data( pack "S<L<*", scalar @eras, map { add_str_data($_); } @eras );
5045 @sshortdate = @{$cal->{sshortdate}} if defined $cal->{sshortdate} && !@sshortdate;
5046 @syearmonth = @{$cal->{syearmonth}} if defined $cal->{syearmonth};
5047 @slongdate = @{$cal->{slongdate}} if defined $cal->{slongdate} && !@slongdate;
5048 @serastring = @{$cal->{serastring}} if defined $cal->{serastring} && !@serastring;
5049 @sdayname = @{$cal->{sdayname}} if defined $cal->{sdayname};
5050 @sabbrevdayname = @{$cal->{sabbrevdayname}} if defined $cal->{sabbrevdayname};
5051 @smonthname = @{$cal->{smonthname}} if defined $cal->{smonthname} && !join("",@smonthname);
5052 @sabbrevmonthname = @{$cal->{sabbrevmonthname}} if defined $cal->{sabbrevmonthname} && !join("",@sabbrevmonthname);
5053 @smonthday = @{$cal->{smonthday}} if defined $cal->{smonthday};
5054 @sabbreverastring = @{$cal->{sabbreverastring}} if defined $cal->{sabbreverastring} && !@sabbreverastring;
5055 @sshortestdayname = @{$cal->{sshortestdayname}} if defined $cal->{sshortestdayname};
5056 my $srelativelongdate = $cal->{srelativelongdate};
5058 @serastring = ("A.D.") unless @serastring;
5059 @sabbreverastring = ("AD") unless @sabbreverastring;
5061 if ($cal->{id} != 1) # calendar 1 is a placeholder, information is fetched from locale instead
5063 @sshortdate = ("") unless @sshortdate;
5064 @syearmonth = ("") unless @syearmonth;
5065 @slongdate = ("") unless @slongdate;
5066 @sdayname = ("") x 7 unless @sdayname;
5067 @sabbrevdayname = ("") x 7 unless @sabbrevdayname;
5068 @sshortestdayname = ("") x 7 unless @sshortestdayname;
5069 @smonthname = ("") x 13 unless @smonthname;
5070 @sabbrevmonthname = ("") x 13 unless @sabbrevmonthname;
5071 @smonthday = ("") unless @smonthday;
5074 $calendar_data .= pack "S<2L<17",
5075 $cal->{id}, # CAL_ICALINTVALUE
5076 $itwodigityearmax || 99, # CAL_ITWODIGITYEARMAX
5077 add_strarray( @sshortdate ), # CAL_SSHORTDATE
5078 add_strarray( @syearmonth ), # CAL_SYEARMONTH
5079 add_strarray( @slongdate ), # CAL_SLONGDATE
5080 add_strarray( @serastring ), # CAL_SERASTRING
5081 $iyearoffsetrange, # CAL_IYEAROFFSETRANGE
5082 add_strarray( @sdayname ), # CAL_SDAYNAME
5083 add_strarray( @sabbrevdayname ), # CAL_SABBREVDAYNAME
5084 add_strarray( @smonthname ), # CAL_SMONTHNAME
5085 add_strarray( @sabbrevmonthname ), # CAL_SABBREVMONTHNAME
5086 add_string( $scalname ), # CAL_SCALNAME
5087 add_strarray( @smonthday ), # CAL_SMONTHDAY
5088 add_strarray( @sabbreverastring ), # CAL_SABBREVERASTRING
5089 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5090 add_string( $srelativelongdate ); # CAL_SRELATIVELONGDATE
5093 # output locale header
5095 my $nb_lcids = scalar keys %lcids;
5096 my $nb_locales = scalar grep { !defined $_->{alias} } @locales;
5097 my $nb_lcnames = scalar keys %lcnames;
5098 my $locale_size = length($locale_data) / $nb_locales;
5099 my $nb_calendars = scalar @calendars;
5100 my $calendar_size = length($calendar_data) / $nb_calendars;
5101 my $lcids_offset = 19 * 4; # size of header
5102 my $lcnames_offset = $lcids_offset + length $lcid_data;
5103 my $locales_offset = $lcnames_offset + length $lcname_data;
5104 my $calendar_offset = $locales_offset + length $locale_data;
5105 my $strings_offset = $calendar_offset + length $calendar_data;
5107 my $locale_header = pack "L<7S<4L<S<2L<3S<2L<4",
5108 8, # offset
5110 7, # version
5111 0x5344534e, # magic
5112 0, 0, 0,
5114 $nb_lcids,
5115 $nb_locales,
5116 $locale_size,
5117 $locales_offset,
5118 $nb_lcnames,
5120 $lcids_offset,
5121 $lcnames_offset,
5123 $nb_calendars,
5124 $calendar_size,
5125 $calendar_offset,
5126 $strings_offset,
5127 0, 0;
5129 return align_string( 4, $locale_header . $lcid_data . $lcname_data . $locale_data . $calendar_data . $string_data );
################################################################
# build the charmaps table for locale.nls
#
# Returns the concatenation of the binary case tables produced by
# dump_binary_case_table() for, in this order: digit folding, CJK
# compatibility, hiragana, katakana, halfwidth, fullwidth, traditional
# Chinese and simplified Chinese.  The file-level tables are completed
# with a few hard-coded entries before being dumped.
sub build_charmaps_data()
{
    # MAP_FOLDDIGITS
    my @ascii = map { ord($_) } 0 .. 9;  # code points of '0' .. '9'
    $digitmap_table[0x3007] = $ascii[0];                        # Ideographic Zero
    $digitmap_table[0x0c78 + $_] = $ascii[$_] foreach 0 .. 3;   # Telugu Fraction Digits
    $digitmap_table[0x0c7b + $_] = $ascii[$_] foreach 1 .. 3;   # Telugu Fraction Digits
    $digitmap_table[0x3020 + $_] = $ascii[$_] foreach 1 .. 9;   # Hangzhou Numerals
    $digitmap_table[0xa8e0 + $_] = $ascii[$_] foreach 0 .. 9;   # Combining Devanagari Digits
    $digitmap_table[0x10106 + $_] = $ascii[$_] foreach 1 .. 9;  # Aegean Numbers
    @digitmap_table[0x10320, 0x10321] = @ascii[1, 5];           # Old Italic Numerals
    my $fold_digits = dump_binary_case_table( @digitmap_table );

    # CJK compatibility map
    my $cjk_compat = dump_binary_case_table( @cjk_compat_table );

    # LCMAP_HIRAGANA/KATAKANA
    # each katakana code point is its hiragana counterpart plus 0x60
    my @hiragana = (0x3041 .. 0x3096, 0x309d .. 0x309e);
    my @katakana = map { $_ + 0x60 } @hiragana;
    my (@hiragana_table, @katakana_table);
    @hiragana_table[@katakana] = @hiragana;
    @katakana_table[@hiragana] = @katakana;
    my $to_hiragana = dump_binary_case_table( @hiragana_table );
    my $to_katakana = dump_binary_case_table( @katakana_table );

    # LCMAP_HALFWIDTH/FULLWIDTH
    @halfwidth_table[0x2018, 0x2019] = (0x0027) x 2;
    @halfwidth_table[0x201c, 0x201d] = (0x0022) x 2;
    @halfwidth_table[0x309b, 0x309c] = (0xff9e, 0xff9f);
    @fullwidth_table[0x309b, 0x309c] = (0x3099, 0x309a);
    my $to_halfwidth = dump_binary_case_table( @halfwidth_table );
    my $to_fullwidth = dump_binary_case_table( @fullwidth_table );

    # LCMAP_TRADITIONAL/SIMPLIFIED_CHINESE
    my $to_traditional = dump_binary_case_table( @chinese_traditional_table );
    my $to_simplified = dump_binary_case_table( @chinese_simplified_table );

    # FIXME: some more unknown tables here

    return "" . $fold_digits . $cjk_compat . $to_hiragana . $to_katakana
              . $to_halfwidth . $to_fullwidth . $to_traditional . $to_simplified;
}
################################################################
# build the geoids table for locale.nls
#
# Returns the table as a binary string: a header of seven 32-bit fields,
# one record per @geoids entry, then a name index sorted by name.
# @geoids itself is left unmodified.
sub build_geoids_data()
{
    my $data = "";
    my %index;
    my $idx = 0;
    # [0],[1]: signature, reads "geo" as UTF-16LE once packed little-endian
    # [2]: total size, [3]: header size, [4]: number of records,
    # [5]: offset of the name index, [6]: number of index entries
    my @geo_header = (0x00650067, 0x0000006f, 0, 4 * 7, scalar @geoids, 0, 0);

    foreach my $entry (@geoids)
    {
        # An alias entry keeps its own id and takes every other field from
        # its target.  The loop variable aliases the @geoids element, so the
        # target is held in a separate variable instead of being assigned
        # over the table entry.
        my $id = $entry->{id};
        my $geo = defined $entry->{alias} ? $entry->{alias} : $entry;
        my $lat = "0.000";
        my $long = "0.000";
        my $iso2 = $geo->{iso2} || "XX";
        my $iso3 = $geo->{iso3} || "XX";
        my $isregion = $geo->{region} || (defined $geo->{uncode} && !defined $geo->{iso2});
        my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
        my $scurrency = $geo->{scurrency} || "\x{00a4}";

        $data .= pack( "L<", $id );
        $data .= pad_string( 24, encode( "UTF16LE", $lat ));
        $data .= pad_string( 24, encode( "UTF16LE", $long ));
        $data .= pack( "L<2", $isregion ? 14 : 16, $geo->{parentid} || 39070 );
        $data .= pad_string( 8, encode( "UTF16LE", $iso2 ));
        $data .= pad_string( 8, encode( "UTF16LE", $iso3 ));
        $data .= pack( "S<2", $geo->{uncode} || 0, $geo->{dialcode} || 0 );
        $data .= pad_string( 8, encode( "UTF16LE", $sintlsymbol ));
        $data .= pad_string( 16, encode( "UTF16LE", $scurrency ));
        # the name comes from the alias target, the record number from this entry
        $index{$geo->{name}} = $idx if $geo->{name};
        $idx++;
    }
    # "XX" is indexed as a second name for the "001" record
    $index{"XX"} = $index{"001"};

    # the name index starts right after the records
    $geo_header[5] = $geo_header[3] + length $data;
    $geo_header[6] = scalar keys %index;

    foreach my $name (sort keys %index)
    {
        $data .= pad_string( 8, encode( "UTF16LE", $name ));
        $data .= pack "L<", $index{$name};
    }

    $geo_header[2] = $geo_header[3] + length $data;
    return pack( "L<7", @geo_header ) . $data;
}
################################################################
# build a binary locale table
#
# $filename:  path of the file to generate; the data is written to
#             "$filename.new" and then handed to save_file()
# $chartypes: binary string stored as the first section after the header
sub dump_locales($$)
{
    my ($filename, $chartypes) = @_;

    # the name is passed as an argument so that a '%' in it is printed literally
    printf "Building %s\n", $filename;

    my $locale_data = build_locale_data();
    my $charmaps_data = build_charmaps_data();
    my $geoids_data = build_geoids_data();
    my $scripts_data = "";  # FIXME

    # eight 32-bit fields; the ones not assigned below stay 0
    my @header = ( 0 ) x 8;
    $header[0] = 4 * scalar @header;                  # chartypes offset
    $header[4] = $header[0] + length $chartypes;      # locales offset
    $header[5] = $header[4] + length $locale_data;    # charmaps offset
    $header[6] = $header[5] + length $charmaps_data;  # geoids offset
    $header[7] = $header[6] + length $geoids_data;    # scripts offset

    open my $output, ">", "$filename.new" or die "Cannot create $filename: $!";
    binmode $output;  # binary table: no I/O layer may translate the bytes
    print {$output} pack( "L<*", @header ),
                    $chartypes, $locale_data, $charmaps_data, $geoids_data, $scripts_data
        or die "Cannot write $filename.new: $!";
    # buffered write errors only show up when the handle is closed
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# build the script to create registry keys
#
# $filename: path of the script to generate; the text is written to
#            "$filename.new" and then handed to save_file()
# %keys:     maps a backslash-separated key path, relative to
#            HKLM\SYSTEM\CurrentControlSet\Control\Nls, to an array ref whose
#            first element is the default string value of the last subkey
#            (a false value means none) and whose remaining elements are
#            emitted, sorted, as "val ..." lines inside that subkey
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;  # current nesting depth, 4 spaces per level

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename: $!";
    print {$output} "HKLM\n{\n";
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf {$output} "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }
    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        foreach my $i (0 .. $#subkeys)
        {
            # names containing whitespace are quoted; only the last subkey gets the default value
            printf {$output} "%*s%s%s\n%*s{\n", 4 * $indent, "",
                   $subkeys[$i] =~ /\s/ ? "'$subkeys[$i]'" : $subkeys[$i],
                   $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # the value is passed as an argument so that a '%' in it is printed literally
        foreach my $v (sort @vals) { printf {$output} "%*sval %s\n", 4 * $indent, "", $v; }
        # close every subkey opened for this path
        foreach my $subkey (@subkeys) { printf {$output} "%*s}\n", 4 * --$indent, ""; }
    }
    # close the fixed Nls path and the HKLM block
    while ($indent) { printf {$output} "%*s}\n", 4 * --$indent, ""; }
    # buffered write errors only show up when the handle is closed
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# save a file if modified
#
# "$file.new" replaces $file, unless $file already exists with the same
# contents; in that case "$file.new" is deleted and $file is left untouched.
# Dies if "$file.new" cannot be moved into place.
sub save_file($)
{
    my $file = shift;

    # compare in-process instead of running "cmp" through the shell, so that
    # names containing spaces or shell metacharacters are handled correctly
    require File::Compare;
    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!";
    }
}
################################################################
# main routine

# move up one directory when started from the directory that holds
# make_unicode, so that the relative paths below resolve
if (-f "./make_unicode") { chdir ".."; }

load_data();

# generated C tables
foreach my $target ("dlls/gdi32/uniscribe/direction.c", "dlls/dwrite/direction.c")
{
    dump_bidi_dir_table( $target );
}
foreach my $target ("dlls/gdi32/uniscribe/mirror.c", "dlls/dwrite/mirror.c")
{
    dump_mirroring( $target );
}
foreach my $target ("dlls/gdi32/uniscribe/bracket.c", "dlls/dwrite/bracket.c")
{
    dump_bracket( $target );
}
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $target ("dlls/gdi32/uniscribe/linebreak.c", "dlls/dwrite/linebreak.c")
{
    dump_linebreak( $target );
}
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
foreach my $vertical ([ "dlls/win32u/vertical.c", 1 ], [ "dlls/wineps.drv/vertical.c", 0 ])
{
    dump_vertical( $vertical->[0], $vertical->[1] );
}

# generated NLS files
dump_intl_nls("nls/l_intl.nls");
foreach my $target ("nls/normnfc.nls", "nls/normnfd.nls", "nls/normnfkc.nls",
                    "nls/normnfkd.nls", "nls/normidna.nls")
{
    dump_norm_table( $target );
}
# the value returned for the sort table is passed on to the locale table
my $chartypes = dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
dump_locales( "nls/locale.nls", $chartypes );
foreach my $file (@allfiles)
{
    dump_msdata_codepage( $file );
}
dump_eucjp_codepage();

# registry script
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
5344 # Local Variables:
5345 # compile-command: "./make_unicode"
5346 # End: