ntdll: Move some duplicated locale definitions to a common header.
[wine.git] / tools / make_unicode
blob520cbe0f1a5ae8f7edefb1dc76e4e5029cc00417
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
23 use XML::LibXML;
24 use Encode;
# base URLs for www.unicode.org files
my $UNIVERSION = "14.0.0";
my $UNIDATA = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
my $JISDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
my $KSCDATA = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
# same https scheme as the other www.unicode.org URLs above
my $REPORTS = "https://www.unicode.org/reports";

# Microsoft code page data; the archive name really contains spaces
my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";

# CLDR archives: the tagged release named by $CLDRVERSION, plus a fixed 33.0 archive
my $CLDRVERSION = "41";
my $CLDRDATA = "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip";
my $CLDR33DATA = "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip";

# ISO 639-3 code tables, identified by their release date stamp
my $ISO639VERSION = "20220120";
my $ISO639 = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip";

# Sort keys file
# NOTE(review): presumably a path relative to $REPORTS -- confirm at the use site
my $SORTKEYS = "tr10/allkeys.txt";

# Default char for undefined mappings
my $DEF_CHAR = ord '?';

# Last valid Unicode character
my $MAX_CHAR = 0x10ffff;
# Code page description files, given by code page number.  Each number
# expands to "CodpageFiles/<number>.txt"; the directory spelling and the
# leading zero of 037 are part of the file names and must stay as they are.
my @allfiles = map { "CodpageFiles/$_.txt" } qw(
    037 437 500 708 720 737 775
    850 852 855 857
    860 861 862 863 864 865 866 869
    874 875
    932 936 949 950
    1026
    1250 1251 1252 1253 1254 1255 1256 1257 1258
    1361
    10000 10001 10002 10003 10004 10005 10006 10007 10008
    10010 10017 10021 10029 10079 10081 10082
    20127 20866 21866
    28591 28592 28593 28594 28595 28596 28597 28598 28599
    28603 28605
);
# Character type flags, keyed by name.  Each flag is a single bit, except
# "alpha", which additionally carries bit 31 (0x80000000).
my %ctype =
(
    # CT_CTYPE1
    "upper"         => 1 << 0,
    "lower"         => 1 << 1,
    "digit"         => 1 << 2,
    "space"         => 1 << 3,
    "punct"         => 1 << 4,
    "cntrl"         => 1 << 5,
    "blank"         => 1 << 6,
    "xdigit"        => 1 << 7,
    "alpha"         => (1 << 8) | 0x80000000,
    "defin"         => 1 << 9,
    # CT_CTYPE3 in high 16 bits
    "nonspacing"    => 1 << 16,
    "diacritic"     => 1 << 17,
    "vowelmark"     => 1 << 18,
    "symbol"        => 1 << 19,
    "katakana"      => 1 << 20,
    "hiragana"      => 1 << 21,
    "halfwidth"     => 1 << 22,
    "fullwidth"     => 1 << 23,
    "ideograph"     => 1 << 24,
    "kashida"       => 1 << 25,
    "lexical"       => 1 << 26,
    "highsurrogate" => 1 << 27,
    "lowsurrogate"  => 1 << 28,
);
# Numeric codes for the bracket type letters "o" and "c".
# NOTE(review): presumably the Unicode Bidi_Paired_Bracket_Type values -- confirm
my %bracket_types = ( 'o' => 0, 'c' => 1 );
# Indic category names mapped to consecutive codes, starting at 0x0000
# with "Other".  The order of this list defines the numeric values, so new
# names may only be appended.
my %indic_types = do {
    my $code = 0x0000;
    map { ( $_ => $code++ ) } qw(
        Other
        Bindu
        Visarga
        Avagraha
        Nukta
        Virama
        Vowel_Independent
        Vowel_Dependent
        Vowel
        Consonant_Placeholder
        Consonant
        Consonant_Dead
        Consonant_Succeeding_Repha
        Consonant_Subjoined
        Consonant_Medial
        Consonant_Final
        Consonant_Head_Letter
        Modifying_Letter
        Tone_Letter
        Tone_Mark
        Register_Shifter
        Consonant_Preceding_Repha
        Pure_Killer
        Invisible_Stacker
        Gemination_Mark
        Cantillation_Mark
        Non_Joiner
        Joiner
        Number_Joiner
        Number
        Brahmi_Joining_Number
        Consonant_With_Stacker
        Consonant_Prefixed
        Syllable_Modifier
        Consonant_Killer
        Consonant_Initial_Postfixed
    );
};
# Matra position names mapped to consecutive codes, starting at 0x01 with
# "Right".  The order of this list defines the numeric values.
my %matra_types = do {
    my $code = 0x01;
    map { ( $_ => $code++ ) } qw(
        Right
        Left
        Visual_Order_Left
        Left_And_Right
        Top
        Bottom
        Top_And_Bottom
        Top_And_Right
        Top_And_Left
        Top_And_Left_And_Right
        Bottom_And_Right
        Top_And_Bottom_And_Right
        Overstruck
        Invisible
        Bottom_And_Left
        Top_And_Bottom_And_Left
    );
};
# Break class abbreviations mapped to consecutive codes, starting at
# 0x0001 with "BK".  The order of this list defines the numeric values
# (eight names per row, so each full row spans eight codes).
my %break_types = do {
    my $code = 0x0001;
    map { ( $_ => $code++ ) } qw(
        BK CR LF CM SG GL CB SP
        ZW NL WJ JL JV JT H2 H3
        XX OP CL CP QU NS EX SY
        IS PR PO NU AL ID IN HY
        BB BA SA AI B2 HL CJ RI
        EB EM ZWJ
    );
};
# Numeric codes for the vertical type abbreviations R, U, Tr and Tu.
# NOTE(review): presumably Unicode Vertical_Orientation values -- confirm
my %vertical_types =
(
    'R'  => 0,
    'U'  => 1,
    'Tr' => 2,
    'Tu' => 3,
);
# Character type flags for each two-letter general category.  Every
# category carries "defin"; the helper ORs in the extra %ctype flags named
# in its argument list.
my %categories = do {
    my $defined_with = sub {
        my $bits = $ctype{"defin"};
        $bits |= $ctype{$_} for @_;
        return $bits;
    };
    (
        "Lu" => $defined_with->("alpha", "upper"),          # Letter, Uppercase
        "Ll" => $defined_with->("alpha", "lower"),          # Letter, Lowercase
        "Lt" => $defined_with->("alpha", "upper", "lower"), # Letter, Titlecase
        "Mn" => $defined_with->("nonspacing"),              # Mark, Non-Spacing
        "Mc" => $defined_with->(),                          # Mark, Spacing Combining
        "Me" => $defined_with->(),                          # Mark, Enclosing
        "Nd" => $defined_with->("digit"),                   # Number, Decimal Digit
        "Nl" => $defined_with->("alpha"),                   # Number, Letter
        "No" => $defined_with->(),                          # Number, Other
        "Zs" => $defined_with->("space"),                   # Separator, Space
        "Zl" => $defined_with->("space"),                   # Separator, Line
        "Zp" => $defined_with->("space"),                   # Separator, Paragraph
        "Cc" => $defined_with->("cntrl"),                   # Other, Control
        "Cf" => $defined_with->("cntrl"),                   # Other, Format
        "Cs" => $defined_with->(),                          # Other, Surrogate
        "Co" => $defined_with->(),                          # Other, Private Use
        "Cn" => $defined_with->(),                          # Other, Not Assigned
        "Lm" => $defined_with->("alpha"),                   # Letter, Modifier
        "Lo" => $defined_with->("alpha"),                   # Letter, Other
        "Pc" => $defined_with->("punct"),                   # Punctuation, Connector
        "Pd" => $defined_with->("punct"),                   # Punctuation, Dash
        "Ps" => $defined_with->("punct"),                   # Punctuation, Open
        "Pe" => $defined_with->("punct"),                   # Punctuation, Close
        "Pi" => $defined_with->("punct"),                   # Punctuation, Initial quote
        "Pf" => $defined_with->("punct"),                   # Punctuation, Final quote
        "Po" => $defined_with->("punct"),                   # Punctuation, Other
        "Sm" => $defined_with->("symbol"),                  # Symbol, Math
        "Sc" => $defined_with->("symbol"),                  # Symbol, Currency
        "Sk" => $defined_with->("symbol"),                  # Symbol, Modifier
        "So" => $defined_with->("symbol"),                  # Symbol, Other
    );
};
# a few characters need additional categories that cannot be determined automatically
#
# Keys are %ctype flag names; each value is a reference to the list of code
# points that get that flag.  Every code point appears at most once per list
# (the "fullwidth" list used to name 0x30ad twice).
my %special_categories =
(
    "xdigit"     => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
                      0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space"      => [ 0x09..0x0d, 0x85 ],
    "blank"      => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl"      => [ 0x070f, 0x200c, 0x200d,
                      0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                      0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                      0xfff9, 0xfffa, 0xfffb ],
    "punct"      => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
                      0xd7, 0xf7 ],
    "digit"      => [ 0xb2, 0xb3, 0xb9 ],
    "lower"      => [ 0xaa, 0xba, 0x2071, 0x207f ],
    "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
                      0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
    "diacritic"  => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
    "symbol"     => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
                      0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
                      0x02b9..0x02ba, 0x02c6..0x02cf ],
    "halfwidth"  => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
    "fullwidth"  => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
                      0x30a1..0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
                      0x3131..0x3164 ],
    "ideograph"  => [ 0x3006..0x3007 ],
    "lexical"    => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
                      0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
                      0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
                      0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
                      0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
                      0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
                      0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
                      0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
                      0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
                      0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
    "kashida"    => [ 0x0640 ],
);
# Direction codes keyed by bidi class abbreviation, listed in ascending
# code order.
my %directions =
(
    'L'   => 1,     # Left-to-Right
    'R'   => 2,     # Right-to-Left
    'EN'  => 3,     # European Number
    'ES'  => 4,     # European Number Separator
    'ET'  => 5,     # European Number Terminator
    'AN'  => 6,     # Arabic Number
    'CS'  => 7,     # Common Number Separator
    'B'   => 8,     # Paragraph Separator
    'S'   => 9,     # Segment Separator
    'WS'  => 10,    # Whitespace
    'ON'  => 11,    # Other Neutrals
    'AL'  => 12,    # Right-to-Left Arabic
    'NSM' => 13,    # Non-Spacing Mark
    'BN'  => 14,    # Boundary Neutral
    # The embedding, override, pop-format and isolate classes all share 15:
    # LRE/RLE (Embedding), LRO/RLO (Override), PDF (Pop Directional Format),
    # LRI/RLI/FSI (Isolates), PDI (Pop Directional Isolate)
    ( map { ( $_ => 15 ) } qw(LRE LRO RLE RLO PDF LRI RLI FSI PDI) ),
);
# C2_* type codes keyed by bidi class abbreviation.  Several classes
# collapse onto the same code.
my %c2_types =
(
    'BN'  => 0,     # C2_NOTAPPLICABLE
    'L'   => 1,     # C2_LEFTTORIGHT
    'R'   => 2,     # C2_RIGHTTOLEFT
    'AL'  => 2,     # C2_RIGHTTOLEFT
    'EN'  => 3,     # C2_EUROPENUMBER
    'ES'  => 4,     # C2_EUROPESEPARATOR
    'ET'  => 5,     # C2_EUROPETERMINATOR
    'AN'  => 6,     # C2_ARABICNUMBER
    'CS'  => 7,     # C2_COMMONSEPARATOR
    'B'   => 8,     # C2_BLOCKSEPARATOR
    'S'   => 9,     # C2_SEGMENTSEPARATOR
    'WS'  => 10,    # C2_WHITESPACE
    # Everything else is C2_OTHERNEUTRAL
    ( map { ( $_ => 11 ) } qw(NSM ON LRE LRO RLE RLO PDF LRI RLI FSI PDI) ),
);
# Bidi class abbreviations mapped to consecutive codes, starting at 0 with
# "ON".  The order of this list defines the numeric values.
my %bidi_types = do {
    my $code = 0;
    map { ( $_ => $code++ ) } (
        "ON",    # Other Neutrals
        "L",     # Left-to-Right
        "R",     # Right-to-Left
        "AN",    # Arabic Number
        "EN",    # European Number
        "AL",    # Right-to-Left Arabic
        "NSM",   # Non-Spacing Mark
        "CS",    # Common Number Separator
        "ES",    # European Number Separator
        "ET",    # European Number Terminator
        "BN",    # Boundary Neutral
        "S",     # Segment Separator
        "WS",    # Whitespace
        "B",     # Paragraph Separator
        "RLO",   # Right-to-Left Override
        "RLE",   # Right-to-Left Embedding
        "LRO",   # Left-to-Right Override
        "LRE",   # Left-to-Right Embedding
        "PDF",   # Pop Directional Format
        "LRI",   # Left-to-Right Isolate
        "RLI",   # Right-to-Left Isolate
        "FSI",   # First Strong Isolate
        "PDI",   # Pop Directional Isolate
    );
};
# Joining type codes.  "D" and "C" deliberately share code 3; the two
# Syriac groups are keyed by their full names.
my %joining_types =
(
    'U'           => 0,    # Non_Joining
    'L'           => 1,    # Left_Joining
    'R'           => 2,    # Right_Joining
    'D'           => 3,    # Dual_Joining
    'C'           => 3,    # Join_Causing
    'ALAPH'       => 4,    # Syriac ALAPH
    'DALATH RISH' => 5,    # Syriac DALATH RISH group
    'T'           => 6,    # Transparent
);
441 my @locales =
443 { name => "", lcid => 0x0000007f, file => "root", territory => "IV", sabbrevlangname => "IVL", sopentypelang =>"dflt" },
444 { name => "aa", dir => "seed", sopentypelang => "AFR" },
445 { name => "aa-DJ", dir => "seed" },
446 { name => "aa-ER", dir => "seed" },
447 { name => "aa-ET", dir => "seed" },
448 { name => "af", lcid => 0x00000036, oemcp => 850, sabbrevlangname => "AFK", sopentypelang => "AFK" },
449 { name => "af-NA" },
450 { name => "af-ZA", lcid => 0x00000436 },
451 { name => "agq" },
452 { name => "agq-CM" },
453 { name => "ak", sopentypelang => "TWI" },
454 { name => "ak-GH" },
455 { name => "am", lcid => 0x0000005e, sabbrevlangname => "AMH" },
456 { name => "am-ET", lcid => 0x0000045e },
457 { name => "ar", lcid => 0x00000001, territory => "SA", oemcp => 720, group => 13 },
458 { name => "ar-001" },
459 { name => "ar-AE", lcid => 0x00003801, sabbrevlangname => "ARU" },
460 { name => "ar-BH", lcid => 0x00003c01, sabbrevlangname => "ARH" },
461 { name => "ar-DJ" },
462 { name => "ar-DZ", lcid => 0x00001401, sabbrevlangname => "ARG" },
463 { name => "ar-EG", lcid => 0x00000c01, sabbrevlangname => "ARE" },
464 { name => "ar-EH" },
465 { name => "ar-ER" },
466 { name => "ar-IL" },
467 { name => "ar-IQ", lcid => 0x00000801, sabbrevlangname => "ARI" },
468 { name => "ar-JO", lcid => 0x00002c01, sabbrevlangname => "ARJ" },
469 { name => "ar-KM" },
470 { name => "ar-KW", lcid => 0x00003401, sabbrevlangname => "ARK" },
471 { name => "ar-LB", lcid => 0x00003001, sabbrevlangname => "ARB" },
472 { name => "ar-LY", lcid => 0x00001001, sabbrevlangname => "ARL" },
473 { name => "ar-MA", lcid => 0x00001801, sabbrevlangname => "ARM" },
474 { name => "ar-MR" },
475 { name => "ar-OM", lcid => 0x00002001, sabbrevlangname => "ARO" },
476 { name => "ar-PS" },
477 { name => "ar-QA", lcid => 0x00004001, sabbrevlangname => "ARQ" },
478 { name => "ar-SA", lcid => 0x00000401, sabbrevlangname => "ARA" },
479 { name => "ar-SD" },
480 { name => "ar-SO" },
481 { name => "ar-SS" },
482 { name => "ar-SY", lcid => 0x00002801, sabbrevlangname => "ARS" },
483 { name => "ar-TD" },
484 { name => "ar-TN", lcid => 0x00001c01, sabbrevlangname => "ART" },
485 { name => "ar-YE", lcid => 0x00002401, sabbrevlangname => "ARY" },
486 { name => "arn", lcid => 0x0000007a, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sabbrevlangname => "MPD", sopentypelang => "MAP" },
487 { name => "arn-CL", lcid => 0x0000047a, dir => "seed" },
488 { name => "arn-Latn", alias => "arn" },
489 { name => "arn-Latn-CL", alias => "arn-CL" },
490 { name => "as", lcid => 0x0000004d, slist => ",", group => 15 },
491 { name => "as-IN", lcid => 0x0000044d },
492 { name => "asa" },
493 { name => "asa-TZ" },
494 { name => "ast" },
495 { name => "ast-ES" },
496 { name => "az", lcid => 0x0000002c, oemcp => 857, ebcdiccp => 20905, group => 2 },
497 { name => "az-Cyrl", lcid => 0x0000742c, oemcp => 866, ebcdiccp => 20880, group => 5, sabbrevlangname => "AZC" },
498 { name => "az-Cyrl-AZ", lcid => 0x0000082c },
499 { name => "az-Latn", lcid => 0x0000782c },
500 { name => "az-Latn-AZ", lcid => 0x0000042c },
501 { name => "ba", lcid => 0x0000006d, oemcp => 866, group => 5, dir => "seed", sabbrevlangname => "BAS", sopentypelang => "BSH" },
502 { name => "ba-Cyrl", alias => "ba" },
503 { name => "ba-Cyrl-RU", alias => "ba-RU" },
504 { name => "ba-RU", lcid => 0x0000046d, dir => "seed" },
505 { name => "bas" },
506 { name => "bas-CM" },
507 { name => "be", lcid => 0x00000023, oemcp => 866, ebcdiccp => 500, group => 5 },
508 { name => "be-BY", lcid => 0x00000423 },
509 { name => "bem" },
510 { name => "bem-ZM" },
511 { name => "bez" },
512 { name => "bez-TZ" },
513 { name => "bg", lcid => 0x00000002, oemcp => 866, ebcdiccp => 21025, group => 5, sabbrevlangname => "BGR", sopentypelang => "BGR" },
514 { name => "bg-BG", lcid => 0x00000402 },
515 { name => "bin", lcid => 0x00000066, oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "EDO" },
516 { name => "bin-NG", lcid => 0x00000466, file => "bin", dir => "exemplars" },
517 { name => "bm", sopentypelang => "BMB" },
518 { name => "bm-Latn", file => "bm" },
519 { name => "bm-Latn-ML", file => "bm_ML" },
520 { name => "bm-ML", alias => "bm-Latn-ML" },
521 { name => "bn", lcid => 0x00000045, slist => ",", group => 15, sabbrevlangname => "BNB" },
522 { name => "bn-BD", lcid => 0x00000845 },
523 { name => "bn-IN", lcid => 0x00000445, sabbrevlangname => "BNG" },
524 { name => "bo", lcid => 0x00000051, slist => ",", group => 15, sabbrevlangname => "BOB", sopentypelang => "TIB" },
525 { name => "bo-CN", lcid => 0x00000451 },
526 { name => "bo-IN", slist => "," },
527 { name => "bo-Tibt", alias => "bo" },
528 { name => "bo-Tibt-CN", alias => "bo-CN" },
529 { name => "bo-Tibt-IN", alias => "bo-IN" },
530 { name => "br", lcid => 0x0000007e, oemcp => 850, ebcdiccp => 20297 },
531 { name => "br-FR", lcid => 0x0000047e },
532 { name => "br-Latn", alias => "br" },
533 { name => "br-Latn-FR", alias => "br-FR" },
534 { name => "brx" },
535 { name => "brx-IN" },
536 { name => "bs", lcid => 0x0000781a, oemcp => 852, maccp => 10082, ebcdiccp => 870, group => 2, sabbrevlangname => "BSB" },
537 { name => "bs-Cyrl", lcid => 0x0000641a, oemcp => 855, group => 5, sabbrevlangname => "BSC" },
538 { name => "bs-Cyrl-BA", lcid => 0x0000201a },
539 { name => "bs-Latn", lcid => 0x0000681a },
540 { name => "bs-Latn-BA", lcid => 0x0000141a },
541 { name => "byn", dir => "seed", sopentypelang => "BIL" },
542 { name => "byn-ER", dir => "seed" },
543 { name => "ca", lcid => 0x00000003, oemcp => 850 },
544 { name => "ca-AD", maccp => 65001 },
545 { name => "ca-ES", lcid => 0x00000403 },
546 { name => "ca-ES-valencia", lcid => 0x00000803, file => "ca_ES_VALENCIA", sabbrevlangname => "VAL" },
547 { name => "ca-FR", maccp => 65001 },
548 { name => "ca-IT", maccp => 65001 },
549 { name => "ccp" },
550 { name => "ccp-BD", alias => "ccp-Cakm-BD" },
551 { name => "ccp-Cakm", file => "ccp" },
552 { name => "ccp-Cakm-BD", file => "ccp_BD" },
553 { name => "ccp-Cakm-IN", file => "ccp_IN" },
554 { name => "ccp-IN", alias => "ccp-Cakm-IN" },
555 { name => "ce" },
556 { name => "ce-RU" },
557 { name => "ceb" },
558 { name => "ceb-Latn", file => "ceb" },
559 { name => "ceb-Latn-PH", file => "ceb_PH" },
560 { name => "ceb-PH", alias => "ceb-Latn-PH" },
561 { name => "cgg" },
562 { name => "cgg-UG" },
563 { name => "chr", lcid => 0x0000005c, slist => ",", sabbrevlangname => "CRE" },
564 { name => "chr-Cher", lcid => 0x00007c5c, file => "chr" },
565 { name => "chr-Cher-US", lcid => 0x0000045c, file => "chr_US" },
566 { name => "chr-US", alias => "chr-Cher-US" },
567 { name => "ckb", alias => "ku" },
568 { name => "ckb-IQ", alias => "ku-Arab-IQ" },
569 { name => "ckb-IR", alias => "ku-Arab-IR" },
570 { name => "co", lcid => 0x00000083, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
571 { name => "co-FR", lcid => 0x00000483, dir => "seed" },
572 { name => "co-Latn", alias => "co" },
573 { name => "co-Latn-FR", alias => "co-FR" },
574 { name => "cs", lcid => 0x00000005, oemcp => 852, group => 2, sabbrevlangname => "CSY", sopentypelang => "CSY" },
575 { name => "cs-CZ", lcid => 0x00000405 },
576 { name => "cu", dir => "seed", sopentypelang => "CSL" },
577 { name => "cu-RU", dir => "seed" },
578 { name => "cy", lcid => 0x00000052, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "CYM", sopentypelang => "WEL" },
579 { name => "cy-GB", lcid => 0x00000452 },
580 { name => "da", lcid => 0x00000006, oemcp => 850, ebcdiccp => 20277 },
581 { name => "da-DK", lcid => 0x00000406 },
582 { name => "da-GL", maccp => 65001 },
583 { name => "dav" },
584 { name => "dav-KE" },
585 { name => "de", lcid => 0x00000007, oemcp => 850, ebcdiccp => 20273 },
586 { name => "de-AT", lcid => 0x00000c07, sabbrevlangname => "DEA" },
587 { name => "de-BE" },
588 { name => "de-CH", lcid => 0x00000807, sabbrevlangname => "DES" },
589 { name => "de-DE", lcid => 0x00000407 },
590 { name => "de-DE_phoneb", lcid => 0x00010407, alias => "de-DE" },
591 { name => "de-DE-u-co-phonebk", alias => "de-DE_phoneb" },
592 { name => "de-IT", oemcp => 65001 },
593 { name => "de-LI", lcid => 0x00001407, sabbrevlangname => "DEC" },
594 { name => "de-LU", lcid => 0x00001007, sabbrevlangname => "DEL" },
595 { name => "dje", sopentypelang => "DJR" },
596 { name => "dje-NE" },
597 { name => "doi" },
598 { name => "doi-IN" },
599 { name => "dsb", lcid => 0x00007c2e, sparent => "hsb", oemcp => 850, ebcdiccp => 870, sabbrevlangname => "DSB", sopentypelang => "LSB" },
600 { name => "dsb-DE", lcid => 0x0000082e },
601 { name => "dua" },
602 { name => "dua-CM" },
603 { name => "dv", lcid => 0x00000065, slist => "\x{060c}", group => 13, dir => "seed" },
604 { name => "dv-MV", lcid => 0x00000465, dir => "seed" },
605 { name => "dyo" },
606 { name => "dyo-SN" },
607 { name => "dz", sopentypelang => "DZN" },
608 { name => "dz-BT", lcid => 0x00000c51, sabbrevlangname => "ZZZ" },
609 { name => "ebu" },
610 { name => "ebu-KE" },
611 { name => "ee" },
612 { name => "ee-GH" },
613 { name => "ee-TG" },
614 { name => "el", lcid => 0x00000008, oemcp => 737, group => 4 },
615 { name => "el-CY" },
616 { name => "el-GR", lcid => 0x00000408 },
617 { name => "en", lcid => 0x00000009, oemcp => 437, slist => ",", sabbrevlangname => "ENU" },
618 { name => "en-001", oemcp => 850 },
619 { name => "en-029", lcid => 0x00002409, file => "en", oemcp => 850, sabbrevlangname => "ENB" },
620 { name => "en-150", oemcp => 65001 },
621 { name => "en-AE", lcid => 0x00004c09, oemcp => 65001, sabbrevlangname => "ZZZ" },
622 { name => "en-AG", oemcp => 850 },
623 { name => "en-AI", oemcp => 850 },
624 { name => "en-AS", oemcp => 850 },
625 { name => "en-AT", oemcp => 65001 },
626 { name => "en-AU", lcid => 0x00000c09, oemcp => 850, sabbrevlangname => "ENA" },
627 { name => "en-BB", oemcp => 850 },
628 { name => "en-BE", oemcp => 850 },
629 { name => "en-BI", oemcp => 65001 },
630 { name => "en-BM", oemcp => 850 },
631 { name => "en-BS", oemcp => 850 },
632 { name => "en-BW", oemcp => 850 },
633 { name => "en-BZ", lcid => 0x00002809, oemcp => 850, sabbrevlangname => "ENL" },
634 { name => "en-CA", lcid => 0x00001009, oemcp => 850, ebcdiccp => 37, sabbrevlangname => "ENC" },
635 { name => "en-CC", oemcp => 850 },
636 { name => "en-CH", oemcp => 65001 },
637 { name => "en-CK", oemcp => 850 },
638 { name => "en-CM", oemcp => 850 },
639 { name => "en-CX", oemcp => 850 },
640 { name => "en-CY", oemcp => 65001 },
641 { name => "en-DE", oemcp => 65001 },
642 { name => "en-DG", oemcp => 850 },
643 { name => "en-DK", oemcp => 65001 },
644 { name => "en-DM", oemcp => 850 },
645 { name => "en-ER", oemcp => 850 },
646 { name => "en-FI", oemcp => 65001 },
647 { name => "en-FJ", oemcp => 850 },
648 { name => "en-FK", oemcp => 850 },
649 { name => "en-FM", oemcp => 850 },
650 { name => "en-GB", lcid => 0x00000809, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "ENG" },
651 { name => "en-GD", oemcp => 850 },
652 { name => "en-GG", oemcp => 850 },
653 { name => "en-GH", oemcp => 850 },
654 { name => "en-GI", oemcp => 850 },
655 { name => "en-GM", oemcp => 850 },
656 { name => "en-GU", oemcp => 850 },
657 { name => "en-GY", oemcp => 850 },
658 { name => "en-HK", lcid => 0x00003c09, oemcp => 850, sabbrevlangname => "ENH" },
659 { name => "en-ID", lcid => 0x00003809, file => "en", oemcp => 850, sabbrevlangname => "ZZZ" },
660 { name => "en-IE", lcid => 0x00001809, oemcp => 850, sabbrevlangname => "ENI" },
661 { name => "en-IL", oemcp => 65001 },
662 { name => "en-IM", oemcp => 850 },
663 { name => "en-IN", lcid => 0x00004009, sabbrevlangname => "ENN" },
664 { name => "en-IO", oemcp => 850 },
665 { name => "en-JE", oemcp => 850 },
666 { name => "en-JM", lcid => 0x00002009, oemcp => 850, sabbrevlangname => "ENJ" },
667 { name => "en-KE", oemcp => 850 },
668 { name => "en-KI", oemcp => 850 },
669 { name => "en-KN", oemcp => 850 },
670 { name => "en-KY", oemcp => 850 },
671 { name => "en-LC", oemcp => 850 },
672 { name => "en-LR", oemcp => 850 },
673 { name => "en-LS", oemcp => 850 },
674 { name => "en-MG", oemcp => 850 },
675 { name => "en-MH", oemcp => 850 },
676 { name => "en-MO", oemcp => 850 },
677 { name => "en-MP", oemcp => 850 },
678 { name => "en-MS", oemcp => 850 },
679 { name => "en-MT", oemcp => 850 },
680 { name => "en-MU", oemcp => 850 },
681 { name => "en-MW", oemcp => 850 },
682 { name => "en-MY", lcid => 0x00004409, sabbrevlangname => "ENM" },
683 { name => "en-NA", oemcp => 850 },
684 { name => "en-NF", oemcp => 850 },
685 { name => "en-NG", oemcp => 850 },
686 { name => "en-NL", oemcp => 65001 },
687 { name => "en-NR", oemcp => 850 },
688 { name => "en-NU", oemcp => 850 },
689 { name => "en-NZ", lcid => 0x00001409, oemcp => 850, sabbrevlangname => "ENZ" },
690 { name => "en-PG", oemcp => 850 },
691 { name => "en-PH", lcid => 0x00003409, ebcdiccp => 500, sabbrevlangname => "ENP" },
692 { name => "en-PK", oemcp => 850 },
693 { name => "en-PN", oemcp => 850 },
694 { name => "en-PR", oemcp => 850 },
695 { name => "en-PW", oemcp => 850 },
696 { name => "en-RW", oemcp => 850 },
697 { name => "en-SB", oemcp => 850 },
698 { name => "en-SC", oemcp => 850 },
699 { name => "en-SD", oemcp => 850 },
700 { name => "en-SE", oemcp => 65001 },
701 { name => "en-SG", lcid => 0x00004809, sabbrevlangname => "ENE" },
702 { name => "en-SH", oemcp => 850 },
703 { name => "en-SI", oemcp => 65001 },
704 { name => "en-SL", oemcp => 850 },
705 { name => "en-SS", oemcp => 850 },
706 { name => "en-SX", oemcp => 850 },
707 { name => "en-SZ", oemcp => 850 },
708 { name => "en-TC", oemcp => 850 },
709 { name => "en-TK", oemcp => 850 },
710 { name => "en-TO", oemcp => 850 },
711 { name => "en-TT", lcid => 0x00002c09, oemcp => 850, sabbrevlangname => "ENT" },
712 { name => "en-TV", oemcp => 850 },
713 { name => "en-TZ", oemcp => 850 },
714 { name => "en-UG", oemcp => 850 },
715 { name => "en-UM", oemcp => 850 },
716 { name => "en-US", lcid => 0x00000409 },
717 { name => "en-VC", oemcp => 850 },
718 { name => "en-VG", oemcp => 850 },
719 { name => "en-VI", oemcp => 850 },
720 { name => "en-VU", oemcp => 850 },
721 { name => "en-WS", oemcp => 850 },
722 { name => "en-ZA", lcid => 0x00001c09, ebcdiccp => 500, sabbrevlangname => "ENS" },
723 { name => "en-ZM", oemcp => 850 },
724 { name => "en-ZW", lcid => 0x00003009, ebcdiccp => 500, sabbrevlangname => "ENW" },
725 { name => "eo", sopentypelang => "NTO" },
726 { name => "eo-001" },
727 { name => "es", lcid => 0x0000000a, oemcp => 850, ebcdiccp => 20284, sabbrevlangname => "ESP", sopentypelang => "ESP" },
728 { name => "es-419", lcid => 0x0000580a, sabbrevlangname => "ESJ" },
729 { name => "es-AR", lcid => 0x00002c0a, sabbrevlangname => "ESS" },
730 { name => "es-BO", lcid => 0x0000400a, sabbrevlangname => "ESB" },
731 { name => "es-BR", oemcp => 65001 },
732 { name => "es-BZ", oemcp => 65001 },
733 { name => "es-CL", lcid => 0x0000340a, sabbrevlangname => "ESL" },
734 { name => "es-CO", lcid => 0x0000240a, sabbrevlangname => "ESO" },
735 { name => "es-CR", lcid => 0x0000140a, sabbrevlangname => "ESC" },
736 { name => "es-CU", lcid => 0x00005c0a, sabbrevlangname => "ESK" },
737 { name => "es-DO", lcid => 0x00001c0a, sabbrevlangname => "ESD" },
738 { name => "es-EA" },
739 { name => "es-EC", lcid => 0x0000300a, sabbrevlangname => "ESF" },
740 { name => "es-ES", lcid => 0x00000c0a, sabbrevlangname => "ESN" },
741 { name => "es-ES_tradnl", lcid => 0x0000040a, file => "es_ES" },
742 { name => "es-ES-u-co-trad", alias => "es-ES_tradnl" },
743 { name => "es-GQ" },
744 { name => "es-GT", lcid => 0x0000100a, sabbrevlangname => "ESG" },
745 { name => "es-HN", lcid => 0x0000480a, sabbrevlangname => "ESH" },
746 { name => "es-IC" },
747 { name => "es-MX", lcid => 0x0000080a, sabbrevlangname => "ESM" },
748 { name => "es-NI", lcid => 0x00004c0a, sabbrevlangname => "ESI" },
749 { name => "es-PA", lcid => 0x0000180a, sabbrevlangname => "ESA" },
750 { name => "es-PE", lcid => 0x0000280a, sabbrevlangname => "ESR" },
751 { name => "es-PH" },
752 { name => "es-PR", lcid => 0x0000500a, sabbrevlangname => "ESU" },
753 { name => "es-PY", lcid => 0x00003c0a, sabbrevlangname => "ESZ" },
754 { name => "es-SV", lcid => 0x0000440a, sabbrevlangname => "ESE" },
755 { name => "es-US", lcid => 0x0000540a, sabbrevlangname => "EST" },
756 { name => "es-UY", lcid => 0x0000380a, sabbrevlangname => "ESY" },
757 { name => "es-VE", lcid => 0x0000200a, sabbrevlangname => "ESV" },
758 { name => "et", lcid => 0x00000025, oemcp => 775, group => 3, sabbrevlangname => "ETI", sopentypelang => "ETI" },
759 { name => "et-EE", lcid => 0x00000425 },
760 { name => "eu", lcid => 0x0000002d, oemcp => 850, maccp => 65001, sabbrevlangname => "EUQ", sopentypelang => "EUQ" },
761 { name => "eu-ES", lcid => 0x0000042d },
762 { name => "ewo" },
763 { name => "ewo-CM" },
764 { name => "fa", lcid => 0x00000029, inegnumber => 3, oemcp => 720, slist => "\x{061b}", group => 13, sabbrevlangname => "FAR", sopentypelang => "FAR" },
765 { name => "fa-AF", alias => "prs-AF" },
766 { name => "fa-IR", lcid => 0x00000429 },
767 { name => "ff", lcid => 0x00000067, oemcp => 850, ebcdiccp => 20297 },
768 { name => "ff-CM", alias => "ff-Latn-CM" },
769 { name => "ff-GN", alias => "ff-Latn-GN" },
770 { name => "ff-MR", alias => "ff-Latn-MR" },
771 { name => "ff-NG", alias => "ff-Latn-NG" },
772 { name => "ff-SN", alias => "ff-Latn-SN" },
773 { name => "ff-Adlm" },
774 { name => "ff-Adlm-BF" },
775 { name => "ff-Adlm-CM" },
776 { name => "ff-Adlm-GH" },
777 { name => "ff-Adlm-GM" },
778 { name => "ff-Adlm-GN" },
779 { name => "ff-Adlm-GW" },
780 { name => "ff-Adlm-LR" },
781 { name => "ff-Adlm-MR" },
782 { name => "ff-Adlm-NE" },
783 { name => "ff-Adlm-NG" },
784 { name => "ff-Adlm-SL" },
785 { name => "ff-Adlm-SN" },
786 { name => "ff-Latn", lcid => 0x00007c67 },
787 { name => "ff-Latn-BF", oemcp => 65001 },
788 { name => "ff-Latn-CM" },
789 { name => "ff-Latn-GH", oemcp => 65001 },
790 { name => "ff-Latn-GM", oemcp => 65001 },
791 { name => "ff-Latn-GN" },
792 { name => "ff-Latn-GW", oemcp => 65001 },
793 { name => "ff-Latn-LR", oemcp => 65001 },
794 { name => "ff-Latn-MR" },
795 { name => "ff-Latn-NE", oemcp => 65001 },
796 { name => "ff-Latn-NG", lcid => 0x00000467, sabbrevlangname => "ZZZ" },
797 { name => "ff-Latn-SL", oemcp => 65001 },
798 { name => "ff-Latn-SN", lcid => 0x00000867 },
799 { name => "fi", lcid => 0x0000000b, oemcp => 850, ebcdiccp => 20278 },
800 { name => "fi-FI", lcid => 0x0000040b },
801 { name => "fil", lcid => 0x00000064, oemcp => 437, ebcdiccp => 500, sabbrevlangname => "FPO", sopentypelang => "PIL" },
802 { name => "fil-PH", lcid => 0x00000464 },
803 { name => "fil-Latn", alias => "fil" },
804 { name => "fil-Latn-PH", alias => "fil-PH" },
805 { name => "fo", lcid => 0x00000038, oemcp => 850, maccp => 10079, ebcdiccp => 20277, sabbrevlangname => "FOS", sopentypelang => "FOS" },
806 { name => "fo-DK", oemcp => 65001, maccp => 65001 },
807 { name => "fo-FO", lcid => 0x00000438 },
808 { name => "fr", lcid => 0x0000000c, oemcp => 850, ebcdiccp => 20297 },
809 { name => "fr-029", lcid => 0x00001c0c, file => "fr", sabbrevlangname => "ZZZ" },
810 { name => "fr-BE", lcid => 0x0000080c, sabbrevlangname => "FRB" },
811 { name => "fr-BF" },
812 { name => "fr-BI" },
813 { name => "fr-BJ" },
814 { name => "fr-BL" },
815 { name => "fr-CA", lcid => 0x00000c0c, sabbrevlangname => "FRC" },
816 { name => "fr-CD", lcid => 0x0000240c, sabbrevlangname => "FRD" },
817 { name => "fr-CF" },
818 { name => "fr-CG" },
819 { name => "fr-CH", lcid => 0x0000100c, sabbrevlangname => "FRS" },
820 { name => "fr-CI", lcid => 0x0000300c, sabbrevlangname => "FRI" },
821 { name => "fr-CM", lcid => 0x00002c0c, sabbrevlangname => "FRE" },
822 { name => "fr-DJ" },
823 { name => "fr-DZ" },
824 { name => "fr-FR", lcid => 0x0000040c },
825 { name => "fr-GA" },
826 { name => "fr-GF" },
827 { name => "fr-GN" },
828 { name => "fr-GP" },
829 { name => "fr-GQ" },
830 { name => "fr-HT", lcid => 0x00003c0c, sabbrevlangname => "FRH" },
831 { name => "fr-KM" },
832 { name => "fr-LU", lcid => 0x0000140c, sabbrevlangname => "FRL" },
833 { name => "fr-MA", lcid => 0x0000380c, sabbrevlangname => "FRO" },
834 { name => "fr-MC", lcid => 0x0000180c, sabbrevlangname => "FRM" },
835 { name => "fr-MF" },
836 { name => "fr-MG" },
837 { name => "fr-ML", lcid => 0x0000340c, sabbrevlangname => "FRF" },
838 { name => "fr-MQ" },
839 { name => "fr-MR" },
840 { name => "fr-MU" },
841 { name => "fr-NC" },
842 { name => "fr-NE" },
843 { name => "fr-PF" },
844 { name => "fr-PM" },
845 { name => "fr-RE", lcid => 0x0000200c, sabbrevlangname => "FRR" },
846 { name => "fr-RW" },
847 { name => "fr-SC" },
848 { name => "fr-SN", lcid => 0x0000280c, sabbrevlangname => "FRN" },
849 { name => "fr-SY" },
850 { name => "fr-TD" },
851 { name => "fr-TG" },
852 { name => "fr-TN" },
853 { name => "fr-VU" },
854 { name => "fr-WF" },
855 { name => "fr-YT" },
856 { name => "fur", sopentypelang => "FRL" },
857 { name => "fur-IT" },
858 { name => "fy", lcid => 0x00000062, oemcp => 850, sabbrevlangname => "FYN", sopentypelang => "FRI" },
859 { name => "fy-NL", lcid => 0x00000462 },
860 { name => "ga", lcid => 0x0000003c, oemcp => 850, sabbrevlangname => "IRE", sopentypelang => "IRI" },
861 { name => "ga-GB" },
862 { name => "ga-IE", lcid => 0x0000083c },
863 { name => "gd", lcid => 0x00000091, oemcp => 850, ebcdiccp => 20285, sopentypelang => "GAE" },
864 { name => "gd-GB", lcid => 0x00000491 },
865 { name => "gd-Latn", alias => "gd" },
866 { name => "gl", lcid => 0x00000056, oemcp => 850, sabbrevlangname => "GLC", sopentypelang => "GAL" },
867 { name => "gl-ES", lcid => 0x00000456 },
868 { name => "gn", lcid => 0x00000074, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed", sopentypelang => "GUA" },
869 { name => "gn-PY", lcid => 0x00000474, dir => "seed" },
870 { name => "gsw", lcid => 0x00000084, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "ZZZ", sopentypelang => "ALS" },
871 { name => "gsw-CH" },
872 { name => "gsw-FR", lcid => 0x00000484, sabbrevlangname => "GSW" },
873 { name => "gsw-LI" },
874 { name => "gu", lcid => 0x00000047, slist => ",", group => 15 },
875 { name => "gu-IN", lcid => 0x00000447 },
876 { name => "guz" },
877 { name => "guz-KE" },
878 { name => "gv", sopentypelang => "MNX" },
879 { name => "gv-IM" },
880 { name => "ha", lcid => 0x00000068, oemcp => 437 },
881 { name => "ha-GH", alias => "ha-Latn-GH" },
882 { name => "ha-Latn", lcid => 0x00007c68, file => "ha" },
883 { name => "ha-Latn-GH", file => "ha_GH", ebcdiccp => 500 },
884 { name => "ha-Latn-NE", file => "ha_NE", ebcdiccp => 500 },
885 { name => "ha-Latn-NG", lcid => 0x00000468, file => "ha_NG" },
886 { name => "ha-NE", alias => "ha-Latn-NE" },
887 { name => "ha-NG", alias => "ha-Latn-NG" },
888 { name => "haw", lcid => 0x00000075, oemcp => 437 },
889 { name => "haw-Latn", alias => "haw" },
890 { name => "haw-Latn-US", alias => "haw-US" },
891 { name => "haw-US", lcid => 0x00000475 },
892 { name => "he", lcid => 0x0000000d, oemcp => 862, slist => ",", group => 12, sopentypelang => "IWR" },
893 { name => "he-IL", lcid => 0x0000040d },
894 { name => "hi", lcid => 0x00000039, slist => ",", group => 15 },
895 { name => "hi-IN", lcid => 0x00000439 },
896 { name => "hr", lcid => 0x0000001a, inegnumber => 2, oemcp => 852, maccp => 10082, group => 2 },
897 { name => "hr-BA", lcid => 0x0000101a, ebcdiccp => 870, inegnumber => 1, sabbrevlangname => "HRB" },
898 { name => "hr-HR", lcid => 0x0000041a },
899 { name => "hsb", lcid => 0x0000002e, oemcp => 850, ebcdiccp => 870, sopentypelang => "USB" },
900 { name => "hsb-DE", lcid => 0x0000042e },
901 { name => "hu", lcid => 0x0000000e, oemcp => 852, group => 2 },
902 { name => "hu-HU", lcid => 0x0000040e },
903 { name => "hu-HU_technl", lcid => 0x0001040e, alias => "hu-HU" },
904 { name => "hy", lcid => 0x0000002b, slist => ",", group => 17 },
905 { name => "hy-AM", lcid => 0x0000042b },
906 { name => "ia" },
907 { name => "ia-001" },
908 ## name => "ibb", lcid => 0x00000069 },
909 ## name => "ibb-NG", lcid => 0x00000469 },
910 { name => "id", lcid => 0x00000021, oemcp => 850 },
911 { name => "id-ID", lcid => 0x00000421 },
912 { name => "ig", lcid => 0x00000070, oemcp => 437 },
913 { name => "ig-Latn", alias => "ig" },
914 { name => "ig-Latn-NG", alias => "ig-NG" },
915 { name => "ig-NG", lcid => 0x00000470 },
916 { name => "ii", lcid => 0x00000078, group => 9, sopentypelang => "YIM" },
917 { name => "ii-CN", lcid => 0x00000478 },
918 { name => "ii-Yiii", alias => "ii" },
919 { name => "ii-Yiii-CN", alias => "ii-CN" },
920 { name => "is", lcid => 0x0000000f, oemcp => 850, maccp => 10079, ebcdiccp => 20871 },
921 { name => "is-IS", lcid => 0x0000040f },
922 { name => "it", lcid => 0x00000010, oemcp => 850, ebcdiccp => 20280 },
923 { name => "it-CH", lcid => 0x00000810, ebcdiccp => 500, sabbrevlangname => "ITS" },
924 { name => "it-IT", lcid => 0x00000410 },
925 { name => "it-SM" },
926 { name => "it-VA", oemcp => 65001 },
927 { name => "iu", lcid => 0x0000005d, oemcp => 437, slist => ",", sortlocale => "iu-Latn-CA", dir => "seed", sabbrevlangname => "IUK", sopentypelang => "INU" },
928 { name => "iu-Cans", lcid => 0x0000785d, file => "iu", oemcp => 65001, dir => "seed", sabbrevlangname => "IUS" },
929 { name => "iu-Cans-CA", lcid => 0x0000045d, file => "iu_CA", dir => "seed" },
930 { name => "iu-Latn", lcid => 0x00007c5d, dir => "seed" },
931 { name => "iu-Latn-CA", lcid => 0x0000085d, dir => "seed" },
932 { name => "ja", lcid => 0x00000011, ireadinglayout => 2, oemcp => 932, slist => ",", sscripts => "Hani Hira Jpan Kana", group => 7, sopentypelang => "JAN" },
933 { name => "ja-JP", lcid => 0x00000411 },
934 { name => "ja-JP_radstr", lcid => 0x00040411, alias => "ja-JP" },
935 { name => "ja-JP-u-co-unihan", alias => "ja-JP_radstr" },
936 { name => "jgo" },
937 { name => "jgo-CM" },
938 { name => "jmc" },
939 { name => "jmc-TZ" },
940 { name => "jv", oemcp => 850 },
941 { name => "jv-ID", alias => "jv-Latn-ID" },
942 ## name => "jv-Java" },
943 ## name => "jv-Java-ID" },
944 { name => "jv-Latn", file => "jv" },
945 { name => "jv-Latn-ID", file => "jv_ID" },
946 { name => "ka", lcid => 0x00000037, group => 16 },
947 { name => "ka-GE", lcid => 0x00000437 },
948 { name => "ka-GE_modern", lcid => 0x00010437, alias => "ka-GE" },
949 { name => "kab", sopentypelang => "KAB0" },
950 { name => "kab-DZ" },
951 { name => "kam", sopentypelang => "KMB" },
952 { name => "kam-KE" },
953 { name => "kde" },
954 { name => "kde-TZ" },
955 { name => "kea" },
956 { name => "kea-CV" },
957 { name => "kgp" },
958 { name => "kgp-BR" },
959 { name => "khq" },
960 { name => "khq-ML" },
961 { name => "ki" },
962 { name => "ki-KE" },
963 { name => "kk", lcid => 0x0000003f, group => 5, sabbrevlangname => "KKZ" },
964 { name => "kk-Cyrl", alias => "kk" },
965 { name => "kk-Cyrl-KZ", alias => "kk-KZ" },
966 { name => "kk-KZ", lcid => 0x0000043f },
967 { name => "kkj" },
968 { name => "kkj-CM" },
969 { name => "kl", lcid => 0x0000006f, oemcp => 850, ebcdiccp => 20277, sopentypelang => "GRN" },
970 { name => "kl-GL", lcid => 0x0000046f },
971 { name => "kln", sopentypelang => "KAL" },
972 { name => "kln-KE" },
973 { name => "km", lcid => 0x00000053, inegnumber => 2, slist => ",", group => 15 },
974 { name => "km-KH", lcid => 0x00000453 },
975 { name => "kn", lcid => 0x0000004b, slist => ",", group => 15, sabbrevlangname => "KDI" },
976 { name => "kn-IN", lcid => 0x0000044b },
977 { name => "ko", lcid => 0x00000012, ireadinglayout => 2, slist => ",", oemcp => 949, ebcdiccp => 20833, sscripts => "Hang Hani Kore", group => 8 },
978 { name => "ko-KP", oemcp => 65001 },
979 { name => "ko-KR", lcid => 0x00000412 },
980 { name => "kok", lcid => 0x00000057, slist => ",", group => 15, sabbrevlangname => "KNK" },
981 { name => "kok-IN", lcid => 0x00000457 },
982 { name => "kr", lcid => 0x00000071, sortlocale => "kr-Latn-NG", oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "KNR" },
983 { name => "kr-Latn", file => "kr", dir => "exemplars" },
984 { name => "kr-Latn-NG", lcid => 0x00000471, file => "kr", dir => "exemplars" },
985 { name => "kr-NG", alias => "kr-Latn-NG" },
986 { name => "ks", lcid => 0x00000060, group => 15, sabbrevlangname => "ZZZ", sopentypelang => "KSH" },
987 { name => "ks-Arab", lcid => 0x00000460 },
988 { name => "ks-Arab-IN" },
989 { name => "ks-Deva", slist => "," },
990 { name => "ks-Deva-IN", lcid => 0x00000860 },
991 { name => "ks-IN", alias => "ks-Arab-IN" },
992 { name => "ksb" },
993 { name => "ksb-TZ" },
994 { name => "ksf" },
995 { name => "ksf-CM" },
996 { name => "ksh", sopentypelang => "KSH0" },
997 { name => "ksh-DE" },
998 { name => "ku", lcid => 0x00000092, file => "ckb", slist => "\x{061b}", sortlocale => "ku-Arab-IQ", oemcp => 720 },
999 { name => "ku-Arab", lcid => 0x00007c92, file => "ckb", group => 13 },
1000 { name => "ku-Arab-IQ", lcid => 0x00000492, file => "ckb_IQ" },
1001 { name => "ku-Arab-IR", file => "ckb_IR", oemcp => 65001 },
1002 { name => "kw" },
1003 { name => "kw-GB" },
1004 { name => "ky", lcid => 0x00000040, oemcp => 866, group => 5, sabbrevlangname => "KYR" },
1005 { name => "ky-Cyrl", alias => "ky" },
1006 { name => "ky-Cyrl-KG", alias => "ky-KG" },
1007 { name => "ky-KG", lcid => 0x00000440 },
1008 { name => "la", lcid => 0x00000076, oemcp => 437, slist => ",", dir => "seed", sabbrevlangname => "ZZZ" },
1009 { name => "la-001", lcid => 0x00000476, file => "la", dir => "seed" },
1010 { name => "lag" },
1011 { name => "lag-TZ" },
1012 { name => "lb", lcid => 0x0000006e, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "LBX" },
1013 { name => "lb-LU", lcid => 0x0000046e },
1014 { name => "lg" },
1015 { name => "lg-UG" },
1016 { name => "lkt" },
1017 { name => "lkt-US" },
1018 { name => "ln" },
1019 { name => "ln-AO" },
1020 { name => "ln-CD" },
1021 { name => "ln-CF" },
1022 { name => "ln-CG" },
1023 { name => "lo", lcid => 0x00000054, group => 15 },
1024 { name => "lo-LA", lcid => 0x00000454 },
1025 { name => "lrc" },
1026 { name => "lrc-IQ" },
1027 { name => "lrc-IR" },
1028 { name => "lt", lcid => 0x00000027, oemcp => 775, group => 3, sabbrevlangname => "LTH", sopentypelang => "LTH" },
1029 { name => "lt-LT", lcid => 0x00000427 },
1030 { name => "lu" },
1031 { name => "lu-CD" },
1032 { name => "luo" },
1033 { name => "luo-KE" },
1034 { name => "luy", sopentypelang => "LUH" },
1035 { name => "luy-KE" },
1036 { name => "lv", lcid => 0x00000026, oemcp => 775, group => 3, sabbrevlangname => "LVI", sopentypelang => "LVI" },
1037 { name => "lv-LV", lcid => 0x00000426 },
1038 { name => "mai" },
1039 { name => "mai-IN" },
1040 { name => "mas" },
1041 { name => "mas-KE" },
1042 { name => "mas-TZ" },
1043 { name => "mer" },
1044 { name => "mer-KE" },
1045 { name => "mfe" },
1046 { name => "mfe-MU" },
1047 { name => "mg" },
1048 { name => "mg-MG" },
1049 { name => "mgh" },
1050 { name => "mgh-MZ" },
1051 { name => "mgo" },
1052 { name => "mgo-CM" },
1053 { name => "mi", lcid => 0x00000081, slist => "," },
1054 { name => "mi-Latn", alias => "mi" },
1055 { name => "mi-Latn-NZ", alias => "mi-NZ" },
1056 { name => "mi-NZ", lcid => 0x00000481 },
1057 { name => "mk", lcid => 0x0000002f, oemcp => 866, ebcdiccp => 500, group => 5, sabbrevlangname => "MKI" },
1058 { name => "mk-MK", lcid => 0x0000042f },
1059 { name => "ml", lcid => 0x0000004c, group => 15, sabbrevlangname => "MYM", sopentypelang => "MLR" },
1060 { name => "ml-IN", lcid => 0x0000044c },
1061 { name => "mn", lcid => 0x00000050, oemcp => 866, sopentypelang => "MNG" },
1062 { name => "mn-Cyrl", lcid => 0x00007850, file => "mn", sabbrevlangname => "MNN" },
1063 { name => "mn-Cyrl-MN", alias => "mn-MN" },
1064 { name => "mn-MN", lcid => 0x00000450, sparent => "mn-Cyrl", group => 5 },
1065 { name => "mn-Mong", lcid => 0x00007c50, oemcp => 65001, slist => ",", group => 15, dir => "seed", sabbrevlangname => "MNG" },
1066 { name => "mn-Mong-CN", lcid => 0x00000850, dir => "seed" },
1067 { name => "mn-Mong-MN", lcid => 0x00000c50, dir => "seed", sabbrevlangname => "MNM" },
1068 { name => "mni", lcid => 0x00000058, slist => ",", sabbrevlangname => "ZZZ" },
1069 { name => "mni-IN", lcid => 0x00000458, file => "mni_Beng_IN" },
1070 { name => "moh", lcid => 0x0000007c, oemcp => 850, ebcdiccp => 37, slist => ",", dir => "seed", sabbrevlangname => "MWK" },
1071 { name => "moh-CA", lcid => 0x0000047c, dir => "seed" },
1072 { name => "moh-Latn", alias => "moh" },
1073 { name => "moh-Latn-CA", alias => "moh-CA" },
1074 { name => "mr", lcid => 0x0000004e, slist => ",", group => 15 },
1075 { name => "mr-IN", lcid => 0x0000044e },
1076 { name => "ms", lcid => 0x0000003e, oemcp => 850, sabbrevlangname => "MSL", sopentypelang => "MLY" },
1077 { name => "ms-BN", lcid => 0x0000083e, sabbrevlangname => "MSB" },
1078 { name => "ms-ID" },
1079 { name => "ms-Latn", alias => "ms" },
1080 { name => "ms-Latn-BN", alias => "ms-BN" },
1081 { name => "ms-Latn-MY", alias => "ms-MY" },
1082 { name => "ms-Latn-SG", alias => "ms-SG" },
1083 { name => "ms-MY", lcid => 0x0000043e },
1084 { name => "ms-SG" },
1085 { name => "mt", lcid => 0x0000003a, sopentypelang => "MTS" },
1086 { name => "mt-MT", lcid => 0x0000043a },
1087 { name => "mua" },
1088 { name => "mua-CM" },
1089 { name => "my", lcid => 0x00000055, sopentypelang => "BRM" },
1090 { name => "my-MM", lcid => 0x00000455 },
1091 { name => "mzn" },
1092 { name => "mzn-IR" },
1093 { name => "naq" },
1094 { name => "naq-NA" },
1095 { name => "nb", lcid => 0x00007c14, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NOR", sopentypelang => "NOR" },
1096 { name => "nb-NO", lcid => 0x00000414 },
1097 { name => "nb-SJ" },
1098 { name => "nd", sopentypelang => "NDB" },
1099 { name => "nd-ZW" },
1100 { name => "nds" },
1101 { name => "nds-DE" },
1102 { name => "nds-NL" },
1103 { name => "ne", lcid => 0x00000061, slist => "," },
1104 { name => "ne-IN", lcid => 0x00000861, sabbrevlangname => "NEI" },
1105 { name => "ne-NP", lcid => 0x00000461, group => 15 },
1106 { name => "nl", lcid => 0x00000013, oemcp => 850 },
1107 { name => "nl-AW" },
1108 { name => "nl-BE", lcid => 0x00000813, sabbrevlangname => "NLB" },
1109 { name => "nl-BQ" },
1110 { name => "nl-CW" },
1111 { name => "nl-NL", lcid => 0x00000413 },
1112 { name => "nl-SR" },
1113 { name => "nl-SX" },
1114 { name => "nmg" },
1115 { name => "nmg-CM" },
1116 { name => "nn", lcid => 0x00007814, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NON", sopentypelang => "NYN" },
1117 { name => "nn-NO", lcid => 0x00000814 },
1118 { name => "nnh" },
1119 { name => "nnh-CM" },
1120 { name => "no", lcid => 0x00000014, oemcp => 850, ebcdiccp => 20277, sortlocale => "nb-NO" },
1121 { name => "nqo", idigits => 3, inegnumber => 3, slist => "\x{060c}", dir => "seed", sopentypelang => "NKO" },
1122 { name => "nqo-GN", dir => "seed" },
1123 { name => "nr", dir => "seed", sopentypelang => "NDB" },
1124 { name => "nr-ZA", dir => "seed" },
1125 { name => "nso", lcid => 0x0000006c, oemcp => 850, dir => "seed", sopentypelang => "SOT" },
1126 { name => "nso-ZA", lcid => 0x0000046c, dir => "seed" },
1127 { name => "nus" },
1128 { name => "nus-SD", alias => "nus-SS" },
1129 { name => "nus-SS" },
1130 { name => "nyn", sopentypelang => "NKL" },
1131 { name => "nyn-UG" },
1132 { name => "oc", lcid => 0x00000082, oemcp => 850, ebcdiccp => 20297, dir => "seed" },
1133 { name => "oc-FR", lcid => 0x00000482, dir => "seed" },
1134 { name => "oc-Latn", alias => "oc" },
1135 { name => "oc-Latn-FR", alias => "oc-FR" },
1136 { name => "om", lcid => 0x00000072, sopentypelang => "ORO" },
1137 { name => "om-ET", lcid => 0x00000472 },
1138 { name => "om-KE" },
1139 { name => "or", lcid => 0x00000048, slist => ",", group => 15 },
1140 { name => "or-IN", lcid => 0x00000448 },
1141 { name => "os" },
1142 { name => "os-GE" },
1143 { name => "os-RU" },
1144 { name => "pa", lcid => 0x00000046, slist => "," },
1145 { name => "pa-Arab", lcid => 0x00007c46, slist => ";", inegnumber => 2, oemcp => 720, group => 13, sabbrevlangname => "PAP" },
1146 { name => "pa-Arab-PK", lcid => 0x00000846 },
1147 { name => "pa-Guru" },
1148 { name => "pa-Guru-IN", alias => "pa-IN" },
1149 { name => "pa-IN", lcid => 0x00000446, sparent => "pa-Guru", file => "pa_Guru_IN", group => 15 },
1150 ## name => "pap", lcid => 0x00000079 },
1151 ## name => "pap-029", lcid => 0x00000479 },
1152 { name => "pcm" },
1153 { name => "pcm-NG" },
1154 { name => "pl", lcid => 0x00000015, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "PLK", sopentypelang => "PLK" },
1155 { name => "pl-PL", lcid => 0x00000415 },
1156 ## name => "prg" },
1157 ## name => "prg-001" },
1158 { name => "prs", lcid => 0x0000008c, file => "fa", inegnumber => 3, oemcp => 720, group => 13, sopentypelang => "DRI" },
1159 { name => "prs-AF", lcid => 0x0000048c, file => "fa_AF" },
1160 { name => "prs-Arab", alias => "prs" },
1161 { name => "prs-Arab-AF", alias => "prs-AF" },
1162 { name => "ps", lcid => 0x00000063, group => 13, sabbrevlangname => "PAS", sopentypelang => "PAS" },
1163 { name => "ps-AF", lcid => 0x00000463 },
1164 { name => "ps-PK" },
1165 { name => "pt", lcid => 0x00000016, oemcp => 850, sabbrevlangname => "PTB", sopentypelang => "PTG" },
1166 { name => "pt-AO" },
1167 { name => "pt-BR", lcid => 0x00000416 },
1168 { name => "pt-CH", oemcp => 65001 },
1169 { name => "pt-CV" },
1170 { name => "pt-GQ", oemcp => 65001 },
1171 { name => "pt-GW" },
1172 { name => "pt-LU", oemcp => 65001 },
1173 { name => "pt-MO" },
1174 { name => "pt-MZ" },
1175 { name => "pt-PT", lcid => 0x00000816, sabbrevlangname => "PTG" },
1176 { name => "pt-ST" },
1177 { name => "pt-TL" },
1178 ## name => "qps-Latn-x-sh", lcid => 0x80000901 },
1179 ## name => "qps-ploc", lcid => 0x80000501 },
1180 ## name => "qps-ploca", lcid => 0x800005fe },
1181 ## name => "qps-plocm", lcid => 0x800009ff },
1182 { name => "qu", alias => "quz" },
1183 { name => "qu-BO", alias => "quz-BO" },
1184 { name => "qu-EC", alias => "quz-EC" },
1185 { name => "qu-PE", alias => "quz-PE" },
1186 { name => "quc", lcid => 0x00000086, oemcp => 850, ebcdiccp => 20284, slist => ",", dir => "seed" },
1187 { name => "quc-Latn", lcid => 0x00007c86, file => "quc", dir => "seed" },
1188 { name => "quc-Latn-GT", lcid => 0x00000486, file => "quc_GT", dir => "seed" },
1189 { name => "qut", alias => "quc" },
1190 { name => "qut-GT", alias => "quc-Latn-GT" },
1191 { name => "quz", lcid => 0x0000006b, file => "qu", territory => "BO", oemcp => 850, ebcdiccp => 20284, slist => "," },
1192 { name => "quz-BO", lcid => 0x0000046b, file => "qu_BO" },
1193 { name => "quz-EC", lcid => 0x0000086b, file => "qu_EC" },
1194 { name => "quz-Latn", alias => "quz" },
1195 { name => "quz-Latn-BO", alias => "quz-BO" },
1196 { name => "quz-Latn-EC", alias => "quz-EC" },
1197 { name => "quz-Latn-PE", alias => "quz-PE" },
1198 { name => "quz-PE", lcid => 0x00000c6b, file => "qu_PE" },
1199 { name => "rm", lcid => 0x00000017, oemcp => 850, ebcdiccp => 20273, sabbrevlangname => "RMC", sopentypelang => "RMS" },
1200 { name => "rm-CH", lcid => 0x00000417 },
1201 { name => "rn" },
1202 { name => "rn-BI" },
1203 { name => "ro", lcid => 0x00000018, oemcp => 852, ebcdiccp => 20880, sabbrevlangname => "ROM", sopentypelang => "ROM" },
1204 { name => "ro-MD", lcid => 0x00000818, maccp => 65001, sabbrevlangname => "ROD" },
1205 { name => "ro-RO", lcid => 0x00000418, group => 2 },
1206 { name => "rof" },
1207 { name => "rof-TZ" },
1208 { name => "ru", lcid => 0x00000019, oemcp => 866 },
1209 { name => "ru-BY", maccp => 65001 },
1210 { name => "ru-KG", maccp => 65001 },
1211 { name => "ru-KZ", maccp => 65001 },
1212 { name => "ru-MD", lcid => 0x00000819, maccp => 65001, sabbrevlangname => "RUM" },
1213 { name => "ru-RU", lcid => 0x00000419, group => 5 },
1214 { name => "ru-UA", maccp => 65001 },
1215 { name => "rw", lcid => 0x00000087, oemcp => 437, sopentypelang => "RUA" },
1216 { name => "rw-RW", lcid => 0x00000487 },
1217 { name => "rwk" },
1218 { name => "rwk-TZ" },
1219 { name => "sa", lcid => 0x0000004f, slist => ",", group => 15 },
1220 { name => "sa-Deva", alias => "sa" },
1221 { name => "sa-Deva-IN", alias => "sa-IN" },
1222 { name => "sa-IN", lcid => 0x0000044f },
1223 { name => "sah", lcid => 0x00000085, oemcp => 866, group => 5, sopentypelang => "YAK" },
1224 { name => "sah-Cyrl", alias => "sah" },
1225 { name => "sah-Cyrl-RU", alias => "sah-RU" },
1226 { name => "sah-RU", lcid => 0x00000485 },
1227 { name => "saq" },
1228 { name => "saq-KE" },
1229 { name => "sat" },
1230 { name => "sat-Olck" },
1231 { name => "sat-Olck-IN" },
1232 { name => "sbp" },
1233 { name => "sbp-TZ" },
1234 { name => "sc" },
1235 { name => "sc-IT" },
1236 { name => "sd", lcid => 0x00000059, inegnumber => 3, oemcp => 720, sabbrevlangname => "SIP" },
1237 { name => "sd-Arab", lcid => 0x00007c59, group => 13 },
1238 { name => "sd-Arab-PK", lcid => 0x00000859 },
1239 { name => "sd-Deva", inegnumber => 1, slist => ",", oemcp => 65001, group => 15 },
1240 { name => "sd-Deva-IN", lcid => 0x00000459, sabbrevlangname => "ZZZ" },
1241 { name => "sd-PK", alias => "sd-Arab-PK" },
1242 { name => "se", lcid => 0x0000003b, oemcp => 850, ebcdiccp => 20277, sopentypelang => "NSM" },
1243 { name => "se-FI", lcid => 0x00000c3b, ebcdiccp => 20278, sabbrevlangname => "SMG" },
1244 { name => "se-NO", lcid => 0x0000043b },
1245 { name => "se-SE", lcid => 0x0000083b, ebcdiccp => 20278, sabbrevlangname => "SMF" },
1246 { name => "se-Latn", alias => "se" },
1247 { name => "se-Latn-FI", alias => "se-FI" },
1248 { name => "se-Latn-NO", alias => "se-NO" },
1249 { name => "se-Latn-SE", alias => "se-SE" },
1250 { name => "seh" },
1251 { name => "seh-MZ" },
1252 { name => "ses" },
1253 { name => "ses-ML" },
1254 { name => "sg", sopentypelang => "SGO" },
1255 { name => "sg-CF" },
1256 { name => "shi" },
1257 { name => "shi-Latn" },
1258 { name => "shi-Latn-MA" },
1259 { name => "shi-Tfng" },
1260 { name => "shi-Tfng-MA" },
1261 { name => "si", lcid => 0x0000005b, group => 15, sopentypelang => "SNH" },
1262 { name => "si-LK", lcid => 0x0000045b },
1263 { name => "sk", lcid => 0x0000001b, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "SKY", sopentypelang => "SKY" },
1264 { name => "sk-SK", lcid => 0x0000041b },
1265 { name => "sl", lcid => 0x00000024, oemcp => 852, ebcdiccp => 20880, group => 2 },
1266 { name => "sl-SI", lcid => 0x00000424 },
1267 { name => "sma", lcid => 0x0000783b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMB", sopentypelang => "SSM" },
1268 { name => "sma-Latn", alias => "sma" },
1269 { name => "sma-Latn-NO", alias => "sma-NO" },
1270 { name => "sma-Latn-SE", alias => "sma-SE" },
1271 { name => "sma-NO", lcid => 0x0000183b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMA" },
1272 { name => "sma-SE", lcid => 0x00001c3b, dir => "seed" },
1273 { name => "smj", lcid => 0x00007c3b, sparent => "se", ebcdiccp => 20278, dir => "seed", sabbrevlangname => "SMK", sopentypelang => "LSM" },
1274 { name => "smj-Latn", alias => "smj" },
1275 { name => "smj-Latn-NO", alias => "smj-NO" },
1276 { name => "smj-Latn-SE", alias => "smj-SE" },
1277 { name => "smj-NO", lcid => 0x0000103b, ebcdiccp => 20277, dir => "seed", sabbrevlangname => "SMJ" },
1278 { name => "smj-SE", lcid => 0x0000143b, dir => "seed" },
1279 { name => "smn", lcid => 0x0000703b, sparent => "se", ebcdiccp => 20278, sopentypelang => "ISM" },
1280 { name => "smn-FI", lcid => 0x0000243b },
1281 { name => "smn-Latn", alias => "smn" },
1282 { name => "smn-Latn-FI", alias => "smn-FI" },
1283 { name => "sms", lcid => 0x0000743b, sparent => "se", ebcdiccp => 20278, dir => "seed", sopentypelang => "SKS" },
1284 { name => "sms-FI", lcid => 0x0000203b, dir => "seed" },
1285 { name => "sms-Latn", alias => "sms" },
1286 { name => "sms-Latn-FI", alias => "sms-FI" },
1287 { name => "sn", sopentypelang => "SNA0" },
1288 { name => "sn-Latn", file => "sn" },
1289 { name => "sn-Latn-ZW", file => "sn_ZW" },
1290 { name => "sn-ZW", alias => "sn-Latn-ZW" },
1291 { name => "so", lcid => 0x00000077, sopentypelang => "SML" },
1292 { name => "so-DJ" },
1293 { name => "so-ET" },
1294 { name => "so-KE" },
1295 { name => "so-SO", lcid => 0x00000477 },
1296 { name => "sq", lcid => 0x0000001c, oemcp => 852, ebcdiccp => 20880, group => 2 },
1297 { name => "sq-AL", lcid => 0x0000041c },
1298 { name => "sq-MK" },
1299 { name => "sq-XK" },
1300 { name => "sr", lcid => 0x00007c1a, sortlocale => "sr-Latn-RS", oemcp => 852, group => 2, sabbrevlangname => "SRB", sopentypelang => "SRB" },
1301 { name => "sr-Cyrl", lcid => 0x00006c1a, oemcp => 855, ebcdiccp => 21025, group => 5, sabbrevlangname => "SRO" },
1302 { name => "sr-Cyrl-BA", lcid => 0x00001c1a, sabbrevlangname => "SRN" },
1303 { name => "sr-Cyrl-ME", lcid => 0x0000301a, sabbrevlangname => "SRQ" },
1304 { name => "sr-Cyrl-RS", lcid => 0x0000281a },
1305 { name => "sr-Cyrl-XK" },
1306 { name => "sr-Latn", lcid => 0x0000701a, sabbrevlangname => "SRM" },
1307 { name => "sr-Latn-BA", lcid => 0x0000181a, maccp => 10082, ebcdiccp => 870, sabbrevlangname => "SRS" },
1308 { name => "sr-Latn-ME", lcid => 0x00002c1a, sabbrevlangname => "SRP" },
1309 { name => "sr-Latn-RS", lcid => 0x0000241a, sabbrevlangname => "SRM" },
1310 { name => "sr-Latn-XK" },
1311 ## name => "sr-Cyrl-CS", lcid => 0x00000c1a },
1312 ## name => "sr-Latn-CS", lcid => 0x0000081a },
1313 { name => "ss", dir => "seed", sopentypelang => "SWZ" },
1314 { name => "ss-SZ", dir => "seed" },
1315 { name => "ss-ZA", dir => "seed" },
1316 { name => "ssy", dir => "seed" },
1317 { name => "ssy-ER", dir => "seed" },
1318 { name => "st", lcid => 0x00000030, dir => "seed" },
1319 { name => "st-LS", dir => "seed" },
1320 { name => "st-ZA", lcid => 0x00000430, dir => "seed" },
1321 { name => "su" },
1322 { name => "su-Latn" },
1323 { name => "su-Latn-ID" },
1324 { name => "sv", lcid => 0x0000001d, oemcp => 850, ebcdiccp => 20278, sabbrevlangname => "SVE", sopentypelang => "SVE" },
1325 { name => "sv-AX" },
1326 { name => "sv-FI", lcid => 0x0000081d, sabbrevlangname => "SVF" },
1327 { name => "sv-SE", lcid => 0x0000041d, sabbrevlangname => "SVE" },
1328 { name => "sw", lcid => 0x00000041, territory => "KE", oemcp => 437, ebcdiccp => 500, sabbrevlangname => "SWK", sopentypelang => "SWK" },
1329 { name => "sw-CD" },
1330 { name => "sw-KE", lcid => 0x00000441 },
1331 { name => "sw-TZ" },
1332 { name => "sw-UG" },
1333 { name => "swc-CD", alias => "sw-CD" },
1334 { name => "syr", lcid => 0x0000005a, slist => ",", group => 13, dir => "seed" },
1335 { name => "syr-SY", lcid => 0x0000045a, dir => "seed" },
1336 { name => "syr-Syrc", alias => "syr" },
1337 { name => "syr-Syrc-SY", alias => "syr-SY" },
1338 { name => "ta", lcid => 0x00000049, slist => ",", group => 15, sabbrevlangname => "TAI" },
1339 { name => "ta-IN", lcid => 0x00000449 },
1340 { name => "ta-LK", lcid => 0x00000849, sabbrevlangname => "TAM" },
1341 { name => "ta-MY" },
1342 { name => "ta-SG" },
1343 { name => "te", lcid => 0x0000004a, group => 15 },
1344 { name => "te-IN", lcid => 0x0000044a },
1345 { name => "teo" },
1346 { name => "teo-KE" },
1347 { name => "teo-UG" },
1348 { name => "tg", lcid => 0x00000028, oemcp => 866, group => 5, sabbrevlangname => "TAJ", sopentypelang => "TAJ" },
1349 { name => "tg-Cyrl", lcid => 0x00007c28, file => "tg" },
1350 { name => "tg-Cyrl-TJ", lcid => 0x00000428, file => "tg_TJ" },
1351 { name => "tg-TJ", alias => "tg-Cyrl-TJ" },
1352 { name => "th", lcid => 0x0000001e, oemcp => 874, ebcdiccp => 20838, slist => ",", group => 11 },
1353 { name => "th-TH", lcid => 0x0000041e },
1354 { name => "ti", lcid => 0x00000073, territory => "ER", sopentypelang => "TGY" },
1355 { name => "ti-ER", lcid => 0x00000873 },
1356 { name => "ti-ET", lcid => 0x00000473, sabbrevlangname => "TIE" },
1357 { name => "tig", dir => "seed", sopentypelang => "TGR" },
1358 { name => "tig-ER", dir => "seed" },
1359 { name => "tk", lcid => 0x00000042, oemcp => 852, ebcdiccp => 20880, group => 2, sopentypelang => "TKM" },
1360 { name => "tk-Latn", alias => "tk" },
1361 { name => "tk-Latn-TM", alias => "tk-TM" },
1362 { name => "tk-TM", lcid => 0x00000442 },
1363 { name => "tn", lcid => 0x00000032, oemcp => 850, dir => "seed", sopentypelang => "TNA" },
1364 { name => "tn-BW", lcid => 0x00000832, dir => "seed", sabbrevlangname => "TSB" },
1365 { name => "tn-ZA", lcid => 0x00000432, dir => "seed" },
1366 { name => "to", sopentypelang => "TGN" },
1367 { name => "to-TO" },
1368 { name => "tr", lcid => 0x0000001f, oemcp => 857, ebcdiccp => 20905, group => 6, sabbrevlangname => "TRK", sopentypelang => "TRK" },
1369 { name => "tr-CY" },
1370 { name => "tr-TR", lcid => 0x0000041f },
1371 { name => "ts", lcid => 0x00000031, dir => "seed", sopentypelang => "TSG" },
1372 { name => "ts-ZA", lcid => 0x00000431, dir => "seed" },
1373 { name => "tt", lcid => 0x00000044, oemcp => 866, group => 5, sabbrevlangname => "TTT" },
1374 { name => "tt-Cyrl", alias => "tt" },
1375 { name => "tt-Cyrl-RU", alias => "tt-RU" },
1376 { name => "tt-RU", lcid => 0x00000444 },
1377 { name => "twq" },
1378 { name => "twq-NE" },
1379 { name => "tzm", lcid => 0x0000005f, sortlocale => "tzm-Latn-DZ", oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "TZA" },
1380 { name => "tzm-Latn", lcid => 0x00007c5f, territory => "DZ", file => "tzm" },
1381 { name => "tzm-Latn-MA", file => "tzm_MA", oemcp => 65001 },
1382 { name => "tzm-Latn-DZ", lcid => 0x0000085f, file => "tzm" },
1383 { name => "tzm-MA", alias => "tzm-Latn-MA" },
1384 ## name => "tzm-Arab", group => 13 },
1385 ## name => "tzm-Arab-MA", lcid => 0x0000045f },
1386 ## name => "tzm-Tfng", lcid => 0x0000785f },
1387 ## name => "tzm-Tfng-MA", lcid => 0x0000105f },
1388 { name => "ug", lcid => 0x00000080, oemcp => 720, slist => ",", group => 13, sopentypelang => "UYG" },
1389 { name => "ug-Arab", alias => "ug" },
1390 { name => "ug-Arab-CN", alias => "ug-CN" },
1391 { name => "ug-CN", lcid => 0x00000480 },
1392 { name => "uk", lcid => 0x00000022, oemcp => 866, maccp => 10017, ebcdiccp => 500, group => 5 },
1393 { name => "uk-UA", lcid => 0x00000422 },
1394 { name => "ur", lcid => 0x00000020, oemcp => 720 },
1395 { name => "ur-IN", lcid => 0x00000820, maccp => 65001, sabbrevlangname => "URI" },
1396 { name => "ur-PK", lcid => 0x00000420, group => 13 },
1397 { name => "uz", lcid => 0x00000043, oemcp => 857, maccp => 10029, group => 2 },
1398 { name => "uz-Arab", oemcp => 65001, maccp => 65001 },
1399 { name => "uz-Arab-AF" },
1400 { name => "uz-Cyrl", lcid => 0x00007843, oemcp => 866, maccp => 10007, group => 5, sabbrevlangname => "UZC" },
1401 { name => "uz-Cyrl-UZ", lcid => 0x00000843 },
1402 { name => "uz-Latn", lcid => 0x00007c43 },
1403 { name => "uz-Latn-UZ", lcid => 0x00000443 },
1404 { name => "vai" },
1405 { name => "vai-Latn" },
1406 { name => "vai-Latn-LR" },
1407 { name => "vai-Vaii" },
1408 { name => "vai-Vaii-LR" },
1409 { name => "ve", lcid => 0x00000033, dir => "seed", sabbrevlangname => "ZZZ" },
1410 { name => "ve-ZA", lcid => 0x00000433, dir => "seed" },
1411 { name => "vi", lcid => 0x0000002a, oemcp => 1258, slist => ",", group => 14, sabbrevlangname => "VIT", sopentypelang => "VIT" },
1412 { name => "vi-VN", lcid => 0x0000042a },
1413 { name => "vo", dir => "seed" },
1414 { name => "vo-001", dir => "seed" },
1415 { name => "vun" },
1416 { name => "vun-TZ" },
1417 { name => "wae" },
1418 { name => "wae-CH" },
1419 { name => "wal", dir => "seed" },
1420 { name => "wal-ET", dir => "seed" },
1421 { name => "wo", lcid => 0x00000088, oemcp => 850, ebcdiccp => 20297, sopentypelang => "WLF" },
1422 { name => "wo-Latn", alias => "wo" },
1423 { name => "wo-Latn-SN", alias => "wo-SN" },
1424 { name => "wo-SN", lcid => 0x00000488 },
1425 { name => "x-IV_mathan", lcid => 0x0001007f, alias => "" },
1426 { name => "xh", lcid => 0x00000034, oemcp => 850, sopentypelang => "XHS" },
1427 { name => "xh-ZA", lcid => 0x00000434 },
1428 { name => "xog" },
1429 { name => "xog-UG" },
1430 { name => "yav" },
1431 { name => "yav-CM" },
1432 { name => "yi", lcid => 0x0000003d, sabbrevlangname => "ZZZ", sopentypelang => "JII" },
1433 { name => "yi-001", lcid => 0x0000043d },
1434 { name => "yo", lcid => 0x0000006a, oemcp => 437, sopentypelang => "YBA" },
1435 { name => "yo-BJ", ebcdiccp => 500 },
1436 { name => "yo-Latn", alias => "yo" },
1437 { name => "yo-Latn-NG", alias => "yo-NG" },
1438 { name => "yo-NG", lcid => 0x0000046a },
1439 { name => "yrl" },
1440 { name => "yrl-BR" },
1441 { name => "yrl-CO" },
1442 { name => "yrl-VE" },
1443 { name => "yue" },
1444 { name => "yue-Hans" },
1445 { name => "yue-Hans-CN" },
1446 { name => "yue-Hant" },
1447 { name => "yue-Hant-HK" },
1448 { name => "zgh" },
1449 { name => "zgh-MA", alias => "zgh-Tfng-MA" },
1450 { name => "zgh-Tfng", file => "zgh" },
1451 { name => "zgh-Tfng-MA", file => "zgh_MA" },
1452 { name => "zh", lcid => 0x00007804, ireadinglayout => 2, oemcp => 936, slist => ",", sscripts => "Hani Hans", sabbrevlangname => "CHS", sopentypelang => "ZHS" },
1453 { name => "zh-CN", lcid => 0x00000804, file => "zh_Hans_CN", sparent => "zh-Hans" },
1454 { name => "zh-CN_phoneb", lcid => 0x00050804, alias => "zh-CN" },
1455 { name => "zh-CN_stroke", lcid => 0x00020804, alias => "zh-CN" },
1456 { name => "zh-Hans", lcid => 0x00000004, group => 10 },
1457 { name => "zh-Hans-CN", alias => "zh-CN" },
1458 { name => "zh-Hans-CN-u-co-phonebk", alias => "zh-CN_phoneb" },
1459 { name => "zh-Hans-CN-u-co-stroke", alias => "zh-CN_stroke" },
1460 { name => "zh-Hans-HK", slist => ";" },
1461 { name => "zh-Hans-MO", slist => ";" },
1462 { name => "zh-Hans-SG", alias => "zh-SG" },
1463 { name => "zh-Hans-SG-u-co-phonebk", alias => "zh-SG_phoneb" },
1464 { name => "zh-Hans-SG-u-co-stroke", alias => "zh-SG_stroke" },
1465 { name => "zh-Hant", lcid => 0x00007c04, sortlocale => "zh-HK", ireadinglayout => 2, oemcp => 950, slist => ",", sscripts => "Hani Hant", group => 9, sabbrevlangname => "CHT", sopentypelang => "ZHH" },
1466 { name => "zh-Hant-HK", alias => "zh-HK" },
1467 { name => "zh-Hant-HK-u-co-unihan", alias => "zh-HK_radstr" },
1468 { name => "zh-Hant-MO", alias => "zh-MO" },
1469 { name => "zh-Hant-MO-u-co-stroke", alias => "zh-MO_stroke" },
1470 { name => "zh-Hant-MO-u-co-unihan", alias => "zh-MO_radstr" },
1471 { name => "zh-Hant-TW", alias => "zh-TW" },
1472 { name => "zh-Hant-TW-u-co-phonetic", alias => "zh-TW_pronun" },
1473 { name => "zh-Hant-TW-u-co-unihan", alias => "zh-TW_radstr" },
1474 { name => "zh-HK", lcid => 0x00000c04, file => "zh_Hant_HK", sparent => "zh-Hant", sabbrevlangname => "ZHH" },
1475 { name => "zh-HK_radstr", lcid => 0x00040c04, alias => "zh-HK" },
1476 { name => "zh-MO", lcid => 0x00001404, file => "zh_Hant_MO", sparent => "zh-Hant", sabbrevlangname => "ZHM", sopentypelang => "ZHT" },
1477 { name => "zh-MO_radstr", lcid => 0x00041404, alias => "zh-MO" },
1478 { name => "zh-MO_stroke", lcid => 0x00021404, alias => "zh-MO" },
1479 { name => "zh-SG", lcid => 0x00001004, file => "zh_Hans_SG", sparent => "zh-Hans", sabbrevlangname => "ZHI" },
1480 { name => "zh-SG_phoneb", lcid => 0x00051004, alias => "zh-SG" },
1481 { name => "zh-SG_stroke", lcid => 0x00021004, alias => "zh-SG" },
1482 { name => "zh-TW", lcid => 0x00000404, file => "zh_Hant_TW", sparent => "zh-Hant", sopentypelang => "ZHT" },
1483 { name => "zh-TW_pronun", lcid => 0x00030404, alias => "zh-TW" },
1484 { name => "zh-TW_radstr", lcid => 0x00040404, alias => "zh-TW" },
1485 { name => "zu", lcid => 0x00000035, oemcp => 850 },
1486 { name => "zu-ZA", lcid => 0x00000435 },
# Calendar table: one hash per calendar, identified by a numeric `id`
# (1 through 23 here).  The remaining keys (`name`, `type`, `locale`,
# `eras`, `itwodigityearmax`) are optional and differ from entry to entry.
# NOTE(review): `type` looks like a CLDR calendar type and `locale` the
# locale whose data is used for that calendar; `eras` presumably selects
# era indices and `itwodigityearmax` the two-digit-year cutoff -- confirm
# against the code that consumes @calendars.
1489 my @calendars =
1491 { id => 1, name => "Gregorian", itwodigityearmax => 2049 },
1492 { id => 2, type => "gregorian", locale => "en-US", itwodigityearmax => 2049 },
1493 { id => 3, type => "japanese", locale => "ja-JP", eras => [ 232..236 ] },
1494 { id => 4, type => "roc", locale => "zh-TW", eras => [ 1 ] },
1495 { id => 5, type => "dangi", locale => "ko-KR", eras => [ 0 ] },
1496 { id => 6, type => "islamic", locale => "ar-SA", itwodigityearmax => 1451 },
1497 { id => 7, type => "buddhist", locale => "th-TH", eras => [ 0 ] },
1498 { id => 8, type => "hebrew", locale => "he-IL", itwodigityearmax => 5810 },
1499 { id => 9, type => "gregorian", locale => "fr-FR", itwodigityearmax => 2049 },
1500 { id => 10, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1501 { id => 11, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1502 { id => 12, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1503 { id => 13, name => "Julian", locale => "en-US", itwodigityearmax => 2049 },
1504 { id => 14, name => "Japanese Lunisolar" },
1505 { id => 15, name => "Chinese Lunisolar" },
1506 { id => 16, name => "Saka" },
1507 { id => 17, name => "Lunar ETO Chinese" },
1508 { id => 18, name => "Lunar ETO Korean" },
1509 { id => 19, name => "Lunar ETO Rokuyou" },
1510 { id => 20, name => "Korean Lunisolar" },
1511 { id => 21, name => "Taiwan Lunisolar" },
1512 { id => 22, type => "persian", locale => "prs-AF", itwodigityearmax => 1429 },
1513 { id => 23, type => "islamic-umalqura", locale => "ar-SA", itwodigityearmax => 1451 },
1516 my @geoids =
1518 { id => 2, name => "AG" }, # Antigua and Barbuda
1519 { id => 3, name => "AF" }, # Afghanistan
1520 { id => 4, name => "DZ" }, # Algeria
1521 { id => 5, name => "AZ" }, # Azerbaijan
1522 { id => 6, name => "AL" }, # Albania
1523 { id => 7, name => "AM" }, # Armenia
1524 { id => 8, name => "AD" }, # Andorra
1525 { id => 9, name => "AO" }, # Angola
1526 { id => 10, name => "AS" }, # American Samoa
1527 { id => 11, name => "AR" }, # Argentina
1528 { id => 12, name => "AU" }, # Australia
1529 { id => 14, name => "AT" }, # Austria
1530 { id => 17, name => "BH" }, # Bahrain
1531 { id => 18, name => "BB" }, # Barbados
1532 { id => 19, name => "BW" }, # Botswana
1533 { id => 20, name => "BM" }, # Bermuda
1534 { id => 21, name => "BE" }, # Belgium
1535 { id => 22, name => "BS" }, # Bahamas, The
1536 { id => 23, name => "BD" }, # Bangladesh
1537 { id => 24, name => "BZ" }, # Belize
1538 { id => 25, name => "BA" }, # Bosnia and Herzegovina
1539 { id => 26, name => "BO" }, # Bolivia
1540 { id => 27, name => "MM" }, # Myanmar
1541 { id => 28, name => "BJ" }, # Benin
1542 { id => 29, name => "BY" }, # Belarus
1543 { id => 30, name => "SB" }, # Solomon Islands
1544 { id => 32, name => "BR" }, # Brazil
1545 { id => 34, name => "BT" }, # Bhutan
1546 { id => 35, name => "BG" }, # Bulgaria
1547 { id => 37, name => "BN" }, # Brunei
1548 { id => 38, name => "BI" }, # Burundi
1549 { id => 39, name => "CA" }, # Canada
1550 { id => 40, name => "KH" }, # Cambodia
1551 { id => 41, name => "TD" }, # Chad
1552 { id => 42, name => "LK" }, # Sri Lanka
1553 { id => 43, name => "CG" }, # Congo
1554 { id => 44, name => "CD" }, # Congo (DRC)
1555 { id => 45, name => "CN" }, # China
1556 { id => 46, name => "CL" }, # Chile
1557 { id => 49, name => "CM" }, # Cameroon
1558 { id => 50, name => "KM" }, # Comoros
1559 { id => 51, name => "CO" }, # Colombia
1560 { id => 54, name => "CR" }, # Costa Rica
1561 { id => 55, name => "CF" }, # Central African Republic
1562 { id => 56, name => "CU" }, # Cuba
1563 { id => 57, name => "CV" }, # Cape Verde
1564 { id => 59, name => "CY" }, # Cyprus
1565 { id => 61, name => "DK" }, # Denmark
1566 { id => 62, name => "DJ" }, # Djibouti
1567 { id => 63, name => "DM" }, # Dominica
1568 { id => 65, name => "DO" }, # Dominican Republic
1569 { id => 66, name => "EC" }, # Ecuador
1570 { id => 67, name => "EG" }, # Egypt
1571 { id => 68, name => "IE" }, # Ireland
1572 { id => 69, name => "GQ" }, # Equatorial Guinea
1573 { id => 70, name => "EE" }, # Estonia
1574 { id => 71, name => "ER" }, # Eritrea
1575 { id => 72, name => "SV" }, # El Salvador
1576 { id => 73, name => "ET" }, # Ethiopia
1577 { id => 75, name => "CZ" }, # Czech Republic
1578 { id => 77, name => "FI" }, # Finland
1579 { id => 78, name => "FJ" }, # Fiji Islands
1580 { id => 80, name => "FM" }, # Micronesia
1581 { id => 81, name => "FO" }, # Faroe Islands
1582 { id => 84, name => "FR" }, # France
1583 { id => 86, name => "GM" }, # Gambia, The
1584 { id => 87, name => "GA" }, # Gabon
1585 { id => 88, name => "GE" }, # Georgia
1586 { id => 89, name => "GH" }, # Ghana
1587 { id => 90, name => "GI" }, # Gibraltar
1588 { id => 91, name => "GD" }, # Grenada
1589 { id => 93, name => "GL" }, # Greenland
1590 { id => 94, name => "DE" }, # Germany
1591 { id => 98, name => "GR" }, # Greece
1592 { id => 99, name => "GT" }, # Guatemala
1593 { id => 100, name => "GN" }, # Guinea
1594 { id => 101, name => "GY" }, # Guyana
1595 { id => 103, name => "HT" }, # Haiti
1596 { id => 104, name => "HK" }, # Hong Kong S.A.R.
1597 { id => 106, name => "HN" }, # Honduras
1598 { id => 108, name => "HR" }, # Croatia
1599 { id => 109, name => "HU" }, # Hungary
1600 { id => 110, name => "IS" }, # Iceland
1601 { id => 111, name => "ID" }, # Indonesia
1602 { id => 113, name => "IN" }, # India
1603 { id => 114, name => "IO" }, # British Indian Ocean Territory
1604 { id => 116, name => "IR" }, # Iran
1605 { id => 117, name => "IL" }, # Israel
1606 { id => 118, name => "IT" }, # Italy
1607 { id => 119, name => "CI" }, # Côte d'Ivoire
1608 { id => 121, name => "IQ" }, # Iraq
1609 { id => 122, name => "JP" }, # Japan
1610 { id => 124, name => "JM" }, # Jamaica
1611 { id => 125, name => "SJ" }, # Jan Mayen
1612 { id => 126, name => "JO" }, # Jordan
1613 { id => 127, parent => "UM" }, # Johnston Atoll
1614 { id => 129, name => "KE" }, # Kenya
1615 { id => 130, name => "KG" }, # Kyrgyzstan
1616 { id => 131, name => "KP" }, # North Korea
1617 { id => 133, name => "KI" }, # Kiribati
1618 { id => 134, name => "KR" }, # Korea
1619 { id => 136, name => "KW" }, # Kuwait
1620 { id => 137, name => "KZ" }, # Kazakhstan
1621 { id => 138, name => "LA" }, # Laos
1622 { id => 139, name => "LB" }, # Lebanon
1623 { id => 140, name => "LV" }, # Latvia
1624 { id => 141, name => "LT" }, # Lithuania
1625 { id => 142, name => "LR" }, # Liberia
1626 { id => 143, name => "SK" }, # Slovakia
1627 { id => 145, name => "LI" }, # Liechtenstein
1628 { id => 146, name => "LS" }, # Lesotho
1629 { id => 147, name => "LU" }, # Luxembourg
1630 { id => 148, name => "LY" }, # Libya
1631 { id => 149, name => "MG" }, # Madagascar
1632 { id => 151, name => "MO" }, # Macao S.A.R.
1633 { id => 152, name => "MD" }, # Moldova
1634 { id => 154, name => "MN" }, # Mongolia
1635 { id => 156, name => "MW" }, # Malawi
1636 { id => 157, name => "ML" }, # Mali
1637 { id => 158, name => "MC" }, # Monaco
1638 { id => 159, name => "MA" }, # Morocco
1639 { id => 160, name => "MU" }, # Mauritius
1640 { id => 162, name => "MR" }, # Mauritania
1641 { id => 163, name => "MT" }, # Malta
1642 { id => 164, name => "OM" }, # Oman
1643 { id => 165, name => "MV" }, # Maldives
1644 { id => 166, name => "MX" }, # Mexico
1645 { id => 167, name => "MY" }, # Malaysia
1646 { id => 168, name => "MZ" }, # Mozambique
1647 { id => 173, name => "NE" }, # Niger
1648 { id => 174, name => "VU" }, # Vanuatu
1649 { id => 175, name => "NG" }, # Nigeria
1650 { id => 176, name => "NL" }, # Netherlands
1651 { id => 177, name => "NO" }, # Norway
1652 { id => 178, name => "NP" }, # Nepal
1653 { id => 180, name => "NR" }, # Nauru
1654 { id => 181, name => "SR" }, # Suriname
1655 { id => 182, name => "NI" }, # Nicaragua
1656 { id => 183, name => "NZ" }, # New Zealand
1657 { id => 184, name => "PS" }, # Palestinian Authority
1658 { id => 185, name => "PY" }, # Paraguay
1659 { id => 187, name => "PE" }, # Peru
1660 { id => 190, name => "PK" }, # Pakistan
1661 { id => 191, name => "PL" }, # Poland
1662 { id => 192, name => "PA" }, # Panama
1663 { id => 193, name => "PT" }, # Portugal
1664 { id => 194, name => "PG" }, # Papua New Guinea
1665 { id => 195, name => "PW" }, # Palau
1666 { id => 196, name => "GW" }, # Guinea-Bissau
1667 { id => 197, name => "QA" }, # Qatar
1668 { id => 198, name => "RE" }, # Reunion
1669 { id => 199, name => "MH" }, # Marshall Islands
1670 { id => 200, name => "RO" }, # Romania
1671 { id => 201, name => "PH" }, # Philippines
1672 { id => 202, name => "PR" }, # Puerto Rico
1673 { id => 203, name => "RU" }, # Russia
1674 { id => 204, name => "RW" }, # Rwanda
1675 { id => 205, name => "SA" }, # Saudi Arabia
1676 { id => 206, name => "PM" }, # St. Pierre and Miquelon
1677 { id => 207, name => "KN" }, # St. Kitts and Nevis
1678 { id => 208, name => "SC" }, # Seychelles
1679 { id => 209, name => "ZA" }, # South Africa
1680 { id => 210, name => "SN" }, # Senegal
1681 { id => 212, name => "SI" }, # Slovenia
1682 { id => 213, name => "SL" }, # Sierra Leone
1683 { id => 214, name => "SM" }, # San Marino
1684 { id => 215, name => "SG" }, # Singapore
1685 { id => 216, name => "SO" }, # Somalia
1686 { id => 217, name => "ES" }, # Spain
1687 { id => 218, name => "LC" }, # St. Lucia
1688 { id => 219, name => "SD" }, # Sudan
1689 { id => 220, name => "SJ" }, # Svalbard
1690 { id => 221, name => "SE" }, # Sweden
1691 { id => 222, name => "SY" }, # Syria
1692 { id => 223, name => "CH" }, # Switzerland
1693 { id => 224, name => "AE" }, # United Arab Emirates
1694 { id => 225, name => "TT" }, # Trinidad and Tobago
1695 { id => 227, name => "TH" }, # Thailand
1696 { id => 228, name => "TJ" }, # Tajikistan
1697 { id => 231, name => "TO" }, # Tonga
1698 { id => 232, name => "TG" }, # Togo
1699 { id => 233, name => "ST" }, # São Tomé and Príncipe
1700 { id => 234, name => "TN" }, # Tunisia
1701 { id => 235, name => "TR" }, # Turkey
1702 { id => 236, name => "TV" }, # Tuvalu
1703 { id => 237, name => "TW" }, # Taiwan
1704 { id => 238, name => "TM" }, # Turkmenistan
1705 { id => 239, name => "TZ" }, # Tanzania
1706 { id => 240, name => "UG" }, # Uganda
1707 { id => 241, name => "UA" }, # Ukraine
1708 { id => 242, name => "GB" }, # United Kingdom
1709 { id => 244, name => "US" }, # United States
1710 { id => 245, name => "BF" }, # Burkina Faso
1711 { id => 246, name => "UY" }, # Uruguay
1712 { id => 247, name => "UZ" }, # Uzbekistan
1713 { id => 248, name => "VC" }, # St. Vincent and the Grenadines
1714 { id => 249, name => "VE" }, # Bolivarian Republic of Venezuela
1715 { id => 251, name => "VN" }, # Vietnam
1716 { id => 252, name => "VI" }, # Virgin Islands
1717 { id => 253, name => "VA" }, # Vatican City
1718 { id => 254, name => "NA" }, # Namibia
1719 { id => 257, name => "EH" }, # Western Sahara (disputed)
1720 { id => 258, parent => "UM" }, # Wake Island
1721 { id => 259, name => "WS" }, # Samoa
1722 { id => 260, name => "SZ" }, # Swaziland
1723 { id => 261, name => "YE" }, # Yemen
1724 { id => 263, name => "ZM" }, # Zambia
1725 { id => 264, name => "ZW" }, # Zimbabwe
1726 { id => 269, name => "CS" }, # Serbia and Montenegro (Former)
1727 { id => 270, name => "ME" }, # Montenegro
1728 { id => 271, name => "RS" }, # Serbia
1729 { id => 273, name => "CW" }, # Curaçao
1730 { id => 276, name => "SS" }, # South Sudan
1731 { id => 300, name => "AI" }, # Anguilla
1732 { id => 301, name => "AQ" }, # Antarctica
1733 { id => 302, name => "AW" }, # Aruba
1734 { id => 303, parent => "SH" }, # Ascension Island
1735 { id => 304, parent => "053" }, # Ashmore and Cartier Islands
1736 { id => 305, parent => "UM" }, # Baker Island
1737 { id => 306, name => "BV" }, # Bouvet Island
1738 { id => 307, name => "KY" }, # Cayman Islands
1739 { id => 308, name => "830", parent => "155" }, # Channel Islands
1740 { id => 309, name => "CX" }, # Christmas Island
1741 { id => 310, parent => "009" }, # Clipperton Island
1742 { id => 311, name => "CC" }, # Cocos (Keeling) Islands
1743 { id => 312, name => "CK" }, # Cook Islands
1744 { id => 313, parent => "053" }, # Coral Sea Islands
1745 { id => 314, parent => "IO" }, # Diego Garcia
1746 { id => 315, name => "FK" }, # Falkland Islands (Islas Malvinas)
1747 { id => 317, name => "GF" }, # French Guiana
1748 { id => 318, name => "PF" }, # French Polynesia
1749 { id => 319, name => "TF" }, # French Southern and Antarctic Lands
1750 { id => 321, name => "GP" }, # Guadeloupe
1751 { id => 322, name => "GU" }, # Guam
1752 { id => 323 }, # Guantanamo Bay
1753 { id => 324, name => "GG" }, # Guernsey
1754 { id => 325, name => "HM" }, # Heard Island and McDonald Islands
1755 { id => 326, parent => "UM" }, # Howland Island
1756 { id => 327, parent => "UM" }, # Jarvis Island
1757 { id => 328, name => "JE" }, # Jersey
1758 { id => 329, parent => "UM" }, # Kingman Reef
1759 { id => 330, name => "MQ" }, # Martinique
1760 { id => 331, name => "YT" }, # Mayotte
1761 { id => 332, name => "MS" }, # Montserrat
1762 { id => 333, name => "AN", region => 1 }, # Netherlands Antilles (Former)
1763 { id => 334, name => "NC" }, # New Caledonia
1764 { id => 335, name => "NU" }, # Niue
1765 { id => 336, name => "NF" }, # Norfolk Island
1766 { id => 337, name => "MP" }, # Northern Mariana Islands
1767 { id => 338, parent => "UM" }, # Palmyra Atoll
1768 { id => 339, name => "PN" }, # Pitcairn Islands
1769 { id => 340, parent => "MP" }, # Rota Island
1770 { id => 341, parent => "MP" }, # Saipan
1771 { id => 342, name => "GS" }, # South Georgia and the South Sandwich Islands
1772 { id => 343, name => "SH" }, # St. Helena
1773 { id => 346, parent => "MP" }, # Tinian Island
1774 { id => 347, name => "TK" }, # Tokelau
1775 { id => 348, parent => "SH" }, # Tristan da Cunha
1776 { id => 349, name => "TC" }, # Turks and Caicos Islands
1777 { id => 351, name => "VG" }, # Virgin Islands, British
1778 { id => 352, name => "WF" }, # Wallis and Futuna
1779 { id => 742, name => "002" }, # Africa
1780 { id => 2129, name => "142" }, # Asia
1781 { id => 10541, name => "150" }, # Europe
1782 { id => 15126, name => "IM" }, # Man, Isle of
1783 { id => 19618, name => "MK" }, # Macedonia, Former Yugoslav Republic of
1784 { id => 20900, name => "054" }, # Melanesia
1785 { id => 21206, name => "057" }, # Micronesia
1786 { id => 21242, parent => "UM" }, # Midway Islands
1787 { id => 23581, name => "021" }, # Northern America
1788 { id => 26286, name => "061" }, # Polynesia
1789 { id => 27082, name => "013" }, # Central America
1790 { id => 27114, name => "009" }, # Oceania
1791 { id => 30967, name => "SX" }, # Sint Maarten (Dutch part)
1792 { id => 31396, name => "005" }, # South America
1793 { id => 31706, name => "MF" }, # Saint Martin (French part)
1794 { id => 39070, name => "001" }, # World
1795 { id => 42483, name => "011" }, # Western Africa
1796 { id => 42484, name => "017" }, # Middle Africa
1797 { id => 42487, name => "015" }, # Northern Africa
1798 { id => 47590, name => "143" }, # Central Asia
1799 { id => 47599, name => "035" }, # South-Eastern Asia
1800 { id => 47600, name => "030" }, # Eastern Asia
1801 { id => 47603, name => "014" }, # Eastern Africa
1802 { id => 47609, name => "151" }, # Eastern Europe
1803 { id => 47610, name => "039" }, # Southern Europe
1804 { id => 47611, name => "145" }, # Middle East
1805 { id => 47614, name => "034" }, # Southern Asia
1806 { id => 7299303, name => "TL" }, # Democratic Republic of Timor-Leste
1807 { id => 9914689, name => "XK" }, # Kosovo
1808 { id => 10026358, name => "019" }, # Americas
1809 { id => 10028789, name => "AX" }, # Åland Islands
1810 { id => 10039880, name => "029", sintlsymbol => "XCD" }, # Caribbean
1811 { id => 10039882, name => "154" }, # Northern Europe
1812 { id => 10039883, name => "018" }, # Southern Africa
1813 { id => 10210824, name => "155" }, # Western Europe
1814 { id => 10210825, name => "053" }, # Australia and New Zealand
1815 { id => 161832015, name => "BL" }, # Saint Barthélemy
1816 { id => 161832256, name => "UM" }, # U.S. Minor Outlying Islands
1817 { id => 161832257, name => "419", parent => "019" }, # Latin America and the Caribbean
1818 { id => 161832258, name => "BQ" }, # Bonaire, Sint Eustatius and Saba
# File-level tables.  Those filled by load_data() are indexed by Unicode
# code point.
# NOTE(review): @cp2uni, @glyph2uni, @lead_bytes, @uni2cp, $default_char and
# $default_wchar are not written anywhere in this part of the file; they are
# presumably the per-codepage mapping state -- confirm against their users.
1821 my @cp2uni = ();
1822 my @glyph2uni = ();
1823 my @lead_bytes = ();
1824 my @uni2cp = ();
# simple case mappings taken from the lower/upper fields of UnicodeData.txt
1825 my @tolower_table = ();
1826 my @toupper_table = ();
# ord() of the digit field of UnicodeData.txt
1827 my @digitmap_table = ();
# value from %categories, or'ed with flags from %ctype
1828 my @category_table = ();
# value from %joining_types as first assigned by load_data()
1829 my @initial_joining_table = ();
# bidi class name as found in UnicodeData.txt
1830 my @direction_table = ();
# untagged decompositions of one or two code points (array refs)
1831 my @decomp_table = ();
# combining class from UnicodeData.txt, 0x100 for category Co
1832 my @combining_class_table = ();
# same as @decomp_table plus the "<tag> ..." decompositions
1833 my @decomp_compat_table = ();
# set to 1 for code points listed in CompositionExclusions.txt
1834 my @comp_exclusions = ();
# copy of @decomp_compat_table overridden by IdnaMappingTable.txt
1835 my @idna_decomp_table = ();
# set to 1 for code points marked disallowed in IdnaMappingTable.txt
1836 my @idna_disallowed = ();
# key => [ default value ], see add_registry_key()
1837 my %registry_keys;
1838 my $default_char;
1839 my $default_wchar;
# For each joining form, an array indexed by the base code point that holds
# the code point whose "<form> base" decomposition names that base.
1841 my %joining_forms =
1843 "isolated" => [],
1844 "final" => [],
1845 "initial" => [],
1846 "medial" => []
################################################################
# convert a list of code points to a list of UTF-16 code units
#
# Code points below 0x10000 are passed through unchanged, anything
# above is replaced by its high/low surrogate pair.
sub to_utf16(@)
{
    my @units = map
    {
        my $offset = $_ - 0x10000;  # negative for characters that fit in one unit
        $offset < 0 ? $_ : (0xd800 | ($offset >> 10), 0xdc00 | ($offset & 0x3ff));
    } @_;
    return @units;
}
################################################################
# download a URL into the cache unless the file is already there
#
# The data is first written to "$dest.part" and only renamed to $dest
# once wget has succeeded, so that a failed or interrupted download
# never leaves a truncated file that later runs would mistake for a
# valid cache entry.  Dies if the directory cannot be created or the
# download fails.
sub _fetch_cached_file
{
    my ($url, $dest) = @_;
    return if -f $dest;

    (my $dir = $dest) =~ s/\/[^\/]+$//;
    system( "mkdir", "-p", $dir ) == 0 or die "cannot create directory $dir";
    print "Fetching $url...\n";
    my $tmp = "$dest.part";
    unless (system( "wget", "-q", "-O", $tmp, $url ) == 0)
    {
        unlink $tmp;  # wget -O creates the file even when the transfer fails
        die "cannot fetch $url";
    }
    rename $tmp, $dest or die "cannot rename $tmp to $dest: $!";
}

################################################################
# fetch a unicode.org file and open it
#
# $base is either the URL of a .zip archive, in which case $name is the
# archive member to extract, or the URL of a directory that contains $name.
# Downloads are cached in $XDG_CACHE_HOME/wine (or ~/.cache/wine); files
# coming from a URL that contains the Unicode version get "-$UNIVERSION"
# added to their cached name.
# Returns a file handle (glob) open for reading; dies on failure.
sub open_data_file($$)
{
    my ($base, $name) = @_;
    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
    my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
    local *FILE;

    if ($base =~ /.*\/([^\/]+)\.zip$/)
    {
        my $zip = "$1$suffix.zip";
        _fetch_cached_file( $base, "$cache/$zip" );
        # only a failure to start unzip is detected here, extraction errors
        # show up when the pipe is closed
        open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
    }
    else
    {
        (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
        _fetch_cached_file( "$base/$name", $dest );
        open FILE, "<", $dest or die "cannot open $dest: $!";
    }
    return *FILE;
}
################################################################
# load a unicode.org file as XML data
#
# $base and $name are passed on to open_data_file(); the content is
# parsed with XML::LibXML and the resulting document is returned.
sub load_xml_data_file($$)
{
    my ($base, $name) = @_;
    my $FILE = open_data_file( $base, $name );
    my $xml = XML::LibXML->load_xml( IO => $FILE );
    # close the handle we were given: the bareword FILE is a different,
    # never opened handle once open_data_file() has returned
    close $FILE;
    return $xml;
}
################################################################
# recursively get the decomposition for a character
#
# $table is a reference to an array of decompositions (array refs)
# indexed by code point.  Returns the list of code points that remain
# once every element has itself been decomposed; a character without
# an entry in the table is returned as is.
sub get_decomposition($$);
sub get_decomposition($$)
{
    my ($char, $table) = @_;
    my $parts = $table->[$char];

    return $char unless defined $parts;

    my @expanded = map { get_decomposition( $_, $table ); } @{$parts};
    return @expanded;
}
################################################################
# get the composition that results in a given character
#
# Returns the elements of the decomposition of $ch when that sequence
# may be composed back into $ch, and an empty list otherwise.
# $compat selects additional restrictions: 1 also checks
# @decomp_compat_table, 2 checks @idna_decomp_table.
sub get_composition($$)
{
    my ($ch, $compat) = @_;

    my $decomp = $decomp_table[$ch];
    return () unless defined $decomp;                  # no decomposition
    my @ret = @{$decomp};
    return () if @ret < 2;                             # singleton decomposition
    return () if $comp_exclusions[$ch];                # composition exclusion
    return () if $combining_class_table[$ch];          # non-starter

    my ($first, $second) = @ret;
    return () if $combining_class_table[$first];       # first char is non-starter

    my $first_is_final = !defined $decomp_table[$first];
    if ($compat == 1)
    {
        # first char has compat decomposition
        return () if $first_is_final && defined $decomp_compat_table[$first];
    }
    if ($compat == 2)
    {
        my $idna_first = $idna_decomp_table[$first];
        if (defined $idna_first)
        {
            # first char has IDNA decomposition
            return () if $first_is_final;
            # first char's decomposition has IDNA decomposition
            return () if defined $idna_decomp_table[$idna_first->[0]];
        }
        # second char has IDNA decomposition
        return () if defined $idna_decomp_table[$second];
    }
    return @ret;
}
################################################################
# recursively build decompositions
#
# Takes a decomposition table (list of array refs indexed by code point)
# and returns a table with the same indices where every entry is the
# fully expanded decomposition, converted to UTF-16 code units.
sub build_decompositions(@)
{
    my @src = @_;
    my @dst;

    foreach my $i (grep { defined $src[$_]; } 0 .. $#src)
    {
        $dst[$i] = [ to_utf16( get_decomposition( $i, \@src )) ];
    }
    return @dst;
}
################################################################
# compose Hangul sequences
#
# Takes a list of code points and returns it with every leading
# consonant + vowel pair replaced by the LV syllable, and every LV
# syllable + trailing consonant pair replaced by the LVT syllable.
sub compose_hangul(@)
{
    my ($SBASE, $LBASE, $VBASE, $TBASE) = (0xac00, 0x1100, 0x1161, 0x11a7);
    my ($LCOUNT, $VCOUNT, $TCOUNT) = (19, 21, 28);
    my $SCOUNT = $LCOUNT * $VCOUNT * $TCOUNT;

    my @input = @_;
    my @output;
    my $pos = 0;

    while ($pos < @input)
    {
        my $cur = $input[$pos++];  # $pos now points at the following char, if any

        # leading consonant followed by a vowel
        if ($pos < @input && $cur >= $LBASE && $cur < $LBASE + $LCOUNT)
        {
            my $vowel = $input[$pos];
            if ($vowel >= $VBASE && $vowel < $VBASE + $VCOUNT)
            {
                $cur = $SBASE + (($cur - $LBASE) * $VCOUNT + ($vowel - $VBASE)) * $TCOUNT;
                $pos++;
            }
        }

        # syllable without trailing consonant followed by one
        # ($TBASE itself is excluded, it stands for "no trailing consonant")
        if ($pos < @input && $cur >= $SBASE && $cur < $SBASE + $SCOUNT && ($cur - $SBASE) % $TCOUNT == 0)
        {
            my $trail = $input[$pos];
            if ($trail > $TBASE && $trail < $TBASE + $TCOUNT)
            {
                $cur += $trail - $TBASE;
                $pos++;
            }
        }

        push @output, $cur;
    }
    return @output;
}
################################################################
# remove linguistic-only mappings from the case table
#
# $upper and $lower are references to the uppercase and lowercase
# tables (indexed by code point); both are modified in place.  An entry
# is reset to undef when mapping its value through the other table does
# not lead back to the original code point.
sub remove_linguistic_mappings($$)
{
    my ($upper, $lower) = @_;

    # remove case mappings that don't round-trip
    my $prune = sub
    {
        my ($table, $inverse) = @_;
        foreach my $i (0 .. $#{$table})
        {
            my $mapped = $table->[$i];
            next unless defined $mapped;
            my $back = $inverse->[$mapped];
            $table->[$i] = undef unless defined $back && $back == $i;
        }
    };

    $prune->( $upper, $lower );
    $prune->( $lower, $upper );  # checked against the already pruned uppercase table
}
2026 ################################################################
2027 # read in the Unicode database files
#
# Takes no arguments.  Reads UnicodeData.txt and CompositionExclusions.txt
# from $UNIDATA and IdnaMappingTable.txt from $IDNADATA (all through
# open_data_file) and fills the file-level tables @category_table,
# @direction_table, @initial_joining_table, @tolower_table, @toupper_table,
# @digitmap_table, @combining_class_table, @decomp_table,
# @decomp_compat_table, %joining_forms, @comp_exclusions,
# @idna_decomp_table and @idna_disallowed.
# Dies when UnicodeData.txt uses a category or a bidi class that is not a
# key of %categories / %directions.
2028 sub load_data()
# code point of the last "<..., First>" range entry seen
2030 my $start;
2032 # now build mappings from the decomposition field of the Unicode database
2034 my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
2035 while (<$UNICODE_DATA>)
2037 # Decode the fields ...
# (the line is not chomped, so the last field keeps its newline; $title is
# not used below)
2038 my ($code, $name, $cat, $comb, $bidi,
2039 $decomp, $dec, $dig, $num, $mirror,
2040 $oldname, $comment, $upper, $lower, $title) = split /;/;
2041 my $src = hex $code;
2043 die "unknown category $cat" unless defined $categories{$cat};
2044 die "unknown directionality $bidi" unless defined $directions{$bidi};
2046 $category_table[$src] = $categories{$cat};
2047 $direction_table[$src] = $bidi;
# categories Mn, Me and Cf start with joining type T, everything else with U
2048 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
2050 $initial_joining_table[$src] = $joining_types{"T"};
2052 else
2054 $initial_joining_table[$src] = $joining_types{"U"};
2057 if ($lower ne "")
2059 $tolower_table[$src] = hex $lower;
2061 if ($upper ne "")
2063 $toupper_table[$src] = hex $upper;
2065 if ($dec ne "")
2067 $category_table[$src] |= $ctype{"digit"};
2069 if ($dig ne "")
2071 $digitmap_table[$src] = ord $dig;
# category Co gets 0x100 instead of the combining class from the file
2073 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
# additional flags derived from the bidi class and the character name
2075 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
# NOTE(review): in the pattern below the alternation splits the whole
# regex, so ^ only applies to COMBINING and \W only to MODIFIER LETTER --
# confirm this is intended before touching it
2076 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
2077 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
2078 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
2079 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
2080 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
2081 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
2082 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
2083 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
2084 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
2085 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
2086 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
2088 # copy the category and direction for everything between First/Last pairs
2089 if ($name =~ /, First>/) { $start = $src; }
2090 if ($name =~ /, Last>/)
# the Last entry itself has already been set above, so stop just before it
2092 while ($start < $src)
2094 $category_table[$start] = $category_table[$src];
2095 $direction_table[$start] = $direction_table[$src];
2096 $combining_class_table[$start] = $combining_class_table[$src];
2097 $start++;
2101 next if $decomp eq ""; # no decomposition, skip it
# every "<tag> xxxx ..." decomposition goes to the compat table, as the list
# of code points that follow the tag
2103 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
2105 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
2106 $decomp_compat_table[$src] = \@seq;
2109 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
2111 # decomposition of the form "<foo> 1234" -> use char if type is known
2112 if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
# remember $src as the given joining form of the base character $2
2114 ${joining_forms{$1}}[hex $2] = $src;
2117 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
2119 # decomposition "<compat> 0020 1234" -> combining accent
# (nothing more is stored for this case)
2121 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
2123 # store decomposition
# only untagged decompositions of exactly one or two code points match the
# patterns below and get stored, in both tables
2124 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
2126 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
2128 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
2130 # Single char decomposition
2131 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
2135 close $UNICODE_DATA;
2137 # patch the category of some special characters
# a character with a decomposition inherits the category flags of the first
# element of that decomposition
2139 for (my $i = 0; $i < @decomp_table; $i++)
2141 next unless defined $decomp_table[$i];
2142 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
# %special_categories maps a %ctype flag name to the code points that get it
2144 foreach my $cat (keys %special_categories)
2146 my $flag = $ctype{$cat};
2147 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
# a character whose compat decomposition has exactly two elements inherits
# the diacritic flag of the second one
2149 for (my $i = 0; $i < @decomp_compat_table; $i++)
2151 next unless defined $decomp_compat_table[$i];
2152 next unless @{$decomp_compat_table[$i]} == 2;
2153 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
2156 # load the composition exclusions
2158 my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
2159 while (<$EXCL>)
2161 s/\#.*//; # remove comments
# either a "xxxx..yyyy" range or a single code point per line
2162 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
2164 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
2166 elsif (/^([0-9a-fA-F]+)\s*$/)
2168 $comp_exclusions[hex $1] = 1;
2171 close $EXCL;
2173 # load the IDNA mappings
# start from a copy of the compat table (the array refs are shared) and
# override it with the entries of the mapping table
2175 @idna_decomp_table = @decomp_compat_table;
2176 my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
2177 while (<$IDNA>)
2179 s/\#.*//; # remove comments
2180 next if /^\s*$/;
2181 my ($char, $type, $mapping) = split /;/;
# $ch1..$ch2 is the range of code points the line applies to
2182 my ($ch1, $ch2);
2183 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
2185 $ch1 = hex $1;
2186 $ch2 = hex $2;
2188 elsif ($char =~ /([0-9a-fA-F]+)/)
2190 $ch1 = $ch2 = hex $1;
2193 if ($type =~ /mapped/ || $type =~ /deviation/)
2195 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
2196 my @seq = map { hex $_; } split /\s+/, $mapping;
# all code points of the range share the same array; an empty mapping is
# stored as [ 0 ]
2197 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
# valid code points keep whatever was copied from the compat table
2199 elsif ($type =~ /valid/)
2202 elsif ($type =~ /ignored/)
2204 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
2206 elsif ($type =~ /disallowed/)
2208 foreach my $i ($ch1 .. $ch2)
2210 $idna_decomp_table[$i] = undef;
2211 $idna_disallowed[$i] = 1;
2215 close $IDNA;
################################################################
# register a registry key unless it is already known
#
# $key    - key path used as index into %registry_keys
# $defval - default value stored as the first element of a new entry
#
# An existing (defined) entry is left untouched.
sub add_registry_key($$)
{
    my ($path, $default) = @_;

    if (!defined $registry_keys{$path})
    {
        # a new entry starts as a one-element list holding the default value
        $registry_keys{$path} = [ $default ];
    }
}
################################################################
# append a string value to a registry key
#
# The key is created first (with an undefined default value) when it
# does not exist yet; the value is then appended to the key's list as
# a "'name' = s 'value'" line.
sub add_registry_value($$$)
{
    my ($path, $valname, $data) = @_;

    add_registry_key( $path, undef );

    my $entry = "'$valname' = s '$data'";
    push @{$registry_keys{$path}}, $entry;
}
################################################################
# mark a byte value as a DBCS lead byte
#
# Nothing happens when @cp2uni already has a defined entry for the
# byte; otherwise the byte is appended to @lead_bytes and its @cp2uni
# slot is set to 0.
sub add_lead_byte($)
{
    my ($byte) = @_;

    unless (defined $cp2uni[$byte])
    {
        push @lead_bytes, $byte;
        $cp2uni[$byte] = 0;
    }
}
################################################################
# record a codepage char <-> Unicode char mapping
#
# $cp  - codepage character value
# $uni - Unicode character value
#
# Each direction is only filled in when it has no defined entry yet,
# so earlier calls take precedence over later ones.
sub add_mapping($$)
{
    my ($codepoint, $unichar) = @_;

    if (!defined $cp2uni[$codepoint]) { $cp2uni[$codepoint] = $unichar; }
    if (!defined $uni2cp[$unichar]) { $uni2cp[$unichar] = $codepoint; }

    # values above 0xff are double-byte: register their high byte as lead byte
    add_lead_byte( $codepoint >> 8 ) if $codepoint > 0xff;
}
################################################################
# return a copy of a mapping with the glyph chars patched in
# (used for MB_USEGLYPHCHARS)
#
# Every index that has a defined entry in @glyph2uni is overridden by
# that entry; all other elements of the argument list are returned
# unchanged.
sub get_glyphs_mapping(@)
{
    my @mapping = @_;

    foreach my $idx (0 .. $#glyph2uni)
    {
        next unless defined $glyph2uni[$idx];
        $mapping[$idx] = $glyph2uni[$idx];
    }
    return @mapping;
}
################################################################
# build the EUC-JP table from the JIS 0208/0212 files and output it
# as codepage 20932
#
# The order of the steps matters: add_mapping() never overrides an
# existing entry, so the fixed mappings registered first take
# precedence over what is read from the data files.
sub dump_eucjp_codepage()
{
    # reset the shared mapping state
    $default_char = $DEF_CHAR;
    $default_wchar = 0x30fb;
    @cp2uni = ();
    @uni2cp = ();
    @glyph2uni = ();
    @lead_bytes = ();

    # ASCII chars
    add_mapping( $_, $_ ) for 0x00 .. 0x7f;

    # lead bytes
    add_lead_byte( $_ ) for 0x8e, 0xa1 .. 0xfe;

    # JIS X 0201 right plane
    add_mapping( 0x8e00 + $_, 0xfec0 + $_ ) for 0xa1 .. 0xdf;

    # undefined chars
    $cp2uni[$_] = $_ for 0x80 .. 0x8d, 0x8f .. 0x9f;
    $cp2uni[0xa0] = 0xf8f0;
    $cp2uni[0xff] = 0xf8f3;

    # Fix backslash conversion
    add_mapping( 0xa1c0, 0xff3c );

    # Add private mappings for rows undefined in JIS 0208/0212:
    # all rows with trail bytes 0xa1..0xfe first, then all rows with
    # trail bytes 0x21..0x7e, consuming consecutive private-use chars
    my $private = 0xe000;
    foreach my $trail ([ 0xa1, 0xfe ], [ 0x21, 0x7e ])
    {
        my ($lo_first, $lo_last) = @{$trail};
        foreach my $hi (0xf5 .. 0xfe)
        {
            foreach my $lo ($lo_first .. $lo_last)
            {
                add_mapping( ($hi << 8) + $lo, $private++ );
            }
        }
    }

    # JIS 0208: second column is the JIS code, third the Unicode char
    my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
    while (my $line = <$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if $line =~ /^\#/ || $line =~ /^$/ || $line =~ /\x1a/;
        die "Unrecognized line $line\n"
            unless $line =~ /^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex($1), hex($2) );
    }
    close $INPUT;

    # JIS 0212: first column is the JIS code, second the Unicode char
    $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
    while (my $line = <$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if $line =~ /^\#/ || $line =~ /^$/ || $line =~ /\x1a/;
        die "Unrecognized line $line\n"
            unless $line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8000 + hex($1), hex($2) );
    }
    close $INPUT;

    output_codepage_file( 20932 );
}
################################################################
# build the Korean Wansung table from the KSX1001 file and output it
# as codepage 20949
#
# The argument list is a Unicode -> cp949 mapping (indexed by Unicode
# char) that supplies some extra mappings.
sub dump_krwansung_codepage(@)
{
    my @cp949 = @_;

    # reset the shared mapping state
    $default_char = 0x3f;
    $default_wchar = 0x003f;
    @cp2uni = ();
    @uni2cp = ();
    @glyph2uni = ();
    @lead_bytes = ();

    # ASCII and undefined chars
    add_mapping( $_, $_ ) for 0x00 .. 0x9f;
    my @fixed = ( [ 0xa0, 0xf8e6 ], [ 0xad, 0xf8e7 ], [ 0xae, 0xf8e8 ],
                  [ 0xaf, 0xf8e9 ], [ 0xfe, 0xf8ea ], [ 0xff, 0xf8eb ] );
    add_mapping( $_->[0], $_->[1] ) for @fixed;

    my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
    while (my $line = <$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if $line =~ /^\#/ || $line =~ /^$/ || $line =~ /\x1a/;
        die "Unrecognized line $line\n"
            unless $line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex($1), hex($2) );
    }
    close $INPUT;

    # get some extra mappings from cp 949
    my @is_lead;
    foreach my $lb (@lead_bytes) { $is_lead[$lb] = 1; }

    foreach my $uni (0x0000 .. 0xffff)
    {
        next if ($uni >= 0x1100 && $uni <= 0x11ff);  # range not used in 20949
        my $cp = $cp949[$uni];
        next unless defined $cp;
        if ($cp >= 0xff)
        {
            # only add chars for lead bytes that exist in 20949
            next unless $is_lead[$cp >> 8];
            my $trail = $cp & 0xff;
            next if $trail < 0xa1 || $trail > 0xfe;
        }
        add_mapping( $cp, $uni );
    }

    output_codepage_file( 20949 );
}
################################################################
# build the sort keys (collation) table and write it to $filename
#
# Reads the collation element file named by $SORTKEYS, keeps the
# single-character entries of the first 65536 chars, compresses the
# four weights of each entry into one 32-bit value and dumps the
# result as a two-level table called "collation_table".
sub dump_sortkeys($)
{
    my ($filename) = @_;
    my @sortkeys = ();

    my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;         # skip comments
        next if $line =~ /^$/;          # skip empty lines
        next if $line =~ /\x1a/;        # skip ^Z
        next if $line =~ /^\@version/;  # skip @version header
        if ($line =~ /^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # single character entry: [ char, weight 1, weight 2, weight 3, weight 4 ]
            my $uni = hex($1);
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex($3), hex($4), hex($5), hex($6) ];
            next;
        }
        if ($line =~ /^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $line\n";
    }
    close $INPUT;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    @sortkeys = sort { $a->[1] <=> $b->[1] or
                       $a->[2] <=> $b->[2] or
                       $a->[3] <=> $b->[3] or
                       $a->[4] <=> $b->[4] or
                       $a cmp $b; } @sortkeys;

    # $n2/$n3 count the distinct secondary/tertiary weights seen so far
    # under the current primary (resp. primary+secondary) weight
    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    foreach my $entry (@sortkeys)
    {
        next unless defined $entry;
        my @current = @{$entry};

        if ($current[1] != $keys[1])
        {
            # new primary weight: restart both counters
            @keys[1 .. 3] = @current[1 .. 3];
            $n2 = 1;
            $n3 = 1;
        }
        elsif ($current[2] != $keys[2])
        {
            # same primary, new secondary weight
            @keys[2 .. 3] = @current[2 .. 3];
            $n2++;
            $n3 = 1;
            die if ($n2 >= 256);
        }
        elsif ($current[3] != $keys[3])
        {
            # same primary and secondary, new tertiary weight
            $keys[3] = $current[3];
            $n3++;
            die if ($n3 >= 16);
        }

        # non-zero weights are replaced by their rank; weight 4 becomes a flag
        $current[2] = $n2 if $current[2];
        $current[3] = $n3 if $current[3];
        $current[4] = 1 if $current[4];

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    print OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );

    close OUTPUT;
    save_file($filename);
}
################################################################
# format an array of integers as C initializer text
#
# $bit_width - width of the values in bits; sets the number of hex digits
# $default   - value printed for undefined elements
# @array     - the values
#
# Returns the formatted string: values separated by ", ", with a line
# break after every 8th value and no separator after the last one.
# An empty array yields a single default value.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;

    # index of the element printed without a trailing separator
    my $last = @array ? $#array : 0;

    my $text = " ";
    foreach my $idx (0 .. $last - 1)
    {
        my $value = defined $array[$idx] ? $array[$idx] : $default;
        $text .= sprintf($format, $value);
        $text .= ($idx % 8 == 7) ? ",\n " : ", ";
    }
    $text .= sprintf($format, defined $array[$last] ? $array[$last] : $default);
    return $text;
}
################################################################
# write an SBCS mapping table in binary format to OUTPUT
#
# Layout, in order: 16-bit little-endian header words, 12 zero bytes,
# the wide-char table offset followed by the 256 cp2uni entries, the
# optional 256-entry glyph table (preceded by its count, or a single 0
# when there is none), two zero words, and one byte per Unicode char
# for the 65536 uni2cp entries.
sub dump_binary_sbcs_table($)
{
    my ($cp_id) = @_;

    my $has_glyphs = @glyph2uni ? 1 : 0;
    my $wc_offset = 256 + 3 + ($has_glyphs ? 256 : 0);
    my @header = ( 13, $cp_id, 1, $default_char, $default_wchar,
                   $cp2uni[$default_char], $uni2cp[$default_wchar] );

    my $data = pack "S<*", @header;
    $data .= pack "C12", (0) x 12;
    # unmapped codepage chars are written as 0
    $data .= pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];

    if ($has_glyphs)
    {
        $data .= pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
    }
    else
    {
        $data .= pack "S<*", 0;
    }

    $data .= pack "S<*", 0, 0;

    # unmapped Unicode chars are written as the default char
    $data .= pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];

    print OUTPUT $data;
}
################################################################
# write a DBCS mapping table in binary format to OUTPUT
#
# Layout, in order: 16-bit little-endian header words, 12 bytes of
# lead byte ranges (zero padded), the wide-char table offset followed
# by the 256 single-byte cp2uni entries, a zero word, the number of
# lead byte ranges and the 256 per-lead-byte table offsets, one
# 256-entry cp2uni table per lead byte, and finally a word of value 4
# followed by the 65536 uni2cp entries as 16-bit words.
#
# Side effect: the cp2uni entry of every lead byte is set to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own 256-entry table; offsets count in
    # entries, starting at 256
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # pad the lead byte ranges with zeros up to 12 bytes; the list
    # repetition (0) x 12 supplies real zero elements, where the former
    # scalar form 0 x 12 produced the single string "000000000000" and
    # relied on pack filling in the missing arguments
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        # unmapped double-byte chars are written as the default wide char
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    # unmapped Unicode chars are written as the default char
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
################################################################
# get the list of defined lead byte ranges
#
# Returns a flat list of (first, last) pairs, one pair for every run
# of consecutive byte values present in @lead_bytes.
sub get_lb_ranges()
{
    my @is_lead = ();
    $is_lead[$_] = 1 foreach @lead_bytes;

    my @ranges = ();
    my $in_range = 0;
    foreach my $byte (0 .. 255)
    {
        if ($in_range)
        {
            # the current run ends at the first byte that is not a lead byte
            next if defined $is_lead[$byte];
            push @ranges, $byte - 1;
            $in_range = 0;
        }
        elsif ($is_lead[$byte])
        {
            push @ranges, $byte;
            $in_range = 1;
        }
    }
    # close a run that extends to the last byte value
    push @ranges, 0xff if $in_range;
    return @ranges;
}
################################################################
# dump the Indic Syllabic Category table to $filename
#
# Each table entry holds the syllabic category value from
# IndicSyllabicCategory.txt, with the positional (matra) category from
# IndicPositionalCategory.txt OR'ed in after being shifted left by 8 bits.
sub dump_indic($)
{
    my ($filename) = @_;
    my @indic_table;

    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^\s*$/;   # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            # single char
            my ($ch, $type) = (hex($1), $2);
            die "unknown indic $type" unless defined $indic_types{$type};
            $indic_table[$ch] = $indic_types{$type} if $ch < 65536;
            next;
        }
        if ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            # char range; only stored when both ends are below 65536
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown indic $type" unless defined $indic_types{$type};
            if ($first < 65536 and $last < 65536)
            {
                $indic_table[$_] = $indic_types{$type} foreach $first .. $last;
            }
            next;
        }
        die "malformed line $line";
    }
    close $INPUT;

    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^\s*$/;   # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            # single char
            my ($ch, $type) = (hex($1), $2);
            die "unknown matra $type" unless defined $matra_types{$type};
            $indic_table[$ch] |= $matra_types{$type} << 8;
            next;
        }
        if ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            # char range
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown matra $type" unless defined $matra_types{$type};
            $indic_table[$_] |= $matra_types{$type} << 8 foreach $first .. $last;
            next;
        }
        die "malformed line $line";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n",
                 "/* and from $UNIDATA:IndicPositionalCategory.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Line Break Properties table to $filename
#
# Every data line of LineBreak.txt assigns a break class to a single
# char or to a char range.  Class names made of three-character groups
# are tried before two-character ones, in the same order for single
# chars and ranges as listed below.
sub dump_linebreak($)
{
    my ($filename) = @_;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^\s*$/;   # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z

        # a single char is handled as a range of one
        my ($first, $last, $type);
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = ($1, $1, $2);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = ($1, $2, $3);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = ($1, $1, $2);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = ($1, $2, $3);
        }
        else
        {
            die "malformed line $line";
        }

        die "unknown breaktype $type" unless defined $break_types{$type};
        foreach my $ch (hex($first) .. hex($last))
        {
            $break_table[$ch] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n",
                 "/* generated from $UNIDATA:LineBreak.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
# Script name -> script id mapping.
# The id of a script is its position in the list below, so the order
# must never change: new scripts are only ever appended at the end.
my %scripts = do
{
    my @names =
    (
        qw(Unknown Common Inherited),
        qw(Arabic Armenian Avestan Balinese Bamum Batak Bengali Bopomofo),
        qw(Brahmi Braille Buginese Buhid Canadian_Aboriginal Carian Cham),
        qw(Cherokee Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari),
        qw(Egyptian_Hieroglyphs Ethiopic Georgian Glagolitic Gothic Greek),
        qw(Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana),
        qw(Imperial_Aramaic Inscriptional_Pahlavi Inscriptional_Parthian),
        qw(Javanese Kaithi Kannada Katakana Kayah_Li Kharoshthi Khmer),
        qw(Lao Latin Lepcha Limbu Linear_B Lisu Lycian Lydian),
        qw(Malayalam Mandaic Meetei_Mayek Mongolian Myanmar New_Tai_Lue Nko),
        qw(Ogham Ol_Chiki Old_Italic Old_Persian Old_South_Arabian Old_Turkic),
        qw(Oriya Osmanya Phags_Pa Phoenician Rejang Runic),
        qw(Samaritan Saurashtra Shavian Sinhala Sundanese Syloti_Nagri Syriac),
        qw(Tagalog Tagbanwa Tai_Le Tai_Tham Tai_Viet Tamil Telugu),
        qw(Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi),
        # Win8/Win8.1
        qw(Chakma Meroitic_Cursive Meroitic_Hieroglyphs Miao Sharada),
        qw(Sora_Sompeng Takri),
        # Win10
        qw(Bassa_Vah Caucasian_Albanian Duployan Elbasan Grantha Khojki),
        qw(Khudawadi Linear_A Mahajani Manichaean Mende_Kikakui Modi Mro),
        qw(Nabataean Old_North_Arabian Old_Permic Pahawh_Hmong Palmyrene),
        qw(Pau_Cin_Hau Psalter_Pahlavi Siddham Tirhuta Warang_Citi),
        # Win10 RS1
        qw(Adlam Ahom Anatolian_Hieroglyphs Bhaiksuki Hatran Marchen Multani),
        qw(Newa Old_Hungarian Osage SignWriting Tangut),
        # Win10 RS4
        qw(Masaram_Gondi Nushu Soyombo Zanabazar_Square),
        # Win10 1903
        qw(Dogra Gunjala_Gondi Hanifi_Rohingya Makasar Medefaidrin),
        qw(Old_Sogdian Sogdian),
        # Win10 2004
        qw(Elymaic Nyiakeng_Puachue_Hmong Nandinagari Wancho),
        # Win11
        qw(Chorasmian Dives_Akuru Khitan_Small_Script Yezidi),
    );
    map { $names[$_] => $_ } 0 .. $#names;
};
################################################################
# dump the Script IDs: an enum header and a lookup table
#
# $filename - base name of the output; "$filename.h" receives the
#             unicode_script_id enum built from %scripts and
#             "$filename.c" receives the two-level lookup table built
#             from Scripts.txt
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/;      # skip comments
        next if /^\s*$/;    # skip empty lines
        next if /\x1a/;     # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # single char; scripts missing from %scripts are left out
            my ($ch, $type) = (hex($1), $2);
            $scripts_table[$ch] = $scripts{$type} if defined $scripts{$type};
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # char range; scripts missing from %scripts are left out
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            if (defined $scripts{$type})
            {
                foreach my $ch ($first .. $last)
                {
                    $scripts_table[$ch] = $scripts{$type};
                }
            }
        }
        # lines matching neither form are silently ignored
    }
    close $INPUT;

    # the enum header
    my $header = "$filename.h";
    open OUTPUT, ">", "$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT " Script_$script = $scripts{$script},\n";
    }
    print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    # the lookup table; report the .c file (not the header) on failure
    $filename = "$filename.c";
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the BiDi mirroring table to $filename
#
# Each data line of BidiMirroring.txt maps a char (first field) to its
# mirrored char (second field); chars without a mapping get 0.
sub dump_mirroring($)
{
    my ($filename) = @_;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^$/;      # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "malformed line $line"
            unless $line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/;
        $mirror_table[hex($1)] = hex($2);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $UNIDATA:BidiMirroring.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Bidi Brackets table to $filename
#
# Each entry combines the distance from a bracket to its paired
# bracket with the bracket type shifted left by 8 bits.
sub dump_bracket($)
{
    my ($filename) = @_;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^\s*$/;   # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "malformed line $line"
            unless $line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/;

        my ($ch, $paired, $type) = ($1, $2, $3);
        die "unknown bracket $type" unless defined $bracket_types{$type};

        my $distance = hex($paired) - hex($ch);
        die "characters too distant $ch and $paired" if abs($distance) >= 128;

        # NOTE(review): a negative distance is folded with "% 255", so -1
        # is stored as 254, not as the two's complement byte 0xff --
        # confirm this matches what the readers of the table expect
        $bracket_table[hex($ch)] = ($distance % 255) + ($bracket_types{$type} << 8);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n",
                 "/* generated from $UNIDATA:BidiBrackets.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping tables to $filename
#
# Writes the joining type table ("wine_shaping_table"), seeded from
# @initial_joining_table and updated from ArabicShaping.txt, followed
# by the "wine_shaping_forms" array that lists the isolated, final,
# initial and medial form of every char in 0x600..0x6ff.
sub dump_shaping($)
{
    my ($filename) = @_;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^\s*$/;   # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "malformed line $line"
            unless $line =~ /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        # only the joining type is used here, the joining group is ignored
        $joining_table[hex($1)] = $joining_types{$2};
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $UNIDATA:ArabicShaping.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        # a char without a presentation form of a given kind maps to itself
        my @forms = map { $joining_forms{$_}[$ch] || $ch } ("isolated", "final", "initial", "medial");
        printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Arabic shaping table with joining group overrides
#
# Like the plain shaping table, every char gets the value of its
# joining type, except that chars of the "ALAPH" and "DALATH RISH"
# joining groups get the value registered for the group instead.
# The result is written to $filename as "arabic_shaping_table".
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;      # skip comments
        next if /^\s*$/;    # skip empty lines
        next if /\x1a/;     # skip ^Z
        # The joining group may consist of several space-separated words
        # (e.g. "DALATH RISH"), so capture all of them: a plain \w+ stops
        # at the first space and would never yield a two-word group name.
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?: \w+)*)/)
        {
            my $type = $2;
            my $group = $3;

            if ($group eq "ALAPH" || $group eq "DALATH RISH")
            {
                $joining_table[hex $1] = $joining_types{$group};
            }
            else
            {
                $joining_table[hex $1] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the Vertical Orientation table
#
# $filename - output file
# $unix     - when true, a "#pragma makedep unix" marker (inside an
#             "#if 0" block) is written before the include line
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^\s*$/;   # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # single char, only stored when below 65536
            my ($ch, $type) = (hex($1), $2);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            $vertical_table[$ch] = $vertical_types{$type} if $ch < 65536;
            next;
        }
        if ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            # char range
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            $vertical_table[$_] = $vertical_types{$type} foreach $first .. $last;
            next;
        }
        die "malformed line $line";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n",
                 "/* generated from $UNIDATA:VerticalOrientation.txt */\n",
                 "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
        print OUTPUT "#if 0\n",
                     "#pragma makedep unix\n",
                     "#endif\n\n";
    }
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the digit folding table
#
# Writes the contents of @digitmap_table to $filename as the two-level
# table "wine_digitmap".
sub dump_digit_folding($)
{
    my ($filename) = @_;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode digit folding mappings */\n",
                 "/* generated from $UNIDATA:UnicodeData.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# compress a mapping table by sharing identical or overlapping rows
#
# $rows  - number of rows the table is split into
# $def   - value substituted for undefined table elements
# @table - the table itself; it is cut into $rows rows of equal length
#
# Returns one offset per row followed by the packed row data.  Each
# offset is the position of the row inside the packed data plus $rows,
# i.e. an index into the returned list as a whole.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $row_len = @table / $rows;
    my @offsets;
    my $packed = "";

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        # rows are handled as strings so that index() can search for them
        my @cells = @table[($row * $row_len) .. (($row + 1) * $row_len - 1)];
        my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @cells;

        my $pos = index $packed, $rowtxt;
        if ($pos < 0)
        {
            # check if the tail of the data can match the start of the new row
            my $first = substr( $rowtxt, 0, 1 );
            for (my $overlap = length($packed) - 1; $overlap > 0; $overlap--)
            {
                # jump to the next place in the tail where the row could start
                my $skip = index( substr( $packed, -$overlap ), $first );
                last if $skip == -1;
                $overlap -= $skip;
                next unless substr( $packed, -$overlap ) eq substr( $rowtxt, 0, $overlap );
                # drop the shared part, it is appended again with the row
                substr( $packed, -$overlap ) = "";
                last;
            }
            $pos = length $packed;
            $packed .= $rowtxt;
        }
        $offsets[$row] = $rows + $pos;
    }
    return @offsets, unpack "U*", $packed;
}
################################################################
# dump a char -> value mapping table using two-level tables
#
# Arguments: table name, default value for undefined chars, value size
# in bits (16 selects "unsigned short", anything else "unsigned int"),
# followed by the values of the first 65536 chars.
#
# The 65536 values are compressed as 4096 rows, and the 4096 row
# offsets are compressed again as 256 rows.  Everything is written to
# OUTPUT as a single C array.
sub dump_two_level_mapping($$@)
{
    my ($name, $def, $size, @values) = @_;
    my $type = $size == 16 ? "unsigned short" : "unsigned int";

    my @row_array = compress_array( 4096, $def, @values[0..65535] );
    my @array = compress_array( 256, 0, @row_array[0..4095] );

    # the 4096 raw row offsets are not written out, so rebase the level 2
    # offsets on the layout of the final array
    my $rebase = @array - 4096;
    $_ += $rebase foreach @array[256 .. $#array];

    my $total = @array + @row_array - 4096;
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
}
################################################################
# dump a char -> value mapping table using three-level tables
#
# Arguments: table name, default value for undefined chars, value size
# in bits (16 selects "unsigned short", anything else "unsigned int"),
# followed by the values of the chars 0..$MAX_CHAR.
#
# The values are compressed in rows of 16, the resulting row offsets
# are compressed in rows of 16 again, and so are the offsets of those.
# Everything is written to OUTPUT as a single C array.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size, @values) = @_;
    my $type = $size == 16 ? "unsigned short" : "unsigned int";

    # number of rows at each level
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;

    my @array3 = compress_array( $level3, $def, @values[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # the raw row offsets of the lower levels are not written out, so
    # rebase the remaining offsets on the layout of the final array
    my $rebase2 = @array1 + @array2 - $level2 - $level3;
    $_ += $rebase2 foreach @array2[$level2 .. $#array2];
    my $rebase1 = @array1 - $level2;
    $_ += $rebase1 foreach @array1[$level1 .. $#array1];

    my $total = @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, $total;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
################################################################
# dump a binary case mapping table in l_intl.nls format
#
# Takes a char -> mapped char table and returns a string of packed
# little-endian 16-bit words: a word count, the offset tables and the
# rows of (mapped - original) differences modulo 0x10000.
sub dump_binary_case_table(@)
{
    my @table = @_;
    my $max_char = 0x10000;
    my $level1 = $max_char / 16;
    my $level2 = $level1 / 16;

    # store each mapping as a 16-bit difference from the character itself
    my @difftable;
    foreach my $ch (0 .. $#table)
    {
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffff if defined $table[$ch];
    }

    my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
    my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );

    # shift the entries past the first $level2 by the size of the offset tables
    my $offset = @array - $level1;
    foreach my $idx ($level2 .. $#array) { $array[$idx] += $offset; }

    my $count = 1 + $offset + @row_array;
    return pack "S<*", $count, @array, @row_array[$level1..$#row_array];
}
################################################################
# dump case mappings for l_intl.nls
#
# Writes a version word followed by the upper and lower case tables,
# built from the global case tables with the linguistic mappings removed.
sub dump_intl_nls($)
{
    my $filename = shift;

    # work on copies so that the global tables stay untouched
    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack( "S<", 1 ) . $upper . $lower;  # version, then both tables
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the bidi direction table
#
# Generates a C source file containing bidi_direction_table, a two-level
# table of the %bidi_types values for characters 0..65535.
sub dump_bidi_dir_table($)
{
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    printf OUTPUT "/* Unicode BiDi direction table */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"windef.h\"\n\n";

    # characters without a direction stay undefined here
    my @table;
    foreach my $ch (0 .. 65535)
    {
        my $dir = $direction_table[$ch];
        next unless defined $dir;
        $table[$ch] = $bidi_types{$dir};
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
# rotate the low 8 bits of $byte left by $count bits
sub rol($$)
{
    my ($byte, $count) = @_;
    my $high = $byte << $count;
    my $low = $byte >> (8 - $count);
    return ($high | $low) & 0xff;
}
################################################################
# compress the character properties table
#
# Splits the table into $rows rows of equal length and returns an array
# made of one index per row followed by the contents of every distinct
# new row.  Undefined entries are treated as 0x7f.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my @array = (0) x $rows;
    my %sequences;
    my $pos = 0;

    # add some predefined sequences: rows filled with a single value
    # map directly to the indices 0 and 0xfb..0xff
    foreach my $i (0, 0xfb .. 0xff)
    {
        my $key = pack "L*", (rol($i,5)) x $len;
        $sequences{$key} = $i;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "L*", @table_row;
        unless (defined($sequences{$rowtxt}))
        {
            # create a new row
            $sequences{$rowtxt} = ++$pos;
            push @array, @table_row;
        }
        # use the (possibly just created) row
        $array[$row] = $sequences{$rowtxt};
    }
    return @array;
}
################################################################
# dump a normalization table in binary format
#
# The normalization form is taken from the file name (norm<form>.nls).
# The file consists of a header, a table of offsets, the combining class
# values, the compressed character properties, the decomposition hash
# table with its character data and, for composing forms, the
# composition hash table.  The file is also added to the registry data.
sub dump_norm_table($)
{
    my $filename = shift;

    my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
    my %decomp = ( "nfc" => \@decomp_table,
                   "nfd" => \@decomp_table,
                   "nfkc" => \@decomp_compat_table,
                   "nfkd" => \@decomp_compat_table ,
                   "idna" => \@idna_decomp_table );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";

    (my $type = $filename) =~ s!.*/norm(\w+)\.nls!$1!;

    my $form = $forms{$type};
    my $decomp_ref = $decomp{$type};
    my $compose = $form & 1;
    my $compat = !!($form & 4) + ($type eq "idna");

    my @version = split /\./, $UNIVERSION;

    # combining classes

    my @classes;       # combining class -> index into @class_values
    my @class_values;  # distinct combining classes below 0x100, in ascending order

    foreach my $c (grep defined, @combining_class_table)
    {
        $classes[$c] = 1 if $c < 0x100;
    }
    foreach my $class (0 .. $#classes)
    {
        next unless defined $classes[$class];
        $classes[$class] = @class_values;
        push @class_values, $class;
    }
    push @class_values, 0 if (@class_values % 2);  # keep an even number of bytes
    die "too many classes" if @class_values >= 0x40;

    # character properties

    my @char_props;
    my @decomposed;
    my @comp_hash_table;
    my $comp_hash_size = $compose ? 254 : 0;

    foreach my $ch (0 .. $MAX_CHAR)
    {
        my $class = $combining_class_table[$ch];
        next unless defined $class;
        if (defined $decomp_ref->[$ch])
        {
            my @dec = get_decomposition( $ch, $decomp_ref );
            if ($compose && (my @comp = get_composition( $ch, $compat )))
            {
                my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
                push @{$comp_hash_table[$hash]}, to_utf16( @comp, $ch );

                # use the first non-zero combining class of the decomposition
                my $val = 0;
                foreach my $d (@dec)
                {
                    $val = $combining_class_table[$d];
                    last if $val;
                }
                $char_props[$ch] = $classes[$val];
            }
            else
            {
                $char_props[$ch] = 0xbf;
            }
            @dec = compose_hangul( @dec ) if $compose;
            @dec = to_utf16( @dec );
            push @dec, 0 if @dec >= 7;
            $decomposed[$ch] = \@dec;
        }
        elsif ($class == 0x100)
        {
            $char_props[$ch] = 0x7f;
        }
        elsif ($class)
        {
            $char_props[$ch] = $classes[$class] | 0x80;
        }
        elsif ($type eq "idna" && defined $idna_disallowed[$ch])
        {
            $char_props[$ch] = 0xff;
        }
        else
        {
            $char_props[$ch] = 0;
        }
    }

    if ($compose)
    {
        # flag the characters that take part in a composition
        foreach my $ch (0 .. $MAX_CHAR)
        {
            my @comp = get_composition( $ch, $compat );
            next unless @comp;
            my ($first, $second) = @comp[0, 1];
            if ($combining_class_table[$second])
            {
                $char_props[$first] |= 0x40 unless $char_props[$first] & 0x80;
                $char_props[$second] |= 0x40;
            }
            else
            {
                $char_props[$first] = ($char_props[$first] & ~0x40) | 0x80;
                $char_props[$second] |= 0xc0;
            }
        }
    }

    # surrogates
    $char_props[$_] = 0xdf foreach (0xd800..0xdbff);
    $char_props[$_] = 0x9f foreach (0xdc00..0xdfff);

    # Hangul
    if ($type eq "nfc")
    {
        $char_props[$_] = 0xff foreach (0x1100..0x117f);
    }
    elsif ($compose)
    {
        $char_props[$_] = 0xff foreach (0x1100..0x11ff);
    }
    $char_props[$_] = 0xff foreach (0xac00..0xd7ff);

    # invalid chars
    if ($type eq "idna")
    {
        $char_props[$_] = 0xff foreach (0x00..0x1f, 0x7f);
    }
    $char_props[$_] = 0xff foreach (0xfdd0..0xfdef);
    foreach my $plane (0x00..0x10)
    {
        $char_props[($plane << 16) | 0xfffe] = 0xff;
        $char_props[($plane << 16) | 0xffff] = 0xff;
    }

    # decomposition hash table

    my @decomp_hash_table;
    my @decomp_hash_index;
    my @decomp_hash_data;
    my $decomp_hash_size = 944;

    # build string of character data, reusing substrings when possible
    # (longest sequences first so that shorter ones can be found inside them)
    my $decomp_char_data = "";
    foreach my $seq (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
    {
        my $str = pack "U*", @{$seq};
        $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
    }

    # distribute the decomposed characters into hash buckets; each entry
    # holds the character and its sequence length and position
    foreach my $ch (0 .. $#decomposed)
    {
        my $seq = $decomposed[$ch];
        next unless defined $seq;
        my $pos = index( $decomp_char_data, pack( "U*", @{$seq} ));
        die "sequence not found" if $pos == -1;
        my $len = @{$seq};
        $len = 7 if $len > 7;
        my $hash = $ch % $decomp_hash_size;
        push @{$decomp_hash_table[$hash]}, [ $ch, ($len << 13) | $pos ];
    }

    foreach my $hash (0 .. $decomp_hash_size - 1)
    {
        $decomp_hash_index[$hash] = @decomp_hash_data / 2;
        my $bucket = $decomp_hash_table[$hash];
        next unless defined $bucket;
        if (@{$bucket} == 1)
        {
            # a single entry with property 0xbf is stored directly in the index
            my $entry = $bucket->[0];
            if ($char_props[$entry->[0]] == 0xbf)
            {
                $decomp_hash_index[$hash] = $entry->[1];
                next;
            }
        }
        foreach my $entry (@{$bucket})
        {
            push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
        }
    }
    push @decomp_hash_data, 0, 0;

    # composition hash table

    my @comp_hash_index;
    my @comp_hash_data;
    if (@comp_hash_table)
    {
        foreach my $hash (0 .. $comp_hash_size - 1)
        {
            $comp_hash_index[$hash] = @comp_hash_data;
            push @comp_hash_data, @{$comp_hash_table[$hash]} if defined $comp_hash_table[$hash];
        }
        $comp_hash_index[$comp_hash_size] = @comp_hash_data;
        push @comp_hash_data, 0, 0, 0;
    }

    my $level1 = ($MAX_CHAR + 1) / 128;
    my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );

    my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
                   0, $decomp_hash_size, $comp_hash_size, 0 );
    my @tables = (0) x 8;

    # each table starts where the previous one ends, counted in 16-bit words
    my @sizes = ( scalar(@class_values) / 2,
                  $level1 / 2,
                  (@rows - $level1) / 2,
                  scalar(@decomp_hash_index),
                  scalar(@decomp_hash_data),
                  length($decomp_char_data),
                  scalar(@comp_hash_index) );
    $tables[0] = 16 + @header + @tables;
    foreach my $n (1 .. 7)
    {
        $tables[$n] = $tables[$n - 1] + $sizes[$n - 1];
    }

    print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "S<*", @tables;
    print OUTPUT pack "C*", @class_values;

    print OUTPUT pack "C*", @rows[0..$level1-1];
    print OUTPUT pack "C*", @rows[$level1..$#rows];
    print OUTPUT pack "S<*", @decomp_hash_index;
    print OUTPUT pack "S<*", @decomp_hash_data;
    print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
    print OUTPUT pack "S<*", @comp_hash_index;
    print OUTPUT pack "S<*", @comp_hash_data;

    close OUTPUT;
    save_file($filename);

    add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
}
################################################################
# output a codepage definition file from the global tables
#
# Writes nls/c_<codepage>.nls, as a DBCS table when lead bytes are
# defined and as an SBCS table otherwise, and registers the file.
sub output_codepage_file($)
{
    my $codepage = shift;

    my $basename = sprintf "c_%03d.nls", $codepage;
    my $output = "nls/$basename";
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s\n", $output;
    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), $basename );
}
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Reads $filename from the $MSCODEPAGES archive, refills the global
# @cp2uni, @glyph2uni and @uni2cp tables, the lead bytes (through
# add_lead_byte) and the default characters, then writes the codepage
# file with output_codepage_file().  Dies on an unrecognized line or
# when the file does not declare its codepage.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";            # name of the table currently being read
    my ($codepage, $count);
    my ($lb_cur, $lb_end);     # current and last lead byte of the DBCS range

    # start from empty tables and the default replacement characters
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # the first CPINFO field is not needed here
        if (/^CPINFO\s+\d+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $default_char = hex $1;
            $default_wchar = hex $2;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # in every table the first definition of an entry wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                if (!--$count)
                {
                    # table complete: move on to the next lead byte, and
                    # expect a new range once the current one is exhausted
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    die "$filename: no CODEPAGE line found\n" unless defined $codepage;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
################################################################
# align a string length
#
# Returns $str padded with zero bytes up to the next multiple of $align.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $extra = length($str) % $align;
    return $str unless $extra;
    return $str . pack( "C*", (0) x ($align - $extra) );
}
################################################################
# pad a string with zeros
#
# Returns $str extended with zero bytes to a length of $pad; strings
# that are already at least that long are returned unchanged.
sub pad_string($$)
{
    my ($pad, $str) = @_;
    my $missing = $pad - length($str);
    return $str if $missing <= 0;
    return $str . pack( "C*", (0) x $missing );
}
################################################################
# pack a GUID string
#
# Converts a GUID in xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx form into its
# 16-byte binary layout: a little-endian 32-bit value, two little-endian
# 16-bit values and eight single bytes.  Dies if the string does not
# contain a GUID, instead of packing the captures of an earlier match.
sub pack_guid($)
{
    my $guid = shift;  # lexical copy: the caller's $_ is left alone
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/
        or die "invalid GUID '$guid'\n";
    return pack "L<S<2C8", map { hex $_ } @fields;
}
################################################################
# comparison function for compression sort
#
# Sort callback working on the array references in $a and $b: shorter
# rows come first, rows of equal length are ordered by their elements
# at indices 4 to 12, compared numerically in turn.
sub cmp_compression
{
    my $order = scalar(@{$a}) <=> scalar(@{$b});
    return $order if $order;
    foreach my $idx (4 .. 12)
    {
        $order = $a->[$idx] <=> $b->[$idx];
        return $order if $order;
    }
    return $order;
}
################################################################
# build a binary sort keys table
#
# Parses the sorting data file $download from the $MSDATA location and
# writes $filename, made of a four-offset header, the sort key table
# (including per-GUID exception tables), the case mapping tables, the
# character type table and the sort tables (GUIDs, expansions,
# compressions, multiple weights, jamo sort).  The locale -> sort GUID
# associations are added to the registry data.  Returns the packed
# character type table.
sub dump_sortkey_table($$)
{
    my ($filename, $download) = @_;

    my @keys;
    my ($part, $section, $subsection, $guid, $version, $ling_flag);
    my @multiple_weights;
    my @expansions;
    my @compressions;
    my %exceptions;
    my %guids;
    my %compr_flags;
    my %locales;
    my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
    my $jamostr = "";

    my $re_hex = '0x[0-9A-Fa-f]+';
    my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
    $guids{$default_guid} = { };

    my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );

    my $KEYS = open_data_file( $MSDATA, $download ) or die "Cannot open $download";

    print "Building $filename\n";

    while (<$KEYS>)
    {
        s/\s*;.*$//;  # strip comments
        next if /^\s*$/;  # skip empty lines
        if (/^\s*(SORTKEY|SORTTABLES)/)
        {
            $part = $1;
            next;
        }
        if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
        {
            $part = $section = "";
            next;
        }
        if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
        {
            $section = $1;
            $guid = undef;
            next;
        }
        next unless $part;
        if ("$part.$section" eq "SORTKEY.DEFAULT")
        {
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $keys[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.RELEASE")
        {
            if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                $version = hex $1;
                next;
            }
            if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                # ignore for now
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
               "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
               "$part.$section" eq "SORTTABLES.INVERSECASING")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $guids{$guid}->{flags} |= $flags{$section};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
        {
            if (/^\s*(\d+)\s+(\d+)/)
            {
                push @multiple_weights, $1, $2;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                # the key of an expanded character holds the index of its expansion
                my $pos = scalar @expansions / 2;
                $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
                push @expansions, hex $2, hex $3;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                $keys[hex $1] = $keys[hex $2];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
            {
                if ($subsection || !$guid)  # start a new one
                {
                    $guid = lc $1;
                    $subsection = "";
                    $guids{$guid} = { } unless defined $guids{$guid};
                    $guids{$guid}->{flags} |= $flags{$2} if $2;
                    $guids{$guid}->{compr} = @compressions;
                    $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
                    $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
                    push @compressions, [ ];
                }
                else  # merge with current one
                {
                    $guids{lc $1} = { } unless defined $guids{lc $1};
                    $guids{lc $1}->{flags} |= $flags{$2} if $2;
                    $guids{lc $1}->{compr} = $guids{$guid}->{compr};
                    $compr_flags{lc $1} = $compr_flags{$guid};
                }
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
            {
                $subsection = $1;
                next;
            }
            if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
            {
                # rows are stored as the four key values followed by the characters
                my @comp = map { hex $_; } split(/\s+/,$1);
                push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
                # add compression flags
                $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{lc $1};
                $ling_flag = ($2 ? "+" : "-");
                $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
        {
            if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
            {
                $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
                next;
            }
        }
        die "$download: $part.$section: unrecognized line $_\n";
    }
    close $KEYS;

    # Sortkey table

    my $table;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
        $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
    }

    # Each exception table is made of 0x100 offsets, one per high byte,
    # followed by a 0x100-entry block for every high byte that has
    # exceptions or compression flags; the other offsets point into the
    # default table.  Positions are counted in 4-byte entries.
    foreach my $id (sort keys %exceptions)
    {
        my $pos = length($table) / 4;
        my @exc = @{$exceptions{$id}};
        my @filled;
        my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
        my $guid = substr( $id, 0, -1 );
        $guids{$guid}->{$key} = $pos;
        $pos += 0x100;
        # always initialize: "my ... if ..." has undefined behaviour
        my @flags = defined $compr_flags{$guid} ? @{$compr_flags{$guid}} : ();
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless defined $exc[$j] || defined $flags[$j];
            $filled[$j >> 8] = 1;
            $j |= 0xff;  # skip to the end of this block
        }
        for (my $j = 0; $j < 0x100; $j++)
        {
            $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
            $pos += 0x100 if $filled[$j];
        }
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless $filled[$j >> 8];
            my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
            $k[3] |= $flags[$j] || 0;
            $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
        }
    }

    # Case mapping tables

    # standard table
    my @casemaps;
    my @upper = @toupper_table;
    my @lower = @tolower_table;
    remove_linguistic_mappings( \@upper, \@lower );
    $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );

    # linguistic table
    $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );

    # Turkish table
    @upper = @toupper_table;
    @lower = @tolower_table;
    $upper[ord 'i'] = 0x130;  # LATIN CAPITAL LETTER I WITH DOT ABOVE
    $lower[ord 'I'] = 0x131;  # LATIN SMALL LETTER DOTLESS I
    $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
    my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );

    # Char type table

    my @table;
    my $types = "";
    my %typestr;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my $str = pack "S<3",
                       ($category_table[$i] || 0) & 0xffff,
                       defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
                       ($category_table[$i] || 0) >> 16;

        # store each distinct 6-byte type entry once and refer to it by index
        if (!defined($typestr{$str}))
        {
            $typestr{$str} = length($types) / 6;
            $types .= $str;
        }
        $table[$i] = $typestr{$str};
    }

    my @rows = compress_array( 4096, 0, @table[0..65535] );
    my @array = compress_array( 256, 0, @rows[0..4095] );
    for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
    for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }

    my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
    my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
    $chartypes = align_string( 8, $chartypes . $types . $arraystr );

    # Sort tables

    # guids
    # "scalar keys" gives the number of entries on every perl version,
    # unlike a hash in scalar context before perl 5.26
    my $sorttables = pack "L<2", $version, scalar keys %guids;
    foreach my $id (sort keys %guids)
    {
        my %guid = %{$guids{$id}};
        my $flags = $guid{flags} || 0;
        my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
        $sorttables .= pack_guid($id) . pack "L<5",
                                             $flags,
                                             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
                                             $guid{except} || 0,
                                             $guid{ling_except} || 0,
                                             $map / 2;
    }

    # expansions
    $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;

    # compressions
    $sorttables .= pack "L<", scalar @compressions;
    my $rowstr = "";
    foreach my $c (@compressions)
    {
        my $pos = length($rowstr) / 2;
        my $min = 0xffff;
        my $max = 0;
        my @lengths = (0) x 8;
        foreach my $r (sort cmp_compression @{$c})
        {
            my @row = @{$r};
            $lengths[scalar @row - 6]++;
            foreach my $val (@row[4..$#row])
            {
                $min = $val if $min > $val;
                $max = $val if $max < $val;
            }
            $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
            $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
        }
        $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
    }
    $sorttables .= $rowstr;

    # multiple weights
    $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );

    # jamo sort
    $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;

    # Locales

    add_registry_key( "Sorting\\Ids", "{$default_guid}" );
    foreach my $loc (sort keys %locales)
    {
        # skip specific locales that match more general ones
        my @parts = split /[-_]/, $loc;
        next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
        next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
        add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
    }

    # File header

    my @header;
    $header[0] = 16;
    $header[1] = $header[0] + length $table;
    $header[2] = $header[1] + length $casemaps;
    $header[3] = $header[2] + length $chartypes;

    open OUTPUT, ">$filename.new" or die "Cannot create $filename";
    print OUTPUT pack "L<*", @header;
    print OUTPUT $table, $casemaps, $chartypes, $sorttables;
    close OUTPUT;
    save_file($filename);
    return $chartypes;
}
# locale definitions indexed by locale name
my %lcnames;

# Return the name of the parent of a locale: its "sparent" or "parent"
# field when one is defined, otherwise the name with its last
# "-subtag" removed, or "" when there is nothing left to remove.
# Returns undef for an empty or undefined name.
sub locale_parent($)
{
    my $loc = shift;

    return undef unless $loc;

    my $info = $lcnames{$loc};
    if (defined $info)
    {
        foreach my $field ("sparent", "parent")
        {
            return $info->{$field} if defined $info->{$field};
        }
    }
    return $1 if $loc =~ /(.*)-[0-9A-Za-z]+/;
    return "";
}
# Sort callback comparing the locale names in $a and $b as strings,
# ignoring ASCII case and treating "_" like "-".
sub compare_locales
{
    my ($n1, $n2) = ($a, $b);
    tr/A-Z_/a-z-/ foreach ($n1, $n2);
    return $n1 cmp $n2;
}
# query an xml key
#
# Returns the text content of the first node found by $query in $xml,
# or undef when the query yields nothing.  A warning is printed when
# there is more than one result.
sub xml_query($$)
{
    my ($xml, $query) = @_;
    my $ret = $xml->find( $query );
    if ($ret)
    {
        printf STDERR "multiple entries for %s\n", $query if (@{$ret} > 1);
        return $ret->[0]->textContent;
    }
    return undef;
}
# query an xml key for a locale, with fallback to the parents
#
# Walks from the locale up through locale_parent() and returns the text
# of the first result that is not the "↑↑↑" marker, or undef when no
# locale in the chain provides one.
sub loc_query($$)
{
    my ($loc, $query) = @_;

    $loc = $lcnames{"en-US"} unless $loc->{name};  # fallback to "en-US" for root locale

    my $cur = $loc->{name};
    while (defined $cur)
    {
        if (defined $lcnames{$cur})
        {
            my $ret = $lcnames{$cur}->{xml}->find( $query );
            if ($ret)
            {
                printf STDERR "%s: multiple entries for %s\n", $cur, $query if (@{$ret} > 1);
                my $text = $ret->[0]->textContent;
                return $text unless $text eq "\x{2191}\x{2191}\x{2191}";  # "↑↑↑"
            }
        }
        $cur = locale_parent( $cur );
    }
    return undef;
}
# retrieve a locale field entry by going up the parents tree
#
# Lookup order: the locale itself, "en-US" when the locale has no name,
# the chain of aliases, then the parents (the "sparent" field if set,
# otherwise the name with its last "-subtag" removed).  Returns $def
# when the field is not found anywhere.
sub locale_entry($$$)
{
    my ($loc, $field, $def) = @_;

    return $loc->{$field} if defined $loc->{$field};

    unless ($loc->{name})  # fallback to "en-US" for root locale
    {
        $loc = $lcnames{"en-US"};
        return $loc->{$field} if defined $loc->{$field};
    }
    while (defined $loc->{alias})  # resolve aliases
    {
        $loc = $lcnames{$loc->{alias}};
        return $loc->{$field} if defined $loc->{$field};
    }

    my $cur = $loc->{name};
    while ($cur)
    {
        my $info = $lcnames{$cur};
        if (defined $info && defined $info->{sparent}) { $cur = $info->{sparent}; }
        elsif ($cur =~ /(.*)-[0-9A-Za-z]+/) { $cur = $1; }
        else { last; }

        my $parent = $lcnames{$cur};
        return $parent->{$field} if defined $parent && defined $parent->{$field};
    }
    return $def;
}
# packed string data shared by the add_* helpers below
my $string_data;

# Add a packed byte string to the shared string data, reusing an
# identical sequence that is already present, and return its position
# counted in 16-bit units.
sub add_str_data($)
{
    my $txt = shift;
    my $ret = index( $string_data, $txt );
    # the position is returned in 16-bit units, so a match that starts
    # on an odd byte cannot be used: keep looking for an aligned one
    while ($ret != -1 && $ret % 2)
    {
        $ret = index( $string_data, $txt, $ret + 1 );
    }
    if ($ret == -1)
    {
        $ret = length($string_data);
        $string_data .= $txt;
    }
    return $ret / 2;
}
# Add a string to the string data as a 16-bit length, the UTF-16LE text
# and a 16-bit zero; returns the position from add_str_data(), or 0 for
# an undefined or empty string.
sub add_string($)
{
    my $str = shift;
    return 0 if !defined($str) || $str eq "";

    my $utf = encode( "UTF16LE", $str );
    my $len = pack "S<", length($utf) / 2;
    my $terminator = pack "S", 0;
    return add_str_data( $len . $utf . $terminator );
}
# Add a list of 32-bit values to the string data, preceded by a 16-bit
# count of twice the number of values; returns the position from
# add_str_data().
sub add_fontsig(@)
{
    my @values = @_;
    my $count = scalar(@values) * 2;
    return add_str_data( pack "S<L<*", $count, @values );
}
# Add an array of strings to the string data: a 16-bit count followed
# by the 32-bit position of each string as returned by add_string().
# Returns the position from add_str_data(), or 0 for an empty list.
sub add_strarray(@)
{
    my @strings = @_;
    return 0 unless @strings;

    my @offsets = map { add_string($_) } @strings;
    return add_str_data( pack "S<L<*", scalar @strings, @offsets );
}
# Convert a number format pattern to a grouping string holding one
# character per group, whose code is the group size: the last group of
# the pattern comes first.  Patterns without a recognized grouping
# yield a single group of 3.
sub format_to_grouping($)
{
    my $format = shift;
    my @groups;

    if ($format =~ /#,(#+),(#+0)/) { @groups = ($2, $1); }
    elsif ($format =~ /#,(#+0)/) { @groups = ($1); }
    else
    {
        # printf STDERR "unknown format %s\n", $format;
        return chr(3);
    }
    return join "", map { chr(length($_)) } @groups;
}
# Parse a currency format made of a positive pattern optionally followed
# by ";" and a negative pattern.  Returns the indices of the first
# matching entries of the positive and negative pattern lists below;
# unrecognized patterns map to 0, and a missing negative pattern gets
# the index associated with the positive one.
sub parse_currency_format($$)
{
    my ($name, $format) = @_;
    my ($posfmt, $negfmt) = split /;/, $format;
    my @pospatterns = ( "\xa4[^\xa0]*#",       # $1.1
                        "00[^\xa0]*\xa4",      # 1.1$
                        "\xa4.*\xa0.*#",       # $ 1.1
                        "00.*\xa0.*\xa4" );    # 1.1 $
    my @negpatterns = ( "\\(\xa4[^\xa0]*#",    # ($1.1)
                        "-\xa4[^\xa0]*#",      # -$1.1
                        "\xa4[^\xa0]*-#",      # $-1.1
                        "\xa4[^\xa0]*#.*00-",  # $1.1-
                        "00[^\xa0]*\xa4\\)",   # (1.1$)
                        "-#.*00[^\xa0]*\xa4",  # -1.1$
                        "00-[^\xa0]*\xa4",     # 1.1-$
                        "00[^\xa0]*\xa4-",     # 1.1$-
                        "-#.*00.*\xa0.*\xa4",  # -1.1 $
                        "-\xa4.*\xa0.*#",      # -$ 1.1
                        "00.*\xa0.*\xa4-",     # 1.1 $-
                        "\xa4.*\xa0.*#.*00-",  # $ 1.1-
                        "\xa4.*\xa0.*-#",      # $ -1.1
                        "00-.*\xa0.*\xa4",     # 1.1- $
                        "\\(\xa4.*\xa0.*#",    # ($ 1.1)
                        "00.*\xa0.*\xa4\\)");  # (1.1 $)
    # negative index used for each positive index when there is no negative pattern
    my @default_neg = ( 1, 5, 9, 8 );

    my $pos = 0;
    foreach my $idx (0 .. $#pospatterns)
    {
        next unless $posfmt =~ /$pospatterns[$idx]/;
        $pos = $idx;
        last;
    }
    #printf STDERR "$name: unknown format '%s'\n", $posfmt if no pattern matched

    my $neg;
    if (defined $negfmt)
    {
        $neg = 0;
        foreach my $idx (0 .. $#negpatterns)
        {
            next unless $negfmt =~ /$negpatterns[$idx]/;
            $neg = $idx;
            last;
        }
        #printf STDERR "$name: unknown format '%s'\n", $negfmt if no pattern matched
    }
    else
    {
        $neg = $default_neg[$pos];
    }

    return ($pos, $neg);
}
# Parse a percent format.  Returns a pair made of the index of the
# first matching pattern below and the same index with 3 replaced by 7.
# When nothing matches a warning is printed and the index is the number
# of patterns.
sub parse_percent_format($)
{
    my $fmt = shift;
    my @patterns = ( "0.+%",    # 1 %
                     "0%",      # 1%
                     "%#",      # %1
                     "%.+#" );  # % 1

    my $pos = 0;
    $pos++ while ($pos < @patterns && $fmt !~ /$patterns[$pos]/);

    printf STDERR "unknown format '%s'\n", $fmt if ($pos == @patterns);
    my $second = ($pos == 3) ? 7 : $pos;
    return ($pos, $second);
}
# Convert a date pattern by rewriting its fields: the first run of "G"
# becomes "gg", "LLLL"/"LLL" become "MMMM"/"MMM", the first run of "E"
# or of three or more "c" becomes "dddd", and a single "y" becomes
# "yyyy".  Each substitution is applied at most once.
sub convert_date_format($)
{
    my $fmt = shift;
    for ($fmt)
    {
        s/G+/gg/;
        s/LLLL/MMMM/;
        s/LLL/MMM/;
        s/E+/dddd/;
        s/ccc+/dddd/;
        # a lone "y" in the middle, at the start or at the end of the pattern
        s/([^gy])y([^y])/$1yyyy$2/;
        s/^y([^y])/yyyy$1/;
        s/([^gy])y$/$1yyyy/;
    }
    return $fmt;
}
# Convert a time pattern by replacing the first run of "a" and the
# first run of "B" with "tt".
sub convert_time_format($)
{
    my $fmt = shift;
    for ($fmt)
    {
        s/a+/tt/;
        s/B+/tt/;
    }
    return $fmt;
}
# Load the ISO 639-3 code table and return a hash that maps the
# two-letter code found in the fourth column to the three-letter code
# found in the third column, for the lines where all four leading
# columns are present.
sub load_iso639()
{
    my %iso639;
    my $DATA = open_data_file( $ISO639, "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3.tab" );
    while (<$DATA>)
    {
        next unless /^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/;
        $iso639{$2} = $1;
    }
    close $DATA;
    return %iso639;
}
4407 ################################################################
4408 # build the locale table for locale.nls
4409 sub build_locale_data()
4411 my $base = "cldr-release-$CLDRVERSION";
4412 my $suppl = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/supplementalData.xml" );
4413 my $subtags = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/likelySubtags.xml" );
4414 my $numbers = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/numberingSystems.xml" );
4415 # obsolete phone data from CLDR version 33
4416 my $phone = load_xml_data_file( $CLDR33DATA, "common/supplemental/telephoneCodeData.xml" );
4417 my %iso639 = load_iso639();
4418 $string_data = pack "S2", 0, 0; # offset 0 == empty string
4420 %lcnames = map { $_->{name} => $_ } @locales;
4422 my %lcids;
4423 foreach my $loc (@locales) { $lcids{$loc->{lcid}} = $loc if defined $loc->{lcid}; }
4425 my %days = ( "sun" => 0, "mon" => 1, "tue" => 2, "wed" => 3, "thu" => 4, "fri" => 5, "sat" => 6 );
4427 # assign locale parents
4429 foreach my $loc (@locales)
4431 next if $loc->{name} eq "";
4432 next if defined $loc->{parent};
4433 (my $unix_name = $loc->{name}) =~ s/-/_/g;
4434 my $parent = xml_query( $suppl, "/supplementalData/parentLocales/parentLocale[contains(concat(' ',\@locales,' '),' $unix_name ')]/\@parent" );
4435 if ($parent)
4437 $parent =~ s/_/-/g;
4438 $parent = "" if $parent eq "root";
4440 elsif ($loc->{name} =~ /(.*)-[0-9A-Za-z]+/) { $parent = $1; }
4441 $loc->{parent} = $parent || "";
4444 # load per-locale XML files
4446 foreach my $loc (@locales)
4448 next if defined $loc->{alias};
4449 (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
4450 $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
4451 my $xml = load_xml_data_file( $CLDRDATA, $file );
4452 $loc->{xml} = $xml;
4453 $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
4454 $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
4455 $loc->{script} = xml_query( $xml, "/ldml/identity/script/\@type" );
4456 if (!defined($loc->{territory}) && $loc->{name} =~ /-([A-Z]{2}|[0-9]{3})$/) { $loc->{territory} = $1; }
4457 if (!defined($loc->{script}) && $loc->{name} =~ /-([A-Z][a-z]{3})(-[A-Z]{2})?$/) { $loc->{script} = $1; }
# assign a default territory and sort locale
#
# Non-alias locales that still have no territory get one from, in order:
#   1. the trailing subtag of an explicitly configured sortlocale,
#   2. their only non-alias "<name>-<SUBTAG>" child locale,
#   3. the likelySubtags supplemental data.
# In cases 2 and 3 the locale the territory was taken from also becomes the
# sort locale (after following aliases), provided that none was configured,
# that its name starts with this locale's name and that it is a known locale.
# A message is printed to STDERR when no territory can be determined.

foreach my $loc (@locales)
{
    next if defined $loc->{alias};
    next if defined $loc->{territory};
    my $id = $loc->{sortlocale};
    if (defined $id && ($id =~ /[-_]([A-Z0-9]+)$/))
    {
        $loc->{territory} = $1;
        next;
    }
    my @children = grep /^$loc->{name}-[A-Z0-9]+$/ && !defined $lcnames{$_}->{alias}, keys %lcnames;
    if (@children == 1)
    {
        $id = $children[0];
    }
    else
    {
        my $name = $loc->{file} || $loc->{name};
        # drop a trailing script subtag and switch to "_" separators for the lookup
        $name =~ s/-(Arab|Cyrl|Deva|Guru|Hans|Hant|Latn|Tfng|Vaii)$//;
        $name =~ s/-/_/g;
        $id = xml_query( $subtags, "/supplementalData/likelySubtags/likelySubtag[\@from='$name']/\@to" );
        $id =~ s/_/-/g if $id;
    }
    # the likelySubtags lookup may have found nothing: never match against undef
    if (defined $id && $id =~ /[-_]([A-Z0-9]+)$/)
    {
        $loc->{territory} = $1;
        next if defined $loc->{sortlocale};
        next unless $id =~ /^$loc->{name}/;
        # resolve alias chains so that the sort locale is a real locale
        while (defined $lcnames{$id} && defined $lcnames{$id}->{alias}) { $id = $lcnames{$id}->{alias}; }
        $loc->{sortlocale} = $id if defined $lcnames{$id};
        next;
    }
    print STDERR "no territory found for $loc->{name}\n";
}
# Build %geotable, which maps a territory name to its geoid entry.

my %geotable;

# predefined geoids: the first entry with a given name owns the slot, any
# later entry with the same name is marked as an alias of the owner
for my $entry (@geoids)
{
    my $key = $entry->{name};
    next unless defined $key;
    if (defined $geotable{$key}) { $entry->{alias} = $geotable{$key}; }
    $geotable{$key} ||= $entry;
}

# territories of non-alias locales get a bare entry if they have none yet
for my $locale (@locales)
{
    next if defined $locale->{alias};
    my $region = $locale->{territory};
    $geotable{$region} ||= { name => $region };
}

# fill in the dial code, the ISO/UN codes and the currency of every entry
for my $key (keys %geotable)
{
    my $entry = $geotable{$key};
    $entry->{dialcode} = xml_query( $phone, "(/supplementalData/telephoneCodeData/codesByTerritory[\@territory='$key']/telephoneCountryCode)[1]/\@code" );
    if ($key =~ /\d+/)
    {
        # a name containing digits is stored as the UN code, nothing else is set
        $entry->{uncode} = $key;
    }
    else
    {
        $entry->{iso2} = $key;
        $entry->{iso3} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$key']/\@alpha3");
        $entry->{uncode} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$key']/\@numeric");
        # a preset symbol wins; "XXX" (also when nothing was found) becomes "XDR"
        $entry->{sintlsymbol} ||= xml_query( $suppl, "(/supplementalData/currencyData/region[\@iso3166='$key']/currency[not(\@to)])[1]/\@iso4217") || "XXX";
        $entry->{sintlsymbol} =~ s/XXX/XDR/;
    }
}
# Link geoids to their parents.  An explicitly named parent is resolved
# through %geotable.  Then every entry that has a UN code but neither an ISO
# code nor an alias is treated as a group and becomes the parent of the
# territories it contains, unless those already have a parent id.
for my $region (@geoids)
{
    if (defined $region->{parent})
    {
        $region->{parentid} = $geotable{$region->{parent}}->{id};
    }
    next if defined $region->{iso2} || defined $region->{alias};
    next unless defined $region->{uncode};

    # work list of contained territories: current members first, then deprecated ones
    my @pending;
    my $members = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$region->{uncode}' and not(\@status)]/\@contains");
    push @pending, split( /\s+/, $members ) if defined $members;
    $members = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$region->{uncode}' and \@status='deprecated']/\@contains");
    push @pending, split( /\s+/, $members ) if defined $members;

    while (@pending)
    {
        my $member = pop @pending;
        if (defined $geotable{$member})
        {
            $geotable{$member}->{parentid} ||= $region->{id};
            next;
        }
        next unless $member =~ /\d+/;

        # unknown numeric sub-region: queue its own members instead
        $members = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$member' and not(\@status)]/\@contains" );
        push @pending, split( /\s+/, $members ) if defined $members;
    }
}
# Attach each calendar that names a locale to that locale's calendar list.

for my $calendar (@calendars)
{
    my $owner_name = $calendar->{locale};
    next unless defined $owner_name;
    my $owner = $lcnames{$owner_name};
    # push creates the list when this is the locale's first calendar
    push @{$owner->{calendar}}, $calendar;
}
# Aliases without an lcid of their own get the lcid of the locale they
# point to (0x1000 if that one has none), with the top bit set.

for my $locale (@locales)
{
    next unless defined $locale->{alias} && !defined $locale->{lcid};
    my $target_lcid = $lcnames{$locale->{alias}}->{lcid} || 0x1000;
    $locale->{lcid} = $target_lcid | 0x80000000;
}
# Register alias locales whose name contains "_" in the sortnames list of
# the locale they point to and of its ancestors.  The slot used is
# (lcid >> 16) - 1.  The walk up the parent chain stops when there is no
# further known parent, or when an ancestor already has a different sortbase.

for my $locale (@locales)
{
    next unless $locale->{name} =~ /_/;
    next unless defined $locale->{alias};

    my $target = $locale->{alias};
    my $holder = $lcnames{$target};
    my $root_name = $holder->{name};
    while (1)
    {
        $holder->{sortnames}[($locale->{lcid} >> 16) - 1] = $locale->{name};
        $target = locale_parent( $target );
        last if !$target || !defined $lcnames{$target};
        $holder = $lcnames{$target};
        last if defined $holder->{sortbase} && $holder->{sortbase} ne $root_name;
        $holder->{sortbase} = $root_name;
    }
}
# Number the non-alias locales consecutively in @locales order, then give
# every alias the index of the locale at the end of its alias chain.

my $idx = 0;
for my $locale (grep { !defined $_->{alias} } @locales)
{
    $locale->{idx} = $idx++;
}
for my $locale (@locales)
{
    my $target = $locale->{alias};
    next unless defined $target;
    # follow alias-of-alias links down to the real locale
    $target = $lcnames{$target}->{alias} while defined $lcnames{$target}->{alias};
    $locale->{idx} = $lcnames{$target}->{idx};
}
# lcids table: one record per %lcids entry in ascending numeric lcid order,
# packed as a 32-bit lcid followed by two 16-bit values (the locale index and
# the value returned by add_string for the locale name)

my $lcid_data = "";
for my $lcid (sort { $a <=> $b } keys %lcids)
{
    my $entry = $lcids{$lcid};
    $lcid_data .= pack( "L<S<2", $lcid, $entry->{idx}, add_string($entry->{name}) );
}

# lcnames table: one record per locale name in compare_locales order, packed
# as two 16-bit values (add_string result for the name, locale index) and the
# 32-bit lcid, with 0x1000 standing in for a missing or zero lcid

my $lcname_data = "";
for my $lcname (sort compare_locales keys %lcnames)
{
    my $entry = $lcnames{$lcname};
    $lcname_data .= pack( "S<2L<", add_string($lcname), $entry->{idx}, $entry->{lcid} || 0x1000 );
}
4630 # output locales array
4632 my $locale_data = "";
4633 my $default_lcid = 0x8001;
4634 foreach my $loc (@locales)
4636 next if defined $loc->{alias};
4637 my $sname = $loc->{name};
4638 my $language = $loc->{language};
4639 my $territory = $loc->{territory};
4640 my $script = $loc->{script};
4641 my $neutral = ($sname && $sname !~ /-$territory/);
4642 my $sparent = $loc->{sparent} || (($sname =~ /(.*)-[0-9A-Za-z]+/) ? $1 : $loc->{parent});
4643 my $unique_lcid = $loc->{lcid};
4644 unless (defined $unique_lcid) { $unique_lcid = $default_lcid++; }
4645 my $geo = $geotable{$territory};
4646 my $territory_match = "contains(concat(' ',normalize-space(\@territories),' '),' $territory ')";
4648 # languages and scripts
4650 my $ssortlocale = $loc->{sortlocale} || ($neutral ? "$sname-$territory" : $sname);
4651 my $idefaultlanguage = defined $lcnames{$ssortlocale} ? $lcnames{$ssortlocale}->{lcid} : undef;
4652 $idefaultlanguage = $lcnames{"en-US"}->{lcid} unless $ssortlocale;
4653 (my $siso639langname = $sname) =~ s/-.*$//;
4654 my $siso639langname2 = $iso639{$siso639langname} || $siso639langname;
4655 my $sopentypelang = sprintf "%-4s", locale_entry( $loc, "sopentypelang", uc $siso639langname2 );
4656 my $sabbrevlangname = defined $loc->{lcid} ? locale_entry( $loc, "sabbrevlangname", uc $siso639langname2 ) : "ZZZ";
4657 my $siso3166ctryname2 = $geo->{iso3} || $geo->{uncode};
4658 my $senglanguage = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" ) || "";
4659 my $sengcountry = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" ) || "";
4660 my $snativelangname = loc_query( $loc, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" );
4661 my $snativectryname = loc_query( $loc, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" );
4662 $sengcountry =~ s/South Korea/Korea/;
4663 $snativelangname ||= $senglanguage;
4664 $snativectryname ||= $sengcountry;
4665 if ($script)
4667 my $engscript = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4668 my $nativescript = loc_query( $loc, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4669 $senglanguage .= " ($engscript)" if $engscript;
4670 $snativelangname .= " ($nativescript)" if $nativescript;
4672 my $sengdisplayname = $neutral ? $senglanguage : "$senglanguage ($sengcountry)";
4673 my $snativedisplayname = $neutral ? $snativelangname : "$snativelangname ($snativectryname)";
4674 $sengdisplayname =~ s/\) \(/, /;
4675 $snativedisplayname =~ s/\) \(/, /;
4676 my $sscripts = locale_entry( $loc, "sscripts", $script ) || xml_query( $suppl, "/supplementalData/languageData/language[\@type='$language' and not(\@alt)]/\@scripts" );
4677 $sscripts = (join ";", (sort split / /, ($sscripts || "Latn"))) . ";";
4678 my $ireadinglayout = locale_entry( $loc, "ireadinglayout", 0 );
4679 my $charlayout = loc_query( $loc, "/ldml/layout/orientation/characterOrder" );
4680 if ($charlayout eq "right-to-left")
4682 $ireadinglayout = 1;
4684 elsif ($charlayout eq "top-to-bottom")
4686 my $linelayout = loc_query( $loc, "/ldml/layout/orientation/lineOrder" );
4687 $ireadinglayout = $linelayout eq "right-to-left" ? 2 : 3;
4689 my $igeoid = $geo->{id} || 0;
4691 # numbers
4693 my $sdecimal = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/decimal" );
4694 my $slist = locale_entry( $loc, "slist", ";" );
4695 my $smondecimalsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyDecimal" ) || $sdecimal;
4696 my $sthousand = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/group" );
4697 $sthousand =~ s/\x{202f}/\x{00a0}/;
4698 my $smonthousandsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyGroup" ) || $sthousand;
4699 my $spositivesign = "";
4700 my $snegativesign = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/minusSign" );
4701 my $spercent = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/percentSign" );
4702 my $snan = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/nan" );
4703 my $sposinfinity = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/infinity" );
4704 my $sneginfinity = $sposinfinity ? "-$sposinfinity" : "";
4705 my $sgrouping = format_to_grouping( loc_query( $loc, "/ldml/numbers/decimalFormats[\@numberSystem='latn']/decimalFormatLength[not(\@type)]/decimalFormat/pattern" ));
4706 my $percentformat = loc_query( $loc, "/ldml/numbers/percentFormats[\@numberSystem='latn']/percentFormatLength[not(\@type)]/percentFormat/pattern" );
4707 my $currencyformat = loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='accounting']/pattern" ) ||
4708 loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='standard']/pattern" );
4709 my $smongrouping = format_to_grouping( $currencyformat );
4710 my ($icurrency, $inegcurr) = parse_currency_format( $sname, $currencyformat );
4711 my ($ipospercent, $inegpercent) = parse_percent_format( $percentformat );
4712 my $native_numbering = loc_query( $loc, "/ldml/numbers/otherNumberingSystems/native" );
4713 my @snativedigits = split //, xml_query( $numbers, "/supplementalData/numberingSystems/numberingSystem[\@id='$native_numbering']/\@digits" );
4714 my $digitsubstitution = !(ord($snativedigits[0]) >= 0x600 && ord($snativedigits[0]) <= 0x6ff);
4715 my $measure = defined xml_query( $suppl, "/supplementalData/measurementData/measurementSystem[\@type='US' and $territory_match]" );
4716 my $papersize = defined xml_query( $suppl, "/supplementalData/measurementData/paperSize[\@type='US-Letter' and $territory_match]" );
4718 # currencies
4720 my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
4721 my $scurrency = $geo->{scurrency} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[\@alt='narrow']" );
4722 $scurrency ||= loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[not(\@alt)]" );
4723 $geo->{scurrency} = $scurrency if $scurrency;
4724 $scurrency ||= $sintlsymbol;
4725 my $sengcurrname = $loc->{sengcurrname} || loc_query( $lcnames{en}, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" );
4726 my $snativecurrname = $loc->{sengcurrname} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" ) || $sengcurrname;
4727 my $icurrdigits = xml_query( $suppl, "/supplementalData/currencyData/fractions/info[\@iso4217='$sintlsymbol']/\@digits" );
4728 $icurrdigits = 2 unless defined $icurrdigits;
4730 # calendars
4732 my $firstday = xml_query( $suppl, "/supplementalData/weekData/firstDay[not(\@alt) and $territory_match]/\@day" );
4733 my $ifirstdayofweek = $firstday ? $days{$firstday} : 1;
4734 my $firstweekofyear = (xml_query( $suppl, "/supplementalData/weekData/minDays[$territory_match]/\@count" ) || 0) == 4 ? 2 : 0;
4735 my $serastring = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/eras/eraAbbr/era[\@type='1' and not(\@alt)]" );
# Collect the gregorian day names of this locale in the day order defined by
# %days: wide names, abbreviated names and shortest names.  A missing
# narrower form falls back to the next wider one.
my (@sdayname, @sabbrevdayname, @sshortestdayname);
foreach my $d (sort { $days{$a} <=> $days{$b} } keys %days)
{
    my %name;
    foreach my $type (qw(wide abbreviated short))
    {
        $name{$type} = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/days/dayContext[\@type='format']/dayWidth[\@type='$type']/day[\@type='$d' and not(\@alt)]" );
    }
    push @sdayname, $name{wide};
    push @sabbrevdayname, $name{abbreviated} || $name{wide};
    push @sshortestdayname, $name{short} || $name{abbreviated} || $name{wide};
}
# Collect the gregorian month names (months 1..13) of this locale.  The
# "stand-alone" context provides the plain names and the "format" context the
# genitive ones; missing entries fall back as spelled out below.
my (@smonthname, @sabbrevmonthname, @sgenitivemonth, @sabbrevgenitivemonth);
my $query_month = sub
{
    my ($context, $width, $month) = @_;
    return loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='$context']/monthWidth[\@type='$width']/month[\@type='$month']" );
};
for my $month (1..13)
{
    my $plain       = $query_month->( "stand-alone", "wide", $month );
    my $plain_short = $query_month->( "stand-alone", "abbreviated", $month );
    my $gen         = $query_month->( "format", "wide", $month );
    my $gen_short   = $query_month->( "format", "abbreviated", $month );

    push @smonthname, $plain || $gen || "";
    push @sabbrevmonthname, $plain_short || $gen_short || $plain || $gen || "";
    push @sgenitivemonth, $gen || "";
    push @sabbrevgenitivemonth, $gen_short || $gen || "";
}
# genitive lists that merely repeat the plain names are emitted empty
if (join("|",@smonthname) eq join("|",@sgenitivemonth)) { @sgenitivemonth = (); }
if (join("|",@sabbrevmonthname) eq join("|",@sabbrevgenitivemonth)) { @sabbrevgenitivemonth = (); }
4763 my %caltypes = ( "gregorian" => 1, "japanese" => 3, "chinese" => 4, "dangi" => 5, "islamic" => 6, "buddhist" => 7, "hebrew" => 8,
4764 "persian" => 22, "islamic-civil" => 23, "islamic-umalqura" => 23 );
4765 my $calpref = xml_query( $suppl, "/supplementalData/calendarPreferenceData/calendarPreference[$territory_match]/\@ordering" ) || "gregorian";
4766 my $icalendartype;
4767 my @scalnames;
4768 foreach my $c (split /\s+/, $calpref)
4770 next unless defined $caltypes{$c};
4771 $icalendartype .= chr($caltypes{$c});
4772 $scalnames[$caltypes{$c} - 1] = loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$c']" );
4775 # date/time formats
4777 my $s1159 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='am' and not(\@alt)]" );
4778 my $s2359 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='pm' and not (\@alt)]" );
4779 my $sshortestam = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='am' and not(\@alt)]" );
4780 my $sshortestpm = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='pm' and not (\@alt)]" );
4781 my @stimeformat = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='medium']/timeFormat/pattern[not(\@alt)]" ));
4782 push @stimeformat, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4783 pop @stimeformat if $stimeformat[0] eq $stimeformat[1];
4784 @stimeformat = map convert_time_format($_), @stimeformat;
4785 my @sshorttime = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='short']/timeFormat/pattern[not(\@alt)]" ));
4786 push @sshorttime, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hm' and not(\@alt)]" );
4787 pop @sshorttime if $sshorttime[0] eq $sshorttime[1];
4788 @sshorttime = map convert_time_format($_), @sshorttime;
4789 my @sshortdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4790 push @sshortdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
4791 @sshortdate = map convert_date_format($_), @sshortdate;
4792 my @slongdate = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" ));
4793 push @slongdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
4794 @slongdate = map convert_date_format($_), @slongdate;
4795 my @smonthday = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMd' and not(\@alt)]" ));
4796 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Md' and not(\@alt)]" );
4797 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMd' and not(\@alt)]" );
4798 @smonthday = map convert_date_format($_), @smonthday;
4799 my @syearmonth = map convert_date_format($_), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMM' and not(\@alt)]" );
4800 my @sduration = map convert_time_format( lc $_ ), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4801 my $srelativelongdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMEd' and not(\@alt)]" ) ||
4802 loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMEd' and not(\@alt)]" );
4803 $srelativelongdate = convert_date_format( $srelativelongdate );
# Locales that own calendars (attached earlier from @calendars) hand their
# gregorian date formats and day/month/era names over to those calendars.
# The arrays are stored by reference, the era string as a one-element list.
if (defined $loc->{calendar})
{
    foreach my $cal (@{$loc->{calendar}})
    {
        $cal->{sshortdate} = \@sshortdate;
        $cal->{syearmonth} = \@syearmonth;
        $cal->{slongdate} = \@slongdate;
        $cal->{serastring} = [ $serastring ];
        $cal->{sdayname} = \@sdayname;
        $cal->{sabbrevdayname} = \@sabbrevdayname;
        $cal->{smonthname} = \@smonthname;
        $cal->{sabbrevmonthname} = \@sabbrevmonthname;
        # NOTE(review): @scalnames is filled at index (calendar type - 1) in
        # this loop but read here at index $cal->{id}; confirm that this
        # indexing is intended.
        $cal->{scalname} = $scalnames[$cal->{id}];
        $cal->{smonthday} = \@smonthday;
        $cal->{sshortestdayname} = \@sshortestdayname;
        # the abbreviated era string is the same string as the regular one
        $cal->{sabbreverastring} = [ $serastring ];
        $cal->{srelativelongdate} = $srelativelongdate;
    }
}
4826 # codepages
4828 my %ansicpmap = ( 437 => 1252, 720 => 1256, 737 => 1253, 775 => 1257, 850 => 1252,
4829 852 => 1250, 855 => 1251, 866 => 1251, 857 => 1254, 862 => 1255 );
4830 my %maccpmap = ( 437 => 10000, 720 => 10004, 737 => 10006, 775 => 10029, 850 => 10000,
4831 852 => 10029, 855 => 10007, 857 => 10081, 862 => 10005, 866 => 10007,
4832 874 => 10021, 932 => 10001, 936 => 10008, 949 => 10003, 950 => 10002,
4833 1258 => 10000 );
4834 my %ebcdiccpmap = ( 437 => 37, 720 => 20420, 737 => 20273, 866 => 20880, 932 => 20290 );
4835 my %codepagemasks = ( 874 => [ 0x01000000, 0x00000000, 0x00000000, 0, 0x00010000, 0x00000000, 0x00010000, 0x00000000 ],
4836 932 => [ 0x00000000, 0x28c70000, 0x00000010, 0, 0x00020000, 0x00000000, 0x00020000, 0x00000000 ],
4837 936 => [ 0x00000000, 0x28010000, 0x00000002, 0, 0x00040000, 0x00000000, 0x00040000, 0x00000000 ],
4838 949 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00080000, 0x00000000, 0x00080000, 0x00000000 ],
4839 950 => [ 0x00000000, 0x28c10000, 0x00000012, 0, 0x00100000, 0x00000000, 0x00100000, 0x00000000 ],
4840 1258 => [ 0x2000000f, 0x00000000, 0x00000000, 0, 0x00000100, 0x00008000, 0x00000100, 0x00008000 ],
4841 866 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x00020000, 0x00000004, 0x02020000 ],
4842 862 => [ 0x00000800, 0x40000000, 0x00000000, 0, 0x00000020, 0x00200000, 0x00000020, 0x00200000 ],
4843 857 => [ 0x0000001f, 0x00000000, 0x00000000, 0, 0x00000010, 0x01000000, 0x00000010, 0x01000000 ],
4844 855 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x02000000, 0x00000004, 0x02000000 ],
4845 852 => [ 0x00000027, 0x00000000, 0x00000000, 0, 0x00000002, 0x04000000, 0x00000002, 0x04000000 ],
4846 775 => [ 0x00000007, 0x00000000, 0x00000000, 0, 0x00000080, 0x08000000, 0x00000080, 0x08000000 ],
4847 737 => [ 0x00000080, 0x00000000, 0x00000000, 0, 0x00000008, 0x10000000, 0x00000008, 0x10010000 ],
4848 720 => [ 0x00002000, 0x00000000, 0x00000000, 0, 0x00000040, 0x20000000, 0x00000040, 0x20080000 ],
4849 850 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x40000000, 0x0000019f, 0xdfd70000 ],
4850 437 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x80000000, 0x0000019f, 0xdfd70000 ],
4851 65001 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00000000, 0x00000000, 0x0000019f, 0xdfd70000 ] );
4852 my $oemcp = locale_entry( $loc, "oemcp", 65001 );
4853 my $maccp = locale_entry( $loc, "maccp", undef ) || $maccpmap{$oemcp} || 65001;
4854 my $ebcdiccp = locale_entry( $loc, "ebcdiccp", undef ) || $ebcdiccpmap{$oemcp} || 500;
4855 $ebcdiccp = 500 if (defined $loc->{oemcp} && $loc->{oemcp} == 65001) || (defined $loc->{maccp} && $loc->{maccp} == 65001);
4856 my $ansicp = $ansicpmap{$oemcp} || $oemcp;
4857 my @fontsig = (0) x 8;
4858 my $sig = locale_entry( $loc, "fontsig", [] );
4859 foreach my $i (0..7) { $fontsig[$i] |= $codepagemasks{$oemcp}->[$i]; }
4860 foreach my $i (0..$#{$sig}) { $fontsig[$i] |= $sig->[$i]; }
4861 $fontsig[3] |= 1 << 31;
4862 $fontsig[3] |= 1 << 27 if $ireadinglayout == 1;
4863 $fontsig[3] |= 1 << 28 if $ireadinglayout == 3;
# The invariant locale (the one with an empty name) does not use the values
# computed above for the fields below; they are replaced by fixed values.

if (!$loc->{name})
{
    # names
    $siso639langname    = "iv";
    $siso639langname2   = "ivl";
    $snativelangname    = "Invariant Language";
    $senglanguage       = "Invariant Language";
    $snativectryname    = "Invariant Country";
    $sengcountry        = "Invariant Country";
    $sengdisplayname    = "Invariant Language (Invariant Country)";
    $snativedisplayname = "Invariant Language (Invariant Region)";

    # currency
    $snativecurrname = "International Monetary Fund";
    $sengcurrname    = "International Monetary Fund";
    $scurrency       = "\x{00a4}";

    # calendar and geography
    $ifirstdayofweek = 0;
    $igeoid          = $geotable{"US"}->{id};

    # date and time formats
    @stimeformat       = ("HH:mm:ss");
    @sshortdate        = ("MM/dd/yyyy", "yyyy-MM-dd");
    @slongdate         = ("dddd, dd MMMM yyyy");
    @syearmonth        = ("yyyy MMMM");
    @smonthday         = ("MMMM dd", "MMMM d", "M/d", "MMM d");
    @sshorttime        = ("HH:mm", "hh:mm tt", "H:mm", "h:mm tt");
    $srelativelongdate = "dddd, MMMM dd";

    # numbers
    $sposinfinity  = "Infinity";
    $sneginfinity  = "-Infinity";
    $spositivesign = "+";
    $inegpercent   = 0;
    $ipospercent   = 0;
}
4892 # output data
4894 $locale_data .= pack "L<2",
4895 add_string( $sname ), # name
4896 add_string( $sopentypelang ); # LOCALE_SOPENTYPELANGUAGETAG
4898 $locale_data .= pack "S<14",
4899 $loc->{lcid} || 0x1000, # LOCALE_ILANGUAGE
4900 $unique_lcid, # unique_lcid
4901 locale_entry( $loc, "idigits", 2 ), # LOCALE_IDIGITS
4902 locale_entry( $loc, "inegnumber", 1 ), # LOCALE_INEGNUMBER
4903 $icurrdigits, # LOCALE_ICURRDIGITS
4904 $icurrency, # LOCALE_ICURRENCY
4905 $inegcurr, # LOCALE_INEGCURR
4906 locale_entry( $loc, "ilzero", 1 ), # LOCALE_ILZERO
4907 !$neutral, # LOCALE_INEUTRAL
4908 $ifirstdayofweek, # LOCALE_IFIRSTDAYOFWEEK
4909 $firstweekofyear, # LOCALE_IFIRSTWEEKOFYEAR
4910 $geo->{dialcode} || 1 , # LOCALE_ICOUNTRY,
4911 $measure, # LOCALE_IMEASURE
4912 $digitsubstitution; # LOCALE_IDIGITSUBSTITUTION
4914 $locale_data .= pack "L<18",
4915 add_string( $sgrouping ), # LOCALE_SGROUPING
4916 add_string( $smongrouping ), # LOCALE_SMONGROUPING
4917 add_string( $slist ), # LOCALE_SLIST
4918 add_string( $sdecimal ), # LOCALE_SDECIMAL
4919 add_string( $sthousand ), # LOCALE_STHOUSAND
4920 add_string( $scurrency ), # LOCALE_SCURRENCY
4921 add_string( $smondecimalsep ), # LOCALE_SMONDECIMALSEP
4922 add_string( $smonthousandsep ), # LOCALE_SMONTHOUSANDSEP
4923 add_string( $spositivesign ), # LOCALE_SPOSITIVESIGN
4924 add_string( $snegativesign ), # LOCALE_SNEGATIVESIGN
4925 add_string( $s1159 ), # LOCALE_S1159
4926 add_string( $s2359 ), # LOCALE_S2359
4927 add_strarray( @snativedigits ), # LOCALE_SNATIVEDIGITS
4928 add_strarray( @stimeformat ), # LOCALE_STIMEFORMAT
4929 add_strarray( @sshortdate ), # LOCALE_SSHORTDATE
4930 add_strarray( @slongdate ), # LOCALE_SLONGDATE
4931 add_strarray( @syearmonth ), # LOCALE_SYEARMONTH
4932 add_strarray( @sduration ); # LOCALE_SDURATION
4934 $locale_data .= pack "S<8",
4935 $idefaultlanguage || 0x1000, # LOCALE_IDEFAULTLANGUAGE
4936 $ansicp, # LOCALE_IDEFAULTANSICODEPAGE
4937 $oemcp, # LOCALE_IDEFAULTCODEPAGE
4938 $maccp, # LOCALE_IDEFAULTMACCODEPAGE
4939 $ebcdiccp, # LOCALE_IDEFAULTEBCDICCODEPAGE
4940 $igeoid < 65536 ? $igeoid : 39070, # old_geoid
4941 $papersize ? 1 : 9, # LOCALE_IPAPERSIZE
4942 0; # FIXME # islamic_cal
4944 $locale_data .= pack "L<24",
4945 add_string( $icalendartype ), # LOCALE_ICALENDARTYPE
4946 add_string( $sabbrevlangname ), # LOCALE_SABBREVLANGNAME
4947 add_string( $siso639langname ), # LOCALE_SISO639LANGNAME
4948 add_string( $senglanguage ), # LOCALE_SENGLANGUAGE
4949 add_string( $snativelangname ), # LOCALE_SNATIVELANGNAME
4950 add_string( $sengcountry ), # LOCALE_SENGCOUNTRY
4951 add_string( $snativectryname ), # LOCALE_SNATIVECTRYNAME
4952 add_string( $siso3166ctryname2 ), # LOCALE_SABBREVCTRYNAME
4953 add_string( $territory ), # LOCALE_SISO3166CTRYNAME
4954 add_string( $sintlsymbol ), # LOCALE_SINTLSYMBOL
4955 add_string( $sengcurrname ), # LOCALE_SENGCURRNAME
4956 add_string( $snativecurrname ), # LOCALE_SNATIVECURRNAME
4957 add_fontsig( @fontsig ), # LOCALE_FONTSIGNATURE
4958 add_string( $siso639langname2 ), # LOCALE_SISO639LANGNAME2
4959 add_string( $siso3166ctryname2 ), # LOCALE_SISO3166CTRYNAME2
4960 add_string( $sparent ), # LOCALE_SPARENT
4961 add_strarray( @sdayname ), # LOCALE_SDAYNAME
4962 add_strarray( @sabbrevdayname ), # LOCALE_SABBREVDAYNAME
4963 add_strarray( @smonthname ), # LOCALE_SMONTHNAME
4964 add_strarray( @sabbrevmonthname ), # LOCALE_SABBREVMONTHNAME
4965 add_strarray( @sgenitivemonth ), # LOCALE_SGENITIVEMONTH
4966 add_strarray( @sabbrevgenitivemonth ), # LOCALE_SABBREVGENITIVEMONTH
4967 add_strarray( @scalnames ), # LOCALE_SCALNAMES
4968 add_strarray( @{$loc->{sortnames}} ); # LOCALE_SSORTNAMES
4970 $locale_data .= pack "S<6",
4971 $inegpercent, # LOCALE_INEGATIVEPERCENT
4972 $ipospercent, # LOCALE_IPOSITIVEPERCENT
4973 0, # unknown
4974 $ireadinglayout, # LOCALE_IREADINGLAYOUT
4975 0x2a, # unknown
4976 0x2a; # unknown
4978 $locale_data .= pack "L<24",
4979 0, # unknown
4980 add_string( $sengdisplayname ), # LOCALE_SENGLISHDISPLAYNAME
4981 add_string( $snativedisplayname ), # LOCALE_SNATIVEDISPLAYNAME
4982 add_string( $spercent ), # LOCALE_SPERCENT
4983 add_string( $snan ), # LOCALE_SNAN
4984 add_string( $sposinfinity ), # LOCALE_SPOSINFINITY
4985 add_string( $sneginfinity ), # LOCALE_SNEGINFINITY
4986 0, # unknown
4987 add_string( $serastring ), # CAL_SERASTRING
4988 add_string( $serastring ), # CAL_SABBREVERASTRING
4989 0, # unknown
4990 add_string( $ssortlocale ), # LOCALE_SCONSOLEFALLBACKNAME
4991 add_strarray( @sshorttime ), # LOCALE_SSHORTTIME
4992 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
4993 0, # unknown
4994 add_string( $ssortlocale ), # LOCALE_SSORTLOCALE
4995 add_string( "0409:00000409" ), # FIXME # LOCALE_SKEYBOARDSTOINSTALL
4996 add_string( $sscripts ), # LOCALE_SSCRIPTS
4997 add_string( $srelativelongdate ), # LOCALE_SRELATIVELONGDATE
4998 $igeoid, # LOCALE_IGEOID
4999 add_string( $sshortestam || "a" ), # LOCALE_SSHORTESTAM
5000 add_string( $sshortestpm || "p" ), # LOCALE_SSHORTESTPM
5001 add_strarray( @smonthday ), # LOCALE_SMONTHDAY
5002 add_string( "k0-windows-us" ) # FIXME # keyboard_layout
# Emit the registry values for locales and language groups.  Only locales
# with an lcid whose top bit is clear are listed, and neutral locales are
# left out.  Each listed locale contributes its group (default 1), written
# in hex, and every group seen is finally enabled under "Language Groups".

my %groups;
add_registry_key( "Locale", "00000409" );
for my $locale (@locales)
{
    my $lcid = $locale->{lcid};
    next unless defined $lcid;
    next if $lcid & 0x80000000;
    # skip neutral locales
    next if !defined($locale->{alias}) && $locale->{name} !~ /-$locale->{territory}/;

    my $group = locale_entry( $locale, "group", 1 );
    my $value_name = sprintf( "%08x", $lcid );
    my $group_hex = sprintf( "%x", $group );
    if (!($lcid & 0x000f0000))
    {
        add_registry_value( "Locale", $value_name, $group_hex );
    }
    if ($locale->{name} =~ /_/)
    {
        add_registry_value( "Locale\\Alternate Sorts", $value_name, $group_hex );
    }
    $groups{$group_hex} = 1;
}
for my $group_hex (keys %groups)
{
    add_registry_value( "Language Groups", $group_hex, "1" );
}
5023 # output calendar data
5025 my $calendar_data = "";
5026 foreach my $cal (@calendars)
5028 my $scalname = $cal->{name};
5029 my $iyearoffsetrange = 0;
5030 my $itwodigityearmax = $cal->{itwodigityearmax};
5031 my @sshortdate;
5032 my @syearmonth;
5033 my @slongdate;
5034 my @serastring;
5035 my @sdayname;
5036 my @sabbrevdayname;
5037 my @smonthname;
5038 my @sabbrevmonthname;
5039 my @smonthday;
5040 my @sabbreverastring;
5041 my @sshortestdayname;
5043 my $type = $cal->{type};
5044 if (defined $cal->{locale} && defined $type)
5046 my $loc = $lcnames{$cal->{locale}};
5047 my $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
5048 push @sshortdate, $fmt if $fmt;
5049 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMd' and not(\@alt)]" );
5050 push @sshortdate, $fmt if $fmt;
5051 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
5052 push @sshortdate, $fmt if $fmt;
5053 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMMMd' and not(\@alt)]" );
5054 push @sshortdate, $fmt if $fmt;
5055 @sshortdate = map convert_date_format($_), @sshortdate;
5056 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" );
5057 push @slongdate, $fmt if $fmt;
5058 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
5059 push @slongdate, $fmt if $fmt;
5060 @slongdate = map convert_date_format($_), @slongdate;
5062 foreach my $n (1..13)
5064 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n' and not(\@yeartype)]" );
5065 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n' and not(\@yeartype)]" );
5066 push @smonthname, $name || "";
5067 push @sabbrevmonthname, $abbrev || $name || "";
5070 $scalname ||= loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$type']" );
5071 if (defined $cal->{eras})
5073 my @eras;
5074 my $idx = 1;
5075 foreach my $era (@{$cal->{eras}})
5077 my $start = xml_query( $suppl, "/supplementalData/calendarData/calendar[\@type='$type']/eras/era[\@type='$era']/\@start" );
5078 next unless $start =~ /^(-?\d+)-(\d+)-(\d+)/;
5079 my ($year, $mon, $day, $zero, $first) = ($1, $2, $3, $1 - 1, 1);
5080 if ($zero < 0)
5082 $first -= $zero;
5083 $year = 1;
5084 $itwodigityearmax = 2049 - $zero;
5086 unshift @eras, pack( "S<8", 6, $idx++, $year, $mon, $day, $zero, $first, 0 );
5087 push @serastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraAbbr/era[\@type='$era']" );
5088 push @sabbreverastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraNarrow/era[\@type='$era']" );
5090 $iyearoffsetrange = add_str_data( pack "S<L<*", scalar @eras, map { add_str_data($_); } @eras );
5094 @sshortdate = @{$cal->{sshortdate}} if defined $cal->{sshortdate} && !@sshortdate;
5095 @syearmonth = @{$cal->{syearmonth}} if defined $cal->{syearmonth};
5096 @slongdate = @{$cal->{slongdate}} if defined $cal->{slongdate} && !@slongdate;
5097 @serastring = @{$cal->{serastring}} if defined $cal->{serastring} && !@serastring;
5098 @sdayname = @{$cal->{sdayname}} if defined $cal->{sdayname};
5099 @sabbrevdayname = @{$cal->{sabbrevdayname}} if defined $cal->{sabbrevdayname};
5100 @smonthname = @{$cal->{smonthname}} if defined $cal->{smonthname} && !join("",@smonthname);
5101 @sabbrevmonthname = @{$cal->{sabbrevmonthname}} if defined $cal->{sabbrevmonthname} && !join("",@sabbrevmonthname);
5102 @smonthday = @{$cal->{smonthday}} if defined $cal->{smonthday};
5103 @sabbreverastring = @{$cal->{sabbreverastring}} if defined $cal->{sabbreverastring} && !@sabbreverastring;
5104 @sshortestdayname = @{$cal->{sshortestdayname}} if defined $cal->{sshortestdayname};
5105 my $srelativelongdate = $cal->{srelativelongdate};
5107 @serastring = ("A.D.") unless @serastring;
5108 @sabbreverastring = ("AD") unless @sabbreverastring;
5110 if ($cal->{id} != 1) # calendar 1 is a placeholder, information is fetched from locale instead
5112 @sshortdate = ("") unless @sshortdate;
5113 @syearmonth = ("") unless @syearmonth;
5114 @slongdate = ("") unless @slongdate;
5115 @sdayname = ("") x 7 unless @sdayname;
5116 @sabbrevdayname = ("") x 7 unless @sabbrevdayname;
5117 @sshortestdayname = ("") x 7 unless @sshortestdayname;
5118 @smonthname = ("") x 13 unless @smonthname;
5119 @sabbrevmonthname = ("") x 13 unless @sabbrevmonthname;
5120 @smonthday = ("") unless @smonthday;
5123 $calendar_data .= pack "S<2L<17",
5124 $cal->{id}, # CAL_ICALINTVALUE
5125 $itwodigityearmax || 99, # CAL_ITWODIGITYEARMAX
5126 add_strarray( @sshortdate ), # CAL_SSHORTDATE
5127 add_strarray( @syearmonth ), # CAL_SYEARMONTH
5128 add_strarray( @slongdate ), # CAL_SLONGDATE
5129 add_strarray( @serastring ), # CAL_SERASTRING
5130 $iyearoffsetrange, # CAL_IYEAROFFSETRANGE
5131 add_strarray( @sdayname ), # CAL_SDAYNAME
5132 add_strarray( @sabbrevdayname ), # CAL_SABBREVDAYNAME
5133 add_strarray( @smonthname ), # CAL_SMONTHNAME
5134 add_strarray( @sabbrevmonthname ), # CAL_SABBREVMONTHNAME
5135 add_string( $scalname ), # CAL_SCALNAME
5136 add_strarray( @smonthday ), # CAL_SMONTHDAY
5137 add_strarray( @sabbreverastring ), # CAL_SABBREVERASTRING
5138 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5139 add_string( $srelativelongdate ); # CAL_SRELATIVELONGDATE
5142 # output locale header
5144 my $nb_lcids = scalar keys %lcids;
5145 my $nb_locales = scalar grep { !defined $_->{alias} } @locales;
5146 my $nb_lcnames = scalar keys %lcnames;
5147 my $locale_size = length($locale_data) / $nb_locales;
5148 my $nb_calendars = scalar @calendars;
5149 my $calendar_size = length($calendar_data) / $nb_calendars;
5150 my $lcids_offset = 19 * 4; # size of header
5151 my $lcnames_offset = $lcids_offset + length $lcid_data;
5152 my $locales_offset = $lcnames_offset + length $lcname_data;
5153 my $calendar_offset = $locales_offset + length $locale_data;
5154 my $strings_offset = $calendar_offset + length $calendar_data;
5156 my $locale_header = pack "L<7S<4L<S<2L<3S<2L<4",
5157 8, # offset
5159 7, # version
5160 0x5344534e, # magic
5161 0, 0, 0,
5163 $nb_lcids,
5164 $nb_locales,
5165 $locale_size,
5166 $locales_offset,
5167 $nb_lcnames,
5169 $lcids_offset,
5170 $lcnames_offset,
5172 $nb_calendars,
5173 $calendar_size,
5174 $calendar_offset,
5175 $strings_offset,
5176 0, 0;
5178 return align_string( 4, $locale_header . $lcid_data . $lcname_data . $locale_data . $calendar_data . $string_data );
################################################################
# build the geoids table for locale.nls
#
# Returns a packed string made of three parts:
#   - a header of seven little-endian 32-bit values,
#   - one record per entry of @geoids, in table order,
#   - a name -> record index table, sorted by name.
sub build_geoids_data()
{
    my $data = "";
    my %index;
    my $idx = 0;
    # The first two values spell "geo" as UTF-16LE characters.  The remaining
    # fields are: [2] total size (filled in last), [3] header size, which is
    # also the offset of the first record, [4] number of records,
    # [5] offset of the name index, [6] number of name index entries.
    my @geo_header = (0x00650067, 0x0000006f, 0, 4 * 7, scalar @geoids, 0, 0);

    foreach my $entry (@geoids)
    {
        # An aliased entry keeps its own id but takes every other field from
        # the alias target (presumably a hash ref resolved while loading the
        # data; verify against load_data).  The target is stored in a separate
        # variable on purpose: the loop variable aliases the @geoids element,
        # so assigning to it would overwrite the global table and make a
        # second call emit the target's id instead of the alias's id.
        my $id = $entry->{id};
        my $geo = defined $entry->{alias} ? $entry->{alias} : $entry;
        my $lat = "0.000";      # coordinates are always written as placeholders
        my $long = "0.000";
        my $iso2 = $geo->{iso2} || "XX";
        my $iso3 = $geo->{iso3} || "XX";
        # explicitly flagged regions, plus entries that have a UN code but no
        # two-letter ISO code
        my $isregion = $geo->{region} || (defined $geo->{uncode} && !defined $geo->{iso2});
        my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
        my $scurrency = $geo->{scurrency} || "\x{00a4}";

        $data .= pack( "L<", $id );
        $data .= pad_string( 24, encode( "UTF16LE", $lat ));
        $data .= pad_string( 24, encode( "UTF16LE", $long ));
        # 14 / 16: presumably the Windows GEOCLASS_REGION / GEOCLASS_NATION
        # values; 39070 is the parent used when none is specified
        $data .= pack( "L<2", $isregion ? 14 : 16, $geo->{parentid} || 39070 );
        $data .= pad_string( 8, encode( "UTF16LE", $iso2 ));
        $data .= pad_string( 8, encode( "UTF16LE", $iso3 ));
        $data .= pack( "S<2", $geo->{uncode} || 0, $geo->{dialcode} || 0 );
        $data .= pad_string( 8, encode( "UTF16LE", $sintlsymbol ));
        $data .= pad_string( 16, encode( "UTF16LE", $scurrency ));
        $index{$geo->{name}} = $idx if $geo->{name};
        $idx++;
    }
    # "XX" maps to the same record as "001"
    $index{"XX"} = $index{"001"};

    $geo_header[5] = $geo_header[3] + length $data;
    $geo_header[6] = scalar keys %index;

    foreach my $name (sort keys %index)
    {
        $data .= pad_string( 8, encode( "UTF16LE", $name ));
        $data .= pack "L<", $index{$name};
    }

    $geo_header[2] = $geo_header[3] + length $data;
    return pack( "L<7", @geo_header ) . $data;
}
################################################################
# build a binary locale table
#
# $filename:  path of the file to generate
# $chartypes: packed character type data, stored right after the header
#
# The file is written as "$filename.new" and then handed to save_file().
# Dies if the temporary file cannot be created or fully written.
sub dump_locales($$)
{
    my ($filename, $chartypes) = @_;

    # pass the name as an argument so that a '%' in it is not taken
    # as a format directive
    printf "Building %s\n", $filename;

    my $locale_data = build_locale_data();
    my $charmaps_data = "";  # FIXME
    my $geoids_data = build_geoids_data();
    my $scripts_data = "";  # FIXME

    # eight 32-bit values; entries 1-3 are left at zero
    my @header = ( 0 ) x 8;
    $header[0] = 4 * scalar @header;                  # chartypes offset
    $header[4] = $header[0] + length $chartypes;      # locales offset
    $header[5] = $header[4] + length $locale_data;    # charmaps offset
    $header[6] = $header[5] + length $charmaps_data;  # geoids offset
    $header[7] = $header[6] + length $geoids_data;    # scripts offset

    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    binmode $output;  # the contents are packed binary data
    print {$output} pack( "L<*", @header );
    print {$output} $chartypes, $locale_data, $charmaps_data, $geoids_data, $scripts_data;
    # buffered write errors only show up when the handle is closed
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# build the script to create registry keys
#
# $filename: path of the script to generate
# %keys:     maps a backslash-separated key path, relative to
#            HKLM\SYSTEM\CurrentControlSet\Control\Nls, to an array ref
#            holding the key's default value followed by its "val" lines
#
# The file is written as "$filename.new" and then handed to save_file().
# Dies if the temporary file cannot be created or fully written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;  # current nesting depth, 4 spaces per level

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print {$output} "HKLM\n{\n";
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf {$output} "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }
    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            # names containing whitespace are quoted; only the innermost
            # subkey gets the default value
            printf {$output} "%*s%s%s\n%*s{\n", 4 * $indent, "",
                $subkeys[$i] =~ /\s/ ? "'$subkeys[$i]'" : $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # the value text is passed as an argument so that a '%' in it is not
        # taken as a format directive
        foreach my $v (sort @vals) { printf {$output} "%*sval %s\n", 4 * $indent, "", $v; }
        for (my $i = 0; $i < @subkeys; $i++) { printf {$output} "%*s}\n", 4 * --$indent, ""; }
    }
    # close the Nls path and the outer HKLM block
    while ($indent) { printf {$output} "%*s}\n", 4 * --$indent, ""; }
    # buffered write errors only show up when the handle is closed
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# save a file if modified
#
# $file: path of the final file; the freshly generated contents are
#        expected in "$file.new"
#
# If $file already exists with identical contents, the temporary file is
# discarded and $file is left untouched; otherwise the temporary file
# replaces it.  Dies if the replacement cannot be put in place.
sub save_file($)
{
    my $file = shift;

    # Compare in-process instead of running "cmp" through the shell, so that
    # file names containing spaces or shell metacharacters are handled.
    require File::Compare;

    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new" or warn "Cannot remove $file.new: $!\n";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!\n";
    }
}
################################################################
# main routine

# If make_unicode is found in the current directory, step up one level;
# every output path below is relative to that parent directory.
if (-f "./make_unicode") { chdir ".."; }

load_data();

# generated C tables
dump_sortkeys( "dlls/kernelbase/collation.c" );
foreach my $target ("dlls/gdi32/uniscribe/direction.c", "dlls/dwrite/direction.c")
{
    dump_bidi_dir_table( $target );
}
dump_digit_folding( "dlls/kernelbase/digitmap.c" );
foreach my $target ("dlls/gdi32/uniscribe/mirror.c", "dlls/dwrite/mirror.c")
{
    dump_mirroring( $target );
}
foreach my $target ("dlls/gdi32/uniscribe/bracket.c", "dlls/dwrite/bracket.c")
{
    dump_bracket( $target );
}
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $target ("dlls/gdi32/uniscribe/linebreak.c", "dlls/dwrite/linebreak.c")
{
    dump_linebreak( $target );
}
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/win32u/vertical.c", 1 );
dump_vertical( "dlls/wineps.drv/vertical.c", 0 );

# generated .nls files
dump_intl_nls("nls/l_intl.nls");
foreach my $target ("nls/normnfc.nls", "nls/normnfd.nls", "nls/normnfkc.nls",
                    "nls/normnfkd.nls", "nls/normidna.nls")
{
    dump_norm_table( $target );
}
# the value returned for the sort table is passed on to the locale table
my $chartype_data = dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
dump_locales( "nls/locale.nls", $chartype_data );
foreach my $codepage_file (@allfiles)
{
    dump_msdata_codepage( $codepage_file );
}
dump_eucjp_codepage();

# registry script
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
5345 # Local Variables:
5346 # compile-command: "./make_unicode"
5347 # End: