win32u: Use user message packing for CB_GETLBTEXT and LB_GETTEXT.
[wine.git] / tools / make_unicode
blob48d637738102bbd13f74eca0a57b71b1c7f2aa39
1 #!/usr/bin/perl -w
3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
22 use strict;
23 use XML::LibXML;
24 use Digest::SHA;
25 use Encode;
26 use Time::Local qw(timegm_modern);
# Versions of the upstream data sets.  They are interpolated into the
# download URLs (and, for the Unicode files, the local names) in %data_files.
my $UNIVERSION = "15.0.0";
my $CLDRVERSION = "43";
my $ISO639VERSION = "20230123";
my $TZVERSION = "2023c";
# Upstream data files, keyed by a short identifier.
# Each entry is a hash ref with:
#   url  - where the file is downloaded from
#   sha  - expected digest as 64 hex digits (presumably SHA-256; confirm
#          against the code that verifies the download)
#   name - optional; presumably the local file name to use instead of one
#          derived from the url (TODO confirm against the consumer)
my %data_files =
(
    ucd       => { url => "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip", name => "UCD-$UNIVERSION.zip",
                   sha => "5fbde400f3e687d25cc9b0a8d30d7619e76cb2f4c3e85ba9df8ec1312cb6718c" },
    unihan    => { url => "https://www.unicode.org/Public/$UNIVERSION/ucd/Unihan.zip", name => "Unihan-$UNIVERSION.zip",
                   sha => "24b154691fc97cb44267b925d62064297086b3f896b57a8181c7b6d42702a026" },
    idna      => { url => "https://www.unicode.org/Public/idna/$UNIVERSION/IdnaMappingTable.txt", name => "IdnaMappingTable-$UNIVERSION.txt",
                   sha => "cc8522199541d60326a42a8f91f8748fd15630a42502dd2cf4878e81e2066ead" },
    cldr      => { url => "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip",
                   sha => "132cdd24e479abb6e86db1429931cec3dada485fd41da39ece3c08e531c477df" },
    cldr33    => { url => "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip",
                   sha => "fa3490082c086d21257153609642f54fcf788fcfda4966fe67f3f6daca0d58b9" },
    sorting   => { url => "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498/Windows 10 Sorting Weight Table.txt",
                   sha => "81fcfa1e5ed3e3a94d329959ff7d97d522ddf9d653d2c4d6ddcccc5cd4df663f" },
    codepages => { url => "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498/Windows Supported Code Page Data Files.zip",
                   sha => "5074e6dd253056ba61fc6c870c9a955467855129c6ad3a51761c386b301b125a" },
    iso639    => { url => "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip",
                   sha => "884faa6cc5ac5181ed7969eed75355c1bc665447614cf4c06c62e87b38fe6a97" },
    ksx1001   => { url => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT",
                   sha => "d8d2a35206ac0ea2865f5d801c9d6717f735bf46f263a658a64a960abe59e371" },
    jis0208   => { url => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT",
                   sha => "1c571870457f19c97720631fa83ee491549a96ba1436da1296786a67d8632e87" },
    jis0212   => { url => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT",
                   sha => "477820bb3055bbcc90880d788cd95607d221dc94457bae249231adecf13c12e6" },
    tzdata    => { url => "https://data.iana.org/time-zones/releases/tzdata$TZVERSION.tar.gz",
                   sha => "3f510b5d1b4ae9bb38e485aa302a776b317fb3637bdb6404c4adf7b6cadd965c" },
);
# Default char for undefined mappings
my $DEF_CHAR = ord '?';

# Last valid Unicode character
my $MAX_CHAR = 0x10ffff;

# Registry key paths.  Most components carry a leading "-"; its meaning is
# defined by whatever consumes these strings, which is not part of this
# section.  NOTE(review): the final "Time Zones" component of $zonekey has
# no "-" prefix, unlike every other component -- presumably intentional.
my $nlskey = "-SYSTEM\\-CurrentControlSet\\-Control\\-Nls";
my $zonekey = "-Software\\-Microsoft\\-Windows NT\\-CurrentVersion\\Time Zones";
# Code page description files to process, as paths of the form
# "CodpageFiles/<codepage>.txt".  The "CodpageFiles" spelling is kept
# exactly as given: these are presumably paths inside the downloaded
# "codepages" archive, so they must not be "corrected" here.
my @allfiles =
(
    "CodpageFiles/037.txt",
    "CodpageFiles/437.txt",
    "CodpageFiles/500.txt",
    "CodpageFiles/708.txt",
    "CodpageFiles/720.txt",
    "CodpageFiles/737.txt",
    "CodpageFiles/775.txt",
    "CodpageFiles/850.txt",
    "CodpageFiles/852.txt",
    "CodpageFiles/855.txt",
    "CodpageFiles/857.txt",
    "CodpageFiles/860.txt",
    "CodpageFiles/861.txt",
    "CodpageFiles/862.txt",
    "CodpageFiles/863.txt",
    "CodpageFiles/864.txt",
    "CodpageFiles/865.txt",
    "CodpageFiles/866.txt",
    "CodpageFiles/869.txt",
    "CodpageFiles/874.txt",
    "CodpageFiles/875.txt",
    "CodpageFiles/932.txt",
    "CodpageFiles/936.txt",
    "CodpageFiles/949.txt",
    "CodpageFiles/950.txt",
    "CodpageFiles/1026.txt",
    "CodpageFiles/1250.txt",
    "CodpageFiles/1251.txt",
    "CodpageFiles/1252.txt",
    "CodpageFiles/1253.txt",
    "CodpageFiles/1254.txt",
    "CodpageFiles/1255.txt",
    "CodpageFiles/1256.txt",
    "CodpageFiles/1257.txt",
    "CodpageFiles/1258.txt",
    "CodpageFiles/1361.txt",
    "CodpageFiles/10000.txt",
    "CodpageFiles/10001.txt",
    "CodpageFiles/10002.txt",
    "CodpageFiles/10003.txt",
    "CodpageFiles/10004.txt",
    "CodpageFiles/10005.txt",
    "CodpageFiles/10006.txt",
    "CodpageFiles/10007.txt",
    "CodpageFiles/10008.txt",
    "CodpageFiles/10010.txt",
    "CodpageFiles/10017.txt",
    "CodpageFiles/10021.txt",
    "CodpageFiles/10029.txt",
    "CodpageFiles/10079.txt",
    "CodpageFiles/10081.txt",
    "CodpageFiles/10082.txt",
    "CodpageFiles/20127.txt",
    "CodpageFiles/20866.txt",
    "CodpageFiles/21866.txt",
    "CodpageFiles/28591.txt",
    "CodpageFiles/28592.txt",
    "CodpageFiles/28593.txt",
    "CodpageFiles/28594.txt",
    "CodpageFiles/28595.txt",
    "CodpageFiles/28596.txt",
    "CodpageFiles/28597.txt",
    "CodpageFiles/28598.txt",
    "CodpageFiles/28599.txt",
    "CodpageFiles/28603.txt",
    "CodpageFiles/28605.txt",
);
# Names of the time zone source files to read (presumably members of the
# "tzdata" archive listed in %data_files -- confirm against the reader).
my @timezone_files = qw(africa antarctica asia australasia europe northamerica southamerica etcetera backward);
# Character type flag bits, keyed by type name.  The CT_CTYPE1 flags occupy
# the low 16 bits and the CT_CTYPE3 flags the high 16 bits of one value.
my %ctype =
(
    # CT_CTYPE1
    "upper"  => 0x0001,
    "lower"  => 0x0002,
    "digit"  => 0x0004,
    "space"  => 0x0008,
    "punct"  => 0x0010,
    "cntrl"  => 0x0020,
    "blank"  => 0x0040,
    "xdigit" => 0x0080,
    # NOTE(review): "alpha" additionally sets bit 31; the reason is not
    # visible in this section.
    "alpha"  => 0x0100 | 0x80000000,
    "defin"  => 0x0200,
    # CT_CTYPE3 in high 16 bits
    "nonspacing"    => 0x00010000,
    "diacritic"     => 0x00020000,
    "vowelmark"     => 0x00040000,
    "symbol"        => 0x00080000,
    "katakana"      => 0x00100000,
    "hiragana"      => 0x00200000,
    "halfwidth"     => 0x00400000,
    "fullwidth"     => 0x00800000,
    "ideograph"     => 0x01000000,
    "kashida"       => 0x02000000,
    "lexical"       => 0x04000000,
    "highsurrogate" => 0x08000000,
    "lowsurrogate"  => 0x10000000,
);
# Numeric codes for bracket types, keyed by a one-letter tag ("o" and "c";
# presumably the open/close values of the Unicode Bidi_Paired_Bracket_Type
# property -- confirm against the code that reads the bracket data).
my %bracket_types =
(
    "o" => 0x0000,
    "c" => 0x0001,
);
# Numeric codes for Indic syllabic categories, keyed by category name
# (the names presumably match the Unicode Indic_Syllabic_Category property
# values -- confirm against the parser that looks them up).
my %indic_types =
(
    "Other"                       => 0x0000,
    "Bindu"                       => 0x0001,
    "Visarga"                     => 0x0002,
    "Avagraha"                    => 0x0003,
    "Nukta"                       => 0x0004,
    "Virama"                      => 0x0005,
    "Vowel_Independent"           => 0x0006,
    "Vowel_Dependent"             => 0x0007,
    "Vowel"                       => 0x0008,
    "Consonant_Placeholder"       => 0x0009,
    "Consonant"                   => 0x000a,
    "Consonant_Dead"              => 0x000b,
    "Consonant_Succeeding_Repha"  => 0x000c,
    "Consonant_Subjoined"         => 0x000d,
    "Consonant_Medial"            => 0x000e,
    "Consonant_Final"             => 0x000f,
    "Consonant_Head_Letter"       => 0x0010,
    "Modifying_Letter"            => 0x0011,
    "Tone_Letter"                 => 0x0012,
    "Tone_Mark"                   => 0x0013,
    "Register_Shifter"            => 0x0014,
    "Consonant_Preceding_Repha"   => 0x0015,
    "Pure_Killer"                 => 0x0016,
    "Invisible_Stacker"           => 0x0017,
    "Gemination_Mark"             => 0x0018,
    "Cantillation_Mark"           => 0x0019,
    "Non_Joiner"                  => 0x001a,
    "Joiner"                      => 0x001b,
    "Number_Joiner"               => 0x001c,
    "Number"                      => 0x001d,
    "Brahmi_Joining_Number"       => 0x001e,
    "Consonant_With_Stacker"      => 0x001f,
    "Consonant_Prefixed"          => 0x0020,
    "Syllable_Modifier"           => 0x0021,
    "Consonant_Killer"            => 0x0022,
    "Consonant_Initial_Postfixed" => 0x0023,
);
# Numeric codes for matra (dependent vowel sign) positions, keyed by
# position name (the names presumably match the Unicode
# Indic_Positional_Category property values -- confirm against the parser).
# Code 0 is not assigned to any name here.
my %matra_types =
(
    "Right"                    => 0x01,
    "Left"                     => 0x02,
    "Visual_Order_Left"        => 0x03,
    "Left_And_Right"           => 0x04,
    "Top"                      => 0x05,
    "Bottom"                   => 0x06,
    "Top_And_Bottom"           => 0x07,
    "Top_And_Right"            => 0x08,
    "Top_And_Left"             => 0x09,
    "Top_And_Left_And_Right"   => 0x0a,
    "Bottom_And_Right"         => 0x0b,
    "Top_And_Bottom_And_Right" => 0x0c,
    "Overstruck"               => 0x0d,
    "Invisible"                => 0x0e,
    "Bottom_And_Left"          => 0x0f,
    "Top_And_Bottom_And_Left"  => 0x10,
);
# Numeric codes for break classes, keyed by the class abbreviation
# (the abbreviations presumably match the Unicode Line_Break property
# values -- confirm against the parser).  Code 0 is not assigned here.
my %break_types =
(
    "BK"  => 0x0001,
    "CR"  => 0x0002,
    "LF"  => 0x0003,
    "CM"  => 0x0004,
    "SG"  => 0x0005,
    "GL"  => 0x0006,
    "CB"  => 0x0007,
    "SP"  => 0x0008,
    "ZW"  => 0x0009,
    "NL"  => 0x000a,
    "WJ"  => 0x000b,
    "JL"  => 0x000c,
    "JV"  => 0x000d,
    "JT"  => 0x000e,
    "H2"  => 0x000f,
    "H3"  => 0x0010,
    "XX"  => 0x0011,
    "OP"  => 0x0012,
    "CL"  => 0x0013,
    "CP"  => 0x0014,
    "QU"  => 0x0015,
    "NS"  => 0x0016,
    "EX"  => 0x0017,
    "SY"  => 0x0018,
    "IS"  => 0x0019,
    "PR"  => 0x001a,
    "PO"  => 0x001b,
    "NU"  => 0x001c,
    "AL"  => 0x001d,
    "ID"  => 0x001e,
    "IN"  => 0x001f,
    "HY"  => 0x0020,
    "BB"  => 0x0021,
    "BA"  => 0x0022,
    "SA"  => 0x0023,
    "AI"  => 0x0024,
    "B2"  => 0x0025,
    "HL"  => 0x0026,
    "CJ"  => 0x0027,
    "RI"  => 0x0028,
    "EB"  => 0x0029,
    "EM"  => 0x002a,
    "ZWJ" => 0x002b,
);
# Numeric codes for vertical orientation types, keyed by abbreviation
# (presumably the Unicode Vertical_Orientation property values R, U, Tr
# and Tu -- confirm against the parser).
my %vertical_types =
(
    "R"  => 0x0000,
    "U"  => 0x0001,
    "Tr" => 0x0002,
    "Tu" => 0x0003,
);
# Character type flags implied by each two-letter general category.
# Every category sets "defin"; the remaining flags are OR-ed in from %ctype.
my %categories =
(
    "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"},                 # Letter, Uppercase
    "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"},                 # Letter, Lowercase
    "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"}, # Letter, Titlecase
    "Mn" => $ctype{"defin"}|$ctype{"nonspacing"},                            # Mark, Non-Spacing
    "Mc" => $ctype{"defin"},                                                 # Mark, Spacing Combining
    "Me" => $ctype{"defin"},                                                 # Mark, Enclosing
    "Nd" => $ctype{"defin"}|$ctype{"digit"},                                 # Number, Decimal Digit
    "Nl" => $ctype{"defin"}|$ctype{"alpha"},                                 # Number, Letter
    "No" => $ctype{"defin"},                                                 # Number, Other
    "Zs" => $ctype{"defin"}|$ctype{"space"},                                 # Separator, Space
    "Zl" => $ctype{"defin"}|$ctype{"space"},                                 # Separator, Line
    "Zp" => $ctype{"defin"}|$ctype{"space"},                                 # Separator, Paragraph
    "Cc" => $ctype{"defin"}|$ctype{"cntrl"},                                 # Other, Control
    "Cf" => $ctype{"defin"}|$ctype{"cntrl"},                                 # Other, Format
    "Cs" => $ctype{"defin"},                                                 # Other, Surrogate
    "Co" => $ctype{"defin"},                                                 # Other, Private Use
    "Cn" => $ctype{"defin"},                                                 # Other, Not Assigned
    "Lm" => $ctype{"defin"}|$ctype{"alpha"},                                 # Letter, Modifier
    "Lo" => $ctype{"defin"}|$ctype{"alpha"},                                 # Letter, Other
    "Pc" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Connector
    "Pd" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Dash
    "Ps" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Open
    "Pe" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Close
    "Pi" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Initial quote
    "Pf" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Final quote
    "Po" => $ctype{"defin"}|$ctype{"punct"},                                 # Punctuation, Other
    "Sm" => $ctype{"defin"}|$ctype{"symbol"},                                # Symbol, Math
    "Sc" => $ctype{"defin"}|$ctype{"symbol"},                                # Symbol, Currency
    "Sk" => $ctype{"defin"}|$ctype{"symbol"},                                # Symbol, Modifier
    "So" => $ctype{"defin"}|$ctype{"symbol"}                                 # Symbol, Other
);
# a few characters need additional categories that cannot be determined automatically
#
# Keyed by the same type names as %ctype; each value is an array ref of the
# code points that receive that extra type.
my %special_categories =
(
    "xdigit"     => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
                      0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space"      => [ 0x09..0x0d, 0x85 ],
    "blank"      => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl"      => [ 0x070f, 0x200c, 0x200d,
                      0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                      0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                      0xfff9, 0xfffa, 0xfffb ],
    "punct"      => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
                      0xd7, 0xf7 ],
    "digit"      => [ 0xb2, 0xb3, 0xb9 ],
    "lower"      => [ 0xaa, 0xba, 0x2071, 0x207f ],
    "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
                      0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
    "diacritic"  => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
    "symbol"     => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
                      0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
                      0x02b9..0x02ba, 0x02c6..0x02cf ],
    "halfwidth"  => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
    # NOTE(review): 0x30ad appears twice below; presumably a harmless
    # redundancy -- confirm against the code that consumes this list.
    "fullwidth"  => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
                      0x3131..0x3164 ],
    "ideograph"  => [ 0x3006..0x3007 ],
    "lexical"    => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
                      0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
                      0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
                      0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
                      0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
                      0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
                      0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
                      0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
                      0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
                      0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
    "kashida"    => [ 0x0640 ],
);
# Numeric direction codes, keyed by bidirectional class abbreviation.
# All embedding, override and isolate controls share the single code 15.
my %directions =
(
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AL"  => 12,   # Right-to-Left Arabic
    "EN"  => 3,    # European Number
    "ES"  => 4,    # European Number Separator
    "ET"  => 5,    # European Number Terminator
    "AN"  => 6,    # Arabic Number
    "CS"  => 7,    # Common Number Separator
    "NSM" => 13,   # Non-Spacing Mark
    "BN"  => 14,   # Boundary Neutral
    "B"   => 8,    # Paragraph Separator
    "S"   => 9,    # Segment Separator
    "WS"  => 10,   # Whitespace
    "ON"  => 11,   # Other Neutrals
    "LRE" => 15,   # Left-to-Right Embedding
    "LRO" => 15,   # Left-to-Right Override
    "RLE" => 15,   # Right-to-Left Embedding
    "RLO" => 15,   # Right-to-Left Override
    "PDF" => 15,   # Pop Directional Format
    "LRI" => 15,   # Left-to-Right Isolate
    "RLI" => 15,   # Right-to-Left Isolate
    "FSI" => 15,   # First Strong Isolate
    "PDI" => 15    # Pop Directional Isolate
);
# C2_* type values, keyed by bidirectional class abbreviation.
# Several classes collapse onto one value: "AL" shares C2_RIGHTTOLEFT with
# "R", and "NSM" plus all embedding/override/isolate controls map to
# C2_OTHERNEUTRAL.
my %c2_types =
(
    "L"   => 1,    # C2_LEFTTORIGHT
    "R"   => 2,    # C2_RIGHTTOLEFT
    "AL"  => 2,    # C2_RIGHTTOLEFT
    "EN"  => 3,    # C2_EUROPENUMBER
    "ES"  => 4,    # C2_EUROPESEPARATOR
    "ET"  => 5,    # C2_EUROPETERMINATOR
    "AN"  => 6,    # C2_ARABICNUMBER
    "CS"  => 7,    # C2_COMMONSEPARATOR
    "NSM" => 11,   # C2_OTHERNEUTRAL
    "BN"  => 0,    # C2_NOTAPPLICABLE
    "B"   => 8,    # C2_BLOCKSEPARATOR
    "S"   => 9,    # C2_SEGMENTSEPARATOR
    "WS"  => 10,   # C2_WHITESPACE
    "ON"  => 11,   # C2_OTHERNEUTRAL
    "LRE" => 11,   # C2_OTHERNEUTRAL
    "LRO" => 11,   # C2_OTHERNEUTRAL
    "RLE" => 11,   # C2_OTHERNEUTRAL
    "RLO" => 11,   # C2_OTHERNEUTRAL
    "PDF" => 11,   # C2_OTHERNEUTRAL
    "LRI" => 11,   # C2_OTHERNEUTRAL
    "RLI" => 11,   # C2_OTHERNEUTRAL
    "FSI" => 11,   # C2_OTHERNEUTRAL
    "PDI" => 11    # C2_OTHERNEUTRAL
);
# Numeric bidi type codes, keyed by bidirectional class abbreviation.
# Unlike %directions, every class gets its own distinct code here (0..22).
my %bidi_types =
(
    "ON"  => 0,    # Other Neutrals
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AN"  => 3,    # Arabic Number
    "EN"  => 4,    # European Number
    "AL"  => 5,    # Right-to-Left Arabic
    "NSM" => 6,    # Non-Spacing Mark
    "CS"  => 7,    # Common Number Separator
    "ES"  => 8,    # European Number Separator
    "ET"  => 9,    # European Number Terminator
    "BN"  => 10,   # Boundary Neutral
    "S"   => 11,   # Segment Separator
    "WS"  => 12,   # Whitespace
    "B"   => 13,   # Paragraph Separator
    "RLO" => 14,   # Right-to-Left Override
    "RLE" => 15,   # Right-to-Left Embedding
    "LRO" => 16,   # Left-to-Right Override
    "LRE" => 17,   # Left-to-Right Embedding
    "PDF" => 18,   # Pop Directional Format
    "LRI" => 19,   # Left-to-Right Isolate
    "RLI" => 20,   # Right-to-Left Isolate
    "FSI" => 21,   # First Strong Isolate
    "PDI" => 22    # Pop Directional Isolate
);
# Numeric joining type codes, keyed by joining type (or Syriac joining
# group) name.  "D" and "C" deliberately share the code 3.
my %joining_types =
(
    "U"           => 0,   # Non_Joining
    "L"           => 1,   # Left_Joining
    "R"           => 2,   # Right_Joining
    "D"           => 3,   # Dual_Joining
    "C"           => 3,   # Join_Causing
    "ALAPH"       => 4,   # Syriac ALAPH
    "DALATH RISH" => 5,   # Syriac DALATH RISH group
    "T"           => 6,   # Transparent
);
461 my @locales =
463 { name => "", lcid => 0x0000007f, file => "root", territory => "IV", sabbrevlangname => "IVL", sopentypelang =>"dflt" },
464 { name => "aa", sopentypelang => "AFR" },
465 { name => "aa-DJ" },
466 { name => "aa-ER" },
467 { name => "aa-ET" },
468 { name => "af", lcid => 0x00000036, oemcp => 850, sabbrevlangname => "AFK", sopentypelang => "AFK" },
469 { name => "af-NA" },
470 { name => "af-ZA", lcid => 0x00000436 },
471 { name => "agq" },
472 { name => "agq-CM" },
473 { name => "ak", sopentypelang => "TWI" },
474 { name => "ak-GH" },
475 { name => "am", lcid => 0x0000005e, sabbrevlangname => "AMH" },
476 { name => "am-ET", lcid => 0x0000045e },
477 { name => "ar", lcid => 0x00000001, territory => "SA", oemcp => 720, group => 13 },
478 { name => "ar-001" },
479 { name => "ar-AE", lcid => 0x00003801, sabbrevlangname => "ARU" },
480 { name => "ar-BH", lcid => 0x00003c01, sabbrevlangname => "ARH" },
481 { name => "ar-DJ" },
482 { name => "ar-DZ", lcid => 0x00001401, sabbrevlangname => "ARG", nativedigits => "0123456789" },
483 { name => "ar-EG", lcid => 0x00000c01, sabbrevlangname => "ARE" },
484 { name => "ar-EH" },
485 { name => "ar-ER" },
486 { name => "ar-IL" },
487 { name => "ar-IQ", lcid => 0x00000801, sabbrevlangname => "ARI" },
488 { name => "ar-JO", lcid => 0x00002c01, sabbrevlangname => "ARJ" },
489 { name => "ar-KM" },
490 { name => "ar-KW", lcid => 0x00003401, sabbrevlangname => "ARK" },
491 { name => "ar-LB", lcid => 0x00003001, sabbrevlangname => "ARB" },
492 { name => "ar-LY", lcid => 0x00001001, sabbrevlangname => "ARL", nativedigits => "0123456789" },
493 { name => "ar-MA", lcid => 0x00001801, sabbrevlangname => "ARM", nativedigits => "0123456789" },
494 { name => "ar-MR" },
495 { name => "ar-OM", lcid => 0x00002001, sabbrevlangname => "ARO" },
496 { name => "ar-PS" },
497 { name => "ar-QA", lcid => 0x00004001, sabbrevlangname => "ARQ" },
498 { name => "ar-SA", lcid => 0x00000401, sabbrevlangname => "ARA" },
499 { name => "ar-SD" },
500 { name => "ar-SO" },
501 { name => "ar-SS" },
502 { name => "ar-SY", lcid => 0x00002801, sabbrevlangname => "ARS" },
503 { name => "ar-TD" },
504 { name => "ar-TN", lcid => 0x00001c01, sabbrevlangname => "ART", nativedigits => "0123456789" },
505 { name => "ar-YE", lcid => 0x00002401, sabbrevlangname => "ARY" },
506 { name => "arn", lcid => 0x0000007a, oemcp => 850, ebcdiccp => 20284, slist => ",", sabbrevlangname => "MPD", sopentypelang => "MAP" },
507 { name => "arn-CL", lcid => 0x0000047a },
508 { name => "arn-Latn", alias => "arn" },
509 { name => "arn-Latn-CL", alias => "arn-CL" },
510 { name => "as", lcid => 0x0000004d, slist => ",", group => 15 },
511 { name => "as-IN", lcid => 0x0000044d },
512 { name => "asa" },
513 { name => "asa-TZ" },
514 { name => "ast" },
515 { name => "ast-ES" },
516 { name => "az", lcid => 0x0000002c, oemcp => 857, ebcdiccp => 20905, group => 2 },
517 { name => "az-Cyrl", lcid => 0x0000742c, oemcp => 866, ebcdiccp => 20880, group => 5, sabbrevlangname => "AZC" },
518 { name => "az-Cyrl-AZ", lcid => 0x0000082c },
519 { name => "az-Latn", lcid => 0x0000782c },
520 { name => "az-Latn-AZ", lcid => 0x0000042c },
521 { name => "ba", lcid => 0x0000006d, oemcp => 866, group => 5, sabbrevlangname => "BAS", sopentypelang => "BSH" },
522 { name => "ba-Cyrl", alias => "ba" },
523 { name => "ba-Cyrl-RU", alias => "ba-RU" },
524 { name => "ba-RU", lcid => 0x0000046d },
525 { name => "bas" },
526 { name => "bas-CM" },
527 { name => "be", lcid => 0x00000023, oemcp => 866, ebcdiccp => 500, group => 5 },
528 { name => "be-BY", lcid => 0x00000423 },
529 { name => "bem" },
530 { name => "bem-ZM" },
531 { name => "bez" },
532 { name => "bez-TZ" },
533 { name => "bg", lcid => 0x00000002, oemcp => 866, ebcdiccp => 21025, group => 5, sabbrevlangname => "BGR", sopentypelang => "BGR" },
534 { name => "bg-BG", lcid => 0x00000402 },
535 { name => "bin", lcid => 0x00000066, oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "EDO" },
536 { name => "bin-NG", lcid => 0x00000466, file => "bin", dir => "exemplars" },
537 { name => "bm", sopentypelang => "BMB" },
538 { name => "bm-Latn", file => "bm" },
539 { name => "bm-Latn-ML", file => "bm_ML" },
540 { name => "bm-ML", alias => "bm-Latn-ML" },
541 { name => "bn", lcid => 0x00000045, slist => ",", group => 15, sabbrevlangname => "BNB" },
542 { name => "bn-BD", lcid => 0x00000845 },
543 { name => "bn-IN", lcid => 0x00000445, sabbrevlangname => "BNG" },
544 { name => "bo", lcid => 0x00000051, slist => ",", group => 15, sabbrevlangname => "BOB", sopentypelang => "TIB" },
545 { name => "bo-CN", lcid => 0x00000451 },
546 { name => "bo-IN", slist => "," },
547 { name => "bo-Tibt", alias => "bo" },
548 { name => "bo-Tibt-CN", alias => "bo-CN" },
549 { name => "bo-Tibt-IN", alias => "bo-IN" },
550 { name => "br", lcid => 0x0000007e, oemcp => 850, ebcdiccp => 20297 },
551 { name => "br-FR", lcid => 0x0000047e },
552 { name => "br-Latn", alias => "br" },
553 { name => "br-Latn-FR", alias => "br-FR" },
554 { name => "brx" },
555 { name => "brx-IN" },
556 { name => "bs", lcid => 0x0000781a, oemcp => 852, maccp => 10082, ebcdiccp => 870, group => 2, sabbrevlangname => "BSB" },
557 { name => "bs-Cyrl", lcid => 0x0000641a, oemcp => 855, group => 5, sabbrevlangname => "BSC" },
558 { name => "bs-Cyrl-BA", lcid => 0x0000201a },
559 { name => "bs-Latn", lcid => 0x0000681a },
560 { name => "bs-Latn-BA", lcid => 0x0000141a },
561 { name => "byn", sopentypelang => "BIL" },
562 { name => "byn-ER" },
563 { name => "ca", lcid => 0x00000003, oemcp => 850 },
564 { name => "ca-AD", maccp => 65001 },
565 { name => "ca-ES", lcid => 0x00000403 },
566 { name => "ca-ES-valencia", lcid => 0x00000803, file => "ca_ES_VALENCIA", sabbrevlangname => "VAL" },
567 { name => "ca-FR", maccp => 65001 },
568 { name => "ca-IT", maccp => 65001 },
569 { name => "ccp" },
570 { name => "ccp-BD", alias => "ccp-Cakm-BD" },
571 { name => "ccp-Cakm", file => "ccp" },
572 { name => "ccp-Cakm-BD", file => "ccp_BD" },
573 { name => "ccp-Cakm-IN", file => "ccp_IN" },
574 { name => "ccp-IN", alias => "ccp-Cakm-IN" },
575 { name => "ce" },
576 { name => "ce-RU" },
577 { name => "ceb" },
578 { name => "ceb-Latn", file => "ceb" },
579 { name => "ceb-Latn-PH", file => "ceb_PH" },
580 { name => "ceb-PH", alias => "ceb-Latn-PH" },
581 { name => "cgg" },
582 { name => "cgg-UG" },
583 { name => "chr", lcid => 0x0000005c, slist => ",", sabbrevlangname => "CRE" },
584 { name => "chr-Cher", lcid => 0x00007c5c, file => "chr" },
585 { name => "chr-Cher-US", lcid => 0x0000045c, file => "chr_US" },
586 { name => "chr-US", alias => "chr-Cher-US" },
587 { name => "ckb", alias => "ku" },
588 { name => "ckb-IQ", alias => "ku-Arab-IQ" },
589 { name => "ckb-IR", alias => "ku-Arab-IR" },
590 { name => "co", lcid => 0x00000083, oemcp => 850, ebcdiccp => 20297 },
591 { name => "co-FR", lcid => 0x00000483 },
592 { name => "co-Latn", alias => "co" },
593 { name => "co-Latn-FR", alias => "co-FR" },
594 { name => "cs", lcid => 0x00000005, oemcp => 852, group => 2, sabbrevlangname => "CSY", sopentypelang => "CSY" },
595 { name => "cs-CZ", lcid => 0x00000405 },
596 { name => "cu", sopentypelang => "CSL" },
597 { name => "cu-RU" },
598 { name => "cy", lcid => 0x00000052, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "CYM", sopentypelang => "WEL" },
599 { name => "cy-GB", lcid => 0x00000452 },
600 { name => "da", lcid => 0x00000006, oemcp => 850, ebcdiccp => 20277 },
601 { name => "da-DK", lcid => 0x00000406 },
602 { name => "da-GL", maccp => 65001 },
603 { name => "dav" },
604 { name => "dav-KE" },
605 { name => "de", lcid => 0x00000007, oemcp => 850, ebcdiccp => 20273 },
606 { name => "de-AT", lcid => 0x00000c07, sabbrevlangname => "DEA" },
607 { name => "de-BE" },
608 { name => "de-CH", lcid => 0x00000807, sabbrevlangname => "DES" },
609 { name => "de-DE", lcid => 0x00000407 },
610 { name => "de-DE_phoneb", lcid => 0x00010407, alias => "de-DE" },
611 { name => "de-DE-u-co-phonebk", alias => "de-DE_phoneb" },
612 { name => "de-IT", oemcp => 65001 },
613 { name => "de-LI", lcid => 0x00001407, sabbrevlangname => "DEC" },
614 { name => "de-LU", lcid => 0x00001007, sabbrevlangname => "DEL" },
615 { name => "dje", sopentypelang => "DJR" },
616 { name => "dje-NE" },
617 { name => "doi", sopentypelang => "DGR" },
618 { name => "doi-IN", alias => "doi-Deva-IN" },
619 { name => "doi-Deva", file => "doi" },
620 { name => "doi-Deva-IN", file => "doi_IN" },
621 { name => "dsb", lcid => 0x00007c2e, sparent => "hsb", oemcp => 850, ebcdiccp => 870, sabbrevlangname => "DSB", sopentypelang => "LSB" },
622 { name => "dsb-DE", lcid => 0x0000082e },
623 { name => "dua" },
624 { name => "dua-CM" },
625 { name => "dv", lcid => 0x00000065, slist => "\x{060c}", group => 13, nativedigits => "0123456789" },
626 { name => "dv-MV", lcid => 0x00000465 },
627 { name => "dyo" },
628 { name => "dyo-SN" },
629 { name => "dz", sopentypelang => "DZN" },
630 { name => "dz-BT", lcid => 0x00000c51, sabbrevlangname => "ZZZ" },
631 { name => "ebu" },
632 { name => "ebu-KE" },
633 { name => "ee" },
634 { name => "ee-GH" },
635 { name => "ee-TG" },
636 { name => "el", lcid => 0x00000008, oemcp => 737, group => 4 },
637 { name => "el-CY" },
638 { name => "el-GR", lcid => 0x00000408 },
639 { name => "en", lcid => 0x00000009, oemcp => 437, slist => ",", sabbrevlangname => "ENU" },
640 { name => "en-001", oemcp => 850 },
641 { name => "en-029", lcid => 0x00002409, file => "en", oemcp => 850, sabbrevlangname => "ENB" },
642 { name => "en-150", oemcp => 65001 },
643 { name => "en-AE", lcid => 0x00004c09, oemcp => 65001, sabbrevlangname => "ZZZ" },
644 { name => "en-AG", oemcp => 850 },
645 { name => "en-AI", oemcp => 850 },
646 { name => "en-AS", oemcp => 850 },
647 { name => "en-AT", oemcp => 65001 },
648 { name => "en-AU", lcid => 0x00000c09, oemcp => 850, sabbrevlangname => "ENA" },
649 { name => "en-BB", oemcp => 850 },
650 { name => "en-BE", oemcp => 850 },
651 { name => "en-BI", oemcp => 65001 },
652 { name => "en-BM", oemcp => 850 },
653 { name => "en-BS", oemcp => 850 },
654 { name => "en-BW", oemcp => 850 },
655 { name => "en-BZ", lcid => 0x00002809, oemcp => 850, sabbrevlangname => "ENL" },
656 { name => "en-CA", lcid => 0x00001009, oemcp => 850, ebcdiccp => 37, sabbrevlangname => "ENC" },
657 { name => "en-CC", oemcp => 850 },
658 { name => "en-CH", oemcp => 65001 },
659 { name => "en-CK", oemcp => 850 },
660 { name => "en-CM", oemcp => 850 },
661 { name => "en-CX", oemcp => 850 },
662 { name => "en-CY", oemcp => 65001 },
663 { name => "en-DE", oemcp => 65001 },
664 { name => "en-DG", oemcp => 850 },
665 { name => "en-DK", oemcp => 65001 },
666 { name => "en-DM", oemcp => 850 },
667 { name => "en-ER", oemcp => 850 },
668 { name => "en-FI", oemcp => 65001 },
669 { name => "en-FJ", oemcp => 850 },
670 { name => "en-FK", oemcp => 850 },
671 { name => "en-FM", oemcp => 850 },
672 { name => "en-GB", lcid => 0x00000809, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "ENG" },
673 { name => "en-GD", oemcp => 850 },
674 { name => "en-GG", oemcp => 850 },
675 { name => "en-GH", oemcp => 850 },
676 { name => "en-GI", oemcp => 850 },
677 { name => "en-GM", oemcp => 850 },
678 { name => "en-GU", oemcp => 850 },
679 { name => "en-GY", oemcp => 850 },
680 { name => "en-HK", lcid => 0x00003c09, oemcp => 850, sabbrevlangname => "ENH" },
681 { name => "en-ID", lcid => 0x00003809, file => "en", oemcp => 850, sabbrevlangname => "ZZZ" },
682 { name => "en-IE", lcid => 0x00001809, oemcp => 850, sabbrevlangname => "ENI" },
683 { name => "en-IL", oemcp => 65001 },
684 { name => "en-IM", oemcp => 850 },
685 { name => "en-IN", lcid => 0x00004009, sabbrevlangname => "ENN" },
686 { name => "en-IO", oemcp => 850 },
687 { name => "en-JE", oemcp => 850 },
688 { name => "en-JM", lcid => 0x00002009, oemcp => 850, sabbrevlangname => "ENJ" },
689 { name => "en-KE", oemcp => 850 },
690 { name => "en-KI", oemcp => 850 },
691 { name => "en-KN", oemcp => 850 },
692 { name => "en-KY", oemcp => 850 },
693 { name => "en-LC", oemcp => 850 },
694 { name => "en-LR", oemcp => 850 },
695 { name => "en-LS", oemcp => 850 },
696 { name => "en-MG", oemcp => 850 },
697 { name => "en-MH", oemcp => 850 },
698 { name => "en-MO", oemcp => 850 },
699 { name => "en-MP", oemcp => 850 },
700 { name => "en-MS", oemcp => 850 },
701 { name => "en-MT", oemcp => 850 },
702 { name => "en-MU", oemcp => 850 },
703 { name => "en-MW", oemcp => 850 },
704 { name => "en-MY", lcid => 0x00004409, sabbrevlangname => "ENM" },
705 { name => "en-NA", oemcp => 850 },
706 { name => "en-NF", oemcp => 850 },
707 { name => "en-NG", oemcp => 850 },
708 { name => "en-NL", oemcp => 65001 },
709 { name => "en-NR", oemcp => 850 },
710 { name => "en-NU", oemcp => 850 },
711 { name => "en-NZ", lcid => 0x00001409, oemcp => 850, sabbrevlangname => "ENZ" },
712 { name => "en-PG", oemcp => 850 },
713 { name => "en-PH", lcid => 0x00003409, ebcdiccp => 500, sabbrevlangname => "ENP" },
714 { name => "en-PK", oemcp => 850 },
715 { name => "en-PN", oemcp => 850 },
716 { name => "en-PR", oemcp => 850 },
717 { name => "en-PW", oemcp => 850 },
718 { name => "en-RW", oemcp => 850 },
719 { name => "en-SB", oemcp => 850 },
720 { name => "en-SC", oemcp => 850 },
721 { name => "en-SD", oemcp => 850 },
722 { name => "en-SE", oemcp => 65001 },
723 { name => "en-SG", lcid => 0x00004809, sabbrevlangname => "ENE" },
724 { name => "en-SH", oemcp => 850 },
725 { name => "en-SI", oemcp => 65001 },
726 { name => "en-SL", oemcp => 850 },
727 { name => "en-SS", oemcp => 850 },
728 { name => "en-SX", oemcp => 850 },
729 { name => "en-SZ", oemcp => 850 },
730 { name => "en-TC", oemcp => 850 },
731 { name => "en-TK", oemcp => 850 },
732 { name => "en-TO", oemcp => 850 },
733 { name => "en-TT", lcid => 0x00002c09, oemcp => 850, sabbrevlangname => "ENT" },
734 { name => "en-TV", oemcp => 850 },
735 { name => "en-TZ", oemcp => 850 },
736 { name => "en-UG", oemcp => 850 },
737 { name => "en-UM", oemcp => 850 },
738 { name => "en-US", lcid => 0x00000409 },
739 { name => "en-VC", oemcp => 850 },
740 { name => "en-VG", oemcp => 850 },
741 { name => "en-VI", oemcp => 850 },
742 { name => "en-VU", oemcp => 850 },
743 { name => "en-WS", oemcp => 850 },
744 { name => "en-ZA", lcid => 0x00001c09, ebcdiccp => 500, sabbrevlangname => "ENS" },
745 { name => "en-ZM", oemcp => 850 },
746 { name => "en-ZW", lcid => 0x00003009, ebcdiccp => 500, sabbrevlangname => "ENW" },
747 { name => "eo", sopentypelang => "NTO" },
748 { name => "eo-001" },
749 { name => "es", lcid => 0x0000000a, oemcp => 850, ebcdiccp => 20284, sabbrevlangname => "ESP", sopentypelang => "ESP" },
750 { name => "es-419", lcid => 0x0000580a, sabbrevlangname => "ESJ" },
751 { name => "es-AR", lcid => 0x00002c0a, sabbrevlangname => "ESS" },
752 { name => "es-BO", lcid => 0x0000400a, sabbrevlangname => "ESB" },
753 { name => "es-BR", oemcp => 65001 },
754 { name => "es-BZ", oemcp => 65001 },
755 { name => "es-CL", lcid => 0x0000340a, sabbrevlangname => "ESL" },
756 { name => "es-CO", lcid => 0x0000240a, sabbrevlangname => "ESO" },
757 { name => "es-CR", lcid => 0x0000140a, sabbrevlangname => "ESC" },
758 { name => "es-CU", lcid => 0x00005c0a, sabbrevlangname => "ESK" },
759 { name => "es-DO", lcid => 0x00001c0a, sabbrevlangname => "ESD" },
760 { name => "es-EA" },
761 { name => "es-EC", lcid => 0x0000300a, sabbrevlangname => "ESF" },
762 { name => "es-ES", lcid => 0x00000c0a, sabbrevlangname => "ESN" },
763 { name => "es-ES_tradnl", lcid => 0x0000040a, file => "es_ES" },
764 { name => "es-ES-u-co-trad", alias => "es-ES_tradnl" },
765 { name => "es-GQ" },
766 { name => "es-GT", lcid => 0x0000100a, sabbrevlangname => "ESG" },
767 { name => "es-HN", lcid => 0x0000480a, sabbrevlangname => "ESH" },
768 { name => "es-IC" },
769 { name => "es-MX", lcid => 0x0000080a, sabbrevlangname => "ESM" },
770 { name => "es-NI", lcid => 0x00004c0a, sabbrevlangname => "ESI" },
771 { name => "es-PA", lcid => 0x0000180a, sabbrevlangname => "ESA" },
772 { name => "es-PE", lcid => 0x0000280a, sabbrevlangname => "ESR" },
773 { name => "es-PH" },
774 { name => "es-PR", lcid => 0x0000500a, sabbrevlangname => "ESU" },
775 { name => "es-PY", lcid => 0x00003c0a, sabbrevlangname => "ESZ" },
776 { name => "es-SV", lcid => 0x0000440a, sabbrevlangname => "ESE" },
777 { name => "es-US", lcid => 0x0000540a, sabbrevlangname => "EST" },
778 { name => "es-UY", lcid => 0x0000380a, sabbrevlangname => "ESY" },
779 { name => "es-VE", lcid => 0x0000200a, sabbrevlangname => "ESV" },
780 { name => "et", lcid => 0x00000025, oemcp => 775, group => 3, sabbrevlangname => "ETI", sopentypelang => "ETI" },
781 { name => "et-EE", lcid => 0x00000425 },
782 { name => "eu", lcid => 0x0000002d, oemcp => 850, maccp => 65001, sabbrevlangname => "EUQ", sopentypelang => "EUQ" },
783 { name => "eu-ES", lcid => 0x0000042d },
784 { name => "ewo" },
785 { name => "ewo-CM" },
786 { name => "fa", lcid => 0x00000029, inegnumber => 3, oemcp => 720, slist => "\x{061b}", group => 13, sabbrevlangname => "FAR", sopentypelang => "FAR" },
787 { name => "fa-AF", alias => "prs-AF" },
788 { name => "fa-IR", lcid => 0x00000429 },
789 { name => "ff", lcid => 0x00000067, oemcp => 850, ebcdiccp => 20297 },
790 { name => "ff-CM", alias => "ff-Latn-CM" },
791 { name => "ff-GN", alias => "ff-Latn-GN" },
792 { name => "ff-MR", alias => "ff-Latn-MR" },
793 { name => "ff-NG", alias => "ff-Latn-NG" },
794 { name => "ff-SN", alias => "ff-Latn-SN" },
795 { name => "ff-Adlm", oemcp => 65001 },
796 { name => "ff-Adlm-BF" },
797 { name => "ff-Adlm-CM" },
798 { name => "ff-Adlm-GH" },
799 { name => "ff-Adlm-GM" },
800 { name => "ff-Adlm-GN" },
801 { name => "ff-Adlm-GW" },
802 { name => "ff-Adlm-LR" },
803 { name => "ff-Adlm-MR" },
804 { name => "ff-Adlm-NE" },
805 { name => "ff-Adlm-NG" },
806 { name => "ff-Adlm-SL" },
807 { name => "ff-Adlm-SN" },
808 { name => "ff-Latn", lcid => 0x00007c67 },
809 { name => "ff-Latn-BF", oemcp => 65001 },
810 { name => "ff-Latn-CM" },
811 { name => "ff-Latn-GH", oemcp => 65001 },
812 { name => "ff-Latn-GM", oemcp => 65001 },
813 { name => "ff-Latn-GN" },
814 { name => "ff-Latn-GW", oemcp => 65001 },
815 { name => "ff-Latn-LR", oemcp => 65001 },
816 { name => "ff-Latn-MR" },
817 { name => "ff-Latn-NE", oemcp => 65001 },
818 { name => "ff-Latn-NG", lcid => 0x00000467, sabbrevlangname => "ZZZ" },
819 { name => "ff-Latn-SL", oemcp => 65001 },
820 { name => "ff-Latn-SN", lcid => 0x00000867 },
821 { name => "fi", lcid => 0x0000000b, oemcp => 850, ebcdiccp => 20278 },
822 { name => "fi-FI", lcid => 0x0000040b },
823 { name => "fil", lcid => 0x00000064, oemcp => 437, ebcdiccp => 500, sabbrevlangname => "FPO", sopentypelang => "PIL" },
824 { name => "fil-PH", lcid => 0x00000464 },
825 { name => "fil-Latn", alias => "fil" },
826 { name => "fil-Latn-PH", alias => "fil-PH" },
827 { name => "fo", lcid => 0x00000038, oemcp => 850, maccp => 10079, ebcdiccp => 20277, sabbrevlangname => "FOS", sopentypelang => "FOS" },
828 { name => "fo-DK", oemcp => 65001, maccp => 65001 },
829 { name => "fo-FO", lcid => 0x00000438 },
830 { name => "fr", lcid => 0x0000000c, oemcp => 850, ebcdiccp => 20297 },
831 { name => "fr-029", lcid => 0x00001c0c, file => "fr", sabbrevlangname => "ZZZ" },
832 { name => "fr-BE", lcid => 0x0000080c, sabbrevlangname => "FRB" },
833 { name => "fr-BF" },
834 { name => "fr-BI" },
835 { name => "fr-BJ" },
836 { name => "fr-BL" },
837 { name => "fr-CA", lcid => 0x00000c0c, sabbrevlangname => "FRC" },
838 { name => "fr-CD", lcid => 0x0000240c, sabbrevlangname => "FRD" },
839 { name => "fr-CF" },
840 { name => "fr-CG" },
841 { name => "fr-CH", lcid => 0x0000100c, sabbrevlangname => "FRS" },
842 { name => "fr-CI", lcid => 0x0000300c, sabbrevlangname => "FRI" },
843 { name => "fr-CM", lcid => 0x00002c0c, sabbrevlangname => "FRE" },
844 { name => "fr-DJ" },
845 { name => "fr-DZ" },
846 { name => "fr-FR", lcid => 0x0000040c },
847 { name => "fr-GA" },
848 { name => "fr-GF" },
849 { name => "fr-GN" },
850 { name => "fr-GP" },
851 { name => "fr-GQ" },
852 { name => "fr-HT", lcid => 0x00003c0c, sabbrevlangname => "FRH" },
853 { name => "fr-KM" },
854 { name => "fr-LU", lcid => 0x0000140c, sabbrevlangname => "FRL" },
855 { name => "fr-MA", lcid => 0x0000380c, sabbrevlangname => "FRO" },
856 { name => "fr-MC", lcid => 0x0000180c, sabbrevlangname => "FRM" },
857 { name => "fr-MF" },
858 { name => "fr-MG" },
859 { name => "fr-ML", lcid => 0x0000340c, sabbrevlangname => "FRF" },
860 { name => "fr-MQ" },
861 { name => "fr-MR" },
862 { name => "fr-MU" },
863 { name => "fr-NC" },
864 { name => "fr-NE" },
865 { name => "fr-PF" },
866 { name => "fr-PM" },
867 { name => "fr-RE", lcid => 0x0000200c, sabbrevlangname => "FRR" },
868 { name => "fr-RW" },
869 { name => "fr-SC" },
870 { name => "fr-SN", lcid => 0x0000280c, sabbrevlangname => "FRN" },
871 { name => "fr-SY" },
872 { name => "fr-TD" },
873 { name => "fr-TG" },
874 { name => "fr-TN" },
875 { name => "fr-VU" },
876 { name => "fr-WF" },
877 { name => "fr-YT" },
878 { name => "fur", sopentypelang => "FRL" },
879 { name => "fur-IT" },
880 { name => "fuv-NG", alias => "ff-Latn-NG" },
881 { name => "fy", lcid => 0x00000062, oemcp => 850, sabbrevlangname => "FYN", sopentypelang => "FRI" },
882 { name => "fy-NL", lcid => 0x00000462 },
883 { name => "ga", lcid => 0x0000003c, oemcp => 850, sabbrevlangname => "IRE", sopentypelang => "IRI" },
884 { name => "ga-GB", oemcp => 65001 },
885 { name => "ga-IE", lcid => 0x0000083c },
886 { name => "gd", lcid => 0x00000091, oemcp => 850, ebcdiccp => 20285, sopentypelang => "GAE" },
887 { name => "gd-GB", lcid => 0x00000491 },
888 { name => "gd-Latn", alias => "gd" },
889 { name => "gl", lcid => 0x00000056, oemcp => 850, sabbrevlangname => "GLC", sopentypelang => "GAL" },
890 { name => "gl-ES", lcid => 0x00000456 },
891 { name => "gn", lcid => 0x00000074, oemcp => 850, ebcdiccp => 20284, slist => ",", sopentypelang => "GUA" },
892 { name => "gn-PY", lcid => 0x00000474 },
893 { name => "gsw", lcid => 0x00000084, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "ZZZ", sopentypelang => "ALS" },
894 { name => "gsw-CH" },
895 { name => "gsw-FR", lcid => 0x00000484, sabbrevlangname => "GSW" },
896 { name => "gsw-LI" },
897 { name => "gu", lcid => 0x00000047, slist => ",", group => 15 },
898 { name => "gu-IN", lcid => 0x00000447 },
899 { name => "guz" },
900 { name => "guz-KE" },
901 { name => "gv", sopentypelang => "MNX" },
902 { name => "gv-GB", file => "gv" },
903 { name => "gv-IM" },
904 { name => "ha", lcid => 0x00000068, oemcp => 437 },
905 { name => "ha-GH", alias => "ha-Latn-GH" },
906 { name => "ha-Latn", lcid => 0x00007c68, file => "ha" },
907 { name => "ha-Latn-GH", file => "ha_GH", ebcdiccp => 500 },
908 { name => "ha-Latn-NE", file => "ha_NE", ebcdiccp => 500 },
909 { name => "ha-Latn-NG", lcid => 0x00000468, file => "ha_NG" },
910 { name => "ha-NE", alias => "ha-Latn-NE" },
911 { name => "ha-NG", alias => "ha-Latn-NG" },
912 { name => "haw", lcid => 0x00000075, oemcp => 437 },
913 { name => "haw-Latn", alias => "haw" },
914 { name => "haw-Latn-US", alias => "haw-US" },
915 { name => "haw-US", lcid => 0x00000475 },
916 { name => "he", lcid => 0x0000000d, oemcp => 862, slist => ",", group => 12, sopentypelang => "IWR" },
917 { name => "he-IL", lcid => 0x0000040d },
918 { name => "hi", lcid => 0x00000039, slist => ",", group => 15 },
919 { name => "hi-IN", lcid => 0x00000439 },
920 { name => "hr", lcid => 0x0000001a, inegnumber => 2, oemcp => 852, maccp => 10082, group => 2 },
921 { name => "hr-BA", lcid => 0x0000101a, ebcdiccp => 870, inegnumber => 1, sabbrevlangname => "HRB" },
922 { name => "hr-HR", lcid => 0x0000041a },
923 { name => "hsb", lcid => 0x0000002e, oemcp => 850, ebcdiccp => 870, sopentypelang => "USB" },
924 { name => "hsb-DE", lcid => 0x0000042e },
925 { name => "hu", lcid => 0x0000000e, oemcp => 852, group => 2 },
926 { name => "hu-HU", lcid => 0x0000040e },
927 { name => "hu-HU_technl", lcid => 0x0001040e, alias => "hu-HU" },
928 { name => "hy", lcid => 0x0000002b, slist => ",", group => 17 },
929 { name => "hy-AM", lcid => 0x0000042b },
930 { name => "ia" },
931 { name => "ia-001" },
932 ## name => "ibb", lcid => 0x00000069 },
933 ## name => "ibb-NG", lcid => 0x00000469 },
934 { name => "id", lcid => 0x00000021, oemcp => 850 },
935 { name => "id-ID", lcid => 0x00000421 },
936 { name => "ig", lcid => 0x00000070, oemcp => 437 },
937 { name => "ig-Latn", alias => "ig" },
938 { name => "ig-Latn-NG", alias => "ig-NG" },
939 { name => "ig-NG", lcid => 0x00000470 },
940 { name => "ii", lcid => 0x00000078, group => 9, sopentypelang => "YIM" },
941 { name => "ii-CN", lcid => 0x00000478 },
942 { name => "ii-Yiii", alias => "ii" },
943 { name => "ii-Yiii-CN", alias => "ii-CN" },
944 { name => "is", lcid => 0x0000000f, oemcp => 850, maccp => 10079, ebcdiccp => 20871 },
945 { name => "is-IS", lcid => 0x0000040f },
946 { name => "it", lcid => 0x00000010, oemcp => 850, ebcdiccp => 20280 },
947 { name => "it-CH", lcid => 0x00000810, ebcdiccp => 500, sabbrevlangname => "ITS" },
948 { name => "it-IT", lcid => 0x00000410 },
949 { name => "it-SM" },
950 { name => "it-VA", oemcp => 65001 },
951 { name => "iu", lcid => 0x0000005d, oemcp => 437, slist => ",", sortlocale => "iu-Latn-CA", sabbrevlangname => "IUK", sopentypelang => "INU" },
952 { name => "iu-Cans", lcid => 0x0000785d, file => "iu", oemcp => 65001, sabbrevlangname => "IUS" },
953 { name => "iu-Cans-CA", lcid => 0x0000045d, file => "iu_CA" },
954 { name => "iu-Latn", lcid => 0x00007c5d },
955 { name => "iu-Latn-CA", lcid => 0x0000085d },
956 { name => "ja", lcid => 0x00000011, ireadinglayout => 2, oemcp => 932, slist => ",", sscripts => "Hani Hira Jpan Kana", group => 7, sopentypelang => "JAN" },
957 { name => "ja-JP", lcid => 0x00000411 },
958 { name => "ja-JP_radstr", lcid => 0x00040411, alias => "ja-JP" },
959 { name => "ja-JP-u-co-unihan", alias => "ja-JP_radstr" },
960 { name => "jgo" },
961 { name => "jgo-CM" },
962 { name => "jmc" },
963 { name => "jmc-TZ" },
964 { name => "jv", oemcp => 850, nativedigits => "0123456789" },
965 { name => "jv-ID", alias => "jv-Latn-ID" },
966 ## name => "jv-Java" },
967 ## name => "jv-Java-ID" },
968 { name => "jv-Latn", file => "jv" },
969 { name => "jv-Latn-ID", file => "jv_ID" },
970 { name => "ka", lcid => 0x00000037, group => 16 },
971 { name => "ka-GE", lcid => 0x00000437 },
972 { name => "ka-GE_modern", lcid => 0x00010437, alias => "ka-GE" },
973 { name => "kab", sopentypelang => "KAB0" },
974 { name => "kab-DZ" },
975 { name => "kam", sopentypelang => "KMB" },
976 { name => "kam-KE" },
977 { name => "kde" },
978 { name => "kde-TZ" },
979 { name => "kea" },
980 { name => "kea-CV" },
981 { name => "kgp" },
982 { name => "kgp-BR" },
983 { name => "khq" },
984 { name => "khq-ML" },
985 { name => "ki" },
986 { name => "ki-KE" },
987 { name => "kk", lcid => 0x0000003f, group => 5, sabbrevlangname => "KKZ" },
988 { name => "kk-Cyrl", alias => "kk" },
989 { name => "kk-Cyrl-KZ", alias => "kk-KZ" },
990 { name => "kk-KZ", lcid => 0x0000043f },
991 { name => "kkj" },
992 { name => "kkj-CM" },
993 { name => "kl", lcid => 0x0000006f, oemcp => 850, ebcdiccp => 20277, sopentypelang => "GRN" },
994 { name => "kl-GL", lcid => 0x0000046f },
995 { name => "kln", sopentypelang => "KAL" },
996 { name => "kln-KE" },
997 { name => "km", lcid => 0x00000053, inegnumber => 2, slist => ",", group => 15 },
998 { name => "km-KH", lcid => 0x00000453 },
999 { name => "kn", lcid => 0x0000004b, slist => ",", group => 15, sabbrevlangname => "KDI" },
1000 { name => "kn-IN", lcid => 0x0000044b },
1001 { name => "ko", lcid => 0x00000012, ireadinglayout => 2, slist => ",", oemcp => 949, ebcdiccp => 20833, sscripts => "Hang Hani Kore", group => 8 },
1002 { name => "ko-KP", oemcp => 65001 },
1003 { name => "ko-KR", lcid => 0x00000412 },
1004 { name => "kok", lcid => 0x00000057, slist => ",", group => 15, sabbrevlangname => "KNK" },
1005 { name => "kok-IN", lcid => 0x00000457 },
1006 { name => "kr", lcid => 0x00000071, sortlocale => "kr-Latn-NG", oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "KNR" },
1007 { name => "kr-Latn", file => "kr", dir => "exemplars" },
1008 { name => "kr-Latn-NG", lcid => 0x00000471, file => "kr", dir => "exemplars" },
1009 { name => "kr-NG", alias => "kr-Latn-NG" },
1010 { name => "ks", lcid => 0x00000060, group => 15, sabbrevlangname => "ZZZ", sopentypelang => "KSH" },
1011 { name => "ks-Arab", lcid => 0x00000460 },
1012 { name => "ks-Arab-IN" },
1013 { name => "ks-Deva", slist => "," },
1014 { name => "ks-Deva-IN", lcid => 0x00000860 },
1015 { name => "ks-IN", alias => "ks-Arab-IN" },
1016 { name => "ksb" },
1017 { name => "ksb-TZ" },
1018 { name => "ksf" },
1019 { name => "ksf-CM" },
1020 { name => "ksh", sopentypelang => "KSH0" },
1021 { name => "ksh-DE" },
1022 { name => "ku", lcid => 0x00000092, file => "ckb", slist => "\x{061b}", sortlocale => "ku-Arab-IQ", oemcp => 720 },
1023 { name => "ku-Arab", lcid => 0x00007c92, file => "ckb", group => 13 },
1024 { name => "ku-Arab-IQ", lcid => 0x00000492, file => "ckb_IQ" },
1025 { name => "ku-Arab-IR", file => "ckb_IR", oemcp => 65001 },
1026 { name => "kw" },
1027 { name => "kw-GB" },
1028 { name => "ky", lcid => 0x00000040, oemcp => 866, group => 5, sabbrevlangname => "KYR" },
1029 { name => "ky-Cyrl", alias => "ky" },
1030 { name => "ky-Cyrl-KG", alias => "ky-KG" },
1031 { name => "ky-KG", lcid => 0x00000440 },
1032 { name => "la", lcid => 0x00000076, oemcp => 437, slist => ",", sabbrevlangname => "ZZZ" },
1033 { name => "la-VA", lcid => 0x00000476 },
1034 { name => "la-001", alias => "la-VA" },
1035 { name => "lag" },
1036 { name => "lag-TZ" },
1037 { name => "lb", lcid => 0x0000006e, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "LBX" },
1038 { name => "lb-LU", lcid => 0x0000046e },
1039 { name => "lg" },
1040 { name => "lg-UG" },
1041 { name => "lkt" },
1042 { name => "lkt-US" },
1043 { name => "ln" },
1044 { name => "ln-AO" },
1045 { name => "ln-CD" },
1046 { name => "ln-CF" },
1047 { name => "ln-CG" },
1048 { name => "lo", lcid => 0x00000054, group => 15 },
1049 { name => "lo-LA", lcid => 0x00000454 },
1050 { name => "lrc" },
1051 { name => "lrc-IQ" },
1052 { name => "lrc-IR" },
1053 { name => "lt", lcid => 0x00000027, oemcp => 775, group => 3, sabbrevlangname => "LTH", sopentypelang => "LTH" },
1054 { name => "lt-LT", lcid => 0x00000427 },
1055 { name => "lu" },
1056 { name => "lu-CD" },
1057 { name => "luo" },
1058 { name => "luo-KE" },
1059 { name => "luy", sopentypelang => "LUH" },
1060 { name => "luy-KE" },
1061 { name => "lv", lcid => 0x00000026, oemcp => 775, group => 3, sabbrevlangname => "LVI", sopentypelang => "LVI" },
1062 { name => "lv-LV", lcid => 0x00000426 },
1063 { name => "mai" },
1064 { name => "mai-IN" },
1065 { name => "mas" },
1066 { name => "mas-KE" },
1067 { name => "mas-TZ" },
1068 { name => "mer" },
1069 { name => "mer-KE" },
1070 { name => "mfe" },
1071 { name => "mfe-MU" },
1072 { name => "mg" },
1073 { name => "mg-MG" },
1074 { name => "mgh" },
1075 { name => "mgh-MZ" },
1076 { name => "mgo" },
1077 { name => "mgo-CM" },
1078 { name => "mi", lcid => 0x00000081, slist => "," },
1079 { name => "mi-Latn", alias => "mi" },
1080 { name => "mi-Latn-NZ", alias => "mi-NZ" },
1081 { name => "mi-NZ", lcid => 0x00000481 },
1082 { name => "mk", lcid => 0x0000002f, oemcp => 866, ebcdiccp => 500, group => 5, sabbrevlangname => "MKI" },
1083 { name => "mk-MK", lcid => 0x0000042f },
1084 { name => "ml", lcid => 0x0000004c, group => 15, sabbrevlangname => "MYM", sopentypelang => "MLR" },
1085 { name => "ml-IN", lcid => 0x0000044c },
1086 { name => "mn", lcid => 0x00000050, oemcp => 866, sopentypelang => "MNG" },
1087 { name => "mn-Cyrl", lcid => 0x00007850, file => "mn", sabbrevlangname => "MNN" },
1088 { name => "mn-Cyrl-MN", alias => "mn-MN" },
1089 { name => "mn-MN", lcid => 0x00000450, sparent => "mn-Cyrl", group => 5 },
1090 { name => "mn-Mong", lcid => 0x00007c50, oemcp => 65001, slist => ",", group => 15, sabbrevlangname => "MNG", nativedigits => "0123456789" },
1091 { name => "mn-Mong-CN", lcid => 0x00000850 },
1092 { name => "mn-Mong-MN", lcid => 0x00000c50, sabbrevlangname => "MNM" },
1093 { name => "mni", lcid => 0x00000058, slist => ",", sabbrevlangname => "ZZZ" },
1094 { name => "mni-IN", lcid => 0x00000458, file => "mni_Beng_IN" },
1095 { name => "mni-Beng" },
1096 { name => "mni-Beng-IN", alias => "mni-IN" },
1097 { name => "moh", lcid => 0x0000007c, oemcp => 850, ebcdiccp => 37, slist => ",", sabbrevlangname => "MWK" },
1098 { name => "moh-CA", lcid => 0x0000047c },
1099 { name => "moh-Latn", alias => "moh" },
1100 { name => "moh-Latn-CA", alias => "moh-CA" },
1101 { name => "mr", lcid => 0x0000004e, slist => ",", group => 15 },
1102 { name => "mr-IN", lcid => 0x0000044e },
1103 { name => "ms", lcid => 0x0000003e, oemcp => 850, sabbrevlangname => "MSL", sopentypelang => "MLY" },
1104 { name => "ms-BN", lcid => 0x0000083e, sabbrevlangname => "MSB" },
1105 { name => "ms-ID" },
1106 { name => "ms-Latn", alias => "ms" },
1107 { name => "ms-Latn-BN", alias => "ms-BN" },
1108 { name => "ms-Latn-MY", alias => "ms-MY" },
1109 { name => "ms-Latn-SG", alias => "ms-SG" },
1110 { name => "ms-MY", lcid => 0x0000043e },
1111 { name => "ms-SG" },
1112 { name => "mt", lcid => 0x0000003a, sopentypelang => "MTS" },
1113 { name => "mt-MT", lcid => 0x0000043a },
1114 { name => "mua" },
1115 { name => "mua-CM" },
1116 { name => "my", lcid => 0x00000055, sopentypelang => "BRM" },
1117 { name => "my-MM", lcid => 0x00000455 },
1118 { name => "mzn" },
1119 { name => "mzn-IR" },
1120 { name => "naq" },
1121 { name => "naq-NA" },
1122 { name => "nb", lcid => 0x00007c14, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NOR", sopentypelang => "NOR" },
1123 { name => "nb-NO", lcid => 0x00000414 },
1124 { name => "nb-SJ" },
1125 { name => "nd", sopentypelang => "NDB" },
1126 { name => "nd-ZW" },
1127 { name => "nds" },
1128 { name => "nds-DE" },
1129 { name => "nds-NL" },
1130 { name => "ne", lcid => 0x00000061, slist => "," },
1131 { name => "ne-IN", lcid => 0x00000861, sabbrevlangname => "NEI" },
1132 { name => "ne-NP", lcid => 0x00000461, group => 15 },
1133 { name => "nl", lcid => 0x00000013, oemcp => 850 },
1134 { name => "nl-AW" },
1135 { name => "nl-BE", lcid => 0x00000813, sabbrevlangname => "NLB" },
1136 { name => "nl-BQ" },
1137 { name => "nl-CW" },
1138 { name => "nl-NL", lcid => 0x00000413 },
1139 { name => "nl-SR" },
1140 { name => "nl-SX" },
1141 { name => "nmg" },
1142 { name => "nmg-CM" },
1143 { name => "nn", lcid => 0x00007814, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NON", sopentypelang => "NYN" },
1144 { name => "nn-NO", lcid => 0x00000814 },
1145 { name => "nnh" },
1146 { name => "nnh-CM" },
1147 { name => "no", lcid => 0x00000014, oemcp => 850, ebcdiccp => 20277, sortlocale => "nb-NO" },
1148 { name => "nqo", idigits => 3, inegnumber => 3, slist => "\x{060c}", sopentypelang => "NKO" },
1149 { name => "nqo-GN" },
1150 { name => "nr", sopentypelang => "NDB" },
1151 { name => "nr-ZA" },
1152 { name => "nso", lcid => 0x0000006c, oemcp => 850, sopentypelang => "SOT" },
1153 { name => "nso-ZA", lcid => 0x0000046c },
1154 { name => "nus" },
1155 { name => "nus-SD", alias => "nus-SS" },
1156 { name => "nus-SS" },
1157 { name => "nyn", sopentypelang => "NKL" },
1158 { name => "nyn-UG" },
1159 { name => "oc", lcid => 0x00000082, oemcp => 850, ebcdiccp => 20297 },
1160 { name => "oc-FR", lcid => 0x00000482 },
1161 { name => "oc-Latn", alias => "oc" },
1162 { name => "oc-Latn-FR", alias => "oc-FR" },
1163 { name => "om", lcid => 0x00000072, sopentypelang => "ORO" },
1164 { name => "om-ET", lcid => 0x00000472 },
1165 { name => "om-KE" },
1166 { name => "or", lcid => 0x00000048, slist => ",", group => 15 },
1167 { name => "or-IN", lcid => 0x00000448 },
1168 { name => "os" },
1169 { name => "os-GE" },
1170 { name => "os-RU" },
1171 { name => "pa", lcid => 0x00000046, slist => "," },
1172 { name => "pa-Arab", lcid => 0x00007c46, slist => ";", inegnumber => 2, oemcp => 720, group => 13, sabbrevlangname => "PAP" },
1173 { name => "pa-Arab-PK", lcid => 0x00000846 },
1174 { name => "pa-Guru" },
1175 { name => "pa-Guru-IN", alias => "pa-IN" },
1176 { name => "pa-IN", lcid => 0x00000446, sparent => "pa-Guru", file => "pa_Guru_IN", group => 15 },
1177 { name => "pap", lcid => 0x00000079, oemcp => 850, sopentypelang => "PAP0" },
1178 ## name => "pap-029", lcid => 0x00000479 },
1179 { name => "pcm" },
1180 { name => "pcm-NG", alias => "pcm-Latn-NG" },
1181 { name => "pcm-Latn", file => "pcm" },
1182 { name => "pcm-Latn-NG", file => "pcm_NG" },
1183 { name => "pl", lcid => 0x00000015, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "PLK", sopentypelang => "PLK" },
1184 { name => "pl-PL", lcid => 0x00000415 },
1185 { name => "prg" },
1186 { name => "prg-001" },
1187 { name => "prs", lcid => 0x0000008c, file => "fa", inegnumber => 3, oemcp => 720, group => 13, sopentypelang => "DRI" },
1188 { name => "prs-AF", lcid => 0x0000048c, file => "fa_AF" },
1189 { name => "prs-Arab", alias => "prs" },
1190 { name => "prs-Arab-AF", alias => "prs-AF" },
1191 { name => "ps", lcid => 0x00000063, group => 13, sabbrevlangname => "PAS", sopentypelang => "PAS" },
1192 { name => "ps-AF", lcid => 0x00000463 },
1193 { name => "ps-PK" },
1194 { name => "pt", lcid => 0x00000016, oemcp => 850, sabbrevlangname => "PTB", sopentypelang => "PTG" },
1195 { name => "pt-AO" },
1196 { name => "pt-BR", lcid => 0x00000416 },
1197 { name => "pt-CH", oemcp => 65001 },
1198 { name => "pt-CV" },
1199 { name => "pt-GQ", oemcp => 65001 },
1200 { name => "pt-GW" },
1201 { name => "pt-LU", oemcp => 65001 },
1202 { name => "pt-MO" },
1203 { name => "pt-MZ" },
1204 { name => "pt-PT", lcid => 0x00000816, sabbrevlangname => "PTG" },
1205 { name => "pt-ST" },
1206 { name => "pt-TL" },
1207 ## name => "qps-Latn-x-sh", lcid => 0x80000901 },
1208 ## name => "qps-ploc", lcid => 0x80000501 },
1209 ## name => "qps-ploca", lcid => 0x800005fe },
1210 ## name => "qps-plocm", lcid => 0x800009ff },
1211 { name => "qu", alias => "quz" },
1212 { name => "qu-BO", alias => "quz-BO" },
1213 { name => "qu-EC", alias => "quz-EC" },
1214 { name => "qu-PE", alias => "quz-PE" },
1215 { name => "quc", lcid => 0x00000086, oemcp => 850, ebcdiccp => 20284, slist => "," },
1216 { name => "quc-Latn", lcid => 0x00007c86, file => "quc" },
1217 { name => "quc-Latn-GT", lcid => 0x00000486, file => "quc_GT" },
1218 { name => "qut", alias => "quc" },
1219 { name => "qut-GT", alias => "quc-Latn-GT" },
1220 { name => "quz", lcid => 0x0000006b, file => "qu", territory => "BO", oemcp => 850, ebcdiccp => 20284, slist => "," },
1221 { name => "quz-BO", lcid => 0x0000046b, file => "qu_BO" },
1222 { name => "quz-EC", lcid => 0x0000086b, file => "qu_EC" },
1223 { name => "quz-Latn", alias => "quz" },
1224 { name => "quz-Latn-BO", alias => "quz-BO" },
1225 { name => "quz-Latn-EC", alias => "quz-EC" },
1226 { name => "quz-Latn-PE", alias => "quz-PE" },
1227 { name => "quz-PE", lcid => 0x00000c6b, file => "qu_PE" },
1228 { name => "rm", lcid => 0x00000017, oemcp => 850, ebcdiccp => 20273, sabbrevlangname => "RMC", sopentypelang => "RMS" },
1229 { name => "rm-CH", lcid => 0x00000417 },
1230 { name => "rn" },
1231 { name => "rn-BI" },
1232 { name => "ro", lcid => 0x00000018, oemcp => 852, ebcdiccp => 20880, sabbrevlangname => "ROM", sopentypelang => "ROM" },
1233 { name => "ro-MD", lcid => 0x00000818, maccp => 65001, sabbrevlangname => "ROD" },
1234 { name => "ro-RO", lcid => 0x00000418, group => 2 },
1235 { name => "rof" },
1236 { name => "rof-TZ" },
1237 { name => "ru", lcid => 0x00000019, oemcp => 866 },
1238 { name => "ru-BY", maccp => 65001 },
1239 { name => "ru-KG", maccp => 65001 },
1240 { name => "ru-KZ", maccp => 65001 },
1241 { name => "ru-MD", lcid => 0x00000819, maccp => 65001, sabbrevlangname => "RUM" },
1242 { name => "ru-RU", lcid => 0x00000419, group => 5 },
1243 { name => "ru-UA", maccp => 65001 },
1244 { name => "rw", lcid => 0x00000087, oemcp => 437, sopentypelang => "RUA" },
1245 { name => "rw-RW", lcid => 0x00000487 },
1246 { name => "rwk" },
1247 { name => "rwk-TZ" },
1248 { name => "sa", lcid => 0x0000004f, slist => ",", group => 15 },
1249 { name => "sa-Deva", alias => "sa" },
1250 { name => "sa-Deva-IN", alias => "sa-IN" },
1251 { name => "sa-IN", lcid => 0x0000044f },
1252 { name => "sah", lcid => 0x00000085, oemcp => 866, group => 5, sopentypelang => "YAK" },
1253 { name => "sah-Cyrl", alias => "sah" },
1254 { name => "sah-Cyrl-RU", alias => "sah-RU" },
1255 { name => "sah-RU", lcid => 0x00000485 },
1256 { name => "saq" },
1257 { name => "saq-KE" },
1258 { name => "sat" },
1259 { name => "sat-Olck" },
1260 { name => "sat-Olck-IN" },
1261 { name => "sbp" },
1262 { name => "sbp-TZ" },
1263 { name => "sc" },
1264 { name => "sc-IT" },
1265 { name => "sd", lcid => 0x00000059, inegnumber => 3, oemcp => 720, sabbrevlangname => "SIP" },
1266 { name => "sd-Arab", lcid => 0x00007c59, group => 13 },
1267 { name => "sd-Arab-PK", lcid => 0x00000859 },
1268 { name => "sd-Deva", inegnumber => 1, slist => ",", oemcp => 65001, group => 15 },
1269 { name => "sd-Deva-IN", lcid => 0x00000459, sabbrevlangname => "ZZZ" },
1270 { name => "sd-PK", alias => "sd-Arab-PK" },
1271 { name => "se", lcid => 0x0000003b, oemcp => 850, ebcdiccp => 20277, sopentypelang => "NSM" },
1272 { name => "se-FI", lcid => 0x00000c3b, ebcdiccp => 20278, sabbrevlangname => "SMG" },
1273 { name => "se-NO", lcid => 0x0000043b },
1274 { name => "se-SE", lcid => 0x0000083b, ebcdiccp => 20278, sabbrevlangname => "SMF" },
1275 { name => "se-Latn", alias => "se" },
1276 { name => "se-Latn-FI", alias => "se-FI" },
1277 { name => "se-Latn-NO", alias => "se-NO" },
1278 { name => "se-Latn-SE", alias => "se-SE" },
1279 { name => "seh" },
1280 { name => "seh-MZ" },
1281 { name => "ses" },
1282 { name => "ses-ML" },
1283 { name => "sg", sopentypelang => "SGO" },
1284 { name => "sg-CF" },
1285 { name => "shi" },
1286 { name => "shi-Latn" },
1287 { name => "shi-Latn-MA" },
1288 { name => "shi-Tfng" },
1289 { name => "shi-Tfng-MA" },
1290 { name => "si", lcid => 0x0000005b, group => 15, sopentypelang => "SNH" },
1291 { name => "si-LK", lcid => 0x0000045b },
1292 { name => "sk", lcid => 0x0000001b, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "SKY", sopentypelang => "SKY" },
1293 { name => "sk-SK", lcid => 0x0000041b },
1294 { name => "sl", lcid => 0x00000024, oemcp => 852, ebcdiccp => 20880, group => 2 },
1295 { name => "sl-SI", lcid => 0x00000424 },
1296 { name => "sma", lcid => 0x0000783b, sparent => "se", ebcdiccp => 20278, sabbrevlangname => "SMB", sopentypelang => "SSM" },
1297 { name => "sma-Latn", alias => "sma" },
1298 { name => "sma-Latn-NO", alias => "sma-NO" },
1299 { name => "sma-Latn-SE", alias => "sma-SE" },
1300 { name => "sma-NO", lcid => 0x0000183b, ebcdiccp => 20277, sabbrevlangname => "SMA" },
1301 { name => "sma-SE", lcid => 0x00001c3b },
1302 { name => "smj", lcid => 0x00007c3b, sparent => "se", ebcdiccp => 20278, sabbrevlangname => "SMK", sopentypelang => "LSM" },
1303 { name => "smj-Latn", alias => "smj" },
1304 { name => "smj-Latn-NO", alias => "smj-NO" },
1305 { name => "smj-Latn-SE", alias => "smj-SE" },
1306 { name => "smj-NO", lcid => 0x0000103b, ebcdiccp => 20277, sabbrevlangname => "SMJ" },
1307 { name => "smj-SE", lcid => 0x0000143b },
1308 { name => "smn", lcid => 0x0000703b, sparent => "se", ebcdiccp => 20278, sopentypelang => "ISM" },
1309 { name => "smn-FI", lcid => 0x0000243b },
1310 { name => "smn-Latn", alias => "smn" },
1311 { name => "smn-Latn-FI", alias => "smn-FI" },
1312 { name => "sms", lcid => 0x0000743b, sparent => "se", ebcdiccp => 20278, sopentypelang => "SKS" },
1313 { name => "sms-FI", lcid => 0x0000203b },
1314 { name => "sms-Latn", alias => "sms" },
1315 { name => "sms-Latn-FI", alias => "sms-FI" },
1316 { name => "sn", sopentypelang => "SNA0" },
1317 { name => "sn-Latn", file => "sn" },
1318 { name => "sn-Latn-ZW", file => "sn_ZW" },
1319 { name => "sn-ZW", alias => "sn-Latn-ZW" },
1320 { name => "so", lcid => 0x00000077, sopentypelang => "SML" },
1321 { name => "so-DJ" },
1322 { name => "so-ET" },
1323 { name => "so-KE" },
1324 { name => "so-SO", lcid => 0x00000477 },
1325 { name => "sq", lcid => 0x0000001c, oemcp => 852, ebcdiccp => 20880, group => 2 },
1326 { name => "sq-AL", lcid => 0x0000041c },
1327 { name => "sq-MK" },
1328 { name => "sq-XK" },
1329 { name => "sr", lcid => 0x00007c1a, sortlocale => "sr-Latn-RS", oemcp => 852, group => 2, sabbrevlangname => "SRB", sopentypelang => "SRB" },
1330 { name => "sr-Cyrl", lcid => 0x00006c1a, oemcp => 855, ebcdiccp => 21025, group => 5, sabbrevlangname => "SRO" },
1331 { name => "sr-Cyrl-BA", lcid => 0x00001c1a, sabbrevlangname => "SRN" },
1332 { name => "sr-Cyrl-ME", lcid => 0x0000301a, sabbrevlangname => "SRQ" },
1333 { name => "sr-Cyrl-RS", lcid => 0x0000281a },
1334 { name => "sr-Cyrl-XK" },
1335 { name => "sr-Latn", lcid => 0x0000701a, sabbrevlangname => "SRM" },
1336 { name => "sr-Latn-BA", lcid => 0x0000181a, maccp => 10082, ebcdiccp => 870, sabbrevlangname => "SRS" },
1337 { name => "sr-Latn-ME", lcid => 0x00002c1a, sabbrevlangname => "SRP" },
1338 { name => "sr-Latn-RS", lcid => 0x0000241a, sabbrevlangname => "SRM" },
1339 { name => "sr-Latn-XK" },
1340 ## name => "sr-Cyrl-CS", lcid => 0x00000c1a },
1341 ## name => "sr-Latn-CS", lcid => 0x0000081a },
1342 { name => "ss", sopentypelang => "SWZ" },
1343 { name => "ss-SZ" },
1344 { name => "ss-ZA" },
1345 { name => "ssy" },
1346 { name => "ssy-ER" },
1347 { name => "st", lcid => 0x00000030 },
1348 { name => "st-LS" },
1349 { name => "st-ZA", lcid => 0x00000430 },
1350 { name => "su" },
1351 { name => "su-Latn" },
1352 { name => "su-Latn-ID" },
1353 { name => "sv", lcid => 0x0000001d, oemcp => 850, ebcdiccp => 20278, sabbrevlangname => "SVE", sopentypelang => "SVE" },
1354 { name => "sv-AX" },
1355 { name => "sv-FI", lcid => 0x0000081d, sabbrevlangname => "SVF" },
1356 { name => "sv-SE", lcid => 0x0000041d, sabbrevlangname => "SVE" },
1357 { name => "sw", lcid => 0x00000041, territory => "KE", oemcp => 437, ebcdiccp => 500, sabbrevlangname => "SWK", sopentypelang => "SWK" },
1358 { name => "sw-CD" },
1359 { name => "sw-KE", lcid => 0x00000441 },
1360 { name => "sw-TZ" },
1361 { name => "sw-UG" },
1362 { name => "swc-CD", alias => "sw-CD" },
1363 { name => "syr", lcid => 0x0000005a, slist => ",", group => 13 },
1364 { name => "syr-SY", lcid => 0x0000045a },
1365 { name => "syr-Syrc", alias => "syr" },
1366 { name => "syr-Syrc-SY", alias => "syr-SY" },
1367 { name => "ta", lcid => 0x00000049, slist => ",", group => 15, sabbrevlangname => "TAI" },
1368 { name => "ta-IN", lcid => 0x00000449 },
1369 { name => "ta-LK", lcid => 0x00000849, sabbrevlangname => "TAM" },
1370 { name => "ta-MY" },
1371 { name => "ta-SG" },
1372 { name => "te", lcid => 0x0000004a, group => 15 },
1373 { name => "te-IN", lcid => 0x0000044a },
1374 { name => "teo" },
1375 { name => "teo-KE" },
1376 { name => "teo-UG" },
1377 { name => "tg", lcid => 0x00000028, oemcp => 866, group => 5, sabbrevlangname => "TAJ", sopentypelang => "TAJ" },
1378 { name => "tg-Cyrl", lcid => 0x00007c28, file => "tg" },
1379 { name => "tg-Cyrl-TJ", lcid => 0x00000428, file => "tg_TJ" },
1380 { name => "tg-TJ", alias => "tg-Cyrl-TJ" },
1381 { name => "th", lcid => 0x0000001e, oemcp => 874, ebcdiccp => 20838, slist => ",", group => 11 },
1382 { name => "th-TH", lcid => 0x0000041e },
1383 { name => "ti", lcid => 0x00000073, territory => "ER", sopentypelang => "TGY" },
1384 { name => "ti-ER", lcid => 0x00000873 },
1385 { name => "ti-ET", lcid => 0x00000473, sabbrevlangname => "TIE" },
1386 { name => "tig", sopentypelang => "TGR" },
1387 { name => "tig-ER" },
1388 { name => "tig-Ethi-ER", alias => "tig-ER" },
1389 { name => "tk", lcid => 0x00000042, oemcp => 852, ebcdiccp => 20880, group => 2, sopentypelang => "TKM" },
1390 { name => "tk-Latn", alias => "tk" },
1391 { name => "tk-Latn-TM", alias => "tk-TM" },
1392 { name => "tk-TM", lcid => 0x00000442 },
1393 { name => "tn", lcid => 0x00000032, oemcp => 850, sopentypelang => "TNA" },
1394 { name => "tn-BW", lcid => 0x00000832, sabbrevlangname => "TSB" },
1395 { name => "tn-ZA", lcid => 0x00000432 },
1396 { name => "to", sopentypelang => "TGN" },
1397 { name => "to-TO" },
1398 { name => "tr", lcid => 0x0000001f, oemcp => 857, ebcdiccp => 20905, group => 6, sabbrevlangname => "TRK", sopentypelang => "TRK" },
1399 { name => "tr-CY" },
1400 { name => "tr-TR", lcid => 0x0000041f },
1401 { name => "ts", lcid => 0x00000031, sopentypelang => "TSG" },
1402 { name => "ts-ZA", lcid => 0x00000431 },
1403 { name => "tt", lcid => 0x00000044, oemcp => 866, group => 5, sabbrevlangname => "TTT" },
1404 { name => "tt-Cyrl", alias => "tt" },
1405 { name => "tt-Cyrl-RU", alias => "tt-RU" },
1406 { name => "tt-RU", lcid => 0x00000444 },
1407 { name => "twq" },
1408 { name => "twq-NE" },
1409 { name => "tzm", lcid => 0x0000005f, sortlocale => "tzm-Latn-DZ", oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "TZA" },
1410 { name => "tzm-Latn", lcid => 0x00007c5f, territory => "DZ", file => "tzm" },
1411 { name => "tzm-Latn-MA", file => "tzm_MA", oemcp => 65001 },
1412 { name => "tzm-Latn-DZ", lcid => 0x0000085f, file => "tzm" },
1413 { name => "tzm-MA", alias => "tzm-Latn-MA" },
1414 { name => "tzm-DZ", alias => "tzm-Latn-DZ" },
1415 ## name => "tzm-Arab", group => 13 },
1416 ## name => "tzm-Arab-MA", lcid => 0x0000045f },
1417 ## name => "tzm-Tfng", lcid => 0x0000785f },
1418 ## name => "tzm-Tfng-MA", lcid => 0x0000105f },
1419 { name => "ug", lcid => 0x00000080, oemcp => 720, slist => ",", group => 13, sopentypelang => "UYG", nativedigits => "0123456789" },
1420 { name => "ug-Arab", alias => "ug" },
1421 { name => "ug-Arab-CN", alias => "ug-CN" },
1422 { name => "ug-CN", lcid => 0x00000480 },
1423 { name => "uk", lcid => 0x00000022, oemcp => 866, maccp => 10017, ebcdiccp => 500, group => 5 },
1424 { name => "uk-UA", lcid => 0x00000422 },
1425 { name => "ur", lcid => 0x00000020, oemcp => 720 },
1426 { name => "ur-IN", lcid => 0x00000820, maccp => 65001, sabbrevlangname => "URI" },
1427 { name => "ur-PK", lcid => 0x00000420, group => 13 },
1428 { name => "uz", lcid => 0x00000043, oemcp => 857, maccp => 10029, group => 2 },
1429 { name => "uz-Arab", oemcp => 65001, maccp => 65001 },
1430 { name => "uz-Arab-AF" },
1431 { name => "uz-Cyrl", lcid => 0x00007843, oemcp => 866, maccp => 10007, group => 5, sabbrevlangname => "UZC" },
1432 { name => "uz-Cyrl-UZ", lcid => 0x00000843 },
1433 { name => "uz-Latn", lcid => 0x00007c43 },
1434 { name => "uz-Latn-UZ", lcid => 0x00000443 },
1435 { name => "vai" },
1436 { name => "vai-Latn" },
1437 { name => "vai-Latn-LR" },
1438 { name => "vai-Vaii" },
1439 { name => "vai-Vaii-LR" },
1440 { name => "ve", lcid => 0x00000033, sabbrevlangname => "ZZZ" },
1441 { name => "ve-ZA", lcid => 0x00000433 },
1442 { name => "vi", lcid => 0x0000002a, oemcp => 1258, slist => ",", group => 14, sabbrevlangname => "VIT", sopentypelang => "VIT" },
1443 { name => "vi-VN", lcid => 0x0000042a },
1444 { name => "vo" },
1445 { name => "vo-001" },
1446 { name => "vun" },
1447 { name => "vun-TZ" },
1448 { name => "wa", oemcp => 850 },
1449 { name => "wa-BE" },
1450 { name => "wae" },
1451 { name => "wae-CH" },
1452 { name => "wal" },
1453 { name => "wal-ET" },
1454 { name => "wo", lcid => 0x00000088, oemcp => 850, ebcdiccp => 20297, sopentypelang => "WLF" },
1455 { name => "wo-Latn", alias => "wo" },
1456 { name => "wo-Latn-SN", alias => "wo-SN" },
1457 { name => "wo-SN", lcid => 0x00000488 },
1458 { name => "x-IV_mathan", lcid => 0x0001007f, alias => "" },
1459 { name => "xh", lcid => 0x00000034, oemcp => 850, sopentypelang => "XHS" },
1460 { name => "xh-ZA", lcid => 0x00000434 },
1461 { name => "xog" },
1462 { name => "xog-UG" },
1463 { name => "yav" },
1464 { name => "yav-CM" },
1465 { name => "yi", lcid => 0x0000003d, sabbrevlangname => "ZZZ", sopentypelang => "JII" },
1466 { name => "yi-001", lcid => 0x0000043d },
1467 { name => "yo", lcid => 0x0000006a, oemcp => 437, sopentypelang => "YBA" },
1468 { name => "yo-BJ", ebcdiccp => 500 },
1469 { name => "yo-Latn", alias => "yo" },
1470 { name => "yo-Latn-NG", alias => "yo-NG" },
1471 { name => "yo-NG", lcid => 0x0000046a },
1472 { name => "yrl" },
1473 { name => "yrl-BR" },
1474 { name => "yrl-CO" },
1475 { name => "yrl-VE" },
1476 { name => "yue" },
1477 { name => "yue-Hans" },
1478 { name => "yue-Hans-CN" },
1479 { name => "yue-Hant" },
1480 { name => "yue-Hant-HK" },
1481 { name => "zgh" },
1482 { name => "zgh-MA", alias => "zgh-Tfng-MA" },
1483 { name => "zgh-Tfng", file => "zgh" },
1484 { name => "zgh-Tfng-MA", file => "zgh_MA" },
1485 { name => "zh", lcid => 0x00007804, ireadinglayout => 2, oemcp => 936, slist => ",", sscripts => "Hani Hans", sabbrevlangname => "CHS", sopentypelang => "ZHS", nativedigits => "0123456789" },
1486 { name => "zh-CN", lcid => 0x00000804, file => "zh_Hans_CN", sparent => "zh-Hans" },
1487 { name => "zh-CN_phoneb", lcid => 0x00050804, alias => "zh-CN" },
1488 { name => "zh-CN_stroke", lcid => 0x00020804, alias => "zh-CN" },
1489 { name => "zh-Hans", lcid => 0x00000004, group => 10 },
1490 { name => "zh-Hans-CN", alias => "zh-CN" },
1491 { name => "zh-Hans-CN-u-co-phonebk", alias => "zh-CN_phoneb" },
1492 { name => "zh-Hans-CN-u-co-stroke", alias => "zh-CN_stroke" },
1493 { name => "zh-Hans-HK", slist => ";", nativedigits => "" },
1494 { name => "zh-Hans-MO", slist => ";", nativedigits => "" },
1495 { name => "zh-Hans-SG", alias => "zh-SG" },
1496 { name => "zh-Hans-SG-u-co-phonebk", alias => "zh-SG_phoneb" },
1497 { name => "zh-Hans-SG-u-co-stroke", alias => "zh-SG_stroke" },
1498 { name => "zh-Hant", lcid => 0x00007c04, sortlocale => "zh-HK", ireadinglayout => 2, oemcp => 950, slist => ",", sscripts => "Hani Hant", group => 9, sabbrevlangname => "CHT", sopentypelang => "ZHH" },
1499 { name => "zh-Hant-HK", alias => "zh-HK" },
1500 { name => "zh-Hant-HK-u-co-unihan", alias => "zh-HK_radstr" },
1501 { name => "zh-Hant-MO", alias => "zh-MO" },
1502 { name => "zh-Hant-MO-u-co-stroke", alias => "zh-MO_stroke" },
1503 { name => "zh-Hant-MO-u-co-unihan", alias => "zh-MO_radstr" },
1504 { name => "zh-Hant-TW", alias => "zh-TW" },
1505 { name => "zh-Hant-TW-u-co-phonetic", alias => "zh-TW_pronun" },
1506 { name => "zh-Hant-TW-u-co-unihan", alias => "zh-TW_radstr" },
1507 { name => "zh-HK", lcid => 0x00000c04, file => "zh_Hant_HK", sparent => "zh-Hant", sabbrevlangname => "ZHH" },
1508 { name => "zh-HK_radstr", lcid => 0x00040c04, alias => "zh-HK" },
1509 { name => "zh-MO", lcid => 0x00001404, file => "zh_Hant_MO", sparent => "zh-Hant", sabbrevlangname => "ZHM", sopentypelang => "ZHT" },
1510 { name => "zh-MO_radstr", lcid => 0x00041404, alias => "zh-MO" },
1511 { name => "zh-MO_stroke", lcid => 0x00021404, alias => "zh-MO" },
1512 { name => "zh-SG", lcid => 0x00001004, file => "zh_Hans_SG", sparent => "zh-Hans", sabbrevlangname => "ZHI" },
1513 { name => "zh-SG_phoneb", lcid => 0x00051004, alias => "zh-SG" },
1514 { name => "zh-SG_stroke", lcid => 0x00021004, alias => "zh-SG" },
1515 { name => "zh-TW", lcid => 0x00000404, file => "zh_Hant_TW", sparent => "zh-Hant", sopentypelang => "ZHT" },
1516 { name => "zh-TW_pronun", lcid => 0x00030404, alias => "zh-TW" },
1517 { name => "zh-TW_radstr", lcid => 0x00040404, alias => "zh-TW" },
1518 { name => "zu", lcid => 0x00000035, oemcp => 850 },
1519 { name => "zu-ZA", lcid => 0x00000435 },
1522 my @calendars =
1524 { id => 1, name => "Gregorian", itwodigityearmax => 2049 },
1525 { id => 2, type => "gregorian", locale => "en-US", itwodigityearmax => 2049 },
1526 { id => 3, type => "japanese", locale => "ja-JP", eras => [ 232..236 ] },
1527 { id => 4, type => "roc", locale => "zh-TW", eras => [ 1 ] },
1528 { id => 5, type => "dangi", locale => "ko-KR", eras => [ 0 ] },
1529 { id => 6, type => "islamic", locale => "ar-SA", itwodigityearmax => 1451 },
1530 { id => 7, type => "buddhist", locale => "th-TH", eras => [ 0 ] },
1531 { id => 8, type => "hebrew", locale => "he-IL", itwodigityearmax => 5810 },
1532 { id => 9, type => "gregorian", locale => "fr-FR", itwodigityearmax => 2049 },
1533 { id => 10, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1534 { id => 11, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1535 { id => 12, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
1536 { id => 13, name => "Julian", locale => "en-US", itwodigityearmax => 2049 },
1537 { id => 14, name => "Japanese Lunisolar" },
1538 { id => 15, name => "Chinese Lunisolar" },
1539 { id => 16, name => "Saka" },
1540 { id => 17, name => "Lunar ETO Chinese" },
1541 { id => 18, name => "Lunar ETO Korean" },
1542 { id => 19, name => "Lunar ETO Rokuyou" },
1543 { id => 20, name => "Korean Lunisolar" },
1544 { id => 21, name => "Taiwan Lunisolar" },
1545 { id => 22, type => "persian", locale => "prs-AF", itwodigityearmax => 1429 },
1546 { id => 23, type => "islamic-umalqura", locale => "ar-SA", itwodigityearmax => 1451 },
1549 my @geoids =
1551 { id => 2, name => "AG" }, # Antigua and Barbuda
1552 { id => 3, name => "AF" }, # Afghanistan
1553 { id => 4, name => "DZ" }, # Algeria
1554 { id => 5, name => "AZ" }, # Azerbaijan
1555 { id => 6, name => "AL" }, # Albania
1556 { id => 7, name => "AM" }, # Armenia
1557 { id => 8, name => "AD" }, # Andorra
1558 { id => 9, name => "AO" }, # Angola
1559 { id => 10, name => "AS" }, # American Samoa
1560 { id => 11, name => "AR" }, # Argentina
1561 { id => 12, name => "AU" }, # Australia
1562 { id => 14, name => "AT" }, # Austria
1563 { id => 17, name => "BH" }, # Bahrain
1564 { id => 18, name => "BB" }, # Barbados
1565 { id => 19, name => "BW" }, # Botswana
1566 { id => 20, name => "BM" }, # Bermuda
1567 { id => 21, name => "BE" }, # Belgium
1568 { id => 22, name => "BS" }, # Bahamas, The
1569 { id => 23, name => "BD" }, # Bangladesh
1570 { id => 24, name => "BZ" }, # Belize
1571 { id => 25, name => "BA" }, # Bosnia and Herzegovina
1572 { id => 26, name => "BO" }, # Bolivia
1573 { id => 27, name => "MM" }, # Myanmar
1574 { id => 28, name => "BJ" }, # Benin
1575 { id => 29, name => "BY" }, # Belarus
1576 { id => 30, name => "SB" }, # Solomon Islands
1577 { id => 32, name => "BR" }, # Brazil
1578 { id => 34, name => "BT" }, # Bhutan
1579 { id => 35, name => "BG" }, # Bulgaria
1580 { id => 37, name => "BN" }, # Brunei
1581 { id => 38, name => "BI" }, # Burundi
1582 { id => 39, name => "CA" }, # Canada
1583 { id => 40, name => "KH" }, # Cambodia
1584 { id => 41, name => "TD" }, # Chad
1585 { id => 42, name => "LK" }, # Sri Lanka
1586 { id => 43, name => "CG" }, # Congo
1587 { id => 44, name => "CD" }, # Congo (DRC)
1588 { id => 45, name => "CN" }, # China
1589 { id => 46, name => "CL" }, # Chile
1590 { id => 49, name => "CM" }, # Cameroon
1591 { id => 50, name => "KM" }, # Comoros
1592 { id => 51, name => "CO" }, # Colombia
1593 { id => 54, name => "CR" }, # Costa Rica
1594 { id => 55, name => "CF" }, # Central African Republic
1595 { id => 56, name => "CU" }, # Cuba
1596 { id => 57, name => "CV" }, # Cape Verde
1597 { id => 59, name => "CY" }, # Cyprus
1598 { id => 61, name => "DK" }, # Denmark
1599 { id => 62, name => "DJ" }, # Djibouti
1600 { id => 63, name => "DM" }, # Dominica
1601 { id => 65, name => "DO" }, # Dominican Republic
1602 { id => 66, name => "EC" }, # Ecuador
1603 { id => 67, name => "EG" }, # Egypt
1604 { id => 68, name => "IE" }, # Ireland
1605 { id => 69, name => "GQ" }, # Equatorial Guinea
1606 { id => 70, name => "EE" }, # Estonia
1607 { id => 71, name => "ER" }, # Eritrea
1608 { id => 72, name => "SV" }, # El Salvador
1609 { id => 73, name => "ET" }, # Ethiopia
1610 { id => 75, name => "CZ" }, # Czech Republic
1611 { id => 77, name => "FI" }, # Finland
1612 { id => 78, name => "FJ" }, # Fiji Islands
1613 { id => 80, name => "FM" }, # Micronesia
1614 { id => 81, name => "FO" }, # Faroe Islands
1615 { id => 84, name => "FR" }, # France
1616 { id => 86, name => "GM" }, # Gambia, The
1617 { id => 87, name => "GA" }, # Gabon
1618 { id => 88, name => "GE" }, # Georgia
1619 { id => 89, name => "GH" }, # Ghana
1620 { id => 90, name => "GI" }, # Gibraltar
1621 { id => 91, name => "GD" }, # Grenada
1622 { id => 93, name => "GL" }, # Greenland
1623 { id => 94, name => "DE" }, # Germany
1624 { id => 98, name => "GR" }, # Greece
1625 { id => 99, name => "GT" }, # Guatemala
1626 { id => 100, name => "GN" }, # Guinea
1627 { id => 101, name => "GY" }, # Guyana
1628 { id => 103, name => "HT" }, # Haiti
1629 { id => 104, name => "HK" }, # Hong Kong S.A.R.
1630 { id => 106, name => "HN" }, # Honduras
1631 { id => 108, name => "HR" }, # Croatia
1632 { id => 109, name => "HU" }, # Hungary
1633 { id => 110, name => "IS" }, # Iceland
1634 { id => 111, name => "ID" }, # Indonesia
1635 { id => 113, name => "IN" }, # India
1636 { id => 114, name => "IO" }, # British Indian Ocean Territory
1637 { id => 116, name => "IR" }, # Iran
1638 { id => 117, name => "IL" }, # Israel
1639 { id => 118, name => "IT" }, # Italy
1640 { id => 119, name => "CI" }, # Côte d'Ivoire
1641 { id => 121, name => "IQ" }, # Iraq
1642 { id => 122, name => "JP" }, # Japan
1643 { id => 124, name => "JM" }, # Jamaica
1644 { id => 125, name => "SJ" }, # Jan Mayen
1645 { id => 126, name => "JO" }, # Jordan
1646 { id => 127, parent => "UM" }, # Johnston Atoll
1647 { id => 129, name => "KE" }, # Kenya
1648 { id => 130, name => "KG" }, # Kyrgyzstan
1649 { id => 131, name => "KP" }, # North Korea
1650 { id => 133, name => "KI" }, # Kiribati
1651 { id => 134, name => "KR" }, # Korea
1652 { id => 136, name => "KW" }, # Kuwait
1653 { id => 137, name => "KZ" }, # Kazakhstan
1654 { id => 138, name => "LA" }, # Laos
1655 { id => 139, name => "LB" }, # Lebanon
1656 { id => 140, name => "LV" }, # Latvia
1657 { id => 141, name => "LT" }, # Lithuania
1658 { id => 142, name => "LR" }, # Liberia
1659 { id => 143, name => "SK" }, # Slovakia
1660 { id => 145, name => "LI" }, # Liechtenstein
1661 { id => 146, name => "LS" }, # Lesotho
1662 { id => 147, name => "LU" }, # Luxembourg
1663 { id => 148, name => "LY" }, # Libya
1664 { id => 149, name => "MG" }, # Madagascar
1665 { id => 151, name => "MO" }, # Macao S.A.R.
1666 { id => 152, name => "MD" }, # Moldova
1667 { id => 154, name => "MN" }, # Mongolia
1668 { id => 156, name => "MW" }, # Malawi
1669 { id => 157, name => "ML" }, # Mali
1670 { id => 158, name => "MC" }, # Monaco
1671 { id => 159, name => "MA" }, # Morocco
1672 { id => 160, name => "MU" }, # Mauritius
1673 { id => 162, name => "MR" }, # Mauritania
1674 { id => 163, name => "MT" }, # Malta
1675 { id => 164, name => "OM" }, # Oman
1676 { id => 165, name => "MV" }, # Maldives
1677 { id => 166, name => "MX" }, # Mexico
1678 { id => 167, name => "MY" }, # Malaysia
1679 { id => 168, name => "MZ" }, # Mozambique
1680 { id => 173, name => "NE" }, # Niger
1681 { id => 174, name => "VU" }, # Vanuatu
1682 { id => 175, name => "NG" }, # Nigeria
1683 { id => 176, name => "NL" }, # Netherlands
1684 { id => 177, name => "NO" }, # Norway
1685 { id => 178, name => "NP" }, # Nepal
1686 { id => 180, name => "NR" }, # Nauru
1687 { id => 181, name => "SR" }, # Suriname
1688 { id => 182, name => "NI" }, # Nicaragua
1689 { id => 183, name => "NZ" }, # New Zealand
1690 { id => 184, name => "PS" }, # Palestinian Authority
1691 { id => 185, name => "PY" }, # Paraguay
1692 { id => 187, name => "PE" }, # Peru
1693 { id => 190, name => "PK" }, # Pakistan
1694 { id => 191, name => "PL" }, # Poland
1695 { id => 192, name => "PA" }, # Panama
1696 { id => 193, name => "PT" }, # Portugal
1697 { id => 194, name => "PG" }, # Papua New Guinea
1698 { id => 195, name => "PW" }, # Palau
1699 { id => 196, name => "GW" }, # Guinea-Bissau
1700 { id => 197, name => "QA" }, # Qatar
1701 { id => 198, name => "RE" }, # Reunion
1702 { id => 199, name => "MH" }, # Marshall Islands
1703 { id => 200, name => "RO" }, # Romania
1704 { id => 201, name => "PH" }, # Philippines
1705 { id => 202, name => "PR" }, # Puerto Rico
1706 { id => 203, name => "RU" }, # Russia
1707 { id => 204, name => "RW" }, # Rwanda
1708 { id => 205, name => "SA" }, # Saudi Arabia
1709 { id => 206, name => "PM" }, # St. Pierre and Miquelon
1710 { id => 207, name => "KN" }, # St. Kitts and Nevis
1711 { id => 208, name => "SC" }, # Seychelles
1712 { id => 209, name => "ZA" }, # South Africa
1713 { id => 210, name => "SN" }, # Senegal
1714 { id => 212, name => "SI" }, # Slovenia
1715 { id => 213, name => "SL" }, # Sierra Leone
1716 { id => 214, name => "SM" }, # San Marino
1717 { id => 215, name => "SG" }, # Singapore
1718 { id => 216, name => "SO" }, # Somalia
1719 { id => 217, name => "ES" }, # Spain
1720 { id => 218, name => "LC" }, # St. Lucia
1721 { id => 219, name => "SD" }, # Sudan
1722 { id => 220, name => "SJ" }, # Svalbard
1723 { id => 221, name => "SE" }, # Sweden
1724 { id => 222, name => "SY" }, # Syria
1725 { id => 223, name => "CH" }, # Switzerland
1726 { id => 224, name => "AE" }, # United Arab Emirates
1727 { id => 225, name => "TT" }, # Trinidad and Tobago
1728 { id => 227, name => "TH" }, # Thailand
1729 { id => 228, name => "TJ" }, # Tajikistan
1730 { id => 231, name => "TO" }, # Tonga
1731 { id => 232, name => "TG" }, # Togo
1732 { id => 233, name => "ST" }, # São Tomé and Príncipe
1733 { id => 234, name => "TN" }, # Tunisia
1734 { id => 235, name => "TR" }, # Turkey
1735 { id => 236, name => "TV" }, # Tuvalu
1736 { id => 237, name => "TW" }, # Taiwan
1737 { id => 238, name => "TM" }, # Turkmenistan
1738 { id => 239, name => "TZ" }, # Tanzania
1739 { id => 240, name => "UG" }, # Uganda
1740 { id => 241, name => "UA" }, # Ukraine
1741 { id => 242, name => "GB" }, # United Kingdom
1742 { id => 244, name => "US" }, # United States
1743 { id => 245, name => "BF" }, # Burkina Faso
1744 { id => 246, name => "UY" }, # Uruguay
1745 { id => 247, name => "UZ" }, # Uzbekistan
1746 { id => 248, name => "VC" }, # St. Vincent and the Grenadines
1747 { id => 249, name => "VE" }, # Bolivarian Republic of Venezuela
1748 { id => 251, name => "VN" }, # Vietnam
1749 { id => 252, name => "VI" }, # Virgin Islands
1750 { id => 253, name => "VA" }, # Vatican City
1751 { id => 254, name => "NA" }, # Namibia
1752 { id => 257, name => "EH" }, # Western Sahara (disputed)
1753 { id => 258, parent => "UM" }, # Wake Island
1754 { id => 259, name => "WS" }, # Samoa
1755 { id => 260, name => "SZ" }, # Swaziland
1756 { id => 261, name => "YE" }, # Yemen
1757 { id => 263, name => "ZM" }, # Zambia
1758 { id => 264, name => "ZW" }, # Zimbabwe
1759 { id => 269, name => "CS" }, # Serbia and Montenegro (Former)
1760 { id => 270, name => "ME" }, # Montenegro
1761 { id => 271, name => "RS" }, # Serbia
1762 { id => 273, name => "CW" }, # Curaçao
1763 { id => 276, name => "SS" }, # South Sudan
1764 { id => 300, name => "AI" }, # Anguilla
1765 { id => 301, name => "AQ" }, # Antarctica
1766 { id => 302, name => "AW" }, # Aruba
1767 { id => 303, parent => "SH" }, # Ascension Island
1768 { id => 304, parent => "053" }, # Ashmore and Cartier Islands
1769 { id => 305, parent => "UM" }, # Baker Island
1770 { id => 306, name => "BV" }, # Bouvet Island
1771 { id => 307, name => "KY" }, # Cayman Islands
1772 { id => 308, name => "830", parent => "155" }, # Channel Islands
1773 { id => 309, name => "CX" }, # Christmas Island
1774 { id => 310, parent => "009" }, # Clipperton Island
1775 { id => 311, name => "CC" }, # Cocos (Keeling) Islands
1776 { id => 312, name => "CK" }, # Cook Islands
1777 { id => 313, parent => "053" }, # Coral Sea Islands
1778 { id => 314, parent => "IO" }, # Diego Garcia
1779 { id => 315, name => "FK" }, # Falkland Islands (Islas Malvinas)
1780 { id => 317, name => "GF" }, # French Guiana
1781 { id => 318, name => "PF" }, # French Polynesia
1782 { id => 319, name => "TF" }, # French Southern and Antarctic Lands
1783 { id => 321, name => "GP" }, # Guadeloupe
1784 { id => 322, name => "GU" }, # Guam
1785 { id => 323 }, # Guantanamo Bay
1786 { id => 324, name => "GG" }, # Guernsey
1787 { id => 325, name => "HM" }, # Heard Island and McDonald Islands
1788 { id => 326, parent => "UM" }, # Howland Island
1789 { id => 327, parent => "UM" }, # Jarvis Island
1790 { id => 328, name => "JE" }, # Jersey
1791 { id => 329, parent => "UM" }, # Kingman Reef
1792 { id => 330, name => "MQ" }, # Martinique
1793 { id => 331, name => "YT" }, # Mayotte
1794 { id => 332, name => "MS" }, # Montserrat
1795 { id => 333, name => "AN", region => 1 }, # Netherlands Antilles (Former)
1796 { id => 334, name => "NC" }, # New Caledonia
1797 { id => 335, name => "NU" }, # Niue
1798 { id => 336, name => "NF" }, # Norfolk Island
1799 { id => 337, name => "MP" }, # Northern Mariana Islands
1800 { id => 338, parent => "UM" }, # Palmyra Atoll
1801 { id => 339, name => "PN" }, # Pitcairn Islands
1802 { id => 340, parent => "MP" }, # Rota Island
1803 { id => 341, parent => "MP" }, # Saipan
1804 { id => 342, name => "GS" }, # South Georgia and the South Sandwich Islands
1805 { id => 343, name => "SH" }, # St. Helena
1806 { id => 346, parent => "MP" }, # Tinian Island
1807 { id => 347, name => "TK" }, # Tokelau
1808 { id => 348, parent => "SH" }, # Tristan da Cunha
1809 { id => 349, name => "TC" }, # Turks and Caicos Islands
1810 { id => 351, name => "VG" }, # Virgin Islands, British
1811 { id => 352, name => "WF" }, # Wallis and Futuna
1812 { id => 742, name => "002" }, # Africa
1813 { id => 2129, name => "142" }, # Asia
1814 { id => 10541, name => "150" }, # Europe
1815 { id => 15126, name => "IM" }, # Man, Isle of
1816 { id => 19618, name => "MK" }, # Macedonia, Former Yugoslav Republic of
1817 { id => 20900, name => "054" }, # Melanesia
1818 { id => 21206, name => "057" }, # Micronesia
1819 { id => 21242, parent => "UM" }, # Midway Islands
1820 { id => 23581, name => "021" }, # Northern America
1821 { id => 26286, name => "061" }, # Polynesia
1822 { id => 27082, name => "013" }, # Central America
1823 { id => 27114, name => "009" }, # Oceania
1824 { id => 30967, name => "SX" }, # Sint Maarten (Dutch part)
1825 { id => 31396, name => "005" }, # South America
1826 { id => 31706, name => "MF" }, # Saint Martin (French part)
1827 { id => 39070, name => "001" }, # World
1828 { id => 42483, name => "011" }, # Western Africa
1829 { id => 42484, name => "017" }, # Middle Africa
1830 { id => 42487, name => "015" }, # Northern Africa
1831 { id => 47590, name => "143" }, # Central Asia
1832 { id => 47599, name => "035" }, # South-Eastern Asia
1833 { id => 47600, name => "030" }, # Eastern Asia
1834 { id => 47603, name => "014" }, # Eastern Africa
1835 { id => 47609, name => "151" }, # Eastern Europe
1836 { id => 47610, name => "039" }, # Southern Europe
1837 { id => 47611, name => "145" }, # Middle East
1838 { id => 47614, name => "034" }, # Southern Asia
1839 { id => 7299303, name => "TL" }, # Democratic Republic of Timor-Leste
1840 { id => 9914689, name => "XK" }, # Kosovo
1841 { id => 10026358, name => "019" }, # Americas
1842 { id => 10028789, name => "AX" }, # Åland Islands
1843 { id => 10039880, name => "029", sintlsymbol => "XCD" }, # Caribbean
1844 { id => 10039882, name => "154" }, # Northern Europe
1845 { id => 10039883, name => "018" }, # Southern Africa
1846 { id => 10210824, name => "155" }, # Western Europe
1847 { id => 10210825, name => "053" }, # Australia and New Zealand
1848 { id => 161832015, name => "BL" }, # Saint Barthélemy
1849 { id => 161832256, name => "UM" }, # U.S. Minor Outlying Islands
1850 { id => 161832257, name => "419", parent => "019" }, # Latin America and the Caribbean
1851 { id => 161832258, name => "BQ" }, # Bonaire, Sint Eustatius and Saba
1854 my @cp2uni = ();
1855 my @glyph2uni = ();
1856 my @lead_bytes = ();
1857 my @uni2cp = ();
1858 my @tolower_table = ();
1859 my @toupper_table = ();
1860 my @digitmap_table = ();
1861 my @halfwidth_table = ();
1862 my @fullwidth_table = ();
1863 my @cjk_compat_table = ();
1864 my @chinese_traditional_table = ();
1865 my @chinese_simplified_table = ();
1866 my @category_table = ();
1867 my @initial_joining_table = ();
1868 my @direction_table = ();
1869 my @decomp_table = ();
1870 my @combining_class_table = ();
1871 my @decomp_compat_table = ();
1872 my @comp_exclusions = ();
1873 my @idna_decomp_table = ();
1874 my @idna_disallowed = ();
1875 my %registry_keys;
1876 my $default_char;
1877 my $default_wchar;
1879 my %joining_forms =
1881 "isolated" => [],
1882 "final" => [],
1883 "initial" => [],
1884 "medial" => []
1887 my $current_data_file;
################################################################
# convert a list of code points into UTF-16 code units
#
# Values below 0x10000 are passed through unchanged; every other value is
# replaced by a high/low surrogate pair.  Returns the flattened list.
sub to_utf16(@)
{
    my @units = map
    {
        $_ < 0x10000
            ? $_
            : (0xd800 | (($_ - 0x10000) >> 10), 0xdc00 | (($_ - 0x10000) & 0x3ff))
    } @_;
    return @units;
}
################################################################
# fetch a unicode.org file and open it
#
# $id   - key into %data_files; the entry supplies the download url, an
#         optional cache file name and the expected sha256 checksum
# $name - optional member to extract when the cached file is a .zip or
#         .tar.gz archive
#
# The file is downloaded into "$XDG_CACHE_HOME/wine" (or "$HOME/.cache/wine")
# unless it is already there, its SHA-256 checksum is verified, and it is then
# opened for reading.  Sets $current_data_file and returns the FILE glob.
# Dies if the id is unknown, the download fails, the checksum does not match
# or the file cannot be opened.
sub open_data_file($@)
{
    my ($id, $name) = @_;
    my $data = $data_files{$id};
    die "unknown data file $id" unless defined $data;
    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
    local *FILE;

    my $url = $data->{url};
    # default cache name is the last path component of the url
    my $filename = "$cache/" . ($data->{name} || ($url =~ s/.*\/([^\/]+)$/$1/r));
    unless (-f $filename)
    {
        print "Fetching $url...\n";
        system "mkdir", "-p", $cache;
        if (system "wget", "-q", "-O", $filename, $url)
        {
            # wget -O creates the output file before the transfer starts; remove
            # the partial file so that the next run tries to fetch it again
            # instead of failing on the checksum forever
            unlink $filename;
            die "cannot fetch $url";
        }
    }

    my $sha = Digest::SHA->new( "sha256" )->addfile( $filename )->hexdigest;
    die "invalid checksum $sha for $filename" unless $sha eq $data->{sha};

    if ($filename =~ /\.zip$/)
    {
        open FILE, "-|", "unzip", "-p", $filename, $name or die "cannot extract $name from $filename";
    }
    elsif ($filename =~ /\.tar\.gz$/)
    {
        open FILE, "-|", "tar", "-x", "-f", $filename, "-O", $name or die "cannot extract $name from $filename";
    }
    else
    {
        # three-argument open: the file name is never parsed for a mode
        open FILE, "<", $filename or die "cannot open $filename";
    }
    $current_data_file = $name ? "$url:$name" : $url;
    return *FILE;
}
################################################################
# load a unicode.org file as XML data
#
# Takes the same arguments as open_data_file() and returns the document
# produced by XML::LibXML->load_xml for the opened file.
sub load_xml_data_file($@)
{
    my ($id, $name) = @_;
    my $fh = open_data_file( $id, $name );
    my $xml = XML::LibXML->load_xml( IO => $fh );
    # close the handle that was returned to us: open_data_file() localizes
    # *FILE, so the bareword FILE no longer refers to the opened file here
    close $fh;
    return $xml;
}
################################################################
# recursively get the decomposition for a character
#
# $char  - code point to decompose
# $table - reference to an array mapping code points to array refs of
#          code points
#
# A character without an entry in the table decomposes to itself; otherwise
# the result is the concatenation of the decompositions of every element of
# its entry.
sub get_decomposition($$);
sub get_decomposition($$)
{
    my ($char, $table) = @_;
    my $parts = $table->[$char];

    return $char unless defined $parts;
    return map { get_decomposition( $_, $table ) } @{$parts};
}
################################################################
# get the composition that results in a given character
#
# $ch     - the composed character
# $compat - 1 to also take @decomp_compat_table into account,
#           2 to also take @idna_decomp_table into account
#
# Returns the decomposition of $ch from @decomp_table, or an empty list
# when that decomposition must not be used for composing.
sub get_composition($$)
{
    my ($ch, $compat) = @_;
    my $decomp = $decomp_table[$ch];

    return () unless defined $decomp;  # no decomposition
    my @ret = @{$decomp};
    return () if @ret < 2;  # singleton decomposition
    my ($first, $second) = @ret;

    # composition exclusion, non-starter, or first char is non-starter
    return () if $comp_exclusions[$ch] || $combining_class_table[$ch] || $combining_class_table[$first];

    if ($compat == 1)
    {
        # first char has compat decomposition
        return () if !defined $decomp_table[$first] && defined $decomp_compat_table[$first];
    }
    elsif ($compat == 2)
    {
        my $first_idna = $idna_decomp_table[$first];

        # first char has IDNA decomposition
        return () if !defined $decomp_table[$first] && defined $first_idna;
        # first char's decomposition has IDNA decomposition
        return () if defined $first_idna && defined $idna_decomp_table[$first_idna->[0]];
        # second char has IDNA decomposition
        return () if defined $idna_decomp_table[$second];
    }
    return @ret;
}
################################################################
# recursively build decompositions
#
# Takes a decomposition table (a list indexed by code point) and returns a
# list with the same indices where every defined entry is replaced by a
# reference to its fully expanded decomposition, converted to UTF-16.
sub build_decompositions(@)
{
    my @src = @_;
    my @dst;

    foreach my $code (grep { defined $src[$_] } 0 .. $#src)
    {
        $dst[$code] = [ to_utf16( get_decomposition( $code, \@src )) ];
    }
    return @dst;
}
################################################################
# compose Hangul sequences
#
# Walks the sequence once, merging a leading consonant with a following
# vowel into a syllable, and such a syllable with a following trailing
# consonant.  All other values are copied unchanged.  Returns the new list.
sub compose_hangul(@)
{
    my ($SBASE, $LBASE, $VBASE, $TBASE) = (0xac00, 0x1100, 0x1161, 0x11a7);
    my ($LCOUNT, $VCOUNT, $TCOUNT) = (19, 21, 28);
    my $SCOUNT = $LCOUNT * $VCOUNT * $TCOUNT;

    my @seq = @_;
    my @ret;
    my $pos = 0;

    while ($pos < @seq)
    {
        # from here on $pos is the index of the element following $ch
        my $ch = $seq[$pos++];

        # leading consonant + vowel
        if ($pos < @seq && $ch >= $LBASE && $ch < $LBASE + $LCOUNT)
        {
            my $vowel = $seq[$pos] - $VBASE;
            if ($vowel >= 0 && $vowel < $VCOUNT)
            {
                $ch = $SBASE + (($ch - $LBASE) * $VCOUNT + $vowel) * $TCOUNT;
                $pos++;
            }
        }

        # syllable without trailing consonant + trailing consonant
        if ($pos < @seq && $ch >= $SBASE && $ch < $SBASE + $SCOUNT && ($ch - $SBASE) % $TCOUNT == 0)
        {
            my $trail = $seq[$pos] - $TBASE;
            if ($trail > 0 && $trail < $TCOUNT)
            {
                $ch += $trail;
                $pos++;
            }
        }
        push @ret, $ch;
    }
    return @ret;
}
################################################################
# remove linguistic-only mappings from the case table
#
# $upper - reference to the to-upper table (array indexed by code point)
# $lower - reference to the to-lower table (array indexed by code point)
#
# Both tables are modified in place: an entry is reset to undef unless the
# other table maps its target back to the original character.  The upper
# table is processed first, so the lower table is checked against the
# already filtered upper table.
sub remove_linguistic_mappings($$)
{
    my ($upper, $lower) = @_;

    # remove case mappings that don't round-trip

    foreach my $code (0 .. $#{$upper})
    {
        my $mapped = $upper->[$code];
        next unless defined $mapped;
        my $back = $lower->[$mapped];
        $upper->[$code] = undef unless defined $back && $back == $code;
    }
    foreach my $code (0 .. $#{$lower})
    {
        my $mapped = $lower->[$code];
        next unless defined $mapped;
        my $back = $upper->[$mapped];
        $lower->[$code] = undef unless defined $back && $back == $code;
    }
}
2068 ################################################################
2069 # read in the Unicode database files
#
# load_data() takes no arguments and returns nothing useful.  It reads, in
# this order, UnicodeData.txt, CompositionExclusions.txt,
# IdnaMappingTable.txt and Unihan_Variants.txt (each obtained through
# open_data_file) and fills the file-level tables assigned below.
# It dies when UnicodeData.txt uses a general category or bidi class that
# is missing from %categories / %directions.
2070 sub load_data()
# first code point of a "<..., First>" record, consumed when the matching
# "<..., Last>" record is seen
2072 my $start;
2074 # now build mappings from the decomposition field of the Unicode database
2076 my $UNICODE_DATA = open_data_file( "ucd", "UnicodeData.txt" );
2077 while (<$UNICODE_DATA>)
2079 # Decode the fields ...
2080 my ($code, $name, $cat, $comb, $bidi,
2081 $decomp, $dec, $dig, $num, $mirror,
2082 $oldname, $comment, $upper, $lower, $title) = split /;/;
2083 my $src = hex $code;
2085 die "unknown category $cat" unless defined $categories{$cat};
2086 die "unknown directionality $bidi" unless defined $directions{$bidi};
2088 $category_table[$src] = $categories{$cat};
2089 $direction_table[$src] = $bidi;
# initial joining type: T for Mn/Me/Cf characters, U for everything else
2090 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
2092 $initial_joining_table[$src] = $joining_types{"T"};
2094 else
2096 $initial_joining_table[$src] = $joining_types{"U"};
# simple case mappings and digit information
2099 if ($lower ne "")
2101 $tolower_table[$src] = hex $lower;
2103 if ($upper ne "")
2105 $toupper_table[$src] = hex $upper;
2107 if ($dec ne "")
2109 $category_table[$src] |= $ctype{"digit"};
2111 if ($dig ne "")
2113 $digitmap_table[$src] = ord $dig;
# category "Co" gets the out-of-range combining class 0x100 instead of the
# value from the data file
2115 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
# extra ctype flags derived from the bidi class and the character name
2117 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
# NOTE(review): in the next pattern "^" applies only to the COMBINING
# alternative and "\W" only to the MODIFIER LETTER alternative; confirm
# that /^(COMBINING|MODIFIER LETTER)\W/ was not what was meant.
2118 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
2119 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
2120 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
2121 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
2122 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
2123 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
2124 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
2125 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
2126 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
2127 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
2128 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
2130 # copy the category and direction for everything between First/Last pairs
2131 if ($name =~ /, First>/) { $start = $src; }
2132 if ($name =~ /, Last>/)
2134 while ($start < $src)
2136 $category_table[$start] = $category_table[$src];
2137 $direction_table[$start] = $direction_table[$src];
2138 $combining_class_table[$start] = $combining_class_table[$src];
2139 $start++;
2143 next if $decomp eq ""; # no decomposition, skip it
# every tagged decomposition ("<tag> xxxx ...") is recorded as a
# compatibility sequence, whatever the tag
2145 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
2147 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
2148 $decomp_compat_table[$src] = \@seq;
2151 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
2153 # decomposition of the form "<foo> 1234" -> use char if type is known
2154 my $dst = hex $2;
2155 if ($1 eq "narrow")
2157 $halfwidth_table[$dst] = $src;
2158 $fullwidth_table[$src] = $dst;
2160 elsif ($1 eq "wide")
2162 next if $dst == 0x5c; # don't remap backslash
2163 $fullwidth_table[$dst] = $src;
2164 $halfwidth_table[$src] = $dst;
2166 elsif ($1 eq "font" || $1 eq "square" || $1 eq "circle")
2168 $fullwidth_table[$src] = $dst if $src >= 0x10000;
2170 elsif ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
2172 ${joining_forms{$1}}[$dst] = $src;
2175 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
2177 # decomposition "<compat> 0020 1234" -> combining accent
2179 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
2181 # store decomposition
# only one- and two-element untagged decompositions are stored
2182 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
2184 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
2186 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
2188 my $dst = hex $1;
2189 # Single char decomposition
2190 $decomp_table[$src] = $decomp_compat_table[$src] = [ $dst ];
2191 if ($name =~ /^CJK COMPATIBILITY IDEOGRAPH/)
2193 $cjk_compat_table[$src] = $dst;
2194 $fullwidth_table[$src] = $dst if $src >= 0x10000;
2199 close $UNICODE_DATA;
2201 # patch the category of some special characters
# a character with an untagged decomposition also gets the category flags
# of the first element of that decomposition
2203 for (my $i = 0; $i < @decomp_table; $i++)
2205 next unless defined $decomp_table[$i];
2206 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
2208 foreach my $cat (keys %special_categories)
2210 my $flag = $ctype{$cat};
2211 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
# two-element decompositions inherit the diacritic flag of their second
# element
2213 for (my $i = 0; $i < @decomp_compat_table; $i++)
2215 next unless defined $decomp_compat_table[$i];
2216 next unless @{$decomp_compat_table[$i]} == 2;
2217 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
2220 # load the composition exclusions
2222 my $EXCL = open_data_file( "ucd", "CompositionExclusions.txt" );
2223 while (<$EXCL>)
2225 s/\#.*//; # remove comments
2226 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
2228 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
2230 elsif (/^([0-9a-fA-F]+)\s*$/)
2232 $comp_exclusions[hex $1] = 1;
2235 close $EXCL;
2237 # load the IDNA mappings
# start from the compatibility decompositions; the IDNA table then
# overrides individual entries
2239 @idna_decomp_table = @decomp_compat_table;
2240 my $IDNA = open_data_file( "idna", "IdnaMappingTable.txt" );
2241 while (<$IDNA>)
2243 s/\#.*//; # remove comments
2244 next if /^\s*$/;
2245 my ($char, $type, $mapping) = split /;/;
2246 my ($ch1, $ch2);
2247 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
2249 $ch1 = hex $1;
2250 $ch2 = hex $2;
2252 elsif ($char =~ /([0-9a-fA-F]+)/)
2254 $ch1 = $ch2 = hex $1;
# The status tests below are unanchored substring matches tried in this
# order, so a status that merely contains "mapped" or "valid" is handled by
# those branches before the "disallowed" test is reached.
# An empty mapping is stored as [ 0 ].
2257 if ($type =~ /mapped/ || $type =~ /deviation/)
2259 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
2260 my @seq = map { hex $_; } split /\s+/, $mapping;
2261 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
2263 elsif ($type =~ /valid/)
2266 elsif ($type =~ /ignored/)
2268 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
2270 elsif ($type =~ /disallowed/)
2272 foreach my $i ($ch1 .. $ch2)
2274 $idna_decomp_table[$i] = undef;
2275 $idna_disallowed[$i] = 1;
2279 close $IDNA;
2281 # load the Unihan mappings
# only lines whose two code points are written with exactly four hex digits
# match the patterns below
2283 my $UNIHAN = open_data_file( "unihan", "Unihan_Variants.txt" );
2284 while (<$UNIHAN>)
2286 s/\#.*//; # remove comments
2287 next if /^\s*$/;
2288 if (/^U\+([0-9a-fA-F]{4})\s+kTraditionalVariant\s+U\+([0-9a-fA-F]{4})$/)
2290 next if hex $1 < 0x4dc0; # skip extension A
2291 $chinese_traditional_table[hex $1] = hex $2;
2293 elsif (/^U\+([0-9a-fA-F]{4})\s+kSimplifiedVariant\s+U\+([0-9a-fA-F]{4})$/)
2295 next if hex $1 < 0x4dc0; # skip extension A
2296 $chinese_simplified_table[hex $1] = hex $2;
2299 close $UNIHAN;
# a CJK compatibility ideograph maps to its decomposition target in the
# simplified table, unless that target has a simplified variant of its own
2300 foreach my $i (0xf900..0xfaff)
2302 next unless defined $cjk_compat_table[$i];
2303 next if defined $chinese_simplified_table[$cjk_compat_table[$i]];
2304 $chinese_simplified_table[$i] = $cjk_compat_table[$i];
################################################################
# Register a registry key unless it is already known.
#
# $base and $key are joined with a backslash to form the index into
# %registry_keys.  A new entry is a one-element list holding $defval;
# an existing entry is left untouched.
sub add_registry_key($$$)
{
    my ($base, $key, $defval) = @_;
    my $path = "$base\\$key";

    unless (defined $registry_keys{$path})
    {
        $registry_keys{$path} = [ $defval ];
    }
}
################################################################
# Append a value line to a registry key, creating the key if needed.
#
# $value must already carry its type prefix; the stored line has the form
# "'<name>' = <value>".
sub add_registry_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    # make sure the key exists (with an undefined default value)
    add_registry_key( $base, $key, undef );

    my $entries = $registry_keys{"$base\\$key"};
    push @{$entries}, "'$name' = $value";
}
################################################################
# Append a string value to a registry key.
#
# Single quotes inside $value are doubled before the value is wrapped in
# quotes and prefixed with the "s" type marker.
sub add_registry_string_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    (my $quoted = $value) =~ s/\'/\'\'/g;
    add_registry_value( $base, $key, $name, "s '$quoted'" );
}
################################################################
# Append a dword value to a registry key.
#
# $value is interpolated as-is after the "d" type marker.
sub add_registry_dword_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    my $data = "d $value";
    add_registry_value( $base, $key, $name, $data );
}
################################################################
# Append a binary value to a registry key.
#
# Every byte of $value is written as two lowercase hex digits after the
# "b" type marker.
sub add_registry_binary_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    my @digits = map { sprintf "%02x", $_ } unpack( "C*", $value );
    add_registry_value( $base, $key, $name, "b " . join( "", @digits ) );
}
################################################################
# Declare a byte as a DBCS lead byte.
#
# Does nothing when @cp2uni already has an entry for that byte; otherwise
# the byte is appended to @lead_bytes and its @cp2uni slot is set to 0.
sub add_lead_byte($)
{
    my ($byte) = @_;

    if (defined $cp2uni[$byte])
    {
        return;
    }
    push @lead_bytes, $byte;
    $cp2uni[$byte] = 0;
}
################################################################
# Record a code page <-> Unicode mapping.
#
# Each direction is stored only if that slot is still empty, so the first
# mapping registered for a value wins.  A code page value above 0xff also
# registers its high byte as a lead byte.
sub add_mapping($$)
{
    my ($cp, $uni) = @_;

    if (!defined($cp2uni[$cp])) { $cp2uni[$cp] = $uni; }
    if (!defined($uni2cp[$uni])) { $uni2cp[$uni] = $cp; }
    add_lead_byte( $cp >> 8 ) if $cp > 0xff;
}
################################################################
# Return a copy of the given mapping with the glyph chars overlaid.
#
# Every defined entry of @glyph2uni replaces the entry with the same index
# in the copy; this is the table used for MB_USEGLYPHCHARS.
sub get_glyphs_mapping(@)
{
    my @table = @_;

    foreach my $idx (0 .. $#glyph2uni)
    {
        next unless defined $glyph2uni[$idx];
        $table[$idx] = $glyph2uni[$idx];
    }
    return @table;
}
################################################################
# Build the EUC-JP table (code page 20932) from the JIS 0208/0212 data
# files and hand it to output_codepage_file().
#
# add_mapping() never replaces an existing entry, so the order of the
# steps below decides which mapping wins.
sub dump_eucjp_codepage()
{
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = 0x30fb;

    # ASCII chars
    foreach my $code (0x00 .. 0x7f) { add_mapping( $code, $code ); }

    # lead bytes
    foreach my $byte (0x8e, 0xa1 .. 0xfe) { add_lead_byte($byte); }

    # JIS X 0201 right plane
    foreach my $byte (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $byte, 0xfec0 + $byte ); }

    # undefined chars
    foreach my $byte (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$byte] = $byte; }
    $cp2uni[0xa0] = 0xf8f0;
    $cp2uni[0xff] = 0xf8f3;

    # Fix backslash conversion
    add_mapping( 0xa1c0, 0xff3c );

    # Add private mappings for rows undefined in JIS 0208/0212:
    # first the 0xa1..0xfe trail bytes of every row, then the 0x21..0x7e ones
    my $private = 0xe000;
    foreach my $trail ([ 0xa1, 0xfe ], [ 0x21, 0x7e ])
    {
        foreach my $hi (0xf5 .. 0xfe)
        {
            foreach my $lo ($trail->[0] .. $trail->[1])
            {
                add_mapping( ($hi << 8) + $lo, $private++ );
            }
        }
    }

    # data file, offset added to the code page value, line format
    my @sources =
    (
        [ "jis0208", 0x8080, qr/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/ ],
        [ "jis0212", 0x8000, qr/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/ ],
    );
    foreach my $source (@sources)
    {
        my ($file, $offset, $pattern) = @{$source};

        my $INPUT = open_data_file( $file );
        while (<$INPUT>)
        {
            # skip comments, empty lines and ^Z
            next if /^\#/ || /^$/ || /\x1a/;
            die "Unrecognized line $_\n" unless $_ =~ $pattern;
            add_mapping( $offset + hex($1), hex($2) );
        }
        close $INPUT;
    }

    output_codepage_file( 20932 );
}
################################################################
# Build the Korean Wansung table (code page 20949) from the KSX1001 file.
#
# The argument list is a Unicode -> cp949 table; it supplies extra
# mappings for lead bytes that the KSX1001 data already defines.
sub dump_krwansung_codepage(@)
{
    my @cp949 = @_;
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = 0x3f;
    $default_wchar = 0x003f;

    # ASCII and undefined chars
    foreach my $code (0x00 .. 0x9f) { add_mapping( $code, $code ); }
    my @reserved =
    (
        [ 0xa0, 0xf8e6 ], [ 0xad, 0xf8e7 ], [ 0xae, 0xf8e8 ],
        [ 0xaf, 0xf8e9 ], [ 0xfe, 0xf8ea ], [ 0xff, 0xf8eb ],
    );
    foreach my $pair (@reserved) { add_mapping( $pair->[0], $pair->[1] ); }

    my $INPUT = open_data_file( "ksx1001" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "Unrecognized line $_\n" unless /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex($1), hex($2) );
    }
    close $INPUT;

    # get some extra mappings from cp 949; the set of lead bytes is frozen
    # here, before add_mapping() can extend @lead_bytes
    my @defined_lb;
    foreach my $byte (@lead_bytes) { $defined_lb[$byte] = 1; }

    foreach my $uni (0x0000 .. 0xffff)
    {
        next if ($uni >= 0x1100 && $uni <= 0x11ff);  # range not used in 20949
        my $cp = $cp949[$uni];
        next unless defined $cp;
        if ($cp >= 0xff)
        {
            # only add chars for lead bytes that exist in 20949
            my ($hi, $lo) = ($cp >> 8, $cp & 0xff);
            next unless $defined_lb[$hi];
            next unless $lo >= 0xa1 && $lo <= 0xfe;
        }
        add_mapping( $cp, $uni );
    }

    output_codepage_file( 20949 );
}
################################################################
# Format a list of integers as the body of a C array initializer.
#
# $bit_width selects the number of hex digits ($bit_width / 4), $default
# replaces undefined entries.  Values are separated by ", " with a line
# break after every eighth value; there is no separator after the last
# one.  An empty list still produces a single $default entry.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;
    my $last = $#array < 0 ? 0 : $#array;
    my $text = " ";

    foreach my $idx (0 .. $last)
    {
        my $value = defined $array[$idx] ? $array[$idx] : $default;
        $text .= sprintf( $format, $value );
        next if $idx == $last;
        $text .= ($idx % 8 == 7) ? ",\n " : ", ";
    }
    return $text;
}
################################################################
# Write the current single-byte code page to OUTPUT in binary format.
#
# Layout, in the order written: the header words, 12 zero bytes, the
# offset of the Unicode -> cp table followed by the 256-entry cp -> Unicode
# table, the optional glyph table (preceded by its length, or a single 0
# when there is none), two zero words, and finally one byte per 16-bit
# Unicode value.
sub dump_binary_sbcs_table($)
{
    my $codepage = shift;

    my $glyph_size = @glyph2uni ? 256 : 0;
    my $wc_offset = 256 + 3 + $glyph_size;
    my @header = ( 13, $codepage, 1, $default_char, $default_wchar,
                   $cp2uni[$default_char], $uni2cp[$default_wchar] );

    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "C12", (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0 } @cp2uni[0 .. 255];

    my @glyph_section = @glyph2uni ? (256, get_glyphs_mapping(@cp2uni[0 .. 255])) : (0);
    print OUTPUT pack "S<*", @glyph_section;

    print OUTPUT pack "S<*", 0, 0;

    # unmapped Unicode values fall back to the default char
    my @wc2mb = map { defined $_ ? $_ : $default_char } @uni2cp[0 .. 65535];
    print OUTPUT pack "C*", @wc2mb;
}
################################################################
# Write the current double-byte code page to OUTPUT in binary format.
#
# Layout, in the order written: the header words, 12 bytes of lead byte
# ranges (zero padded), the offset of the Unicode -> cp table followed by
# the 256-entry cp -> Unicode table, the number of lead byte ranges and the
# per-byte offsets of the trail byte tables, one 256-entry table per lead
# byte, and finally one word per 16-bit Unicode value.
# As a side effect the @cp2uni slot of every lead byte is reset to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # offset (in words) of the trail byte table of each lead byte
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # Pad with a list of twelve zeros.  The former "0 x 12" was string
    # repetition, i.e. the single scalar "000000000000"; the output only
    # came out right because pack fills in missing arguments itself.
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
################################################################
# Return the lead bytes as a flat list of inclusive ranges.
#
# The result is (first1, last1, first2, last2, ...) covering every run of
# consecutive values of @lead_bytes within 0 .. 255.
sub get_lb_ranges()
{
    my @is_lead = ();
    my @ranges = ();

    foreach my $byte (@lead_bytes) { $is_lead[$byte] = 1; }

    my $in_range = 0;
    foreach my $byte (0 .. 255)
    {
        my $lead = defined $is_lead[$byte] ? 1 : 0;
        next if $lead == $in_range;

        # a run starts at this byte, or ended at the previous one
        push @ranges, $lead ? $byte : $byte - 1;
        $in_range = $lead;
    }
    push @ranges, 0xff if $in_range;
    return @ranges;
}
################################################################
# Generate the Indic Syllabic Category table.
#
# The low byte of each entry comes from IndicSyllabicCategory.txt (via
# %indic_types), the high byte from IndicPositionalCategory.txt (via
# %matra_types).  The result is written to "$filename.new" and then
# passed to save_file().  Dies on malformed lines and unknown types.
sub dump_indic($)
{
    my $filename = shift;
    my @indic_table;

    my $INPUT = open_data_file( "ucd", "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown indic $type" unless defined $indic_types{$type};

        # entries reaching beyond 16 bits are ignored as a whole
        next unless $first < 65536 and $last < 65536;
        foreach my $cp ($first .. $last)
        {
            $indic_table[$cp] = $indic_types{$type};
        }
    }
    close $INPUT;

    # remember the first file name before the next open replaces it
    my $prev_data_file = $current_data_file;
    $INPUT = open_data_file( "ucd", "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown matra $type" unless defined $matra_types{$type};

        foreach my $cp ($first .. $last)
        {
            $indic_table[$cp] |= $matra_types{$type} << 8;
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $prev_data_file */\n",
                 "/* and from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# Generate the Line Break Properties table from LineBreak.txt.
#
# Each line assigns a break class, looked up in %break_types, to a single
# code point or to a range.  The result is written to "$filename.new" and
# then passed to save_file().  Dies on malformed lines and unknown classes.
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    my $INPUT = open_data_file( "ucd", "LineBreak.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        # three-character classes are tried before two-character ones
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown breaktype $type" unless defined $break_types{$type};

        foreach my $cp ($first .. $last)
        {
            $break_table[$cp] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_three_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
# Script name -> numeric script id.
#
# dump_scripts() looks up the script names read from Scripts.txt in this
# table, emits one "Script_<name> = <id>" enumerator per entry and derives
# Script_LastId from the number of entries, so the ids must stay dense and
# start at 0.  Script names missing from this table are left at id 0
# ("Unknown") in the generated mapping.
# The "Win..." comments below group the entries; presumably they name the
# Windows release that introduced the ids -- TODO confirm.
2770 my %scripts =
2772 "Unknown" => 0,
2773 "Common" => 1,
2774 "Inherited" => 2,
2775 "Arabic" => 3,
2776 "Armenian" => 4,
2777 "Avestan" => 5,
2778 "Balinese" => 6,
2779 "Bamum" => 7,
2780 "Batak" => 8,
2781 "Bengali" => 9,
2782 "Bopomofo" => 10,
2783 "Brahmi" => 11,
2784 "Braille" => 12,
2785 "Buginese" => 13,
2786 "Buhid" => 14,
2787 "Canadian_Aboriginal" => 15,
2788 "Carian" => 16,
2789 "Cham" => 17,
2790 "Cherokee" => 18,
2791 "Coptic" => 19,
2792 "Cuneiform" => 20,
2793 "Cypriot" => 21,
2794 "Cyrillic" => 22,
2795 "Deseret" => 23,
2796 "Devanagari" => 24,
2797 "Egyptian_Hieroglyphs" => 25,
2798 "Ethiopic" => 26,
2799 "Georgian" => 27,
2800 "Glagolitic" => 28,
2801 "Gothic" => 29,
2802 "Greek" => 30,
2803 "Gujarati" => 31,
2804 "Gurmukhi" => 32,
2805 "Han" => 33,
2806 "Hangul" => 34,
2807 "Hanunoo" => 35,
2808 "Hebrew" => 36,
2809 "Hiragana" => 37,
2810 "Imperial_Aramaic" => 38,
2811 "Inscriptional_Pahlavi" => 39,
2812 "Inscriptional_Parthian" => 40,
2813 "Javanese" => 41,
2814 "Kaithi" => 42,
2815 "Kannada" => 43,
2816 "Katakana" => 44,
2817 "Kayah_Li" => 45,
2818 "Kharoshthi" => 46,
2819 "Khmer" => 47,
2820 "Lao" => 48,
2821 "Latin" => 49,
2822 "Lepcha" => 50,
2823 "Limbu" => 51,
2824 "Linear_B" => 52,
2825 "Lisu" => 53,
2826 "Lycian" => 54,
2827 "Lydian" => 55,
2828 "Malayalam" => 56,
2829 "Mandaic" => 57,
2830 "Meetei_Mayek" => 58,
2831 "Mongolian" => 59,
2832 "Myanmar" => 60,
2833 "New_Tai_Lue" => 61,
2834 "Nko" => 62,
2835 "Ogham" => 63,
2836 "Ol_Chiki" => 64,
2837 "Old_Italic" => 65,
2838 "Old_Persian" => 66,
2839 "Old_South_Arabian" => 67,
2840 "Old_Turkic" => 68,
2841 "Oriya" => 69,
2842 "Osmanya" => 70,
2843 "Phags_Pa" => 71,
2844 "Phoenician" => 72,
2845 "Rejang" => 73,
2846 "Runic" => 74,
2847 "Samaritan" => 75,
2848 "Saurashtra" => 76,
2849 "Shavian" => 77,
2850 "Sinhala" => 78,
2851 "Sundanese" => 79,
2852 "Syloti_Nagri" => 80,
2853 "Syriac" => 81,
2854 "Tagalog" => 82,
2855 "Tagbanwa" => 83,
2856 "Tai_Le" => 84,
2857 "Tai_Tham" => 85,
2858 "Tai_Viet" => 86,
2859 "Tamil" => 87,
2860 "Telugu" => 88,
2861 "Thaana" => 89,
2862 "Thai" => 90,
2863 "Tibetan" => 91,
2864 "Tifinagh" => 92,
2865 "Ugaritic" => 93,
2866 "Vai" => 94,
2867 "Yi" => 95,
2868 # Win8/Win8.1
2869 "Chakma" => 96,
2870 "Meroitic_Cursive" => 97,
2871 "Meroitic_Hieroglyphs" => 98,
2872 "Miao" => 99,
2873 "Sharada" => 100,
2874 "Sora_Sompeng" => 101,
2875 "Takri" => 102,
2876 # Win10
2877 "Bassa_Vah" => 103,
2878 "Caucasian_Albanian" => 104,
2879 "Duployan" => 105,
2880 "Elbasan" => 106,
2881 "Grantha" => 107,
2882 "Khojki" => 108,
2883 "Khudawadi" => 109,
2884 "Linear_A" => 110,
2885 "Mahajani" => 111,
2886 "Manichaean" => 112,
2887 "Mende_Kikakui" => 113,
2888 "Modi" => 114,
2889 "Mro" => 115,
2890 "Nabataean" => 116,
2891 "Old_North_Arabian" => 117,
2892 "Old_Permic" => 118,
2893 "Pahawh_Hmong" => 119,
2894 "Palmyrene" => 120,
2895 "Pau_Cin_Hau" => 121,
2896 "Psalter_Pahlavi" => 122,
2897 "Siddham" => 123,
2898 "Tirhuta" => 124,
2899 "Warang_Citi" => 125,
2900 # Win10 RS1
2901 "Adlam" => 126,
2902 "Ahom" => 127,
2903 "Anatolian_Hieroglyphs" => 128,
2904 "Bhaiksuki" => 129,
2905 "Hatran" => 130,
2906 "Marchen" => 131,
2907 "Multani" => 132,
2908 "Newa" => 133,
2909 "Old_Hungarian" => 134,
2910 "Osage" => 135,
2911 "SignWriting" => 136,
2912 "Tangut" => 137,
2913 # Win10 RS4
2914 "Masaram_Gondi" => 138,
2915 "Nushu" => 139,
2916 "Soyombo" => 140,
2917 "Zanabazar_Square" => 141,
2918 # Win10 1903
2919 "Dogra" => 142,
2920 "Gunjala_Gondi" => 143,
2921 "Hanifi_Rohingya" => 144,
2922 "Makasar" => 145,
2923 "Medefaidrin" => 146,
2924 "Old_Sogdian" => 147,
2925 "Sogdian" => 148,
2926 # Win10 2004
2927 "Elymaic" => 149,
2928 "Nyiakeng_Puachue_Hmong" => 150,
2929 "Nandinagari" => 151,
2930 "Wancho" => 152,
2931 # Win11
2932 "Chorasmian" => 153,
2933 "Dives_Akuru" => 154,
2934 "Khitan_Small_Script" => 155,
2935 "Yezidi" => 156,
################################################################
# Generate the Script IDs header and lookup table from Scripts.txt.
#
# $filename is the base name of the output: "$filename.h" receives the
# unicode_script_id enumeration built from %scripts, "$filename.c" the
# code point -> script id mapping.  Each file is written as "<name>.new"
# and then passed to save_file().
# Lines that match neither data format and script names missing from
# %scripts are skipped silently.
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( "ucd", "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            next;
        }
        next unless defined $scripts{$type};

        foreach my $cp ($first .. $last)
        {
            $scripts_table[$cp] = $scripts{$type};
        }
    }
    close $INPUT;

    my $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT " Script_$script = $scripts{$script},\n";
    }
    print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    my $source = "$filename.c";
    # report the file that actually failed (the message used to name the header)
    open OUTPUT,">$source.new" or die "Cannot create $source";
    print "Building $source\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_three_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($source);
}
################################################################
# Generate the BiDi mirroring table from BidiMirroring.txt.
#
# Every data line maps one code point to its mirrored counterpart.  The
# result is written to "$filename.new" and then passed to save_file().
# Dies on malformed lines.
sub dump_mirroring($)
{
    my $filename = shift;
    my @mirror_table = ();

    my $INPUT = open_data_file( "ucd", "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/;
        $mirror_table[hex($1)] = hex($2);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
################################################################
# Generate the Bidi Brackets table from BidiBrackets.txt.
#
# Each entry holds the bracket type (from %bracket_types) in the high byte
# and the distance to the paired bracket in the low byte, stored as an
# 8-bit two's complement value.  The result is written to "$filename.new"
# and then passed to save_file().
# Dies on malformed lines, unknown types, and pairs that are 128 or more
# code points apart.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( "ucd", "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my $type = $3;
            my $char = hex($1);
            my $delta = hex($2) - $char;
            die "unknown bracket $type" unless defined $bracket_types{$type};
            die "characters too distant $1 and $2" if abs($delta) >= 128;
            # Mask instead of "% 255": the modulus turned -1 into 254, which
            # reads back as -2 when the low byte is treated as a signed char.
            # Non-negative distances are unaffected by the change.
            $bracket_table[$char] = ($delta & 0xff) | ($bracket_types{$type} << 8);
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# Generate the Arabic shaping tables.
#
# wine_shaping_table starts from @initial_joining_table and is overridden
# with the joining type of every character listed in ArabicShaping.txt.
# wine_shaping_forms lists, for each code point in 0x600 .. 0x6ff, its
# isolated, final, initial and medial form, falling back to the code point
# itself where %joining_forms has no entry.  The result is written to
# "$filename.new" and then passed to save_file().
sub dump_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( "ucd", "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        $joining_table[hex($1)] = $joining_types{$2};
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short wine_shaping_forms[256][4] =\n{\n";
    foreach my $cp (0x600 .. 0x6ff)
    {
        my @forms = map { $joining_forms{$_}->[$cp] || $cp } ( "isolated", "final", "initial", "medial" );
        printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
################################################################
# Generate the three-level Arabic shaping table (arabic_shaping_table).
#
# The table starts from @initial_joining_table and is overridden with the
# data of ArabicShaping.txt: characters of the joining groups ALAPH and
# DALATH RISH get the value %joining_types holds for the group, all others
# the value for their joining type.  The result is written to
# "$filename.new" and then passed to save_file().
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( "ucd", "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        # The group name may consist of several space-separated words; with
        # a plain (\w+) capture only its first word was seen, so the
        # "DALATH RISH" comparison below could never succeed.
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?: \w+)*)/)
        {
            my $type = $2;
            my $group = $3;

            # fall back to the joining type if the group has no entry in
            # %joining_types (defined elsewhere in this file)
            if (($group eq "ALAPH" || $group eq "DALATH RISH") && defined $joining_types{$group})
            {
                $joining_table[hex $1] = $joining_types{$group};
            }
            else
            {
                $joining_table[hex $1] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_three_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# Generate the Vertical Orientation table from VerticalOrientation.txt.
#
# $filename is the output file (written as "$filename.new", then passed to
# save_file()); a true $unix adds a "#pragma makedep unix" block to the
# generated file.  Dies on malformed lines and unknown orientation values.
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( "ucd", "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        # skip comments, empty lines and ^Z
        next if /^\#/ || /^\s*$/ || /\x1a/;

        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            my ($cp, $type) = (hex($1), $2);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            # only this single-code-point form is limited to 16 bits
            $vertical_table[$cp] = $vertical_types{$type} if $cp < 65536;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            foreach my $cp ($first .. $last)
            {
                $vertical_table[$cp] = $vertical_types{$type};
            }
        }
        else
        {
            die "malformed line $_";
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
        print OUTPUT "#if 0\n",
                     "#pragma makedep unix\n",
                     "#endif\n\n";
    }
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
################################################################
# Compress a mapping table by sharing identical or overlapping rows.
#
# The table (the arguments after $rows and $def) is split into $rows rows
# of equal length, with undefined cells replaced by $def.  A row already
# present somewhere in the accumulated data is reused; otherwise it is
# appended, after dropping a tail of the data that matches the start of
# the new row.
# Returns the $rows row offsets (each biased by $rows, i.e. relative to
# the start of the returned list) followed by the compressed data.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $len = @table / $rows;
    my @offsets;
    my $blob = "";

    foreach my $row (0 .. $rows - 1)
    {
        my @cells = @table[($row * $len) .. (($row + 1) * $len - 1)];
        my $packed = pack "U*", map { defined($_) ? $_ : $def } @cells;

        my $at = index $blob, $packed;
        if ($at == -1)
        {
            # check if the tail of the data can match the start of the new row
            my $lead = substr( $packed, 0, 1 );
            my $tail = length($blob) - 1;
            while ($tail > 0)
            {
                my $hit = index( substr( $blob, -$tail ), $lead );
                last if $hit == -1;
                $tail -= $hit;
                if (substr( $blob, -$tail ) eq substr( $packed, 0, $tail ))
                {
                    # drop the shared tail, the full row is appended below
                    substr( $blob, -$tail ) = "";
                    last;
                }
                $tail--;
            }
            $at = length $blob;
            $blob .= $packed;
        }
        $offsets[$row] = $rows + $at;
    }
    return @offsets, unpack "U*", $blob;
}
################################################################
# Print a char -> value mapping as a C array using two-level tables.
#
# Arguments: the C array name, the default for undefined values, the
# element size in bits (16 selects "unsigned short", anything else
# "unsigned int"), then the values; only indices 0 .. 65535 are used.
# The array is written to OUTPUT as 256 level 1 offsets, the level 2
# offsets and the compressed values.
sub dump_two_level_mapping($$$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $ctype = ($size == 16) ? "unsigned short" : "unsigned int";

    # 4096 rows of 16 values, then 256 rows of 16 row offsets
    my @data = compress_array( 4096, $def, @_[0..65535] );
    my @row_array = splice( @data, 0, 4096 );
    my @row_data = compress_array( 256, 0, @row_array );
    my @array = splice( @row_data, 0, 256 );

    # rebase the level 2 offsets onto the final layout
    my $delta = @row_data + 256 - 4096;
    foreach my $offset (@row_data) { $offset += $delta; }

    printf OUTPUT "const %s %s[%d] =\n{\n", $ctype, $name, @array + @row_data + @data;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @row_data );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @data );
}
################################################################
# Print a char -> value mapping as a C array using three-level tables.
#
# Arguments: the C array name, the default for undefined values, the
# element size in bits (16 selects "unsigned short", anything else
# "unsigned int"), then the values for indices 0 .. $MAX_CHAR.
# Each level groups 16 entries of the level below it.  The array is
# written to OUTPUT as level 1, level 2 and level 3 offsets followed by
# the compressed values.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $ctype = ($size == 16) ? "unsigned short" : "unsigned int";

    # number of rows at each level
    my $rows3 = ($MAX_CHAR + 1) / 16;
    my $rows2 = $rows3 / 16;
    my $rows1 = $rows2 / 16;

    # each list holds its row offsets first, then its compressed data
    my @tbl3 = compress_array( $rows3, $def, @_[0..$MAX_CHAR] );
    my @tbl2 = compress_array( $rows2, 0, @tbl3[0..$rows3-1] );
    my @tbl1 = compress_array( $rows1, 0, @tbl2[0..$rows2-1] );

    # rebase the data part of the two upper levels onto the final layout
    my $delta2 = @tbl1 + @tbl2 - $rows2 - $rows3;
    my $delta1 = @tbl1 - $rows2;
    foreach my $offset (@tbl2[$rows2 .. $#tbl2]) { $offset += $delta2; }
    foreach my $offset (@tbl1[$rows1 .. $#tbl1]) { $offset += $delta1; }

    printf OUTPUT "const %s %s[%u] =\n{\n", $ctype, $name, @tbl1 + (@tbl2 - $rows2) + (@tbl3 - $rows3);
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @tbl1[0..$rows1-1] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @tbl1[$rows1..$#tbl1] );
    printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @tbl2[$rows2..$#tbl2] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @tbl3[$rows3..$#tbl3] );
}
################################################################
# dump a binary case mapping table in l_intl.nls format
#
# Takes a char -> mapped char table and returns the packed binary table.
# Entries are stored as differences from the source character. Tables
# longer than 0x10000 entries get an extra set of 32-bit values covering
# 0x10000..$MAX_CHAR.
sub dump_binary_case_table(@)
{
    my @table = @_;
    my @difftable;

    foreach my $ch (0 .. $#table)
    {
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffffffff if defined $table[$ch];
    }

    # two-level compression of the first 65536 characters
    my @low_data = compress_array( 4096, 0, @difftable[0..65535] );
    my @low_array2 = splice( @low_data, 0, 4096 );
    my @low_row_data = compress_array( 256, 0, @low_array2 );
    my @low_array1 = splice( @low_row_data, 0, 256 );

    if (@table > 0x10000)
    {
        # same compression for the characters above 0xffff
        my @high_data = compress_array( 32768, 0, @difftable[65536..$MAX_CHAR] );
        my @high_array2 = splice( @high_data, 0, 32768 );
        my @high_row_data = compress_array( 1024, 0, @high_array2 );
        my @high_array1 = splice( @high_row_data, 0, 1024 );

        # every offset is rebased using the size of the result so far
        my @res = map { $_ + 1024; } @low_array1;
        my $base = @res;
        push @res, map { $_ + $base + @low_row_data + @low_data; } @high_array1;
        $base = @res;
        push @res, map { $_ + $base + @low_row_data - 4096; } @low_row_data;
        push @res, @low_data;
        $base = @res;
        push @res, map { 2 * ($_ - 32768) + $base + @high_row_data; } @high_row_data;
        return pack( "S<*", 1 + scalar @res + 2 * scalar @high_data, @res ) . pack( "L<*", @high_data );
    }

    my @res = @low_array1;
    my $base = @res;
    push @res, map { $_ + $base + @low_row_data - 4096; } @low_row_data;
    push @res, @low_data;
    return pack "S<*", 1 + scalar @res, @res;
}
################################################################
# dump case mappings for l_intl.nls
#
# Writes a version word followed by the upper-case and lower-case binary
# tables (first 65536 characters, linguistic mappings removed) to
# "$filename.new", then hands $filename to save_file().
sub dump_intl_nls($)
{
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table[0..65535] );
    my $lower = dump_binary_case_table( @lower_table[0..65535] );

    # three-argument open: the file name is never parsed for a mode
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the bidi direction table
#
# Writes a C source file defining "bidi_direction_table" to
# "$filename.new", then hands $filename to save_file(). Characters without
# a direction entry get the value of the "L" type.
sub dump_bidi_dir_table($)
{
    my $filename = shift;

    # three-argument open: the file name is never parsed for a mode
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < @direction_table; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_three_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
# rotate the low 8 bits of a value left by the given number of bits
sub rol($$)
{
    my ($byte, $count) = @_;
    my $shifted_out = $byte >> (8 - $count);
    my $shifted_up = $byte << $count;
    return ($shifted_up | $shifted_out) & 0xff;
}
################################################################
# compress the character properties table
#
# Arguments: the number of rows, then the flat properties table.
# Returns one id per row followed by the contents of every new row.
# Rows made of a single repeated predefined value get that value's id
# (0, 0xfb..0xff). Other rows are numbered from 1 in order of first
# appearance and identical rows share an id. Undefined entries count
# as 0x7f.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my $next_id = 0;
    my @result = (0) x $rows;
    my %seen;

    # add some predefined sequences
    foreach my $fill (0, 0xfb .. 0xff)
    {
        $seen{pack "L*", (rol($fill,5)) x $len} = $fill;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my @cells = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $key = pack "L*", @cells;
        unless (defined($seen{$key}))
        {
            # first time this row is seen: give it an id and keep its data
            $seen{$key} = ++$next_id;
            push @result, @cells;
        }
        $result[$row] = $seen{$key};
    }
    return @result;
}
3446 ################################################################
3447 # dump a normalization table in binary format
#
# Writes "$filename.new" through the OUTPUT handle, hands $filename to
# save_file() and calls add_registry_string_value() for the "Normalization"
# key. The normalization form ("nfc", "nfd", "nfkc", "nfkd" or "idna") is
# extracted from the file name, which is matched against ".../norm<form>.nls".
3448 sub dump_norm_table($)
3450 my $filename = shift;
# form name -> numeric form id, and the decomposition table used for each form
3452 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
3453 my %decomp = ( "nfc" => \@decomp_table,
3454 "nfd" => \@decomp_table,
3455 "nfkc" => \@decomp_compat_table,
3456 "nfkd" => \@decomp_compat_table ,
3457 "idna" => \@idna_decomp_table );
3459 open OUTPUT,">$filename.new" or die "Cannot create $filename";
3460 print "Building $filename\n";
3462 my $type = $filename;
3463 $type =~ s!.*/norm(\w+)\.nls!$1!;
# bit 0 of the form id marks a composing form; $compat is non-zero when
# bit 2 is set, and is 2 for "idna" (bit 2 of 13 plus the name test)
3465 my $compose = $forms{$type} & 1;
3466 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
3468 my @version = split /\./, $UNIVERSION;
3470 # combining classes
# mark every combining class value below 0x100 that occurs, then replace
# each mark with its index into @class_values
3472 my @classes;
3473 my @class_values;
3475 foreach my $c (grep defined, @combining_class_table)
3477 $classes[$c] = 1 if $c < 0x100;
3479 for (my $i = 0; $i < @classes; $i++)
3481 next unless defined $classes[$i];
3482 $classes[$i] = @class_values;
3483 push @class_values, $i;
# keep the number of class values even: the table offsets below count
# them in pairs
3485 push @class_values, 0 if (@class_values % 2);
3486 die "too many classes" if @class_values >= 0x40;
3488 # character properties
3490 my @char_props;
3491 my @decomposed;
3492 my @comp_hash_table;
3493 my $comp_hash_size = $compose ? 254 : 0;
# characters without a combining class entry are skipped, so their
# property stays undefined
3495 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3497 next unless defined $combining_class_table[$i];
3498 if (defined $decomp{$type}->[$i])
3500 my @dec = get_decomposition( $i, $decomp{$type} );
3501 if ($compose && (my @comp = get_composition( $i, $compat )))
# composed character: record it in the composition hash bucket selected
# by its two components
3503 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
3504 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
# the property is the class index of the first decomposition element
# that has a non-zero combining class
3506 my $val = 0;
3507 foreach my $d (@dec)
3509 $val = $combining_class_table[$d];
3510 last if $val;
3512 $char_props[$i] = $classes[$val];
3514 else
3516 $char_props[$i] = 0xbf;
3518 @dec = compose_hangul( @dec ) if $compose;
3519 @dec = to_utf16( @dec );
# NOTE(review): presumably a terminator for long sequences, since the
# length field stored further down saturates at 7 -- confirm
3520 push @dec, 0 if @dec >= 7;
3521 $decomposed[$i] = \@dec;
3523 else
# no decomposition: the property depends only on the combining class
# (and on the idna disallowed list)
3525 if ($combining_class_table[$i] == 0x100)
3527 $char_props[$i] = 0x7f;
3529 elsif ($combining_class_table[$i])
3531 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
3533 elsif ($type eq "idna" && defined $idna_disallowed[$i])
3535 $char_props[$i] = 0xff;
3537 else
3539 $char_props[$i] = 0;
# flag both components of every composition; the bits used depend on
# whether the second component has a non-zero combining class
3544 if ($compose)
3546 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3548 my @comp = get_composition( $i, $compat );
3549 next unless @comp;
3550 if ($combining_class_table[$comp[1]])
3552 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
3553 $char_props[$comp[1]] |= 0x40;
3555 else
3557 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
3558 $char_props[$comp[1]] |= 0xc0;
3563 # surrogates
3564 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
3565 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
3567 # Hangul
3568 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
3569 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
3570 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
3572 # invalid chars
3573 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
3574 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
# the last two code points of each of the 17 planes
3575 foreach my $i (0x00..0x10)
3577 $char_props[($i << 16) | 0xfffe] = 0xff;
3578 $char_props[($i << 16) | 0xffff] = 0xff;
3581 # decomposition hash table
3583 my @decomp_hash_table;
3584 my @decomp_hash_index;
3585 my @decomp_hash_data;
3586 my $decomp_hash_size = 944;
3588 # build string of character data, reusing substrings when possible
# longest sequences are added first so that shorter ones can be found
# inside them
3589 my $decomp_char_data = "";
3590 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
3592 my $str = pack "U*", @{$i};
3593 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
# bucket each decomposed character by code point modulo the hash size;
# an entry is [ char, (length << 13) | position in the character data ]
3595 for (my $i = 0; $i < @decomposed; $i++)
3597 next unless defined $decomposed[$i];
3598 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
3599 die "sequence not found" if $pos == -1;
3600 my $len = @{$decomposed[$i]};
3601 $len = 7 if $len > 7;
3602 my $hash = $i % $decomp_hash_size;
3603 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
# the index normally holds the position of the bucket's first entry pair
# in @decomp_hash_data; a bucket with a single entry whose property is
# 0xbf stores the packed length/position value in the index instead
3605 for (my $i = 0; $i < $decomp_hash_size; $i++)
3607 $decomp_hash_index[$i] = @decomp_hash_data / 2;
3608 next unless defined $decomp_hash_table[$i];
3609 if (@{$decomp_hash_table[$i]} == 1)
3611 my $entry = $decomp_hash_table[$i]->[0];
3612 if ($char_props[$entry->[0]] == 0xbf)
3614 $decomp_hash_index[$i] = $entry->[1];
3615 next;
3618 foreach my $entry (@{$decomp_hash_table[$i]})
3620 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
3623 push @decomp_hash_data, 0, 0;
3625 # composition hash table
# the index has one extra slot holding the end of the last bucket
3627 my @comp_hash_index;
3628 my @comp_hash_data;
3629 if (@comp_hash_table)
3631 for (my $i = 0; $i < $comp_hash_size; $i++)
3633 $comp_hash_index[$i] = @comp_hash_data;
3634 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
3636 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
3637 push @comp_hash_data, 0, 0, 0;
# properties are compressed in rows of 128 characters
3640 my $level1 = ($MAX_CHAR + 1) / 128;
3641 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
3643 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
3644 0, $decomp_hash_size, $comp_hash_size, 0 );
3645 my @tables = (0) x 8;
# table offsets are accumulated in 16-bit units: the tables written below
# with "C*" are counted as half their length, and the initial 16 matches
# the 16-word name written first
3647 $tables[0] = 16 + @header + @tables;
3648 $tables[1] = $tables[0] + @class_values / 2;
3649 $tables[2] = $tables[1] + $level1 / 2;
3650 $tables[3] = $tables[2] + (@rows - $level1) / 2;
3651 $tables[4] = $tables[3] + @decomp_hash_index;
3652 $tables[5] = $tables[4] + @decomp_hash_data;
3653 $tables[6] = $tables[5] + length $decomp_char_data;
3654 $tables[7] = $tables[6] + @comp_hash_index;
# the tables are written in the same order as the offsets were computed
3656 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
3657 print OUTPUT pack "S<*", @header;
3658 print OUTPUT pack "S<*", @tables;
3659 print OUTPUT pack "C*", @class_values;
3661 print OUTPUT pack "C*", @rows[0..$level1-1];
3662 print OUTPUT pack "C*", @rows[$level1..$#rows];
3663 print OUTPUT pack "S<*", @decomp_hash_index;
3664 print OUTPUT pack "S<*", @decomp_hash_data;
3665 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
3666 print OUTPUT pack "S<*", @comp_hash_index;
3667 print OUTPUT pack "S<*", @comp_hash_data;
3669 close OUTPUT;
3670 save_file($filename);
3672 add_registry_string_value( $nlskey, "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
################################################################
# output a codepage definition file from the global tables
#
# Writes "nls/c_<codepage>.nls" (through a ".new" file and save_file())
# and calls add_registry_string_value() for the "Codepage" key. The DBCS
# writer is used when lead bytes are defined, the SBCS writer otherwise.
sub output_codepage_file($)
{
    my ($codepage) = @_;
    my $output = sprintf "nls/c_%03d.nls", $codepage;

    open OUTPUT,">$output.new" or die "Cannot create $output";
    printf "Building %s\n", $output;

    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    my $value_name = sprintf( "%d", $codepage );
    my $file_name = sprintf( "c_%03d.nls", $codepage );
    add_registry_string_value( $nlskey, "Codepage", $value_name, $file_name );
}
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Parses the given file from the "codepages" data set, fills the global
# @cp2uni, @glyph2uni, @lead_bytes and @uni2cp tables plus the default
# characters, then writes the codepage file. Codepage 949 additionally
# goes through dump_krwansung_codepage(). Dies on an unrecognized line.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";
    my ($codepage, $width, $count);
    my ($lb_cur, $lb_end);

    # reset the global tables left over from the previous codepage
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( "codepages", $filename );

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # both values are hexadecimal; the first class used to read "A-f",
        # which also accepted G-Z and some punctuation
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        # a section header selects how the following data lines are read
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # in every table the first mapping seen for a code wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # when the table is exhausted move on to the next lead byte;
                # past the end of the range, expect a new DBCSRANGE line
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
################################################################
# align a string length
#
# Returns the string padded with zero bytes up to the next multiple of
# the alignment; a string that is already aligned is returned unchanged.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    if ($excess)
    {
        $str .= pack "C*", (0) x ($align - $excess);
    }
    return $str;
}
################################################################
# pad a string with zeros
#
# Returns the string extended with zero bytes to at least the given
# length; longer strings are returned unchanged.
sub pad_string($$)
{
    my ($pad, $str) = @_;
    my $missing = $pad - length($str);
    if ($missing > 0)
    {
        $str .= pack "C*", (0) x $missing;
    }
    return $str;
}
################################################################
# pack a GUID string
#
# Takes a GUID in the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx and returns
# its 16-byte binary form: one little-endian 32-bit value, two little-endian
# 16-bit values and eight bytes. Dies when the string contains no GUID.
# The caller's $_ is left untouched.
sub pack_guid($)
{
    my $guid = shift;
    # a failed match must not fall through to stale capture variables
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/
        or die "Invalid GUID '$guid'\n";
    return pack "L<S<2C8", map { hex($_) } @fields;
}
################################################################
# comparison function for compression sort
#
# Sort comparator (uses $a and $b): shorter rows come first, rows of
# equal length are ordered by their elements 4 to 12 in turn.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    for (my $i = 4; !$order && $i <= 12; $i++)
    {
        $order = $a->[$i] <=> $b->[$i];
    }
    return $order;
}
################################################################
# build a binary sort keys table
#
# Parses the "sorting" data file, then writes "$filename.new" with a
# 16-byte header followed by the sort key table (default keys plus the
# per-GUID exception tables), the case mapping tables, the character type
# table and the sort tables, and hands $filename to save_file(). Locale to
# sort GUID associations are added under the "Sorting\Ids" registry key.
# Returns the packed character type table.
sub dump_sortkey_table($)
{
    my $filename = shift;
    my @keys;
    my ($part, $section, $subsection, $guid, $version, $ling_flag);
    my @multiple_weights;
    my @expansions;
    my @compressions;
    my %exceptions;
    my %guids;
    my %compr_flags;
    my %locales;
    my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
    my $jamostr = "";

    my $re_hex = '0x[0-9A-Fa-f]+';
    my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
    $guids{$default_guid} = { };

    my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );

    my $KEYS = open_data_file( "sorting" );

    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    while (<$KEYS>)
    {
        s/\s*;.*$//;
        next if /^\s*$/;  # skip empty lines
        if (/^\s*(SORTKEY|SORTTABLES)/)
        {
            $part = $1;
            next;
        }
        if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
        {
            $part = $section = "";
            next;
        }
        if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
        {
            $section = $1;
            $guid = undef;
            next;
        }
        next unless $part;
        if ("$part.$section" eq "SORTKEY.DEFAULT")
        {
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $keys[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.RELEASE")
        {
            if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                $version = hex $1;
                next;
            }
            if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                # ignore for now
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
               "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
               "$part.$section" eq "SORTTABLES.INVERSECASING")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $guids{$guid}->{flags} |= $flags{$section};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
        {
            if (/^\s*(\d+)\s+(\d+)/)
            {
                push @multiple_weights, $1, $2;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                # the key of an expanded character holds the index of its
                # expansion pair
                my $pos = scalar @expansions / 2;
                $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
                push @expansions, hex $2, hex $3;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                $keys[hex $1] = $keys[hex $2];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
            {
                if ($subsection || !$guid)  # start a new one
                {
                    $guid = lc $1;
                    $subsection = "";
                    $guids{$guid} = { } unless defined $guids{$guid};
                    $guids{$guid}->{flags} |= $flags{$2} if $2;
                    $guids{$guid}->{compr} = @compressions;
                    $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
                    $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
                    push @compressions, [ ];
                }
                else  # merge with current one
                {
                    $guids{lc $1} = { } unless defined $guids{lc $1};
                    $guids{lc $1}->{flags} |= $flags{$2} if $2;
                    $guids{lc $1}->{compr} = $guids{$guid}->{compr};
                    $compr_flags{lc $1} = $compr_flags{$guid};
                }
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
            {
                $subsection = $1;
                next;
            }
            if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
            {
                my @comp = map { hex $_; } split(/\s+/,$1);
                push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
                # add compression flags
                $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{lc $1};
                $ling_flag = ($2 ? "+" : "-");
                $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
        {
            if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
            {
                $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
                next;
            }
        }
        die "$current_data_file: $part.$section: unrecognized line $_\n";
    }
    close $KEYS;

    # Sortkey table

    my $table;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
        $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
    }

    # each exception set gets a 256-entry index followed by the full
    # 256-key blocks that contain an exception or a compression flag
    foreach my $id (sort keys %exceptions)
    {
        my $pos = length($table) / 4;
        my @exc = @{$exceptions{$id}};
        my @filled;
        my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
        my $guid = substr( $id, 0, -1 );
        $guids{$guid}->{$key} = $pos;
        $pos += 0x100;
        # explicit empty list instead of "my ... if ...", whose behaviour
        # is undefined
        my @flags = defined $compr_flags{$guid} ? @{$compr_flags{$guid}} : ();
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless defined $exc[$j] || defined $flags[$j];
            $filled[$j >> 8] = 1;
            $j |= 0xff;  # skip to the next block
        }
        for (my $j = 0; $j < 0x100; $j++)
        {
            $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
            $pos += 0x100 if $filled[$j];
        }
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless $filled[$j >> 8];
            my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
            $k[3] |= $flags[$j] || 0;
            $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
        }
    }

    # Case mapping tables

    # standard table
    my @casemaps;
    my @upper = @toupper_table;
    my @lower = @tolower_table;
    remove_linguistic_mappings( \@upper, \@lower );
    $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );

    # linguistic table
    $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );

    # Turkish table
    @upper = @toupper_table;
    @lower = @tolower_table;
    $upper[ord 'i'] = 0x130;  # LATIN CAPITAL LETTER I WITH DOT ABOVE
    $lower[ord 'I'] = 0x131;  # LATIN SMALL LETTER DOTLESS I
    $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
    my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );

    # Char type table

    # every distinct triple of type words is stored once in $types and
    # characters refer to it by index
    my @table;
    my $types = "";
    my %typestr;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my $str = pack "S<3",
                       ($category_table[$i] || 0) & 0xffff,
                       defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
                       ($category_table[$i] || 0) >> 16;

        if (!defined($typestr{$str}))
        {
            $typestr{$str} = length($types) / 6;
            $types .= $str;
        }
        $table[$i] = $typestr{$str};
    }

    my (@rows, @array, @data, @row_data);
    (@rows[0..4095], @data) = compress_array( 4096, 0, @table[0..65535] );
    (@array[0..255], @row_data) = compress_array( 256, 0, @rows );
    for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
    for (my $i = 0; $i < @row_data; $i++) { $row_data[$i] += 2 * @row_data + 512 - 4096; }

    my $arraystr = pack("S<*", @array, @row_data) . pack("C*", @data);
    my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
    $chartypes = align_string( 8, $chartypes . $types . $arraystr );

    # Sort tables

    # guids
    # "scalar keys" is the number of entries on every Perl version, whereas
    # a bare hash in scalar context only became a key count in Perl 5.26
    my $sorttables = pack "L<2", $version, scalar keys %guids;
    foreach my $id (sort keys %guids)
    {
        my %guid = %{$guids{$id}};
        my $flags = $guid{flags} || 0;
        my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
        $sorttables .= pack_guid($id) . pack "L<5",
                                             $flags,
                                             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
                                             $guid{except} || 0,
                                             $guid{ling_except} || 0,
                                             $map / 2;
    }

    # expansions
    $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;

    # compressions
    $sorttables .= pack "L<", scalar @compressions;
    my $rowstr = "";
    foreach my $c (@compressions)
    {
        my $pos = length($rowstr) / 2;
        my $min = 0xffff;
        my $max = 0;
        my @lengths = (0) x 8;
        foreach my $r (sort cmp_compression @{$c})
        {
            my @row = @{$r};
            # a row is 4 key values plus its characters, so a pair lands in slot 0
            $lengths[scalar @row - 6]++;
            foreach my $val (@row[4..$#row])
            {
                $min = $val if $min > $val;
                $max = $val if $max < $val;
            }
            $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
            $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
        }
        $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
    }
    $sorttables .= $rowstr;

    # multiple weights
    $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );

    # jamo sort
    $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;

    # Locales

    add_registry_key( $nlskey, "Sorting\\Ids", "{$default_guid}" );
    foreach my $loc (sort keys %locales)
    {
        # skip specific locales that match more general ones
        my @parts = split /[-_]/, $loc;
        next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
        next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
        add_registry_string_value( $nlskey, "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
    }

    # File header

    # running byte offsets: the 16-byte header itself, then the end of each
    # of the first three sections
    my @header;
    $header[0] = 16;
    $header[1] = $header[0] + length $table;
    $header[2] = $header[1] + length $casemaps;
    $header[3] = $header[2] + length $chartypes;

    # three-argument open: the file name is never parsed for a mode
    open OUTPUT, ">", "$filename.new" or die "Cannot create $filename: $!";
    print OUTPUT pack "L<*", @header;
    print OUTPUT $table, $casemaps, $chartypes, $sorttables;
    close OUTPUT;
    save_file($filename);
    return $chartypes;
}
# locale name -> locale entry lookup; assigned in build_locale_data() and
# read by the locale helper functions below
4193 my %lcnames;
# return the parent of a locale name: its "sparent" entry if defined,
# else its "parent" entry, else the name with its last "-xxx" component
# removed, else the empty string; returns undef for an empty name
sub locale_parent($)
{
    my ($loc) = @_;

    return undef unless $loc;

    my $entry = $lcnames{$loc};
    if (defined $entry)
    {
        foreach my $field (qw(sparent parent))
        {
            return $entry->{$field} if defined $entry->{$field};
        }
    }
    return $1 if $loc =~ /(.*)-[0-9A-Za-z]+/;
    return "";
}
# sort comparator (uses $a and $b): compare locale names ignoring ASCII
# case and treating "_" like "-"
sub compare_locales
{
    my ($left, $right) = ($a, $b);
    tr/A-Z_/a-z-/ foreach $left, $right;
    return $left cmp $right;
}
# query an xml key
#
# Runs the query through the find() method of the given object and returns
# the text content of the first result, or undef when nothing is found.
# A warning goes to STDERR when the query has more than one result.
sub xml_query($$)
{
    my ($xml, $query) = @_;
    my $nodes = $xml->find( $query );
    if ($nodes)
    {
        my $count = @{$nodes};
        printf STDERR "multiple entries for %s\n", $query if ($count > 1);
        return $nodes->[0]->textContent;
    }
    return undef;
}
# query an xml key for a locale, with fallback to the parents
#
# Walks from the locale up through locale_parent() and returns the text of
# the first result that is not the "↑↑↑" inheritance marker, or undef when
# no locale in the chain provides one.
sub loc_query($$)
{
    my ($loc, $query) = @_;

    $loc = $lcnames{"en-US"} unless $loc->{name};  # fallback to "en-US" for root locale

    my $cur = $loc->{name};
    while (defined $cur)
    {
        my $entry = $lcnames{$cur};
        if (defined $entry)
        {
            my $ret = $entry->{xml}->find( $query );
            if ($ret)
            {
                printf STDERR "%s: multiple entries for %s\n", $cur, $query if (@{$ret} > 1);
                my $text = @{$ret}[0]->textContent;
                return $text unless $text eq "\x{2191}\x{2191}\x{2191}";  # "↑↑↑"
            }
        }
        $cur = locale_parent( $cur );
    }
    return undef;
}
# retrieve a locale field entry by going up the parents tree
#
# Lookup order: the locale itself, "en-US" when the locale has no name,
# the chain of aliases, then each ancestor reached through "sparent" or by
# removing the last "-xxx" name component. Returns $def when none of them
# defines the field.
sub locale_entry($$$)
{
    my ($loc, $field, $def) = @_;

    return $loc->{$field} if defined $loc->{$field};

    if (!$loc->{name})  # fallback to "en-US" for root locale
    {
        $loc = $lcnames{"en-US"};
        return $loc->{$field} if defined $loc->{$field};
    }
    while (defined $loc->{alias})  # resolve aliases
    {
        $loc = $lcnames{$loc->{alias}};
        return $loc->{$field} if defined $loc->{$field};
    }

    my $cur = $loc->{name};
    while ($cur)
    {
        my $known = $lcnames{$cur};
        if (defined $known && defined $known->{sparent})
        {
            $cur = $known->{sparent};
        }
        elsif ($cur =~ /(.*)-[0-9A-Za-z]+/)
        {
            $cur = $1;
        }
        else
        {
            last;  # no parent left
        }
        my $parent = $lcnames{$cur};
        return $parent->{$field} if defined $parent && defined $parent->{$field};
    }
    return $def;
}
# accumulated string data; add_str_data() appends to it and returns
# offsets into it in 16-bit units
4280 my $string_data;
# add a block of data to the string table, reusing an existing copy when
# there is one, and return its offset in 16-bit units
sub add_str_data($)
{
    my $txt = shift;
    my $ret = index( $string_data, $txt );
    # offsets are returned in 16-bit units, so a match at an odd byte
    # position cannot be used; look for the next occurrence instead
    $ret = index( $string_data, $txt, $ret + 1 ) while $ret != -1 && $ret % 2;
    if ($ret == -1)
    {
        $ret = length($string_data);
        $string_data .= $txt;
    }
    return $ret / 2;
}
# add a string to the string table as a 16-bit length, the UTF-16LE text
# and a 16-bit zero; returns its offset, or 0 for an undefined or empty string
sub add_string($)
{
    my ($str) = @_;
    return 0 if !defined($str) || $str eq "";

    my $utf = encode( "UTF16LE", $str );
    my $count = pack "S<", length($utf) / 2;
    my $terminator = pack "S", 0;
    return add_str_data( $count . $utf . $terminator );
}
# add a list of 32-bit values to the string table, preceded by a 16-bit
# count of twice the number of values; returns the offset of the data
sub add_fontsig(@)
{
    my @values = @_;
    my $words = scalar(@values) * 2;
    my $blob = pack "S<L<*", $words, @values;
    return add_str_data( $blob );
}
# add an array of strings to the string table as a 16-bit count followed
# by the 32-bit offset of each string; returns 0 for an empty list
sub add_strarray(@)
{
    my @strings = @_;
    return 0 unless @strings;

    my @offsets;
    foreach my $str (@strings)
    {
        push @offsets, add_string($str);
    }
    return add_str_data( pack "S<L<*", scalar @strings, @offsets );
}
# convert a number format pattern to a grouping string: one character per
# group whose code is the group size, primary group first; a pattern
# without grouping information yields a single group of 3
sub format_to_grouping($)
{
    my ($format) = @_;

    if (my ($secondary, $primary) = $format =~ /#,(#+),(#+0)/)
    {
        return chr(length($primary)) . chr(length($secondary));
    }
    if (my ($single) = $format =~ /#,(#+0)/)
    {
        return chr(length($single));
    }
    return chr(3);
}
# convert a currency format pattern ("positive" or "positive;negative")
# to a pair of indices (positive format, negative format)
#
# Each index is the position of the first matching pattern in the lists
# below, or 0 when none matches. Without a negative part the negative index
# is derived from the positive one. $name is only useful for diagnostics.
sub parse_currency_format($$)
{
    my ($name, $format) = @_;
    my ($posfmt, $negfmt) = split /;/, $format;
    my @pospatterns = ( "\xa4[^\xa0]*#",         # $1.1
                        "00[^\xa0]*\xa4",        # 1.1$
                        "\xa4.*\xa0.*#",         # $ 1.1
                        "00.*\xa0.*\xa4" );      # 1.1 $
    my @negpatterns = ( "\\(\xa4[^\xa0]*#",      # ($1.1)
                        "-\xa4[^\xa0]*#",        # -$1.1
                        "\xa4[^\xa0]*-#",        # $-1.1
                        "\xa4[^\xa0]*#.*00-",    # $1.1-
                        "00[^\xa0]*\xa4\\)",     # (1.1$)
                        "-#.*00[^\xa0]*\xa4",    # -1.1$
                        "00-[^\xa0]*\xa4",       # 1.1-$
                        "00[^\xa0]*\xa4-",       # 1.1$-
                        "-#.*00.*\xa0.*\xa4",    # -1.1 $
                        "-\xa4.*\xa0.*#",        # -$ 1.1
                        "00.*\xa0.*\xa4-",       # 1.1 $-
                        "\xa4.*\xa0.*#.*00-",    # $ 1.1-
                        "\xa4.*\xa0.*-#",        # $ -1.1
                        "00-.*\xa0.*\xa4",       # 1.1- $
                        "\\(\xa4.*\xa0.*#",      # ($ 1.1)
                        "00.*\xa0.*\xa4\\)");    # (1.1 $)

    # index of the first pattern matching the format, 0 when none does
    my $first_match = sub
    {
        my ($fmt, @patterns) = @_;
        foreach my $idx (0 .. $#patterns)
        {
            return $idx if ($fmt =~ /$patterns[$idx]/);
        }
        return 0;
    };

    my $pos = $first_match->( $posfmt, @pospatterns );
    my $neg = defined $negfmt ? $first_match->( $negfmt, @negpatterns )
                              : (1, 5, 9, 8)[$pos];  # default negative format for each positive one
    return ($pos, $neg);
}
# convert a percent format pattern to a pair of indices
#
# The first index is the position of the first matching pattern below.
# The second is the same index, except that 3 becomes 7. An unknown format
# is reported on STDERR and yields the number of patterns as index.
sub parse_percent_format($)
{
    my ($fmt) = @_;
    my @patterns = ( "0.+%",    # 1 %
                     "0%",      # 1%
                     "%#",      # %1
                     "%.+#" );  # % 1
    my $pos = 0;
    $pos++ while $pos < @patterns && $fmt !~ /$patterns[$pos]/;
    printf STDERR "unknown format '%s'\n", $fmt if ($pos == @patterns);
    return ($pos, ($pos == 3) ? 7 : $pos);
}
# rewrite a date format pattern: era becomes "gg", stand-alone month names
# become "MMMM"/"MMM", day names become "dddd" and a lone "y" becomes
# "yyyy"; each substitution is applied once
sub convert_date_format($)
{
    my ($fmt) = @_;
    for ($fmt)
    {
        s/G+/gg/;
        s/LLLL/MMMM/;
        s/LLL/MMM/;
        s/E+/dddd/;
        s/ccc+/dddd/;
        # a single "y" in the middle, at the start or at the end
        s/([^gy])y([^y])/$1yyyy$2/;
        s/^y([^y])/yyyy$1/;
        s/([^gy])y$/$1yyyy/;
    }
    return $fmt;
}
# rewrite a time format pattern: the first run of "a" and the first run of
# "B" each become "tt", and the first U+202F becomes a plain space
sub convert_time_format($)
{
    my ($fmt) = @_;
    for ($fmt)
    {
        s/a+/tt/;
        s/B+/tt/;
        s/\x{202f}/ /;
    }
    return $fmt;
}
# load the ISO 639-3 code table and return a hash that maps the two-letter
# code found in the fourth column to the three-letter code of the third
# column; lines without a two-letter code are ignored
sub load_iso639()
{
    my %codes;
    my $table = "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3_$ISO639VERSION.tab";
    my $DATA = open_data_file( "iso639", $table );
    while (<$DATA>)
    {
        next unless /^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/;
        $codes{$2} = $1;
    }
    close $DATA;
    return %codes;
}
4424 ################################################################
4425 # build the locale table for locale.nls
4426 sub build_locale_data()
4428 my $base = "cldr-release-$CLDRVERSION";
4429 my $suppl = load_xml_data_file( "cldr", "$base/common/supplemental/supplementalData.xml" );
4430 my $subtags = load_xml_data_file( "cldr", "$base/common/supplemental/likelySubtags.xml" );
4431 my $numbers = load_xml_data_file( "cldr", "$base/common/supplemental/numberingSystems.xml" );
4432 # obsolete phone data from CLDR version 33
4433 my $phone = load_xml_data_file( "cldr33", "common/supplemental/telephoneCodeData.xml" );
4434 my %iso639 = load_iso639();
4435 $string_data = pack "S2", 0, 0; # offset 0 == empty string
4437 %lcnames = map { $_->{name} => $_ } @locales;
4439 my %lcids;
4440 foreach my $loc (@locales) { $lcids{$loc->{lcid}} = $loc if defined $loc->{lcid}; }
4442 my %days = ( "sun" => 0, "mon" => 1, "tue" => 2, "wed" => 3, "thu" => 4, "fri" => 5, "sat" => 6 );
4444 # assign locale parents
4446 foreach my $loc (@locales)
4448 next if $loc->{name} eq "";
4449 next if defined $loc->{parent};
4450 (my $unix_name = $loc->{name}) =~ s/-/_/g;
4451 my $parent = xml_query( $suppl, "/supplementalData/parentLocales[not(\@component)]/parentLocale[contains(concat(' ',\@locales,' '),' $unix_name ')]/\@parent" );
4452 if ($parent)
4454 $parent =~ s/_/-/g;
4455 $parent = "" if $parent eq "root";
4457 elsif ($loc->{name} =~ /(.*)-[0-9A-Za-z]+/) { $parent = $1; }
4458 $loc->{parent} = $parent || "";
4461 # load per-locale XML files
4463 foreach my $loc (@locales)
4465 next if defined $loc->{alias};
4466 (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
4467 $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
4468 my $xml = load_xml_data_file( "cldr", $file );
4469 $loc->{xml} = $xml;
4470 $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
4471 $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
4472 $loc->{script} = xml_query( $xml, "/ldml/identity/script/\@type" );
4473 if (!defined($loc->{territory}) && $loc->{name} =~ /-([A-Z]{2}|[0-9]{3})$/) { $loc->{territory} = $1; }
4474 if (!defined($loc->{script}) && $loc->{name} =~ /-([A-Z][a-z]{3})(-[A-Z]{2})?$/) { $loc->{script} = $1; }
4477 # assign a default territory and sort locale
4479 foreach my $loc (@locales)
4481 next if defined $loc->{alias};
4482 next if defined $loc->{territory};
4483 my $id = $loc->{sortlocale};
4484 if (defined $id && ($id =~ /[-_]([A-Z0-9]+)$/))
4486 $loc->{territory} = $1;
4487 next;
4489 my @children = grep /^$loc->{name}-[A-Z0-9]+$/ && !defined $lcnames{$_}->{alias}, keys %lcnames;
4490 if (@children == 1)
4492 $id = $children[0];
4494 else
4496 my $name = $loc->{file} || $loc->{name};
4497 $name =~ s/-(Arab|Beng|Cyrl|Deva|Guru|Hans|Hant|Latn|Tfng|Vaii)$//;
4498 $name =~ s/-/_/g;
4499 $id = xml_query( $subtags, "/supplementalData/likelySubtags/likelySubtag[\@from='$name']/\@to" );
4500 $id =~ s/_/-/g if $id;
4502 if ($id =~ /[-_]([A-Z0-9]+)$/)
4504 $loc->{territory} = $1;
4505 next if defined $loc->{sortlocale};
4506 next unless $id =~ /^$loc->{name}/;
4507 while (defined $lcnames{$id} && defined $lcnames{$id}->{alias}) { $id = $lcnames{$id}->{alias}; }
4508 $loc->{sortlocale} = $id if defined $lcnames{$id};
4509 next;
4511 print STDERR "no territory found for $loc->{name}\n";
4514 # fill geoid table
4516 my %geotable;
4517 foreach my $geo (@geoids)
4519 my $name = $geo->{name};
4520 next unless defined $name;
4521 $geo->{alias} = $geotable{$name} if defined $geotable{$name};
4522 $geotable{$name} ||= $geo;
4524 foreach my $loc (@locales)
4526 next if defined $loc->{alias};
4527 my $territory = $loc->{territory};
4528 $geotable{$territory} ||= { name => $territory };
4530 foreach my $name (keys %geotable)
4532 my $geo = $geotable{$name};
4533 $geo->{dialcode} = xml_query( $phone, "(/supplementalData/telephoneCodeData/codesByTerritory[\@territory='$name']/telephoneCountryCode)[1]/\@code" );
4534 if ($name =~ /\d+/)
4536 $geo->{uncode} = $name;
4537 next;
4539 $geo->{iso2} = $name;
4540 $geo->{iso3} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@alpha3");
4541 $geo->{uncode} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@numeric");
4542 $geo->{sintlsymbol} ||= xml_query( $suppl, "(/supplementalData/currencyData/region[\@iso3166='$name']/currency[not(\@to)])[1]/\@iso4217") || "XXX";
4543 $geo->{sintlsymbol} =~ s/XXX/XDR/;
4545 foreach my $geo (@geoids)
4547 $geo->{parentid} = $geotable{$geo->{parent}}->{id} if defined $geo->{parent};
4548 next if defined $geo->{iso2};
4549 next if defined $geo->{alias};
4550 next unless defined $geo->{uncode};
4551 my @contains;
4552 my $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and not(\@status)]/\@contains");
4553 push @contains, split /\s+/, $list if defined $list;
4554 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and \@status='deprecated']/\@contains");
4555 push @contains, split /\s+/, $list if defined $list;
4556 while (@contains)
4558 my $territory = pop @contains;
4559 if (defined $geotable{$territory})
4561 $geotable{$territory}->{parentid} ||= $geo->{id};
4563 elsif ($territory =~ /\d+/)
4565 # expand region recursively
4566 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$territory' and not(\@status)]/\@contains" );
4567 push @contains, split /\s+/, $list if defined $list;
4572 # assign calendars to their locale
4574 foreach my $cal (@calendars)
4576 next unless defined $cal->{locale};
4577 my $loc = $lcnames{$cal->{locale}};
4578 $loc->{calendar} = [ ] unless defined $loc->{calendar};
4579 push @{$loc->{calendar}}, $cal;
4582 # assign default lcid to aliases
4584 foreach my $loc (@locales)
4586 next unless defined $loc->{alias};
4587 next if defined $loc->{lcid};
4588 my $alias = $loc->{alias};
4589 my $lcid = $lcnames{$alias}->{lcid} || 0x1000;
4590 $loc->{lcid} = $lcid | 0x80000000;
4593 # assign sort aliases to parent locale
4595 foreach my $loc (@locales)
4597 next unless $loc->{name} =~ /_/;
4598 next unless defined $loc->{alias};
4599 my $alias = $loc->{alias};
4600 my $parent = $lcnames{$alias};
4601 my $basename = $parent->{name};
4602 while (1)
4604 @{$parent->{sortnames}}[($loc->{lcid} >> 16) - 1] = $loc->{name};
4605 $alias = locale_parent( $alias );
4606 last unless $alias && defined $lcnames{$alias};
4607 $parent = $lcnames{$alias};
4608 last if defined $parent->{sortbase} && $parent->{sortbase} ne $basename;
4609 $parent->{sortbase} = $basename;
4613 # assign an array index to all locales
4615 my $idx = 0;
4616 foreach my $loc (@locales)
4618 next if defined $loc->{alias};
4619 $loc->{idx} = $idx++;
4621 foreach my $loc (@locales)
4623 my $alias = $loc->{alias};
4624 next unless defined $alias;
4625 while (defined $lcnames{$alias}->{alias}) { $alias = $lcnames{$alias}->{alias}; }
4626 $loc->{idx} = $lcnames{$alias}->{idx};
4629 # output lcids table
4631 my $lcid_data = "";
4632 foreach my $id (sort { $a <=> $b } keys %lcids)
4634 my $loc = $lcids{$id};
4635 $lcid_data .= pack "L<S<2", $id, $loc->{idx}, add_string($loc->{name});
4638 # output lcnames table
4640 my $lcname_data = "";
4641 foreach my $name (sort compare_locales keys %lcnames)
4643 my $loc = $lcnames{$name};
4644 $lcname_data .= pack "S<2L<", add_string($name), $loc->{idx}, $loc->{lcid} || 0x1000;
4647 # output locales array
4649 my $locale_data = "";
4650 my $default_lcid = 0x8001;
4651 foreach my $loc (@locales)
4653 next if defined $loc->{alias};
4654 my $sname = $loc->{name};
4655 my $language = $loc->{language};
4656 my $territory = $loc->{territory};
4657 my $script = $loc->{script};
4658 my $neutral = ($sname && $sname !~ /-$territory/);
4659 my $sparent = $loc->{sparent} || (($sname =~ /(.*)-[0-9A-Za-z]+/) ? $1 : $loc->{parent});
4660 my $unique_lcid = $loc->{lcid};
4661 unless (defined $unique_lcid) { $unique_lcid = $default_lcid++; }
4662 my $geo = $geotable{$territory};
4663 my $territory_match = "contains(concat(' ',normalize-space(\@territories),' '),' $territory ')";
4665 # languages and scripts
4667 my $ssortlocale = $loc->{sortlocale} || ($neutral ? "$sname-$territory" : $sname);
4668 my $idefaultlanguage = defined $lcnames{$ssortlocale} ? $lcnames{$ssortlocale}->{lcid} : undef;
4669 $idefaultlanguage = $lcnames{"en-US"}->{lcid} unless $ssortlocale;
4670 (my $siso639langname = $sname) =~ s/-.*$//;
4671 my $siso639langname2 = $iso639{$siso639langname} || $siso639langname;
4672 my $sopentypelang = sprintf "%-4s", locale_entry( $loc, "sopentypelang", uc $siso639langname2 );
4673 my $sabbrevlangname = defined $loc->{lcid} ? locale_entry( $loc, "sabbrevlangname", uc $siso639langname2 ) : "ZZZ";
4674 my $siso3166ctryname2 = $geo->{iso3} || $geo->{uncode};
4675 my $senglanguage = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" ) || "";
4676 my $sengcountry = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" ) || "";
4677 my $snativelangname = loc_query( $loc, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" );
4678 my $snativectryname = loc_query( $loc, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" );
4679 $sengcountry =~ s/South Korea/Korea/;
4680 $sengcountry =~ s/T\xfcrkiye/Turkey/;
4681 $snativelangname ||= $senglanguage;
4682 $snativectryname ||= $sengcountry;
4683 if ($script)
4685 my $engscript = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4686 my $nativescript = loc_query( $loc, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4687 $senglanguage .= " ($engscript)" if $engscript;
4688 $snativelangname .= " ($nativescript)" if $nativescript;
4690 my $sengdisplayname = $neutral ? $senglanguage : "$senglanguage ($sengcountry)";
4691 my $snativedisplayname = $neutral ? $snativelangname : "$snativelangname ($snativectryname)";
4692 $sengdisplayname =~ s/\) \(/, /;
4693 $snativedisplayname =~ s/\) \(/, /;
4694 my $sscripts = locale_entry( $loc, "sscripts", $script ) || xml_query( $suppl, "/supplementalData/languageData/language[\@type='$language' and not(\@alt)]/\@scripts" );
4695 $sscripts = (join ";", (sort split / /, ($sscripts || "Latn"))) . ";";
4696 my $ireadinglayout = locale_entry( $loc, "ireadinglayout", 0 );
4697 my $charlayout = loc_query( $loc, "/ldml/layout/orientation/characterOrder" );
4698 if ($charlayout eq "right-to-left")
4700 $ireadinglayout = 1;
4702 elsif ($charlayout eq "top-to-bottom")
4704 my $linelayout = loc_query( $loc, "/ldml/layout/orientation/lineOrder" );
4705 $ireadinglayout = $linelayout eq "right-to-left" ? 2 : 3;
4707 my $igeoid = $geo->{id} || 0;
4709 # numbers
4711 my $sdecimal = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/decimal" );
4712 my $slist = locale_entry( $loc, "slist", ";" );
4713 my $smondecimalsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyDecimal" ) || $sdecimal;
4714 my $sthousand = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/group" );
4715 $sthousand =~ s/\x{202f}/\x{00a0}/;
4716 my $smonthousandsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyGroup" ) || $sthousand;
4717 my $spositivesign = "";
4718 my $snegativesign = "-";
4719 my $spercent = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/percentSign" );
4720 my $snan = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/nan" );
4721 my $sposinfinity = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/infinity" );
4722 my $sneginfinity = $sposinfinity ? "-$sposinfinity" : "";
4723 my $sgrouping = format_to_grouping( loc_query( $loc, "/ldml/numbers/decimalFormats[\@numberSystem='latn']/decimalFormatLength[not(\@type)]/decimalFormat/pattern" ));
4724 my $percentformat = loc_query( $loc, "/ldml/numbers/percentFormats[\@numberSystem='latn']/percentFormatLength[not(\@type)]/percentFormat/pattern" );
4725 my $currencyformat = loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='accounting']/pattern[not(\@alt)]" ) ||
4726 loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='standard']/pattern[not(\@alt)]" );
4727 my $smongrouping = format_to_grouping( $currencyformat );
4728 my ($icurrency, $inegcurr) = parse_currency_format( $sname, $currencyformat );
4729 my ($ipospercent, $inegpercent) = parse_percent_format( $percentformat );
4730 my $native_numbering = loc_query( $loc, "/ldml/numbers/otherNumberingSystems/native" );
4731 my @snativedigits = split //, (locale_entry( $loc, "nativedigits", "" ) || xml_query( $numbers, "/supplementalData/numberingSystems/numberingSystem[\@id='$native_numbering']/\@digits" ));
4732 my $digitsubstitution = !(ord($snativedigits[0]) >= 0x600 && ord($snativedigits[0]) <= 0x6ff);
4733 my $measure = defined xml_query( $suppl, "/supplementalData/measurementData/measurementSystem[\@type='US' and $territory_match]" );
4734 my $papersize = defined xml_query( $suppl, "/supplementalData/measurementData/paperSize[\@type='US-Letter' and $territory_match]" );
4736 # currencies
4738 my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
4739 my $scurrency = $geo->{scurrency} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[\@alt='narrow']" );
4740 $scurrency ||= loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[not(\@alt)]" );
4741 $geo->{scurrency} = $scurrency if $scurrency;
4742 $scurrency ||= $sintlsymbol;
4743 my $sengcurrname = $loc->{sengcurrname} || loc_query( $lcnames{en}, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" );
4744 my $snativecurrname = $loc->{sengcurrname} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" ) || $sengcurrname;
4745 my $icurrdigits = xml_query( $suppl, "/supplementalData/currencyData/fractions/info[\@iso4217='$sintlsymbol']/\@digits" );
4746 $icurrdigits = 2 unless defined $icurrdigits;
4748 # calendars
4750 my $firstday = xml_query( $suppl, "/supplementalData/weekData/firstDay[not(\@alt) and $territory_match]/\@day" );
4751 my $ifirstdayofweek = $firstday ? $days{$firstday} : 1;
4752 my $firstweekofyear = (xml_query( $suppl, "/supplementalData/weekData/minDays[$territory_match]/\@count" ) || 0) == 4 ? 2 : 0;
4753 my $serastring = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/eras/eraAbbr/era[\@type='1' and not(\@alt)]" );
4754 my (@sdayname, @sabbrevdayname, @sshortestdayname);
4755 foreach my $d (sort { $days{$a} <=> $days{$b} } keys %days)
4757 my $n = $days{$d};
4758 my %name;
4759 foreach my $type (qw(wide abbreviated short))
4761 $name{$type} = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/days/dayContext[\@type='format']/dayWidth[\@type='$type']/day[\@type='$d' and not(\@alt)]" );
4763 push @sdayname, $name{wide};
4764 push @sabbrevdayname, $name{abbreviated} || $name{wide};
4765 push @sshortestdayname, $name{short} || $name{abbreviated} || $name{wide};
4767 my (@smonthname, @sabbrevmonthname, @sgenitivemonth, @sabbrevgenitivemonth);
4768 foreach my $n (1..13)
4770 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='wide']/month[\@type='$n']" );
4771 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4772 my $genitive = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n']" );
4773 my $abbrevgen = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4774 push @smonthname, $name || $genitive || "";
4775 push @sabbrevmonthname, $abbrev || $abbrevgen || $name || $genitive || "";
4776 push @sgenitivemonth, $genitive || "";
4777 push @sabbrevgenitivemonth, $abbrevgen || $genitive || "";
4779 @sgenitivemonth = () if join("|",@smonthname) eq join("|",@sgenitivemonth);
4780 @sabbrevgenitivemonth = () if join("|",@sabbrevmonthname) eq join("|",@sabbrevgenitivemonth);
4781 my %caltypes = ( "gregorian" => 1, "japanese" => 3, "chinese" => 4, "dangi" => 5, "islamic" => 6, "buddhist" => 7, "hebrew" => 8,
4782 "persian" => 22, "islamic-civil" => 23, "islamic-umalqura" => 23 );
4783 my $calpref = xml_query( $suppl, "/supplementalData/calendarPreferenceData/calendarPreference[$territory_match]/\@ordering" ) || "gregorian";
4784 my $icalendartype;
4785 my @scalnames;
4786 foreach my $c (split /\s+/, $calpref)
4788 next unless defined $caltypes{$c};
4789 $icalendartype .= chr($caltypes{$c});
4790 $scalnames[$caltypes{$c} - 1] = loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$c']" );
4793 # date/time formats
4795 my $s1159 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='am' and not(\@alt)]" );
4796 my $s2359 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='pm' and not (\@alt)]" );
4797 my $sshortestam = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='am' and not(\@alt)]" );
4798 my $sshortestpm = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='pm' and not (\@alt)]" );
4799 my @stimeformat = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='medium']/timeFormat/pattern[not(\@alt)]" ));
4800 push @stimeformat, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4801 pop @stimeformat if $stimeformat[0] eq $stimeformat[1];
4802 @stimeformat = map convert_time_format($_), @stimeformat;
4803 my @sshorttime = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='short']/timeFormat/pattern[not(\@alt)]" ));
4804 push @sshorttime, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hm' and not(\@alt)]" );
4805 pop @sshorttime if $sshorttime[0] eq $sshorttime[1];
4806 @sshorttime = map convert_time_format($_), @sshorttime;
4807 my @sshortdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4808 push @sshortdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
4809 @sshortdate = map convert_date_format($_), @sshortdate;
4810 my @slongdate = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" ));
4811 push @slongdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
4812 @slongdate = map convert_date_format($_), @slongdate;
4813 my @smonthday = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMd' and not(\@alt)]" ));
4814 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Md' and not(\@alt)]" );
4815 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMd' and not(\@alt)]" );
4816 @smonthday = map convert_date_format($_), @smonthday;
4817 my @syearmonth = map convert_date_format($_), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMM' and not(\@alt)]" );
4818 my @sduration = map convert_time_format( lc $_ ), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4819 my $srelativelongdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMEd' and not(\@alt)]" ) ||
4820 loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMEd' and not(\@alt)]" );
4821 $srelativelongdate = convert_date_format( $srelativelongdate );
4823 if (defined $loc->{calendar})
4825 foreach my $cal (@{$loc->{calendar}})
4827 $cal->{sshortdate} = \@sshortdate;
4828 $cal->{syearmonth} = \@syearmonth;
4829 $cal->{slongdate} = \@slongdate;
4830 $cal->{serastring} = [ $serastring ];
4831 $cal->{sdayname} = \@sdayname;
4832 $cal->{sabbrevdayname} = \@sabbrevdayname;
4833 $cal->{smonthname} = \@smonthname;
4834 $cal->{sabbrevmonthname} = \@sabbrevmonthname;
4835 $cal->{scalname} = $scalnames[$cal->{id}];
4836 $cal->{smonthday} = \@smonthday;
4837 $cal->{sshortestdayname} = \@sshortestdayname;
4838 $cal->{sabbreverastring} = [ $serastring ];
4839 $cal->{sshortestdayname} = \@sshortestdayname;
4840 $cal->{srelativelongdate} = $srelativelongdate;
4844 # codepages
4846 my %ansicpmap = ( 437 => 1252, 720 => 1256, 737 => 1253, 775 => 1257, 850 => 1252,
4847 852 => 1250, 855 => 1251, 866 => 1251, 857 => 1254, 862 => 1255 );
4848 my %maccpmap = ( 437 => 10000, 720 => 10004, 737 => 10006, 775 => 10029, 850 => 10000,
4849 852 => 10029, 855 => 10007, 857 => 10081, 862 => 10005, 866 => 10007,
4850 874 => 10021, 932 => 10001, 936 => 10008, 949 => 10003, 950 => 10002,
4851 1258 => 10000 );
4852 my %ebcdiccpmap = ( 437 => 37, 720 => 20420, 737 => 20273, 866 => 20880, 932 => 20290 );
4853 my %codepagemasks = ( 874 => [ 0x01000000, 0x00000000, 0x00000000, 0, 0x00010000, 0x00000000, 0x00010000, 0x00000000 ],
4854 932 => [ 0x00000000, 0x28c70000, 0x00000010, 0, 0x00020000, 0x00000000, 0x00020000, 0x00000000 ],
4855 936 => [ 0x00000000, 0x28010000, 0x00000002, 0, 0x00040000, 0x00000000, 0x00040000, 0x00000000 ],
4856 949 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00080000, 0x00000000, 0x00080000, 0x00000000 ],
4857 950 => [ 0x00000000, 0x28c10000, 0x00000012, 0, 0x00100000, 0x00000000, 0x00100000, 0x00000000 ],
4858 1258 => [ 0x2000000f, 0x00000000, 0x00000000, 0, 0x00000100, 0x00008000, 0x00000100, 0x00008000 ],
4859 866 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x00020000, 0x00000004, 0x02020000 ],
4860 862 => [ 0x00000800, 0x40000000, 0x00000000, 0, 0x00000020, 0x00200000, 0x00000020, 0x00200000 ],
4861 857 => [ 0x0000001f, 0x00000000, 0x00000000, 0, 0x00000010, 0x01000000, 0x00000010, 0x01000000 ],
4862 855 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x02000000, 0x00000004, 0x02000000 ],
4863 852 => [ 0x00000027, 0x00000000, 0x00000000, 0, 0x00000002, 0x04000000, 0x00000002, 0x04000000 ],
4864 775 => [ 0x00000007, 0x00000000, 0x00000000, 0, 0x00000080, 0x08000000, 0x00000080, 0x08000000 ],
4865 737 => [ 0x00000080, 0x00000000, 0x00000000, 0, 0x00000008, 0x10000000, 0x00000008, 0x10010000 ],
4866 720 => [ 0x00002000, 0x00000000, 0x00000000, 0, 0x00000040, 0x20000000, 0x00000040, 0x20080000 ],
4867 850 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x40000000, 0x0000019f, 0xdfd70000 ],
4868 437 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x80000000, 0x0000019f, 0xdfd70000 ],
4869 65001 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00000000, 0x00000000, 0x0000019f, 0xdfd70000 ] );
4870 my $oemcp = locale_entry( $loc, "oemcp", 65001 );
4871 my $maccp = locale_entry( $loc, "maccp", undef ) || $maccpmap{$oemcp} || 65001;
4872 my $ebcdiccp = locale_entry( $loc, "ebcdiccp", undef ) || $ebcdiccpmap{$oemcp} || 500;
4873 $ebcdiccp = 500 if (defined $loc->{oemcp} && $loc->{oemcp} == 65001) || (defined $loc->{maccp} && $loc->{maccp} == 65001);
4874 my $ansicp = $ansicpmap{$oemcp} || $oemcp;
4875 my @fontsig = (0) x 8;
4876 my $sig = locale_entry( $loc, "fontsig", [] );
4877 foreach my $i (0..7) { $fontsig[$i] |= $codepagemasks{$oemcp}->[$i]; }
4878 foreach my $i (0..$#{$sig}) { $fontsig[$i] |= $sig->[$i]; }
4879 $fontsig[3] |= 1 << 31;
4880 $fontsig[3] |= 1 << 27 if $ireadinglayout == 1;
4881 $fontsig[3] |= 1 << 28 if $ireadinglayout == 3;
4883 # special cases for invariant locale
4885 unless ($loc->{name})
4887 $siso639langname = "iv";
4888 $siso639langname2 = "ivl";
4889 $senglanguage = $snativelangname = "Invariant Language";
4890 $sengcountry = $snativectryname = "Invariant Country";
4891 $sengdisplayname = "Invariant Language (Invariant Country)";
4892 $snativedisplayname = "Invariant Language (Invariant Region)";
4893 $sengcurrname = $snativecurrname = "International Monetary Fund";
4894 $scurrency = "\x{00a4}";
4895 $ifirstdayofweek = 0;
4896 $igeoid = $geotable{"US"}->{id};
4897 @stimeformat = ("HH:mm:ss");
4898 @sshortdate = ("MM/dd/yyyy", "yyyy-MM-dd");
4899 @slongdate = ("dddd, dd MMMM yyyy");
4900 @syearmonth = ("yyyy MMMM");
4901 @smonthday = ("MMMM dd", "MMMM d", "M/d", "MMM d");
4902 @sshorttime = ("HH:mm", "hh:mm tt", "H:mm", "h:mm tt");
4903 $srelativelongdate = "dddd, MMMM dd";
4904 $sposinfinity = "Infinity";
4905 $sneginfinity = "-Infinity";
4906 $spositivesign = "+";
4907 $ipospercent = $inegpercent = 0;
4910 # output data
4912 $locale_data .= pack "L<2",
4913 add_string( $sname ), # name
4914 add_string( $sopentypelang ); # LOCALE_SOPENTYPELANGUAGETAG
4916 $locale_data .= pack "S<14",
4917 $loc->{lcid} || 0x1000, # LOCALE_ILANGUAGE
4918 $unique_lcid, # unique_lcid
4919 locale_entry( $loc, "idigits", 2 ), # LOCALE_IDIGITS
4920 locale_entry( $loc, "inegnumber", 1 ), # LOCALE_INEGNUMBER
4921 $icurrdigits, # LOCALE_ICURRDIGITS
4922 $icurrency, # LOCALE_ICURRENCY
4923 $inegcurr, # LOCALE_INEGCURR
4924 locale_entry( $loc, "ilzero", 1 ), # LOCALE_ILZERO
4925 !$neutral, # LOCALE_INEUTRAL
4926 $ifirstdayofweek, # LOCALE_IFIRSTDAYOFWEEK
4927 $firstweekofyear, # LOCALE_IFIRSTWEEKOFYEAR
4928 $geo->{dialcode} || 1 , # LOCALE_ICOUNTRY,
4929 $measure, # LOCALE_IMEASURE
4930 $digitsubstitution; # LOCALE_IDIGITSUBSTITUTION
4932 $locale_data .= pack "L<18",
4933 add_string( $sgrouping ), # LOCALE_SGROUPING
4934 add_string( $smongrouping ), # LOCALE_SMONGROUPING
4935 add_string( $slist ), # LOCALE_SLIST
4936 add_string( $sdecimal ), # LOCALE_SDECIMAL
4937 add_string( $sthousand ), # LOCALE_STHOUSAND
4938 add_string( $scurrency ), # LOCALE_SCURRENCY
4939 add_string( $smondecimalsep ), # LOCALE_SMONDECIMALSEP
4940 add_string( $smonthousandsep ), # LOCALE_SMONTHOUSANDSEP
4941 add_string( $spositivesign ), # LOCALE_SPOSITIVESIGN
4942 add_string( $snegativesign ), # LOCALE_SNEGATIVESIGN
4943 add_string( $s1159 ), # LOCALE_S1159
4944 add_string( $s2359 ), # LOCALE_S2359
4945 add_strarray( @snativedigits ), # LOCALE_SNATIVEDIGITS
4946 add_strarray( @stimeformat ), # LOCALE_STIMEFORMAT
4947 add_strarray( @sshortdate ), # LOCALE_SSHORTDATE
4948 add_strarray( @slongdate ), # LOCALE_SLONGDATE
4949 add_strarray( @syearmonth ), # LOCALE_SYEARMONTH
4950 add_strarray( @sduration ); # LOCALE_SDURATION
4952 $locale_data .= pack "S<8",
4953 $idefaultlanguage || 0x1000, # LOCALE_IDEFAULTLANGUAGE
4954 $ansicp, # LOCALE_IDEFAULTANSICODEPAGE
4955 $oemcp, # LOCALE_IDEFAULTCODEPAGE
4956 $maccp, # LOCALE_IDEFAULTMACCODEPAGE
4957 $ebcdiccp, # LOCALE_IDEFAULTEBCDICCODEPAGE
4958 $igeoid < 65536 ? $igeoid : 39070, # old_geoid
4959 $papersize ? 1 : 9, # LOCALE_IPAPERSIZE
4960 0; # FIXME # islamic_cal
4962 $locale_data .= pack "L<24",
4963 add_string( $icalendartype ), # LOCALE_ICALENDARTYPE
4964 add_string( $sabbrevlangname ), # LOCALE_SABBREVLANGNAME
4965 add_string( $siso639langname ), # LOCALE_SISO639LANGNAME
4966 add_string( $senglanguage ), # LOCALE_SENGLANGUAGE
4967 add_string( $snativelangname ), # LOCALE_SNATIVELANGNAME
4968 add_string( $sengcountry ), # LOCALE_SENGCOUNTRY
4969 add_string( $snativectryname ), # LOCALE_SNATIVECTRYNAME
4970 add_string( $siso3166ctryname2 ), # LOCALE_SABBREVCTRYNAME
4971 add_string( $territory ), # LOCALE_SISO3166CTRYNAME
4972 add_string( $sintlsymbol ), # LOCALE_SINTLSYMBOL
4973 add_string( $sengcurrname ), # LOCALE_SENGCURRNAME
4974 add_string( $snativecurrname ), # LOCALE_SNATIVECURRNAME
4975 add_fontsig( @fontsig ), # LOCALE_FONTSIGNATURE
4976 add_string( $siso639langname2 ), # LOCALE_SISO639LANGNAME2
4977 add_string( $siso3166ctryname2 ), # LOCALE_SISO3166CTRYNAME2
4978 add_string( $sparent ), # LOCALE_SPARENT
4979 add_strarray( @sdayname ), # LOCALE_SDAYNAME
4980 add_strarray( @sabbrevdayname ), # LOCALE_SABBREVDAYNAME
4981 add_strarray( @smonthname ), # LOCALE_SMONTHNAME
4982 add_strarray( @sabbrevmonthname ), # LOCALE_SABBREVMONTHNAME
4983 add_strarray( @sgenitivemonth ), # LOCALE_SGENITIVEMONTH
4984 add_strarray( @sabbrevgenitivemonth ), # LOCALE_SABBREVGENITIVEMONTH
4985 add_strarray( @scalnames ), # LOCALE_SCALNAMES
4986 add_strarray( @{$loc->{sortnames}} ); # LOCALE_SSORTNAMES
4988 $locale_data .= pack "S<6",
4989 $inegpercent, # LOCALE_INEGATIVEPERCENT
4990 $ipospercent, # LOCALE_IPOSITIVEPERCENT
4991 0, # unknown
4992 $ireadinglayout, # LOCALE_IREADINGLAYOUT
4993 0x2a, # unknown
4994 0x2a; # unknown
4996 $locale_data .= pack "L<24",
4997 0, # unknown
4998 add_string( $sengdisplayname ), # LOCALE_SENGLISHDISPLAYNAME
4999 add_string( $snativedisplayname ), # LOCALE_SNATIVEDISPLAYNAME
5000 add_string( $spercent ), # LOCALE_SPERCENT
5001 add_string( $snan ), # LOCALE_SNAN
5002 add_string( $sposinfinity ), # LOCALE_SPOSINFINITY
5003 add_string( $sneginfinity ), # LOCALE_SNEGINFINITY
5004 0, # unknown
5005 add_string( $serastring ), # CAL_SERASTRING
5006 add_string( $serastring ), # CAL_SABBREVERASTRING
5007 0, # unknown
5008 add_string( $ssortlocale ), # LOCALE_SCONSOLEFALLBACKNAME
5009 add_strarray( @sshorttime ), # LOCALE_SSHORTTIME
5010 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5011 0, # unknown
5012 add_string( $ssortlocale ), # LOCALE_SSORTLOCALE
5013 add_string( "0409:00000409" ), # FIXME # LOCALE_SKEYBOARDSTOINSTALL
5014 add_string( $sscripts ), # LOCALE_SSCRIPTS
5015 add_string( $srelativelongdate ), # LOCALE_SRELATIVELONGDATE
5016 $igeoid, # LOCALE_IGEOID
5017 add_string( $sshortestam || "a" ), # LOCALE_SSHORTESTAM
5018 add_string( $sshortestpm || "p" ), # LOCALE_SSHORTESTPM
5019 add_strarray( @smonthday ), # LOCALE_SMONTHDAY
5020 add_string( "k0-windows-us" ) # FIXME # keyboard_layout
5023 # output language groups
5025 my %groups;
5026 add_registry_key( $nlskey, "Locale", "00000409" );
5027 foreach my $loc (@locales)
5029 next unless defined $loc->{lcid};
5030 next if ($loc->{lcid} & 0x80000000);
5031 next if !defined($loc->{alias}) && $loc->{name} !~ /-$loc->{territory}/; # skip neutral locales
5032 my $group = locale_entry( $loc, "group", 1 );
5033 my $name = sprintf( "%08x", $loc->{lcid} );
5034 my $val = sprintf( "%x", $group );
5035 add_registry_string_value( $nlskey, "Locale", $name, $val ) unless ($loc->{lcid} & 0x000f0000);
5036 add_registry_string_value( $nlskey, "Locale\\Alternate Sorts", $name, $val ) if $loc->{name} =~ /_/;
5037 $groups{$val} = 1;
5039 foreach my $group (keys %groups) { add_registry_string_value( $nlskey, "Language Groups", $group, "1" ); }
5041 # output calendar data
5043 my $calendar_data = "";
5044 foreach my $cal (@calendars)
5046 my $scalname = $cal->{name};
5047 my $iyearoffsetrange = 0;
5048 my $itwodigityearmax = $cal->{itwodigityearmax};
5049 my @sshortdate;
5050 my @syearmonth;
5051 my @slongdate;
5052 my @serastring;
5053 my @sdayname;
5054 my @sabbrevdayname;
5055 my @smonthname;
5056 my @sabbrevmonthname;
5057 my @smonthday;
5058 my @sabbreverastring;
5059 my @sshortestdayname;
5061 my $type = $cal->{type};
5062 if (defined $cal->{locale} && defined $type)
5064 my $loc = $lcnames{$cal->{locale}};
5065 my $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
5066 push @sshortdate, $fmt if $fmt;
5067 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMd' and not(\@alt)]" );
5068 push @sshortdate, $fmt if $fmt;
5069 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
5070 push @sshortdate, $fmt if $fmt;
5071 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMMMd' and not(\@alt)]" );
5072 push @sshortdate, $fmt if $fmt;
5073 @sshortdate = map convert_date_format($_), @sshortdate;
5074 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" );
5075 push @slongdate, $fmt if $fmt;
5076 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
5077 push @slongdate, $fmt if $fmt;
5078 @slongdate = map convert_date_format($_), @slongdate;
5080 foreach my $n (1..13)
5082 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n' and not(\@yeartype)]" );
5083 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n' and not(\@yeartype)]" );
5084 push @smonthname, $name || "";
5085 push @sabbrevmonthname, $abbrev || $name || "";
5088 $scalname ||= loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$type']" );
5089 if (defined $cal->{eras})
5091 my @eras;
5092 my $idx = 1;
5093 foreach my $era (@{$cal->{eras}})
5095 my $start = xml_query( $suppl, "/supplementalData/calendarData/calendar[\@type='$type']/eras/era[\@type='$era']/\@start" );
5096 next unless $start =~ /^(-?\d+)-(\d+)-(\d+)/;
5097 my ($year, $mon, $day, $zero, $first) = ($1, $2, $3, $1 - 1, 1);
5098 if ($zero < 0)
5100 $first -= $zero;
5101 $year = 1;
5102 $itwodigityearmax = 2049 - $zero;
5104 unshift @eras, pack( "S<8", 6, $idx++, $year, $mon, $day, $zero, $first, 0 );
5105 push @serastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraAbbr/era[\@type='$era']" );
5106 push @sabbreverastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraNarrow/era[\@type='$era']" );
5108 $iyearoffsetrange = add_str_data( pack "S<L<*", scalar @eras, map { add_str_data($_); } @eras );
5112 @sshortdate = @{$cal->{sshortdate}} if defined $cal->{sshortdate} && !@sshortdate;
5113 @syearmonth = @{$cal->{syearmonth}} if defined $cal->{syearmonth};
5114 @slongdate = @{$cal->{slongdate}} if defined $cal->{slongdate} && !@slongdate;
5115 @serastring = @{$cal->{serastring}} if defined $cal->{serastring} && !@serastring;
5116 @sdayname = @{$cal->{sdayname}} if defined $cal->{sdayname};
5117 @sabbrevdayname = @{$cal->{sabbrevdayname}} if defined $cal->{sabbrevdayname};
5118 @smonthname = @{$cal->{smonthname}} if defined $cal->{smonthname} && !join("",@smonthname);
5119 @sabbrevmonthname = @{$cal->{sabbrevmonthname}} if defined $cal->{sabbrevmonthname} && !join("",@sabbrevmonthname);
5120 @smonthday = @{$cal->{smonthday}} if defined $cal->{smonthday};
5121 @sabbreverastring = @{$cal->{sabbreverastring}} if defined $cal->{sabbreverastring} && !@sabbreverastring;
5122 @sshortestdayname = @{$cal->{sshortestdayname}} if defined $cal->{sshortestdayname};
5123 my $srelativelongdate = $cal->{srelativelongdate};
5125 @serastring = ("A.D.") unless @serastring;
5126 @sabbreverastring = ("AD") unless @sabbreverastring;
5128 if ($cal->{id} != 1) # calendar 1 is a placeholder, information is fetched from locale instead
5130 @sshortdate = ("") unless @sshortdate;
5131 @syearmonth = ("") unless @syearmonth;
5132 @slongdate = ("") unless @slongdate;
5133 @sdayname = ("") x 7 unless @sdayname;
5134 @sabbrevdayname = ("") x 7 unless @sabbrevdayname;
5135 @sshortestdayname = ("") x 7 unless @sshortestdayname;
5136 @smonthname = ("") x 13 unless @smonthname;
5137 @sabbrevmonthname = ("") x 13 unless @sabbrevmonthname;
5138 @smonthday = ("") unless @smonthday;
5141 $calendar_data .= pack "S<2L<17",
5142 $cal->{id}, # CAL_ICALINTVALUE
5143 $itwodigityearmax || 99, # CAL_ITWODIGITYEARMAX
5144 add_strarray( @sshortdate ), # CAL_SSHORTDATE
5145 add_strarray( @syearmonth ), # CAL_SYEARMONTH
5146 add_strarray( @slongdate ), # CAL_SLONGDATE
5147 add_strarray( @serastring ), # CAL_SERASTRING
5148 $iyearoffsetrange, # CAL_IYEAROFFSETRANGE
5149 add_strarray( @sdayname ), # CAL_SDAYNAME
5150 add_strarray( @sabbrevdayname ), # CAL_SABBREVDAYNAME
5151 add_strarray( @smonthname ), # CAL_SMONTHNAME
5152 add_strarray( @sabbrevmonthname ), # CAL_SABBREVMONTHNAME
5153 add_string( $scalname ), # CAL_SCALNAME
5154 add_strarray( @smonthday ), # CAL_SMONTHDAY
5155 add_strarray( @sabbreverastring ), # CAL_SABBREVERASTRING
5156 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5157 add_string( $srelativelongdate ); # CAL_SRELATIVELONGDATE
5160 # output locale header
5162 my $nb_lcids = scalar keys %lcids;
5163 my $nb_locales = scalar grep { !defined $_->{alias} } @locales;
5164 my $nb_lcnames = scalar keys %lcnames;
5165 my $locale_size = length($locale_data) / $nb_locales;
5166 my $nb_calendars = scalar @calendars;
5167 my $calendar_size = length($calendar_data) / $nb_calendars;
5168 my $lcids_offset = 19 * 4; # size of header
5169 my $lcnames_offset = $lcids_offset + length $lcid_data;
5170 my $locales_offset = $lcnames_offset + length $lcname_data;
5171 my $calendar_offset = $locales_offset + length $locale_data;
5172 my $strings_offset = $calendar_offset + length $calendar_data;
5174 my $locale_header = pack "L<7S<4L<S<2L<3S<2L<4",
5175 8, # offset
5177 7, # version
5178 0x5344534e, # magic
5179 0, 0, 0,
5181 $nb_lcids,
5182 $nb_locales,
5183 $locale_size,
5184 $locales_offset,
5185 $nb_lcnames,
5187 $lcids_offset,
5188 $lcnames_offset,
5190 $nb_calendars,
5191 $calendar_size,
5192 $calendar_offset,
5193 $strings_offset,
5194 0, 0;
5196 return align_string( 4, $locale_header . $lcid_data . $lcname_data . $locale_data . $calendar_data . $string_data );
################################################################
# build the charmaps table for locale.nls
#
# Returns the concatenated binary case tables, in this order: digit
# folding, CJK compatibility, hiragana, katakana, halfwidth, fullwidth,
# traditional Chinese, simplified Chinese.
# The extra mappings below are stored into the file-level tables
# before they are dumped.
sub build_charmaps_data()
{
    my $data = "";

    # MAP_FOLDDIGITS
    # each entry is [ first char, last char, digit value of first char ];
    # consecutive chars in a range map to consecutive ASCII digits
    my @extra_digits =
    (
        [ 0x3007,  0x3007,  0 ],  # Ideographic Zero
        [ 0x0c78,  0x0c7b,  0 ],  # Telugu Fraction Digits
        [ 0x0c7c,  0x0c7e,  1 ],  # Telugu Fraction Digits
        [ 0x3021,  0x3029,  1 ],  # Hangzhou Numerals
        [ 0xa8e0,  0xa8e9,  0 ],  # Combining Devanagari Digits
        [ 0x10107, 0x1010f, 1 ],  # Aegean Numbers
        [ 0x10320, 0x10320, 1 ],  # Old Italic Numerals
        [ 0x10321, 0x10321, 5 ],  # Old Italic Numerals
    );
    foreach my $range (@extra_digits)
    {
        my ($first, $last, $value) = @{$range};
        foreach my $ch ($first .. $last)
        {
            $digitmap_table[$ch] = ord('0') + $value + $ch - $first;
        }
    }
    $data .= dump_binary_case_table( @digitmap_table );

    # CJK compatibility map
    $data .= dump_binary_case_table( @cjk_compat_table );

    # LCMAP_HIRAGANA/KATAKANA
    # each katakana char is 0x60 above the corresponding hiragana char
    my (@to_hiragana, @to_katakana);
    foreach my $hira (0x3041..0x3096, 0x309d..0x309e)
    {
        my $kata = $hira + 0x60;
        $to_hiragana[$kata] = $hira;
        $to_katakana[$hira] = $kata;
    }
    $data .= dump_binary_case_table( @to_hiragana );
    $data .= dump_binary_case_table( @to_katakana );

    # LCMAP_HALFWIDTH/FULLWIDTH
    @halfwidth_table[0x2018, 0x2019, 0x201c, 0x201d, 0x309b, 0x309c] =
        (0x0027, 0x0027, 0x0022, 0x0022, 0xff9e, 0xff9f);
    @fullwidth_table[0x309b, 0x309c] = (0x3099, 0x309a);
    $data .= dump_binary_case_table( @halfwidth_table );
    $data .= dump_binary_case_table( @fullwidth_table );

    # LCMAP_TRADITIONAL/SIMPLIFIED_CHINESE
    $data .= dump_binary_case_table( @chinese_traditional_table );
    $data .= dump_binary_case_table( @chinese_simplified_table );

    # FIXME: some more unknown tables here

    return $data;
}
################################################################
# build the geoids table for locale.nls
#
# Returns a 7-dword header followed by one fixed-size record per entry
# of @geoids and then a name index sorted by name.  Header layout:
#   [0],[1]  the characters "geo" as 16-bit units
#   [2]      total size (header + records + index)
#   [3]      size of the header, i.e. offset of the first record
#   [4]      number of records
#   [5]      offset of the name index
#   [6]      number of index entries
sub build_geoids_data()
{
    my $data = "";
    my %index;
    my $idx = 0;
    my @geo_header = (0x00650067, 0x0000006f, 0, 4 * 7, scalar @geoids, 0, 0);

    foreach my $entry (@geoids)
    {
        # An alias entry keeps its own id but takes all other fields from
        # its target.  Resolve it into a separate variable: assigning to the
        # loop variable would overwrite the element of @geoids itself.
        my $id = $entry->{id};
        my $geo = defined $entry->{alias} ? $entry->{alias} : $entry;
        my $lat = "0.000";
        my $long = "0.000";
        my $iso2 = $geo->{iso2} || "XX";
        my $iso3 = $geo->{iso3} || "XX";
        # explicit regions, and entries that have a UN code but no ISO code
        my $isregion = $geo->{region} || (defined $geo->{uncode} && !defined $geo->{iso2});
        my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
        my $scurrency = $geo->{scurrency} || "\x{00a4}";

        $data .= pack( "L<", $id );
        $data .= pad_string( 24, encode( "UTF16LE", $lat ));
        $data .= pad_string( 24, encode( "UTF16LE", $long ));
        $data .= pack( "L<2", $isregion ? 14 : 16, $geo->{parentid} || 39070 );
        $data .= pad_string( 8, encode( "UTF16LE", $iso2 ));
        $data .= pad_string( 8, encode( "UTF16LE", $iso3 ));
        $data .= pack( "S<2", $geo->{uncode} || 0, $geo->{dialcode} || 0 );
        $data .= pad_string( 8, encode( "UTF16LE", $sintlsymbol ));
        $data .= pad_string( 16, encode( "UTF16LE", $scurrency ));

        # for an alias this records the alias record under the target's name
        $index{$geo->{name}} = $idx if $geo->{name};
        $idx++;
    }
    $index{"XX"} = $index{"001"};

    $geo_header[5] = $geo_header[3] + length $data;
    $geo_header[6] = scalar keys %index;

    foreach my $name (sort keys %index)
    {
        $data .= pad_string( 8, encode( "UTF16LE", $name ));
        $data .= pack "L<", $index{$name};
    }

    $geo_header[2] = $geo_header[3] + length $data;
    return pack( "L<7", @geo_header ) . $data;
}
################################################################
# build a binary locale table
#
# $filename is the output file, $chartypes the already-built character
# types data that is stored first in the file.
# The file starts with 8 dwords; entries 0 and 4-7 hold the offsets of
# the sections, entries 1-3 are left as 0.
sub dump_locales($$)
{
    my ($filename, $chartypes) = @_;

    # pass the name as an argument so that a '%' in it is not taken as a format
    printf "Building %s\n", $filename;

    my $locale_data = build_locale_data();
    my $charmaps_data = build_charmaps_data();
    my $geoids_data = build_geoids_data();
    my $scripts_data = "";  # FIXME

    my @header = ( 0 ) x 8;
    $header[0] = 4 * scalar @header;                  # chartypes offset
    $header[4] = $header[0] + length $chartypes;      # locales offset
    $header[5] = $header[4] + length $locale_data;    # charmaps offset
    $header[6] = $header[5] + length $charmaps_data;  # geoids offset
    $header[7] = $header[6] + length $geoids_data;    # scripts offset

    # the data is binary, hence the raw layer
    open my $output, ">:raw", "$filename.new" or die "Cannot create $filename";
    print $output pack "L<*", @header;
    print $output $chartypes, $locale_data, $charmaps_data, $geoids_data, $scripts_data;
    # buffered write errors only show up when closing
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# return the day of week of the first of the month
#
# $month is 1-based; the result is the weekday field of gmtime(),
# i.e. 0 for Sunday through 6 for Saturday.
sub month_first_dow($$)
{
    my ($year, $month) = @_;
    my $midnight = timegm_modern( 0, 0, 0, 1, $month - 1, $year );
    return (gmtime( $midnight ))[6];
}
################################################################
# compare system time values
#
# Both arguments are references to 7-element arrays; the elements are
# compared numerically in order and the first difference decides.
# Returns -1, 0 or 1 like the <=> operator.
sub compare_systime($$)
{
    my ($left, $right) = @_;

    foreach my $field (0 .. 6)
    {
        my $order = $left->[$field] <=> $right->[$field];
        return $order if $order;
    }
    return 0;
}
################################################################
# compare the zone transition date with the rule date
#
# $zone is a reference to a date specification (year, month, day, time),
# $rule a reference to an already parsed system time.
# Returns a value < 0, 0 or > 0 when the zone date is before, equal to
# or after the rule date.
sub compare_transition_date($$$$)
{
    my ($stdoff, $isdst, $zone, $rule) = @_;
    my ($year, $month, $day, $time) = @{$zone};

    if (@{$zone} > 1)
    {
        # full date: convert it and compare field by field
        my @date = parse_transition_date( $stdoff, $isdst, $year, $month, $day, $time || 0 );
        return compare_systime( \@date, $rule );
    }

    # at most a year is given: a missing date sorts after everything,
    # and a date in the same year as the rule sorts before it
    return 1 unless defined $year;
    return $year > $rule->[0] ? 1 : -1;
}
################################################################
# get the Windows zone names from the CLDR data
#
# Returns a hash that maps each Windows zone name to a reference to
# [ display name, tz zone name ].  The display name is taken from the
# most recent "<!-- (UTC...) -->" comment preceding the mapZone entry;
# only the entries for territory "001" are used.
sub load_windows_zones()
{
    my %names;
    my $display_name;
    my $base = "cldr-release-$CLDRVERSION";
    my $INPUT = open_data_file( "cldr", "$base/common/supplemental/windowsZones.xml" );

    while (<$INPUT>)
    {
        $display_name = $1 if /<!-- +(\(UTC.*) -->.*/;
        $names{$1} = [ $display_name, $2 ] if /<mapZone other="(.*)" territory="001" type="(.*)"\/>/;
    }
    close $INPUT;
    return %names;
}
################################################################
# parse a transition date specification from the tzdata files
#
# Arguments: standard offset in minutes, dst flag, then the date as
# (year, month name, day specification, time specification); the day
# defaults to "1" and the time to "0".
# Returns (year, month, week, day of week, hour, minute, second), where
# week 5 stands for the last week of the month.
sub parse_transition_date($$@)
{
    use integer;
    my ($stdoff, $isdst, $year, $in, $on, $at) = @_;

    $on = "1" unless defined $on;
    $at = "0" unless defined $at;

    my %months = ( Jan => 1, Feb => 2, Mar => 3, Apr => 4, May => 5, Jun => 6,
                   Jul => 7, Aug => 8, Sep => 9, Oct => 10, Nov => 11, Dec => 12 );
    my %days = ( Sun => 0, Mon => 1, Tue => 2, Wed => 3, Thu => 4, Fri => 5, Sat => 6 );

    my $mon = $in ? $months{$in} : 1;
    my ($week, $dow, $flag, $time, $sec);
    my $first = month_first_dow( $year, $mon );

    # week 0 means the last week of the previous month
    my $to_previous_month = sub
    {
        $week = 5;
        $mon--;
        if (!$mon) { $mon = 12; $year--; }
    };

    # day specification
    if ($on =~ /^last(.*)$/)
    {
        $dow = $days{$1};
        $week = 5;
    }
    elsif ($on =~ /^(.*)>=(\d+)$/)
    {
        my ($dayname, $day) = ($1, $2);
        $dow = $days{$dayname};
        my $diff = ($first + 6 - $dow) % 7;
        if ($day >= 25) { $week = 5; }
        else { $week = ($day + 6 + $diff) / 7; }
    }
    elsif ($on =~ /^(.*)<=(\d+)$/)
    {
        my ($dayname, $day) = ($1, $2);
        $dow = $days{$dayname};
        my $diff = ($first + $day + 6 - $dow) % 7;
        $week = ($day + 6 - $diff) / 7;
        $to_previous_month->() unless $week;
    }
    elsif ($on =~ /^\d+$/)
    {
        $dow = ($first + $on - 1) % 7;
        if ($on >= 25) { $week = 5; }
        else { $week = ($on + 6) / 7; }
    }
    else
    {
        die "unsupported date specification $year $in $on $at";
    }

    # time specification: hours[:minutes[:seconds]] and an optional flag
    if ($at =~ /^(\d+)(?::(\d+))?(?::(\d+))?([uws]?)$/)
    {
        $time = $1 * 60;
        $time += $2 if defined $2;
        $sec = $3;
        $flag = $4;
    }
    else
    {
        die "unsupported time specification $year $in $on $at";
    }

    $flag ||= "w";
    $time -= $stdoff if $flag eq "u";
    $time += 60 if !$isdst && $flag ne "w";

    if ($time < 0)  # previous day
    {
        $week-- if $week < 5 && $dow == month_first_dow( $year, $mon );
        $week-- if $week == 5 && $dow == month_first_dow( $year + ($mon == 12), $mon % 12 + 1 );
        $to_previous_month->() unless $week;
        $dow = ($dow + 6) % 7;
        $time += 24 * 60;
    }

    return ($year, $mon, $week, $dow, $time / 60, $time % 60, $sec || 0);
}
################################################################
# pack a system time value as a SYSTEMTIME structure
#
# Takes (year, month, week, day of week, hour, minute, second) and
# returns eight little-endian 16-bit values.  The year is always stored
# as 0 and the week goes into the day field; an hour of 24 or more is
# stored as 23:59:59.999.
sub pack_systime(@)
{
    my (undef, $mon, $week, $dow, $hour, $min, $sec) = @_;
    my @clock = (23, 59, 59, 999);

    @clock = ($hour, $min, $sec, 0) if $hour < 24;
    return pack( "S<8", 0, $mon, $dow, $week, @clock );
}
################################################################
# parse a timezone offset from the tzdata files
#
# Takes an offset of the form [-]hours[:minutes] (any further fields
# are ignored) and returns it in minutes with the sign inverted.
sub parse_tz_offset($)
{
    my ($hour, $min) = split /:/, shift;
    $min ||= 0;

    # Take the sign from the text: "-0" is numerically zero, so a numeric
    # test would treat an offset like "-0:30" as positive.
    my $negative = ($hour =~ /^-/);
    my $minutes = abs( $hour ) * 60 + $min;
    return $negative ? $minutes : 0 - $minutes;  # invert sign
}
################################################################
# build the timezone data
#
# The first argument is the resource file to generate, the remaining
# ones are the tzdata files to load.  For every Windows zone returned by
# load_windows_zones() this writes three strings to the resource file
# and adds the zone's registry values, including the per-year
# "Dynamic DST" entries when the zone information differs between years.
sub dump_timezones($@)
{
    my $filename = shift;
    my $FIRST_YEAR = 2000;
    my $LAST_YEAR = 2030;

    my %names = load_windows_zones();
    my %zones;
    my %rules;
    my %links;
    my %res_indices;

    # pass the name as an argument so that a '%' in it is not taken as a format
    printf "Building %s\n", $filename;

    open my $output, ">", "$filename.new" or die "Cannot create $filename";
    print $output "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print $output "#include \"winresrc.h\"\n\n";
    print $output "#pragma makedep po\n\n";
    print $output "LANGUAGE LANG_ENGLISH, SUBLANG_DEFAULT\n\n";
    print $output "STRINGTABLE\n{\n";

    # load tzdata files

    foreach my $tzfile (@_)
    {
        my $FILE = open_data_file( "tzdata", $tzfile );
        my $zonename;
        while (<$FILE>)
        {
            chomp;
            s/\#.*$//;
            next if /^\s*$/;
            my @fields = split /\s+/;
            # a line starting with whitespace continues the current zone
            if ($fields[0] eq "Zone" || ($zonename && $fields[0] eq ""))
            {
                shift @fields;
                $zonename = shift @fields unless $zonename;
                my ($stdoff, $rules, $dummy, @date) = @fields;
                $zones{$zonename} ||= [ ];
                push @{$zones{$zonename}}, [ parse_tz_offset( $stdoff ), $rules, @date ];
                $zonename = undef unless @date;  # last entry doesn't have an until date
                next;
            }
            if ($fields[0] eq "Rule")
            {
                shift @fields;
                my ($rulename, $from, $to, $dummy, $in, $on, $at, $save) = @fields;
                $to = $from if $to eq "only";
                $to = $LAST_YEAR if $to eq "max";
                push @{$rules{$rulename}}, [ parse_tz_offset( $save ), $from, $to, $in, $on, $at ];
                next;
            }
            if ($fields[0] eq "Link")
            {
                $links{$fields[2]} = $fields[1];
                next;
            }
            die "unrecognized line $_";
        }
        close $FILE;
    }

    foreach my $name (sort { uc($a) cmp uc($b) } keys %names)
    {
        my ($display, $zone) = @{$names{$name}};
        $zone = $links{$zone} if defined $links{$zone};

        # build list of transitions
        # each one is [ standard offset, dst offset (-1 for a zone change), system time ]

        my @transitions;
        my @from_date = ( 1 );
        my $last_stdoff = 0;
        for (my $i = 0; $i < scalar @{$zones{$zone}}; $i++)
        {
            my ($stdoff, $rule, @until_date) = @{$zones{$zone}->[$i]};
            my $isdst = ($last_stdoff != $stdoff);
            $from_date[0] ||= $LAST_YEAR;
            my @systime = parse_transition_date( $stdoff, $isdst, @from_date );
            push @transitions, [ $stdoff, -1, \@systime ];

            if (defined $rules{$rule})
            {
                foreach my $r (@{$rules{$rule}})
                {
                    my ($offset, $from, $to, $in, $on, $at) = @{$r};
                    foreach my $year ($from..$to)
                    {
                        next if $year < $from_date[0];
                        next if $until_date[0] && $year > $until_date[0];
                        my @systime = parse_transition_date( $stdoff, !!$offset, $year, $in, $on, $at );
                        next if compare_transition_date( $stdoff, $isdst, \@until_date, \@systime ) <= 0;
                        my $ret = compare_transition_date( $stdoff, $isdst, \@from_date, \@systime );
                        next if $ret > 0;
                        pop @transitions if !$ret;  # remove transition if there's a dst change at the same time
                        push @transitions, [ $stdoff, $offset, \@systime ];
                    }
                }
            }
            @from_date = @until_date;
            $last_stdoff = $stdoff;
        }
        @transitions = sort { compare_systime( $a->[2], $b->[2] ) } @transitions;

        # build per-year dynamic info

        my @info;
        my $last_dstoff = 0;
        my $last_dst = 0;
        my $year = $FIRST_YEAR;
        while ($year <= $LAST_YEAR)
        {
            # skip transitions of earlier years, remembering their offset
            if (@transitions && $transitions[0]->[2]->[0] < $year)
            {
                $last_stdoff = $transitions[0]->[0];
                shift @transitions;
                next;
            }
            my ($std, $dst, @trans);
            my $cur_stdoff = $last_stdoff;
            my $cur_dstoff = ($name =~ /^UTC/) ? 0 : -60;
            while (@transitions && $transitions[0]->[2]->[0] == $year)
            {
                my $t = shift @transitions;
                my ($stdoff, $dstoff, $systime) = @{$t};
                $systime = pack_systime( @{$systime} );
                if (!$dstoff)  # std
                {
                    $cur_stdoff = $stdoff unless $std;
                    $std = $systime;
                }
                elsif ($dstoff != -1)  # dst
                {
                    $cur_dstoff = $dstoff unless $dst;
                    $dst ||= $systime;
                }
                elsif ($stdoff != $last_stdoff)  # rule transition
                {
                    # Handle a special case: Samoa moved to the other side of
                    # the date line between 2011-12-03 and 2012-01-01,
                    # entirely skipping the day 2011-12-31. We ignore this
                    # change because it happens on a year boundary and more
                    # importantly it would generate an offset of -25 hours,
                    # which some programs (e.g., Mono) do not like. See
                    # https://bugs.winehq.org/show_bug.cgi?id=51758

                    if ($last_stdoff - $stdoff < 24 * 60)
                    {
                        @trans = ($last_stdoff, $stdoff, $systime);
                        $cur_stdoff = $stdoff;
                    }
                }
                elsif ($dst)  # rule transition with no stdoff change
                {
                    $std = $systime;
                }
                $last_dstoff = ($dstoff == -1) ? 0 : $dstoff;
            }
            $last_stdoff = $cur_stdoff;

            if ($cur_dstoff > 0)  # swap std and dst to ensure that offset is negative
            {
                ($std, $dst) = ($dst, $std);
                $cur_stdoff += $cur_dstoff;
                $cur_dstoff = -$cur_dstoff;
            }

            if (@trans)
            {
                # heuristic to prefer switching dst
                if ($last_dst == $year - 1 || (!$last_dst && $trans[0] > $trans[1]))
                {
                    $dst ||= $trans[2];
                    $cur_stdoff = $trans[0];
                    $cur_dstoff = $trans[1] - $trans[0];
                }
                else
                {
                    $std ||= $trans[2];
                    $cur_stdoff = $trans[1];
                    $cur_dstoff = $trans[0] - $trans[1];
                }
            }

            if ($std || $dst)
            {
                # only one of the two dates is known: use January 1st for the other
                $std ||= pack_systime( parse_transition_date( 0, 0, $year, "Jan", 1 ));
                $dst ||= pack_systime( parse_transition_date( 0, 0, $year, "Jan", 1 ));
                $last_dst = $year;
            }
            else
            {
                $std = pack "S<8", 0;
                $dst = pack "S<8", 0;
                $cur_stdoff += $last_dstoff;
            }
            $info[$year++] = pack( "l<3", $cur_stdoff, 0, $cur_dstoff ) . $std . $dst;
        }

        # output registry keys

        my $std_name = $name eq "UTC" ? "Coordinated Universal Time" : $name;
        my $dlt_name = $std_name =~ s/Standard Time/Daylight Time/r;
        # resource ids are derived from a hash of the name, in steps of 16,
        # moving to the next free slot on collision
        my $res_idx = hex( substr( Digest::SHA::sha1_hex($name), -3, 3 )) << 4;
        $res_idx += 16 while exists $res_indices{$res_idx};
        $res_indices{$res_idx} = 1;

        add_registry_string_value( $zonekey, $name, "Display", $display );
        add_registry_string_value( $zonekey, $name, "Std", $std_name );
        add_registry_string_value( $zonekey, $name, "Dlt", $dlt_name );
        add_registry_string_value( $zonekey, $name, "MUI_Std", sprintf( "\@tzres.dll,-%u", $res_idx ));
        add_registry_string_value( $zonekey, $name, "MUI_Dlt", sprintf( "\@tzres.dll,-%u", $res_idx + 1 ));
        add_registry_string_value( $zonekey, $name, "MUI_Display", sprintf( "\@tzres.dll,-%u", $res_idx + 2 ));
        add_registry_binary_value( $zonekey, $name, "TZI", $info[$LAST_YEAR] );

        printf $output "%7d \"#msgctxt#maximum 31 characters#%s\"\n", $res_idx, $std_name;
        printf $output "%7d \"#msgctxt#maximum 31 characters#%s\"\n", $res_idx + 1, $dlt_name;
        printf $output "%7d \"%s\"\n", $res_idx + 2, $display;

        # trim the years at both ends whose info is identical to the final one
        my $first_year = $FIRST_YEAR;
        my $last_year = $LAST_YEAR;
        $last_year-- while $last_year > $FIRST_YEAR && $info[$last_year] eq $info[$last_year - 1];
        $first_year++ while $first_year < $last_year && $info[$first_year] eq $info[$last_year];

        next if $last_year <= $first_year;

        foreach my $i ($first_year..$last_year)
        {
            add_registry_binary_value( $zonekey, "$name\\Dynamic DST", $i, $info[$i] );
        }
        add_registry_dword_value( $zonekey, "$name\\Dynamic DST", "FirstEntry", $first_year );
        add_registry_dword_value( $zonekey, "$name\\Dynamic DST", "LastEntry", $last_year );
    }

    print $output "}\n";
    # buffered write errors only show up when closing
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# build the script to create registry keys
#
# %keys maps a backslash-separated key path to a reference to
# [ default value, value lines... ].  The keys are written as nested
# blocks below HKLM; a path component starting with '-' is written with
# the "NoRemove" prefix.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;
    my @prev;  # components of the previous key that are still open

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename";
    print $output "HKLM\n{\n";
    # sort case-insensitively, with the path separator sorting first
    foreach my $k (sort { ($a =~ tr/a-z\\/A-Z\001/r) cmp ($b =~ tr/a-z\\/A-Z\001/r) } keys %keys)
    {
        my @subkeys = split /\\/, $k;
        # keep the blocks shared with the previous key open, close the others
        while (@prev && @subkeys && $prev[0] eq $subkeys[0]) { shift @prev; shift @subkeys; }
        while (@prev) { printf $output "%*s}\n", 4 * --$indent, ""; shift @prev; }
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            my $name = $subkeys[$i];
            my $prefix = "";
            if ($name =~ /^-/)
            {
                $name =~ s/^-//;
                $prefix = "NoRemove ";
            }
            if ($name =~ /\s/)
            {
                $name = "'$name'";
            }
            # the default value belongs to the last component only
            printf $output "%*s%s%s%s\n%*s{\n", 4 * $indent, "", $prefix, $name,
                           $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # pass the value as an argument so that a '%' in it is printed as is
        foreach my $v (sort @vals) { printf $output "%*sval %s\n", 4 * $indent, "", $v; }
        @prev = split /\\/, $k;
    }
    while (@prev) { printf $output "%*s}\n", 4 * --$indent, ""; shift @prev; }
    printf $output "}\n";
    # buffered write errors only show up when closing
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# save a file if modified
#
# Expects the new contents in "$file.new".  If $file exists with the same
# contents the new file is deleted and $file is left untouched,
# otherwise the new file is renamed to $file.
# Dies if the rename fails.
sub save_file($)
{
    my $file = shift;

    # compare in-process instead of running cmp through the shell, so
    # that names containing spaces or shell metacharacters work
    require File::Compare;

    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!";
    }
}
################################################################
# main routine
#
# Loads the data and generates all the output files.  The registry
# script is generated last.

chdir ".." if -f "./make_unicode";
load_data();

foreach my $file ("dlls/gdi32/uniscribe/direction.c", "dlls/dwrite/direction.c", "dlls/wineps.drv/direction.c")
{
    dump_bidi_dir_table( $file );
}
foreach my $file ("dlls/gdi32/uniscribe/mirror.c", "dlls/dwrite/mirror.c")
{
    dump_mirroring( $file );
}
foreach my $file ("dlls/gdi32/uniscribe/bracket.c", "dlls/dwrite/bracket.c")
{
    dump_bracket( $file );
}
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $file ("dlls/gdi32/uniscribe/linebreak.c", "dlls/dwrite/linebreak.c")
{
    dump_linebreak( $file );
}
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/win32u/vertical.c", 1 );
dump_vertical( "dlls/wineps.drv/vertical.c", 0 );
dump_intl_nls("nls/l_intl.nls");
foreach my $file ("nls/normnfc.nls", "nls/normnfd.nls", "nls/normnfkc.nls", "nls/normnfkd.nls", "nls/normidna.nls")
{
    dump_norm_table( $file );
}

# the sort key table also provides the character types stored in locale.nls
my $chartypes = dump_sortkey_table( "nls/sortdefault.nls" );
dump_locales( "nls/locale.nls", $chartypes );
foreach my $file (@allfiles)
{
    dump_msdata_codepage( $file );
}
dump_eucjp_codepage();
dump_timezones( "dlls/tzres/tzres.rc", @timezone_files );
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
5839 # Local Variables:
5840 # compile-command: "./make_unicode"
5841 # End: