services: Move to GUI subsystem (as native).
[wine.git] / tools / make_unicode
blob5910716682df9a9d1868e465d379f246b7bbd7a5
#!/usr/bin/perl -w
#
# Generate code page .c files from ftp.unicode.org descriptions
#
# Copyright 2000 Alexandre Julliard
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
22 use strict;
23 use XML::LibXML;
24 use Digest::SHA;
25 use Encode;
26 use Time::Local qw(timegm_modern);
# Versions of the upstream data sets.  They are interpolated into the
# download URLs (and some file names) in %data_files below.
my $UNIVERSION = "15.1.0";
my $CLDRVERSION = "43";
my $ISO639VERSION = "20230123";
my $TZVERSION = "2023c";

# Upstream data files, keyed by a short identifier.  Each entry holds:
#   url  - where the file is published
#   name - optional; only set for the ucd, unihan and idna entries
#          (presumably the local file name to store the download under --
#          confirm against the code that consumes this table)
#   sha  - expected checksum as 64 hex digits (presumably a SHA-256 digest
#          checked with Digest::SHA -- confirm against the download code)
my %data_files =
(
    ucd => { url => "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip", name => "UCD-$UNIVERSION.zip",
             sha => "cb1c663d053926500cd501229736045752713a066bd75802098598b7a7056177" },
    unihan => { url => "https://www.unicode.org/Public/$UNIVERSION/ucd/Unihan.zip", name => "Unihan-$UNIVERSION.zip",
                sha => "a0226610e324bcf784ac380e11f4cbf533ee1e6b3d028b0991bf8c0dc3f85853" },
    idna => { url => "https://www.unicode.org/Public/idna/$UNIVERSION/IdnaMappingTable.txt", name => "IdnaMappingTable-$UNIVERSION.txt",
              sha => "402cbd285f1f952fcd0834b63541d54f69d3d8f1b8f8599bf71a1a14935f82c4" },
    cldr => { url => "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip",
              sha => "132cdd24e479abb6e86db1429931cec3dada485fd41da39ece3c08e531c477df" },
    cldr33 => { url => "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip",
                sha => "fa3490082c086d21257153609642f54fcf788fcfda4966fe67f3f6daca0d58b9" },
    sorting => { url => "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498/Windows 10 Sorting Weight Table.txt",
                 sha => "81fcfa1e5ed3e3a94d329959ff7d97d522ddf9d653d2c4d6ddcccc5cd4df663f" },
    codepages => { url => "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498/Windows Supported Code Page Data Files.zip",
                   sha => "5074e6dd253056ba61fc6c870c9a955467855129c6ad3a51761c386b301b125a" },
    iso639 => { url => "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip",
                sha => "884faa6cc5ac5181ed7969eed75355c1bc665447614cf4c06c62e87b38fe6a97" },
    ksx1001 => { url => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT",
                 sha => "d8d2a35206ac0ea2865f5d801c9d6717f735bf46f263a658a64a960abe59e371" },
    jis0208 => { url => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT",
                 sha => "1c571870457f19c97720631fa83ee491549a96ba1436da1296786a67d8632e87" },
    jis0212 => { url => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT",
                 sha => "477820bb3055bbcc90880d788cd95607d221dc94457bae249231adecf13c12e6" },
    tzdata => { url => "https://data.iana.org/time-zones/releases/tzdata$TZVERSION.tar.gz",
                sha => "3f510b5d1b4ae9bb38e485aa302a776b317fb3637bdb6404c4adf7b6cadd965c" },
);
# Default char for undefined mappings
my $DEF_CHAR = ord '?';

# Last valid Unicode character
my $MAX_CHAR = 0x10ffff;

# Backslash-separated key paths.  Most components carry a leading "-"
# (the final "Time Zones" component does not); how that marker is
# interpreted is not visible in this part of the file.
my $nlskey = "-SYSTEM\\-CurrentControlSet\\-Control\\-Nls";
my $zonekey = "-Software\\-Microsoft\\-Windows NT\\-CurrentVersion\\Time Zones";
# Code page description files, as relative paths named after the code page
# number.
# NOTE(review): "CodpageFiles" looks like a misspelling but is presumably the
# directory name used inside the downloaded archive -- do not "correct" it
# without checking the archive layout.
my @allfiles =
(
    "CodpageFiles/037.txt",
    "CodpageFiles/437.txt",
    "CodpageFiles/500.txt",
    "CodpageFiles/708.txt",
    "CodpageFiles/720.txt",
    "CodpageFiles/737.txt",
    "CodpageFiles/775.txt",
    "CodpageFiles/850.txt",
    "CodpageFiles/852.txt",
    "CodpageFiles/855.txt",
    "CodpageFiles/857.txt",
    "CodpageFiles/860.txt",
    "CodpageFiles/861.txt",
    "CodpageFiles/862.txt",
    "CodpageFiles/863.txt",
    "CodpageFiles/864.txt",
    "CodpageFiles/865.txt",
    "CodpageFiles/866.txt",
    "CodpageFiles/869.txt",
    "CodpageFiles/874.txt",
    "CodpageFiles/875.txt",
    "CodpageFiles/932.txt",
    "CodpageFiles/936.txt",
    "CodpageFiles/949.txt",
    "CodpageFiles/950.txt",
    "CodpageFiles/1026.txt",
    "CodpageFiles/1250.txt",
    "CodpageFiles/1251.txt",
    "CodpageFiles/1252.txt",
    "CodpageFiles/1253.txt",
    "CodpageFiles/1254.txt",
    "CodpageFiles/1255.txt",
    "CodpageFiles/1256.txt",
    "CodpageFiles/1257.txt",
    "CodpageFiles/1258.txt",
    "CodpageFiles/1361.txt",
    "CodpageFiles/10000.txt",
    "CodpageFiles/10001.txt",
    "CodpageFiles/10002.txt",
    "CodpageFiles/10003.txt",
    "CodpageFiles/10004.txt",
    "CodpageFiles/10005.txt",
    "CodpageFiles/10006.txt",
    "CodpageFiles/10007.txt",
    "CodpageFiles/10008.txt",
    "CodpageFiles/10010.txt",
    "CodpageFiles/10017.txt",
    "CodpageFiles/10021.txt",
    "CodpageFiles/10029.txt",
    "CodpageFiles/10079.txt",
    "CodpageFiles/10081.txt",
    "CodpageFiles/10082.txt",
    "CodpageFiles/20127.txt",
    "CodpageFiles/20866.txt",
    "CodpageFiles/21866.txt",
    "CodpageFiles/28591.txt",
    "CodpageFiles/28592.txt",
    "CodpageFiles/28593.txt",
    "CodpageFiles/28594.txt",
    "CodpageFiles/28595.txt",
    "CodpageFiles/28596.txt",
    "CodpageFiles/28597.txt",
    "CodpageFiles/28598.txt",
    "CodpageFiles/28599.txt",
    "CodpageFiles/28603.txt",
    "CodpageFiles/28605.txt",
);
# Names of the time zone source files to read (presumably members of the
# tzdata archive listed in %data_files -- confirm against the reader).
my @timezone_files = qw(africa antarctica asia australasia europe northamerica southamerica etcetera backward);
# Character type flag bits, keyed by flag name.  The CT_CTYPE1 flags occupy
# the low bits and the CT_CTYPE3 flags are shifted into the high 16 bits so
# that both sets fit in a single value.
my %ctype =
(
    # CT_CTYPE1
    "upper"         => 0x0001,
    "lower"         => 0x0002,
    "digit"         => 0x0004,
    "space"         => 0x0008,
    "punct"         => 0x0010,
    "cntrl"         => 0x0020,
    "blank"         => 0x0040,
    "xdigit"        => 0x0080,
    # "alpha" also sets bit 0x80000000; the purpose of that extra bit is not
    # visible in this part of the file.
    "alpha"         => 0x0100 | 0x80000000,
    "defin"         => 0x0200,
    # CT_CTYPE3 in high 16 bits
    "nonspacing"    => 0x00010000,
    "diacritic"     => 0x00020000,
    "vowelmark"     => 0x00040000,
    "symbol"        => 0x00080000,
    "katakana"      => 0x00100000,
    "hiragana"      => 0x00200000,
    "halfwidth"     => 0x00400000,
    "fullwidth"     => 0x00800000,
    "ideograph"     => 0x01000000,
    "kashida"       => 0x02000000,
    "lexical"       => 0x04000000,
    "highsurrogate" => 0x08000000,
    "lowsurrogate"  => 0x10000000,
);
# Numeric codes for the bracket type letters "o" and "c" (presumably the
# open/close values of the Unicode Bidi_Paired_Bracket_Type property --
# confirm against the code that parses the bracket data).
my %bracket_types =
(
    "o" => 0x0000,
    "c" => 0x0001,
);
# Numeric codes assigned to the Indic type names, numbered consecutively
# from 0x00 ("Other") to 0x23.  The names look like values of the Unicode
# Indic_Syllabic_Category property -- confirm against the parsing code.
my %indic_types =
(
    "Other"                       => 0x0000,
    "Bindu"                       => 0x0001,
    "Visarga"                     => 0x0002,
    "Avagraha"                    => 0x0003,
    "Nukta"                       => 0x0004,
    "Virama"                      => 0x0005,
    "Vowel_Independent"           => 0x0006,
    "Vowel_Dependent"             => 0x0007,
    "Vowel"                       => 0x0008,
    "Consonant_Placeholder"       => 0x0009,
    "Consonant"                   => 0x000a,
    "Consonant_Dead"              => 0x000b,
    "Consonant_Succeeding_Repha"  => 0x000c,
    "Consonant_Subjoined"         => 0x000d,
    "Consonant_Medial"            => 0x000e,
    "Consonant_Final"             => 0x000f,
    "Consonant_Head_Letter"       => 0x0010,
    "Modifying_Letter"            => 0x0011,
    "Tone_Letter"                 => 0x0012,
    "Tone_Mark"                   => 0x0013,
    "Register_Shifter"            => 0x0014,
    "Consonant_Preceding_Repha"   => 0x0015,
    "Pure_Killer"                 => 0x0016,
    "Invisible_Stacker"           => 0x0017,
    "Gemination_Mark"             => 0x0018,
    "Cantillation_Mark"           => 0x0019,
    "Non_Joiner"                  => 0x001a,
    "Joiner"                      => 0x001b,
    "Number_Joiner"               => 0x001c,
    "Number"                      => 0x001d,
    "Brahmi_Joining_Number"       => 0x001e,
    "Consonant_With_Stacker"      => 0x001f,
    "Consonant_Prefixed"          => 0x0020,
    "Syllable_Modifier"           => 0x0021,
    "Consonant_Killer"            => 0x0022,
    "Consonant_Initial_Postfixed" => 0x0023,
);
# Numeric codes assigned to the matra position names, numbered consecutively
# from 0x01 to 0x10 (0 is not assigned to any name here).  The names look
# like values of the Unicode Indic_Positional_Category property -- confirm
# against the parsing code.
my %matra_types =
(
    "Right"                    => 0x01,
    "Left"                     => 0x02,
    "Visual_Order_Left"        => 0x03,
    "Left_And_Right"           => 0x04,
    "Top"                      => 0x05,
    "Bottom"                   => 0x06,
    "Top_And_Bottom"           => 0x07,
    "Top_And_Right"            => 0x08,
    "Top_And_Left"             => 0x09,
    "Top_And_Left_And_Right"   => 0x0a,
    "Bottom_And_Right"         => 0x0b,
    "Top_And_Bottom_And_Right" => 0x0c,
    "Overstruck"               => 0x0d,
    "Invisible"                => 0x0e,
    "Bottom_And_Left"          => 0x0f,
    "Top_And_Bottom_And_Left"  => 0x10,
);
# Numeric codes assigned to the break class abbreviations, numbered
# consecutively from 0x01 ("BK") to 0x30 ("VI").  The keys look like the
# line break class abbreviations of Unicode UAX #14 -- confirm against the
# code that parses the break data.
my %break_types =
(
    "BK"  => 0x0001,
    "CR"  => 0x0002,
    "LF"  => 0x0003,
    "CM"  => 0x0004,
    "SG"  => 0x0005,
    "GL"  => 0x0006,
    "CB"  => 0x0007,
    "SP"  => 0x0008,
    "ZW"  => 0x0009,
    "NL"  => 0x000a,
    "WJ"  => 0x000b,
    "JL"  => 0x000c,
    "JV"  => 0x000d,
    "JT"  => 0x000e,
    "H2"  => 0x000f,
    "H3"  => 0x0010,
    "XX"  => 0x0011,
    "OP"  => 0x0012,
    "CL"  => 0x0013,
    "CP"  => 0x0014,
    "QU"  => 0x0015,
    "NS"  => 0x0016,
    "EX"  => 0x0017,
    "SY"  => 0x0018,
    "IS"  => 0x0019,
    "PR"  => 0x001a,
    "PO"  => 0x001b,
    "NU"  => 0x001c,
    "AL"  => 0x001d,
    "ID"  => 0x001e,
    "IN"  => 0x001f,
    "HY"  => 0x0020,
    "BB"  => 0x0021,
    "BA"  => 0x0022,
    "SA"  => 0x0023,
    "AI"  => 0x0024,
    "B2"  => 0x0025,
    "HL"  => 0x0026,
    "CJ"  => 0x0027,
    "RI"  => 0x0028,
    "EB"  => 0x0029,
    "EM"  => 0x002a,
    "ZWJ" => 0x002b,
    "AK"  => 0x002c,
    "AP"  => 0x002d,
    "AS"  => 0x002e,
    "VF"  => 0x002f,
    "VI"  => 0x0030,
);
# Numeric codes for the vertical type abbreviations (presumably values of
# the Unicode Vertical_Orientation property -- confirm against the code that
# parses the vertical orientation data).
my %vertical_types =
(
    "R"  => 0x0000,
    "U"  => 0x0001,
    "Tr" => 0x0002,
    "Tu" => 0x0003,
);
# Maps each two-letter general category code to the combination of %ctype
# flag bits assigned to it.  Every category includes "defin"; the remaining
# flags depend on the category.
my %categories =
(
    "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
    "Mn" => $ctype{"defin"}|$ctype{"nonspacing"},            # Mark, Non-Spacing
    "Mc" => $ctype{"defin"},                                 # Mark, Spacing Combining
    "Me" => $ctype{"defin"},                                 # Mark, Enclosing
    "Nd" => $ctype{"defin"}|$ctype{"digit"},                 # Number, Decimal Digit
    "Nl" => $ctype{"defin"}|$ctype{"alpha"},                 # Number, Letter
    "No" => $ctype{"defin"},                                 # Number, Other
    "Zs" => $ctype{"defin"}|$ctype{"space"},                 # Separator, Space
    "Zl" => $ctype{"defin"}|$ctype{"space"},                 # Separator, Line
    "Zp" => $ctype{"defin"}|$ctype{"space"},                 # Separator, Paragraph
    "Cc" => $ctype{"defin"}|$ctype{"cntrl"},                 # Other, Control
    "Cf" => $ctype{"defin"}|$ctype{"cntrl"},                 # Other, Format
    "Cs" => $ctype{"defin"},                                 # Other, Surrogate
    "Co" => $ctype{"defin"},                                 # Other, Private Use
    "Cn" => $ctype{"defin"},                                 # Other, Not Assigned
    "Lm" => $ctype{"defin"}|$ctype{"alpha"},                 # Letter, Modifier
    "Lo" => $ctype{"defin"}|$ctype{"alpha"},                 # Letter, Other
    "Pc" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Connector
    "Pd" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Dash
    "Ps" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Open
    "Pe" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Close
    "Pi" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Initial quote
    "Pf" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Final quote
    "Po" => $ctype{"defin"}|$ctype{"punct"},                 # Punctuation, Other
    "Sm" => $ctype{"defin"}|$ctype{"symbol"},                # Symbol, Math
    "Sc" => $ctype{"defin"}|$ctype{"symbol"},                # Symbol, Currency
    "Sk" => $ctype{"defin"}|$ctype{"symbol"},                # Symbol, Modifier
    "So" => $ctype{"defin"}|$ctype{"symbol"}                 # Symbol, Other
);
# a few characters need additional categories that cannot be determined automatically
#
# Keyed by flag name (the same names used as %ctype keys); each value is a
# reference to a list of code points given individually or as ranges.
# NOTE(review): 0x30ad appears twice in the "fullwidth" list.  It is kept
# as-is here; it is harmless if the list is only used to set a flag per
# code point -- confirm against the consumer before de-duplicating.
my %special_categories =
(
    "xdigit"     => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
                      0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space"      => [ 0x09..0x0d, 0x85 ],
    "blank"      => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl"      => [ 0x070f, 0x200c, 0x200d,
                      0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                      0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                      0xfff9, 0xfffa, 0xfffb ],
    "punct"      => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
                      0xd7, 0xf7 ],
    "digit"      => [ 0xb2, 0xb3, 0xb9 ],
    "lower"      => [ 0xaa, 0xba, 0x2071, 0x207f ],
    "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
                      0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
    "diacritic"  => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
    "symbol"     => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
                      0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
                      0x02b9..0x02ba, 0x02c6..0x02cf ],
    "halfwidth"  => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
    "fullwidth"  => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
                      0x3131..0x3164 ],
    "ideograph"  => [ 0x3006..0x3007 ],
    "lexical"    => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
                      0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
                      0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
                      0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
                      0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
                      0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
                      0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
                      0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
                      0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
                      0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
    "kashida"    => [ 0x0640 ],
);
# Numeric direction codes keyed by bidirectional class abbreviation.
# All of the explicit embedding, override and isolate classes (LRE through
# PDI) share the single value 15.
my %directions =
(
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AL"  => 12,   # Right-to-Left Arabic
    "EN"  => 3,    # European Number
    "ES"  => 4,    # European Number Separator
    "ET"  => 5,    # European Number Terminator
    "AN"  => 6,    # Arabic Number
    "CS"  => 7,    # Common Number Separator
    "NSM" => 13,   # Non-Spacing Mark
    "BN"  => 14,   # Boundary Neutral
    "B"   => 8,    # Paragraph Separator
    "S"   => 9,    # Segment Separator
    "WS"  => 10,   # Whitespace
    "ON"  => 11,   # Other Neutrals
    "LRE" => 15,   # Left-to-Right Embedding
    "LRO" => 15,   # Left-to-Right Override
    "RLE" => 15,   # Right-to-Left Embedding
    "RLO" => 15,   # Right-to-Left Override
    "PDF" => 15,   # Pop Directional Format
    "LRI" => 15,   # Left-to-Right Isolate
    "RLI" => 15,   # Right-to-Left Isolate
    "FSI" => 15,   # First Strong Isolate
    "PDI" => 15    # Pop Directional Isolate
);
# Maps each bidirectional class abbreviation to a C2_* type value; the
# trailing comment on each line names the C2_* constant the number stands
# for.  Several classes collapse onto the same value: "AL" shares 2 with
# "R", and NSM plus all explicit formatting classes share 11 with "ON".
my %c2_types =
(
    "L"   => 1,    # C2_LEFTTORIGHT
    "R"   => 2,    # C2_RIGHTTOLEFT
    "AL"  => 2,    # C2_RIGHTTOLEFT
    "EN"  => 3,    # C2_EUROPENUMBER
    "ES"  => 4,    # C2_EUROPESEPARATOR
    "ET"  => 5,    # C2_EUROPETERMINATOR
    "AN"  => 6,    # C2_ARABICNUMBER
    "CS"  => 7,    # C2_COMMONSEPARATOR
    "NSM" => 11,   # C2_OTHERNEUTRAL
    "BN"  => 0,    # C2_NOTAPPLICABLE
    "B"   => 8,    # C2_BLOCKSEPARATOR
    "S"   => 9,    # C2_SEGMENTSEPARATOR
    "WS"  => 10,   # C2_WHITESPACE
    "ON"  => 11,   # C2_OTHERNEUTRAL
    "LRE" => 11,   # C2_OTHERNEUTRAL
    "LRO" => 11,   # C2_OTHERNEUTRAL
    "RLE" => 11,   # C2_OTHERNEUTRAL
    "RLO" => 11,   # C2_OTHERNEUTRAL
    "PDF" => 11,   # C2_OTHERNEUTRAL
    "LRI" => 11,   # C2_OTHERNEUTRAL
    "RLI" => 11,   # C2_OTHERNEUTRAL
    "FSI" => 11,   # C2_OTHERNEUTRAL
    "PDI" => 11    # C2_OTHERNEUTRAL
);
# Numeric codes keyed by bidirectional class abbreviation.  Unlike
# %directions, every class gets its own distinct value here, numbered
# consecutively from 0 ("ON") to 22 ("PDI").
my %bidi_types =
(
    "ON"  => 0,    # Other Neutrals
    "L"   => 1,    # Left-to-Right
    "R"   => 2,    # Right-to-Left
    "AN"  => 3,    # Arabic Number
    "EN"  => 4,    # European Number
    "AL"  => 5,    # Right-to-Left Arabic
    "NSM" => 6,    # Non-Spacing Mark
    "CS"  => 7,    # Common Number Separator
    "ES"  => 8,    # European Number Separator
    "ET"  => 9,    # European Number Terminator
    "BN"  => 10,   # Boundary Neutral
    "S"   => 11,   # Segment Separator
    "WS"  => 12,   # Whitespace
    "B"   => 13,   # Paragraph Separator
    "RLO" => 14,   # Right-to-Left Override
    "RLE" => 15,   # Right-to-Left Embedding
    "LRO" => 16,   # Left-to-Right Override
    "LRE" => 17,   # Left-to-Right Embedding
    "PDF" => 18,   # Pop Directional Format
    "LRI" => 19,   # Left-to-Right Isolate
    "RLI" => 20,   # Right-to-Left Isolate
    "FSI" => 21,   # First Strong Isolate
    "PDI" => 22    # Pop Directional Isolate
);
# Numeric codes keyed by joining type.  "D" and "C" deliberately share the
# value 3 in this table; "ALAPH" and "DALATH RISH" are Syriac-specific
# entries alongside the single-letter types.
my %joining_types =
(
    "U"           => 0,    # Non_Joining
    "L"           => 1,    # Left_Joining
    "R"           => 2,    # Right_Joining
    "D"           => 3,    # Dual_Joining
    "C"           => 3,    # Join_Causing
    "ALAPH"       => 4,    # Syriac ALAPH
    "DALATH RISH" => 5,    # Syriac DALATH RISH group
    "T"           => 6,    # Transparent
);
466 my @locales =
468 { name => "", lcid => 0x0000007f, file => "root", territory => "IV", sabbrevlangname => "IVL", sopentypelang =>"dflt" },
469 { name => "aa", sopentypelang => "AFR" },
470 { name => "aa-DJ" },
471 { name => "aa-ER" },
472 { name => "aa-ET" },
473 { name => "af", lcid => 0x00000036, oemcp => 850, sabbrevlangname => "AFK", sopentypelang => "AFK" },
474 { name => "af-NA" },
475 { name => "af-ZA", lcid => 0x00000436 },
476 { name => "agq" },
477 { name => "agq-CM" },
478 { name => "ak", sopentypelang => "TWI" },
479 { name => "ak-GH" },
480 { name => "am", lcid => 0x0000005e, sabbrevlangname => "AMH" },
481 { name => "am-ET", lcid => 0x0000045e },
482 { name => "ar", lcid => 0x00000001, territory => "SA", oemcp => 720, group => 13 },
483 { name => "ar-001" },
484 { name => "ar-AE", lcid => 0x00003801, sabbrevlangname => "ARU" },
485 { name => "ar-BH", lcid => 0x00003c01, sabbrevlangname => "ARH" },
486 { name => "ar-DJ" },
487 { name => "ar-DZ", lcid => 0x00001401, sabbrevlangname => "ARG", nativedigits => "0123456789" },
488 { name => "ar-EG", lcid => 0x00000c01, sabbrevlangname => "ARE" },
489 { name => "ar-EH" },
490 { name => "ar-ER" },
491 { name => "ar-IL" },
492 { name => "ar-IQ", lcid => 0x00000801, sabbrevlangname => "ARI" },
493 { name => "ar-JO", lcid => 0x00002c01, sabbrevlangname => "ARJ" },
494 { name => "ar-KM" },
495 { name => "ar-KW", lcid => 0x00003401, sabbrevlangname => "ARK" },
496 { name => "ar-LB", lcid => 0x00003001, sabbrevlangname => "ARB" },
497 { name => "ar-LY", lcid => 0x00001001, sabbrevlangname => "ARL", nativedigits => "0123456789" },
498 { name => "ar-MA", lcid => 0x00001801, sabbrevlangname => "ARM", nativedigits => "0123456789" },
499 { name => "ar-MR" },
500 { name => "ar-OM", lcid => 0x00002001, sabbrevlangname => "ARO" },
501 { name => "ar-PS" },
502 { name => "ar-QA", lcid => 0x00004001, sabbrevlangname => "ARQ" },
503 { name => "ar-SA", lcid => 0x00000401, sabbrevlangname => "ARA" },
504 { name => "ar-SD" },
505 { name => "ar-SO" },
506 { name => "ar-SS" },
507 { name => "ar-SY", lcid => 0x00002801, sabbrevlangname => "ARS" },
508 { name => "ar-TD" },
509 { name => "ar-TN", lcid => 0x00001c01, sabbrevlangname => "ART", nativedigits => "0123456789" },
510 { name => "ar-YE", lcid => 0x00002401, sabbrevlangname => "ARY" },
511 { name => "arn", lcid => 0x0000007a, oemcp => 850, ebcdiccp => 20284, slist => ",", sabbrevlangname => "MPD", sopentypelang => "MAP" },
512 { name => "arn-CL", lcid => 0x0000047a },
513 { name => "arn-Latn", alias => "arn" },
514 { name => "arn-Latn-CL", alias => "arn-CL" },
515 { name => "as", lcid => 0x0000004d, slist => ",", group => 15 },
516 { name => "as-IN", lcid => 0x0000044d },
517 { name => "asa" },
518 { name => "asa-TZ" },
519 { name => "ast" },
520 { name => "ast-ES" },
521 { name => "az", lcid => 0x0000002c, oemcp => 857, ebcdiccp => 20905, group => 2 },
522 { name => "az-Cyrl", lcid => 0x0000742c, oemcp => 866, ebcdiccp => 20880, group => 5, sabbrevlangname => "AZC" },
523 { name => "az-Cyrl-AZ", lcid => 0x0000082c },
524 { name => "az-Latn", lcid => 0x0000782c },
525 { name => "az-Latn-AZ", lcid => 0x0000042c },
526 { name => "ba", lcid => 0x0000006d, oemcp => 866, group => 5, sabbrevlangname => "BAS", sopentypelang => "BSH" },
527 { name => "ba-Cyrl", alias => "ba" },
528 { name => "ba-Cyrl-RU", alias => "ba-RU" },
529 { name => "ba-RU", lcid => 0x0000046d },
530 { name => "bas" },
531 { name => "bas-CM" },
532 { name => "be", lcid => 0x00000023, oemcp => 866, ebcdiccp => 500, group => 5 },
533 { name => "be-BY", lcid => 0x00000423 },
534 { name => "bem" },
535 { name => "bem-ZM" },
536 { name => "bez" },
537 { name => "bez-TZ" },
538 { name => "bg", lcid => 0x00000002, oemcp => 866, ebcdiccp => 21025, group => 5, sabbrevlangname => "BGR", sopentypelang => "BGR" },
539 { name => "bg-BG", lcid => 0x00000402 },
540 { name => "bin", lcid => 0x00000066, oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "EDO" },
541 { name => "bin-NG", lcid => 0x00000466, file => "bin", dir => "exemplars" },
542 { name => "bm", sopentypelang => "BMB" },
543 { name => "bm-Latn", file => "bm" },
544 { name => "bm-Latn-ML", file => "bm_ML" },
545 { name => "bm-ML", alias => "bm-Latn-ML" },
546 { name => "bn", lcid => 0x00000045, slist => ",", group => 15, sabbrevlangname => "BNB" },
547 { name => "bn-BD", lcid => 0x00000845 },
548 { name => "bn-IN", lcid => 0x00000445, sabbrevlangname => "BNG" },
549 { name => "bo", lcid => 0x00000051, slist => ",", group => 15, sabbrevlangname => "BOB", sopentypelang => "TIB" },
550 { name => "bo-CN", lcid => 0x00000451 },
551 { name => "bo-IN", slist => "," },
552 { name => "bo-Tibt", alias => "bo" },
553 { name => "bo-Tibt-CN", alias => "bo-CN" },
554 { name => "bo-Tibt-IN", alias => "bo-IN" },
555 { name => "br", lcid => 0x0000007e, oemcp => 850, ebcdiccp => 20297 },
556 { name => "br-FR", lcid => 0x0000047e },
557 { name => "br-Latn", alias => "br" },
558 { name => "br-Latn-FR", alias => "br-FR" },
559 { name => "brx" },
560 { name => "brx-IN" },
561 { name => "bs", lcid => 0x0000781a, oemcp => 852, maccp => 10082, ebcdiccp => 870, group => 2, sabbrevlangname => "BSB" },
562 { name => "bs-Cyrl", lcid => 0x0000641a, oemcp => 855, group => 5, sabbrevlangname => "BSC" },
563 { name => "bs-Cyrl-BA", lcid => 0x0000201a },
564 { name => "bs-Latn", lcid => 0x0000681a },
565 { name => "bs-Latn-BA", lcid => 0x0000141a },
566 { name => "byn", sopentypelang => "BIL" },
567 { name => "byn-ER" },
568 { name => "ca", lcid => 0x00000003, oemcp => 850 },
569 { name => "ca-AD", maccp => 65001 },
570 { name => "ca-ES", lcid => 0x00000403 },
571 { name => "ca-ES-valencia", lcid => 0x00000803, file => "ca_ES_VALENCIA", sabbrevlangname => "VAL" },
572 { name => "ca-FR", maccp => 65001 },
573 { name => "ca-IT", maccp => 65001 },
574 { name => "ccp" },
575 { name => "ccp-BD", alias => "ccp-Cakm-BD" },
576 { name => "ccp-Cakm", file => "ccp" },
577 { name => "ccp-Cakm-BD", file => "ccp_BD" },
578 { name => "ccp-Cakm-IN", file => "ccp_IN" },
579 { name => "ccp-IN", alias => "ccp-Cakm-IN" },
580 { name => "ce" },
581 { name => "ce-RU" },
582 { name => "ceb" },
583 { name => "ceb-Latn", file => "ceb" },
584 { name => "ceb-Latn-PH", file => "ceb_PH" },
585 { name => "ceb-PH", alias => "ceb-Latn-PH" },
586 { name => "cgg" },
587 { name => "cgg-UG" },
588 { name => "chr", lcid => 0x0000005c, slist => ",", sabbrevlangname => "CRE" },
589 { name => "chr-Cher", lcid => 0x00007c5c, file => "chr" },
590 { name => "chr-Cher-US", lcid => 0x0000045c, file => "chr_US" },
591 { name => "chr-US", alias => "chr-Cher-US" },
592 { name => "ckb", alias => "ku" },
593 { name => "ckb-IQ", alias => "ku-Arab-IQ" },
594 { name => "ckb-IR", alias => "ku-Arab-IR" },
595 { name => "co", lcid => 0x00000083, oemcp => 850, ebcdiccp => 20297 },
596 { name => "co-FR", lcid => 0x00000483 },
597 { name => "co-Latn", alias => "co" },
598 { name => "co-Latn-FR", alias => "co-FR" },
599 { name => "cs", lcid => 0x00000005, oemcp => 852, group => 2, sabbrevlangname => "CSY", sopentypelang => "CSY" },
600 { name => "cs-CZ", lcid => 0x00000405 },
601 { name => "cu", sopentypelang => "CSL" },
602 { name => "cu-RU" },
603 { name => "cy", lcid => 0x00000052, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "CYM", sopentypelang => "WEL" },
604 { name => "cy-GB", lcid => 0x00000452 },
605 { name => "da", lcid => 0x00000006, oemcp => 850, ebcdiccp => 20277 },
606 { name => "da-DK", lcid => 0x00000406 },
607 { name => "da-GL", maccp => 65001 },
608 { name => "dav" },
609 { name => "dav-KE" },
610 { name => "de", lcid => 0x00000007, oemcp => 850, ebcdiccp => 20273 },
611 { name => "de-AT", lcid => 0x00000c07, sabbrevlangname => "DEA" },
612 { name => "de-BE" },
613 { name => "de-CH", lcid => 0x00000807, sabbrevlangname => "DES" },
614 { name => "de-DE", lcid => 0x00000407 },
615 { name => "de-DE_phoneb", lcid => 0x00010407, alias => "de-DE" },
616 { name => "de-DE-u-co-phonebk", alias => "de-DE_phoneb" },
617 { name => "de-IT", oemcp => 65001 },
618 { name => "de-LI", lcid => 0x00001407, sabbrevlangname => "DEC" },
619 { name => "de-LU", lcid => 0x00001007, sabbrevlangname => "DEL" },
620 { name => "dje", sopentypelang => "DJR" },
621 { name => "dje-NE" },
622 { name => "doi", sopentypelang => "DGR" },
623 { name => "doi-IN", alias => "doi-Deva-IN" },
624 { name => "doi-Deva", file => "doi" },
625 { name => "doi-Deva-IN", file => "doi_IN" },
626 { name => "dsb", lcid => 0x00007c2e, sparent => "hsb", oemcp => 850, ebcdiccp => 870, sabbrevlangname => "DSB", sopentypelang => "LSB" },
627 { name => "dsb-DE", lcid => 0x0000082e },
628 { name => "dua" },
629 { name => "dua-CM" },
630 { name => "dv", lcid => 0x00000065, slist => "\x{060c}", group => 13, nativedigits => "0123456789" },
631 { name => "dv-MV", lcid => 0x00000465 },
632 { name => "dyo" },
633 { name => "dyo-SN" },
634 { name => "dz", sopentypelang => "DZN" },
635 { name => "dz-BT", lcid => 0x00000c51, sabbrevlangname => "ZZZ" },
636 { name => "ebu" },
637 { name => "ebu-KE" },
638 { name => "ee" },
639 { name => "ee-GH" },
640 { name => "ee-TG" },
641 { name => "el", lcid => 0x00000008, oemcp => 737, group => 4 },
642 { name => "el-CY" },
643 { name => "el-GR", lcid => 0x00000408 },
644 { name => "en", lcid => 0x00000009, oemcp => 437, slist => ",", sabbrevlangname => "ENU" },
645 { name => "en-001", oemcp => 850 },
646 { name => "en-029", lcid => 0x00002409, file => "en", oemcp => 850, sabbrevlangname => "ENB" },
647 { name => "en-150", oemcp => 65001 },
648 { name => "en-AE", lcid => 0x00004c09, oemcp => 65001, sabbrevlangname => "ZZZ" },
649 { name => "en-AG", oemcp => 850 },
650 { name => "en-AI", oemcp => 850 },
651 { name => "en-AS", oemcp => 850 },
652 { name => "en-AT", oemcp => 65001 },
653 { name => "en-AU", lcid => 0x00000c09, oemcp => 850, sabbrevlangname => "ENA" },
654 { name => "en-BB", oemcp => 850 },
655 { name => "en-BE", oemcp => 850 },
656 { name => "en-BI", oemcp => 65001 },
657 { name => "en-BM", oemcp => 850 },
658 { name => "en-BS", oemcp => 850 },
659 { name => "en-BW", oemcp => 850 },
660 { name => "en-BZ", lcid => 0x00002809, oemcp => 850, sabbrevlangname => "ENL" },
661 { name => "en-CA", lcid => 0x00001009, oemcp => 850, ebcdiccp => 37, sabbrevlangname => "ENC" },
662 { name => "en-CC", oemcp => 850 },
663 { name => "en-CH", oemcp => 65001 },
664 { name => "en-CK", oemcp => 850 },
665 { name => "en-CM", oemcp => 850 },
666 { name => "en-CX", oemcp => 850 },
667 { name => "en-CY", oemcp => 65001 },
668 { name => "en-DE", oemcp => 65001 },
669 { name => "en-DG", oemcp => 850 },
670 { name => "en-DK", oemcp => 65001 },
671 { name => "en-DM", oemcp => 850 },
672 { name => "en-ER", oemcp => 850 },
673 { name => "en-FI", oemcp => 65001 },
674 { name => "en-FJ", oemcp => 850 },
675 { name => "en-FK", oemcp => 850 },
676 { name => "en-FM", oemcp => 850 },
677 { name => "en-GB", lcid => 0x00000809, oemcp => 850, ebcdiccp => 20285, sabbrevlangname => "ENG" },
678 { name => "en-GD", oemcp => 850 },
679 { name => "en-GG", oemcp => 850 },
680 { name => "en-GH", oemcp => 850 },
681 { name => "en-GI", oemcp => 850 },
682 { name => "en-GM", oemcp => 850 },
683 { name => "en-GU", oemcp => 850 },
684 { name => "en-GY", oemcp => 850 },
685 { name => "en-HK", lcid => 0x00003c09, oemcp => 850, sabbrevlangname => "ENH" },
686 { name => "en-ID", lcid => 0x00003809, file => "en", oemcp => 850, sabbrevlangname => "ZZZ" },
687 { name => "en-IE", lcid => 0x00001809, oemcp => 850, sabbrevlangname => "ENI" },
688 { name => "en-IL", oemcp => 65001 },
689 { name => "en-IM", oemcp => 850 },
690 { name => "en-IN", lcid => 0x00004009, sabbrevlangname => "ENN" },
691 { name => "en-IO", oemcp => 850 },
692 { name => "en-JE", oemcp => 850 },
693 { name => "en-JM", lcid => 0x00002009, oemcp => 850, sabbrevlangname => "ENJ" },
694 { name => "en-KE", oemcp => 850 },
695 { name => "en-KI", oemcp => 850 },
696 { name => "en-KN", oemcp => 850 },
697 { name => "en-KY", oemcp => 850 },
698 { name => "en-LC", oemcp => 850 },
699 { name => "en-LR", oemcp => 850 },
700 { name => "en-LS", oemcp => 850 },
701 { name => "en-MG", oemcp => 850 },
702 { name => "en-MH", oemcp => 850 },
703 { name => "en-MO", oemcp => 850 },
704 { name => "en-MP", oemcp => 850 },
705 { name => "en-MS", oemcp => 850 },
706 { name => "en-MT", oemcp => 850 },
707 { name => "en-MU", oemcp => 850 },
708 { name => "en-MW", oemcp => 850 },
709 { name => "en-MY", lcid => 0x00004409, sabbrevlangname => "ENM" },
710 { name => "en-NA", oemcp => 850 },
711 { name => "en-NF", oemcp => 850 },
712 { name => "en-NG", oemcp => 850 },
713 { name => "en-NL", oemcp => 65001 },
714 { name => "en-NR", oemcp => 850 },
715 { name => "en-NU", oemcp => 850 },
716 { name => "en-NZ", lcid => 0x00001409, oemcp => 850, sabbrevlangname => "ENZ" },
717 { name => "en-PG", oemcp => 850 },
718 { name => "en-PH", lcid => 0x00003409, ebcdiccp => 500, sabbrevlangname => "ENP" },
719 { name => "en-PK", oemcp => 850 },
720 { name => "en-PN", oemcp => 850 },
721 { name => "en-PR", oemcp => 850 },
722 { name => "en-PW", oemcp => 850 },
723 { name => "en-RW", oemcp => 850 },
724 { name => "en-SB", oemcp => 850 },
725 { name => "en-SC", oemcp => 850 },
726 { name => "en-SD", oemcp => 850 },
727 { name => "en-SE", oemcp => 65001 },
728 { name => "en-SG", lcid => 0x00004809, sabbrevlangname => "ENE" },
729 { name => "en-SH", oemcp => 850 },
730 { name => "en-SI", oemcp => 65001 },
731 { name => "en-SL", oemcp => 850 },
732 { name => "en-SS", oemcp => 850 },
733 { name => "en-SX", oemcp => 850 },
734 { name => "en-SZ", oemcp => 850 },
735 { name => "en-TC", oemcp => 850 },
736 { name => "en-TK", oemcp => 850 },
737 { name => "en-TO", oemcp => 850 },
738 { name => "en-TT", lcid => 0x00002c09, oemcp => 850, sabbrevlangname => "ENT" },
739 { name => "en-TV", oemcp => 850 },
740 { name => "en-TZ", oemcp => 850 },
741 { name => "en-UG", oemcp => 850 },
742 { name => "en-UM", oemcp => 850 },
743 { name => "en-US", lcid => 0x00000409 },
744 { name => "en-VC", oemcp => 850 },
745 { name => "en-VG", oemcp => 850 },
746 { name => "en-VI", oemcp => 850 },
747 { name => "en-VU", oemcp => 850 },
748 { name => "en-WS", oemcp => 850 },
749 { name => "en-ZA", lcid => 0x00001c09, ebcdiccp => 500, sabbrevlangname => "ENS" },
750 { name => "en-ZM", oemcp => 850 },
751 { name => "en-ZW", lcid => 0x00003009, ebcdiccp => 500, sabbrevlangname => "ENW" },
752 { name => "eo", sopentypelang => "NTO" },
753 { name => "eo-001" },
754 { name => "es", lcid => 0x0000000a, oemcp => 850, ebcdiccp => 20284, sabbrevlangname => "ESP", sopentypelang => "ESP" },
755 { name => "es-419", lcid => 0x0000580a, sabbrevlangname => "ESJ" },
756 { name => "es-AR", lcid => 0x00002c0a, sabbrevlangname => "ESS" },
757 { name => "es-BO", lcid => 0x0000400a, sabbrevlangname => "ESB" },
758 { name => "es-BR", oemcp => 65001 },
759 { name => "es-BZ", oemcp => 65001 },
760 { name => "es-CL", lcid => 0x0000340a, sabbrevlangname => "ESL" },
761 { name => "es-CO", lcid => 0x0000240a, sabbrevlangname => "ESO" },
762 { name => "es-CR", lcid => 0x0000140a, sabbrevlangname => "ESC" },
763 { name => "es-CU", lcid => 0x00005c0a, sabbrevlangname => "ESK" },
764 { name => "es-DO", lcid => 0x00001c0a, sabbrevlangname => "ESD" },
765 { name => "es-EA" },
766 { name => "es-EC", lcid => 0x0000300a, sabbrevlangname => "ESF" },
767 { name => "es-ES", lcid => 0x00000c0a, sabbrevlangname => "ESN" },
768 { name => "es-ES_tradnl", lcid => 0x0000040a, file => "es_ES" },
769 { name => "es-ES-u-co-trad", alias => "es-ES_tradnl" },
770 { name => "es-GQ" },
771 { name => "es-GT", lcid => 0x0000100a, sabbrevlangname => "ESG" },
772 { name => "es-HN", lcid => 0x0000480a, sabbrevlangname => "ESH" },
773 { name => "es-IC" },
774 { name => "es-MX", lcid => 0x0000080a, sabbrevlangname => "ESM" },
775 { name => "es-NI", lcid => 0x00004c0a, sabbrevlangname => "ESI" },
776 { name => "es-PA", lcid => 0x0000180a, sabbrevlangname => "ESA" },
777 { name => "es-PE", lcid => 0x0000280a, sabbrevlangname => "ESR" },
778 { name => "es-PH" },
779 { name => "es-PR", lcid => 0x0000500a, sabbrevlangname => "ESU" },
780 { name => "es-PY", lcid => 0x00003c0a, sabbrevlangname => "ESZ" },
781 { name => "es-SV", lcid => 0x0000440a, sabbrevlangname => "ESE" },
782 { name => "es-US", lcid => 0x0000540a, sabbrevlangname => "EST" },
783 { name => "es-UY", lcid => 0x0000380a, sabbrevlangname => "ESY" },
784 { name => "es-VE", lcid => 0x0000200a, sabbrevlangname => "ESV" },
785 { name => "et", lcid => 0x00000025, oemcp => 775, group => 3, sabbrevlangname => "ETI", sopentypelang => "ETI" },
786 { name => "et-EE", lcid => 0x00000425 },
787 { name => "eu", lcid => 0x0000002d, oemcp => 850, maccp => 65001, sabbrevlangname => "EUQ", sopentypelang => "EUQ" },
788 { name => "eu-ES", lcid => 0x0000042d },
789 { name => "ewo" },
790 { name => "ewo-CM" },
791 { name => "fa", lcid => 0x00000029, inegnumber => 3, oemcp => 720, slist => "\x{061b}", group => 13, sabbrevlangname => "FAR", sopentypelang => "FAR" },
792 { name => "fa-AF", alias => "prs-AF" },
793 { name => "fa-IR", lcid => 0x00000429 },
794 { name => "ff", lcid => 0x00000067, oemcp => 850, ebcdiccp => 20297 },
795 { name => "ff-CM", alias => "ff-Latn-CM" },
796 { name => "ff-GN", alias => "ff-Latn-GN" },
797 { name => "ff-MR", alias => "ff-Latn-MR" },
798 { name => "ff-NG", alias => "ff-Latn-NG" },
799 { name => "ff-SN", alias => "ff-Latn-SN" },
800 { name => "ff-Adlm", oemcp => 65001 },
801 { name => "ff-Adlm-BF" },
802 { name => "ff-Adlm-CM" },
803 { name => "ff-Adlm-GH" },
804 { name => "ff-Adlm-GM" },
805 { name => "ff-Adlm-GN" },
806 { name => "ff-Adlm-GW" },
807 { name => "ff-Adlm-LR" },
808 { name => "ff-Adlm-MR" },
809 { name => "ff-Adlm-NE" },
810 { name => "ff-Adlm-NG" },
811 { name => "ff-Adlm-SL" },
812 { name => "ff-Adlm-SN" },
813 { name => "ff-Latn", lcid => 0x00007c67 },
814 { name => "ff-Latn-BF", oemcp => 65001 },
815 { name => "ff-Latn-CM" },
816 { name => "ff-Latn-GH", oemcp => 65001 },
817 { name => "ff-Latn-GM", oemcp => 65001 },
818 { name => "ff-Latn-GN" },
819 { name => "ff-Latn-GW", oemcp => 65001 },
820 { name => "ff-Latn-LR", oemcp => 65001 },
821 { name => "ff-Latn-MR" },
822 { name => "ff-Latn-NE", oemcp => 65001 },
823 { name => "ff-Latn-NG", lcid => 0x00000467, sabbrevlangname => "ZZZ" },
824 { name => "ff-Latn-SL", oemcp => 65001 },
825 { name => "ff-Latn-SN", lcid => 0x00000867 },
826 { name => "fi", lcid => 0x0000000b, oemcp => 850, ebcdiccp => 20278 },
827 { name => "fi-FI", lcid => 0x0000040b },
828 { name => "fil", lcid => 0x00000064, oemcp => 437, ebcdiccp => 500, sabbrevlangname => "FPO", sopentypelang => "PIL" },
829 { name => "fil-PH", lcid => 0x00000464 },
830 { name => "fil-Latn", alias => "fil" },
831 { name => "fil-Latn-PH", alias => "fil-PH" },
832 { name => "fo", lcid => 0x00000038, oemcp => 850, maccp => 10079, ebcdiccp => 20277, sabbrevlangname => "FOS", sopentypelang => "FOS" },
833 { name => "fo-DK", oemcp => 65001, maccp => 65001 },
834 { name => "fo-FO", lcid => 0x00000438 },
835 { name => "fr", lcid => 0x0000000c, oemcp => 850, ebcdiccp => 20297 },
836 { name => "fr-029", lcid => 0x00001c0c, file => "fr", sabbrevlangname => "ZZZ" },
837 { name => "fr-BE", lcid => 0x0000080c, sabbrevlangname => "FRB" },
838 { name => "fr-BF" },
839 { name => "fr-BI" },
840 { name => "fr-BJ" },
841 { name => "fr-BL" },
842 { name => "fr-CA", lcid => 0x00000c0c, sabbrevlangname => "FRC" },
843 { name => "fr-CD", lcid => 0x0000240c, sabbrevlangname => "FRD" },
844 { name => "fr-CF" },
845 { name => "fr-CG" },
846 { name => "fr-CH", lcid => 0x0000100c, sabbrevlangname => "FRS" },
847 { name => "fr-CI", lcid => 0x0000300c, sabbrevlangname => "FRI" },
848 { name => "fr-CM", lcid => 0x00002c0c, sabbrevlangname => "FRE" },
849 { name => "fr-DJ" },
850 { name => "fr-DZ" },
851 { name => "fr-FR", lcid => 0x0000040c },
852 { name => "fr-GA" },
853 { name => "fr-GF" },
854 { name => "fr-GN" },
855 { name => "fr-GP" },
856 { name => "fr-GQ" },
857 { name => "fr-HT", lcid => 0x00003c0c, sabbrevlangname => "FRH" },
858 { name => "fr-KM" },
859 { name => "fr-LU", lcid => 0x0000140c, sabbrevlangname => "FRL" },
860 { name => "fr-MA", lcid => 0x0000380c, sabbrevlangname => "FRO" },
861 { name => "fr-MC", lcid => 0x0000180c, sabbrevlangname => "FRM" },
862 { name => "fr-MF" },
863 { name => "fr-MG" },
864 { name => "fr-ML", lcid => 0x0000340c, sabbrevlangname => "FRF" },
865 { name => "fr-MQ" },
866 { name => "fr-MR" },
867 { name => "fr-MU" },
868 { name => "fr-NC" },
869 { name => "fr-NE" },
870 { name => "fr-PF" },
871 { name => "fr-PM" },
872 { name => "fr-RE", lcid => 0x0000200c, sabbrevlangname => "FRR" },
873 { name => "fr-RW" },
874 { name => "fr-SC" },
875 { name => "fr-SN", lcid => 0x0000280c, sabbrevlangname => "FRN" },
876 { name => "fr-SY" },
877 { name => "fr-TD" },
878 { name => "fr-TG" },
879 { name => "fr-TN" },
880 { name => "fr-VU" },
881 { name => "fr-WF" },
882 { name => "fr-YT" },
883 { name => "fur", sopentypelang => "FRL" },
884 { name => "fur-IT" },
885 { name => "fuv-NG", alias => "ff-Latn-NG" },
886 { name => "fy", lcid => 0x00000062, oemcp => 850, sabbrevlangname => "FYN", sopentypelang => "FRI" },
887 { name => "fy-NL", lcid => 0x00000462 },
888 { name => "ga", lcid => 0x0000003c, oemcp => 850, sabbrevlangname => "IRE", sopentypelang => "IRI" },
889 { name => "ga-GB", oemcp => 65001 },
890 { name => "ga-IE", lcid => 0x0000083c },
891 { name => "gd", lcid => 0x00000091, oemcp => 850, ebcdiccp => 20285, sopentypelang => "GAE" },
892 { name => "gd-GB", lcid => 0x00000491 },
893 { name => "gd-Latn", alias => "gd" },
894 { name => "gl", lcid => 0x00000056, oemcp => 850, sabbrevlangname => "GLC", sopentypelang => "GAL" },
895 { name => "gl-ES", lcid => 0x00000456 },
896 { name => "gn", lcid => 0x00000074, oemcp => 850, ebcdiccp => 20284, slist => ",", sopentypelang => "GUA" },
897 { name => "gn-PY", lcid => 0x00000474 },
898 { name => "gsw", lcid => 0x00000084, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "ZZZ", sopentypelang => "ALS" },
899 { name => "gsw-CH" },
900 { name => "gsw-FR", lcid => 0x00000484, sabbrevlangname => "GSW" },
901 { name => "gsw-LI" },
902 { name => "gu", lcid => 0x00000047, slist => ",", group => 15 },
903 { name => "gu-IN", lcid => 0x00000447 },
904 { name => "guz" },
905 { name => "guz-KE" },
906 { name => "gv", sopentypelang => "MNX" },
907 { name => "gv-GB", file => "gv" },
908 { name => "gv-IM" },
909 { name => "ha", lcid => 0x00000068, oemcp => 437 },
910 { name => "ha-GH", alias => "ha-Latn-GH" },
911 { name => "ha-Latn", lcid => 0x00007c68, file => "ha" },
912 { name => "ha-Latn-GH", file => "ha_GH", ebcdiccp => 500 },
913 { name => "ha-Latn-NE", file => "ha_NE", ebcdiccp => 500 },
914 { name => "ha-Latn-NG", lcid => 0x00000468, file => "ha_NG" },
915 { name => "ha-NE", alias => "ha-Latn-NE" },
916 { name => "ha-NG", alias => "ha-Latn-NG" },
917 { name => "haw", lcid => 0x00000075, oemcp => 437 },
918 { name => "haw-Latn", alias => "haw" },
919 { name => "haw-Latn-US", alias => "haw-US" },
920 { name => "haw-US", lcid => 0x00000475 },
921 { name => "he", lcid => 0x0000000d, oemcp => 862, slist => ",", group => 12, sopentypelang => "IWR" },
922 { name => "he-IL", lcid => 0x0000040d },
923 { name => "hi", lcid => 0x00000039, slist => ",", group => 15 },
924 { name => "hi-IN", lcid => 0x00000439 },
925 { name => "hr", lcid => 0x0000001a, inegnumber => 2, oemcp => 852, maccp => 10082, group => 2 },
926 { name => "hr-BA", lcid => 0x0000101a, ebcdiccp => 870, inegnumber => 1, sabbrevlangname => "HRB" },
927 { name => "hr-HR", lcid => 0x0000041a },
928 { name => "hsb", lcid => 0x0000002e, oemcp => 850, ebcdiccp => 870, sopentypelang => "USB" },
929 { name => "hsb-DE", lcid => 0x0000042e },
930 { name => "hu", lcid => 0x0000000e, oemcp => 852, group => 2 },
931 { name => "hu-HU", lcid => 0x0000040e },
932 { name => "hu-HU_technl", lcid => 0x0001040e, alias => "hu-HU" },
933 { name => "hy", lcid => 0x0000002b, slist => ",", group => 17 },
934 { name => "hy-AM", lcid => 0x0000042b },
935 { name => "ia" },
936 { name => "ia-001" },
937 ## name => "ibb", lcid => 0x00000069 },
938 ## name => "ibb-NG", lcid => 0x00000469 },
939 { name => "id", lcid => 0x00000021, oemcp => 850 },
940 { name => "id-ID", lcid => 0x00000421 },
941 { name => "ig", lcid => 0x00000070, oemcp => 437 },
942 { name => "ig-Latn", alias => "ig" },
943 { name => "ig-Latn-NG", alias => "ig-NG" },
944 { name => "ig-NG", lcid => 0x00000470 },
945 { name => "ii", lcid => 0x00000078, group => 9, sopentypelang => "YIM" },
946 { name => "ii-CN", lcid => 0x00000478 },
947 { name => "ii-Yiii", alias => "ii" },
948 { name => "ii-Yiii-CN", alias => "ii-CN" },
949 { name => "is", lcid => 0x0000000f, oemcp => 850, maccp => 10079, ebcdiccp => 20871 },
950 { name => "is-IS", lcid => 0x0000040f },
951 { name => "it", lcid => 0x00000010, oemcp => 850, ebcdiccp => 20280 },
952 { name => "it-CH", lcid => 0x00000810, ebcdiccp => 500, sabbrevlangname => "ITS" },
953 { name => "it-IT", lcid => 0x00000410 },
954 { name => "it-SM" },
955 { name => "it-VA", oemcp => 65001 },
956 { name => "iu", lcid => 0x0000005d, oemcp => 437, slist => ",", sortlocale => "iu-Latn-CA", sabbrevlangname => "IUK", sopentypelang => "INU" },
957 { name => "iu-Cans", lcid => 0x0000785d, file => "iu", oemcp => 65001, sabbrevlangname => "IUS" },
958 { name => "iu-Cans-CA", lcid => 0x0000045d, file => "iu_CA" },
959 { name => "iu-Latn", lcid => 0x00007c5d },
960 { name => "iu-Latn-CA", lcid => 0x0000085d },
961 { name => "ja", lcid => 0x00000011, ireadinglayout => 2, oemcp => 932, slist => ",", sscripts => "Hani Hira Jpan Kana", group => 7, sopentypelang => "JAN" },
962 { name => "ja-JP", lcid => 0x00000411 },
963 { name => "ja-JP_radstr", lcid => 0x00040411, alias => "ja-JP" },
964 { name => "ja-JP-u-co-unihan", alias => "ja-JP_radstr" },
965 { name => "jgo" },
966 { name => "jgo-CM" },
967 { name => "jmc" },
968 { name => "jmc-TZ" },
969 { name => "jv", oemcp => 850, nativedigits => "0123456789" },
970 { name => "jv-ID", alias => "jv-Latn-ID" },
971 ## name => "jv-Java" },
972 ## name => "jv-Java-ID" },
973 { name => "jv-Latn", file => "jv" },
974 { name => "jv-Latn-ID", file => "jv_ID" },
975 { name => "ka", lcid => 0x00000037, group => 16 },
976 { name => "ka-GE", lcid => 0x00000437 },
977 { name => "ka-GE_modern", lcid => 0x00010437, alias => "ka-GE" },
978 { name => "kab", sopentypelang => "KAB0" },
979 { name => "kab-DZ" },
980 { name => "kam", sopentypelang => "KMB" },
981 { name => "kam-KE" },
982 { name => "kde" },
983 { name => "kde-TZ" },
984 { name => "kea" },
985 { name => "kea-CV" },
986 { name => "kgp" },
987 { name => "kgp-BR" },
988 { name => "khq" },
989 { name => "khq-ML" },
990 { name => "ki" },
991 { name => "ki-KE" },
992 { name => "kk", lcid => 0x0000003f, group => 5, sabbrevlangname => "KKZ" },
993 { name => "kk-Cyrl", alias => "kk" },
994 { name => "kk-Cyrl-KZ", alias => "kk-KZ" },
995 { name => "kk-KZ", lcid => 0x0000043f },
996 { name => "kkj" },
997 { name => "kkj-CM" },
998 { name => "kl", lcid => 0x0000006f, oemcp => 850, ebcdiccp => 20277, sopentypelang => "GRN" },
999 { name => "kl-GL", lcid => 0x0000046f },
1000 { name => "kln", sopentypelang => "KAL" },
1001 { name => "kln-KE" },
1002 { name => "km", lcid => 0x00000053, inegnumber => 2, slist => ",", group => 15 },
1003 { name => "km-KH", lcid => 0x00000453 },
1004 { name => "kn", lcid => 0x0000004b, slist => ",", group => 15, sabbrevlangname => "KDI" },
1005 { name => "kn-IN", lcid => 0x0000044b },
1006 { name => "ko", lcid => 0x00000012, ireadinglayout => 2, slist => ",", oemcp => 949, ebcdiccp => 20833, sscripts => "Hang Hani Kore", group => 8 },
1007 { name => "ko-KP", oemcp => 65001 },
1008 { name => "ko-KR", lcid => 0x00000412 },
1009 { name => "kok", lcid => 0x00000057, slist => ",", group => 15, sabbrevlangname => "KNK" },
1010 { name => "kok-IN", lcid => 0x00000457 },
1011 { name => "kr", lcid => 0x00000071, sortlocale => "kr-Latn-NG", oemcp => 850, dir => "exemplars", sabbrevlangname => "ZZZ", sopentypelang => "KNR" },
1012 { name => "kr-Latn", file => "kr", dir => "exemplars" },
1013 { name => "kr-Latn-NG", lcid => 0x00000471, file => "kr", dir => "exemplars" },
1014 { name => "kr-NG", alias => "kr-Latn-NG" },
1015 { name => "ks", lcid => 0x00000060, group => 15, sabbrevlangname => "ZZZ", sopentypelang => "KSH" },
1016 { name => "ks-Arab", lcid => 0x00000460 },
1017 { name => "ks-Arab-IN" },
1018 { name => "ks-Deva", slist => "," },
1019 { name => "ks-Deva-IN", lcid => 0x00000860 },
1020 { name => "ks-IN", alias => "ks-Arab-IN" },
1021 { name => "ksb" },
1022 { name => "ksb-TZ" },
1023 { name => "ksf" },
1024 { name => "ksf-CM" },
1025 { name => "ksh", sopentypelang => "KSH0" },
1026 { name => "ksh-DE" },
1027 { name => "ku", lcid => 0x00000092, file => "ckb", slist => "\x{061b}", sortlocale => "ku-Arab-IQ", oemcp => 720 },
1028 { name => "ku-Arab", lcid => 0x00007c92, file => "ckb", group => 13 },
1029 { name => "ku-Arab-IQ", lcid => 0x00000492, file => "ckb_IQ" },
1030 { name => "ku-Arab-IR", file => "ckb_IR", oemcp => 65001 },
1031 { name => "kw" },
1032 { name => "kw-GB" },
1033 { name => "ky", lcid => 0x00000040, oemcp => 866, group => 5, sabbrevlangname => "KYR" },
1034 { name => "ky-Cyrl", alias => "ky" },
1035 { name => "ky-Cyrl-KG", alias => "ky-KG" },
1036 { name => "ky-KG", lcid => 0x00000440 },
1037 { name => "la", lcid => 0x00000076, oemcp => 437, slist => ",", sabbrevlangname => "ZZZ" },
1038 { name => "la-VA", lcid => 0x00000476 },
1039 { name => "la-001", alias => "la-VA" },
1040 { name => "lag" },
1041 { name => "lag-TZ" },
1042 { name => "lb", lcid => 0x0000006e, oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "LBX" },
1043 { name => "lb-LU", lcid => 0x0000046e },
1044 { name => "lg" },
1045 { name => "lg-UG" },
1046 { name => "lkt" },
1047 { name => "lkt-US" },
1048 { name => "ln" },
1049 { name => "ln-AO" },
1050 { name => "ln-CD" },
1051 { name => "ln-CF" },
1052 { name => "ln-CG" },
1053 { name => "lo", lcid => 0x00000054, group => 15 },
1054 { name => "lo-LA", lcid => 0x00000454 },
1055 { name => "lrc" },
1056 { name => "lrc-IQ" },
1057 { name => "lrc-IR" },
1058 { name => "lt", lcid => 0x00000027, oemcp => 775, group => 3, sabbrevlangname => "LTH", sopentypelang => "LTH" },
1059 { name => "lt-LT", lcid => 0x00000427 },
1060 { name => "lu" },
1061 { name => "lu-CD" },
1062 { name => "luo" },
1063 { name => "luo-KE" },
1064 { name => "luy", sopentypelang => "LUH" },
1065 { name => "luy-KE" },
1066 { name => "lv", lcid => 0x00000026, oemcp => 775, group => 3, sabbrevlangname => "LVI", sopentypelang => "LVI" },
1067 { name => "lv-LV", lcid => 0x00000426 },
1068 { name => "mai" },
1069 { name => "mai-IN" },
1070 { name => "mas" },
1071 { name => "mas-KE" },
1072 { name => "mas-TZ" },
1073 { name => "mer" },
1074 { name => "mer-KE" },
1075 { name => "mfe" },
1076 { name => "mfe-MU" },
1077 { name => "mg" },
1078 { name => "mg-MG" },
1079 { name => "mgh" },
1080 { name => "mgh-MZ" },
1081 { name => "mgo" },
1082 { name => "mgo-CM" },
1083 { name => "mi", lcid => 0x00000081, slist => "," },
1084 { name => "mi-Latn", alias => "mi" },
1085 { name => "mi-Latn-NZ", alias => "mi-NZ" },
1086 { name => "mi-NZ", lcid => 0x00000481 },
1087 { name => "mk", lcid => 0x0000002f, oemcp => 866, ebcdiccp => 500, group => 5, sabbrevlangname => "MKI" },
1088 { name => "mk-MK", lcid => 0x0000042f },
1089 { name => "ml", lcid => 0x0000004c, group => 15, sabbrevlangname => "MYM", sopentypelang => "MLR" },
1090 { name => "ml-IN", lcid => 0x0000044c },
1091 { name => "mn", lcid => 0x00000050, oemcp => 866, sopentypelang => "MNG" },
1092 { name => "mn-Cyrl", lcid => 0x00007850, file => "mn", sabbrevlangname => "MNN" },
1093 { name => "mn-Cyrl-MN", alias => "mn-MN" },
1094 { name => "mn-MN", lcid => 0x00000450, sparent => "mn-Cyrl", group => 5 },
1095 { name => "mn-Mong", lcid => 0x00007c50, oemcp => 65001, slist => ",", group => 15, sabbrevlangname => "MNG", nativedigits => "0123456789" },
1096 { name => "mn-Mong-CN", lcid => 0x00000850 },
1097 { name => "mn-Mong-MN", lcid => 0x00000c50, sabbrevlangname => "MNM" },
1098 { name => "mni", lcid => 0x00000058, slist => ",", sabbrevlangname => "ZZZ" },
1099 { name => "mni-IN", lcid => 0x00000458, file => "mni_Beng_IN" },
1100 { name => "mni-Beng" },
1101 { name => "mni-Beng-IN", alias => "mni-IN" },
1102 { name => "moh", lcid => 0x0000007c, oemcp => 850, ebcdiccp => 37, slist => ",", sabbrevlangname => "MWK" },
1103 { name => "moh-CA", lcid => 0x0000047c },
1104 { name => "moh-Latn", alias => "moh" },
1105 { name => "moh-Latn-CA", alias => "moh-CA" },
1106 { name => "mr", lcid => 0x0000004e, slist => ",", group => 15 },
1107 { name => "mr-IN", lcid => 0x0000044e },
1108 { name => "ms", lcid => 0x0000003e, oemcp => 850, sabbrevlangname => "MSL", sopentypelang => "MLY" },
1109 { name => "ms-BN", lcid => 0x0000083e, sabbrevlangname => "MSB" },
1110 { name => "ms-ID" },
1111 { name => "ms-Latn", alias => "ms" },
1112 { name => "ms-Latn-BN", alias => "ms-BN" },
1113 { name => "ms-Latn-MY", alias => "ms-MY" },
1114 { name => "ms-Latn-SG", alias => "ms-SG" },
1115 { name => "ms-MY", lcid => 0x0000043e },
1116 { name => "ms-SG" },
1117 { name => "mt", lcid => 0x0000003a, sopentypelang => "MTS" },
1118 { name => "mt-MT", lcid => 0x0000043a },
1119 { name => "mua" },
1120 { name => "mua-CM" },
1121 { name => "my", lcid => 0x00000055, sopentypelang => "BRM" },
1122 { name => "my-MM", lcid => 0x00000455 },
1123 { name => "mzn" },
1124 { name => "mzn-IR" },
1125 { name => "naq" },
1126 { name => "naq-NA" },
1127 { name => "nb", lcid => 0x00007c14, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NOR", sopentypelang => "NOR" },
1128 { name => "nb-NO", lcid => 0x00000414 },
1129 { name => "nb-SJ" },
1130 { name => "nd", sopentypelang => "NDB" },
1131 { name => "nd-ZW" },
1132 { name => "nds" },
1133 { name => "nds-DE" },
1134 { name => "nds-NL" },
1135 { name => "ne", lcid => 0x00000061, slist => "," },
1136 { name => "ne-IN", lcid => 0x00000861, sabbrevlangname => "NEI" },
1137 { name => "ne-NP", lcid => 0x00000461, group => 15 },
1138 { name => "nl", lcid => 0x00000013, oemcp => 850 },
1139 { name => "nl-AW" },
1140 { name => "nl-BE", lcid => 0x00000813, sabbrevlangname => "NLB" },
1141 { name => "nl-BQ" },
1142 { name => "nl-CW" },
1143 { name => "nl-NL", lcid => 0x00000413 },
1144 { name => "nl-SR" },
1145 { name => "nl-SX" },
1146 { name => "nmg" },
1147 { name => "nmg-CM" },
1148 { name => "nn", lcid => 0x00007814, oemcp => 850, ebcdiccp => 20277, sabbrevlangname => "NON", sopentypelang => "NYN" },
1149 { name => "nn-NO", lcid => 0x00000814 },
1150 { name => "nnh" },
1151 { name => "nnh-CM" },
1152 { name => "no", lcid => 0x00000014, oemcp => 850, ebcdiccp => 20277, sortlocale => "nb-NO" },
1153 { name => "nqo", idigits => 3, inegnumber => 3, slist => "\x{060c}", sopentypelang => "NKO" },
1154 { name => "nqo-GN" },
1155 { name => "nr", sopentypelang => "NDB" },
1156 { name => "nr-ZA" },
1157 { name => "nso", lcid => 0x0000006c, oemcp => 850, sopentypelang => "SOT" },
1158 { name => "nso-ZA", lcid => 0x0000046c },
1159 { name => "nus" },
1160 { name => "nus-SD", alias => "nus-SS" },
1161 { name => "nus-SS" },
1162 { name => "nyn", sopentypelang => "NKL" },
1163 { name => "nyn-UG" },
1164 { name => "oc", lcid => 0x00000082, oemcp => 850, ebcdiccp => 20297 },
1165 { name => "oc-FR", lcid => 0x00000482 },
1166 { name => "oc-Latn", alias => "oc" },
1167 { name => "oc-Latn-FR", alias => "oc-FR" },
1168 { name => "om", lcid => 0x00000072, sopentypelang => "ORO" },
1169 { name => "om-ET", lcid => 0x00000472 },
1170 { name => "om-KE" },
1171 { name => "or", lcid => 0x00000048, slist => ",", group => 15 },
1172 { name => "or-IN", lcid => 0x00000448 },
1173 { name => "os" },
1174 { name => "os-GE" },
1175 { name => "os-RU" },
1176 { name => "pa", lcid => 0x00000046, slist => "," },
1177 { name => "pa-Arab", lcid => 0x00007c46, slist => ";", inegnumber => 2, oemcp => 720, group => 13, sabbrevlangname => "PAP" },
1178 { name => "pa-Arab-PK", lcid => 0x00000846 },
1179 { name => "pa-Guru" },
1180 { name => "pa-Guru-IN", alias => "pa-IN" },
1181 { name => "pa-IN", lcid => 0x00000446, sparent => "pa-Guru", file => "pa_Guru_IN", group => 15 },
1182 { name => "pap", lcid => 0x00000079, oemcp => 850, sopentypelang => "PAP0" },
1183 ## name => "pap-029", lcid => 0x00000479 },
1184 { name => "pcm" },
1185 { name => "pcm-NG", alias => "pcm-Latn-NG" },
1186 { name => "pcm-Latn", file => "pcm" },
1187 { name => "pcm-Latn-NG", file => "pcm_NG" },
1188 { name => "pl", lcid => 0x00000015, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "PLK", sopentypelang => "PLK" },
1189 { name => "pl-PL", lcid => 0x00000415 },
1190 { name => "prg" },
1191 { name => "prg-001" },
1192 { name => "prs", lcid => 0x0000008c, file => "fa", inegnumber => 3, oemcp => 720, group => 13, sopentypelang => "DRI" },
1193 { name => "prs-AF", lcid => 0x0000048c, file => "fa_AF" },
1194 { name => "prs-Arab", alias => "prs" },
1195 { name => "prs-Arab-AF", alias => "prs-AF" },
1196 { name => "ps", lcid => 0x00000063, group => 13, sabbrevlangname => "PAS", sopentypelang => "PAS" },
1197 { name => "ps-AF", lcid => 0x00000463 },
1198 { name => "ps-PK" },
1199 { name => "pt", lcid => 0x00000016, oemcp => 850, sabbrevlangname => "PTB", sopentypelang => "PTG" },
1200 { name => "pt-AO" },
1201 { name => "pt-BR", lcid => 0x00000416 },
1202 { name => "pt-CH", oemcp => 65001 },
1203 { name => "pt-CV" },
1204 { name => "pt-GQ", oemcp => 65001 },
1205 { name => "pt-GW" },
1206 { name => "pt-LU", oemcp => 65001 },
1207 { name => "pt-MO" },
1208 { name => "pt-MZ" },
1209 { name => "pt-PT", lcid => 0x00000816, sabbrevlangname => "PTG" },
1210 { name => "pt-ST" },
1211 { name => "pt-TL" },
1212 ## name => "qps-Latn-x-sh", lcid => 0x80000901 },
1213 ## name => "qps-ploc", lcid => 0x80000501 },
1214 ## name => "qps-ploca", lcid => 0x800005fe },
1215 ## name => "qps-plocm", lcid => 0x800009ff },
1216 { name => "qu", alias => "quz" },
1217 { name => "qu-BO", alias => "quz-BO" },
1218 { name => "qu-EC", alias => "quz-EC" },
1219 { name => "qu-PE", alias => "quz-PE" },
1220 { name => "quc", lcid => 0x00000086, oemcp => 850, ebcdiccp => 20284, slist => "," },
1221 { name => "quc-Latn", lcid => 0x00007c86, file => "quc" },
1222 { name => "quc-Latn-GT", lcid => 0x00000486, file => "quc_GT" },
1223 { name => "qut", alias => "quc" },
1224 { name => "qut-GT", alias => "quc-Latn-GT" },
1225 { name => "quz", lcid => 0x0000006b, file => "qu", territory => "BO", oemcp => 850, ebcdiccp => 20284, slist => "," },
1226 { name => "quz-BO", lcid => 0x0000046b, file => "qu_BO" },
1227 { name => "quz-EC", lcid => 0x0000086b, file => "qu_EC" },
1228 { name => "quz-Latn", alias => "quz" },
1229 { name => "quz-Latn-BO", alias => "quz-BO" },
1230 { name => "quz-Latn-EC", alias => "quz-EC" },
1231 { name => "quz-Latn-PE", alias => "quz-PE" },
1232 { name => "quz-PE", lcid => 0x00000c6b, file => "qu_PE" },
1233 { name => "rm", lcid => 0x00000017, oemcp => 850, ebcdiccp => 20273, sabbrevlangname => "RMC", sopentypelang => "RMS" },
1234 { name => "rm-CH", lcid => 0x00000417 },
1235 { name => "rn" },
1236 { name => "rn-BI" },
1237 { name => "ro", lcid => 0x00000018, oemcp => 852, ebcdiccp => 20880, sabbrevlangname => "ROM", sopentypelang => "ROM" },
1238 { name => "ro-MD", lcid => 0x00000818, maccp => 65001, sabbrevlangname => "ROD" },
1239 { name => "ro-RO", lcid => 0x00000418, group => 2 },
1240 { name => "rof" },
1241 { name => "rof-TZ" },
1242 { name => "ru", lcid => 0x00000019, oemcp => 866 },
1243 { name => "ru-BY", maccp => 65001 },
1244 { name => "ru-KG", maccp => 65001 },
1245 { name => "ru-KZ", maccp => 65001 },
1246 { name => "ru-MD", lcid => 0x00000819, maccp => 65001, sabbrevlangname => "RUM" },
1247 { name => "ru-RU", lcid => 0x00000419, group => 5 },
1248 { name => "ru-UA", maccp => 65001 },
1249 { name => "rw", lcid => 0x00000087, oemcp => 437, sopentypelang => "RUA" },
1250 { name => "rw-RW", lcid => 0x00000487 },
1251 { name => "rwk" },
1252 { name => "rwk-TZ" },
1253 { name => "sa", lcid => 0x0000004f, slist => ",", group => 15 },
1254 { name => "sa-Deva", alias => "sa" },
1255 { name => "sa-Deva-IN", alias => "sa-IN" },
1256 { name => "sa-IN", lcid => 0x0000044f },
1257 { name => "sah", lcid => 0x00000085, oemcp => 866, group => 5, sopentypelang => "YAK" },
1258 { name => "sah-Cyrl", alias => "sah" },
1259 { name => "sah-Cyrl-RU", alias => "sah-RU" },
1260 { name => "sah-RU", lcid => 0x00000485 },
1261 { name => "saq" },
1262 { name => "saq-KE" },
1263 { name => "sat" },
1264 { name => "sat-Olck" },
1265 { name => "sat-Olck-IN" },
1266 { name => "sbp" },
1267 { name => "sbp-TZ" },
1268 { name => "sc" },
1269 { name => "sc-IT" },
1270 { name => "sd", lcid => 0x00000059, inegnumber => 3, oemcp => 720, sabbrevlangname => "SIP" },
1271 { name => "sd-Arab", lcid => 0x00007c59, group => 13 },
1272 { name => "sd-Arab-PK", lcid => 0x00000859 },
1273 { name => "sd-Deva", inegnumber => 1, slist => ",", oemcp => 65001, group => 15 },
1274 { name => "sd-Deva-IN", lcid => 0x00000459, sabbrevlangname => "ZZZ" },
1275 { name => "sd-PK", alias => "sd-Arab-PK" },
1276 { name => "se", lcid => 0x0000003b, oemcp => 850, ebcdiccp => 20277, sopentypelang => "NSM" },
1277 { name => "se-FI", lcid => 0x00000c3b, ebcdiccp => 20278, sabbrevlangname => "SMG" },
1278 { name => "se-NO", lcid => 0x0000043b },
1279 { name => "se-SE", lcid => 0x0000083b, ebcdiccp => 20278, sabbrevlangname => "SMF" },
1280 { name => "se-Latn", alias => "se" },
1281 { name => "se-Latn-FI", alias => "se-FI" },
1282 { name => "se-Latn-NO", alias => "se-NO" },
1283 { name => "se-Latn-SE", alias => "se-SE" },
1284 { name => "seh" },
1285 { name => "seh-MZ" },
1286 { name => "ses" },
1287 { name => "ses-ML" },
1288 { name => "sg", sopentypelang => "SGO" },
1289 { name => "sg-CF" },
1290 { name => "shi" },
1291 { name => "shi-Latn" },
1292 { name => "shi-Latn-MA" },
1293 { name => "shi-Tfng" },
1294 { name => "shi-Tfng-MA" },
1295 { name => "si", lcid => 0x0000005b, group => 15, sopentypelang => "SNH" },
1296 { name => "si-LK", lcid => 0x0000045b },
1297 { name => "sk", lcid => 0x0000001b, oemcp => 852, ebcdiccp => 20880, group => 2, sabbrevlangname => "SKY", sopentypelang => "SKY" },
1298 { name => "sk-SK", lcid => 0x0000041b },
1299 { name => "sl", lcid => 0x00000024, oemcp => 852, ebcdiccp => 20880, group => 2 },
1300 { name => "sl-SI", lcid => 0x00000424 },
1301 { name => "sma", lcid => 0x0000783b, sparent => "se", ebcdiccp => 20278, sabbrevlangname => "SMB", sopentypelang => "SSM" },
1302 { name => "sma-Latn", alias => "sma" },
1303 { name => "sma-Latn-NO", alias => "sma-NO" },
1304 { name => "sma-Latn-SE", alias => "sma-SE" },
1305 { name => "sma-NO", lcid => 0x0000183b, ebcdiccp => 20277, sabbrevlangname => "SMA" },
1306 { name => "sma-SE", lcid => 0x00001c3b },
1307 { name => "smj", lcid => 0x00007c3b, sparent => "se", ebcdiccp => 20278, sabbrevlangname => "SMK", sopentypelang => "LSM" },
1308 { name => "smj-Latn", alias => "smj" },
1309 { name => "smj-Latn-NO", alias => "smj-NO" },
1310 { name => "smj-Latn-SE", alias => "smj-SE" },
1311 { name => "smj-NO", lcid => 0x0000103b, ebcdiccp => 20277, sabbrevlangname => "SMJ" },
1312 { name => "smj-SE", lcid => 0x0000143b },
1313 { name => "smn", lcid => 0x0000703b, sparent => "se", ebcdiccp => 20278, sopentypelang => "ISM" },
1314 { name => "smn-FI", lcid => 0x0000243b },
1315 { name => "smn-Latn", alias => "smn" },
1316 { name => "smn-Latn-FI", alias => "smn-FI" },
1317 { name => "sms", lcid => 0x0000743b, sparent => "se", ebcdiccp => 20278, sopentypelang => "SKS" },
1318 { name => "sms-FI", lcid => 0x0000203b },
1319 { name => "sms-Latn", alias => "sms" },
1320 { name => "sms-Latn-FI", alias => "sms-FI" },
1321 { name => "sn", sopentypelang => "SNA0" },
1322 { name => "sn-Latn", file => "sn" },
1323 { name => "sn-Latn-ZW", file => "sn_ZW" },
1324 { name => "sn-ZW", alias => "sn-Latn-ZW" },
1325 { name => "so", lcid => 0x00000077, sopentypelang => "SML" },
1326 { name => "so-DJ" },
1327 { name => "so-ET" },
1328 { name => "so-KE" },
1329 { name => "so-SO", lcid => 0x00000477 },
1330 { name => "sq", lcid => 0x0000001c, oemcp => 852, ebcdiccp => 20880, group => 2 },
1331 { name => "sq-AL", lcid => 0x0000041c },
1332 { name => "sq-MK" },
1333 { name => "sq-XK" },
1334 { name => "sr", lcid => 0x00007c1a, sortlocale => "sr-Latn-RS", oemcp => 852, group => 2, sabbrevlangname => "SRB", sopentypelang => "SRB" },
1335 { name => "sr-Cyrl", lcid => 0x00006c1a, oemcp => 855, ebcdiccp => 21025, group => 5, sabbrevlangname => "SRO" },
1336 { name => "sr-Cyrl-BA", lcid => 0x00001c1a, sabbrevlangname => "SRN" },
1337 { name => "sr-Cyrl-ME", lcid => 0x0000301a, sabbrevlangname => "SRQ" },
1338 { name => "sr-Cyrl-RS", lcid => 0x0000281a },
1339 { name => "sr-Cyrl-XK" },
1340 { name => "sr-Latn", lcid => 0x0000701a, sabbrevlangname => "SRM" },
1341 { name => "sr-Latn-BA", lcid => 0x0000181a, maccp => 10082, ebcdiccp => 870, sabbrevlangname => "SRS" },
1342 { name => "sr-Latn-ME", lcid => 0x00002c1a, sabbrevlangname => "SRP" },
1343 { name => "sr-Latn-RS", lcid => 0x0000241a, sabbrevlangname => "SRM" },
1344 { name => "sr-Latn-XK" },
1345 ## name => "sr-Cyrl-CS", lcid => 0x00000c1a },
1346 ## name => "sr-Latn-CS", lcid => 0x0000081a },
1347 { name => "ss", sopentypelang => "SWZ" },
1348 { name => "ss-SZ" },
1349 { name => "ss-ZA" },
1350 { name => "ssy" },
1351 { name => "ssy-ER" },
1352 { name => "st", lcid => 0x00000030 },
1353 { name => "st-LS" },
1354 { name => "st-ZA", lcid => 0x00000430 },
1355 { name => "su" },
1356 { name => "su-Latn" },
1357 { name => "su-Latn-ID" },
1358 { name => "sv", lcid => 0x0000001d, oemcp => 850, ebcdiccp => 20278, sabbrevlangname => "SVE", sopentypelang => "SVE" },
1359 { name => "sv-AX" },
1360 { name => "sv-FI", lcid => 0x0000081d, sabbrevlangname => "SVF" },
1361 { name => "sv-SE", lcid => 0x0000041d, sabbrevlangname => "SVE" },
1362 { name => "sw", lcid => 0x00000041, territory => "KE", oemcp => 437, ebcdiccp => 500, sabbrevlangname => "SWK", sopentypelang => "SWK" },
1363 { name => "sw-CD" },
1364 { name => "sw-KE", lcid => 0x00000441 },
1365 { name => "sw-TZ" },
1366 { name => "sw-UG" },
1367 { name => "swc-CD", alias => "sw-CD" },
1368 { name => "syr", lcid => 0x0000005a, slist => ",", group => 13 },
1369 { name => "syr-SY", lcid => 0x0000045a },
1370 { name => "syr-Syrc", alias => "syr" },
1371 { name => "syr-Syrc-SY", alias => "syr-SY" },
1372 { name => "ta", lcid => 0x00000049, slist => ",", group => 15, sabbrevlangname => "TAI" },
1373 { name => "ta-IN", lcid => 0x00000449 },
1374 { name => "ta-LK", lcid => 0x00000849, sabbrevlangname => "TAM" },
1375 { name => "ta-MY" },
1376 { name => "ta-SG" },
1377 { name => "te", lcid => 0x0000004a, group => 15 },
1378 { name => "te-IN", lcid => 0x0000044a },
1379 { name => "teo" },
1380 { name => "teo-KE" },
1381 { name => "teo-UG" },
1382 { name => "tg", lcid => 0x00000028, oemcp => 866, group => 5, sabbrevlangname => "TAJ", sopentypelang => "TAJ" },
1383 { name => "tg-Cyrl", lcid => 0x00007c28, file => "tg" },
1384 { name => "tg-Cyrl-TJ", lcid => 0x00000428, file => "tg_TJ" },
1385 { name => "tg-TJ", alias => "tg-Cyrl-TJ" },
1386 { name => "th", lcid => 0x0000001e, oemcp => 874, ebcdiccp => 20838, slist => ",", group => 11 },
1387 { name => "th-TH", lcid => 0x0000041e },
1388 { name => "ti", lcid => 0x00000073, territory => "ER", sopentypelang => "TGY" },
1389 { name => "ti-ER", lcid => 0x00000873 },
1390 { name => "ti-ET", lcid => 0x00000473, sabbrevlangname => "TIE" },
1391 { name => "tig", sopentypelang => "TGR" },
1392 { name => "tig-ER" },
1393 { name => "tig-Ethi-ER", alias => "tig-ER" },
1394 { name => "tk", lcid => 0x00000042, oemcp => 852, ebcdiccp => 20880, group => 2, sopentypelang => "TKM" },
1395 { name => "tk-Latn", alias => "tk" },
1396 { name => "tk-Latn-TM", alias => "tk-TM" },
1397 { name => "tk-TM", lcid => 0x00000442 },
1398 { name => "tn", lcid => 0x00000032, oemcp => 850, sopentypelang => "TNA" },
1399 { name => "tn-BW", lcid => 0x00000832, sabbrevlangname => "TSB" },
1400 { name => "tn-ZA", lcid => 0x00000432 },
1401 { name => "to", sopentypelang => "TGN" },
1402 { name => "to-TO" },
1403 { name => "tr", lcid => 0x0000001f, oemcp => 857, ebcdiccp => 20905, group => 6, sabbrevlangname => "TRK", sopentypelang => "TRK" },
1404 { name => "tr-CY" },
1405 { name => "tr-TR", lcid => 0x0000041f },
1406 { name => "ts", lcid => 0x00000031, sopentypelang => "TSG" },
1407 { name => "ts-ZA", lcid => 0x00000431 },
1408 { name => "tt", lcid => 0x00000044, oemcp => 866, group => 5, sabbrevlangname => "TTT" },
1409 { name => "tt-Cyrl", alias => "tt" },
1410 { name => "tt-Cyrl-RU", alias => "tt-RU" },
1411 { name => "tt-RU", lcid => 0x00000444 },
1412 { name => "twq" },
1413 { name => "twq-NE" },
1414 { name => "tzm", lcid => 0x0000005f, sortlocale => "tzm-Latn-DZ", oemcp => 850, ebcdiccp => 20297, sabbrevlangname => "TZA" },
1415 { name => "tzm-Latn", lcid => 0x00007c5f, territory => "DZ", file => "tzm" },
1416 { name => "tzm-Latn-MA", file => "tzm_MA", oemcp => 65001 },
1417 { name => "tzm-Latn-DZ", lcid => 0x0000085f, file => "tzm" },
1418 { name => "tzm-MA", alias => "tzm-Latn-MA" },
1419 { name => "tzm-DZ", alias => "tzm-Latn-DZ" },
1420 ## name => "tzm-Arab", group => 13 },
1421 ## name => "tzm-Arab-MA", lcid => 0x0000045f },
1422 ## name => "tzm-Tfng", lcid => 0x0000785f },
1423 ## name => "tzm-Tfng-MA", lcid => 0x0000105f },
1424 { name => "ug", lcid => 0x00000080, oemcp => 720, slist => ",", group => 13, sopentypelang => "UYG", nativedigits => "0123456789" },
1425 { name => "ug-Arab", alias => "ug" },
1426 { name => "ug-Arab-CN", alias => "ug-CN" },
1427 { name => "ug-CN", lcid => 0x00000480 },
1428 { name => "uk", lcid => 0x00000022, oemcp => 866, maccp => 10017, ebcdiccp => 500, group => 5 },
1429 { name => "uk-UA", lcid => 0x00000422 },
1430 { name => "ur", lcid => 0x00000020, oemcp => 720 },
1431 { name => "ur-IN", lcid => 0x00000820, maccp => 65001, sabbrevlangname => "URI" },
1432 { name => "ur-PK", lcid => 0x00000420, group => 13 },
1433 { name => "uz", lcid => 0x00000043, oemcp => 857, maccp => 10029, group => 2 },
1434 { name => "uz-Arab", oemcp => 65001, maccp => 65001 },
1435 { name => "uz-Arab-AF" },
1436 { name => "uz-Cyrl", lcid => 0x00007843, oemcp => 866, maccp => 10007, group => 5, sabbrevlangname => "UZC" },
1437 { name => "uz-Cyrl-UZ", lcid => 0x00000843 },
1438 { name => "uz-Latn", lcid => 0x00007c43 },
1439 { name => "uz-Latn-UZ", lcid => 0x00000443 },
1440 { name => "vai" },
1441 { name => "vai-Latn" },
1442 { name => "vai-Latn-LR" },
1443 { name => "vai-Vaii" },
1444 { name => "vai-Vaii-LR" },
1445 { name => "ve", lcid => 0x00000033, sabbrevlangname => "ZZZ" },
1446 { name => "ve-ZA", lcid => 0x00000433 },
1447 { name => "vi", lcid => 0x0000002a, oemcp => 1258, slist => ",", group => 14, sabbrevlangname => "VIT", sopentypelang => "VIT" },
1448 { name => "vi-VN", lcid => 0x0000042a },
1449 { name => "vo" },
1450 { name => "vo-001" },
1451 { name => "vun" },
1452 { name => "vun-TZ" },
1453 { name => "wa", oemcp => 850 },
1454 { name => "wa-BE" },
1455 { name => "wae" },
1456 { name => "wae-CH" },
1457 { name => "wal" },
1458 { name => "wal-ET" },
1459 { name => "wo", lcid => 0x00000088, oemcp => 850, ebcdiccp => 20297, sopentypelang => "WLF" },
1460 { name => "wo-Latn", alias => "wo" },
1461 { name => "wo-Latn-SN", alias => "wo-SN" },
1462 { name => "wo-SN", lcid => 0x00000488 },
1463 { name => "x-IV_mathan", lcid => 0x0001007f, alias => "" },
1464 { name => "xh", lcid => 0x00000034, oemcp => 850, sopentypelang => "XHS" },
1465 { name => "xh-ZA", lcid => 0x00000434 },
1466 { name => "xog" },
1467 { name => "xog-UG" },
1468 { name => "yav" },
1469 { name => "yav-CM" },
1470 { name => "yi", lcid => 0x0000003d, sabbrevlangname => "ZZZ", sopentypelang => "JII" },
1471 { name => "yi-001", lcid => 0x0000043d },
1472 { name => "yo", lcid => 0x0000006a, oemcp => 437, sopentypelang => "YBA" },
1473 { name => "yo-BJ", ebcdiccp => 500 },
1474 { name => "yo-Latn", alias => "yo" },
1475 { name => "yo-Latn-NG", alias => "yo-NG" },
1476 { name => "yo-NG", lcid => 0x0000046a },
1477 { name => "yrl" },
1478 { name => "yrl-BR" },
1479 { name => "yrl-CO" },
1480 { name => "yrl-VE" },
1481 { name => "yue" },
1482 { name => "yue-Hans" },
1483 { name => "yue-Hans-CN" },
1484 { name => "yue-Hant" },
1485 { name => "yue-Hant-HK" },
1486 { name => "zgh" },
1487 { name => "zgh-MA", alias => "zgh-Tfng-MA" },
1488 { name => "zgh-Tfng", file => "zgh" },
1489 { name => "zgh-Tfng-MA", file => "zgh_MA" },
1490 { name => "zh", lcid => 0x00007804, ireadinglayout => 2, oemcp => 936, slist => ",", sscripts => "Hani Hans", sabbrevlangname => "CHS", sopentypelang => "ZHS", nativedigits => "0123456789" },
1491 { name => "zh-CN", lcid => 0x00000804, file => "zh_Hans_CN", sparent => "zh-Hans" },
1492 { name => "zh-CN_phoneb", lcid => 0x00050804, alias => "zh-CN" },
1493 { name => "zh-CN_stroke", lcid => 0x00020804, alias => "zh-CN" },
1494 { name => "zh-Hans", lcid => 0x00000004, group => 10 },
1495 { name => "zh-Hans-CN", alias => "zh-CN" },
1496 { name => "zh-Hans-CN-u-co-phonebk", alias => "zh-CN_phoneb" },
1497 { name => "zh-Hans-CN-u-co-stroke", alias => "zh-CN_stroke" },
1498 { name => "zh-Hans-HK", slist => ";", nativedigits => "" },
1499 { name => "zh-Hans-MO", slist => ";", nativedigits => "" },
1500 { name => "zh-Hans-SG", alias => "zh-SG" },
1501 { name => "zh-Hans-SG-u-co-phonebk", alias => "zh-SG_phoneb" },
1502 { name => "zh-Hans-SG-u-co-stroke", alias => "zh-SG_stroke" },
1503 { name => "zh-Hant", lcid => 0x00007c04, sortlocale => "zh-HK", ireadinglayout => 2, oemcp => 950, slist => ",", sscripts => "Hani Hant", group => 9, sabbrevlangname => "CHT", sopentypelang => "ZHH" },
1504 { name => "zh-Hant-HK", alias => "zh-HK" },
1505 { name => "zh-Hant-HK-u-co-unihan", alias => "zh-HK_radstr" },
1506 { name => "zh-Hant-MO", alias => "zh-MO" },
1507 { name => "zh-Hant-MO-u-co-stroke", alias => "zh-MO_stroke" },
1508 { name => "zh-Hant-MO-u-co-unihan", alias => "zh-MO_radstr" },
1509 { name => "zh-Hant-TW", alias => "zh-TW" },
1510 { name => "zh-Hant-TW-u-co-phonetic", alias => "zh-TW_pronun" },
1511 { name => "zh-Hant-TW-u-co-unihan", alias => "zh-TW_radstr" },
1512 { name => "zh-HK", lcid => 0x00000c04, file => "zh_Hant_HK", sparent => "zh-Hant", sabbrevlangname => "ZHH" },
1513 { name => "zh-HK_radstr", lcid => 0x00040c04, alias => "zh-HK" },
1514 { name => "zh-MO", lcid => 0x00001404, file => "zh_Hant_MO", sparent => "zh-Hant", sabbrevlangname => "ZHM", sopentypelang => "ZHT" },
1515 { name => "zh-MO_radstr", lcid => 0x00041404, alias => "zh-MO" },
1516 { name => "zh-MO_stroke", lcid => 0x00021404, alias => "zh-MO" },
1517 { name => "zh-SG", lcid => 0x00001004, file => "zh_Hans_SG", sparent => "zh-Hans", sabbrevlangname => "ZHI" },
1518 { name => "zh-SG_phoneb", lcid => 0x00051004, alias => "zh-SG" },
1519 { name => "zh-SG_stroke", lcid => 0x00021004, alias => "zh-SG" },
1520 { name => "zh-TW", lcid => 0x00000404, file => "zh_Hant_TW", sparent => "zh-Hant", sopentypelang => "ZHT" },
1521 { name => "zh-TW_pronun", lcid => 0x00030404, alias => "zh-TW" },
1522 { name => "zh-TW_radstr", lcid => 0x00040404, alias => "zh-TW" },
1523 { name => "zu", lcid => 0x00000035, oemcp => 850 },
1524 { name => "zu-ZA", lcid => 0x00000435 },
# Calendar table, one hash per calendar id (1..23).
# Keys used by the entries below: id, name, type, locale, eras and
# itwodigityearmax; every entry has an id, the other keys are optional.
# NOTE(review): the ids presumably match the Windows CALID values and the
# "type" strings the CLDR calendar types -- confirm against the consumers of
# this table, which are outside this section.
my @calendars =
(
    { id =>  1, name => "Gregorian", itwodigityearmax => 2049 },
    { id =>  2, type => "gregorian", locale => "en-US", itwodigityearmax => 2049 },
    { id =>  3, type => "japanese", locale => "ja-JP", eras => [ 232..236 ] },
    { id =>  4, type => "roc", locale => "zh-TW", eras => [ 1 ] },
    { id =>  5, type => "dangi", locale => "ko-KR", eras => [ 0 ] },
    { id =>  6, type => "islamic", locale => "ar-SA", itwodigityearmax => 1451 },
    { id =>  7, type => "buddhist", locale => "th-TH", eras => [ 0 ] },
    { id =>  8, type => "hebrew", locale => "he-IL", itwodigityearmax => 5810 },
    { id =>  9, type => "gregorian", locale => "fr-FR", itwodigityearmax => 2049 },
    { id => 10, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
    { id => 11, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
    { id => 12, type => "gregorian", locale => "ar-SA", itwodigityearmax => 2049 },
    { id => 13, name => "Julian", locale => "en-US", itwodigityearmax => 2049 },
    { id => 14, name => "Japanese Lunisolar" },
    { id => 15, name => "Chinese Lunisolar" },
    { id => 16, name => "Saka" },
    { id => 17, name => "Lunar ETO Chinese" },
    { id => 18, name => "Lunar ETO Korean" },
    { id => 19, name => "Lunar ETO Rokuyou" },
    { id => 20, name => "Korean Lunisolar" },
    { id => 21, name => "Taiwan Lunisolar" },
    { id => 22, type => "persian", locale => "prs-AF", itwodigityearmax => 1429 },
    { id => 23, type => "islamic-umalqura", locale => "ar-SA", itwodigityearmax => 1451 },
);
1554 my @geoids =
1556 { id => 2, name => "AG" }, # Antigua and Barbuda
1557 { id => 3, name => "AF" }, # Afghanistan
1558 { id => 4, name => "DZ" }, # Algeria
1559 { id => 5, name => "AZ" }, # Azerbaijan
1560 { id => 6, name => "AL" }, # Albania
1561 { id => 7, name => "AM" }, # Armenia
1562 { id => 8, name => "AD" }, # Andorra
1563 { id => 9, name => "AO" }, # Angola
1564 { id => 10, name => "AS" }, # American Samoa
1565 { id => 11, name => "AR" }, # Argentina
1566 { id => 12, name => "AU" }, # Australia
1567 { id => 14, name => "AT" }, # Austria
1568 { id => 17, name => "BH" }, # Bahrain
1569 { id => 18, name => "BB" }, # Barbados
1570 { id => 19, name => "BW" }, # Botswana
1571 { id => 20, name => "BM" }, # Bermuda
1572 { id => 21, name => "BE" }, # Belgium
1573 { id => 22, name => "BS" }, # Bahamas, The
1574 { id => 23, name => "BD" }, # Bangladesh
1575 { id => 24, name => "BZ" }, # Belize
1576 { id => 25, name => "BA" }, # Bosnia and Herzegovina
1577 { id => 26, name => "BO" }, # Bolivia
1578 { id => 27, name => "MM" }, # Myanmar
1579 { id => 28, name => "BJ" }, # Benin
1580 { id => 29, name => "BY" }, # Belarus
1581 { id => 30, name => "SB" }, # Solomon Islands
1582 { id => 32, name => "BR" }, # Brazil
1583 { id => 34, name => "BT" }, # Bhutan
1584 { id => 35, name => "BG" }, # Bulgaria
1585 { id => 37, name => "BN" }, # Brunei
1586 { id => 38, name => "BI" }, # Burundi
1587 { id => 39, name => "CA" }, # Canada
1588 { id => 40, name => "KH" }, # Cambodia
1589 { id => 41, name => "TD" }, # Chad
1590 { id => 42, name => "LK" }, # Sri Lanka
1591 { id => 43, name => "CG" }, # Congo
1592 { id => 44, name => "CD" }, # Congo (DRC)
1593 { id => 45, name => "CN" }, # China
1594 { id => 46, name => "CL" }, # Chile
1595 { id => 49, name => "CM" }, # Cameroon
1596 { id => 50, name => "KM" }, # Comoros
1597 { id => 51, name => "CO" }, # Colombia
1598 { id => 54, name => "CR" }, # Costa Rica
1599 { id => 55, name => "CF" }, # Central African Republic
1600 { id => 56, name => "CU" }, # Cuba
1601 { id => 57, name => "CV" }, # Cape Verde
1602 { id => 59, name => "CY" }, # Cyprus
1603 { id => 61, name => "DK" }, # Denmark
1604 { id => 62, name => "DJ" }, # Djibouti
1605 { id => 63, name => "DM" }, # Dominica
1606 { id => 65, name => "DO" }, # Dominican Republic
1607 { id => 66, name => "EC" }, # Ecuador
1608 { id => 67, name => "EG" }, # Egypt
1609 { id => 68, name => "IE" }, # Ireland
1610 { id => 69, name => "GQ" }, # Equatorial Guinea
1611 { id => 70, name => "EE" }, # Estonia
1612 { id => 71, name => "ER" }, # Eritrea
1613 { id => 72, name => "SV" }, # El Salvador
1614 { id => 73, name => "ET" }, # Ethiopia
1615 { id => 75, name => "CZ" }, # Czech Republic
1616 { id => 77, name => "FI" }, # Finland
1617 { id => 78, name => "FJ" }, # Fiji Islands
1618 { id => 80, name => "FM" }, # Micronesia
1619 { id => 81, name => "FO" }, # Faroe Islands
1620 { id => 84, name => "FR" }, # France
1621 { id => 86, name => "GM" }, # Gambia, The
1622 { id => 87, name => "GA" }, # Gabon
1623 { id => 88, name => "GE" }, # Georgia
1624 { id => 89, name => "GH" }, # Ghana
1625 { id => 90, name => "GI" }, # Gibraltar
1626 { id => 91, name => "GD" }, # Grenada
1627 { id => 93, name => "GL" }, # Greenland
1628 { id => 94, name => "DE" }, # Germany
1629 { id => 98, name => "GR" }, # Greece
1630 { id => 99, name => "GT" }, # Guatemala
1631 { id => 100, name => "GN" }, # Guinea
1632 { id => 101, name => "GY" }, # Guyana
1633 { id => 103, name => "HT" }, # Haiti
1634 { id => 104, name => "HK" }, # Hong Kong S.A.R.
1635 { id => 106, name => "HN" }, # Honduras
1636 { id => 108, name => "HR" }, # Croatia
1637 { id => 109, name => "HU" }, # Hungary
1638 { id => 110, name => "IS" }, # Iceland
1639 { id => 111, name => "ID" }, # Indonesia
1640 { id => 113, name => "IN" }, # India
1641 { id => 114, name => "IO" }, # British Indian Ocean Territory
1642 { id => 116, name => "IR" }, # Iran
1643 { id => 117, name => "IL" }, # Israel
1644 { id => 118, name => "IT" }, # Italy
1645 { id => 119, name => "CI" }, # Côte d'Ivoire
1646 { id => 121, name => "IQ" }, # Iraq
1647 { id => 122, name => "JP" }, # Japan
1648 { id => 124, name => "JM" }, # Jamaica
1649 { id => 125, name => "SJ" }, # Jan Mayen
1650 { id => 126, name => "JO" }, # Jordan
1651 { id => 127, parent => "UM" }, # Johnston Atoll
1652 { id => 129, name => "KE" }, # Kenya
1653 { id => 130, name => "KG" }, # Kyrgyzstan
1654 { id => 131, name => "KP" }, # North Korea
1655 { id => 133, name => "KI" }, # Kiribati
1656 { id => 134, name => "KR" }, # Korea
1657 { id => 136, name => "KW" }, # Kuwait
1658 { id => 137, name => "KZ" }, # Kazakhstan
1659 { id => 138, name => "LA" }, # Laos
1660 { id => 139, name => "LB" }, # Lebanon
1661 { id => 140, name => "LV" }, # Latvia
1662 { id => 141, name => "LT" }, # Lithuania
1663 { id => 142, name => "LR" }, # Liberia
1664 { id => 143, name => "SK" }, # Slovakia
1665 { id => 145, name => "LI" }, # Liechtenstein
1666 { id => 146, name => "LS" }, # Lesotho
1667 { id => 147, name => "LU" }, # Luxembourg
1668 { id => 148, name => "LY" }, # Libya
1669 { id => 149, name => "MG" }, # Madagascar
1670 { id => 151, name => "MO" }, # Macao S.A.R.
1671 { id => 152, name => "MD" }, # Moldova
1672 { id => 154, name => "MN" }, # Mongolia
1673 { id => 156, name => "MW" }, # Malawi
1674 { id => 157, name => "ML" }, # Mali
1675 { id => 158, name => "MC" }, # Monaco
1676 { id => 159, name => "MA" }, # Morocco
1677 { id => 160, name => "MU" }, # Mauritius
1678 { id => 162, name => "MR" }, # Mauritania
1679 { id => 163, name => "MT" }, # Malta
1680 { id => 164, name => "OM" }, # Oman
1681 { id => 165, name => "MV" }, # Maldives
1682 { id => 166, name => "MX" }, # Mexico
1683 { id => 167, name => "MY" }, # Malaysia
1684 { id => 168, name => "MZ" }, # Mozambique
1685 { id => 173, name => "NE" }, # Niger
1686 { id => 174, name => "VU" }, # Vanuatu
1687 { id => 175, name => "NG" }, # Nigeria
1688 { id => 176, name => "NL" }, # Netherlands
1689 { id => 177, name => "NO" }, # Norway
1690 { id => 178, name => "NP" }, # Nepal
1691 { id => 180, name => "NR" }, # Nauru
1692 { id => 181, name => "SR" }, # Suriname
1693 { id => 182, name => "NI" }, # Nicaragua
1694 { id => 183, name => "NZ" }, # New Zealand
1695 { id => 184, name => "PS" }, # Palestinian Authority
1696 { id => 185, name => "PY" }, # Paraguay
1697 { id => 187, name => "PE" }, # Peru
1698 { id => 190, name => "PK" }, # Pakistan
1699 { id => 191, name => "PL" }, # Poland
1700 { id => 192, name => "PA" }, # Panama
1701 { id => 193, name => "PT" }, # Portugal
1702 { id => 194, name => "PG" }, # Papua New Guinea
1703 { id => 195, name => "PW" }, # Palau
1704 { id => 196, name => "GW" }, # Guinea-Bissau
1705 { id => 197, name => "QA" }, # Qatar
1706 { id => 198, name => "RE" }, # Reunion
1707 { id => 199, name => "MH" }, # Marshall Islands
1708 { id => 200, name => "RO" }, # Romania
1709 { id => 201, name => "PH" }, # Philippines
1710 { id => 202, name => "PR" }, # Puerto Rico
1711 { id => 203, name => "RU" }, # Russia
1712 { id => 204, name => "RW" }, # Rwanda
1713 { id => 205, name => "SA" }, # Saudi Arabia
1714 { id => 206, name => "PM" }, # St. Pierre and Miquelon
1715 { id => 207, name => "KN" }, # St. Kitts and Nevis
1716 { id => 208, name => "SC" }, # Seychelles
1717 { id => 209, name => "ZA" }, # South Africa
1718 { id => 210, name => "SN" }, # Senegal
1719 { id => 212, name => "SI" }, # Slovenia
1720 { id => 213, name => "SL" }, # Sierra Leone
1721 { id => 214, name => "SM" }, # San Marino
1722 { id => 215, name => "SG" }, # Singapore
1723 { id => 216, name => "SO" }, # Somalia
1724 { id => 217, name => "ES" }, # Spain
1725 { id => 218, name => "LC" }, # St. Lucia
1726 { id => 219, name => "SD" }, # Sudan
1727 { id => 220, name => "SJ" }, # Svalbard
1728 { id => 221, name => "SE" }, # Sweden
1729 { id => 222, name => "SY" }, # Syria
1730 { id => 223, name => "CH" }, # Switzerland
1731 { id => 224, name => "AE" }, # United Arab Emirates
1732 { id => 225, name => "TT" }, # Trinidad and Tobago
1733 { id => 227, name => "TH" }, # Thailand
1734 { id => 228, name => "TJ" }, # Tajikistan
1735 { id => 231, name => "TO" }, # Tonga
1736 { id => 232, name => "TG" }, # Togo
1737 { id => 233, name => "ST" }, # São Tomé and Príncipe
1738 { id => 234, name => "TN" }, # Tunisia
1739 { id => 235, name => "TR" }, # Turkey
1740 { id => 236, name => "TV" }, # Tuvalu
1741 { id => 237, name => "TW" }, # Taiwan
1742 { id => 238, name => "TM" }, # Turkmenistan
1743 { id => 239, name => "TZ" }, # Tanzania
1744 { id => 240, name => "UG" }, # Uganda
1745 { id => 241, name => "UA" }, # Ukraine
1746 { id => 242, name => "GB" }, # United Kingdom
1747 { id => 244, name => "US" }, # United States
1748 { id => 245, name => "BF" }, # Burkina Faso
1749 { id => 246, name => "UY" }, # Uruguay
1750 { id => 247, name => "UZ" }, # Uzbekistan
1751 { id => 248, name => "VC" }, # St. Vincent and the Grenadines
1752 { id => 249, name => "VE" }, # Bolivarian Republic of Venezuela
1753 { id => 251, name => "VN" }, # Vietnam
1754 { id => 252, name => "VI" }, # Virgin Islands
1755 { id => 253, name => "VA" }, # Vatican City
1756 { id => 254, name => "NA" }, # Namibia
1757 { id => 257, name => "EH" }, # Western Sahara (disputed)
1758 { id => 258, parent => "UM" }, # Wake Island
1759 { id => 259, name => "WS" }, # Samoa
1760 { id => 260, name => "SZ" }, # Swaziland
1761 { id => 261, name => "YE" }, # Yemen
1762 { id => 263, name => "ZM" }, # Zambia
1763 { id => 264, name => "ZW" }, # Zimbabwe
1764 { id => 269, name => "CS" }, # Serbia and Montenegro (Former)
1765 { id => 270, name => "ME" }, # Montenegro
1766 { id => 271, name => "RS" }, # Serbia
1767 { id => 273, name => "CW" }, # Curaçao
1768 { id => 276, name => "SS" }, # South Sudan
1769 { id => 300, name => "AI" }, # Anguilla
1770 { id => 301, name => "AQ" }, # Antarctica
1771 { id => 302, name => "AW" }, # Aruba
1772 { id => 303, parent => "SH" }, # Ascension Island
1773 { id => 304, parent => "053" }, # Ashmore and Cartier Islands
1774 { id => 305, parent => "UM" }, # Baker Island
1775 { id => 306, name => "BV" }, # Bouvet Island
1776 { id => 307, name => "KY" }, # Cayman Islands
1777 { id => 308, name => "830", parent => "155" }, # Channel Islands
1778 { id => 309, name => "CX" }, # Christmas Island
1779 { id => 310, parent => "009" }, # Clipperton Island
1780 { id => 311, name => "CC" }, # Cocos (Keeling) Islands
1781 { id => 312, name => "CK" }, # Cook Islands
1782 { id => 313, parent => "053" }, # Coral Sea Islands
1783 { id => 314, parent => "IO" }, # Diego Garcia
1784 { id => 315, name => "FK" }, # Falkland Islands (Islas Malvinas)
1785 { id => 317, name => "GF" }, # French Guiana
1786 { id => 318, name => "PF" }, # French Polynesia
1787 { id => 319, name => "TF" }, # French Southern and Antarctic Lands
1788 { id => 321, name => "GP" }, # Guadeloupe
1789 { id => 322, name => "GU" }, # Guam
1790 { id => 323 }, # Guantanamo Bay
1791 { id => 324, name => "GG" }, # Guernsey
1792 { id => 325, name => "HM" }, # Heard Island and McDonald Islands
1793 { id => 326, parent => "UM" }, # Howland Island
1794 { id => 327, parent => "UM" }, # Jarvis Island
1795 { id => 328, name => "JE" }, # Jersey
1796 { id => 329, parent => "UM" }, # Kingman Reef
1797 { id => 330, name => "MQ" }, # Martinique
1798 { id => 331, name => "YT" }, # Mayotte
1799 { id => 332, name => "MS" }, # Montserrat
1800 { id => 333, name => "AN", region => 1 }, # Netherlands Antilles (Former)
1801 { id => 334, name => "NC" }, # New Caledonia
1802 { id => 335, name => "NU" }, # Niue
1803 { id => 336, name => "NF" }, # Norfolk Island
1804 { id => 337, name => "MP" }, # Northern Mariana Islands
1805 { id => 338, parent => "UM" }, # Palmyra Atoll
1806 { id => 339, name => "PN" }, # Pitcairn Islands
1807 { id => 340, parent => "MP" }, # Rota Island
1808 { id => 341, parent => "MP" }, # Saipan
1809 { id => 342, name => "GS" }, # South Georgia and the South Sandwich Islands
1810 { id => 343, name => "SH" }, # St. Helena
1811 { id => 346, parent => "MP" }, # Tinian Island
1812 { id => 347, name => "TK" }, # Tokelau
1813 { id => 348, parent => "SH" }, # Tristan da Cunha
1814 { id => 349, name => "TC" }, # Turks and Caicos Islands
1815 { id => 351, name => "VG" }, # Virgin Islands, British
1816 { id => 352, name => "WF" }, # Wallis and Futuna
1817 { id => 742, name => "002" }, # Africa
1818 { id => 2129, name => "142" }, # Asia
1819 { id => 10541, name => "150" }, # Europe
1820 { id => 15126, name => "IM" }, # Man, Isle of
1821 { id => 19618, name => "MK" }, # Macedonia, Former Yugoslav Republic of
1822 { id => 20900, name => "054" }, # Melanesia
1823 { id => 21206, name => "057" }, # Micronesia
1824 { id => 21242, parent => "UM" }, # Midway Islands
1825 { id => 23581, name => "021" }, # Northern America
1826 { id => 26286, name => "061" }, # Polynesia
1827 { id => 27082, name => "013" }, # Central America
1828 { id => 27114, name => "009" }, # Oceania
1829 { id => 30967, name => "SX" }, # Sint Maarten (Dutch part)
1830 { id => 31396, name => "005" }, # South America
1831 { id => 31706, name => "MF" }, # Saint Martin (French part)
1832 { id => 39070, name => "001" }, # World
1833 { id => 42483, name => "011" }, # Western Africa
1834 { id => 42484, name => "017" }, # Middle Africa
1835 { id => 42487, name => "015" }, # Northern Africa
1836 { id => 47590, name => "143" }, # Central Asia
1837 { id => 47599, name => "035" }, # South-Eastern Asia
1838 { id => 47600, name => "030" }, # Eastern Asia
1839 { id => 47603, name => "014" }, # Eastern Africa
1840 { id => 47609, name => "151" }, # Eastern Europe
1841 { id => 47610, name => "039" }, # Southern Europe
1842 { id => 47611, name => "145" }, # Middle East
1843 { id => 47614, name => "034" }, # Southern Asia
1844 { id => 7299303, name => "TL" }, # Democratic Republic of Timor-Leste
1845 { id => 9914689, name => "XK" }, # Kosovo
1846 { id => 10026358, name => "019" }, # Americas
1847 { id => 10028789, name => "AX" }, # Åland Islands
1848 { id => 10039880, name => "029", sintlsymbol => "XCD" }, # Caribbean
1849 { id => 10039882, name => "154" }, # Northern Europe
1850 { id => 10039883, name => "018" }, # Southern Africa
1851 { id => 10210824, name => "155" }, # Western Europe
1852 { id => 10210825, name => "053" }, # Australia and New Zealand
1853 { id => 161832015, name => "BL" }, # Saint Barthélemy
1854 { id => 161832256, name => "UM" }, # U.S. Minor Outlying Islands
1855 { id => 161832257, name => "419", parent => "019" }, # Latin America and the Caribbean
1856 { id => 161832258, name => "BQ" }, # Bonaire, Sint Eustatius and Saba
# File-scope tables shared by the loaders and generators.  All arrays start
# out empty; the ones documented below are indexed by code point and filled
# by load_data() from UnicodeData.txt.
my (@cp2uni, @glyph2uni, @lead_bytes, @uni2cp);

# simple case mappings (upper/lower fields) and digit values
my (@tolower_table, @toupper_table, @digitmap_table);

# <narrow>/<wide> (and some <font>/<square>/<circle>) single-character mappings
my (@halfwidth_table, @fullwidth_table);

my (@cjk_compat_table, @chinese_traditional_table, @chinese_simplified_table);

# per-character properties: category flags, initial joining type,
# bidi direction and combining class
my (@category_table, @initial_joining_table, @direction_table, @combining_class_table);

# canonical and compatibility decompositions, composition exclusions
my (@decomp_table, @decomp_compat_table, @comp_exclusions);

my (@idna_decomp_table, @idna_disallowed);
my %registry_keys;
my ($default_char, $default_wchar);

# one array per Arabic joining form; load_data() stores the presentation
# form character at the index of the character it decomposes to
my %joining_forms = map { $_ => [] } qw(isolated final initial medial);

# description ("url" or "url:member") of the data file opened last,
# set by open_data_file()
my $current_data_file;
################################################################
# convert a list of code points to UTF-16 code units; values of 0x10000
# and above are replaced by a high/low surrogate pair
sub to_utf16(@)
{
    return map {
        my $offset = $_ - 0x10000;
        $offset < 0 ? $_ : (0xd800 | ($offset >> 10), 0xdc00 | ($offset & 0x3ff));
    } @_;
}
################################################################
# fetch a unicode.org file and open it
#
# $id selects an entry of %data_files; $name is the member to extract when
# the download is a .zip or .tar.gz archive and is unused for plain files.
# The download is cached under $XDG_CACHE_HOME/wine (or ~/.cache/wine) and
# its SHA-256 is checked against the expected value on every call.
# Returns a read handle on the (extracted) data and records the source in
# $current_data_file.  Dies on an unknown id, a failed download, a checksum
# mismatch or a failed open.
sub open_data_file($@)
{
    my ($id, $name) = @_;
    my $data = $data_files{$id};
    die "unknown data file '$id'" unless defined $data;

    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
    my $url = $data->{url};
    my $filename = "$cache/" . ($data->{name} || ($url =~ s/.*\/([^\/]+)$/$1/r));

    unless (-f $filename)
    {
        print "Fetching $url...\n";
        system( "mkdir", "-p", $cache ) == 0 or die "cannot create $cache";
        if (system( "wget", "-q", "-O", $filename, $url ) != 0)
        {
            # wget -O creates the output file even when the transfer fails;
            # remove it so that the next run retries the download instead of
            # tripping over the checksum of an empty file
            unlink $filename;
            die "cannot fetch $url";
        }
    }

    my $sha = Digest::SHA->new( "sha256" )->addfile( $filename )->hexdigest;
    die "invalid checksum $sha for $filename" unless $sha eq $data->{sha};

    # list-form pipe opens and three-argument open: no shell, and the file
    # name can never be taken for an open mode
    my $file;
    if ($filename =~ /\.zip$/)
    {
        open $file, "-|", "unzip", "-p", $filename, $name or die "cannot extract $name from $filename";
    }
    elsif ($filename =~ /\.tar\.gz$/)
    {
        open $file, "-|", "tar", "-x", "-f", $filename, "-O", $name or die "cannot extract $name from $filename";
    }
    else
    {
        open $file, "<", $filename or die "cannot open $filename";
    }
    $current_data_file = $name ? "$url:$name" : $url;
    return $file;
}
################################################################
# load a unicode.org file as XML data
#
# Takes the same arguments as open_data_file() and returns the document
# produced by XML::LibXML->load_xml for that file.
sub load_xml_data_file($@)
{
    my ($id, $name) = @_;
    my $FILE = open_data_file( $id, $name );
    my $xml = XML::LibXML->load_xml( IO => $FILE );
    # close the handle we were given: the bareword FILE would name the
    # global glob, which is not the handle opened inside open_data_file()
    close $FILE;
    return $xml;
}
################################################################
# recursively get the decomposition for a character
#
# $table is a reference to an array indexed by code point whose defined
# entries are array references listing the replacement characters.
sub get_decomposition($$);
sub get_decomposition($$)
{
    my ($char, $table) = @_;
    my $mapping = $table->[$char];

    # a character without an entry stands for itself
    return $char unless defined $mapping;
    return map { get_decomposition( $_, $table ) } @{$mapping};
}
################################################################
# get the composition that results in a given character
#
# Returns the decomposition of $ch from @decomp_table when that sequence may
# be composed back into $ch, and the empty list otherwise.  $compat == 1 adds
# a check against @decomp_compat_table, $compat == 2 adds the checks against
# @idna_decomp_table.
sub get_composition($$)
{
    my ($ch, $compat) = @_;

    my $decomp = $decomp_table[$ch];
    return () unless defined $decomp;             # no decomposition
    my @pair = @{$decomp};
    return () if @pair < 2;                       # singleton decomposition
    return () if $comp_exclusions[$ch];           # composition exclusion
    return () if $combining_class_table[$ch];     # non-starter

    my ($first, $second) = @pair;
    return () if $combining_class_table[$first];  # first char is non-starter

    if ($compat == 1)
    {
        # first char has compat decomposition
        return () if !defined $decomp_table[$first] && defined $decomp_compat_table[$first];
    }
    elsif ($compat == 2)
    {
        my $idna_first = $idna_decomp_table[$first];

        # first char has IDNA decomposition
        return () if !defined $decomp_table[$first] && defined $idna_first;
        # first char's decomposition has IDNA decomposition
        return () if defined $idna_first && defined $idna_decomp_table[$idna_first->[0]];
        # second char has IDNA decomposition
        return () if defined $idna_decomp_table[$second];
    }
    return @pair;
}
################################################################
# recursively build decompositions
#
# Takes a decomposition table (list indexed by code point) and returns a
# table with the same defined indices whose entries hold the fully expanded
# decomposition as UTF-16 code units.
sub build_decompositions(@)
{
    my @src = @_;
    my @dst;

    foreach my $ch (grep { defined $src[$_] } 0 .. $#src)
    {
        $dst[$ch] = [ to_utf16( get_decomposition( $ch, \@src ) ) ];
    }
    return @dst;
}
################################################################
# compose Hangul sequences
#
# Scans the given code points and replaces a leading consonant followed by
# a vowel with the LV syllable, and an LV syllable followed by a trailing
# consonant with the LVT syllable.  Everything else is passed through.
sub compose_hangul(@)
{
    my ($s_base, $l_base, $v_base, $t_base) = (0xac00, 0x1100, 0x1161, 0x11a7);
    my ($l_count, $v_count, $t_count) = (19, 21, 28);
    my $s_count = $l_count * $v_count * $t_count;

    my @pending = @_;
    my @composed;

    while (@pending)
    {
        my $ch = shift @pending;

        # leading consonant + vowel -> LV syllable
        if (@pending && $ch >= $l_base && $ch < $l_base + $l_count &&
            $pending[0] >= $v_base && $pending[0] < $v_base + $v_count)
        {
            my $vowel = shift @pending;
            $ch = $s_base + (($ch - $l_base) * $v_count + ($vowel - $v_base)) * $t_count;
        }

        # LV syllable + trailing consonant -> LVT syllable; $t_base itself is
        # one below the first trailing consonant, hence the strict comparison
        if (@pending && $ch >= $s_base && $ch < $s_base + $s_count &&
            ($ch - $s_base) % $t_count == 0 &&
            $pending[0] > $t_base && $pending[0] < $t_base + $t_count)
        {
            $ch += (shift @pending) - $t_base;
        }
        push @composed, $ch;
    }
    return @composed;
}
################################################################
# remove linguistic-only mappings from the case table
#
# $upper and $lower are references to the upper-case and lower-case tables
# (arrays indexed by code point); both are modified in place.  A mapping is
# kept only if the other table maps its target back to the source.
sub remove_linguistic_mappings($$)
{
    my ($upper, $lower) = @_;

    # remove case mappings that don't round-trip; the upper-case table is
    # pruned first, so the lower-case pass checks against the pruned table
    foreach my $pass ([ $upper, $lower ], [ $lower, $upper ])
    {
        my ($table, $inverse) = @{$pass};

        foreach my $i (0 .. $#{$table})
        {
            my $ch = $table->[$i];
            next unless defined $ch;
            my $back = $inverse->[$ch];
            $table->[$i] = undef unless defined $back && $back == $i;
        }
    }
}
2073 ################################################################
2074 # read in the Unicode database files
# Fill the global Unicode tables from the data files, in this order:
# UnicodeData.txt (categories, directions, joining defaults, case maps,
# digits, combining classes, decompositions and width/joining forms),
# CompositionExclusions.txt, IdnaMappingTable.txt and Unihan_Variants.txt.
# Takes no arguments; all results are stored in file-level tables.
2075 sub load_data()
# code point of the most recent "<..., First>" entry, consumed by the
# matching "<..., Last>" entry
2077 my $start;
2079 # now build mappings from the decomposition field of the Unicode database
2081 my $UNICODE_DATA = open_data_file( "ucd", "UnicodeData.txt" );
2082 while (<$UNICODE_DATA>)
2084 # Decode the fields ...
2085 my ($code, $name, $cat, $comb, $bidi,
2086 $decomp, $dec, $dig, $num, $mirror,
2087 $oldname, $comment, $upper, $lower, $title) = split /;/;
2088 my $src = hex $code;
# abort on a general category or bidi class that has no entry in the
# lookup hashes
2090 die "unknown category $cat" unless defined $categories{$cat};
2091 die "unknown directionality $bidi" unless defined $directions{$bidi};
# the category table stores the numeric value from %categories, the
# direction table stores the bidi class name itself
2093 $category_table[$src] = $categories{$cat};
2094 $direction_table[$src] = $bidi;
# default joining type: T for Mn, Me and Cf characters, U for the rest
2095 if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
2097 $initial_joining_table[$src] = $joining_types{"T"};
2099 else
2101 $initial_joining_table[$src] = $joining_types{"U"};
# simple case mappings, only recorded when the field is non-empty
2104 if ($lower ne "")
2106 $tolower_table[$src] = hex $lower;
2108 if ($upper ne "")
2110 $toupper_table[$src] = hex $upper;
# a non-empty $dec field sets the "digit" flag
2112 if ($dec ne "")
2114 $category_table[$src] |= $ctype{"digit"};
# the digit map stores the character code of the first character of $dig
2116 if ($dig ne "")
2118 $digitmap_table[$src] = ord $dig;
2120 $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
# additional ctype flags derived from the bidi class and the character names
2122 $category_table[$src] |= $ctype{"nonspacing"} if $bidi eq "NSM";
2123 $category_table[$src] |= $ctype{"diacritic"} if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
2124 $category_table[$src] |= $ctype{"vowelmark"} if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
2125 $category_table[$src] |= $ctype{"halfwidth"} if $name =~ /^HALFWIDTH\s/;
2126 $category_table[$src] |= $ctype{"fullwidth"} if $name =~ /^FULLWIDTH\s/;
2127 $category_table[$src] |= $ctype{"hiragana"} if $name =~ /(HIRAGANA)|(\WKANA\W)/;
2128 $category_table[$src] |= $ctype{"katakana"} if $name =~ /(KATAKANA)|(\WKANA\W)/;
2129 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^<CJK Ideograph/;
2130 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
2131 $category_table[$src] |= $ctype{"ideograph"} if $name =~ /^HANGZHOU/;
2132 $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
2133 $category_table[$src] |= $ctype{"lowsurrogate"} if $name =~ /Low Surrogate/;
2135 # copy the category and direction for everything between First/Last pairs
# (the combining class is copied as well; the initial joining table is not)
2136 if ($name =~ /, First>/) { $start = $src; }
2137 if ($name =~ /, Last>/)
2139 while ($start < $src)
2141 $category_table[$start] = $category_table[$src];
2142 $direction_table[$start] = $direction_table[$src];
2143 $combining_class_table[$start] = $combining_class_table[$src];
2144 $start++;
2148 next if $decomp eq ""; # no decomposition, skip it
# any tagged decomposition "<tag> xxxx ..." is recorded as a compatibility
# decomposition: the sequence is everything after the tag
2150 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
2152 my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
2153 $decomp_compat_table[$src] = \@seq;
2156 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
2158 # decomposition of the form "<foo> 1234" -> use char if type is known
2159 my $dst = hex $2;
2160 if ($1 eq "narrow")
2162 $halfwidth_table[$dst] = $src;
2163 $fullwidth_table[$src] = $dst;
2165 elsif ($1 eq "wide")
# this "next" abandons the whole UnicodeData line, not just the branch
2167 next if $dst == 0x5c; # don't remap backslash
2168 $fullwidth_table[$dst] = $src;
2169 $halfwidth_table[$src] = $dst;
2171 elsif ($1 eq "font" || $1 eq "square" || $1 eq "circle")
# only characters at or above 0x10000 get a fullwidth mapping here
2173 $fullwidth_table[$src] = $dst if $src >= 0x10000;
2175 elsif ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
# per-form table indexed by the base character, holding the presentation form
2177 ${joining_forms{$1}}[$dst] = $src;
2180 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
2182 # decomposition "<compat> 0020 1234" -> combining accent
# (no statement is visible for this branch in this view)
2184 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
2186 # store decomposition
# untagged decompositions of one or two code points go into both the
# canonical and the compatibility table; other lengths match neither
# pattern below and are not stored
2187 if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
2189 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
2191 elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
2193 my $dst = hex $1;
2194 # Single char decomposition
2195 $decomp_table[$src] = $decomp_compat_table[$src] = [ $dst ];
2196 if ($name =~ /^CJK COMPATIBILITY IDEOGRAPH/)
2198 $cjk_compat_table[$src] = $dst;
2199 $fullwidth_table[$src] = $dst if $src >= 0x10000;
2204 close $UNICODE_DATA;
2206 # patch the category of some special characters
# a character with a canonical decomposition inherits every category flag
# of the first character of that decomposition
2208 for (my $i = 0; $i < @decomp_table; $i++)
2210 next unless defined $decomp_table[$i];
2211 $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
# set the flag named by each %special_categories key on its listed characters
2213 foreach my $cat (keys %special_categories)
2215 my $flag = $ctype{$cat};
2216 foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
# a two-element compatibility decomposition passes the "diacritic" flag of
# its second element on to the composed character
2218 for (my $i = 0; $i < @decomp_compat_table; $i++)
2220 next unless defined $decomp_compat_table[$i];
2221 next unless @{$decomp_compat_table[$i]} == 2;
2222 $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
2225 # load the composition exclusions
# accepts either a single code point or a "first..last" range per line
2227 my $EXCL = open_data_file( "ucd", "CompositionExclusions.txt" );
2228 while (<$EXCL>)
2230 s/\#.*//; # remove comments
2231 if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
2233 foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
2235 elsif (/^([0-9a-fA-F]+)\s*$/)
2237 $comp_exclusions[hex $1] = 1;
2240 close $EXCL;
2242 # load the IDNA mappings
# the IDNA table starts out as a copy of the compatibility decompositions
# and is then overridden entry by entry
2244 @idna_decomp_table = @decomp_compat_table;
2245 my $IDNA = open_data_file( "idna", "IdnaMappingTable.txt" );
2246 while (<$IDNA>)
2248 s/\#.*//; # remove comments
2249 next if /^\s*$/;
2250 my ($char, $type, $mapping) = split /;/;
# $ch1 .. $ch2 is the affected range; both are equal for a single code point
2251 my ($ch1, $ch2);
2252 if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
2254 $ch1 = hex $1;
2255 $ch2 = hex $2;
2257 elsif ($char =~ /([0-9a-fA-F]+)/)
2259 $ch1 = $ch2 = hex $1;
# mapped/deviation: store the mapping sequence (one array shared by the
# whole range), or [ 0 ] when the mapping is empty
2262 if ($type =~ /mapped/ || $type =~ /deviation/)
2264 $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
2265 my @seq = map { hex $_; } split /\s+/, $mapping;
2266 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
# valid: no statement is visible for this branch in this view
2268 elsif ($type =~ /valid/)
# ignored: mapped to [ 0 ]
2271 elsif ($type =~ /ignored/)
2273 foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
# disallowed: the inherited decomposition is dropped and the code point flagged
2275 elsif ($type =~ /disallowed/)
2277 foreach my $i ($ch1 .. $ch2)
2279 $idna_decomp_table[$i] = undef;
2280 $idna_disallowed[$i] = 1;
2284 close $IDNA;
2286 # load the Unihan mappings
# only lines whose source and target are both exactly four hex digits match
2288 my $UNIHAN = open_data_file( "unihan", "Unihan_Variants.txt" );
2289 while (<$UNIHAN>)
2291 s/\#.*//; # remove comments
2292 next if /^\s*$/;
2293 if (/^U\+([0-9a-fA-F]{4})\s+kTraditionalVariant\s+U\+([0-9a-fA-F]{4})$/)
2295 next if hex $1 < 0x4dc0; # skip extension A
2296 $chinese_traditional_table[hex $1] = hex $2;
2298 elsif (/^U\+([0-9a-fA-F]{4})\s+kSimplifiedVariant\s+U\+([0-9a-fA-F]{4})$/)
2300 next if hex $1 < 0x4dc0; # skip extension A
2301 $chinese_simplified_table[hex $1] = hex $2;
2304 close $UNIHAN;
# compatibility ideographs in 0xf900..0xfaff map to their decomposition in
# the simplified table, unless that decomposition itself has a simplified
# variant
2305 foreach my $i (0xf900..0xfaff)
2307 next unless defined $cjk_compat_table[$i];
2308 next if defined $chinese_simplified_table[$cjk_compat_table[$i]];
2309 $chinese_simplified_table[$i] = $cjk_compat_table[$i];
2314 ################################################################
2315 # add a new registry key
# The key is stored in %registry_keys under "base\key" as an array whose
# first element is the default value; a key that already has an entry is
# left untouched.
sub add_registry_key($$$)
{
    my ($base, $key, $defval) = @_;
    my $path = "$base\\$key";

    return if defined $registry_keys{$path};
    $registry_keys{$path} = [ $defval ];
}
2322 ################################################################
2323 # add a new registry value with explicit type
# Makes sure the key exists (with an undefined default value if it has to
# be created) and appends a "'name' = value" line to it.  $value must
# already carry its type prefix.
sub add_registry_value($$$$)
{
    my ($base, $key, $name, $value) = @_;
    my $path = "$base\\$key";

    add_registry_key( $base, $key, undef );
    push @{ $registry_keys{$path} }, sprintf( "'%s' = %s", $name, $value );
}
2331 ################################################################
2332 # add a new registry string value
# The value is emitted as s '...' with every single quote doubled.
sub add_registry_string_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    (my $escaped = $value) =~ s/\'/\'\'/g;
    add_registry_value( $base, $key, $name, "s '" . $escaped . "'" );
}
2340 ################################################################
2341 # add a new registry dword value
# The value is emitted as "d <value>", without any reformatting.
sub add_registry_dword_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    my $typed = "d " . $value;
    add_registry_value( $base, $key, $name, $typed );
}
2348 ################################################################
2349 # add a new registry binary value
# The value is emitted as "b " followed by two lowercase hex digits for
# each character of $value.
sub add_registry_binary_value($$$$)
{
    my ($base, $key, $name, $value) = @_;

    my @hex = map { sprintf "%02x", $_ } unpack( "C*", $value );
    add_registry_value( $base, $key, $name, "b " . join( "", @hex ) );
}
2356 ################################################################
2357 # define a new lead byte
# A byte that already has an entry in @cp2uni is ignored; otherwise it is
# appended to @lead_bytes and its @cp2uni entry is set to 0.
sub add_lead_byte($)
{
    my ($byte) = @_;

    unless (defined $cp2uni[$byte])
    {
        push @lead_bytes, $byte;
        $cp2uni[$byte] = 0;
    }
}
2366 ################################################################
2367 # define a new char mapping
# The first mapping registered for a code page char or for a Unicode char
# wins; later ones do not overwrite it.  A code page value above 0xff
# also registers its high byte as a lead byte.
sub add_mapping($$)
{
    my ($cp, $uni) = @_;

    $cp2uni[$cp] //= $uni;
    $uni2cp[$uni] //= $cp;
    add_lead_byte( $cp >> 8 ) if $cp > 0xff;
}
2376 ################################################################
2377 # get a mapping including glyph chars for MB_USEGLYPHCHARS
# Returns a copy of the given table in which every index that has a
# defined @glyph2uni entry is replaced by that entry.
sub get_glyphs_mapping(@)
{
    my @table = @_;

    foreach my $idx (0 .. $#glyph2uni)
    {
        my $glyph = $glyph2uni[$idx];
        $table[$idx] = $glyph if defined $glyph;
    }
    return @table;
}
2389 ################################################################
2390 # build EUC-JP table from the JIS 0208/0212 files
# Resets the global code page tables, fills them with the fixed EUC-JP
# mappings and the contents of the "jis0208" and "jis0212" data files,
# then writes the result as code page 20932.
sub dump_eucjp_codepage()
{
    # start from empty tables
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = 0x30fb;

    # ASCII chars
    add_mapping( $_, $_ ) foreach 0x00 .. 0x7f;

    # lead bytes
    add_lead_byte( $_ ) foreach 0x8e, 0xa1 .. 0xfe;

    # JIS X 0201 right plane
    add_mapping( 0x8e00 + $_, 0xfec0 + $_ ) foreach 0xa1 .. 0xdf;

    # undefined chars
    $cp2uni[$_] = $_ foreach 0x80 .. 0x8d, 0x8f .. 0x9f;
    $cp2uni[0xa0] = 0xf8f0;
    $cp2uni[0xff] = 0xf8f3;

    # Fix backslash conversion
    add_mapping( 0xa1c0, 0xff3c );

    # Add private mappings for rows undefined in JIS 0208/0212:
    # first every row with trail bytes 0xa1..0xfe, then every row with
    # trail bytes 0x21..0x7e, numbering consecutively from 0xe000
    my $private = 0xe000;
    foreach my $trail ([ 0xa1, 0xfe ], [ 0x21, 0x7e ])
    {
        my ($first, $last) = @$trail;
        foreach my $hi (0xf5 .. 0xfe)
        {
            foreach my $lo ($first .. $last)
            {
                add_mapping( ($hi << 8) + $lo, $private++ );
            }
        }
    }

    # JIS X 0208: the second column is the code, the third the Unicode char
    my $INPUT = open_data_file( "jis0208" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^$/;      # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "Unrecognized line $line\n"
            unless $line =~ /^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex $1, hex $2 );
    }
    close $INPUT;

    # JIS X 0212: only the low byte gets its high bit set
    $INPUT = open_data_file( "jis0212" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^$/;      # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "Unrecognized line $line\n"
            unless $line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8000 + hex $1, hex $2 );
    }
    close $INPUT;

    output_codepage_file( 20932 );
}
2467 ################################################################
2468 # build Korean Wansung table from the KSX1001 file
# Takes the Unicode -> code page table of cp949 as argument.  Resets the
# global code page tables, loads the "ksx1001" data file, adds the cp949
# characters whose lead byte is already known, then writes code page 20949.
sub dump_krwansung_codepage(@)
{
    my @cp949 = @_;

    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = 0x3f;
    $default_wchar = 0x003f;

    # ASCII and undefined chars
    add_mapping( $_, $_ ) foreach 0x00 .. 0x9f;
    my @fixed = ( [ 0xa0, 0xf8e6 ], [ 0xad, 0xf8e7 ], [ 0xae, 0xf8e8 ],
                  [ 0xaf, 0xf8e9 ], [ 0xfe, 0xf8ea ], [ 0xff, 0xf8eb ] );
    foreach my $pair (@fixed)
    {
        add_mapping( $pair->[0], $pair->[1] );
    }

    my $INPUT = open_data_file( "ksx1001" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^$/;      # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "Unrecognized line $line\n"
            unless $line =~ /^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/;
        add_mapping( 0x8080 + hex $1, hex $2 );
    }
    close $INPUT;

    # get some extra mappings from cp 949
    # (the set of lead bytes is frozen here, before any cp949 char is added)
    my @defined_lb;
    $defined_lb[$_] = 1 foreach @lead_bytes;
    foreach my $uni (0x0000 .. 0xffff)
    {
        next if $uni >= 0x1100 && $uni <= 0x11ff;  # range not used in 20949
        my $cp = $cp949[$uni];
        next unless defined $cp;
        if ($cp >= 0xff)
        {
            # only add chars for lead bytes that exist in 20949
            my ($hi, $lo) = ($cp >> 8, $cp & 0xff);
            next unless $defined_lb[$hi];
            next unless $lo >= 0xa1 && $lo <= 0xfe;
        }
        add_mapping( $cp, $uni );
    }

    output_codepage_file( 20949 );
}
2525 ################################################################
2526 # dump an array of integers
# Formats the values as a comma-separated list of zero-padded hex numbers
# ($bit_width / 4 digits each), eight per line.  Undefined entries are
# printed as $default.  An empty list still yields one $default entry.
# Returns the text; nothing is printed.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;
    my $last = @array ? $#array : 0;
    my $ret = " ";

    foreach my $idx (0 .. $last)
    {
        my $value = defined $array[$idx] ? $array[$idx] : $default;
        $ret .= sprintf( $format, $value );
        next if $idx == $last;      # no separator after the final entry
        $ret .= ($idx % 8 == 7) ? ",\n " : ", ";
    }
    return $ret;
}
2543 ################################################################
2544 # dump an SBCS mapping table in binary format
# Writes to OUTPUT, as little-endian 16-bit words unless noted:
#  - the 7-word header,
#  - 12 zero bytes (no lead byte ranges),
#  - the offset of the Unicode -> code page table and the 256-entry
#    code page -> Unicode table,
#  - the glyph table (256 and 256 entries) or a single 0 when there are
#    no glyph mappings,
#  - two zero words,
#  - the 65536-entry Unicode -> code page table, one byte per entry.
sub dump_binary_sbcs_table($)
{
    my $codepage = shift;

    my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
    my $has_glyphs = @glyph2uni ? 1 : 0;
    my $wc_offset = 256 + 3 + ($has_glyphs ? 256 : 0);

    print OUTPUT pack( "S<*", @header );
    print OUTPUT pack( "C12", (0) x 12 );
    print OUTPUT pack( "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255] );

    my @glyph_section = $has_glyphs ? ( 256, get_glyphs_mapping(@cp2uni[0 .. 255]) ) : ( 0 );
    print OUTPUT pack( "S<*", @glyph_section );

    print OUTPUT pack( "S<*", 0, 0 );

    # unmapped Unicode chars convert to the default char
    print OUTPUT pack( "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535] );
}
2571 ################################################################
2572 # dump a DBCS mapping table in binary format
# Writes to OUTPUT, as little-endian 16-bit words unless noted:
#  - the 7-word header,
#  - 12 bytes holding the lead byte ranges, zero padded,
#  - the offset of the Unicode -> code page table and the 256-entry
#    single byte -> Unicode table,
#  - a zero word, the number of lead byte ranges and the 256 per-byte
#    offsets of the trail byte tables,
#  - one 256-entry trail byte table per lead byte, in @lead_bytes order,
#  - the word 4 and the 65536-entry Unicode -> code page table.
# Side effect: the @cp2uni entry of every lead byte is reset to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    # the header is built before the lead byte entries of @cp2uni are reset
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # lead bytes get consecutive multiples of 256 as offsets; every other
    # byte keeps offset 0
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # Pad the range list with a list of twelve zeros.  "0 x 12" would be the
    # single string "000000000000", which only produced the right bytes
    # because pack treats missing arguments as empty.
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        # unmapped double byte chars convert to the default Unicode char
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
2605 ################################################################
2606 # get the list of defined lead byte ranges
# Returns a flat list (first1, last1, first2, last2, ...) of the runs of
# consecutive byte values 0..255 that appear in @lead_bytes.
sub get_lb_ranges()
{
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @ranges;
    my $inside = 0;

    foreach my $byte (0 .. 0xff)
    {
        my $lead = $is_lead{$byte} ? 1 : 0;
        next if $lead == $inside;
        # entering a run records its first byte, leaving it records the
        # last byte that was still inside
        push @ranges, $lead ? $byte : $byte - 1;
        $inside = $lead;
    }
    push @ranges, 0xff if $inside;
    return @ranges;
}
2629 ################################################################
2630 # dump the Indic Syllabic Category table
# Builds one table from IndicSyllabicCategory.txt (low byte) and
# IndicPositionalCategory.txt (shifted left by 8), then writes it to
# $filename as the two-level mapping "indic_syllabic_table".
sub dump_indic($)
{
    my $filename = shift;
    my @indic_table;

    my $INPUT = open_data_file( "ucd", "IndicSyllabicCategory.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;      # skip comments
        next if $line =~ /^\s*$/;    # skip empty lines
        next if $line =~ /\x1a/;     # skip ^Z

        my ($first, $last, $type);
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $line";
        }
        die "unknown indic $type" unless defined $indic_types{$type};

        # entries that are not entirely below 65536 are dropped
        next unless $first < 65536 && $last < 65536;
        $indic_table[$_] = $indic_types{$type} foreach $first .. $last;
    }
    close $INPUT;

    # remember the first data file name for the generated header comment
    my $prev_data_file = $current_data_file;
    $INPUT = open_data_file( "ucd", "IndicPositionalCategory.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;      # skip comments
        next if $line =~ /^\s*$/;    # skip empty lines
        next if $line =~ /\x1a/;     # skip ^Z

        my ($first, $last, $type);
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $line";
        }
        die "unknown matra $type" unless defined $matra_types{$type};

        # the positional category goes into the bits above the low byte
        $indic_table[$_] |= $matra_types{$type} << 8 foreach $first .. $last;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $prev_data_file */\n",
                 "/* and from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
2711 ################################################################
2712 # dump the Line Break Properties table
# Reads LineBreak.txt and writes the break class of every character to
# $filename as the three-level mapping "wine_linebreak_table".
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    my $INPUT = open_data_file( "ucd", "LineBreak.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;      # skip comments
        next if $line =~ /^\s*$/;    # skip empty lines
        next if $line =~ /\x1a/;     # skip ^Z

        # three-character class names are tried before two-character
        # ones, single code points before ranges
        my ($first, $last, $type);
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $line";
        }
        die "unknown breaktype $type" unless defined $break_types{$type};

        $break_table[$_] = $break_types{$type} foreach $first .. $last;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    # characters without an entry get the XX class
    dump_three_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
# Script name -> numeric script id.  Ids are assigned sequentially from 0
# in list order, so inserting or reordering names changes the id of every
# script after that point; new scripts go at the end.
my %scripts;
{
    my @names =
    (
        qw(Unknown Common Inherited Arabic Armenian Avestan Balinese Bamum
           Batak Bengali Bopomofo Brahmi Braille Buginese Buhid
           Canadian_Aboriginal Carian Cham Cherokee Coptic Cuneiform Cypriot
           Cyrillic Deseret Devanagari Egyptian_Hieroglyphs Ethiopic Georgian
           Glagolitic Gothic Greek Gujarati Gurmukhi Han Hangul Hanunoo
           Hebrew Hiragana Imperial_Aramaic Inscriptional_Pahlavi
           Inscriptional_Parthian Javanese Kaithi Kannada Katakana Kayah_Li
           Kharoshthi Khmer Lao Latin Lepcha Limbu Linear_B Lisu Lycian
           Lydian Malayalam Mandaic Meetei_Mayek Mongolian Myanmar
           New_Tai_Lue Nko Ogham Ol_Chiki Old_Italic Old_Persian
           Old_South_Arabian Old_Turkic Oriya Osmanya Phags_Pa Phoenician
           Rejang Runic Samaritan Saurashtra Shavian Sinhala Sundanese
           Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tai_Tham Tai_Viet
           Tamil Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi),
        # Win8/Win8.1
        qw(Chakma Meroitic_Cursive Meroitic_Hieroglyphs Miao Sharada
           Sora_Sompeng Takri),
        # Win10
        qw(Bassa_Vah Caucasian_Albanian Duployan Elbasan Grantha Khojki
           Khudawadi Linear_A Mahajani Manichaean Mende_Kikakui Modi Mro
           Nabataean Old_North_Arabian Old_Permic Pahawh_Hmong Palmyrene
           Pau_Cin_Hau Psalter_Pahlavi Siddham Tirhuta Warang_Citi),
        # Win10 RS1
        qw(Adlam Ahom Anatolian_Hieroglyphs Bhaiksuki Hatran Marchen Multani
           Newa Old_Hungarian Osage SignWriting Tangut),
        # Win10 RS4
        qw(Masaram_Gondi Nushu Soyombo Zanabazar_Square),
        # Win10 1903
        qw(Dogra Gunjala_Gondi Hanifi_Rohingya Makasar Medefaidrin
           Old_Sogdian Sogdian),
        # Win10 2004
        qw(Elymaic Nyiakeng_Puachue_Hmong Nandinagari Wancho),
        # Win11
        qw(Chorasmian Dives_Akuru Khitan_Small_Script Yezidi),
    );
    @scripts{@names} = (0 .. $#names);
}
2943 ################################################################
2944 # dump Script IDs table
# Reads Scripts.txt and writes two files derived from the base name
# $filename:
#  - "$filename.h": enum unicode_script_id with one Script_<name> entry
#    per %scripts key, sorted by id, plus Script_LastId;
#  - "$filename.c": the three-level mapping "wine_scripts_table" from
#    character to script id.
# Scripts that are not in %scripts, and lines that match neither pattern,
# are silently ignored.
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( "ucd", "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/;     # skip comments
        next if /^\s*$/;   # skip empty lines
        next if /\x1a/;    # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            my $type = $2;
            if (defined $scripts{$type})
            {
                $scripts_table[hex $1] = $scripts{$type};
            }
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            my $type = $3;
            if (defined $scripts{$type})
            {
                foreach my $i (hex $1 .. hex $2)
                {
                    $scripts_table[$i] = $scripts{$type};
                }
            }
        }
    }
    close $INPUT;

    # the header with the script id enumeration
    my $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT " Script_$script = $scripts{$script},\n";
    }
    print OUTPUT " Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    # the source file with the lookup table; a failure here is reported
    # against the .c file, not the header
    my $source = "$filename.c";
    open OUTPUT,">$source.new" or die "Cannot create $source";
    print "Building $source\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_three_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($source);
}
3019 ################################################################
3020 # dump the BiDi mirroring table
# Reads BidiMirroring.txt and writes the character -> mirrored character
# table to $filename as the two-level mapping "wine_mirror_map".
sub dump_mirroring($)
{
    my $filename = shift;
    my @mirror_table = ();

    my $INPUT = open_data_file( "ucd", "BidiMirroring.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;     # skip comments
        next if $line =~ /^$/;      # skip empty lines
        next if $line =~ /\x1a/;    # skip ^Z
        die "malformed line $line"
            unless $line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/;
        $mirror_table[hex $1] = hex $2;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    # characters without a mirrored counterpart map to 0
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
3052 ################################################################
3053 # dump the Bidi Brackets
# Reads BidiBrackets.txt and writes the table to $filename as the
# two-level mapping "bidi_bracket_table".  Each entry holds the bracket
# type from %bracket_types in the bits above the low byte, and the
# distance to the paired bracket in the low byte.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( "ucd", "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;     # skip comments
        next if /^\s*$/;   # skip empty lines
        next if /\x1a/;    # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my $type = $3;
            die "unknown bracket $type" unless defined $bracket_types{$type};
            # the distance has to fit into a signed 8-bit value
            die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;

            my $ch = hex $1;
            my $delta = hex($2) - $ch;
            # Store the distance as an 8-bit two's complement value: reduce
            # modulo 256 so that a negative distance d becomes 256 + d.
            # (Modulo 255 would store -1 as 0xfe, which is -2 as a signed byte.)
            # NOTE(review): assumes the code reading bidi_bracket_table
            # sign-extends the low byte - confirm against the consumer.
            $bracket_table[$ch] = $delta % 256;
            $bracket_table[$ch] += $bracket_types{$type} << 8;
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
3091 ################################################################
3092 # dump the Arabic shaping table
# Starts from the joining types computed while loading UnicodeData.txt,
# overrides them with the joining type column of ArabicShaping.txt, and
# writes to $filename:
#  - the two-level mapping "wine_shaping_table";
#  - "wine_shaping_forms", the isolated/final/initial/medial form of each
#    character in 0x600..0x6ff (the character itself when it has none).
sub dump_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( "ucd", "ArabicShaping.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;      # skip comments
        next if $line =~ /^\s*$/;    # skip empty lines
        next if $line =~ /\x1a/;     # skip ^Z
        die "malformed line $line"
            unless $line =~ /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        $joining_table[hex $1] = $joining_types{$2};
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        my @forms = map { $joining_forms{$_}[$ch] || $ch } qw(isolated final initial medial);
        printf OUTPUT " { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
3138 ################################################################
3139 # dump the Arabic shaping table
# Variant of the shaping table that also distinguishes joining groups.
# Starts from the joining types computed while loading UnicodeData.txt and
# overrides them from ArabicShaping.txt: characters of the ALAPH and
# DALATH RISH joining groups get the value stored under the group name in
# %joining_types, all others the value of their joining type.  The result
# is written to $filename as the three-level mapping
# "arabic_shaping_table".
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( "ucd", "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;     # skip comments
        next if /^\s*$/;   # skip empty lines
        next if /\x1a/;    # skip ^Z
        # The joining group may consist of several space separated words
        # ("DALATH RISH"), so the last capture must not stop at the first
        # space; otherwise the DALATH RISH test below can never succeed.
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?: \w+)*)/)
        {
            my $type = $2;
            my $group = $3;

            # NOTE(review): assumes %joining_types has "ALAPH" and
            # "DALATH RISH" keys - confirm against its definition.
            if ($group eq "ALAPH" || $group eq "DALATH RISH")
            {
                $joining_table[hex $1] = $joining_types{$group};
            }
            else
            {
                $joining_table[hex $1] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_three_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
3184 ################################################################
3185 # dump the Vertical Orientation table
# Reads VerticalOrientation.txt and writes the table to $filename as the
# two-level mapping "vertical_orientation_table".  When $unix is true the
# file additionally gets a "#pragma makedep unix" marker inside "#if 0".
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( "ucd", "VerticalOrientation.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;      # skip comments
        next if $line =~ /^\s*$/;    # skip empty lines
        next if $line =~ /\x1a/;     # skip ^Z

        my ($first, $last, $type);
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            # single code points at or above 65536 are not stored
            next unless $first < 65536;
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown vertical $type" unless defined $vertical_types{$type};
        }
        else
        {
            die "malformed line $line";
        }

        $vertical_table[$_] = $vertical_types{$type} foreach $first .. $last;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n",
                 "/* generated from $current_data_file */\n",
                 "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#if 0\n", "#pragma makedep unix\n", "#endif\n\n" if $unix;
    print OUTPUT "#include \"windef.h\"\n\n";

    # characters without an entry get the R orientation
    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
3240 ################################################################
3241 # compress a mapping table by removing identical rows
# Splits the table into $rows rows of equal length (undefined entries
# count as $def) and packs them into one data sequence.  A row that
# already occurs anywhere in the data is reused, and a new row may overlap
# the tail of the data when that tail equals the start of the row.
# Returns the $rows row offsets, each being $rows plus the position of the
# row in the data, followed by the data itself.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $row_len = @table / $rows;
    my $packed = "";
    my @offsets;

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my @cells = map { defined($_) ? $_ : $def; } @table[($row * $row_len)..(($row + 1) * $row_len - 1)];
        my $rowtxt = pack "U*", @cells;
        my $pos = index $packed, $rowtxt;

        if ($pos < 0)
        {
            # check if the tail of the data can match the start of the new row
            my $first = substr( $rowtxt, 0, 1 );
            for (my $overlap = length($packed) - 1; $overlap > 0; $overlap--)
            {
                # jump to the next place in the tail where the row could start
                my $hit = index( substr( $packed, -$overlap ), $first );
                last if $hit == -1;
                $overlap -= $hit;
                next unless substr( $packed, -$overlap ) eq substr( $rowtxt, 0, $overlap );
                # drop the shared tail; it comes back as the start of the row
                substr( $packed, -$overlap ) = "";
                last;
            }
            $pos = length $packed;
            $packed .= $rowtxt;
        }
        push @offsets, $rows + $pos;
    }
    return @offsets, unpack "U*", $packed;
}
3277 ################################################################
3278 # dump a char -> value mapping table using two-level tables
# Arguments: table name, default value, element size in bits (16 selects
# "unsigned short", anything else "unsigned int"), then the values for
# characters 0..65535.  Prints to OUTPUT one C array made of 256 level 1
# offsets, the level 2 offset rows and the compressed 16-entry value rows.
sub dump_two_level_mapping($$$@)
{
    my ($name, $def, $size, @table) = @_;
    my $type = $size == 16 ? "unsigned short" : "unsigned int";

    # 4096 rows of 16 values, then 256 rows of 16 row offsets
    my @level2 = compress_array( 4096, $def, @table[0..65535] );
    my @values = splice @level2, 4096;
    my @level1 = compress_array( 256, 0, @level2 );
    my @rows = splice @level1, 256;

    # value offsets were computed relative to a 4096-entry index; rebase
    # them on the 256 level 1 entries plus the level 2 rows that precede
    # the values in the final array
    my $rebase = @rows + 256 - 4096;
    $_ += $rebase foreach @rows;

    printf OUTPUT "const %s %s[%d] =\n{\n", $type, $name, @level1 + @rows + @values;
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @level1 );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @rows );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @values );
}
################################################################
# dump a char -> value mapping table using three-level tables
#
# Arguments: C array name, default value, element size in bits, table values
# for characters 0..$MAX_CHAR. The table is written to OUTPUT.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $type = ($size == 16) ? "unsigned short" : "unsigned int";

    # number of rows at each level, every row holding 16 entries
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;

    # each array holds its row offsets first, followed by the row data
    my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # rebase the data offsets on their position in the final array
    my $rebase2 = @array1 + @array2 - $level2 - $level3;
    $_ += $rebase2 foreach @array2[$level2..$#array2];
    my $rebase1 = @array1 - $level2;
    $_ += $rebase1 foreach @array1[$level1..$#array1];

    printf OUTPUT "const %s %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT " /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT " /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT " /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT " /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
################################################################
# dump a binary case mapping table in l_intl.nls format
#
# Takes the case mapping table and returns the packed binary string.
# Entries are stored as 32-bit differences from the character code.
sub dump_binary_case_table(@)
{
    my @table = @_;
    my @difftable;

    foreach my $ch (0 .. $#table)
    {
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffffffff if defined $table[$ch];
    }

    # two-level compression of the first 65536 entries
    my @low_data = compress_array( 4096, 0, @difftable[0..65535] );
    my @low_array2 = splice( @low_data, 0, 4096 );
    my @low_row_data = compress_array( 256, 0, @low_array2 );
    my @low_array1 = splice( @low_row_data, 0, 256 );

    my @res;

    if (scalar @table <= 0x10000)
    {
        push @res, @low_array1;
        my $base = @res + @low_row_data - 4096;
        push @res, map { $_ + $base } @low_row_data;
        push @res, @low_data;
        return pack "S<*", 1 + scalar @res, @res;
    }

    # two-level compression of the entries above 0xffff
    my @high_data = compress_array( 32768, 0, @difftable[65536..$MAX_CHAR] );
    my @high_array2 = splice( @high_data, 0, 32768 );
    my @high_row_data = compress_array( 1024, 0, @high_array2 );
    my @high_array1 = splice( @high_row_data, 0, 1024 );

    # every base below is computed from the size of @res before the push
    push @res, map { $_ + 1024 } @low_array1;

    my $high_base = @res + @low_row_data + @low_data;
    push @res, map { $_ + $high_base } @high_array1;

    my $low_row_base = @res + @low_row_data - 4096;
    push @res, map { $_ + $low_row_base } @low_row_data;

    push @res, @low_data;

    my $high_row_base = @res + @high_row_data;
    push @res, map { 2 * ($_ - 32768) + $high_row_base } @high_row_data;

    return pack( "S<*", 1 + scalar @res + 2 * scalar @high_data, @res ) . pack( "L<*", @high_data );
}
################################################################
# dump case mappings for l_intl.nls
#
# Writes a version word followed by the upper and lower case tables (first
# 65536 characters, linguistic mappings removed) to "$filename.new", then
# hands the file to save_file().
sub dump_intl_nls($)
{
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table[0..65535] );
    my $lower = dump_binary_case_table( @lower_table[0..65535] );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # the file name is passed as an argument so that it is never
    # interpreted as a printf format
    printf "Building %s\n", $filename;

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
################################################################
# dump the bidi direction table
#
# Writes a C source file defining "bidi_direction_table" to "$filename.new",
# then hands the file to save_file(). Characters without a direction get
# the "L" type.
sub dump_bidi_dir_table($)
{
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # the file name is passed as an argument so that it is never
    # interpreted as a printf format
    printf "Building %s\n", $filename;
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < @direction_table; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_three_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
# rotate an 8-bit value left by $count bits
sub rol($$)
{
    my ($byte, $count) = @_;
    my $high = $byte << $count;
    my $low = $byte >> (8 - $count);
    return ($high | $low) & 0xff;
}
################################################################
# compress the character properties table
#
# Arguments: number of rows, table values. Returns one row id per row,
# followed by the contents of every newly created row.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my $last_id = 0;
    my @result = (0) x $rows;
    my %row_ids;

    # add some predefined sequences
    foreach my $i (0, 0xfb .. 0xff) { $row_ids{pack "L*", (rol($i,5)) x $len} = $i; }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my @cells = map { defined $_ ? $_ : 0x7f } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "L*", @cells;

        unless (defined($row_ids{$rowtxt}))
        {
            # create a new row
            $row_ids{$rowtxt} = ++$last_id;
            push @result, @cells;
        }
        $result[$row] = $row_ids{$rowtxt};
    }
    return @result;
}
3451 ################################################################
3452 # dump a normalization table in binary format
#
# The table is written to "$filename.new" through the OUTPUT handle, passed
# to save_file(), and finally registered under the "Normalization" registry
# key. The normalization form is derived from the file name.
3453 sub dump_norm_table($)
3455 my $filename = shift;
# numeric id of each supported form, and the decomposition table it uses
3457 my %forms = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
3458 my %decomp = ( "nfc" => \@decomp_table,
3459 "nfd" => \@decomp_table,
3460 "nfkc" => \@decomp_compat_table,
3461 "nfkd" => \@decomp_compat_table ,
3462 "idna" => \@idna_decomp_table );
3464 open OUTPUT,">$filename.new" or die "Cannot create $filename";
3465 print "Building $filename\n";
# the form name is the part of the file name between "norm" and ".nls"
3467 my $type = $filename;
3468 $type =~ s!.*/norm(\w+)\.nls!$1!;
# $compose is the low bit of the form id; $compat is 1 when bit 2 of the
# form id is set, plus 1 more for "idna"
3470 my $compose = $forms{$type} & 1;
3471 my $compat = !!($forms{$type} & 4) + ($type eq "idna");
3473 my @version = split /\./, $UNIVERSION;
3475 # combining classes
# $classes[value] becomes the index of that value in @class_values; only
# class values below 0x100 that occur in the table are kept
3477 my @classes;
3478 my @class_values;
3480 foreach my $c (grep defined, @combining_class_table)
3482 $classes[$c] = 1 if $c < 0x100;
3484 for (my $i = 0; $i < @classes; $i++)
3486 next unless defined $classes[$i];
3487 $classes[$i] = @class_values;
3488 push @class_values, $i;
# pad to an even number of entries
3490 push @class_values, 0 if (@class_values % 2);
3491 die "too many classes" if @class_values >= 0x40;
3493 # character properties
3495 my @char_props;
3496 my @decomposed;
3497 my @comp_hash_table;
3498 my $comp_hash_size = $compose ? 254 : 0;
# characters without a combining class table entry are skipped
3500 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3502 next unless defined $combining_class_table[$i];
3503 if (defined $decomp{$type}->[$i])
3505 my @dec = get_decomposition( $i, $decomp{$type} );
# composable characters are hashed on their two components
3506 if ($compose && (my @comp = get_composition( $i, $compat )))
3508 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
3509 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
# use the first non-zero combining class found in the decomposition
3511 my $val = 0;
3512 foreach my $d (@dec)
3514 $val = $combining_class_table[$d];
3515 last if $val;
3517 $char_props[$i] = $classes[$val];
3519 else
3521 $char_props[$i] = 0xbf;
3523 @dec = compose_hangul( @dec ) if $compose;
3524 @dec = to_utf16( @dec );
# sequences of 7 or more units get a terminating 0
3525 push @dec, 0 if @dec >= 7;
3526 $decomposed[$i] = \@dec;
3528 else
3530 if ($combining_class_table[$i] == 0x100)
3532 $char_props[$i] = 0x7f;
3534 elsif ($combining_class_table[$i])
3536 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
3538 elsif ($type eq "idna" && defined $idna_disallowed[$i])
3540 $char_props[$i] = 0xff;
3542 else
3544 $char_props[$i] = 0;
# flag both components of every composition; the flags used depend on
# whether the second component has a non-zero combining class
3549 if ($compose)
3551 for (my $i = 0; $i <= $MAX_CHAR; $i++)
3553 my @comp = get_composition( $i, $compat );
3554 next unless @comp;
3555 if ($combining_class_table[$comp[1]])
3557 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
3558 $char_props[$comp[1]] |= 0x40;
3560 else
3562 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
3563 $char_props[$comp[1]] |= 0xc0;
3568 # surrogates
3569 foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
3570 foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
3572 # Hangul
3573 if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
3574 elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
3575 foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
3577 # invalid chars
3578 if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
3579 foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
# the last two code points of each of the 17 planes
3580 foreach my $i (0x00..0x10)
3582 $char_props[($i << 16) | 0xfffe] = 0xff;
3583 $char_props[($i << 16) | 0xffff] = 0xff;
3586 # decomposition hash table
3588 my @decomp_hash_table;
3589 my @decomp_hash_index;
3590 my @decomp_hash_data;
3591 my $decomp_hash_size = 944;
3593 # build string of character data, reusing substrings when possible
# longest sequences are added first
3594 my $decomp_char_data = "";
3595 foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
3597 my $str = pack "U*", @{$i};
3598 $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
# each entry is [ character, (length << 13) | position ], with the length
# capped at 7, stored in bucket (character % hash size)
3600 for (my $i = 0; $i < @decomposed; $i++)
3602 next unless defined $decomposed[$i];
3603 my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
3604 die "sequence not found" if $pos == -1;
3605 my $len = @{$decomposed[$i]};
3606 $len = 7 if $len > 7;
3607 my $hash = $i % $decomp_hash_size;
3608 push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
# a bucket holding a single entry whose property is 0xbf stores the packed
# length/position directly in the index instead of an offset into the data
3610 for (my $i = 0; $i < $decomp_hash_size; $i++)
3612 $decomp_hash_index[$i] = @decomp_hash_data / 2;
3613 next unless defined $decomp_hash_table[$i];
3614 if (@{$decomp_hash_table[$i]} == 1)
3616 my $entry = $decomp_hash_table[$i]->[0];
3617 if ($char_props[$entry->[0]] == 0xbf)
3619 $decomp_hash_index[$i] = $entry->[1];
3620 next;
3623 foreach my $entry (@{$decomp_hash_table[$i]})
3625 push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
3628 push @decomp_hash_data, 0, 0;
3630 # composition hash table
# the index has one extra entry pointing past the last bucket
3632 my @comp_hash_index;
3633 my @comp_hash_data;
3634 if (@comp_hash_table)
3636 for (my $i = 0; $i < $comp_hash_size; $i++)
3638 $comp_hash_index[$i] = @comp_hash_data;
3639 push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
3641 $comp_hash_index[$comp_hash_size] = @comp_hash_data;
3642 push @comp_hash_data, 0, 0, 0;
# properties are compressed in rows of 128 characters
3645 my $level1 = ($MAX_CHAR + 1) / 128;
3646 my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
3648 my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
3649 0, $decomp_hash_size, $comp_hash_size, 0 );
3650 my @tables = (0) x 8;
# cumulative start offset of each table; the sizes of the byte-packed tables
# are halved, which suggests the offsets count 16-bit words (TODO confirm)
3652 $tables[0] = 16 + @header + @tables;
3653 $tables[1] = $tables[0] + @class_values / 2;
3654 $tables[2] = $tables[1] + $level1 / 2;
3655 $tables[3] = $tables[2] + (@rows - $level1) / 2;
3656 $tables[4] = $tables[3] + @decomp_hash_index;
3657 $tables[5] = $tables[4] + @decomp_hash_data;
3658 $tables[6] = $tables[5] + length $decomp_char_data;
3659 $tables[7] = $tables[6] + @comp_hash_index;
# the file starts with a 16-unit name field, followed by the header, the
# table offsets and the tables themselves
3661 print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
3662 print OUTPUT pack "S<*", @header;
3663 print OUTPUT pack "S<*", @tables;
3664 print OUTPUT pack "C*", @class_values;
3666 print OUTPUT pack "C*", @rows[0..$level1-1];
3667 print OUTPUT pack "C*", @rows[$level1..$#rows];
3668 print OUTPUT pack "S<*", @decomp_hash_index;
3669 print OUTPUT pack "S<*", @decomp_hash_data;
3670 print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
3671 print OUTPUT pack "S<*", @comp_hash_index;
3672 print OUTPUT pack "S<*", @comp_hash_data;
3674 close OUTPUT;
3675 save_file($filename);
3677 add_registry_string_value( $nlskey, "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
################################################################
# output a codepage definition file from the global tables
#
# Writes nls/c_NNN.nls for the given codepage number (a DBCS table when lead
# bytes are defined, an SBCS table otherwise) and registers the file under
# the "Codepage" registry key.
sub output_codepage_file($)
{
    my ($codepage) = @_;

    my $basename = sprintf( "c_%03d.nls", $codepage );
    my $output = "nls/" . $basename;
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s\n", $output;
    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    add_registry_string_value( $nlskey, "Codepage", sprintf( "%d", $codepage ), $basename );
}
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Resets the global codepage tables, fills them from the given file of the
# "codepages" data set, then writes the codepage file. Dies on any line
# that is not recognized.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";
    my ($codepage, $count);
    my ($lb_cur, $lb_end);

    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( "codepages", $filename );

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # the first CPINFO field is matched but not needed here
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # in every table the first definition of an entry wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # after the last entry of a table move on to the next lead
                # byte, or expect a new range once the current one is done
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
################################################################
# align a string length
#
# Returns $str padded with zero bytes up to a multiple of $align bytes.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $extra = length($str) % $align;
    return $str unless $extra;
    return $str . ("\0" x ($align - $extra));
}
################################################################
# pad a string with zeros
#
# Returns $str extended with zero bytes to at least $pad bytes; longer
# strings are returned unchanged.
sub pad_string($$)
{
    my ($pad, $str) = @_;
    my $missing = $pad - length($str);
    return $str if $missing <= 0;
    return $str . ("\0" x $missing);
}
################################################################
# pack a GUID string
#
# Takes a GUID in 8-4-4-4-12 hex digit form and returns its 16-byte binary
# form: one little-endian 32-bit value, two little-endian 16-bit values and
# eight bytes. Dies if the string does not contain a GUID.
sub pack_guid($)
{
    # use a lexical so that the caller's $_ is left untouched
    my $guid = shift;
    my $h2 = '([0-9A-Fa-f]{2})';
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-$h2$h2-$h2$h2$h2$h2$h2$h2/
        or die "invalid GUID '" . (defined $guid ? $guid : "") . "'\n";
    return pack "L<S<2C8", map { hex $_ } @fields;
}
################################################################
# comparison function for compression sort
#
# Orders rows by number of elements first, then by elements 4 to 12 in turn.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    for (my $i = 4; !$order && $i <= 12; $i++)
    {
        $order = $a->[$i] <=> $b->[$i];
    }
    return $order;
}
################################################################
# build a binary sort keys table
#
# Parses the "sorting" data file, then writes to "$filename.new" a 16-byte
# header followed by the sortkey table (including exception tables), the
# case mapping tables, the char type table and the sort tables. The file
# is then passed to save_file(). Locale to sort GUID mappings are added
# under the "Sorting\Ids" registry key.
#
# Returns the packed char type table.
sub dump_sortkey_table($)
{
    my $filename = shift;
    my @keys;
    my ($part, $section, $subsection, $guid, $version, $ling_flag);
    my @multiple_weights;
    my @expansions;
    my @compressions;
    my %exceptions;
    my %guids;
    my %compr_flags;
    my %locales;
    my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
    my $jamostr = "";

    my $re_hex = '0x[0-9A-Fa-f]+';
    my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
    $guids{$default_guid} = { };

    my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );

    my $KEYS = open_data_file( "sorting" );

    # the file name is passed as an argument so that it is never
    # interpreted as a printf format
    printf "Building %s\n", $filename;

    while (<$KEYS>)
    {
        s/\s*;.*$//;
        next if /^\s*$/;  # skip empty lines
        if (/^\s*(SORTKEY|SORTTABLES)/)
        {
            $part = $1;
            next;
        }
        if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
        {
            $part = $section = "";
            next;
        }
        if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
        {
            $section = $1;
            $guid = undef;
            next;
        }
        next unless $part;
        if ("$part.$section" eq "SORTKEY.DEFAULT")
        {
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $keys[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.RELEASE")
        {
            if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                $version = hex $1;
                next;
            }
            if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
            {
                # ignore for now
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
               "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
               "$part.$section" eq "SORTTABLES.INVERSECASING")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $guids{$guid}->{flags} |= $flags{$section};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
        {
            if (/^\s*(\d+)\s+(\d+)/)
            {
                push @multiple_weights, $1, $2;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                # the key of an expanded char holds the index of its expansion
                my $pos = scalar @expansions / 2;
                $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
                push @expansions, hex $2, hex $3;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
        {
            if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
            {
                $keys[hex $1] = $keys[hex $2];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
            {
                if ($subsection || !$guid)  # start a new one
                {
                    $guid = lc $1;
                    $subsection = "";
                    $guids{$guid} = { } unless defined $guids{$guid};
                    $guids{$guid}->{flags} |= $flags{$2} if $2;
                    $guids{$guid}->{compr} = @compressions;
                    $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
                    $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
                    push @compressions, [ ];
                }
                else  # merge with current one
                {
                    $guids{lc $1} = { } unless defined $guids{lc $1};
                    $guids{lc $1}->{flags} |= $flags{$2} if $2;
                    $guids{lc $1}->{compr} = $guids{$guid}->{compr};
                    $compr_flags{lc $1} = $compr_flags{$guid};
                }
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
            {
                $subsection = $1;
                next;
            }
            if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
            {
                # a row holds the four key values followed by the characters
                my @comp = map { hex $_; } split(/\s+/,$1);
                push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
                # add compression flags
                $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
        {
            if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
            {
                $guid = lc $1;
                $guids{$guid} = { } unless defined $guids{$guid};
                $ling_flag = ($2 ? "+" : "-");
                $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
                next;
            }
            if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
            {
                $locales{$1} = $guid;
                next;
            }
            if (/^\s*($re_hex)\s+$re_key/)
            {
                $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
                next;
            }
        }
        elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
        {
            if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
            {
                $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
                next;
            }
        }
        die "$current_data_file: $part.$section: unrecognized line $_\n";
    }
    close $KEYS;

    # Sortkey table

    my $table = "";
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
        $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
    }

    # exception tables: a 256-entry index followed by a full 256-key row for
    # every row that contains an exception or a compression flag
    foreach my $id (sort keys %exceptions)
    {
        my $pos = length($table) / 4;
        my @exc = @{$exceptions{$id}};
        my @filled;
        my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
        my $guid = substr( $id, 0, -1 );
        $guids{$guid}->{$key} = $pos;
        $pos += 0x100;
        # "my ... if ..." has undefined behaviour, always run the declaration
        my @flags = defined $compr_flags{$guid} ? @{$compr_flags{$guid}} : ();
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless defined $exc[$j] || defined $flags[$j];
            $filled[$j >> 8] = 1;
            $j |= 0xff;  # skip to the next row
        }
        for (my $j = 0; $j < 0x100; $j++)
        {
            $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
            $pos += 0x100 if $filled[$j];
        }
        for (my $j = 0; $j < 0x10000; $j++)
        {
            next unless $filled[$j >> 8];
            my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
            $k[3] |= $flags[$j] || 0;
            $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
        }
    }

    # Case mapping tables

    # standard table
    my @casemaps;
    my @upper = @toupper_table;
    my @lower = @tolower_table;
    remove_linguistic_mappings( \@upper, \@lower );
    $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );

    # linguistic table
    $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );

    # Turkish table
    @upper = @toupper_table;
    @lower = @tolower_table;
    $upper[ord 'i'] = 0x130;  # LATIN CAPITAL LETTER I WITH DOT ABOVE
    $lower[ord 'I'] = 0x131;  # LATIN SMALL LETTER DOTLESS I
    $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
    my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );

    # Char type table

    # every distinct 6-byte type record is stored once in $types, and
    # @table maps each character to the index of its record
    my @table;
    my $types = "";
    my %typestr;
    for (my $i = 0; $i < 0x10000; $i++)
    {
        my $str = pack "S<3",
                       ($category_table[$i] || 0) & 0xffff,
                       defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
                       ($category_table[$i] || 0) >> 16;

        if (!defined($typestr{$str}))
        {
            $typestr{$str} = length($types) / 6;
            $types .= $str;
        }
        $table[$i] = $typestr{$str};
    }

    my (@rows, @array, @data, @row_data);
    (@rows[0..4095], @data) = compress_array( 4096, 0, @table[0..65535] );
    (@array[0..255], @row_data) = compress_array( 256, 0, @rows );
    for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
    for (my $i = 0; $i < @row_data; $i++) { $row_data[$i] += 2 * @row_data + 512 - 4096; }

    my $arraystr = pack("S<*", @array, @row_data) . pack("C*", @data);
    my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
    $chartypes = align_string( 8, $chartypes . $types . $arraystr );

    # Sort tables

    # guids
    # "scalar %hash" only returns the number of keys since Perl 5.26,
    # "scalar keys" gives the count on every version
    my $sorttables = pack "L<2", $version, scalar keys %guids;
    foreach my $id (sort keys %guids)
    {
        my %guid = %{$guids{$id}};
        my $flags = $guid{flags} || 0;
        my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
        $sorttables .= pack_guid($id) . pack "L<5",
                                             $flags,
                                             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
                                             $guid{except} || 0,
                                             $guid{ling_except} || 0,
                                             $map / 2;
    }

    # expansions
    $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;

    # compressions
    $sorttables .= pack "L<", scalar @compressions;
    my $rowstr = "";
    foreach my $c (@compressions)
    {
        my $pos = length($rowstr) / 2;
        my $min = 0xffff;
        my $max = 0;
        my @lengths = (0) x 8;
        foreach my $r (sort cmp_compression @{$c})
        {
            my @row = @{$r};
            $lengths[scalar @row - 6]++;
            foreach my $val (@row[4..$#row])
            {
                $min = $val if $min > $val;
                $max = $val if $max < $val;
            }
            $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
            $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
        }
        $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
    }
    $sorttables .= $rowstr;

    # multiple weights
    $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );

    # jamo sort
    $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;

    # Locales

    add_registry_key( $nlskey, "Sorting\\Ids", "{$default_guid}" );
    foreach my $loc (sort keys %locales)
    {
        # skip specific locales that match more general ones
        my @parts = split /[-_]/, $loc;
        next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
        next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
        add_registry_string_value( $nlskey, "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
    }

    # File header

    # byte offsets of the four sections
    my @header;
    $header[0] = 16;
    $header[1] = $header[0] + length $table;
    $header[2] = $header[1] + length $casemaps;
    $header[3] = $header[2] + length $chartypes;

    open OUTPUT, ">$filename.new" or die "Cannot create $filename";
    print OUTPUT pack "L<*", @header;
    print OUTPUT $table, $casemaps, $chartypes, $sorttables;
    close OUTPUT;
    save_file($filename);
    return $chartypes;
}
my %lcnames;

# return the parent of a locale name
#
# The "sparent" entry of the locale is used if defined, then its "parent"
# entry, then the name with its last "-xxx" component removed, and finally
# the empty string. Returns undef for an empty name.
sub locale_parent($)
{
    my ($loc) = @_;

    return undef unless $loc;

    my $info = $lcnames{$loc};
    if (defined $info)
    {
        foreach my $field (qw(sparent parent))
        {
            return $info->{$field} if defined $info->{$field};
        }
    }
    return ($loc =~ /(.*)-[0-9A-Za-z]+/) ? $1 : "";
}
# sort comparison function for locale names, ignoring the case of ASCII
# letters and treating "_" as "-"
sub compare_locales
{
    my ($n1, $n2) = ($a, $b);
    tr/A-Z_/a-z-/ foreach ($n1, $n2);
    return $n1 cmp $n2;
}
# query an xml key
#
# Returns the text content of the first node matching $query, or undef if
# there is no match. A warning is printed when several nodes match.
sub xml_query($$)
{
    my ($xml, $query) = @_;
    my $nodes = $xml->find( $query );
    if ($nodes)
    {
        printf STDERR "multiple entries for %s\n", $query if (@{$nodes} > 1);
        return $nodes->[0]->textContent;
    }
    return undef;
}
# query an xml key for a locale, with fallback to the parents
#
# Walks from the locale up through its parents and returns the text of the
# first match that is not the "↑↑↑" marker, or undef if there is none.
sub loc_query($$)
{
    my ($loc, $query) = @_;

    # fallback to "en-US" for root locale
    if (!$loc->{name}) { $loc = $lcnames{"en-US"}; }

    for (my $cur = $loc->{name}; defined $cur; $cur = locale_parent( $cur ))
    {
        my $entry = $lcnames{$cur};
        next unless defined $entry;

        my $nodes = $entry->{xml}->find( $query );
        next unless $nodes;
        printf STDERR "%s: multiple entries for %s\n", $cur, $query if (@{$nodes} > 1);

        my $text = $nodes->[0]->textContent;
        return $text unless $text eq "\x{2191}\x{2191}\x{2191}";  # "↑↑↑"
    }
    return undef;
}
# retrieve a locale field entry by going up the parents tree
#
# Arguments: locale hash, field name, default value. The locale itself is
# checked first, then "en-US" for the root locale, then the aliased locales,
# then the ancestors found through "sparent" entries or by removing the last
# "-xxx" component of the name. Returns $def when nothing is found.
sub locale_entry($$$)
{
    my ($loc, $field, $def) = @_;

    return $loc->{$field} if defined $loc->{$field};

    if (!$loc->{name})  # fallback to "en-US" for root locale
    {
        $loc = $lcnames{"en-US"};
        return $loc->{$field} if defined $loc->{$field};
    }
    while (defined $loc->{alias})  # resolve aliases
    {
        $loc = $lcnames{$loc->{alias}};
        return $loc->{$field} if defined $loc->{$field};
    }

    for (my $cur = $loc->{name}; $cur; )
    {
        # move up to the parent
        my $entry = $lcnames{$cur};
        if (defined $entry && defined $entry->{sparent}) { $cur = $entry->{sparent}; }
        elsif ($cur =~ /(.*)-[0-9A-Za-z]+/) { $cur = $1; }
        else { last; }

        my $ancestor = $lcnames{$cur};
        return $ancestor->{$field} if defined $ancestor && defined $ancestor->{$field};
    }
    return $def;
}
my $string_data;

# add a packed string to the global string data, reusing an existing copy
#
# Returns the offset of the data in 16-bit units. Only copies found at an
# even byte offset are reused: a match at an odd offset straddles two 16-bit
# units and cannot be expressed as a 16-bit unit offset.
sub add_str_data($)
{
    my $txt = shift;
    $string_data = "" unless defined $string_data;

    my $ret = index( $string_data, $txt );
    # skip misaligned matches
    $ret = index( $string_data, $txt, $ret + 1 ) while ($ret != -1 && $ret % 2);
    if ($ret == -1)
    {
        $ret = length($string_data);
        $string_data .= $txt;
    }
    return $ret / 2;
}
# add a string to the string data as a 16-bit length, the UTF-16LE text and
# a 16-bit zero terminator; returns its offset, or 0 for an undefined or
# empty string
sub add_string($)
{
    my ($str) = @_;
    if (!defined($str) || $str eq "") { return 0; }

    my $utf = encode( "UTF16LE", $str );
    my $units = length($utf) / 2;
    return add_str_data( pack( "S<", $units ) . $utf . pack( "S", 0 ) );
}
# add a list of 32-bit values to the string data, preceded by a 16-bit
# count of twice the number of values; returns its offset
sub add_fontsig(@)
{
    my @values = @_;
    my $count = scalar(@values) * 2;
    return add_str_data( pack( "S<L<*", $count, @values ) );
}
# add an array of strings to the string data as a 16-bit count followed by
# the 32-bit offset of every string; returns its offset, or 0 for an empty
# list
sub add_strarray(@)
{
    return 0 unless @_;
    # the strings are added first so that their offsets are known
    my @offsets = map { add_string($_) } @_;
    return add_str_data( pack( "S<L<*", scalar(@offsets), @offsets ) );
}
# convert a number format pattern to a grouping string holding one char per
# group size, starting with the group closest to the decimal separator;
# unrecognized patterns yield a single group of 3
sub format_to_grouping($)
{
    my ($format) = @_;

    if (my ($secondary, $primary) = $format =~ /#,(#+),(#+0)/)
    {
        return chr(length($primary)) . chr(length($secondary));
    }
    if (my ($primary) = $format =~ /#,(#+0)/)
    {
        return chr(length($primary));
    }
    # printf STDERR "unknown format %s\n", $format;
    return chr(3);
}
# convert a "positive;negative" currency pattern to the indexes of the
# first matching entries of the pattern lists below
#
# Returns (positive index, negative index). An unknown pattern gives index
# 0; a missing negative pattern gives an index chosen from the positive one.
sub parse_currency_format($$)
{
    my ($name, $formats) = @_;
    my ($posfmt, $negfmt) = split /;/, $formats;
    my @pospatterns = ( "\xa4[^\xa0]*#",  # $1.1
                        "00[^\xa0]*\xa4",  # 1.1$
                        "\xa4.*\xa0.*#",  # $ 1.1
                        "00.*\xa0.*\xa4" );  # 1.1 $
    my @negpatterns = ( "\\(\xa4[^\xa0]*#",  # ($1.1)
                        "-\xa4[^\xa0]*#",  # -$1.1
                        "\xa4[^\xa0]*-#",  # $-1.1
                        "\xa4[^\xa0]*#.*00-",  # $1.1-
                        "00[^\xa0]*\xa4\\)",  # (1.1$)
                        "-#.*00[^\xa0]*\xa4",  # -1.1$
                        "00-[^\xa0]*\xa4",  # 1.1-$
                        "00[^\xa0]*\xa4-",  # 1.1$-
                        "-#.*00.*\xa0.*\xa4",  # -1.1 $
                        "-\xa4.*\xa0.*#",  # -$ 1.1
                        "00.*\xa0.*\xa4-",  # 1.1 $-
                        "\xa4.*\xa0.*#.*00-",  # $ 1.1-
                        "\xa4.*\xa0.*-#",  # $ -1.1
                        "00-.*\xa0.*\xa4",  # 1.1- $
                        "\\(\xa4.*\xa0.*#",  # ($ 1.1)
                        "00.*\xa0.*\xa4\\)");  # (1.1 $)

    my $pos = 0;
    $pos++ while ($pos < @pospatterns && $posfmt !~ /$pospatterns[$pos]/);
    #printf STDERR "$name: unknown format '%s'\n", $posfmt if ($pos == @pospatterns);
    $pos = 0 if ($pos == @pospatterns);

    my $neg;
    if (defined $negfmt)
    {
        $neg = 0;
        $neg++ while ($neg < @negpatterns && $negfmt !~ /$negpatterns[$neg]/);
        #printf STDERR "$name: unknown format '%s'\n", $negfmt if ($neg == @negpatterns);
        $neg = 0 if ($neg == @negpatterns);
    }
    else
    {
        # negative index paired with each of the four positive indexes
        $neg = (1, 5, 9, 8)[$pos];
    }

    return ($pos, $neg);
}
# convert a percent pattern to the index of the first matching entry of the
# pattern list below
#
# Returns two values: the index, and the same index again except that 3 is
# replaced by 7. A warning is printed when no pattern matches, in which
# case the index is the number of patterns.
sub parse_percent_format($)
{
    my ($fmt) = @_;
    my @patterns = ( "0.+%",  # 1 %
                     "0%",  # 1%
                     "%#",  # %1
                     "%.+#" );  # % 1

    my $pos = 0;
    $pos++ while ($pos < @patterns && $fmt !~ /$patterns[$pos]/);
    printf STDERR "unknown format '%s'\n", $fmt if ($pos == @patterns);

    my $neg = ($pos == 3) ? 7 : $pos;
    return ($pos, $neg);
}
# rewrite a date pattern: era, stand-alone month and weekday letters are
# replaced, and an isolated "y" is expanded to "yyyy"; every substitution
# is applied at most once
sub convert_date_format($)
{
    my ($fmt) = @_;
    for ($fmt)
    {
        s/G+/gg/;
        s/LLLL/MMMM/;
        s/LLL/MMM/;
        s/E+/dddd/;
        s/ccc+/dddd/;
        # a single "y" in the middle, at the start, or at the end
        s/([^gy])y([^y])/$1yyyy$2/;
        s/^y([^y])/yyyy$1/;
        s/([^gy])y$/$1yyyy/;
    }
    return $fmt;
}
# rewrite a time pattern: the first run of "a" and the first run of "B" each
# become "tt", and the first U+202F becomes a plain space
sub convert_time_format($)
{
    my ($fmt) = @_;
    for ($fmt)
    {
        s/a+/tt/;
        s/B+/tt/;
        s/\x{202f}/ /;
    }
    return $fmt;
}
# load the ISO 639-3 code table
#
# Returns a hash mapping the two-letter code found in the fourth column to
# the three-letter code found in the third column; lines that do not have
# both are ignored.
sub load_iso639()
{
    my %iso639;
    my $DATA = open_data_file( "iso639", "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3_$ISO639VERSION.tab" );
    while (<$DATA>)
    {
        my ($code3, $code2) = /^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/ or next;
        $iso639{$code2} = $code3;
    }
    close $DATA;
    return %iso639;
}
4429 ################################################################
4430 # build the locale table for locale.nls
4431 sub build_locale_data()
4433 my $base = "cldr-release-$CLDRVERSION";
4434 my $suppl = load_xml_data_file( "cldr", "$base/common/supplemental/supplementalData.xml" );
4435 my $subtags = load_xml_data_file( "cldr", "$base/common/supplemental/likelySubtags.xml" );
4436 my $numbers = load_xml_data_file( "cldr", "$base/common/supplemental/numberingSystems.xml" );
4437 # obsolete phone data from CLDR version 33
4438 my $phone = load_xml_data_file( "cldr33", "common/supplemental/telephoneCodeData.xml" );
4439 my %iso639 = load_iso639();
4440 $string_data = pack "S2", 0, 0; # offset 0 == empty string
4442 %lcnames = map { $_->{name} => $_ } @locales;
4444 my %lcids;
4445 foreach my $loc (@locales) { $lcids{$loc->{lcid}} = $loc if defined $loc->{lcid}; }
4447 my %days = ( "sun" => 0, "mon" => 1, "tue" => 2, "wed" => 3, "thu" => 4, "fri" => 5, "sat" => 6 );
4449 # assign locale parents
4451 foreach my $loc (@locales)
4453 next if $loc->{name} eq "";
4454 next if defined $loc->{parent};
4455 (my $unix_name = $loc->{name}) =~ s/-/_/g;
4456 my $parent = xml_query( $suppl, "/supplementalData/parentLocales[not(\@component)]/parentLocale[contains(concat(' ',\@locales,' '),' $unix_name ')]/\@parent" );
4457 if ($parent)
4459 $parent =~ s/_/-/g;
4460 $parent = "" if $parent eq "root";
4462 elsif ($loc->{name} =~ /(.*)-[0-9A-Za-z]+/) { $parent = $1; }
4463 $loc->{parent} = $parent || "";
4466 # load per-locale XML files
4468 foreach my $loc (@locales)
4470 next if defined $loc->{alias};
4471 (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
4472 $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
4473 my $xml = load_xml_data_file( "cldr", $file );
4474 $loc->{xml} = $xml;
4475 $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
4476 $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
4477 $loc->{script} = xml_query( $xml, "/ldml/identity/script/\@type" );
4478 if (!defined($loc->{territory}) && $loc->{name} =~ /-([A-Z]{2}|[0-9]{3})$/) { $loc->{territory} = $1; }
4479 if (!defined($loc->{script}) && $loc->{name} =~ /-([A-Z][a-z]{3})(-[A-Z]{2})?$/) { $loc->{script} = $1; }
4482 # assign a default territory and sort locale
4484 foreach my $loc (@locales)
4486 next if defined $loc->{alias};
4487 next if defined $loc->{territory};
4488 my $id = $loc->{sortlocale};
4489 if (defined $id && ($id =~ /[-_]([A-Z0-9]+)$/))
4491 $loc->{territory} = $1;
4492 next;
4494 my @children = grep /^$loc->{name}-[A-Z0-9]+$/ && !defined $lcnames{$_}->{alias}, keys %lcnames;
4495 if (@children == 1)
4497 $id = $children[0];
4499 else
4501 my $name = $loc->{file} || $loc->{name};
4502 $name =~ s/-(Arab|Beng|Cyrl|Deva|Guru|Hans|Hant|Latn|Tfng|Vaii)$//;
4503 $name =~ s/-/_/g;
4504 $id = xml_query( $subtags, "/supplementalData/likelySubtags/likelySubtag[\@from='$name']/\@to" );
4505 $id =~ s/_/-/g if $id;
4507 if ($id =~ /[-_]([A-Z0-9]+)$/)
4509 $loc->{territory} = $1;
4510 next if defined $loc->{sortlocale};
4511 next unless $id =~ /^$loc->{name}/;
4512 while (defined $lcnames{$id} && defined $lcnames{$id}->{alias}) { $id = $lcnames{$id}->{alias}; }
4513 $loc->{sortlocale} = $id if defined $lcnames{$id};
4514 next;
4516 print STDERR "no territory found for $loc->{name}\n";
4519 # fill geoid table
4521 my %geotable;
4522 foreach my $geo (@geoids)
4524 my $name = $geo->{name};
4525 next unless defined $name;
4526 $geo->{alias} = $geotable{$name} if defined $geotable{$name};
4527 $geotable{$name} ||= $geo;
4529 foreach my $loc (@locales)
4531 next if defined $loc->{alias};
4532 my $territory = $loc->{territory};
4533 $geotable{$territory} ||= { name => $territory };
4535 foreach my $name (keys %geotable)
4537 my $geo = $geotable{$name};
4538 $geo->{dialcode} = xml_query( $phone, "(/supplementalData/telephoneCodeData/codesByTerritory[\@territory='$name']/telephoneCountryCode)[1]/\@code" );
4539 if ($name =~ /\d+/)
4541 $geo->{uncode} = $name;
4542 next;
4544 $geo->{iso2} = $name;
4545 $geo->{iso3} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@alpha3");
4546 $geo->{uncode} = xml_query( $suppl, "/supplementalData/codeMappings/territoryCodes[\@type='$name']/\@numeric");
4547 $geo->{sintlsymbol} ||= xml_query( $suppl, "(/supplementalData/currencyData/region[\@iso3166='$name']/currency[not(\@to)])[1]/\@iso4217") || "XXX";
4548 $geo->{sintlsymbol} =~ s/XXX/XDR/;
4550 foreach my $geo (@geoids)
4552 $geo->{parentid} = $geotable{$geo->{parent}}->{id} if defined $geo->{parent};
4553 next if defined $geo->{iso2};
4554 next if defined $geo->{alias};
4555 next unless defined $geo->{uncode};
4556 my @contains;
4557 my $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and not(\@status)]/\@contains");
4558 push @contains, split /\s+/, $list if defined $list;
4559 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$geo->{uncode}' and \@status='deprecated']/\@contains");
4560 push @contains, split /\s+/, $list if defined $list;
4561 while (@contains)
4563 my $territory = pop @contains;
4564 if (defined $geotable{$territory})
4566 $geotable{$territory}->{parentid} ||= $geo->{id};
4568 elsif ($territory =~ /\d+/)
4570 # expand region recursively
4571 $list = xml_query( $suppl, "/supplementalData/territoryContainment/group[\@type='$territory' and not(\@status)]/\@contains" );
4572 push @contains, split /\s+/, $list if defined $list;
4577 # assign calendars to their locale
4579 foreach my $cal (@calendars)
4581 next unless defined $cal->{locale};
4582 my $loc = $lcnames{$cal->{locale}};
4583 $loc->{calendar} = [ ] unless defined $loc->{calendar};
4584 push @{$loc->{calendar}}, $cal;
4587 # assign default lcid to aliases
4589 foreach my $loc (@locales)
4591 next unless defined $loc->{alias};
4592 next if defined $loc->{lcid};
4593 my $alias = $loc->{alias};
4594 my $lcid = $lcnames{$alias}->{lcid} || 0x1000;
4595 $loc->{lcid} = $lcid | 0x80000000;
4598 # assign sort aliases to parent locale
4600 foreach my $loc (@locales)
4602 next unless $loc->{name} =~ /_/;
4603 next unless defined $loc->{alias};
4604 my $alias = $loc->{alias};
4605 my $parent = $lcnames{$alias};
4606 my $basename = $parent->{name};
4607 while (1)
4609 @{$parent->{sortnames}}[($loc->{lcid} >> 16) - 1] = $loc->{name};
4610 $alias = locale_parent( $alias );
4611 last unless $alias && defined $lcnames{$alias};
4612 $parent = $lcnames{$alias};
4613 last if defined $parent->{sortbase} && $parent->{sortbase} ne $basename;
4614 $parent->{sortbase} = $basename;
4618 # assign an array index to all locales
4620 my $idx = 0;
4621 foreach my $loc (@locales)
4623 next if defined $loc->{alias};
4624 $loc->{idx} = $idx++;
4626 foreach my $loc (@locales)
4628 my $alias = $loc->{alias};
4629 next unless defined $alias;
4630 while (defined $lcnames{$alias}->{alias}) { $alias = $lcnames{$alias}->{alias}; }
4631 $loc->{idx} = $lcnames{$alias}->{idx};
4634 # output lcids table
4636 my $lcid_data = "";
4637 foreach my $id (sort { $a <=> $b } keys %lcids)
4639 my $loc = $lcids{$id};
4640 $lcid_data .= pack "L<S<2", $id, $loc->{idx}, add_string($loc->{name});
4643 # output lcnames table
4645 my $lcname_data = "";
4646 foreach my $name (sort compare_locales keys %lcnames)
4648 my $loc = $lcnames{$name};
4649 $lcname_data .= pack "S<2L<", add_string($name), $loc->{idx}, $loc->{lcid} || 0x1000;
4652 # output locales array
4654 my $locale_data = "";
4655 my $default_lcid = 0x8001;
4656 foreach my $loc (@locales)
4658 next if defined $loc->{alias};
4659 my $sname = $loc->{name};
4660 my $language = $loc->{language};
4661 my $territory = $loc->{territory};
4662 my $script = $loc->{script};
4663 my $neutral = ($sname && $sname !~ /-$territory/);
4664 my $sparent = $loc->{sparent} || (($sname =~ /(.*)-[0-9A-Za-z]+/) ? $1 : $loc->{parent});
4665 my $unique_lcid = $loc->{lcid};
4666 unless (defined $unique_lcid) { $unique_lcid = $default_lcid++; }
4667 my $geo = $geotable{$territory};
4668 my $territory_match = "contains(concat(' ',normalize-space(\@territories),' '),' $territory ')";
4670 # languages and scripts
4672 my $ssortlocale = $loc->{sortlocale} || ($neutral ? "$sname-$territory" : $sname);
4673 my $idefaultlanguage = defined $lcnames{$ssortlocale} ? $lcnames{$ssortlocale}->{lcid} : undef;
4674 $idefaultlanguage = $lcnames{"en-US"}->{lcid} unless $ssortlocale;
4675 (my $siso639langname = $sname) =~ s/-.*$//;
4676 my $siso639langname2 = $iso639{$siso639langname} || $siso639langname;
4677 my $sopentypelang = sprintf "%-4s", locale_entry( $loc, "sopentypelang", uc $siso639langname2 );
4678 my $sabbrevlangname = defined $loc->{lcid} ? locale_entry( $loc, "sabbrevlangname", uc $siso639langname2 ) : "ZZZ";
4679 my $siso3166ctryname2 = $geo->{iso3} || $geo->{uncode};
4680 my $senglanguage = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" ) || "";
4681 my $sengcountry = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" ) || "";
4682 my $snativelangname = loc_query( $loc, "/ldml/localeDisplayNames/languages/language[\@type='$language' and not(\@alt)]" );
4683 my $snativectryname = loc_query( $loc, "/ldml/localeDisplayNames/territories/territory[\@type='$territory' and not(\@alt)]" );
4684 $sengcountry =~ s/South Korea/Korea/;
4685 $sengcountry =~ s/T\xfcrkiye/Turkey/;
4686 $snativelangname ||= $senglanguage;
4687 $snativectryname ||= $sengcountry;
4688 if ($script)
4690 my $engscript = loc_query( $lcnames{en}, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4691 my $nativescript = loc_query( $loc, "/ldml/localeDisplayNames/scripts/script[\@type='$script' and not(\@alt)]" );
4692 $senglanguage .= " ($engscript)" if $engscript;
4693 $snativelangname .= " ($nativescript)" if $nativescript;
4695 my $sengdisplayname = $neutral ? $senglanguage : "$senglanguage ($sengcountry)";
4696 my $snativedisplayname = $neutral ? $snativelangname : "$snativelangname ($snativectryname)";
4697 $sengdisplayname =~ s/\) \(/, /;
4698 $snativedisplayname =~ s/\) \(/, /;
4699 my $sscripts = locale_entry( $loc, "sscripts", $script ) || xml_query( $suppl, "/supplementalData/languageData/language[\@type='$language' and not(\@alt)]/\@scripts" );
4700 $sscripts = (join ";", (sort split / /, ($sscripts || "Latn"))) . ";";
4701 my $ireadinglayout = locale_entry( $loc, "ireadinglayout", 0 );
4702 my $charlayout = loc_query( $loc, "/ldml/layout/orientation/characterOrder" );
4703 if ($charlayout eq "right-to-left")
4705 $ireadinglayout = 1;
4707 elsif ($charlayout eq "top-to-bottom")
4709 my $linelayout = loc_query( $loc, "/ldml/layout/orientation/lineOrder" );
4710 $ireadinglayout = $linelayout eq "right-to-left" ? 2 : 3;
4712 my $igeoid = $geo->{id} || 0;
4714 # numbers
4716 my $sdecimal = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/decimal" );
4717 my $slist = locale_entry( $loc, "slist", ";" );
4718 my $smondecimalsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyDecimal" ) || $sdecimal;
4719 my $sthousand = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/group" );
4720 $sthousand =~ s/\x{202f}/\x{00a0}/;
4721 my $smonthousandsep = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/currencyGroup" ) || $sthousand;
4722 my $spositivesign = "";
4723 my $snegativesign = "-";
4724 my $spercent = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/percentSign" );
4725 my $snan = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/nan" );
4726 my $sposinfinity = loc_query( $loc, "/ldml/numbers/symbols[\@numberSystem='latn']/infinity" );
4727 my $sneginfinity = $sposinfinity ? "-$sposinfinity" : "";
4728 my $sgrouping = format_to_grouping( loc_query( $loc, "/ldml/numbers/decimalFormats[\@numberSystem='latn']/decimalFormatLength[not(\@type)]/decimalFormat/pattern" ));
4729 my $percentformat = loc_query( $loc, "/ldml/numbers/percentFormats[\@numberSystem='latn']/percentFormatLength[not(\@type)]/percentFormat/pattern" );
4730 my $currencyformat = loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='accounting']/pattern[not(\@alt)]" ) ||
4731 loc_query( $loc, "/ldml/numbers/currencyFormats[\@numberSystem='latn']/currencyFormatLength[not(\@type)]/currencyFormat[\@type='standard']/pattern[not(\@alt)]" );
4732 my $smongrouping = format_to_grouping( $currencyformat );
4733 my ($icurrency, $inegcurr) = parse_currency_format( $sname, $currencyformat );
4734 my ($ipospercent, $inegpercent) = parse_percent_format( $percentformat );
4735 my $native_numbering = loc_query( $loc, "/ldml/numbers/otherNumberingSystems/native" );
4736 my @snativedigits = split //, (locale_entry( $loc, "nativedigits", "" ) || xml_query( $numbers, "/supplementalData/numberingSystems/numberingSystem[\@id='$native_numbering']/\@digits" ));
4737 my $digitsubstitution = !(ord($snativedigits[0]) >= 0x600 && ord($snativedigits[0]) <= 0x6ff);
4738 my $measure = defined xml_query( $suppl, "/supplementalData/measurementData/measurementSystem[\@type='US' and $territory_match]" );
4739 my $papersize = defined xml_query( $suppl, "/supplementalData/measurementData/paperSize[\@type='US-Letter' and $territory_match]" );
4741 # currencies
4743 my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
4744 my $scurrency = $geo->{scurrency} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[\@alt='narrow']" );
4745 $scurrency ||= loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/symbol[not(\@alt)]" );
4746 $geo->{scurrency} = $scurrency if $scurrency;
4747 $scurrency ||= $sintlsymbol;
4748 my $sengcurrname = $loc->{sengcurrname} || loc_query( $lcnames{en}, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" );
4749 my $snativecurrname = $loc->{sengcurrname} || loc_query( $loc, "/ldml/numbers/currencies/currency[\@type='$sintlsymbol']/displayName[not(\@count)]" ) || $sengcurrname;
4750 my $icurrdigits = xml_query( $suppl, "/supplementalData/currencyData/fractions/info[\@iso4217='$sintlsymbol']/\@digits" );
4751 $icurrdigits = 2 unless defined $icurrdigits;
4753 # calendars
4755 my $firstday = xml_query( $suppl, "/supplementalData/weekData/firstDay[not(\@alt) and $territory_match]/\@day" );
4756 my $ifirstdayofweek = $firstday ? $days{$firstday} : 1;
4757 my $firstweekofyear = (xml_query( $suppl, "/supplementalData/weekData/minDays[$territory_match]/\@count" ) || 0) == 4 ? 2 : 0;
4758 my $serastring = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/eras/eraAbbr/era[\@type='1' and not(\@alt)]" );
4759 my (@sdayname, @sabbrevdayname, @sshortestdayname);
4760 foreach my $d (sort { $days{$a} <=> $days{$b} } keys %days)
4762 my $n = $days{$d};
4763 my %name;
4764 foreach my $type (qw(wide abbreviated short))
4766 $name{$type} = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/days/dayContext[\@type='format']/dayWidth[\@type='$type']/day[\@type='$d' and not(\@alt)]" );
4768 push @sdayname, $name{wide};
4769 push @sabbrevdayname, $name{abbreviated} || $name{wide};
4770 push @sshortestdayname, $name{short} || $name{abbreviated} || $name{wide};
4772 my (@smonthname, @sabbrevmonthname, @sgenitivemonth, @sabbrevgenitivemonth);
4773 foreach my $n (1..13)
4775 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='wide']/month[\@type='$n']" );
4776 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='stand-alone']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4777 my $genitive = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n']" );
4778 my $abbrevgen = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n']" );
4779 push @smonthname, $name || $genitive || "";
4780 push @sabbrevmonthname, $abbrev || $abbrevgen || $name || $genitive || "";
4781 push @sgenitivemonth, $genitive || "";
4782 push @sabbrevgenitivemonth, $abbrevgen || $genitive || "";
4784 @sgenitivemonth = () if join("|",@smonthname) eq join("|",@sgenitivemonth);
4785 @sabbrevgenitivemonth = () if join("|",@sabbrevmonthname) eq join("|",@sabbrevgenitivemonth);
4786 my %caltypes = ( "gregorian" => 1, "japanese" => 3, "chinese" => 4, "dangi" => 5, "islamic" => 6, "buddhist" => 7, "hebrew" => 8,
4787 "persian" => 22, "islamic-civil" => 23, "islamic-umalqura" => 23 );
4788 my $calpref = xml_query( $suppl, "/supplementalData/calendarPreferenceData/calendarPreference[$territory_match]/\@ordering" ) || "gregorian";
4789 my $icalendartype;
4790 my @scalnames;
4791 foreach my $c (split /\s+/, $calpref)
4793 next unless defined $caltypes{$c};
4794 $icalendartype .= chr($caltypes{$c});
4795 $scalnames[$caltypes{$c} - 1] = loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$c']" );
4798 # date/time formats
4800 my $s1159 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='am' and not(\@alt)]" );
4801 my $s2359 = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='abbreviated']/dayPeriod[\@type='pm' and not (\@alt)]" );
4802 my $sshortestam = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='am' and not(\@alt)]" );
4803 my $sshortestpm = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dayPeriods/dayPeriodContext[\@type='format']/dayPeriodWidth[\@type='narrow']/dayPeriod[\@type='pm' and not (\@alt)]" );
4804 my @stimeformat = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='medium']/timeFormat/pattern[not(\@alt)]" ));
4805 push @stimeformat, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4806 pop @stimeformat if $stimeformat[0] eq $stimeformat[1];
4807 @stimeformat = map convert_time_format($_), @stimeformat;
4808 my @sshorttime = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/timeFormats/timeFormatLength[\@type='short']/timeFormat/pattern[not(\@alt)]" ));
4809 push @sshorttime, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hm' and not(\@alt)]" );
4810 pop @sshorttime if $sshorttime[0] eq $sshorttime[1];
4811 @sshorttime = map convert_time_format($_), @sshorttime;
4812 my @sshortdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
4813 push @sshortdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
4814 @sshortdate = map convert_date_format($_), @sshortdate;
4815 my @slongdate = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" ));
4816 push @slongdate, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
4817 @slongdate = map convert_date_format($_), @slongdate;
4818 my @smonthday = (loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMd' and not(\@alt)]" ));
4819 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Md' and not(\@alt)]" );
4820 push @smonthday, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMd' and not(\@alt)]" );
4821 @smonthday = map convert_date_format($_), @smonthday;
4822 my @syearmonth = map convert_date_format($_), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMM' and not(\@alt)]" );
4823 my @sduration = map convert_time_format( lc $_ ), loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='Hms' and not(\@alt)]" );
4824 my $srelativelongdate = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMMEd' and not(\@alt)]" ) ||
4825 loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='gregorian']/dateTimeFormats/availableFormats/dateFormatItem[\@id='MMMEd' and not(\@alt)]" );
4826 $srelativelongdate = convert_date_format( $srelativelongdate );
4828 if (defined $loc->{calendar})
4830 foreach my $cal (@{$loc->{calendar}})
4832 $cal->{sshortdate} = \@sshortdate;
4833 $cal->{syearmonth} = \@syearmonth;
4834 $cal->{slongdate} = \@slongdate;
4835 $cal->{serastring} = [ $serastring ];
4836 $cal->{sdayname} = \@sdayname;
4837 $cal->{sabbrevdayname} = \@sabbrevdayname;
4838 $cal->{smonthname} = \@smonthname;
4839 $cal->{sabbrevmonthname} = \@sabbrevmonthname;
4840 $cal->{scalname} = $scalnames[$cal->{id}];
4841 $cal->{smonthday} = \@smonthday;
4842 $cal->{sshortestdayname} = \@sshortestdayname;
4843 $cal->{sabbreverastring} = [ $serastring ];
4844 $cal->{sshortestdayname} = \@sshortestdayname;
4845 $cal->{srelativelongdate} = $srelativelongdate;
4849 # codepages
4851 my %ansicpmap = ( 437 => 1252, 720 => 1256, 737 => 1253, 775 => 1257, 850 => 1252,
4852 852 => 1250, 855 => 1251, 866 => 1251, 857 => 1254, 862 => 1255 );
4853 my %maccpmap = ( 437 => 10000, 720 => 10004, 737 => 10006, 775 => 10029, 850 => 10000,
4854 852 => 10029, 855 => 10007, 857 => 10081, 862 => 10005, 866 => 10007,
4855 874 => 10021, 932 => 10001, 936 => 10008, 949 => 10003, 950 => 10002,
4856 1258 => 10000 );
4857 my %ebcdiccpmap = ( 437 => 37, 720 => 20420, 737 => 20273, 866 => 20880, 932 => 20290 );
4858 my %codepagemasks = ( 874 => [ 0x01000000, 0x00000000, 0x00000000, 0, 0x00010000, 0x00000000, 0x00010000, 0x00000000 ],
4859 932 => [ 0x00000000, 0x28c70000, 0x00000010, 0, 0x00020000, 0x00000000, 0x00020000, 0x00000000 ],
4860 936 => [ 0x00000000, 0x28010000, 0x00000002, 0, 0x00040000, 0x00000000, 0x00040000, 0x00000000 ],
4861 949 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00080000, 0x00000000, 0x00080000, 0x00000000 ],
4862 950 => [ 0x00000000, 0x28c10000, 0x00000012, 0, 0x00100000, 0x00000000, 0x00100000, 0x00000000 ],
4863 1258 => [ 0x2000000f, 0x00000000, 0x00000000, 0, 0x00000100, 0x00008000, 0x00000100, 0x00008000 ],
4864 866 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x00020000, 0x00000004, 0x02020000 ],
4865 862 => [ 0x00000800, 0x40000000, 0x00000000, 0, 0x00000020, 0x00200000, 0x00000020, 0x00200000 ],
4866 857 => [ 0x0000001f, 0x00000000, 0x00000000, 0, 0x00000010, 0x01000000, 0x00000010, 0x01000000 ],
4867 855 => [ 0x00000200, 0x00000000, 0x00000000, 0, 0x00000004, 0x02000000, 0x00000004, 0x02000000 ],
4868 852 => [ 0x00000027, 0x00000000, 0x00000000, 0, 0x00000002, 0x04000000, 0x00000002, 0x04000000 ],
4869 775 => [ 0x00000007, 0x00000000, 0x00000000, 0, 0x00000080, 0x08000000, 0x00000080, 0x08000000 ],
4870 737 => [ 0x00000080, 0x00000000, 0x00000000, 0, 0x00000008, 0x10000000, 0x00000008, 0x10010000 ],
4871 720 => [ 0x00002000, 0x00000000, 0x00000000, 0, 0x00000040, 0x20000000, 0x00000040, 0x20080000 ],
4872 850 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x40000000, 0x0000019f, 0xdfd70000 ],
4873 437 => [ 0x00000003, 0x00000000, 0x00000000, 0, 0x00000001, 0x80000000, 0x0000019f, 0xdfd70000 ],
4874 65001 => [ 0x00000000, 0x00000000, 0x00000000, 0, 0x00000000, 0x00000000, 0x0000019f, 0xdfd70000 ] );
4875 my $oemcp = locale_entry( $loc, "oemcp", 65001 );
4876 my $maccp = locale_entry( $loc, "maccp", undef ) || $maccpmap{$oemcp} || 65001;
4877 my $ebcdiccp = locale_entry( $loc, "ebcdiccp", undef ) || $ebcdiccpmap{$oemcp} || 500;
4878 $ebcdiccp = 500 if (defined $loc->{oemcp} && $loc->{oemcp} == 65001) || (defined $loc->{maccp} && $loc->{maccp} == 65001);
4879 my $ansicp = $ansicpmap{$oemcp} || $oemcp;
4880 my @fontsig = (0) x 8;
4881 my $sig = locale_entry( $loc, "fontsig", [] );
4882 foreach my $i (0..7) { $fontsig[$i] |= $codepagemasks{$oemcp}->[$i]; }
4883 foreach my $i (0..$#{$sig}) { $fontsig[$i] |= $sig->[$i]; }
4884 $fontsig[3] |= 1 << 31;
4885 $fontsig[3] |= 1 << 27 if $ireadinglayout == 1;
4886 $fontsig[3] |= 1 << 28 if $ireadinglayout == 3;
4888 # special cases for invariant locale
4890 unless ($loc->{name})
4892 $siso639langname = "iv";
4893 $siso639langname2 = "ivl";
4894 $senglanguage = $snativelangname = "Invariant Language";
4895 $sengcountry = $snativectryname = "Invariant Country";
4896 $sengdisplayname = "Invariant Language (Invariant Country)";
4897 $snativedisplayname = "Invariant Language (Invariant Region)";
4898 $sengcurrname = $snativecurrname = "International Monetary Fund";
4899 $scurrency = "\x{00a4}";
4900 $ifirstdayofweek = 0;
4901 $igeoid = $geotable{"US"}->{id};
4902 @stimeformat = ("HH:mm:ss");
4903 @sshortdate = ("MM/dd/yyyy", "yyyy-MM-dd");
4904 @slongdate = ("dddd, dd MMMM yyyy");
4905 @syearmonth = ("yyyy MMMM");
4906 @smonthday = ("MMMM dd", "MMMM d", "M/d", "MMM d");
4907 @sshorttime = ("HH:mm", "hh:mm tt", "H:mm", "h:mm tt");
4908 $srelativelongdate = "dddd, MMMM dd";
4909 $sposinfinity = "Infinity";
4910 $sneginfinity = "-Infinity";
4911 $spositivesign = "+";
4912 $ipospercent = $inegpercent = 0;
4915 # output data
4917 $locale_data .= pack "L<2",
4918 add_string( $sname ), # name
4919 add_string( $sopentypelang ); # LOCALE_SOPENTYPELANGUAGETAG
4921 $locale_data .= pack "S<14",
4922 $loc->{lcid} || 0x1000, # LOCALE_ILANGUAGE
4923 $unique_lcid, # unique_lcid
4924 locale_entry( $loc, "idigits", 2 ), # LOCALE_IDIGITS
4925 locale_entry( $loc, "inegnumber", 1 ), # LOCALE_INEGNUMBER
4926 $icurrdigits, # LOCALE_ICURRDIGITS
4927 $icurrency, # LOCALE_ICURRENCY
4928 $inegcurr, # LOCALE_INEGCURR
4929 locale_entry( $loc, "ilzero", 1 ), # LOCALE_ILZERO
4930 !$neutral, # LOCALE_INEUTRAL
4931 $ifirstdayofweek, # LOCALE_IFIRSTDAYOFWEEK
4932 $firstweekofyear, # LOCALE_IFIRSTWEEKOFYEAR
4933 $geo->{dialcode} || 1 , # LOCALE_ICOUNTRY,
4934 $measure, # LOCALE_IMEASURE
4935 $digitsubstitution; # LOCALE_IDIGITSUBSTITUTION
4937 $locale_data .= pack "L<18",
4938 add_string( $sgrouping ), # LOCALE_SGROUPING
4939 add_string( $smongrouping ), # LOCALE_SMONGROUPING
4940 add_string( $slist ), # LOCALE_SLIST
4941 add_string( $sdecimal ), # LOCALE_SDECIMAL
4942 add_string( $sthousand ), # LOCALE_STHOUSAND
4943 add_string( $scurrency ), # LOCALE_SCURRENCY
4944 add_string( $smondecimalsep ), # LOCALE_SMONDECIMALSEP
4945 add_string( $smonthousandsep ), # LOCALE_SMONTHOUSANDSEP
4946 add_string( $spositivesign ), # LOCALE_SPOSITIVESIGN
4947 add_string( $snegativesign ), # LOCALE_SNEGATIVESIGN
4948 add_string( $s1159 ), # LOCALE_S1159
4949 add_string( $s2359 ), # LOCALE_S2359
4950 add_strarray( @snativedigits ), # LOCALE_SNATIVEDIGITS
4951 add_strarray( @stimeformat ), # LOCALE_STIMEFORMAT
4952 add_strarray( @sshortdate ), # LOCALE_SSHORTDATE
4953 add_strarray( @slongdate ), # LOCALE_SLONGDATE
4954 add_strarray( @syearmonth ), # LOCALE_SYEARMONTH
4955 add_strarray( @sduration ); # LOCALE_SDURATION
4957 $locale_data .= pack "S<8",
4958 $idefaultlanguage || 0x1000, # LOCALE_IDEFAULTLANGUAGE
4959 $ansicp, # LOCALE_IDEFAULTANSICODEPAGE
4960 $oemcp, # LOCALE_IDEFAULTCODEPAGE
4961 $maccp, # LOCALE_IDEFAULTMACCODEPAGE
4962 $ebcdiccp, # LOCALE_IDEFAULTEBCDICCODEPAGE
4963 $igeoid < 65536 ? $igeoid : 39070, # old_geoid
4964 $papersize ? 1 : 9, # LOCALE_IPAPERSIZE
4965 0; # FIXME # islamic_cal
4967 $locale_data .= pack "L<24",
4968 add_string( $icalendartype ), # LOCALE_ICALENDARTYPE
4969 add_string( $sabbrevlangname ), # LOCALE_SABBREVLANGNAME
4970 add_string( $siso639langname ), # LOCALE_SISO639LANGNAME
4971 add_string( $senglanguage ), # LOCALE_SENGLANGUAGE
4972 add_string( $snativelangname ), # LOCALE_SNATIVELANGNAME
4973 add_string( $sengcountry ), # LOCALE_SENGCOUNTRY
4974 add_string( $snativectryname ), # LOCALE_SNATIVECTRYNAME
4975 add_string( $siso3166ctryname2 ), # LOCALE_SABBREVCTRYNAME
4976 add_string( $territory ), # LOCALE_SISO3166CTRYNAME
4977 add_string( $sintlsymbol ), # LOCALE_SINTLSYMBOL
4978 add_string( $sengcurrname ), # LOCALE_SENGCURRNAME
4979 add_string( $snativecurrname ), # LOCALE_SNATIVECURRNAME
4980 add_fontsig( @fontsig ), # LOCALE_FONTSIGNATURE
4981 add_string( $siso639langname2 ), # LOCALE_SISO639LANGNAME2
4982 add_string( $siso3166ctryname2 ), # LOCALE_SISO3166CTRYNAME2
4983 add_string( $sparent ), # LOCALE_SPARENT
4984 add_strarray( @sdayname ), # LOCALE_SDAYNAME
4985 add_strarray( @sabbrevdayname ), # LOCALE_SABBREVDAYNAME
4986 add_strarray( @smonthname ), # LOCALE_SMONTHNAME
4987 add_strarray( @sabbrevmonthname ), # LOCALE_SABBREVMONTHNAME
4988 add_strarray( @sgenitivemonth ), # LOCALE_SGENITIVEMONTH
4989 add_strarray( @sabbrevgenitivemonth ), # LOCALE_SABBREVGENITIVEMONTH
4990 add_strarray( @scalnames ), # LOCALE_SCALNAMES
4991 add_strarray( @{$loc->{sortnames}} ); # LOCALE_SSORTNAMES
4993 $locale_data .= pack "S<6",
4994 $inegpercent, # LOCALE_INEGATIVEPERCENT
4995 $ipospercent, # LOCALE_IPOSITIVEPERCENT
4996 0, # unknown
4997 $ireadinglayout, # LOCALE_IREADINGLAYOUT
4998 0x2a, # unknown
4999 0x2a; # unknown
5001 $locale_data .= pack "L<24",
5002 0, # unknown
5003 add_string( $sengdisplayname ), # LOCALE_SENGLISHDISPLAYNAME
5004 add_string( $snativedisplayname ), # LOCALE_SNATIVEDISPLAYNAME
5005 add_string( $spercent ), # LOCALE_SPERCENT
5006 add_string( $snan ), # LOCALE_SNAN
5007 add_string( $sposinfinity ), # LOCALE_SPOSINFINITY
5008 add_string( $sneginfinity ), # LOCALE_SNEGINFINITY
5009 0, # unknown
5010 add_string( $serastring ), # CAL_SERASTRING
5011 add_string( $serastring ), # CAL_SABBREVERASTRING
5012 0, # unknown
5013 add_string( $ssortlocale ), # LOCALE_SCONSOLEFALLBACKNAME
5014 add_strarray( @sshorttime ), # LOCALE_SSHORTTIME
5015 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5016 0, # unknown
5017 add_string( $ssortlocale ), # LOCALE_SSORTLOCALE
5018 add_string( "0409:00000409" ), # FIXME # LOCALE_SKEYBOARDSTOINSTALL
5019 add_string( $sscripts ), # LOCALE_SSCRIPTS
5020 add_string( $srelativelongdate ), # LOCALE_SRELATIVELONGDATE
5021 $igeoid, # LOCALE_IGEOID
5022 add_string( $sshortestam || "a" ), # LOCALE_SSHORTESTAM
5023 add_string( $sshortestpm || "p" ), # LOCALE_SSHORTESTPM
5024 add_strarray( @smonthday ), # LOCALE_SMONTHDAY
5025 add_string( "k0-windows-us" ) # FIXME # keyboard_layout
5028 # output language groups
5030 my %groups;
5031 add_registry_key( $nlskey, "Locale", "00000409" );
5032 foreach my $loc (@locales)
5034 next unless defined $loc->{lcid};
5035 next if ($loc->{lcid} & 0x80000000);
5036 next if !defined($loc->{alias}) && $loc->{name} !~ /-$loc->{territory}/; # skip neutral locales
5037 my $group = locale_entry( $loc, "group", 1 );
5038 my $name = sprintf( "%08x", $loc->{lcid} );
5039 my $val = sprintf( "%x", $group );
5040 add_registry_string_value( $nlskey, "Locale", $name, $val ) unless ($loc->{lcid} & 0x000f0000);
5041 add_registry_string_value( $nlskey, "Locale\\Alternate Sorts", $name, $val ) if $loc->{name} =~ /_/;
5042 $groups{$val} = 1;
5044 foreach my $group (keys %groups) { add_registry_string_value( $nlskey, "Language Groups", $group, "1" ); }
5046 # output calendar data
5048 my $calendar_data = "";
5049 foreach my $cal (@calendars)
5051 my $scalname = $cal->{name};
5052 my $iyearoffsetrange = 0;
5053 my $itwodigityearmax = $cal->{itwodigityearmax};
5054 my @sshortdate;
5055 my @syearmonth;
5056 my @slongdate;
5057 my @serastring;
5058 my @sdayname;
5059 my @sabbrevdayname;
5060 my @smonthname;
5061 my @sabbrevmonthname;
5062 my @smonthday;
5063 my @sabbreverastring;
5064 my @sshortestdayname;
5066 my $type = $cal->{type};
5067 if (defined $cal->{locale} && defined $type)
5069 my $loc = $lcnames{$cal->{locale}};
5070 my $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMd' and not(\@alt)]" );
5071 push @sshortdate, $fmt if $fmt;
5072 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMd' and not(\@alt)]" );
5073 push @sshortdate, $fmt if $fmt;
5074 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yMMMd' and not(\@alt)]" );
5075 push @sshortdate, $fmt if $fmt;
5076 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateTimeFormats/availableFormats/dateFormatItem[\@id='yyyyMMMd' and not(\@alt)]" );
5077 push @sshortdate, $fmt if $fmt;
5078 @sshortdate = map convert_date_format($_), @sshortdate;
5079 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='full']/dateFormat/pattern[not(\@alt)]" );
5080 push @slongdate, $fmt if $fmt;
5081 $fmt = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/dateFormats/dateFormatLength[\@type='long']/dateFormat/pattern[not(\@alt)]" );
5082 push @slongdate, $fmt if $fmt;
5083 @slongdate = map convert_date_format($_), @slongdate;
5085 foreach my $n (1..13)
5087 my $name = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='wide']/month[\@type='$n' and not(\@yeartype)]" );
5088 my $abbrev = loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/months/monthContext[\@type='format']/monthWidth[\@type='abbreviated']/month[\@type='$n' and not(\@yeartype)]" );
5089 push @smonthname, $name || "";
5090 push @sabbrevmonthname, $abbrev || $name || "";
5093 $scalname ||= loc_query( $loc, "/ldml/localeDisplayNames/types/type[\@key='calendar' and \@type='$type']" );
5094 if (defined $cal->{eras})
5096 my @eras;
5097 my $idx = 1;
5098 foreach my $era (@{$cal->{eras}})
5100 my $start = xml_query( $suppl, "/supplementalData/calendarData/calendar[\@type='$type']/eras/era[\@type='$era']/\@start" );
5101 next unless $start =~ /^(-?\d+)-(\d+)-(\d+)/;
5102 my ($year, $mon, $day, $zero, $first) = ($1, $2, $3, $1 - 1, 1);
5103 if ($zero < 0)
5105 $first -= $zero;
5106 $year = 1;
5107 $itwodigityearmax = 2049 - $zero;
5109 unshift @eras, pack( "S<8", 6, $idx++, $year, $mon, $day, $zero, $first, 0 );
5110 push @serastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraAbbr/era[\@type='$era']" );
5111 push @sabbreverastring, loc_query( $loc, "/ldml/dates/calendars/calendar[\@type='$type']/eras/eraNarrow/era[\@type='$era']" );
5113 $iyearoffsetrange = add_str_data( pack "S<L<*", scalar @eras, map { add_str_data($_); } @eras );
5117 @sshortdate = @{$cal->{sshortdate}} if defined $cal->{sshortdate} && !@sshortdate;
5118 @syearmonth = @{$cal->{syearmonth}} if defined $cal->{syearmonth};
5119 @slongdate = @{$cal->{slongdate}} if defined $cal->{slongdate} && !@slongdate;
5120 @serastring = @{$cal->{serastring}} if defined $cal->{serastring} && !@serastring;
5121 @sdayname = @{$cal->{sdayname}} if defined $cal->{sdayname};
5122 @sabbrevdayname = @{$cal->{sabbrevdayname}} if defined $cal->{sabbrevdayname};
5123 @smonthname = @{$cal->{smonthname}} if defined $cal->{smonthname} && !join("",@smonthname);
5124 @sabbrevmonthname = @{$cal->{sabbrevmonthname}} if defined $cal->{sabbrevmonthname} && !join("",@sabbrevmonthname);
5125 @smonthday = @{$cal->{smonthday}} if defined $cal->{smonthday};
5126 @sabbreverastring = @{$cal->{sabbreverastring}} if defined $cal->{sabbreverastring} && !@sabbreverastring;
5127 @sshortestdayname = @{$cal->{sshortestdayname}} if defined $cal->{sshortestdayname};
5128 my $srelativelongdate = $cal->{srelativelongdate};
5130 @serastring = ("A.D.") unless @serastring;
5131 @sabbreverastring = ("AD") unless @sabbreverastring;
5133 if ($cal->{id} != 1) # calendar 1 is a placeholder, information is fetched from locale instead
5135 @sshortdate = ("") unless @sshortdate;
5136 @syearmonth = ("") unless @syearmonth;
5137 @slongdate = ("") unless @slongdate;
5138 @sdayname = ("") x 7 unless @sdayname;
5139 @sabbrevdayname = ("") x 7 unless @sabbrevdayname;
5140 @sshortestdayname = ("") x 7 unless @sshortestdayname;
5141 @smonthname = ("") x 13 unless @smonthname;
5142 @sabbrevmonthname = ("") x 13 unless @sabbrevmonthname;
5143 @smonthday = ("") unless @smonthday;
5146 $calendar_data .= pack "S<2L<17",
5147 $cal->{id}, # CAL_ICALINTVALUE
5148 $itwodigityearmax || 99, # CAL_ITWODIGITYEARMAX
5149 add_strarray( @sshortdate ), # CAL_SSHORTDATE
5150 add_strarray( @syearmonth ), # CAL_SYEARMONTH
5151 add_strarray( @slongdate ), # CAL_SLONGDATE
5152 add_strarray( @serastring ), # CAL_SERASTRING
5153 $iyearoffsetrange, # CAL_IYEAROFFSETRANGE
5154 add_strarray( @sdayname ), # CAL_SDAYNAME
5155 add_strarray( @sabbrevdayname ), # CAL_SABBREVDAYNAME
5156 add_strarray( @smonthname ), # CAL_SMONTHNAME
5157 add_strarray( @sabbrevmonthname ), # CAL_SABBREVMONTHNAME
5158 add_string( $scalname ), # CAL_SCALNAME
5159 add_strarray( @smonthday ), # CAL_SMONTHDAY
5160 add_strarray( @sabbreverastring ), # CAL_SABBREVERASTRING
5161 add_strarray( @sshortestdayname ), # CAL_SSHORTESTDAYNAME
5162 add_string( $srelativelongdate ); # CAL_SRELATIVELONGDATE
5165 # output locale header
5167 my $nb_lcids = scalar keys %lcids;
5168 my $nb_locales = scalar grep { !defined $_->{alias} } @locales;
5169 my $nb_lcnames = scalar keys %lcnames;
5170 my $locale_size = length($locale_data) / $nb_locales;
5171 my $nb_calendars = scalar @calendars;
5172 my $calendar_size = length($calendar_data) / $nb_calendars;
5173 my $lcids_offset = 19 * 4; # size of header
5174 my $lcnames_offset = $lcids_offset + length $lcid_data;
5175 my $locales_offset = $lcnames_offset + length $lcname_data;
5176 my $calendar_offset = $locales_offset + length $locale_data;
5177 my $strings_offset = $calendar_offset + length $calendar_data;
5179 my $locale_header = pack "L<7S<4L<S<2L<3S<2L<4",
5180 8, # offset
5182 7, # version
5183 0x5344534e, # magic
5184 0, 0, 0,
5186 $nb_lcids,
5187 $nb_locales,
5188 $locale_size,
5189 $locales_offset,
5190 $nb_lcnames,
5192 $lcids_offset,
5193 $lcnames_offset,
5195 $nb_calendars,
5196 $calendar_size,
5197 $calendar_offset,
5198 $strings_offset,
5199 0, 0;
5201 return align_string( 4, $locale_header . $lcid_data . $lcname_data . $locale_data . $calendar_data . $string_data );
################################################################
# build the charmaps table for locale.nls
#
# Returns a single string holding the dumped case tables, in this order:
# digit folding, CJK compatibility, hiragana, katakana, halfwidth,
# fullwidth, traditional Chinese, simplified Chinese.
# The digit, halfwidth and fullwidth tables are patched in place with a
# few extra mappings before they are dumped.
sub build_charmaps_data()
{
    my $zero = ord('0');

    # MAP_FOLDDIGITS
    # each entry is [ first char, last char, digit value of the first char ];
    # the following chars of a range map to the following digits
    my @extra_digits =
    (
        [ 0x3007,  0x3007,  0 ],  # Ideographic Zero
        [ 0x0c78,  0x0c7b,  0 ],  # Telugu Fraction Digits
        [ 0x0c7c,  0x0c7e,  1 ],  # Telugu Fraction Digits
        [ 0x3021,  0x3029,  1 ],  # Hangzhou Numerals
        [ 0xa8e0,  0xa8e9,  0 ],  # Combining Devanagari Digits
        [ 0x10107, 0x1010f, 1 ],  # Aegean Numbers
        [ 0x10320, 0x10320, 1 ],  # Old Italic Numerals
        [ 0x10321, 0x10321, 5 ],  # Old Italic Numerals
    );
    foreach my $range (@extra_digits)
    {
        my ($first, $last, $value) = @{$range};
        foreach my $ch ($first .. $last) { $digitmap_table[$ch] = $zero + $value++; }
    }
    my $result = dump_binary_case_table( @digitmap_table );

    # CJK compatibility map
    $result .= dump_binary_case_table( @cjk_compat_table );

    # LCMAP_HIRAGANA/KATAKANA
    # each mapped katakana char is 0x60 code points above its hiragana one
    my (@to_hiragana, @to_katakana);
    foreach my $hira (0x3041..0x3096, 0x309d..0x309e)
    {
        my $kata = $hira + 0x60;
        $to_hiragana[$kata] = $hira;
        $to_katakana[$hira] = $kata;
    }
    $result .= dump_binary_case_table( @to_hiragana );
    $result .= dump_binary_case_table( @to_katakana );

    # LCMAP_HALFWIDTH/FULLWIDTH
    my @extra_halfwidth =
    (
        [ 0x2018, 0x0027 ],
        [ 0x2019, 0x0027 ],
        [ 0x201c, 0x0022 ],
        [ 0x201d, 0x0022 ],
        [ 0x309b, 0xff9e ],
        [ 0x309c, 0xff9f ],
    );
    my @extra_fullwidth =
    (
        [ 0x309b, 0x3099 ],
        [ 0x309c, 0x309a ],
    );
    foreach my $pair (@extra_halfwidth) { $halfwidth_table[$pair->[0]] = $pair->[1]; }
    foreach my $pair (@extra_fullwidth) { $fullwidth_table[$pair->[0]] = $pair->[1]; }
    $result .= dump_binary_case_table( @halfwidth_table );
    $result .= dump_binary_case_table( @fullwidth_table );

    # LCMAP_TRADITIONAL/SIMPLIFIED_CHINESE
    $result .= dump_binary_case_table( @chinese_traditional_table );
    $result .= dump_binary_case_table( @chinese_simplified_table );

    # FIXME: some more unknown tables here

    return $result;
}
################################################################
# build the geoids table for locale.nls
#
# Layout of the returned string:
#   - a header of 7 little-endian 32-bit values: two magic values (whose
#     bytes spell "geo" in UTF-16LE), total size, header size, number of
#     geoid records, offset of the name index, number of index entries
#   - one fixed-size record per entry of @geoids
#   - the name index, sorted by name: padded UTF-16LE name + record position
sub build_geoids_data()
{
    my $header_size = 4 * 7;
    my $nb_geoids = scalar @geoids;
    my $records = "";
    my %index;
    my $pos = 0;

    foreach my $geo (@geoids)
    {
        # an alias entry keeps its own id but takes everything else from its target
        # NOTE: the loop variable aliases the array element, so this assignment
        # also replaces the alias entry inside @geoids itself
        my $id = $geo->{id};
        $geo = $geo->{alias} if defined $geo->{alias};

        my $latitude    = "0.000";
        my $longitude   = "0.000";
        my $iso2        = $geo->{iso2} || "XX";
        my $iso3        = $geo->{iso3} || "XX";
        my $sintlsymbol = $geo->{sintlsymbol} || "XDR";
        my $scurrency   = $geo->{scurrency} || "\x{00a4}";
        # entries flagged as region, or having a UN code but no ISO2 code, count as regions
        my $isregion    = $geo->{region} || (defined $geo->{uncode} && !defined $geo->{iso2});

        $records .= pack( "L<", $id )
                  . pad_string( 24, encode( "UTF16LE", $latitude ))
                  . pad_string( 24, encode( "UTF16LE", $longitude ))
                  . pack( "L<2", $isregion ? 14 : 16, $geo->{parentid} || 39070 )
                  . pad_string( 8, encode( "UTF16LE", $iso2 ))
                  . pad_string( 8, encode( "UTF16LE", $iso3 ))
                  . pack( "S<2", $geo->{uncode} || 0, $geo->{dialcode} || 0 )
                  . pad_string( 8, encode( "UTF16LE", $sintlsymbol ))
                  . pad_string( 16, encode( "UTF16LE", $scurrency ));

        $index{$geo->{name}} = $pos if $geo->{name};
        $pos++;
    }
    # "XX" shares the record position of "001"
    $index{"XX"} = $index{"001"};

    my $index_offset = $header_size + length $records;
    my $nb_names = scalar keys %index;

    my $index_data = "";
    foreach my $name (sort keys %index)
    {
        $index_data .= pad_string( 8, encode( "UTF16LE", $name )) . pack( "L<", $index{$name} );
    }

    my $total_size = $index_offset + length $index_data;
    my $header = pack( "L<7", 0x00650067, 0x0000006f, $total_size, $header_size,
                       $nb_geoids, $index_offset, $nb_names );
    return $header . $records . $index_data;
}
################################################################
# build a binary locale table
#
# Parameters:
#   $filename  - output file; the data is written to "$filename.new" and
#                then handed over to save_file()
#   $chartypes - already built chartypes data, stored first in the file
#
# The file starts with a header of 8 little-endian 32-bit values. Slot 0
# is the chartypes offset, slots 4..7 are the offsets of the locales,
# charmaps, geoids and scripts sections; slots 1..3 are left at zero.
# Dies if the output file cannot be created or completely written.
sub dump_locales($$)
{
    my ($filename, $chartypes) = @_;

    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    my $locale_data = build_locale_data();
    my $charmaps_data = build_charmaps_data();
    my $geoids_data = build_geoids_data();
    my $scripts_data = "";  # FIXME

    my @header = ( 0 ) x 8;
    $header[0] = 4 * scalar @header;                   # chartypes offset
    $header[4] = $header[0] + length $chartypes;       # locales offset
    $header[5] = $header[4] + length $locale_data;     # charmaps offset
    $header[6] = $header[5] + length $charmaps_data;   # geoids offset
    $header[7] = $header[6] + length $geoids_data;     # scripts offset

    # lexical handle with an explicit raw layer: the content is binary
    open my $output, ">:raw", "$filename.new" or die "Cannot create $filename.new: $!";
    print $output pack( "L<*", @header ), $chartypes, $locale_data, $charmaps_data, $geoids_data, $scripts_data;
    # buffered write errors only show up when closing
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# return the day of week of the first of the month
#
# $month is 1-based; the result uses the gmtime() numbering (0 = Sunday).
sub month_first_dow($$)
{
    my ($year, $month) = @_;
    my $midnight = timegm_modern( 0, 0, 0, 1, $month - 1, $year );
    return (gmtime( $midnight ))[6];
}
################################################################
# compare system time values
#
# Both arguments are references to arrays of 7 numbers, compared field
# by field from the first to the last one. Returns the result of the
# first comparison that differs (-1 or 1), or 0 if all fields are equal.
sub compare_systime($$)
{
    my ($left, $right) = @_;
    foreach my $field (0 .. 6)
    {
        my $order = $left->[$field] <=> $right->[$field];
        return $order if $order;
    }
    return 0;
}
################################################################
# compare the zone transition date with the rule date
#
# $zone is a reference to a zone date as found in the tzdata files (year,
# optionally followed by month, day and time); $rule is a reference to an
# already parsed date as returned by parse_transition_date.
# With at most a year in $zone only the years are compared: the result is
# 1 if the zone has no year or a later year than the rule, -1 otherwise.
# A more precise zone date is parsed and compared with compare_systime.
sub compare_transition_date($$$$)
{
    my ($stdoff, $isdst, $zone, $rule) = @_;

    if (scalar @{$zone} > 1)
    {
        my @date = parse_transition_date( $stdoff, $isdst, $zone->[0], $zone->[1], $zone->[2], $zone->[3] || 0 );
        return compare_systime( \@date, $rule );
    }

    my $zone_year = $zone->[0];
    return 1 unless defined $zone_year;
    return $zone_year > $rule->[0] ? 1 : -1;
}
################################################################
# get the Windows zone names from the CLDR data
#
# Scans windowsZones.xml line by line. A "(UTC..." comment sets the
# display name for the mapZone entries that follow it; only the entries
# for territory "001" are used.
# Returns a hash: Windows zone name => [ display name, tzdata zone name ].
sub load_windows_zones()
{
    my %zone_for;
    my $display;
    my $root = "cldr-release-$CLDRVERSION";
    my $INPUT = open_data_file( "cldr", "$root/common/supplemental/windowsZones.xml" );

    while (<$INPUT>)
    {
        $display = $1 if /<!-- +(\(UTC.*) -->.*/;
        next unless /<mapZone other="(.*)" territory="001" type="(.*)"\/>/;
        $zone_for{$1} = [ $display, $2 ];
    }
    close $INPUT;
    return %zone_for;
}
################################################################
# parse a transition date specification from the tzdata files
#
# Parameters:
#   $stdoff - zone offset in minutes, subtracted from times flagged "u"
#   $isdst  - when false, one hour is added to times not flagged "w"
#   $year   - year of the transition
#   $in     - month abbreviation ("Jan".."Dec"), defaults to January
#   $on     - day specification: "lastSun", "Sun>=8", "Sun<=25" or a day
#             number, defaults to "1"
#   $at     - time as h, h:mm or h:mm:ss with an optional u/w/s suffix,
#             defaults to "0"
#
# Returns (year, month, week, day of week, hour, minute, second), where
# week 5 stands for the last week of the month.
# Dies on a date or time specification that is not recognized.
sub parse_transition_date($$@)
{
    use integer;
    my ($stdoff, $isdst, $year, $in, $on, $at) = @_;

    my %month_num = ( Jan => 1, Feb => 2, Mar => 3, Apr => 4, May => 5, Jun => 6,
                      Jul => 7, Aug => 8, Sep => 9, Oct => 10, Nov => 11, Dec => 12 );
    my %day_num = ( Sun => 0, Mon => 1, Tue => 2, Wed => 3, Thu => 4, Fri => 5, Sat => 6 );

    $on = "1" unless defined $on;
    $at = "0" unless defined $at;

    my $mon = $in ? $month_num{$in} : 1;
    my $first = month_first_dow( $year, $mon );
    my ($week, $dow, $flag, $time, $sec);

    # switch to the last week of the previous month, wrapping over the year
    my $to_previous_month = sub
    {
        $week = 5;
        return if --$mon;
        $mon = 12;
        $year--;
    };

    # day specification
    if ($on =~ /^last(.*)$/)
    {
        $week = 5;
        $dow = $day_num{$1};
    }
    elsif ($on =~ /^(.*)>=(\d+)$/)
    {
        $dow = $day_num{$1};
        my $diff = ($first + 6 - $dow) % 7;
        $week = $2 >= 25 ? 5 : ($2 + 6 + $diff) / 7;
    }
    elsif ($on =~ /^(.*)<=(\d+)$/)
    {
        $dow = $day_num{$1};
        my $diff = ($first + $2 + 6 - $dow) % 7;
        $week = ($2 + 6 - $diff) / 7;
        $to_previous_month->() if !$week;
    }
    elsif ($on =~ /^\d+$/)
    {
        $dow = ($first + $on - 1) % 7;
        $week = $on >= 25 ? 5 : ($on + 6) / 7;
    }
    else
    {
        die "unsupported date specification $year $in $on $at";
    }

    # time specification, converted to minutes
    if ($at =~ /^(\d+):(\d+):(\d+)([uws]?)$/)
    {
        ($time, $sec, $flag) = ($1 * 60 + $2, $3, $4);
    }
    elsif ($at =~ /^(\d+):(\d+)([uws]?)$/)
    {
        ($time, $flag) = ($1 * 60 + $2, $3);
    }
    elsif ($at =~ /^(\d+)([uws]?)$/)
    {
        ($time, $flag) = ($1 * 60, $2);
    }
    else
    {
        die "unsupported time specification $year $in $on $at";
    }

    # a time without suffix is handled like one flagged "w"
    $flag ||= "w";
    $time -= $stdoff if $flag eq "u";
    $time += 60 if !$isdst && $flag ne "w";

    if ($time < 0)  # previous day
    {
        $week-- if $week < 5 && $dow == month_first_dow( $year, $mon );
        $week-- if $week == 5 && $dow == month_first_dow( $year + ($mon == 12), $mon % 12 + 1 );
        $to_previous_month->() if !$week;
        $dow = ($dow + 6) % 7;
        $time += 24 * 60;
    }

    return ($year, $mon, $week, $dow, $time / 60, $time % 60, $sec || 0);
}
################################################################
# pack a system time value as a SYSTEMTIME structure
#
# Takes the list returned by parse_transition_date and returns 8
# little-endian 16-bit values: 0 in place of the year, month, day of
# week, week, hour, minute, second and 0 milliseconds.
# An hour of 24 or more is stored as 23:59:59.999.
sub pack_systime(@)
{
    my (undef, $month, $week, $weekday, $hours, $minutes, $seconds) = @_;
    my @clock = (23, 59, 59, 999);
    @clock = ($hours, $minutes, $seconds, 0) if $hours < 24;
    return pack( "S<8", 0, $month, $weekday, $week, @clock );
}
################################################################
# parse a timezone offset from the tzdata files
#
# Takes an offset written as [-]h, [-]h:mm or [-]h:mm:ss and returns it
# in minutes with the sign inverted ("1:00" gives -60, "-5:00" gives 300).
# Seconds are ignored.
sub parse_tz_offset($)
{
    my $spec = shift;

    # take the sign from the text and not from the numeric value of the
    # hours: "-0" is not less than zero, so "-0:30" would count as positive
    my $negative = ($spec =~ s/^-//);

    my ($hour, $min) = split /:/, $spec;
    $hour ||= 0;
    $min ||= 0;

    my $minutes = $hour * 60 + $min;
    return $negative ? $minutes : -$minutes;  # invert sign
}
################################################################
# build the timezone data
#
# Parameters:
#   $filename - resource file to generate; the text is written to
#               "$filename.new" and then handed over to save_file()
#   @_        - names of the tzdata files to load
#
# For every Windows zone returned by load_windows_zones this adds the
# zone registry values (names, resource ids, "TZI" data and, when the
# data differs between years, the "Dynamic DST" entries) and writes the
# three zone strings to the string table of the resource file.
# Dies if the output file cannot be created or written, on a tzdata line
# that is not recognized, or if a Windows zone has no tzdata zone.
sub dump_timezones($@)
{
    my $filename = shift;
    my $FIRST_YEAR = 2000;
    my $LAST_YEAR = 2030;

    my %names = load_windows_zones();
    my %zones;        # zone name => list of [ stdoff, rule name, until date... ]
    my %rules;        # rule name => list of [ dst offset, from, to, in, on, at ]
    my %links;        # link name => target zone name
    my %res_indices;  # resource ids that are already taken

    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print $output "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print $output "#include \"winresrc.h\"\n\n";
    print $output "#pragma makedep po\n\n";
    print $output "LANGUAGE LANG_ENGLISH, SUBLANG_DEFAULT\n\n";
    print $output "STRINGTABLE\n{\n";

    # load tzdata files

    foreach my $tzfile (@_)
    {
        my $FILE = open_data_file( "tzdata", $tzfile );
        my $zonename;
        while (<$FILE>)
        {
            chomp;
            s/\#.*$//;
            next if /^\s*$/;
            my @fields = split /\s+/;
            # a line starting with whitespace continues the current zone
            if ($fields[0] eq "Zone" || ($zonename && $fields[0] eq ""))
            {
                shift @fields;
                $zonename = shift @fields unless $zonename;
                my ($stdoff, $rules, $dummy, @date) = @fields;
                $zones{$zonename} ||= [ ];
                push @{$zones{$zonename}}, [ parse_tz_offset( $stdoff ), $rules, @date ];
                $zonename = undef unless @date;  # last entry doesn't have an until date
                next;
            }
            if ($fields[0] eq "Rule")
            {
                shift @fields;
                my ($rulename, $from, $to, $dummy, $in, $on, $at, $save) = @fields;
                $to = $from if $to eq "only";
                $to = $LAST_YEAR if $to eq "max";
                push @{$rules{$rulename}}, [ parse_tz_offset( $save ), $from, $to, $in, $on, $at ];
                next;
            }
            if ($fields[0] eq "Link")
            {
                $links{$fields[2]} = $fields[1];
                next;
            }
            die "unrecognized line $_";
        }
        close $FILE;
    }

    foreach my $name (sort { uc($a) cmp uc($b) } keys %names)
    {
        my ($display, $zone) = @{$names{$name}};
        $zone = $links{$zone} if defined $links{$zone};
        die "no tzdata zone $zone for $name" unless defined $zones{$zone};

        # build list of transitions
        # each one is [ stdoff, dst offset, parsed date ], with a dst
        # offset of -1 for the start of a zone entry

        my @transitions;
        my @from_date = ( 1 );
        my $last_stdoff = 0;
        for (my $i = 0; $i < scalar @{$zones{$zone}}; $i++)
        {
            my ($stdoff, $rule, @until_date) = @{$zones{$zone}->[$i]};
            my $isdst = ($last_stdoff != $stdoff);
            $from_date[0] ||= $LAST_YEAR;
            my @systime = parse_transition_date( $stdoff, $isdst, @from_date );
            push @transitions, [ $stdoff, -1, \@systime ];

            if (defined $rules{$rule})
            {
                foreach my $r (@{$rules{$rule}})
                {
                    my ($offset, $from, $to, $in, $on, $at) = @{$r};
                    foreach my $year ($from..$to)
                    {
                        # only keep the rule dates inside the range of the zone entry
                        next if $year < $from_date[0];
                        next if $until_date[0] && $year > $until_date[0];
                        my @systime = parse_transition_date( $stdoff, !!$offset, $year, $in, $on, $at );
                        next if compare_transition_date( $stdoff, $isdst, \@until_date, \@systime ) <= 0;
                        my $ret = compare_transition_date( $stdoff, $isdst, \@from_date, \@systime );
                        next if $ret > 0;
                        pop @transitions if !$ret;  # remove transition if there's a dst change at the same time
                        push @transitions, [ $stdoff, $offset, \@systime ];
                    }
                }
            }
            @from_date = @until_date;
            $last_stdoff = $stdoff;
        }
        @transitions = sort { compare_systime( $a->[2], $b->[2] ) } @transitions;

        # build per-year dynamic info
        # each entry of @info is three signed 32-bit values (standard
        # offset, 0, daylight offset) followed by the packed standard and
        # daylight dates

        my @info;
        my $last_dstoff = 0;
        my $last_dst = 0;
        my $year = $FIRST_YEAR;
        while ($year <= $LAST_YEAR)
        {
            # transitions of earlier years only update the current offset
            if (@transitions && $transitions[0]->[2]->[0] < $year)
            {
                $last_stdoff = $transitions[0]->[0];
                shift @transitions;
                next;
            }
            my ($std, $dst, @trans);
            my $cur_stdoff = $last_stdoff;
            my $cur_dstoff = ($name =~ /^UTC/) ? 0 : -60;
            while (@transitions && $transitions[0]->[2]->[0] == $year)
            {
                my $t = shift @transitions;
                my ($stdoff, $dstoff, $systime) = @{$t};
                $systime = pack_systime( @{$systime} );
                if (!$dstoff)  # std
                {
                    $cur_stdoff = $stdoff unless $std;
                    $std = $systime;
                }
                elsif ($dstoff != -1)  # dst
                {
                    $cur_dstoff = $dstoff unless $dst;
                    $dst ||= $systime;
                }
                elsif ($stdoff != $last_stdoff)  # rule transition
                {
                    # Handle a special case: Samoa moved to the other side of
                    # the date line between 2011-12-03 and 2012-01-01,
                    # entirely skipping the day 2011-12-31.  We ignore this
                    # change because it happens on a year boundary and more
                    # importantly it would generate an offset of -25 hours,
                    # which some programs (e.g., Mono) do not like.  See
                    # https://bugs.winehq.org/show_bug.cgi?id=51758

                    if ($last_stdoff - $stdoff < 24 * 60)
                    {
                        @trans = ($last_stdoff, $stdoff, $systime);
                        $cur_stdoff = $stdoff;
                    }
                }
                elsif ($dst)  # rule transition with no stdoff change
                {
                    $std = $systime;
                }
                $last_dstoff = ($dstoff == -1) ? 0 : $dstoff;
            }
            $last_stdoff = $cur_stdoff;

            if ($cur_dstoff > 0)  # swap std and dst to ensure that offset is negative
            {
                ($std, $dst) = ($dst, $std);
                $cur_stdoff += $cur_dstoff;
                $cur_dstoff = -$cur_dstoff;
            }

            if (@trans)
            {
                # heuristic to prefer switching dst
                if ($last_dst == $year - 1 || (!$last_dst && $trans[0] > $trans[1]))
                {
                    $dst ||= $trans[2];
                    $cur_stdoff = $trans[0];
                    $cur_dstoff = $trans[1] - $trans[0];
                }
                else
                {
                    $std ||= $trans[2];
                    $cur_stdoff = $trans[1];
                    $cur_dstoff = $trans[0] - $trans[1];
                }
            }

            if ($std || $dst)
            {
                # a date that is still missing defaults to the first of January
                $std ||= pack_systime( parse_transition_date( 0, 0, $year, "Jan", 1 ));
                $dst ||= pack_systime( parse_transition_date( 0, 0, $year, "Jan", 1 ));
                $last_dst = $year;
            }
            else
            {
                # no transition this year: both dates are zeroed
                $std = pack "S<8", 0;
                $dst = pack "S<8", 0;
                $cur_stdoff += $last_dstoff;
            }
            $info[$year++] = pack( "l<3", $cur_stdoff, 0, $cur_dstoff ) . $std . $dst;
        }

        # output registry keys

        my $std_name = $name eq "UTC" ? "Coordinated Universal Time" : $name;
        my $dlt_name = $std_name =~ s/Standard Time/Daylight Time/r;
        # derive the resource id from a hash of the name, and move on by
        # steps of 16 while the id is already taken
        my $res_idx = hex( substr( Digest::SHA::sha1_hex($name), -3, 3 )) << 4;
        $res_idx += 16 while exists $res_indices{$res_idx};
        $res_indices{$res_idx} = 1;

        add_registry_string_value( $zonekey, $name, "Display", $display );
        add_registry_string_value( $zonekey, $name, "Std", $std_name );
        add_registry_string_value( $zonekey, $name, "Dlt", $dlt_name );
        add_registry_string_value( $zonekey, $name, "MUI_Std", sprintf( "\@tzres.dll,-%u", $res_idx ));
        add_registry_string_value( $zonekey, $name, "MUI_Dlt", sprintf( "\@tzres.dll,-%u", $res_idx + 1 ));
        add_registry_string_value( $zonekey, $name, "MUI_Display", sprintf( "\@tzres.dll,-%u", $res_idx + 2 ));
        add_registry_binary_value( $zonekey, $name, "TZI", $info[$LAST_YEAR] );

        printf $output "%7d \"#msgctxt#maximum 31 characters#%s\"\n", $res_idx, $std_name;
        printf $output "%7d \"#msgctxt#maximum 31 characters#%s\"\n", $res_idx + 1, $dlt_name;
        printf $output "%7d \"%s\"\n", $res_idx + 2, $display;

        # trim the years at both ends whose data repeats, and skip the
        # dynamic entries when not more than one year is left
        my $first_year = $FIRST_YEAR;
        my $last_year = $LAST_YEAR;
        $last_year-- while $last_year > $FIRST_YEAR && $info[$last_year] eq $info[$last_year - 1];
        $first_year++ while $first_year < $last_year && $info[$first_year] eq $info[$last_year];

        next if $last_year <= $first_year;

        foreach my $i ($first_year..$last_year)
        {
            add_registry_binary_value( $zonekey, "$name\\Dynamic DST", $i, $info[$i] );
        }
        add_registry_dword_value( $zonekey, "$name\\Dynamic DST", "FirstEntry", $first_year );
        add_registry_dword_value( $zonekey, "$name\\Dynamic DST", "LastEntry", $last_year );
    }

    print $output "}\n";
    # buffered write errors only show up when closing
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# build the script to create registry keys
#
# Parameters:
#   $filename - script file to generate; the text is written to
#               "$filename.new" and then handed over to save_file()
#   %keys     - key path (components separated by backslashes) =>
#               [ default value, value lines... ]
#
# The keys are written as nested blocks below HKLM, sorted without regard
# to case. A key component starting with "-" is output without the dash
# and with a "NoRemove" prefix; a component containing whitespace is quoted.
# Dies if the output file cannot be created or completely written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;
    my @prev;  # components of the previous key that are still open

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print $output "HKLM\n{\n";
    # the backslash is mapped to \001 so that a key sorts before any longer name sharing its prefix
    foreach my $k (sort { ($a =~ tr/a-z\\/A-Z\001/r) cmp ($b =~ tr/a-z\\/A-Z\001/r) } keys %keys)
    {
        my @subkeys = split /\\/, $k;
        # keep the blocks shared with the previous key open, close the others
        while (@prev && @subkeys && $prev[0] eq $subkeys[0]) { shift @prev; shift @subkeys; }
        while (@prev) { printf $output "%*s}\n", 4 * --$indent, ""; shift @prev; }
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            my $name = $subkeys[$i];
            my $prefix = "";
            if ($name =~ /^-/)
            {
                $name =~ s/^-//;
                $prefix = "NoRemove ";
            }
            if ($name =~ /\s/)
            {
                $name = "'$name'";
            }
            # the default value belongs to the last component of the key
            printf $output "%*s%s%s%s\n%*s{\n", 4 * $indent, "", $prefix, $name,
                           $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # pass the value as an argument: a "%" inside it must not be taken as a format directive
        foreach my $v (sort @vals) { printf $output "%*sval %s\n", 4 * $indent, "", $v; }
        @prev = split /\\/, $k;
    }
    while (@prev) { printf $output "%*s}\n", 4 * --$indent, ""; shift @prev; }
    print $output "}\n";
    # buffered write errors only show up when closing
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
################################################################
# save a file if modified
#
# Expects the new content in "$file.new". If $file already exists with
# the same content, the new file is deleted and $file is left untouched;
# otherwise the new file replaces $file.
# Dies if the new file cannot be renamed.
sub save_file($)
{
    my $file = shift;

    # compare in-process rather than through a "cmp" shell command, so
    # that file names with spaces or shell metacharacters work
    require File::Compare;

    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        return unlink "$file.new";
    }
    rename( "$file.new", $file ) or die "Cannot rename $file.new to $file: $!";
    return 1;
}
################################################################
# main routine
#
# Loads the data, then generates every output file in a fixed order.
# The chartypes data returned when dumping the sort keys is passed on to
# the locale table, and the registry script is written last.

# when started from the directory containing make_unicode, work from its parent
if (-f "./make_unicode") { chdir ".."; }
load_data();

foreach my $file ("dlls/gdi32/uniscribe/direction.c", "dlls/dwrite/direction.c", "dlls/wineps.drv/direction.c")
{
    dump_bidi_dir_table( $file );
}
foreach my $file ("dlls/gdi32/uniscribe/mirror.c", "dlls/dwrite/mirror.c")
{
    dump_mirroring( $file );
}
foreach my $file ("dlls/gdi32/uniscribe/bracket.c", "dlls/dwrite/bracket.c")
{
    dump_bracket( $file );
}
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $file ("dlls/gdi32/uniscribe/linebreak.c", "dlls/dwrite/linebreak.c")
{
    dump_linebreak( $file );
}
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/win32u/vertical.c", 1 );
dump_vertical( "dlls/wineps.drv/vertical.c", 0 );
dump_intl_nls("nls/l_intl.nls");
foreach my $file ("nls/normnfc.nls", "nls/normnfd.nls", "nls/normnfkc.nls", "nls/normnfkd.nls", "nls/normidna.nls")
{
    dump_norm_table( $file );
}
my $chartypes = dump_sortkey_table( "nls/sortdefault.nls" );
dump_locales( "nls/locale.nls", $chartypes );
foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
dump_eucjp_codepage();
dump_timezones( "dlls/tzres/tzres.rc", @timezone_files );
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
5844 # Local Variables:
5845 # compile-command: "./make_unicode"
5846 # End: