tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $REPORTS = "http://www.unicode.org/reports";
  30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  32
  33 # Sort keys file
  34 my $SORTKEYS = "tr10/allkeys.txt";
  35
  36 # Default char for undefined mappings
  37 my $DEF_CHAR = ord '?';
  38
  39 # Last valid Unicode character
  40 my $MAX_CHAR = 0x10ffff;
  41
  42 my @allfiles =
  43 (
  44     "CodpageFiles/037.txt",
  45     "CodpageFiles/437.txt",
  46     "CodpageFiles/500.txt",
  47     "CodpageFiles/708.txt",
  48     "CodpageFiles/737.txt",
  49     "CodpageFiles/775.txt",
  50     "CodpageFiles/850.txt",
  51     "CodpageFiles/852.txt",
  52     "CodpageFiles/855.txt",
  53     "CodpageFiles/857.txt",
  54     "CodpageFiles/860.txt",
  55     "CodpageFiles/861.txt",
  56     "CodpageFiles/862.txt",
  57     "CodpageFiles/863.txt",
  58     "CodpageFiles/864.txt",
  59     "CodpageFiles/865.txt",
  60     "CodpageFiles/866.txt",
  61     "CodpageFiles/869.txt",
  62     "CodpageFiles/874.txt",
  63     "CodpageFiles/875.txt",
  64     "CodpageFiles/932.txt",
  65     "CodpageFiles/936.txt",
  66     "CodpageFiles/949.txt",
  67     "CodpageFiles/950.txt",
  68     "CodpageFiles/1026.txt",
  69     "CodpageFiles/1250.txt",
  70     "CodpageFiles/1251.txt",
  71     "CodpageFiles/1252.txt",
  72     "CodpageFiles/1253.txt",
  73     "CodpageFiles/1254.txt",
  74     "CodpageFiles/1255.txt",
  75     "CodpageFiles/1256.txt",
  76     "CodpageFiles/1257.txt",
  77     "CodpageFiles/1258.txt",
  78     "CodpageFiles/1361.txt",
  79     "CodpageFiles/10000.txt",
  80     "CodpageFiles/10001.txt",
  81     "CodpageFiles/10002.txt",
  82     "CodpageFiles/10003.txt",
  83     "CodpageFiles/10004.txt",
  84     "CodpageFiles/10005.txt",
  85     "CodpageFiles/10006.txt",
  86     "CodpageFiles/10007.txt",
  87     "CodpageFiles/10008.txt",
  88     "CodpageFiles/10010.txt",
  89     "CodpageFiles/10017.txt",
  90     "CodpageFiles/10021.txt",
  91     "CodpageFiles/10029.txt",
  92     "CodpageFiles/10079.txt",
  93     "CodpageFiles/10081.txt",
  94     "CodpageFiles/10082.txt",
  95     "CodpageFiles/20127.txt",
  96     "CodpageFiles/20866.txt",
  97     "CodpageFiles/21866.txt",
  98     "CodpageFiles/28591.txt",
  99     "CodpageFiles/28592.txt",
 100     "CodpageFiles/28593.txt",
 101     "CodpageFiles/28594.txt",
 102     "CodpageFiles/28595.txt",
 103     "CodpageFiles/28596.txt",
 104     "CodpageFiles/28597.txt",
 105     "CodpageFiles/28598.txt",
 106     "CodpageFiles/28599.txt",
 107     "CodpageFiles/28603.txt",
 108     "CodpageFiles/28605.txt",
 109 );
 110
 111
 112 my %ctype =
 113 (
 114      # CT_CTYPE1
 115     "upper"  => 0x0001,
 116     "lower"  => 0x0002,
 117     "digit"  => 0x0004,
 118     "space"  => 0x0008,
 119     "punct"  => 0x0010,
 120     "cntrl"  => 0x0020,
 121     "blank"  => 0x0040,
 122     "xdigit" => 0x0080,
 123     "alpha"  => 0x0100 | 0x80000000,
 124     "defin"  => 0x0200,
 125      # CT_CTYPE3 in high 16 bits
 126     "nonspacing"    => 0x00010000,
 127     "diacritic"     => 0x00020000,
 128     "vowelmark"     => 0x00040000,
 129     "symbol"        => 0x00080000,
 130     "katakana"      => 0x00100000,
 131     "hiragana"      => 0x00200000,
 132     "halfwidth"     => 0x00400000,
 133     "fullwidth"     => 0x00800000,
 134     "ideograph"     => 0x01000000,
 135     "kashida"       => 0x02000000,
 136     "lexical"       => 0x04000000,
 137     "highsurrogate" => 0x08000000,
 138     "lowsurrogate"  => 0x10000000,
 139 );
 140
 141 my %bracket_types =
 142 (
 143     "o" => 0x0000,
 144     "c" => 0x0001,
 145 );
 146
 147 my %indic_types =
 148 (
 149     "Other"    => 0x0000,
 150     "Bindu"    => 0x0001,
 151     "Visarga"  => 0x0002,
 152     "Avagraha" => 0x0003,
 153     "Nukta"    => 0x0004,
 154     "Virama"   => 0x0005,
 155     "Vowel_Independent"  => 0x0006,
 156     "Vowel_Dependent"  => 0x0007,
 157     "Vowel"  => 0x0008,
 158     "Consonant_Placeholder"  => 0x0009,
 159     "Consonant"  => 0x000a,
 160     "Consonant_Dead"  => 0x000b,
 161     "Consonant_Succeeding_Repha" => 0x000c,
 162     "Consonant_Subjoined"  => 0x000d,
 163     "Consonant_Medial"  => 0x000e,
 164     "Consonant_Final"  => 0x000f,
 165     "Consonant_Head_Letter"  => 0x0010,
 166     "Modifying_Letter"  => 0x0011,
 167     "Tone_Letter"  => 0x0012,
 168     "Tone_Mark"  => 0x0013,
 169     "Register_Shifter"  => 0x0014,
 170     "Consonant_Preceding_Repha" => 0x0015,
 171     "Pure_Killer" => 0x0016,
 172     "Invisible_Stacker" => 0x0017,
 173     "Gemination_Mark" => 0x0018,
 174     "Cantillation_Mark" => 0x0019,
 175     "Non_Joiner" => 0x001a,
 176     "Joiner" => 0x001b,
 177     "Number_Joiner" => 0x001c,
 178     "Number" => 0x001d,
 179     "Brahmi_Joining_Number" => 0x001e,
 180     "Consonant_With_Stacker" => 0x001f,
 181     "Consonant_Prefixed" => 0x0020,
 182     "Syllable_Modifier" => 0x0021,
 183     "Consonant_Killer" => 0x0022,
 184     "Consonant_Initial_Postfixed" => 0x0023,
 185 );
 186
 187 my %matra_types =
 188 (
 189     "Right"    => 0x01,
 190     "Left"  => 0x02,
 191     "Visual_Order_Left" => 0x03,
 192     "Left_And_Right"    => 0x04,
 193     "Top"   => 0x05,
 194     "Bottom"  => 0x06,
 195     "Top_And_Bottom"  => 0x07,
 196     "Top_And_Right"  => 0x08,
 197     "Top_And_Left"  => 0x09,
 198     "Top_And_Left_And_Right"  => 0x0a,
 199     "Bottom_And_Right"  => 0x0b,
 200     "Top_And_Bottom_And_Right"  => 0x0c,
 201     "Overstruck"  => 0x0d,
 202     "Invisible"  => 0x0e,
 203     "Bottom_And_Left"  => 0x0f,
 204     "Top_And_Bottom_And_Left"  => 0x10,
 205 );
 206
 207 my %break_types =
 208 (
 209     "BK"  => 0x0001,
 210     "CR"  => 0x0002,
 211     "LF"  => 0x0003,
 212     "CM"  => 0x0004,
 213     "SG"  => 0x0005,
 214     "GL"  => 0x0006,
 215     "CB"  => 0x0007,
 216     "SP"  => 0x0008,
 217     "ZW"  => 0x0009,
 218     "NL"  => 0x000a,
 219     "WJ"  => 0x000b,
 220     "JL"  => 0x000c,
 221     "JV"  => 0x000d,
 222     "JT"  => 0x000e,
 223     "H2"  => 0x000f,
 224     "H3"  => 0x0010,
 225     "XX"  => 0x0011,
 226     "OP"  => 0x0012,
 227     "CL"  => 0x0013,
 228     "CP"  => 0x0014,
 229     "QU"  => 0x0015,
 230     "NS"  => 0x0016,
 231     "EX"  => 0x0017,
 232     "SY"  => 0x0018,
 233     "IS"  => 0x0019,
 234     "PR"  => 0x001a,
 235     "PO"  => 0x001b,
 236     "NU"  => 0x001c,
 237     "AL"  => 0x001d,
 238     "ID"  => 0x001e,
 239     "IN"  => 0x001f,
 240     "HY"  => 0x0020,
 241     "BB"  => 0x0021,
 242     "BA"  => 0x0022,
 243     "SA"  => 0x0023,
 244     "AI"  => 0x0024,
 245     "B2"  => 0x0025,
 246     "HL"  => 0x0026,
 247     "CJ"  => 0x0027,
 248     "RI"  => 0x0028,
 249     "EB"  => 0x0029,
 250     "EM"  => 0x002a,
 251     "ZWJ" => 0x002b,
 252 );
 253
 254 my %vertical_types =
 255 (
 256     "R"  => 0x0000,
 257     "U"  => 0x0001,
 258     "Tr" => 0x0002,
 259     "Tu" => 0x0003,
 260 );
 261
 262 my %categories =
 263 (
 264     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 265     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 266     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 267     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 268     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 269     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 270     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 271     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 272     "No" => $ctype{"defin"},                    # Number, Other
 273     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 274     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 275     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 276     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 277     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 278     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 279     "Co" => $ctype{"defin"},                    # Other, Private Use
 280     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 281     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 282     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 283     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 284     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 285     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 286     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 287     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 288     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 289     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 290     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 291     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 292     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 293     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 294 );
 295
 296 # a few characters need additional categories that cannot be determined automatically
 297 my %special_categories =
 298 (
 299     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 300                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 301     "space"  => [ 0x09..0x0d, 0x85 ],
 302     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 303     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 304                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 305                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 306                   0xfff9, 0xfffa, 0xfffb ],
 307     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 308                   0xd7, 0xf7 ],
 309     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 310     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 311     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 312                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 313     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 314     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 315                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 316                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 317     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 318     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 319                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 320                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 321                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 322                      0x3131..0x3164 ],
 323     "ideograph" => [ 0x3006..0x3007 ],
 324     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 325                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 326                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 327                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 328                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 329                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 330                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 331                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 332                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 333                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 334     "kashida" => [ 0x0640 ],
 335 );
 336
 337 my %directions =
 338 (
 339     "L"   => 1,    # Left-to-Right
 340     "R"   => 2,    # Right-to-Left
 341     "AL"  => 12,   # Right-to-Left Arabic
 342     "EN"  => 3,    # European Number
 343     "ES"  => 4,    # European Number Separator
 344     "ET"  => 5,    # European Number Terminator
 345     "AN"  => 6,    # Arabic Number
 346     "CS"  => 7,    # Common Number Separator
 347     "NSM" => 13,   # Non-Spacing Mark
 348     "BN"  => 14,   # Boundary Neutral
 349     "B"   => 8,    # Paragraph Separator
 350     "S"   => 9,    # Segment Separator
 351     "WS"  => 10,   # Whitespace
 352     "ON"  => 11,   # Other Neutrals
 353     "LRE" => 15,   # Left-to-Right Embedding
 354     "LRO" => 15,   # Left-to-Right Override
 355     "RLE" => 15,   # Right-to-Left Embedding
 356     "RLO" => 15,   # Right-to-Left Override
 357     "PDF" => 15,   # Pop Directional Format
 358     "LRI" => 15,   # Left-to-Right Isolate
 359     "RLI" => 15,   # Right-to-Left Isolate
 360     "FSI" => 15,   # First Strong Isolate
 361     "PDI" => 15    # Pop Directional Isolate
 362 );
 363
 364 my %c2_types =
 365 (
 366     "L"   => 1,    # C2_LEFTTORIGHT
 367     "R"   => 2,    # C2_RIGHTTOLEFT
 368     "AL"  => 2,    # C2_RIGHTTOLEFT
 369     "EN"  => 3,    # C2_EUROPENUMBER
 370     "ES"  => 4,    # C2_EUROPESEPARATOR
 371     "ET"  => 5,    # C2_EUROPETERMINATOR
 372     "AN"  => 6,    # C2_ARABICNUMBER
 373     "CS"  => 7,    # C2_COMMONSEPARATOR
 374     "NSM" => 11,   # C2_OTHERNEUTRAL
 375     "BN"  => 0,    # C2_NOTAPPLICABLE
 376     "B"   => 8,    # C2_BLOCKSEPARATOR
 377     "S"   => 9,    # C2_SEGMENTSEPARATOR
 378     "WS"  => 10,   # C2_WHITESPACE
 379     "ON"  => 11,   # C2_OTHERNEUTRAL
 380     "LRE" => 11,   # C2_OTHERNEUTRAL
 381     "LRO" => 11,   # C2_OTHERNEUTRAL
 382     "RLE" => 11,   # C2_OTHERNEUTRAL
 383     "RLO" => 11,   # C2_OTHERNEUTRAL
 384     "PDF" => 11,   # C2_OTHERNEUTRAL
 385     "LRI" => 11,   # C2_OTHERNEUTRAL
 386     "RLI" => 11,   # C2_OTHERNEUTRAL
 387     "FSI" => 11,   # C2_OTHERNEUTRAL
 388     "PDI" => 11    # C2_OTHERNEUTRAL
 389 );
 390
 391 my %bidi_types =
 392 (
 393     "ON"  => 0,    # Other Neutrals
 394     "L"   => 1,    # Left-to-Right
 395     "R"   => 2,    # Right-to-Left
 396     "AN"  => 3,    # Arabic Number
 397     "EN"  => 4,    # European Number
 398     "AL"  => 5,    # Right-to-Left Arabic
 399     "NSM" => 6,    # Non-Spacing Mark
 400     "CS"  => 7,    # Common Number Separator
 401     "ES"  => 8,    # European Number Separator
 402     "ET"  => 9,    # European Number Terminator
 403     "BN"  => 10,   # Boundary Neutral
 404     "S"   => 11,   # Segment Separator
 405     "WS"  => 12,   # Whitespace
 406     "B"   => 13,   # Paragraph Separator
 407     "RLO" => 14,   # Right-to-Left Override
 408     "RLE" => 15,   # Right-to-Left Embedding
 409     "LRO" => 16,   # Left-to-Right Override
 410     "LRE" => 17,   # Left-to-Right Embedding
 411     "PDF" => 18,   # Pop Directional Format
 412     "LRI" => 19,   # Left-to-Right Isolate
 413     "RLI" => 20,   # Right-to-Left Isolate
 414     "FSI" => 21,   # First Strong Isolate
 415     "PDI" => 22    # Pop Directional Isolate
 416 );
 417
 418 my %joining_types =
 419 (
 420    "U" => 0,    # Non_Joining
 421    "T" => 1,    # Transparent
 422    "R" => 2,    # Right_Joining
 423    "L" => 3,    # Left_Joining
 424    "D" => 4,    # Dual_Joining
 425    "C" => 5,    # Join_Causing
 426 );
 427
 428 my @cp2uni = ();
 429 my @glyph2uni = ();
 430 my @lead_bytes = ();
 431 my @uni2cp = ();
 432 my @tolower_table = ();
 433 my @toupper_table = ();
 434 my @digitmap_table = ();
 435 my @category_table = ();
 436 my @joining_table = ();
 437 my @direction_table = ();
 438 my @decomp_table = ();
 439 my @combining_class_table = ();
 440 my @decomp_compat_table = ();
 441 my @comp_exclusions = ();
 442 my @idna_decomp_table = ();
 443 my @idna_disallowed = ();
 444 my %registry_keys;
 445 my $default_char;
 446 my $default_wchar;
 447
 448 my %joining_forms =
 449 (
 450    "isolated" => [],
 451    "final" => [],
 452    "initial" => [],
 453    "medial" => []
 454 );
 455
 456 sub to_utf16(@)
 457 {
 458     my @ret;
 459     foreach my $ch (@_)
 460     {
 461         if ($ch < 0x10000)
 462         {
 463             push @ret, $ch;
 464         }
 465         else
 466         {
 467             my $val = $ch - 0x10000;
 468             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 469         }
 470     }
 471     return @ret;
 472 }
 473
 474 ################################################################
 475 # fetch a unicode.org file and open it
 476 sub open_data_file($$)
 477 {
 478     my ($base, $name) = @_;
 479     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 480     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 481     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 482     local *FILE;
 483
 484     if ($base =~ /.*\/([^\/]+)\.zip$/)
 485     {
 486         my $zip = "$1$suffix.zip";
 487         unless (-f "$cache/$zip")
 488         {
 489             system "mkdir", "-p", $cache;
 490             print "Fetching $base...\n";
 491             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 492         }
 493         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 494     }
 495     else
 496     {
 497         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 498         unless (-f $dest)
 499         {
 500             system "mkdir", "-p", $dir;
 501             print "Fetching $base/$name...\n";
 502             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 503         }
 504         open FILE, "<$dest" or die "cannot open $dest";
 505     }
 506     return *FILE;
 507 }
 508
 509 ################################################################
 510 # recursively get the decomposition for a character
 511 sub get_decomposition($$);
 512 sub get_decomposition($$)
 513 {
 514     my ($char, $table) = @_;
 515     my @ret;
 516
 517     return $char unless defined $table->[$char];
 518     foreach my $ch (@{$table->[$char]})
 519     {
 520         push @ret, get_decomposition( $ch, $table );
 521     }
 522     return @ret;
 523 }
 524
 525 ################################################################
 526 # get the composition that results in a given character
 527 sub get_composition($$)
 528 {
 529     my ($ch, $compat) = @_;
 530     return () unless defined $decomp_table[$ch];  # no decomposition
 531     my @ret = @{$decomp_table[$ch]};
 532     return () if @ret < 2;                        # singleton decomposition
 533     return () if $comp_exclusions[$ch];           # composition exclusion
 534     return () if $combining_class_table[$ch];     # non-starter
 535     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 536     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 537         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 538     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 539         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 540     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 541         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 542     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 543     return @ret;
 544 }
 545
 546 ################################################################
 547 # recursively build decompositions
 548 sub build_decompositions(@)
 549 {
 550     my @src = @_;
 551     my @dst;
 552
 553     for (my $i = 0; $i < @src; $i++)
 554     {
 555         next unless defined $src[$i];
 556         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 557         $dst[$i] = \@decomp;
 558     }
 559     return @dst;
 560 }
 561
 562 ################################################################
 563 # compose Hangul sequences
 564 sub compose_hangul(@)
 565 {
 566     my $SBASE  = 0xac00;
 567     my $LBASE  = 0x1100;
 568     my $VBASE  = 0x1161;
 569     my $TBASE  = 0x11a7;
 570     my $LCOUNT = 19;
 571     my $VCOUNT = 21;
 572     my $TCOUNT = 28;
 573     my $NCOUNT = $VCOUNT * $TCOUNT;
 574     my $SCOUNT = $LCOUNT * $NCOUNT;
 575
 576     my @seq = @_;
 577     my @ret;
 578     my $i;
 579
 580     for ($i = 0; $i < @seq; $i++)
 581     {
 582         my $ch = $seq[$i];
 583         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 584             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 585         {
 586             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 587             $i++;
 588         }
 589         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 590             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 591         {
 592             $ch += $seq[$i+1] - $TBASE;
 593             $i++;
 594         }
 595         push @ret, $ch;
 596     }
 597     return @ret;
 598 }
 599
 600 ################################################################
 601 # remove linguistic-only mappings from the case table
 602 sub remove_linguistic_mappings($$)
 603 {
 604     my ($upper, $lower) = @_;
 605
 606     # remove case mappings that don't round-trip
 607
 608     for (my $i = 0; $i < @{$upper}; $i++)
 609     {
 610         next unless defined ${$upper}[$i];
 611         my $ch = ${$upper}[$i];
 612         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 613     }
 614     for (my $i = 0; $i < @{$lower}; $i++)
 615     {
 616         next unless defined ${$lower}[$i];
 617         my $ch = ${$lower}[$i];
 618         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 619     }
 620 }
 621
 622 ################################################################
 623 # read in the Unicode database files
     #
     # Parses UnicodeData.txt and CompositionExclusions.txt (both from $UNIDATA)
     # and IdnaMappingTable.txt (from $IDNADATA), each obtained through
     # open_data_file(), and fills the file-level tables @category_table,
     # @direction_table, @joining_table, @tolower_table, @toupper_table,
     # @digitmap_table, @combining_class_table, @decomp_table,
     # @decomp_compat_table, %joining_forms, @comp_exclusions,
     # @idna_decomp_table and @idna_disallowed.
     # Takes no arguments.  Dies on a general category that is missing from
     # %categories or a bidi class that is missing from %directions.
 624 sub load_data()
 625 {
 626     my $start;
 627
 628     # now build mappings from the decomposition field of the Unicode database
         # (the same loop also records category, direction, case, digit and
         # combining class data for every line)
 629
 630     my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
 631     while (<$UNICODE_DATA>)
 632     {
 633         # Decode the fields ...
 634         my ($code, $name, $cat, $comb, $bidi,
 635             $decomp, $dec, $dig, $num, $mirror,
 636             $oldname, $comment, $upper, $lower, $title) = split /;/;
 637         my $src = hex $code;
 638
 639         die "unknown category $cat" unless defined $categories{$cat};
 640         die "unknown directionality $bidi" unless defined $directions{$bidi};
 641
 642         $category_table[$src] = $categories{$cat};
             # note: the bidi class NAME is stored here, not its %directions code
 643         $direction_table[$src] = $bidi;
 644         $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
 645
 646         if ($lower ne "")
 647         {
 648             $tolower_table[$src] = hex $lower;
 649         }
 650         if ($upper ne "")
 651         {
 652             $toupper_table[$src] = hex $upper;
 653         }
 654         if ($dec ne "")
 655         {
 656             $category_table[$src] |= $ctype{"digit"};
 657         }
 658         if ($dig ne "")
 659         {
                 # ord yields the character code of the first character of the field
 660             $digitmap_table[$src] = ord $dig;
 661         }
 662         $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
 663
             # additional flags derived from the bidi class and the character names
 664         $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
 665         $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
 666         $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
 667         $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
 668         $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
 669         $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
 670         $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
 671         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
 672         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
 673         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
 674         $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
 675         $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;
 676
 677         # copy the category and direction for everything between First/Last pairs
             # (the combining class is copied as well; the Last entry already has
             # its own values, so the loop stops just before it)
 678         if ($name =~ /, First>/) { $start = $src; }
 679         if ($name =~ /, Last>/)
 680         {
 681             while ($start < $src)
 682             {
 683                 $category_table[$start] = $category_table[$src];
 684                 $direction_table[$start] = $direction_table[$src];
 685                 $combining_class_table[$start] = $combining_class_table[$src];
 686                 $start++;
 687             }
 688         }
 689
 690         next if $decomp eq "";  # no decomposition, skip it
 691
             # any tagged decomposition ("<tag> XXXX ...") goes into the compat
             # table as the list of code points that follow the tag
 692         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
 693         {
 694             my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
 695             $decomp_compat_table[$src] = \@seq;
 696         }
 697
 698         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 699         {
 700             # decomposition of the form "<foo> 1234" -> use char if type is known
                 # (recorded in reverse: index is the target char, value is this form)
 701             if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
 702             {
 703                 ${joining_forms{$1}}[hex $2] = $src;
 704             }
 705         }
 706         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 707         {
 708             # decomposition "<compat> 0020 1234" -> combining accent
                 # (deliberately empty: nothing further is recorded for these)
 709         }
 710         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 711         {
 712             # store decomposition
                 # (untagged entries of one or two characters go into both the
                 # canonical and the compat table, sharing one array)
 713             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 714             {
 715                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
 716             }
 717             elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
 718             {
 719                 # Single char decomposition
 720                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
 721             }
 722         }
 723     }
 724     close $UNICODE_DATA;
 725
 726     # patch the category of some special characters
 727
         # a decomposable character inherits every flag of the first character
         # of its decomposition
 728     for (my $i = 0; $i < @decomp_table; $i++)
 729     {
 730         next unless defined $decomp_table[$i];
 731         $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
 732     }
         # hand-maintained additions from %special_categories
 733     foreach my $cat (keys %special_categories)
 734     {
 735         my $flag = $ctype{$cat};
 736         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 737     }
         # a two-character compat decomposition passes on the diacritic flag of
         # its second character (this runs after the special categories above)
 738     for (my $i = 0; $i < @decomp_compat_table; $i++)
 739     {
 740         next unless defined $decomp_compat_table[$i];
 741         next unless @{$decomp_compat_table[$i]} == 2;
 742         $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
 743     }
 744
 745     # load the composition exclusions
         # (each line is either a single code point or a "first..last" range)
 746
 747     my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
 748     while (<$EXCL>)
 749     {
 750         s/\#.*//;  # remove comments
 751         if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
 752         {
 753             foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
 754         }
 755         elsif (/^([0-9a-fA-F]+)\s*$/)
 756         {
 757             $comp_exclusions[hex $1] = 1;
 758         }
 759     }
 760     close $EXCL;
 761
 762     # load the IDNA mappings
         # The IDNA table starts out as a shallow copy of the compat table (the
         # entries are the same array references) and is then overridden per
         # status.  The status patterns are unanchored and tested in order, so
         # any status containing "mapped" or "deviation" takes the first branch.
 763
 764     @idna_decomp_table = @decomp_compat_table;
 765     my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
 766     while (<$IDNA>)
 767     {
 768         s/\#.*//;  # remove comments
 769         next if /^\s*$/;
 770         my ($char, $type, $mapping) = split /;/;
 771         my ($ch1, $ch2);
 772         if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
 773         {
 774             $ch1 = hex $1;
 775             $ch2 = hex $2;
 776         }
 777         elsif ($char =~ /([0-9a-fA-F]+)/)
 778         {
 779             $ch1 = $ch2 = hex $1;
 780         }
 781
 782         if ($type =~ /mapped/ || $type =~ /deviation/)
 783         {
                 # every character of the range shares one sequence array; an
                 # empty mapping is stored as [ 0 ]
 784             $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
 785             my @seq = map { hex $_; } split /\s+/, $mapping;
 786             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
 787         }
 788         elsif ($type =~ /valid/)
 789         {
                 # nothing to override: the entry copied from the compat table stays
 790         }
 791         elsif ($type =~ /ignored/)
 792         {
 793             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
 794         }
 795         elsif ($type =~ /disallowed/)
 796         {
                 # drop any copied decomposition and flag the character instead
 797             foreach my $i ($ch1 .. $ch2)
 798             {
 799                 $idna_decomp_table[$i] = undef;
 800                 $idna_disallowed[$i] = 1;
 801             }
 802         }
 803     }
 804     close $IDNA;
 805 }
 806
 807
 808 ################################################################
 809 # add a new registry key
 810 sub add_registry_key($$)
 811 {
 812     my ($key, $defval) = @_;
 813     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 814 }
 815
 816 ################################################################
 817 # add a new registry value
 818 sub add_registry_value($$$)
 819 {
 820     my ($key, $name, $value) = @_;
 821     add_registry_key( $key, undef );
 822     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 823 }
 824
 825 ################################################################
 826 # define a new lead byte
 827 sub add_lead_byte($)
 828 {
 829     my $ch = shift;
 830     return if defined $cp2uni[$ch];
 831     push @lead_bytes, $ch;
 832     $cp2uni[$ch] = 0;
 833 }
 834
 835 ################################################################
 836 # define a new char mapping
 837 sub add_mapping($$)
 838 {
 839     my ($cp, $uni) = @_;
 840     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 841     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 842     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 843 }
 844
 845 ################################################################
 846 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 847 sub get_glyphs_mapping(@)
 848 {
 849     my @table = @_;
 850
 851     for (my $i = 0; $i < @glyph2uni; $i++)
 852     {
 853         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 854     }
 855     return @table;
 856 }
 857
 858 ################################################################
 859 # build EUC-JP table from the JIS 0208/0212 files
 860 sub dump_eucjp_codepage()
 861 {
 862     @cp2uni = ();
 863     @glyph2uni = ();
 864     @lead_bytes = ();
 865     @uni2cp = ();
 866     $default_char = $DEF_CHAR;
 867     $default_wchar = 0x30fb;
 868
 869     # ASCII chars
 870     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 871
 872     # lead bytes
 873     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 874
 875     # JIS X 0201 right plane
 876     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 877
 878     # undefined chars
 879     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 880     $cp2uni[0xa0] = 0xf8f0;
 881     $cp2uni[0xff] = 0xf8f3;
 882
 883     # Fix backslash conversion
 884     add_mapping( 0xa1c0, 0xff3c );
 885
 886     # Add private mappings for rows undefined in JIS 0208/0212
 887     my $private = 0xe000;
 888     foreach my $hi (0xf5 .. 0xfe)
 889     {
 890         foreach my $lo (0xa1 .. 0xfe)
 891         {
 892             add_mapping( ($hi << 8) + $lo, $private++ );
 893         }
 894     }
 895     foreach my $hi (0xf5 .. 0xfe)
 896     {
 897         foreach my $lo (0x21 .. 0x7e)
 898         {
 899             add_mapping( ($hi << 8) + $lo, $private++ );
 900         }
 901     }
 902
 903     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 904     while (<$INPUT>)
 905     {
 906         next if /^\#/;  # skip comments
 907         next if /^$/;  # skip empty lines
 908         next if /\x1a/;  # skip ^Z
 909         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 910         {
 911             add_mapping( 0x8080 + hex $1, hex $2 );
 912             next;
 913         }
 914         die "Unrecognized line $_\n";
 915     }
 916     close $INPUT;
 917
 918     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 919     while (<$INPUT>)
 920     {
 921         next if /^\#/;  # skip comments
 922         next if /^$/;  # skip empty lines
 923         next if /\x1a/;  # skip ^Z
 924         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 925         {
 926             add_mapping( 0x8000 + hex $1, hex $2 );
 927             next;
 928         }
 929         die "Unrecognized line $_\n";
 930     }
 931     close $INPUT;
 932
 933     output_codepage_file( 20932 );
 934 }
 935
 936
 937 ################################################################
 938 # build the sort keys table
 939 sub dump_sortkeys($)
 940 {
 941     my $filename = shift;
 942     my @sortkeys = ();
 943
 944     my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
 945     while (<$INPUT>)
 946     {
 947         next if /^\#/;  # skip comments
 948         next if /^$/;  # skip empty lines
 949         next if /\x1a/;  # skip ^Z
 950         next if /^\@version/;  # skip @version header
 951         if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 952         {
 953             my ($uni,$variable) = (hex $1, $2);
 954             next if $uni > 65535;
 955             $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
 956             next;
 957         }
 958         if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 959         {
 960             # multiple character sequence, ignored for now
 961             next;
 962         }
 963         die "$SORTKEYS: Unrecognized line $_\n";
 964     }
 965     close $INPUT;
 966
 967     # compress the keys to 32 bit:
 968     # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
 969
 970     @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
 971                        ${$a}[2] <=> ${$b}[2] or
 972                        ${$a}[3] <=> ${$b}[3] or
 973                        ${$a}[4] <=> ${$b}[4] or
 974                        $a cmp $b; } @sortkeys;
 975
 976     my ($n2, $n3) = (1, 1);
 977     my @keys = (-1, -1, -1, -1, -1 );
 978     my @flatkeys = ();
 979
 980     for (my $i = 0; $i < @sortkeys; $i++)
 981     {
 982         next unless defined $sortkeys[$i];
 983         my @current = @{$sortkeys[$i]};
 984         if ($current[1] == $keys[1])
 985         {
 986             if ($current[2] == $keys[2])
 987             {
 988                 if ($current[3] == $keys[3])
 989                 {
 990                     # nothing
 991                 }
 992                 else
 993                 {
 994                     $keys[3] = $current[3];
 995                     $n3++;
 996                     die if ($n3 >= 16);
 997                 }
 998             }
 999             else
1000             {
1001                 $keys[2] = $current[2];
1002                 $keys[3] = $current[3];
1003                 $n2++;
1004                 $n3 = 1;
1005                 die if ($n2 >= 256);
1006             }
1007         }
1008         else
1009         {
1010             $keys[1] = $current[1];
1011             $keys[2] = $current[2];
1012             $keys[3] = $current[3];
1013             $n2 = 1;
1014             $n3 = 1;
1015         }
1016
1017         if ($current[2]) { $current[2] = $n2; }
1018         if ($current[3]) { $current[3] = $n3; }
1019         if ($current[4]) { $current[4] = 1; }
1020
1021         $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1022     }
1023
1024     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1025     printf "Building $filename\n";
1026     printf OUTPUT "/* Unicode collation element table */\n";
1027     printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1028     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1029     print OUTPUT "#include \"windef.h\"\n\n";
1030
1031     dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1032
1033     close OUTPUT;
1034     save_file($filename);
1035 }
1036
1037
################################################################
# Format an array of integers as C initializer text.
#
#   $bit_width - width of one value in bits; a value is printed as
#                "0x" followed by $bit_width / 4 zero-padded hex digits
#   $default   - value printed in place of undefined elements
#   @array     - the values to print
#
# Returns a string with eight values per line, each line indented by
# four spaces, values separated by commas, without a trailing comma or
# newline.  An empty array yields a single default value.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;

    # an empty array is still printed as one (default) value
    my $last = @array ? $#array : 0;
    my @cells = map { sprintf $format, defined $array[$_] ? $array[$_] : $default } 0 .. $last;

    my @lines;
    while (@cells) { push @lines, join( ", ", splice( @cells, 0, 8 ) ); }
    return "    " . join( ",\n    ", @lines );
}
1054
1055
################################################################
# dump an SBCS mapping table in binary format
#
#   $codepage - code page number stored in the header
#
# Writes to the OUTPUT file handle, in this order: the header words,
# twelve zero bytes, the offset of the Unicode -> code page table
# followed by the 256 entries of @cp2uni, the glyph table (a count of
# 256 and the glyph mapping when @glyph2uni is not empty, a single 0
# otherwise), two zero words, and one byte per Unicode char from
# 0 to 65535 taken from @uni2cp ($default_char where undefined).
# All 16-bit words are written little-endian.
sub dump_binary_sbcs_table($)
{
    my ($codepage) = @_;

    my @header = ( 13, $codepage, 1, $default_char, $default_wchar,
                   $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # the glyph table, when present, adds another 256 words
    my $wc_offset = 256 + 3;
    $wc_offset += 256 if @glyph2uni;

    print OUTPUT pack( "S<*", @header );
    print OUTPUT pack( "C12", (0) x 12 );
    print OUTPUT pack( "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255] );

    my @glyph_words = @glyph2uni ? ( 256, get_glyphs_mapping(@cp2uni[0 .. 255]) ) : ( 0 );
    print OUTPUT pack( "S<*", @glyph_words );

    print OUTPUT pack( "S<*", 0, 0 );

    print OUTPUT pack( "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535] );
}
1082
1083
################################################################
# dump a DBCS mapping table in binary format
#
#   $codepage - code page number stored in the header
#
# Writes to the OUTPUT file handle, in this order: the header words,
# twelve bytes holding the lead byte ranges (zero padded), the offset
# of the Unicode -> code page table followed by the 256 single byte
# entries of @cp2uni, a zero word, the number of lead byte ranges and
# the per-byte offsets of the trail byte tables, one 256 entry table
# per lead byte, a word with the value 4, and one word per Unicode
# char from 0 to 65535 taken from @uni2cp.
# All 16-bit words are written little-endian.
#
# Side effect: the @cp2uni entry of every lead byte is set to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own table of 256 entries, in @lead_bytes order
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # "C12" takes exactly twelve values: the range bytes, then zero padding.
    # The padding must be a list, (0) x 12; a bare 0 x 12 is the single
    # string "000000000000".
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
1116
1117
################################################################
# Get the list of defined lead byte ranges.
#
# Returns a flat list (first1, last1, first2, last2, ...) describing
# the runs of consecutive byte values found in @lead_bytes, in
# ascending order.  Only byte values 0 to 255 are considered.
sub get_lb_ranges()
{
    my @is_lead = ();
    my @ranges = ();

    $is_lead[$_] = 1 foreach @lead_bytes;

    # every change between "inside a run" and "outside a run" adds one
    # boundary: the current byte when a run starts, the previous byte
    # when it ends
    my $inside = 0;
    foreach my $byte (0 .. 255)
    {
        my $lead = $is_lead[$byte] ? 1 : 0;
        next if $lead == $inside;
        push @ranges, $inside ? $byte - 1 : $byte;
        $inside = $lead;
    }

    # a run that reaches the last byte value ends there
    push @ranges, 0xff if $inside;
    return @ranges;
}
1141
################################################################
# dump the Indic Syllabic Category table
#
#   $filename - name of the generated C file
#
# Each table entry holds the syllabic category (from
# IndicSyllabicCategory.txt, looked up in %indic_types) with the
# positional category (from IndicPositionalCategory.txt, looked up in
# %matra_types) or'ed in above bit 8.  The table is written as the
# two-level table "indic_syllabic_table".
#
# Dies on a malformed input line or an unknown category name.
sub dump_indic($)
{
    my ($filename) = @_;
    my @indic_table;

    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        # comments, empty lines and ^Z carry no data
        next if /^\#/ or /^\s*$/ or /\x1a/;

        # a line describes either a single char or a range of chars
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown indic $type" unless defined $indic_types{$type};

        # entries that do not lie entirely below 65536 are dropped
        next unless $first < 65536 and $last < 65536;
        foreach my $ch ($first .. $last)
        {
            $indic_table[$ch] = $indic_types{$type};
        }
    }
    close $INPUT;

    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        # comments, empty lines and ^Z carry no data
        next if /^\#/ or /^\s*$/ or /\x1a/;

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown matra $type" unless defined $matra_types{$type};

        # the positional category goes above the syllabic one
        foreach my $ch ($first .. $last)
        {
            $indic_table[$ch] |= $matra_types{$type} << 8;
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT $_ foreach
    (
        "/* Unicode Indic Syllabic Category */\n",
        "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n",
        "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n",
        "/* DO NOT EDIT!! */\n\n",
        "#include \"windef.h\"\n\n",
    );

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
1222
################################################################
# dump the Line Break Properties table
#
#   $filename - name of the generated C file
#
# Reads LineBreak.txt, translates the break class of every char
# through %break_types and writes the result as the two-level table
# "wine_linebreak_table"; chars without an entry get the value of
# class 'XX'.
#
# Dies on a malformed input line or an unknown break class.
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        # "XXXX;CLASS" or "XXXX..YYYY;CLASS", where CLASS consists of
        # two or three upper case letters or digits (three are
        # preferred when both would match)
        die "malformed line $_"
            unless /^\s*([0-9a-fA-F]+)(?:\.\.\s*([0-9a-fA-F]+))?\s*;\s*([0-9A-Z]{2,3})/;

        # a single char is handled as a range of one
        my ($first, $last, $type) = (hex($1), hex(defined $2 ? $2 : $1), $3);
        die "unknown breaktype $type" unless defined $break_types{$type};
        foreach my $i ($first .. $last)
        {
            $break_table[$i] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n";
    print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
1287
# Script name => script id.
#
# The ids are consecutive and follow the order of the list below,
# starting with 0 for "Unknown": the id of a script is its position in
# the list.  New scripts must therefore be appended at the end; the
# comments give the first and last id of each line.
my %scripts;
{
    my $next_id = 0;
    foreach my $name
    (
        qw( Unknown Common Inherited Arabic Armenian ),                                     # 0 .. 4
        qw( Avestan Balinese Bamum Batak Bengali ),                                         # 5 .. 9
        qw( Bopomofo Brahmi Braille Buginese Buhid ),                                       # 10 .. 14
        qw( Canadian_Aboriginal Carian Cham Cherokee Coptic ),                              # 15 .. 19
        qw( Cuneiform Cypriot Cyrillic Deseret Devanagari ),                                # 20 .. 24
        qw( Egyptian_Hieroglyphs Ethiopic Georgian Glagolitic Gothic ),                     # 25 .. 29
        qw( Greek Gujarati Gurmukhi Han Hangul ),                                           # 30 .. 34
        qw( Hanunoo Hebrew Hiragana Imperial_Aramaic Inscriptional_Pahlavi ),               # 35 .. 39
        qw( Inscriptional_Parthian Javanese Kaithi Kannada Katakana ),                      # 40 .. 44
        qw( Kayah_Li Kharoshthi Khmer Lao Latin ),                                          # 45 .. 49
        qw( Lepcha Limbu Linear_B Lisu Lycian ),                                            # 50 .. 54
        qw( Lydian Malayalam Mandaic Meetei_Mayek Mongolian ),                              # 55 .. 59
        qw( Myanmar New_Tai_Lue Nko Ogham Ol_Chiki ),                                       # 60 .. 64
        qw( Old_Italic Old_Persian Old_South_Arabian Old_Turkic Oriya ),                    # 65 .. 69
        qw( Osmanya Phags_Pa Phoenician Rejang Runic ),                                     # 70 .. 74
        qw( Samaritan Saurashtra Shavian Sinhala Sundanese ),                               # 75 .. 79
        qw( Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le ),                                  # 80 .. 84
        qw( Tai_Tham Tai_Viet Tamil Telugu Thaana ),                                        # 85 .. 89
        qw( Thai Tibetan Tifinagh Ugaritic Vai Yi ),                                        # 90 .. 95
        # Win8/Win8.1
        qw( Chakma Meroitic_Cursive Meroitic_Hieroglyphs Miao ),                            # 96 .. 99
        qw( Sharada Sora_Sompeng Takri ),                                                   # 100 .. 102
        # Win10
        qw( Bassa_Vah Caucasian_Albanian Duployan Elbasan Grantha ),                        # 103 .. 107
        qw( Khojki Khudawadi Linear_A Mahajani Manichaean ),                                # 108 .. 112
        qw( Mende_Kikakui Modi Mro Nabataean Old_North_Arabian ),                           # 113 .. 117
        qw( Old_Permic Pahawh_Hmong Palmyrene Pau_Cin_Hau Psalter_Pahlavi ),                # 118 .. 122
        qw( Siddham Tirhuta Warang_Citi ),                                                  # 123 .. 125
        # Win10 RS1
        qw( Adlam Ahom Anatolian_Hieroglyphs Bhaiksuki Hatran Marchen ),                    # 126 .. 131
        qw( Multani Newa Old_Hungarian Osage SignWriting Tangut ),                          # 132 .. 137
        # Win10 RS4
        qw( Masaram_Gondi Nushu Soyombo Zanabazar_Square ),                                 # 138 .. 141
        # Win10 1903
        qw( Dogra Gunjala_Gondi Hanifi_Rohingya Makasar ),                                  # 142 .. 145
        qw( Medefaidrin Old_Sogdian Sogdian ),                                              # 146 .. 148
        # Win10 2004
        qw( Elymaic Nyiakeng_Puachue_Hmong Nandinagari Wancho ),                            # 149 .. 152
    )
    {
        $scripts{$name} = $next_id++;
    }
}
1450
################################################################
# dump Script IDs table
#
#   $filename - base name of the generated files
#
# Two files are produced: "$filename.h" with the unicode_script_id
# enumeration built from %scripts, and "$filename.c" with the
# two-level table "wine_scripts_table" that gives the script id of
# each char according to Scripts.txt.
#
# Input lines that match neither entry format, and scripts that are
# not listed in %scripts, are skipped silently.
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        my $type = "";

        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            $type = $2;
            if (defined $scripts{$type})
            {
                $scripts_table[hex $1] = $scripts{$type};
            }
            next;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            $type = $3;
            if (defined $scripts{$type})
            {
                foreach my $i (hex $1 .. hex $2)
                {
                    $scripts_table[$i] = $scripts{$type};
                }
            }
            next;
        }
    }

    close $INPUT;

    # the header with the enumeration of all known scripts, ordered by id
    my $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT "    Script_$script = $scripts{$script},\n";
    }
    print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    # the source file with the char -> script id table
    $filename = "$filename.c";
    # report the file that is actually being created, not the header
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($filename);
}
1526
################################################################
# dump the BiDi mirroring table
#
#   $filename - name of the generated C file
#
# Reads the "char; mirrored char" pairs of BidiMirroring.txt and
# writes them as the two-level table "wine_mirror_map"; chars without
# an entry get the value 0.
#
# Dies on a malformed input line.
sub dump_mirroring($)
{
    my ($filename) = @_;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        # comments, empty lines and ^Z carry no data
        next if /^\#/ or /^$/ or /\x1a/;
        /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/ or die "malformed line $_";
        my ($char, $mirror) = (hex($1), hex($2));
        $mirror_table[$char] = $mirror;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT $_ foreach
    (
        "/* Unicode BiDi mirroring */\n",
        "/* generated from $UNIDATA:BidiMirroring.txt */\n",
        "/* DO NOT EDIT!! */\n\n",
        "#include \"windef.h\"\n\n",
    );
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
1559
################################################################
# dump the Bidi Brackets
#
#   $filename - name of the generated C file
#
# Reads BidiBrackets.txt and writes the two-level table
# "bidi_bracket_table".  The entry of a bracket char holds, in its low
# byte, the distance from the char to its paired bracket and, above
# bit 8, the bracket type looked up in %bracket_types.  Chars without
# an entry get the value 0.
#
# Dies on a malformed input line, an unknown bracket type, or a pair
# whose chars are 128 or more apart.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my $type = $3;
            die "unknown bracket $type" unless defined $bracket_types{$type};
            die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
            # The distance lies between -127 and 127 (checked above).
            # Reducing it modulo 256 gives its 8-bit two's complement
            # form; modulo 255 would be off by one for negative
            # distances (-1 would become 0xfe instead of 0xff).
            $bracket_table[hex $1] = (hex($2) - hex($1)) % 256;
            $bracket_table[hex $1] += $bracket_types{$type} << 8;
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
1598
################################################################
# dump the Arabic shaping table
#
#   $filename - name of the generated C file
#
# Reads ArabicShaping.txt and stores, for each listed char, its
# joining type (looked up in %joining_types) with the number of its
# joining group above bit 8 in @joining_table.  Groups are numbered in
# order of first appearance, "No_Joining_Group" being number 0.  The
# table is written as the two-level table "wine_shaping_table",
# followed by the array "wine_shaping_forms" that lists, for the chars
# 0x600 to 0x6ff, the isolated, final, initial and medial forms taken
# from %joining_forms (the char itself where no form is known).
#
# Dies on a malformed input line.
sub dump_shaping($)
{
    my ($filename) = @_;
    my %groups = ( "No_Joining_Group" => 0 );
    my $next_group = 1;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        # comments, empty lines and ^Z carry no data
        next if /^\#/ or /^\s*$/ or /\x1a/;
        /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/ or die "malformed line $_";
        my ($char, $type, $group) = (hex($1), $2, $3);

        # a group gets the next free number the first time it is seen
        $groups{$group} //= $next_group++;
        $joining_table[$char] = $joining_types{$type} | ($groups{$group} << 8);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT $_ foreach
    (
        "/* Unicode Arabic shaping */\n",
        "/* generated from $UNIDATA:ArabicShaping.txt */\n",
        "/* DO NOT EDIT!! */\n\n",
        "#include \"windef.h\"\n\n",
    );

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $char (0x600 .. 0x6ff)
    {
        # a missing (or zero) form falls back to the char itself
        my @forms = map { $joining_forms{$_}[$char] || $char } ("isolated", "final", "initial", "medial");
        printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
1650
################################################################
# dump the Vertical Orientation table
#
#   $filename - name of the generated C file
#
# Reads VerticalOrientation.txt, translates the orientation of each
# char through %vertical_types and writes the result as the two-level
# table "vertical_orientation_table"; chars without an entry get the
# value of orientation 'R'.
#
# Dies on a malformed input line or an unknown orientation.
sub dump_vertical($)
{
    my ($filename) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        # comments, empty lines and ^Z carry no data
        next if /^\#/ or /^\s*$/ or /\x1a/;

        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # single char; only stored when it lies below 65536
            my ($char, $type) = (hex($1), $2);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            $vertical_table[$char] = $vertical_types{$type} if $char < 65536;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            # range of chars, stored without a limit check
            my ($first, $last, $type) = (hex($1), hex($2), $3);
            die "unknown vertical $type" unless defined $vertical_types{$type};
            foreach my $char ($first .. $last)
            {
                $vertical_table[$char] = $vertical_types{$type};
            }
        }
        else
        {
            die "malformed line $_";
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT $_ foreach
    (
        "/* Unicode Vertical Orientation */\n",
        "/* generated from $UNIDATA:VerticalOrientation.txt */\n",
        "/* DO NOT EDIT!! */\n\n",
        "#include \"windef.h\"\n\n",
    );

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
1700
################################################################
# dump the digit folding tables
#
#   $filename - name of the generated C file
#
# Writes the contents of @digitmap_table as the two-level table
# "wine_digitmap"; entries that are not set get the value 0.
sub dump_digit_folding($)
{
    my ($filename) = @_;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT $_ foreach
    (
        "/* Unicode digit folding mappings */\n",
        "/* generated from $UNIDATA:UnicodeData.txt */\n",
        "/* DO NOT EDIT!! */\n\n",
        "#include \"windef.h\"\n\n",
    );

    dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
    close OUTPUT;
    save_file($filename);
}
1717
1718
1719 ################################################################
1720 # compress a mapping table by removing identical rows
sub compress_array($$@)
{
    # Split a table into $rows rows of equal length and store the rows in a
    # single data area, sharing storage between rows wherever possible.
    #   $rows - number of rows to cut the table into
    #   $def  - value substituted for undefined table entries
    #   rest  - the table values
    # Returns a flat list: first one offset per row, then the data area.
    # Each offset is $rows + the row's position in the data area, i.e. it is
    # an index into the returned list itself.
    my ($rows, $def, @table) = @_;
    my $len = @table / $rows;
    my @offsets;
    my $data = "";

    foreach my $row (0 .. $rows - 1)
    {
        # rows are handled as strings (one character per value) so that
        # index() and substr() can do the searching
        my @values = map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "U*", @values;

        my $pos = index $data, $rowtxt;
        if ($pos < 0)
        {
            # the row is not present yet: look for the longest tail of the
            # data that is also a prefix of the row, and drop that tail so
            # that appending the row re-creates it
            my $first = substr( $rowtxt, 0, 1 );
            my $tail = length($data) - 1;
            while ($tail > 0)
            {
                # skip ahead to the next place where the row could start
                my $skip = index( substr( $data, -$tail ), $first );
                last if $skip == -1;
                $tail -= $skip;
                if (substr( $data, -$tail ) eq substr( $rowtxt, 0, $tail ))
                {
                    substr( $data, -$tail ) = "";
                    last;
                }
                $tail--;
            }
            $pos = length $data;
            $data .= $rowtxt;
        }
        push @offsets, $rows + $pos;
    }
    return @offsets, unpack "U*", $data;
}
1755
1756 ################################################################
1757 # dump a char -> 16-bit value mapping table using two-level tables
sub dump_two_level_mapping($$@)
{
    # Print to OUTPUT a C array that maps the characters 0..65535 to values
    # through two levels of offset tables built with compress_array().
    #   $name - identifier of the generated C array
    #   $def  - value used for characters that have no entry in the table
    #   $size - 16 selects "unsigned short" elements, anything else
    #           "unsigned int"; it is also handed to dump_array()
    #   rest  - the table, indexed by character
    my ($name, $def, $size, @values) = @_;
    my $ctype = ($size == 16) ? "unsigned short" : "unsigned int";

    # values cut into 4096 rows, then the row offsets cut into 256 rows
    my @row_array = compress_array( 4096, $def, @values[0..65535] );
    my @array = compress_array( 256, 0, @row_array[0..4095] );

    # the first 4096 entries of @row_array are not printed on their own,
    # so rebase the level 2 offsets onto the final array layout
    my $rebase = @array - 4096;
    $array[$_] += $rebase foreach 256 .. $#array;

    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $ctype, $name, @array + @row_array - 4096;

    foreach my $section ([ "    /* level 1 offsets */\n%s,\n", [ @array[0..255] ] ],
                         [ "    /* level 2 offsets */\n%s,\n", [ @array[256..$#array] ] ],
                         [ "    /* values */\n%s\n};\n", [ @row_array[4096..$#row_array] ] ])
    {
        my ($format, $data) = @{$section};
        printf OUTPUT $format, dump_array( $size, 0, @{$data} );
    }
}
1774
1775 ################################################################
1776 # dump a char -> value mapping table using three-level tables
sub dump_three_level_mapping($$@)
{
    # Print to OUTPUT a C array that maps the characters 0..$MAX_CHAR to
    # values through three levels of offset tables, each level indexed by
    # 16 entries per row.
    #   $name - identifier of the generated C array
    #   $def  - value used for characters that have no entry in the table
    #   $size - 16 selects "unsigned short" elements, anything else
    #           "unsigned int"; it is also handed to dump_array()
    #   rest  - the table, indexed by character
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $ctype = ($size == 16) ? "unsigned short" : "unsigned int";

    # number of rows at each level
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;

    my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # rebase the offsets onto the layout of the final, concatenated array
    my $rebase2 = @array1 + @array2 - $level2 - $level3;
    my $rebase1 = @array1 - $level2;
    $array2[$_] += $rebase2 foreach $level2 .. $#array2;
    $array1[$_] += $rebase1 foreach $level1 .. $#array1;

    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $ctype, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);

    foreach my $section ([ "    /* level 1 offsets */\n%s,\n", [ @array1[0..$level1-1] ] ],
                         [ "    /* level 2 offsets */\n%s,\n", [ @array1[$level1..$#array1] ] ],
                         [ "    /* level 3 offsets */\n%s,\n", [ @array2[$level2..$#array2] ] ],
                         [ "    /* values */\n%s\n};\n", [ @array3[$level3..$#array3] ] ])
    {
        my ($format, $data) = @{$section};
        printf OUTPUT $format, dump_array( $size, 0, @{$data} );
    }
}
1799
1800 ################################################################
1801 # dump a binary case mapping table in l_intl.nls format
sub dump_binary_case_table(@)
{
    # Build the binary form of a case mapping table.
    #   @table - case mapping indexed by character; undefined entries mean
    #            "no mapping"
    # Only the characters below 0x10000 are used.  Each mapping is stored as
    # the 16-bit difference between the mapped character and the character
    # itself, compressed through two levels of 16-entry rows.
    # Returns a string of little-endian 16-bit words: a leading word holding
    # 1 + the number of words that follow, then the offset tables and the
    # difference values.
    my @table = @_;
    my $max_char = 0x10000;
    my $level1 = $max_char / 16;
    my $level2 = $level1 / 16;

    # per-character difference, left undefined where there is no mapping
    my @difftable = map { defined $table[$_] ? (($table[$_] - $_) & 0xffff) : undef } 0 .. $max_char - 1;

    my @row_array = compress_array( $level1, 0, @difftable );
    my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );

    # rebase the second-level offsets onto the final layout
    my $offset = @array - $level1;
    $array[$_] += $offset foreach $level2 .. $#array;

    return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
}
1822
1823 ################################################################
1824 # dump case mappings for l_intl.nls
sub dump_intl_nls($)
{
    # Write the case mapping file (l_intl.nls format).
    #   $filename - name of the generated file; the data is written to
    #               "$filename.new" and $filename is then passed to save_file()
    # The upper and lower case tables are copies of the file-level
    # @toupper_table and @tolower_table after remove_linguistic_mappings()
    # has been applied to them.
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name is data, not a printf format string
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
1845
1846
1847 ################################################################
1848 # dump the bidi direction table
sub dump_bidi_dir_table($)
{
    # Write the BiDi direction table as a C source file.
    #   $filename - name of the generated file; the text is written to
    #               "$filename.new" and $filename is then passed to save_file()
    # For the characters 0..65535 the direction names found in the file-level
    # @direction_table are translated through %bidi_types; characters without
    # a direction get the value of $bidi_types{"L"}.
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";

    # plain print throughout: none of these strings is a printf format, and
    # the file name in particular must not be interpreted as one
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
1870
1871
sub rol($$)
{
    # Rotate an 8-bit value to the left.
    #   $byte  - value to rotate
    #   $count - number of bit positions
    # Returns the rotated value, masked to 8 bits.
    my $byte = shift;
    my $count = shift;

    my $shifted = $byte << $count;         # bits moved towards the top
    my $wrapped = $byte >> (8 - $count);   # bits that re-enter at the bottom
    return ($shifted | $wrapped) & 0xff;
}
1877
1878 ################################################################
1879 # compress the character properties table
sub compress_char_props_table($@)
{
    # Compress a table by storing every distinct row only once.
    #   $rows - number of rows to cut the table into
    #   rest  - the table values; undefined entries are treated as 0x7f
    # Returns a flat list: first one row id per row, then the values of each
    # newly stored row in the order the rows were first seen.  Ids of stored
    # rows start at 1.  Rows filled entirely with rol($id,5) for $id in
    # 0, 0xfb..0xff are not stored; they get that $id directly.
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my $last_id = 0;
    my @array = (0) x $rows;
    my %row_ids;

    # add some predefined sequences
    foreach my $id (0, 0xfb .. 0xff)
    {
        my $key = pack "L*", (rol($id,5)) x $len;
        $row_ids{$key} = $id;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "L*", @table_row;

        unless (defined($row_ids{$rowtxt}))
        {
            # first occurrence: give the row a new id and store its values
            $row_ids{$rowtxt} = ++$last_id;
            push @array, @table_row;
        }
        $array[$row] = $row_ids{$rowtxt};
    }
    return @array;
}
1911
1912 ################################################################
1913 # dump a normalization table in binary format
sub dump_norm_table($)
{
    # Build the binary normalization table for one normalization form.
    #   $filename - name of the generated file; it must look like
    #               ".../norm<form>.nls" with <form> one of nfc, nfd, nfkc,
    #               nfkd or idna.  The data is written to "$filename.new"
    #               and $filename is then passed to save_file().
    # Dies if the form cannot be derived from the file name.
    # The file is also recorded through add_registry_value() under
    # "Normalization".
    my $filename = shift;

    my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
    my %decomp = ( "nfc" => \@decomp_table,
                   "nfd" => \@decomp_table,
                   "nfkc" => \@decomp_compat_table,
                   "nfkd" => \@decomp_compat_table ,
                   "idna" => \@idna_decomp_table );

    # derive the form from the file name, and refuse unknown forms before
    # the output file gets created (otherwise %forms lookups yield undef)
    my $type = $filename;
    $type =~ s!.*/norm(\w+)\.nls!$1!;
    die "$filename: unknown normalization form" unless defined $forms{$type};

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # everything written below is packed binary data
    binmode OUTPUT;
    print "Building $filename\n";

    my $compose = $forms{$type} & 1;
    my $compat = !!($forms{$type} & 4) + ($type eq "idna");

    my @version = split /\./, $UNIVERSION;

    # combining classes

    my @classes;
    my @class_values;

    # mark every combining class value below 0x100 that is in use...
    foreach my $c (grep defined, @combining_class_table)
    {
        $classes[$c] = 1 if $c < 0x100;
    }
    # ...then replace each mark by the index of that value in @class_values
    for (my $i = 0; $i < @classes; $i++)
    {
        next unless defined $classes[$i];
        $classes[$i] = @class_values;
        push @class_values, $i;
    }
    # @class_values is written as bytes but sized in 16-bit words below
    push @class_values, 0 if (@class_values % 2);
    die "too many classes" if @class_values >= 0x40;

    # character properties

    my @char_props;
    my @decomposed;
    my @comp_hash_table;
    my $comp_hash_size = $compose ? 254 : 0;

    for (my $i = 0; $i <= $MAX_CHAR; $i++)
    {
        next unless defined $combining_class_table[$i];
        if (defined $decomp{$type}->[$i])
        {
            my @dec = get_decomposition( $i, $decomp{$type} );
            if ($compose && (my @comp = get_composition( $i, $compat )))
            {
                my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
                push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );

                # use the first non-zero combining class in the decomposition
                my $val = 0;
                foreach my $d (@dec)
                {
                    $val = $combining_class_table[$d];
                    last if $val;
                }
                $char_props[$i] = $classes[$val];
            }
            else
            {
                $char_props[$i] = 0xbf;
            }
            @dec = compose_hangul( @dec ) if $compose;
            @dec = to_utf16( @dec );
            # the length stored for a sequence is capped at 7 further down,
            # so sequences of 7 or more units get a trailing 0
            push @dec, 0 if @dec >= 7;
            $decomposed[$i] = \@dec;
        }
        else
        {
            if ($combining_class_table[$i] == 0x100)
            {
                $char_props[$i] = 0x7f;
            }
            elsif ($combining_class_table[$i])
            {
                $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
            }
            elsif ($type eq "idna" && defined $idna_disallowed[$i])
            {
                $char_props[$i] = 0xff;
            }
            else
            {
                $char_props[$i] = 0;
            }
        }
    }

    if ($compose)
    {
        # flag both characters of every composition pair
        for (my $i = 0; $i <= $MAX_CHAR; $i++)
        {
            my @comp = get_composition( $i, $compat );
            next unless @comp;
            if ($combining_class_table[$comp[1]])
            {
                $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
                $char_props[$comp[1]] |= 0x40;
            }
            else
            {
                $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
                $char_props[$comp[1]] |= 0xc0;
            }
        }
    }

    # surrogates
    foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
    foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }

    # Hangul
    if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
    elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
    foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }

    # invalid chars
    if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
    foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
    foreach my $i (0x00..0x10)
    {
        # the last two code points of every plane
        $char_props[($i << 16) | 0xfffe] = 0xff;
        $char_props[($i << 16) | 0xffff] = 0xff;
    }

    # decomposition hash table

    my @decomp_hash_table;
    my @decomp_hash_index;
    my @decomp_hash_data;
    my $decomp_hash_size = 944;

    # build string of character data, reusing substrings when possible
    # (longest sequences first, so that shorter ones can be found inside them)
    my $decomp_char_data = "";
    foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
    {
        my $str = pack "U*", @{$i};
        $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
    }
    # each hash bucket collects [ character, (length << 13) | position ] pairs
    for (my $i = 0; $i < @decomposed; $i++)
    {
        next unless defined $decomposed[$i];
        my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
        die "sequence not found" if $pos == -1;
        my $len = @{$decomposed[$i]};
        $len = 7 if $len > 7;
        my $hash = $i % $decomp_hash_size;
        push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
    }
    for (my $i = 0; $i < $decomp_hash_size; $i++)
    {
        $decomp_hash_index[$i] = @decomp_hash_data / 2;
        next unless defined $decomp_hash_table[$i];
        if (@{$decomp_hash_table[$i]} == 1)
        {
            # a bucket with a single entry whose character has property
            # 0xbf stores the entry value in the index itself
            my $entry = $decomp_hash_table[$i]->[0];
            if ($char_props[$entry->[0]] == 0xbf)
            {
                $decomp_hash_index[$i] = $entry->[1];
                next;
            }
        }
        foreach my $entry (@{$decomp_hash_table[$i]})
        {
            push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
        }
    }
    push @decomp_hash_data, 0, 0;

    # composition hash table

    my @comp_hash_index;
    my @comp_hash_data;
    if (@comp_hash_table)
    {
        for (my $i = 0; $i < $comp_hash_size; $i++)
        {
            $comp_hash_index[$i] = @comp_hash_data;
            push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
        }
        $comp_hash_index[$comp_hash_size] = @comp_hash_data;
        push @comp_hash_data, 0, 0, 0;
    }

    # character properties, in rows of 128 characters
    my $level1 = ($MAX_CHAR + 1) / 128;
    my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );

    my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
                   0, $decomp_hash_size, $comp_hash_size, 0 );
    my @tables = (0) x 8;

    # offsets of the individual tables, counted in 16-bit words; the leading
    # 16 accounts for the 16-word name written first
    $tables[0] = 16 + @header + @tables;
    $tables[1] = $tables[0] + @class_values / 2;
    $tables[2] = $tables[1] + $level1 / 2;
    $tables[3] = $tables[2] + (@rows - $level1) / 2;
    $tables[4] = $tables[3] + @decomp_hash_index;
    $tables[5] = $tables[4] + @decomp_hash_data;
    $tables[6] = $tables[5] + length $decomp_char_data;
    $tables[7] = $tables[6] + @comp_hash_index;

    print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "S<*", @tables;
    print OUTPUT pack "C*", @class_values;

    print OUTPUT pack "C*", @rows[0..$level1-1];
    print OUTPUT pack "C*", @rows[$level1..$#rows];
    print OUTPUT pack "S<*", @decomp_hash_index;
    print OUTPUT pack "S<*", @decomp_hash_data;
    print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
    print OUTPUT pack "S<*", @comp_hash_index;
    print OUTPUT pack "S<*", @comp_hash_data;

    close OUTPUT;
    save_file($filename);

    add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
}
2140
2141
2142 ################################################################
2143 # output a codepage definition file from the global tables
sub output_codepage_file($)
{
    # Write the binary definition file for one codepage from the global
    # tables, and record it through add_registry_value() under "Codepage".
    #   $codepage - codepage number; the file is named "nls/c_NNN.nls"
    #               (written as "<name>.new", then passed to save_file())
    # A single-byte table is written when the file-level @lead_bytes is
    # empty, a double-byte table otherwise.
    my $codepage = shift;

    my $output = sprintf "nls/c_%03d.nls", $codepage;
    open OUTPUT,">$output.new" or die "Cannot create $output";
    # the .nls file is binary data, as in dump_intl_nls(); setting binmode
    # here is harmless if the table writers happen to set it as well
    binmode OUTPUT;

    printf "Building %s\n", $output;
    if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
    else { dump_binary_dbcs_table( $codepage ); }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
}
2160
2161 ################################################################
2162 # output a codepage table from a Microsoft-style mapping file
sub dump_msdata_codepage($)
{
    # Read a Microsoft-style codepage mapping file and output the
    # corresponding codepage table.
    #   $filename - name of the mapping file, opened through
    #               open_data_file() relative to $MSCODEPAGES
    # The file-level tables @cp2uni, @glyph2uni, @lead_bytes and @uni2cp and
    # the defaults $default_char / $default_wchar are reset and then filled
    # from the file, before output_codepage_file() is called.
    # Dies on an unrecognized line or when the file has no CODEPAGE line.
    my $filename = shift;

    my $state = "";
    my ($codepage, $count);
    my ($lb_cur, $lb_end);

    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # the first CPINFO field is matched but not needed here
        if (/^CPINFO\s+\d+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $default_char = hex $1;
            $default_wchar = hex $2;
            next;
        }
        # a section header selects how the following value pairs are read
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # in every table the first mapping seen for a value wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # once the table is complete move on to the next lead byte,
                # or back to reading ranges after the last one
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    die "$filename: missing CODEPAGE line\n" unless defined $codepage;
    output_codepage_file( $codepage );
}
2256
2257 ################################################################
2258 # align a string length
sub align_string($$)
{
    # Pad a string with NUL bytes up to a multiple of a given length.
    #   $align - alignment, in bytes
    #   $str   - string to pad
    # Returns the padded string; a string that is already aligned is
    # returned unchanged.
    my $align = shift;
    my $str = shift;

    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . ("\0" x ($align - $excess));
}
2265
2266 ################################################################
2267 # pack a GUID string
sub pack_guid($)
{
    # Convert a GUID string to its 16-byte binary form.
    #   $guid - string containing a GUID written as
    #           xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx (hex digits)
    # Returns the packed GUID: a little-endian 32-bit value, two
    # little-endian 16-bit values and eight single bytes.
    # Dies if the string does not contain a GUID.
    #
    # The argument is kept in a lexical so that the caller's $_ is left
    # alone, and the captures are only used after a successful match.
    my ($guid) = @_;

    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/
        or die "malformed GUID '$guid'";

    return pack "L<S<2C8", map { hex $_ } @fields;
}
2274
2275 ################################################################
2276 # comparison function for compression sort
sub cmp_compression
{
    # Comparison function for sort: takes the two rows to compare as array
    # references in $a and $b.  Rows are ordered by their number of
    # elements first, then numerically by the elements at index 4 to 12.
    my $order = scalar @{$a} <=> scalar @{$b};
    return $order if $order;

    foreach my $idx (4 .. 12)
    {
        $order = $a->[$idx] <=> $b->[$idx];
        return $order if $order;
    }
    return $order;
}
2290
2291 ################################################################
2292 # build a binary sort keys table
2293 sub dump_sortkey_table($$)
2294 {
2295     my ($filename, $download) = @_;
2296
2297     my @keys;
2298     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2299     my @multiple_weights;
2300     my @expansions;
2301     my @compressions;
2302     my %exceptions;
2303     my %guids;
2304     my %compr_flags;
2305     my %locales;
2306     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2307     my $jamostr = "";
2308
2309     my $re_hex = '0x[0-9A-Fa-f]+';
2310     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2311     $guids{$default_guid} = { };
2312
2313     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2314
2315     my $KEYS = open_data_file( $MSDATA, $download );
2316
2317     printf "Building $filename\n";
2318
2319     while (<$KEYS>)
2320     {
2321         s/\s*;.*$//;
2322         next if /^\s*$/;  # skip empty lines
2323         if (/^\s*(SORTKEY|SORTTABLES)/)
2324         {
2325             $part = $1;
2326             next;
2327         }
2328         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2329         {
2330             $part = $section = "";
2331             next;
2332         }
2333         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2334         {
2335             $section = $1;
2336             $guid = undef;
2337             next;
2338         }
2339         next unless $part;
2340         if ("$part.$section" eq "SORTKEY.DEFAULT")
2341         {
2342             if (/^\s*($re_hex)\s+$re_key/)
2343             {
2344                 $keys[hex $1] = [ split(/\s+/,$2) ];
2345                 next;
2346             }
2347         }
2348         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2349         {
2350             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2351             {
2352                 $version = hex $1;
2353                 next;
2354             }
2355             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2356             {
2357                 # ignore for now
2358                 next;
2359             }
2360         }
2361         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2362                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2363                "$part.$section" eq "SORTTABLES.INVERSECASING")
2364         {
2365             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2366             {
2367                 $guid = lc $1;
2368                 $guids{$guid} = { } unless defined $guids{$guid};
2369                 $guids{$guid}->{flags} |= $flags{$section};
2370                 next;
2371             }
2372             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2373             {
2374                 $locales{$1} = $guid;
2375                 next;
2376             }
2377         }
2378         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2379         {
2380             if (/^\s*(\d+)\s+(\d+)/)
2381             {
2382                 push @multiple_weights, $1, $2;
2383                 next;
2384             }
2385         }
2386         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2387         {
2388             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2389             {
2390                 my $pos = scalar @expansions / 2;
2391                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2392                 push @expansions, hex $2, hex $3;
2393                 next;
2394             }
2395         }
2396         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2397         {
2398             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2399             {
2400                 $keys[hex $1] = $keys[hex $2];
2401                 next;
2402             }
2403         }
2404         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2405         {
2406             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2407             {
2408                 if ($subsection || !$guid)  # start a new one
2409                 {
2410                     $guid = lc $1;
2411                     $subsection = "";
2412                     $guids{$guid} = { } unless defined $guids{$guid};
2413                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2414                     $guids{$guid}->{compr} = @compressions;
2415                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2416                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2417                     push @compressions, [ ];
2418                 }
2419                 else  # merge with current one
2420                 {
2421                     $guids{lc $1} = { } unless defined $guids{lc $1};
2422                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2423                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2424                     $compr_flags{lc $1} = $compr_flags{$guid};
2425                 }
2426                 next;
2427             }
2428             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2429             {
2430                 $locales{$1} = $guid;
2431                 next;
2432             }
2433             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2434             {
2435                 $subsection = $1;
2436                 next;
2437             }
2438             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2439             {
2440                 my @comp = map { hex $_; } split(/\s+/,$1);
2441                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2442                 # add compression flags
2443                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2444                 next;
2445             }
2446         }
2447         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2448         {
2449             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2450             {
2451                 $guid = lc $1;
2452                 $guids{$guid} = { } unless defined $guids{lc $1};
2453                 $ling_flag = ($2 ? "+" : "-");
2454                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2455                 next;
2456             }
2457             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2458             {
2459                 $locales{$1} = $guid;
2460                 next;
2461             }
2462             if (/^\s*($re_hex)\s+$re_key/)
2463             {
2464                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2465                 next;
2466             }
2467         }
2468         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2469         {
2470             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2471             {
2472                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2473                 next;
2474             }
2475         }
2476         die "$download: $part.$section: unrecognized line $_\n";
2477     }
2478     close $KEYS;
2479
2480     # Sortkey table
2481
2482     my $table;
2483     for (my $i = 0; $i < 0x10000; $i++)
2484     {
2485         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2486         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2487     }
2488
2489     foreach my $id (sort keys %exceptions)
2490     {
2491         my $pos = length($table) / 4;
2492         my @exc = @{$exceptions{$id}};
2493         my @filled;
2494         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2495         my $guid = substr( $id, 0, -1 );
2496         $guids{$guid}->{$key} = $pos;
2497         $pos += 0x100;
2498         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2499         for (my $j = 0; $j < 0x10000; $j++)
2500         {
2501             next unless defined $exc[$j] || defined $flags[$j];
2502             $filled[$j >> 8] = 1;
2503             $j |= 0xff;
2504         }
2505         for (my $j = 0; $j < 0x100; $j++)
2506         {
2507             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2508             $pos += 0x100 if $filled[$j];
2509         }
2510         for (my $j = 0; $j < 0x10000; $j++)
2511         {
2512             next unless $filled[$j >> 8];
2513             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2514             $k[3] |= $flags[$j] || 0;
2515             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2516         }
2517     }
2518
2519     # Case mapping tables
2520
2521     # standard table
2522     my @casemaps;
2523     my @upper = @toupper_table;
2524     my @lower = @tolower_table;
2525     remove_linguistic_mappings( \@upper, \@lower );
2526     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2527
2528     # linguistic table
2529     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2530
2531     # Turkish table
2532     @upper = @toupper_table;
2533     @lower = @tolower_table;
2534     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2535     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2536     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2537     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2538
2539     # Char type table
2540
2541     my @table;
2542     my $types = "";
2543     my %typestr;
2544     for (my $i = 0; $i < 0x10000; $i++)
2545     {
2546         my $str = pack "S<3",
2547             ($category_table[$i] || 0) & 0xffff,
2548             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2549             ($category_table[$i] || 0) >> 16;
2550
2551         if (!defined($typestr{$str}))
2552         {
2553             $typestr{$str} = length($types) / 6;
2554             $types .= $str;
2555         }
2556         $table[$i] = $typestr{$str};
2557     }
2558
2559     my @rows = compress_array( 4096, 0, @table[0..65535] );
2560     my @array = compress_array( 256, 0, @rows[0..4095] );
2561     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2562     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2563
2564     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2565     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2566     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2567
2568     # Sort tables
2569
2570     # guids
2571     my $sorttables = pack "L<2", $version, scalar %guids;
2572     foreach my $id (sort keys %guids)
2573     {
2574         my %guid = %{$guids{$id}};
2575         my $flags = $guid{flags} || 0;
2576         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2577         $sorttables .= pack_guid($id) . pack "L<5",
2578             $flags,
2579             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2580             $guid{except} || 0,
2581             $guid{ling_except} || 0,
2582             $map / 2;
2583     }
2584
2585     # expansions
2586     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2587
2588     # compressions
2589     $sorttables .= pack "L<", scalar @compressions;
2590     my $rowstr = "";
2591     foreach my $c (@compressions)
2592     {
2593         my $pos = length($rowstr) / 2;
2594         my $min = 0xffff;
2595         my $max = 0;
2596         my @lengths = (0) x 8;
2597         foreach my $r (sort cmp_compression @{$c})
2598         {
2599             my @row = @{$r};
2600             $lengths[scalar @row - 6]++;
2601             foreach my $val (@row[4..$#row])
2602             {
2603                 $min = $val if $min > $val;
2604                 $max = $val if $max < $val;
2605             }
2606             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2607             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2608         }
2609         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2610     }
2611     $sorttables .= $rowstr;
2612
2613     # multiple weights
2614     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2615
2616     # jamo sort
2617     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2618
2619     # Locales
2620
2621     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2622     foreach my $loc (sort keys %locales)
2623     {
2624         # skip specific locales that match more general ones
2625         my @parts = split /[-_]/, $loc;
2626         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2627         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2628         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2629     }
2630
2631     # File header
2632
2633     my @header;
2634     $header[0] = 16;
2635     $header[1] = $header[0] + length $table;
2636     $header[2] = $header[1] + length $casemaps;
2637     $header[3] = $header[2] + length $chartypes;
2638
2639     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2640     print OUTPUT pack "L<*", @header;
2641     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2642     close OUTPUT;
2643     save_file($filename);
2644 }
2645
2646
################################################################
# build the script to create registry keys
#
# Parameters:
#   $filename - output file; the script is written to "$filename.new" and then
#               handed to save_file($filename).
#   %keys     - maps a backslash-separated key path (emitted below
#               HKLM\SYSTEM\CurrentControlSet\Control\Nls) to an array ref
#               whose first element is the key's default value and whose
#               remaining elements are value definitions, each emitted on its
#               own line after a "val " prefix.
#
# Dies if the output file cannot be created or written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print $output "HKLM\n{\n";

    # open the fixed parent keys; they stay open until the final loop below
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf $output "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }
    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            # only the last path component carries the default value, and only
            # when that value is true (an empty or "0" default is not emitted)
            printf $output "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # pass the value as an argument rather than interpolating it into the
        # format, so that a '%' inside a value is written out literally
        foreach my $v (@vals) { printf $output "%*sval %s\n", 4 * $indent, "", $v; }
        for (my $i = 0; $i < @subkeys; $i++) { printf $output "%*s}\n", 4 * --$indent, ""; }
    }

    # close the parent keys and the top-level HKLM block
    while ($indent) { printf $output "%*s}\n", 4 * --$indent, ""; }

    # buffered write errors only show up at close time
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
2679
2680
################################################################
# save a file if modified
#
# Expects the freshly generated contents in "$file.new".  If $file already
# exists with identical contents the .new file is removed and $file is not
# touched; otherwise the .new file is renamed over $file.
#
# Dies if the rename fails; a failure to remove the redundant .new file only
# produces a warning.
sub save_file($)
{
    use File::Compare ();

    my $file = shift;

    # compare() returns 0 only when both files were read and are identical;
    # a difference (1) or an error (-1) falls through to the rename.
    # Comparing in-process keeps the file names away from the shell, so names
    # containing spaces or shell metacharacters are handled correctly.
    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new" or warn "Cannot remove $file.new: $!\n";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!";
    }
}
2695
2696
2697 ################################################################
2698 # main routine
2699
# All output paths below are relative to the parent of the directory that
# holds this script, so step up one level when started from next to it.
# A failed chdir would send every generated file to the wrong place, so
# treat it as fatal.
if (-f "./make_unicode")
{
    chdir ".." or die "Cannot chdir to parent directory: $!";
}

load_data();
dump_sortkeys( "dlls/kernelbase/collation.c" );
dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
dump_bidi_dir_table( "dlls/dwrite/direction.c" );
dump_digit_folding( "dlls/kernelbase/digitmap.c" );
dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
dump_mirroring( "dlls/dwrite/mirror.c" );
dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
dump_bracket( "dlls/dwrite/bracket.c" );
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
dump_linebreak( "dlls/dwrite/linebreak.c" );
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/gdi32/vertical.c" );
dump_vertical( "dlls/wineps.drv/vertical.c" );
dump_intl_nls("nls/l_intl.nls");
dump_norm_table( "nls/normnfc.nls" );
dump_norm_table( "nls/normnfd.nls" );
dump_norm_table( "nls/normnfkc.nls" );
dump_norm_table( "nls/normnfkd.nls" );
dump_norm_table( "nls/normidna.nls" );
dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
dump_eucjp_codepage();

# NOTE(review): %registry_keys is presumably filled in by the dump routines
# above (the sort table code calls add_registry_key/add_registry_value), so
# this call should stay last -- confirm before reordering.
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
2729
2730 # Local Variables:
2731 # compile-command: "./make_unicode"
2732 # End: