tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $REPORTS = "http://www.unicode.org/reports";
  30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  32
  33 # Sort keys file
  34 my $SORTKEYS = "tr10/allkeys.txt";
  35
  36 # Default char for undefined mappings
  37 my $DEF_CHAR = ord '?';
  38
  39 # Last valid Unicode character
  40 my $MAX_CHAR = 0x10ffff;
  41
  42 my @allfiles =
  43 (
  44     "CodpageFiles/037.txt",
  45     "CodpageFiles/437.txt",
  46     "CodpageFiles/500.txt",
  47     "CodpageFiles/708.txt",
  48     "CodpageFiles/737.txt",
  49     "CodpageFiles/775.txt",
  50     "CodpageFiles/850.txt",
  51     "CodpageFiles/852.txt",
  52     "CodpageFiles/855.txt",
  53     "CodpageFiles/857.txt",
  54     "CodpageFiles/860.txt",
  55     "CodpageFiles/861.txt",
  56     "CodpageFiles/862.txt",
  57     "CodpageFiles/863.txt",
  58     "CodpageFiles/864.txt",
  59     "CodpageFiles/865.txt",
  60     "CodpageFiles/866.txt",
  61     "CodpageFiles/869.txt",
  62     "CodpageFiles/874.txt",
  63     "CodpageFiles/875.txt",
  64     "CodpageFiles/932.txt",
  65     "CodpageFiles/936.txt",
  66     "CodpageFiles/949.txt",
  67     "CodpageFiles/950.txt",
  68     "CodpageFiles/1026.txt",
  69     "CodpageFiles/1250.txt",
  70     "CodpageFiles/1251.txt",
  71     "CodpageFiles/1252.txt",
  72     "CodpageFiles/1253.txt",
  73     "CodpageFiles/1254.txt",
  74     "CodpageFiles/1255.txt",
  75     "CodpageFiles/1256.txt",
  76     "CodpageFiles/1257.txt",
  77     "CodpageFiles/1258.txt",
  78     "CodpageFiles/1361.txt",
  79     "CodpageFiles/10000.txt",
  80     "CodpageFiles/10001.txt",
  81     "CodpageFiles/10002.txt",
  82     "CodpageFiles/10003.txt",
  83     "CodpageFiles/10004.txt",
  84     "CodpageFiles/10005.txt",
  85     "CodpageFiles/10006.txt",
  86     "CodpageFiles/10007.txt",
  87     "CodpageFiles/10008.txt",
  88     "CodpageFiles/10010.txt",
  89     "CodpageFiles/10017.txt",
  90     "CodpageFiles/10021.txt",
  91     "CodpageFiles/10029.txt",
  92     "CodpageFiles/10079.txt",
  93     "CodpageFiles/10081.txt",
  94     "CodpageFiles/10082.txt",
  95     "CodpageFiles/20127.txt",
  96     "CodpageFiles/20866.txt",
  97     "CodpageFiles/21866.txt",
  98     "CodpageFiles/28591.txt",
  99     "CodpageFiles/28592.txt",
 100     "CodpageFiles/28593.txt",
 101     "CodpageFiles/28594.txt",
 102     "CodpageFiles/28595.txt",
 103     "CodpageFiles/28596.txt",
 104     "CodpageFiles/28597.txt",
 105     "CodpageFiles/28598.txt",
 106     "CodpageFiles/28599.txt",
 107     "CodpageFiles/28603.txt",
 108     "CodpageFiles/28605.txt",
 109 );
 110
 111
 112 my %ctype =
 113 (
 114      # CT_CTYPE1
 115     "upper"  => 0x0001,
 116     "lower"  => 0x0002,
 117     "digit"  => 0x0004,
 118     "space"  => 0x0008,
 119     "punct"  => 0x0010,
 120     "cntrl"  => 0x0020,
 121     "blank"  => 0x0040,
 122     "xdigit" => 0x0080,
 123     "alpha"  => 0x0100 | 0x80000000,
 124     "defin"  => 0x0200,
 125      # CT_CTYPE3 in high 16 bits
 126     "nonspacing"    => 0x00010000,
 127     "diacritic"     => 0x00020000,
 128     "vowelmark"     => 0x00040000,
 129     "symbol"        => 0x00080000,
 130     "katakana"      => 0x00100000,
 131     "hiragana"      => 0x00200000,
 132     "halfwidth"     => 0x00400000,
 133     "fullwidth"     => 0x00800000,
 134     "ideograph"     => 0x01000000,
 135     "kashida"       => 0x02000000,
 136     "lexical"       => 0x04000000,
 137     "highsurrogate" => 0x08000000,
 138     "lowsurrogate"  => 0x10000000,
 139 );
 140
 141 my %bracket_types =
 142 (
 143     "o" => 0x0000,
 144     "c" => 0x0001,
 145 );
 146
 147 my %indic_types =
 148 (
 149     "Other"    => 0x0000,
 150     "Bindu"    => 0x0001,
 151     "Visarga"  => 0x0002,
 152     "Avagraha" => 0x0003,
 153     "Nukta"    => 0x0004,
 154     "Virama"   => 0x0005,
 155     "Vowel_Independent"  => 0x0006,
 156     "Vowel_Dependent"  => 0x0007,
 157     "Vowel"  => 0x0008,
 158     "Consonant_Placeholder"  => 0x0009,
 159     "Consonant"  => 0x000a,
 160     "Consonant_Dead"  => 0x000b,
 161     "Consonant_Succeeding_Repha" => 0x000c,
 162     "Consonant_Subjoined"  => 0x000d,
 163     "Consonant_Medial"  => 0x000e,
 164     "Consonant_Final"  => 0x000f,
 165     "Consonant_Head_Letter"  => 0x0010,
 166     "Modifying_Letter"  => 0x0011,
 167     "Tone_Letter"  => 0x0012,
 168     "Tone_Mark"  => 0x0013,
 169     "Register_Shifter"  => 0x0014,
 170     "Consonant_Preceding_Repha" => 0x0015,
 171     "Pure_Killer" => 0x0016,
 172     "Invisible_Stacker" => 0x0017,
 173     "Gemination_Mark" => 0x0018,
 174     "Cantillation_Mark" => 0x0019,
 175     "Non_Joiner" => 0x001a,
 176     "Joiner" => 0x001b,
 177     "Number_Joiner" => 0x001c,
 178     "Number" => 0x001d,
 179     "Brahmi_Joining_Number" => 0x001e,
 180     "Consonant_With_Stacker" => 0x001f,
 181     "Consonant_Prefixed" => 0x0020,
 182     "Syllable_Modifier" => 0x0021,
 183     "Consonant_Killer" => 0x0022,
 184     "Consonant_Initial_Postfixed" => 0x0023,
 185 );
 186
 187 my %matra_types =
 188 (
 189     "Right"    => 0x01,
 190     "Left"  => 0x02,
 191     "Visual_Order_Left" => 0x03,
 192     "Left_And_Right"    => 0x04,
 193     "Top"   => 0x05,
 194     "Bottom"  => 0x06,
 195     "Top_And_Bottom"  => 0x07,
 196     "Top_And_Right"  => 0x08,
 197     "Top_And_Left"  => 0x09,
 198     "Top_And_Left_And_Right"  => 0x0a,
 199     "Bottom_And_Right"  => 0x0b,
 200     "Top_And_Bottom_And_Right"  => 0x0c,
 201     "Overstruck"  => 0x0d,
 202     "Invisible"  => 0x0e,
 203     "Bottom_And_Left"  => 0x0f,
 204     "Top_And_Bottom_And_Left"  => 0x10,
 205 );
 206
 207 my %break_types =
 208 (
 209     "BK"  => 0x0001,
 210     "CR"  => 0x0002,
 211     "LF"  => 0x0003,
 212     "CM"  => 0x0004,
 213     "SG"  => 0x0005,
 214     "GL"  => 0x0006,
 215     "CB"  => 0x0007,
 216     "SP"  => 0x0008,
 217     "ZW"  => 0x0009,
 218     "NL"  => 0x000a,
 219     "WJ"  => 0x000b,
 220     "JL"  => 0x000c,
 221     "JV"  => 0x000d,
 222     "JT"  => 0x000e,
 223     "H2"  => 0x000f,
 224     "H3"  => 0x0010,
 225     "XX"  => 0x0011,
 226     "OP"  => 0x0012,
 227     "CL"  => 0x0013,
 228     "CP"  => 0x0014,
 229     "QU"  => 0x0015,
 230     "NS"  => 0x0016,
 231     "EX"  => 0x0017,
 232     "SY"  => 0x0018,
 233     "IS"  => 0x0019,
 234     "PR"  => 0x001a,
 235     "PO"  => 0x001b,
 236     "NU"  => 0x001c,
 237     "AL"  => 0x001d,
 238     "ID"  => 0x001e,
 239     "IN"  => 0x001f,
 240     "HY"  => 0x0020,
 241     "BB"  => 0x0021,
 242     "BA"  => 0x0022,
 243     "SA"  => 0x0023,
 244     "AI"  => 0x0024,
 245     "B2"  => 0x0025,
 246     "HL"  => 0x0026,
 247     "CJ"  => 0x0027,
 248     "RI"  => 0x0028,
 249     "EB"  => 0x0029,
 250     "EM"  => 0x002a,
 251     "ZWJ" => 0x002b,
 252 );
 253
 254 my %vertical_types =
 255 (
 256     "R"  => 0x0000,
 257     "U"  => 0x0001,
 258     "Tr" => 0x0002,
 259     "Tu" => 0x0003,
 260 );
 261
 262 my %categories =
 263 (
 264     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 265     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 266     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 267     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 268     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 269     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 270     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 271     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 272     "No" => $ctype{"defin"},                    # Number, Other
 273     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 274     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 275     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 276     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 277     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 278     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 279     "Co" => $ctype{"defin"},                    # Other, Private Use
 280     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 281     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 282     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 283     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 284     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 285     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 286     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 287     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 288     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 289     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 290     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 291     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 292     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 293     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 294 );
 295
 296 # a few characters need additional categories that cannot be determined automatically
 297 my %special_categories =
 298 (
 299     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 300                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 301     "space"  => [ 0x09..0x0d, 0x85 ],
 302     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 303     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 304                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 305                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 306                   0xfff9, 0xfffa, 0xfffb ],
 307     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 308                   0xd7, 0xf7 ],
 309     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 310     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 311     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 312                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 313     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 314     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 315                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 316                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 317     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 318     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 319                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 320                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 321                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 322                      0x3131..0x3164 ],
 323     "ideograph" => [ 0x3006..0x3007 ],
 324     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 325                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 326                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 327                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 328                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 329                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 330                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 331                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 332                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 333                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 334     "kashida" => [ 0x0640 ],
 335 );
 336
 337 my %directions =
 338 (
 339     "L"   => 1,    # Left-to-Right
 340     "R"   => 2,    # Right-to-Left
 341     "AL"  => 12,   # Right-to-Left Arabic
 342     "EN"  => 3,    # European Number
 343     "ES"  => 4,    # European Number Separator
 344     "ET"  => 5,    # European Number Terminator
 345     "AN"  => 6,    # Arabic Number
 346     "CS"  => 7,    # Common Number Separator
 347     "NSM" => 13,   # Non-Spacing Mark
 348     "BN"  => 14,   # Boundary Neutral
 349     "B"   => 8,    # Paragraph Separator
 350     "S"   => 9,    # Segment Separator
 351     "WS"  => 10,   # Whitespace
 352     "ON"  => 11,   # Other Neutrals
 353     "LRE" => 15,   # Left-to-Right Embedding
 354     "LRO" => 15,   # Left-to-Right Override
 355     "RLE" => 15,   # Right-to-Left Embedding
 356     "RLO" => 15,   # Right-to-Left Override
 357     "PDF" => 15,   # Pop Directional Format
 358     "LRI" => 15,   # Left-to-Right Isolate
 359     "RLI" => 15,   # Right-to-Left Isolate
 360     "FSI" => 15,   # First Strong Isolate
 361     "PDI" => 15    # Pop Directional Isolate
 362 );
 363
 364 my %c2_types =
 365 (
 366     "L"   => 1,    # C2_LEFTTORIGHT
 367     "R"   => 2,    # C2_RIGHTTOLEFT
 368     "AL"  => 2,    # C2_RIGHTTOLEFT
 369     "EN"  => 3,    # C2_EUROPENUMBER
 370     "ES"  => 4,    # C2_EUROPESEPARATOR
 371     "ET"  => 5,    # C2_EUROPETERMINATOR
 372     "AN"  => 6,    # C2_ARABICNUMBER
 373     "CS"  => 7,    # C2_COMMONSEPARATOR
 374     "NSM" => 11,   # C2_OTHERNEUTRAL
 375     "BN"  => 0,    # C2_NOTAPPLICABLE
 376     "B"   => 8,    # C2_BLOCKSEPARATOR
 377     "S"   => 9,    # C2_SEGMENTSEPARATOR
 378     "WS"  => 10,   # C2_WHITESPACE
 379     "ON"  => 11,   # C2_OTHERNEUTRAL
 380     "LRE" => 11,   # C2_OTHERNEUTRAL
 381     "LRO" => 11,   # C2_OTHERNEUTRAL
 382     "RLE" => 11,   # C2_OTHERNEUTRAL
 383     "RLO" => 11,   # C2_OTHERNEUTRAL
 384     "PDF" => 11,   # C2_OTHERNEUTRAL
 385     "LRI" => 11,   # C2_OTHERNEUTRAL
 386     "RLI" => 11,   # C2_OTHERNEUTRAL
 387     "FSI" => 11,   # C2_OTHERNEUTRAL
 388     "PDI" => 11    # C2_OTHERNEUTRAL
 389 );
 390
 391 my %bidi_types =
 392 (
 393     "ON"  => 0,    # Other Neutrals
 394     "L"   => 1,    # Left-to-Right
 395     "R"   => 2,    # Right-to-Left
 396     "AN"  => 3,    # Arabic Number
 397     "EN"  => 4,    # European Number
 398     "AL"  => 5,    # Right-to-Left Arabic
 399     "NSM" => 6,    # Non-Spacing Mark
 400     "CS"  => 7,    # Common Number Separator
 401     "ES"  => 8,    # European Number Separator
 402     "ET"  => 9,    # European Number Terminator
 403     "BN"  => 10,   # Boundary Neutral
 404     "S"   => 11,   # Segment Separator
 405     "WS"  => 12,   # Whitespace
 406     "B"   => 13,   # Paragraph Separator
 407     "RLO" => 14,   # Right-to-Left Override
 408     "RLE" => 15,   # Right-to-Left Embedding
 409     "LRO" => 16,   # Left-to-Right Override
 410     "LRE" => 17,   # Left-to-Right Embedding
 411     "PDF" => 18,   # Pop Directional Format
 412     "LRI" => 19,   # Left-to-Right Isolate
 413     "RLI" => 20,   # Right-to-Left Isolate
 414     "FSI" => 21,   # First Strong Isolate
 415     "PDI" => 22    # Pop Directional Isolate
 416 );
 417
 418 my %joining_types =
 419 (
 420    "U" => 0,    # Non_Joining
 421    "T" => 1,    # Transparent
 422    "R" => 2,    # Right_Joining
 423    "L" => 3,    # Left_Joining
 424    "D" => 4,    # Dual_Joining
 425    "C" => 5,    # Join_Causing
 426 );
 427
 428 my @cp2uni = ();
 429 my @glyph2uni = ();
 430 my @lead_bytes = ();
 431 my @uni2cp = ();
 432 my @tolower_table = ();
 433 my @toupper_table = ();
 434 my @digitmap_table = ();
 435 my @category_table = ();
 436 my @joining_table = ();
 437 my @direction_table = ();
 438 my @decomp_table = ();
 439 my @combining_class_table = ();
 440 my @decomp_compat_table = ();
 441 my @comp_exclusions = ();
 442 my @idna_decomp_table = ();
 443 my @idna_disallowed = ();
 444 my %registry_keys;
 445 my $default_char;
 446 my $default_wchar;
 447
 448 my %joining_forms =
 449 (
 450    "isolated" => [],
 451    "final" => [],
 452    "initial" => [],
 453    "medial" => []
 454 );
 455
 456 sub to_utf16(@)
 457 {
 458     my @ret;
 459     foreach my $ch (@_)
 460     {
 461         if ($ch < 0x10000)
 462         {
 463             push @ret, $ch;
 464         }
 465         else
 466         {
 467             my $val = $ch - 0x10000;
 468             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 469         }
 470     }
 471     return @ret;
 472 }
 473
 474 ################################################################
 475 # fetch a unicode.org file and open it
 476 sub open_data_file($$)
 477 {
 478     my ($base, $name) = @_;
 479     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 480     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 481     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 482     local *FILE;
 483
 484     if ($base =~ /.*\/([^\/]+)\.zip$/)
 485     {
 486         my $zip = "$1$suffix.zip";
 487         unless (-f "$cache/$zip")
 488         {
 489             system "mkdir", "-p", $cache;
 490             print "Fetching $base...\n";
 491             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 492         }
 493         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 494     }
 495     else
 496     {
 497         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 498         unless (-f $dest)
 499         {
 500             system "mkdir", "-p", $dir;
 501             print "Fetching $base/$name...\n";
 502             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 503         }
 504         open FILE, "<$dest" or die "cannot open $dest";
 505     }
 506     return *FILE;
 507 }
 508
 509 ################################################################
 510 # recursively get the decomposition for a character
 511 sub get_decomposition($$);
 512 sub get_decomposition($$)
 513 {
 514     my ($char, $table) = @_;
 515     my @ret;
 516
 517     return $char unless defined $table->[$char];
 518     foreach my $ch (@{$table->[$char]})
 519     {
 520         push @ret, get_decomposition( $ch, $table );
 521     }
 522     return @ret;
 523 }
 524
 525 ################################################################
 526 # get the composition that results in a given character
 527 sub get_composition($$)
 528 {
 529     my ($ch, $compat) = @_;
 530     return () unless defined $decomp_table[$ch];  # no decomposition
 531     my @ret = @{$decomp_table[$ch]};
 532     return () if @ret < 2;                        # singleton decomposition
 533     return () if $comp_exclusions[$ch];           # composition exclusion
 534     return () if $combining_class_table[$ch];     # non-starter
 535     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 536     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 537         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 538     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 539         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 540     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 541         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 542     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 543     return @ret;
 544 }
 545
 546 ################################################################
 547 # recursively build decompositions
 548 sub build_decompositions(@)
 549 {
 550     my @src = @_;
 551     my @dst;
 552
 553     for (my $i = 0; $i < @src; $i++)
 554     {
 555         next unless defined $src[$i];
 556         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 557         $dst[$i] = \@decomp;
 558     }
 559     return @dst;
 560 }
 561
 562 ################################################################
 563 # compose Hangul sequences
 564 sub compose_hangul(@)
 565 {
 566     my $SBASE  = 0xac00;
 567     my $LBASE  = 0x1100;
 568     my $VBASE  = 0x1161;
 569     my $TBASE  = 0x11a7;
 570     my $LCOUNT = 19;
 571     my $VCOUNT = 21;
 572     my $TCOUNT = 28;
 573     my $NCOUNT = $VCOUNT * $TCOUNT;
 574     my $SCOUNT = $LCOUNT * $NCOUNT;
 575
 576     my @seq = @_;
 577     my @ret;
 578     my $i;
 579
 580     for ($i = 0; $i < @seq; $i++)
 581     {
 582         my $ch = $seq[$i];
 583         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 584             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 585         {
 586             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 587             $i++;
 588         }
 589         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 590             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 591         {
 592             $ch += $seq[$i+1] - $TBASE;
 593             $i++;
 594         }
 595         push @ret, $ch;
 596     }
 597     return @ret;
 598 }
 599
 600 ################################################################
 601 # remove linguistic-only mappings from the case table
 602 sub remove_linguistic_mappings($$)
 603 {
 604     my ($upper, $lower) = @_;
 605
 606     # remove case mappings that don't round-trip
 607
 608     for (my $i = 0; $i < @{$upper}; $i++)
 609     {
 610         next unless defined ${$upper}[$i];
 611         my $ch = ${$upper}[$i];
 612         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 613     }
 614     for (my $i = 0; $i < @{$lower}; $i++)
 615     {
 616         next unless defined ${$lower}[$i];
 617         my $ch = ${$lower}[$i];
 618         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 619     }
 620 }
 621
 622 ################################################################
 623 # read in the Unicode database files
 624 sub load_data()
 625 {
 626     my $start;
 627
 628     # now build mappings from the decomposition field of the Unicode database
 629
 630     my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
 631     while (<$UNICODE_DATA>)
 632     {
 633         # Decode the fields ...
 634         my ($code, $name, $cat, $comb, $bidi,
 635             $decomp, $dec, $dig, $num, $mirror,
 636             $oldname, $comment, $upper, $lower, $title) = split /;/;
 637         my $src = hex $code;
 638
 639         die "unknown category $cat" unless defined $categories{$cat};
 640         die "unknown directionality $bidi" unless defined $directions{$bidi};
 641
 642         $category_table[$src] = $categories{$cat};
 643         $direction_table[$src] = $bidi;
 644         $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
 645
 646         if ($lower ne "")
 647         {
 648             $tolower_table[$src] = hex $lower;
 649         }
 650         if ($upper ne "")
 651         {
 652             $toupper_table[$src] = hex $upper;
 653         }
 654         if ($dec ne "")
 655         {
 656             $category_table[$src] |= $ctype{"digit"};
 657         }
 658         if ($dig ne "")
 659         {
 660             $digitmap_table[$src] = ord $dig;
 661         }
 662         $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
 663
 664         $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
 665         $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
 666         $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
 667         $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
 668         $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
 669         $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
 670         $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
 671         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
 672         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
 673         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
 674         $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
 675         $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;
 676
 677         # copy the category and direction for everything between First/Last pairs
 678         if ($name =~ /, First>/) { $start = $src; }
 679         if ($name =~ /, Last>/)
 680         {
 681             while ($start < $src)
 682             {
 683                 $category_table[$start] = $category_table[$src];
 684                 $direction_table[$start] = $direction_table[$src];
 685                 $combining_class_table[$start] = $combining_class_table[$src];
 686                 $start++;
 687             }
 688         }
 689
 690         next if $decomp eq "";  # no decomposition, skip it
 691
 692         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
 693         {
 694             my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
 695             $decomp_compat_table[$src] = \@seq;
 696         }
 697
 698         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 699         {
 700             # decomposition of the form "<foo> 1234" -> use char if type is known
 701             if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
 702             {
 703                 ${joining_forms{$1}}[hex $2] = $src;
 704             }
 705         }
 706         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 707         {
 708             # decomposition "<compat> 0020 1234" -> combining accent
 709         }
 710         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 711         {
 712             # store decomposition
 713             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 714             {
 715                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
 716             }
 717             elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
 718             {
 719                 # Single char decomposition
 720                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
 721             }
 722         }
 723     }
 724     close $UNICODE_DATA;
 725
 726     # patch the category of some special characters
 727
 728     for (my $i = 0; $i < @decomp_table; $i++)
 729     {
 730         next unless defined $decomp_table[$i];
 731         $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
 732     }
 733     foreach my $cat (keys %special_categories)
 734     {
 735         my $flag = $ctype{$cat};
 736         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 737     }
 738     for (my $i = 0; $i < @decomp_compat_table; $i++)
 739     {
 740         next unless defined $decomp_compat_table[$i];
 741         next unless @{$decomp_compat_table[$i]} == 2;
 742         $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
 743     }
 744
 745     # load the composition exclusions
 746
 747     my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
 748     while (<$EXCL>)
 749     {
 750         s/\#.*//;  # remove comments
 751         if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
 752         {
 753             foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
 754         }
 755         elsif (/^([0-9a-fA-F]+)\s*$/)
 756         {
 757             $comp_exclusions[hex $1] = 1;
 758         }
 759     }
 760     close $EXCL;
 761
 762     # load the IDNA mappings
 763
 764     @idna_decomp_table = @decomp_compat_table;
 765     my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
 766     while (<$IDNA>)
 767     {
 768         s/\#.*//;  # remove comments
 769         next if /^\s*$/;
 770         my ($char, $type, $mapping) = split /;/;
 771         my ($ch1, $ch2);
 772         if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
 773         {
 774             $ch1 = hex $1;
 775             $ch2 = hex $2;
 776         }
 777         elsif ($char =~ /([0-9a-fA-F]+)/)
 778         {
 779             $ch1 = $ch2 = hex $1;
 780         }
 781
 782         if ($type =~ /mapped/ || $type =~ /deviation/)
 783         {
 784             $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
 785             my @seq = map { hex $_; } split /\s+/, $mapping;
 786             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
 787         }
 788         elsif ($type =~ /valid/)
 789         {
 790         }
 791         elsif ($type =~ /ignored/)
 792         {
 793             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
 794         }
 795         elsif ($type =~ /disallowed/)
 796         {
 797             foreach my $i ($ch1 .. $ch2)
 798             {
 799                 $idna_decomp_table[$i] = undef;
 800                 $idna_disallowed[$i] = 1;
 801             }
 802         }
 803     }
 804     close $IDNA;
 805 }
 806
 807
 808 ################################################################
 809 # add a new registry key
 810 sub add_registry_key($$)
 811 {
 812     my ($key, $defval) = @_;
 813     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 814 }
 815
 816 ################################################################
 817 # add a new registry value
 818 sub add_registry_value($$$)
 819 {
 820     my ($key, $name, $value) = @_;
 821     add_registry_key( $key, undef );
 822     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 823 }
 824
 825 ################################################################
 826 # define a new lead byte
 827 sub add_lead_byte($)
 828 {
 829     my $ch = shift;
 830     return if defined $cp2uni[$ch];
 831     push @lead_bytes, $ch;
 832     $cp2uni[$ch] = 0;
 833 }
 834
 835 ################################################################
 836 # define a new char mapping
 837 sub add_mapping($$)
 838 {
 839     my ($cp, $uni) = @_;
 840     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 841     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 842     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 843 }
 844
 845 ################################################################
 846 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 847 sub get_glyphs_mapping(@)
 848 {
 849     my @table = @_;
 850
 851     for (my $i = 0; $i < @glyph2uni; $i++)
 852     {
 853         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 854     }
 855     return @table;
 856 }
 857
 858 ################################################################
 859 # build EUC-JP table from the JIS 0208/0212 files
 860 sub dump_eucjp_codepage()
 861 {
 862     @cp2uni = ();
 863     @glyph2uni = ();
 864     @lead_bytes = ();
 865     @uni2cp = ();
 866     $default_char = $DEF_CHAR;
 867     $default_wchar = 0x30fb;
 868
 869     # ASCII chars
 870     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 871
 872     # lead bytes
 873     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 874
 875     # JIS X 0201 right plane
 876     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 877
 878     # undefined chars
 879     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 880     $cp2uni[0xa0] = 0xf8f0;
 881     $cp2uni[0xff] = 0xf8f3;
 882
 883     # Fix backslash conversion
 884     add_mapping( 0xa1c0, 0xff3c );
 885
 886     # Add private mappings for rows undefined in JIS 0208/0212
 887     my $private = 0xe000;
 888     foreach my $hi (0xf5 .. 0xfe)
 889     {
 890         foreach my $lo (0xa1 .. 0xfe)
 891         {
 892             add_mapping( ($hi << 8) + $lo, $private++ );
 893         }
 894     }
 895     foreach my $hi (0xf5 .. 0xfe)
 896     {
 897         foreach my $lo (0x21 .. 0x7e)
 898         {
 899             add_mapping( ($hi << 8) + $lo, $private++ );
 900         }
 901     }
 902
 903     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 904     while (<$INPUT>)
 905     {
 906         next if /^\#/;  # skip comments
 907         next if /^$/;  # skip empty lines
 908         next if /\x1a/;  # skip ^Z
 909         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 910         {
 911             add_mapping( 0x8080 + hex $1, hex $2 );
 912             next;
 913         }
 914         die "Unrecognized line $_\n";
 915     }
 916     close $INPUT;
 917
 918     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 919     while (<$INPUT>)
 920     {
 921         next if /^\#/;  # skip comments
 922         next if /^$/;  # skip empty lines
 923         next if /\x1a/;  # skip ^Z
 924         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 925         {
 926             add_mapping( 0x8000 + hex $1, hex $2 );
 927             next;
 928         }
 929         die "Unrecognized line $_\n";
 930     }
 931     close $INPUT;
 932
 933     output_codepage_file( 20932 );
 934 }
 935
 936
 937 ################################################################
 938 # build the sort keys table
 939 sub dump_sortkeys($)
 940 {
 941     my $filename = shift;
 942     my @sortkeys = ();
 943
 944     my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
 945     while (<$INPUT>)
 946     {
 947         next if /^\#/;  # skip comments
 948         next if /^$/;  # skip empty lines
 949         next if /\x1a/;  # skip ^Z
 950         next if /^\@version/;  # skip @version header
 951         if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 952         {
 953             my ($uni,$variable) = (hex $1, $2);
 954             next if $uni > 65535;
 955             $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
 956             next;
 957         }
 958         if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 959         {
 960             # multiple character sequence, ignored for now
 961             next;
 962         }
 963         die "$SORTKEYS: Unrecognized line $_\n";
 964     }
 965     close $INPUT;
 966
 967     # compress the keys to 32 bit:
 968     # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
 969
 970     @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
 971                        ${$a}[2] <=> ${$b}[2] or
 972                        ${$a}[3] <=> ${$b}[3] or
 973                        ${$a}[4] <=> ${$b}[4] or
 974                        $a cmp $b; } @sortkeys;
 975
 976     my ($n2, $n3) = (1, 1);
 977     my @keys = (-1, -1, -1, -1, -1 );
 978     my @flatkeys = ();
 979
 980     for (my $i = 0; $i < @sortkeys; $i++)
 981     {
 982         next unless defined $sortkeys[$i];
 983         my @current = @{$sortkeys[$i]};
 984         if ($current[1] == $keys[1])
 985         {
 986             if ($current[2] == $keys[2])
 987             {
 988                 if ($current[3] == $keys[3])
 989                 {
 990                     # nothing
 991                 }
 992                 else
 993                 {
 994                     $keys[3] = $current[3];
 995                     $n3++;
 996                     die if ($n3 >= 16);
 997                 }
 998             }
 999             else
1000             {
1001                 $keys[2] = $current[2];
1002                 $keys[3] = $current[3];
1003                 $n2++;
1004                 $n3 = 1;
1005                 die if ($n2 >= 256);
1006             }
1007         }
1008         else
1009         {
1010             $keys[1] = $current[1];
1011             $keys[2] = $current[2];
1012             $keys[3] = $current[3];
1013             $n2 = 1;
1014             $n3 = 1;
1015         }
1016
1017         if ($current[2]) { $current[2] = $n2; }
1018         if ($current[3]) { $current[3] = $n3; }
1019         if ($current[4]) { $current[4] = 1; }
1020
1021         $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1022     }
1023
1024     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1025     printf "Building $filename\n";
1026     printf OUTPUT "/* Unicode collation element table */\n";
1027     printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1028     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1029     print OUTPUT "#include \"windef.h\"\n\n";
1030
1031     dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1032
1033     close OUTPUT;
1034     save_file($filename);
1035 }
1036
1037
1038 ################################################################
1039 # dump an array of integers
1040 sub dump_array($$@)
1041 {
1042     my ($bit_width, $default, @array) = @_;
1043     my $format = sprintf "0x%%0%ux", $bit_width / 4;
1044     my $i;
1045     my $ret = "    ";
1046     for ($i = 0; $i < $#array; $i++)
1047     {
1048         $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1049         $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
1050     }
1051     $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1052     return $ret;
1053 }
1054
1055
1056 ################################################################
1057 # dump an SBCS mapping table in binary format
1058 sub dump_binary_sbcs_table($)
1059 {
1060     my $codepage = shift;
1061
1062     my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1063     my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1064
1065     print OUTPUT pack "S<*", @header;
1066     print OUTPUT pack "C12", (0) x 12;
1067     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1068
1069     if (@glyph2uni)
1070     {
1071         print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1072     }
1073     else
1074     {
1075         print OUTPUT pack "S<*", 0;
1076     }
1077
1078     print OUTPUT pack "S<*", 0, 0;
1079
1080     print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1081 }
1082
1083
1084 ################################################################
1085 # dump a DBCS mapping table in binary format
1086 sub dump_binary_dbcs_table($)
1087 {
1088     my $codepage = shift;
1089     my @lb_ranges = get_lb_ranges();
1090     my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1091
1092     my @offsets = (0) x 256;
1093     my $pos = 0;
1094     foreach my $i (@lead_bytes)
1095     {
1096         $offsets[$i] = ($pos += 256);
1097         $cp2uni[$i] = 0;
1098     }
1099
1100     my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1101
1102     print OUTPUT pack "S<*", @header;
1103     print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1104     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1105     print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1106
1107     foreach my $i (@lead_bytes)
1108     {
1109         my $base = $i << 8;
1110         print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1111     }
1112
1113     print OUTPUT pack "S<", 4;
1114     print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1115 }
1116
1117
1118 ################################################################
1119 # get the list of defined lead byte ranges
1120 sub get_lb_ranges()
1121 {
1122     my @list = ();
1123     my @ranges = ();
1124
1125     foreach my $i (@lead_bytes) { $list[$i] = 1; }
1126     my $on = 0;
1127     for (my $i = 0; $i < 256; $i++)
1128     {
1129         if ($on)
1130         {
1131             if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1132         }
1133         else
1134         {
1135             if ($list[$i]) { push @ranges, $i; $on = 1; }
1136         }
1137     }
1138     if ($on) { push @ranges, 0xff; }
1139     return @ranges;
1140 }
1141
1142 ################################################################
1143 # dump the Indic Syllabic Category table
1144 sub dump_indic($)
1145 {
1146     my $filename = shift;
1147     my @indic_table;
1148
1149     my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1150     while (<$INPUT>)
1151     {
1152         next if /^\#/;  # skip comments
1153         next if /^\s*$/;  # skip empty lines
1154         next if /\x1a/;  # skip ^Z
1155         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1156         {
1157             my $type = $2;
1158             die "unknown indic $type" unless defined $indic_types{$type};
1159             if (hex $1 < 65536)
1160             {
1161                 $indic_table[hex $1] = $indic_types{$type};
1162             }
1163             next;
1164         }
1165         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1166         {
1167             my $type = $3;
1168             die "unknown indic $type" unless defined $indic_types{$type};
1169             if (hex $1 < 65536 and hex $2 < 65536)
1170             {
1171                 foreach my $i (hex $1 .. hex $2)
1172                 {
1173                     $indic_table[$i] = $indic_types{$type};
1174                 }
1175             }
1176             next;
1177         }
1178         die "malformed line $_";
1179     }
1180     close $INPUT;
1181
1182     $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1183     while (<$INPUT>)
1184     {
1185         next if /^\#/;  # skip comments
1186         next if /^\s*$/;  # skip empty lines
1187         next if /\x1a/;  # skip ^Z
1188         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1189         {
1190             my $type = $2;
1191             die "unknown matra $type" unless defined $matra_types{$type};
1192             $indic_table[hex $1] |= $matra_types{$type} << 8;
1193             next;
1194         }
1195         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1196         {
1197             my $type = $3;
1198             die "unknown matra $type" unless defined $matra_types{$type};
1199             foreach my $i (hex $1 .. hex $2)
1200             {
1201                 $indic_table[$i] |= $matra_types{$type} << 8;
1202             }
1203             next;
1204         }
1205         die "malformed line $_";
1206     }
1207     close $INPUT;
1208
1209     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1210     print "Building $filename\n";
1211     print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1212     print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1213     print OUTPUT "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n";
1214     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1215     print OUTPUT "#include \"windef.h\"\n\n";
1216
1217     dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1218
1219     close OUTPUT;
1220     save_file($filename);
1221 }
1222
1223 ################################################################
1224 # dump the Line Break Properties table
1225 sub dump_linebreak($)
1226 {
1227     my $filename = shift;
1228     my @break_table;
1229     my $next_group = 0;
1230
1231     my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1232     while (<$INPUT>)
1233     {
1234         next if /^\#/;  # skip comments
1235         next if /^\s*$/;  # skip empty lines
1236         next if /\x1a/;  # skip ^Z
1237         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1238         {
1239             my $type = $2;
1240             die "unknown breaktype $type" unless defined $break_types{$type};
1241             $break_table[hex $1] = $break_types{$type};
1242             next;
1243         }
1244         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1245         {
1246             my $type = $3;
1247             die "unknown breaktype $type" unless defined $break_types{$type};
1248             foreach my $i (hex $1 .. hex $2)
1249             {
1250                 $break_table[$i] = $break_types{$type};
1251             }
1252             next;
1253         }
1254         elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1255         {
1256             my $type = $2;
1257             die "unknown breaktype $type" unless defined $break_types{$type};
1258             $break_table[hex $1] = $break_types{$type};
1259             next;
1260         }
1261         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1262         {
1263             my $type = $3;
1264             die "unknown breaktype $type" unless defined $break_types{$type};
1265             foreach my $i (hex $1 .. hex $2)
1266             {
1267                 $break_table[$i] = $break_types{$type};
1268             }
1269             next;
1270         }
1271         die "malformed line $_";
1272     }
1273     close $INPUT;
1274
1275     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1276     print "Building $filename\n";
1277     print OUTPUT "/* Unicode Line Break Properties */\n";
1278     print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1279     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1280     print OUTPUT "#include \"windef.h\"\n\n";
1281
1282     dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1283
1284     close OUTPUT;
1285     save_file($filename);
1286 }
1287
1288 my %scripts =
1289 (
1290     "Unknown"                => 0,
1291     "Common"                 => 1,
1292     "Inherited"              => 2,
1293     "Arabic"                 => 3,
1294     "Armenian"               => 4,
1295     "Avestan"                => 5,
1296     "Balinese"               => 6,
1297     "Bamum"                  => 7,
1298     "Batak"                  => 8,
1299     "Bengali"                => 9,
1300     "Bopomofo"               => 10,
1301     "Brahmi"                 => 11,
1302     "Braille"                => 12,
1303     "Buginese"               => 13,
1304     "Buhid"                  => 14,
1305     "Canadian_Aboriginal"    => 15,
1306     "Carian"                 => 16,
1307     "Cham"                   => 17,
1308     "Cherokee"               => 18,
1309     "Coptic"                 => 19,
1310     "Cuneiform"              => 20,
1311     "Cypriot"                => 21,
1312     "Cyrillic"               => 22,
1313     "Deseret"                => 23,
1314     "Devanagari"             => 24,
1315     "Egyptian_Hieroglyphs"   => 25,
1316     "Ethiopic"               => 26,
1317     "Georgian"               => 27,
1318     "Glagolitic"             => 28,
1319     "Gothic"                 => 29,
1320     "Greek"                  => 30,
1321     "Gujarati"               => 31,
1322     "Gurmukhi"               => 32,
1323     "Han"                    => 33,
1324     "Hangul"                 => 34,
1325     "Hanunoo"                => 35,
1326     "Hebrew"                 => 36,
1327     "Hiragana"               => 37,
1328     "Imperial_Aramaic"       => 38,
1329     "Inscriptional_Pahlavi"  => 39,
1330     "Inscriptional_Parthian" => 40,
1331     "Javanese"               => 41,
1332     "Kaithi"                 => 42,
1333     "Kannada"                => 43,
1334     "Katakana"               => 44,
1335     "Kayah_Li"               => 45,
1336     "Kharoshthi"             => 46,
1337     "Khmer"                  => 47,
1338     "Lao"                    => 48,
1339     "Latin"                  => 49,
1340     "Lepcha"                 => 50,
1341     "Limbu"                  => 51,
1342     "Linear_B"               => 52,
1343     "Lisu"                   => 53,
1344     "Lycian"                 => 54,
1345     "Lydian"                 => 55,
1346     "Malayalam"              => 56,
1347     "Mandaic"                => 57,
1348     "Meetei_Mayek"           => 58,
1349     "Mongolian"              => 59,
1350     "Myanmar"                => 60,
1351     "New_Tai_Lue"            => 61,
1352     "Nko"                    => 62,
1353     "Ogham"                  => 63,
1354     "Ol_Chiki"               => 64,
1355     "Old_Italic"             => 65,
1356     "Old_Persian"            => 66,
1357     "Old_South_Arabian"      => 67,
1358     "Old_Turkic"             => 68,
1359     "Oriya"                  => 69,
1360     "Osmanya"                => 70,
1361     "Phags_Pa"               => 71,
1362     "Phoenician"             => 72,
1363     "Rejang"                 => 73,
1364     "Runic"                  => 74,
1365     "Samaritan"              => 75,
1366     "Saurashtra"             => 76,
1367     "Shavian"                => 77,
1368     "Sinhala"                => 78,
1369     "Sundanese"              => 79,
1370     "Syloti_Nagri"           => 80,
1371     "Syriac"                 => 81,
1372     "Tagalog"                => 82,
1373     "Tagbanwa"               => 83,
1374     "Tai_Le"                 => 84,
1375     "Tai_Tham"               => 85,
1376     "Tai_Viet"               => 86,
1377     "Tamil"                  => 87,
1378     "Telugu"                 => 88,
1379     "Thaana"                 => 89,
1380     "Thai"                   => 90,
1381     "Tibetan"                => 91,
1382     "Tifinagh"               => 92,
1383     "Ugaritic"               => 93,
1384     "Vai"                    => 94,
1385     "Yi"                     => 95,
1386     # Win8/Win8.1
1387     "Chakma"                 => 96,
1388     "Meroitic_Cursive"       => 97,
1389     "Meroitic_Hieroglyphs"   => 98,
1390     "Miao"                   => 99,
1391     "Sharada"                => 100,
1392     "Sora_Sompeng"           => 101,
1393     "Takri"                  => 102,
1394     # Win10
1395     "Bassa_Vah"              => 103,
1396     "Caucasian_Albanian"     => 104,
1397     "Duployan"               => 105,
1398     "Elbasan"                => 106,
1399     "Grantha"                => 107,
1400     "Khojki"                 => 108,
1401     "Khudawadi"              => 109,
1402     "Linear_A"               => 110,
1403     "Mahajani"               => 111,
1404     "Manichaean"             => 112,
1405     "Mende_Kikakui"          => 113,
1406     "Modi"                   => 114,
1407     "Mro"                    => 115,
1408     "Nabataean"              => 116,
1409     "Old_North_Arabian"      => 117,
1410     "Old_Permic"             => 118,
1411     "Pahawh_Hmong"           => 119,
1412     "Palmyrene"              => 120,
1413     "Pau_Cin_Hau"            => 121,
1414     "Psalter_Pahlavi"        => 122,
1415     "Siddham"                => 123,
1416     "Tirhuta"                => 124,
1417     "Warang_Citi"            => 125,
1418     # Win10 RS1
1419     "Adlam"                  => 126,
1420     "Ahom"                   => 127,
1421     "Anatolian_Hieroglyphs"  => 128,
1422     "Bhaiksuki"              => 129,
1423     "Hatran"                 => 130,
1424     "Marchen"                => 131,
1425     "Multani"                => 132,
1426     "Newa"                   => 133,
1427     "Old_Hungarian"          => 134,
1428     "Osage"                  => 135,
1429     "SignWriting"            => 136,
1430     "Tangut"                 => 137,
1431     # Win10 RS4
1432     "Masaram_Gondi"          => 138,
1433     "Nushu"                  => 139,
1434     "Soyombo"                => 140,
1435     "Zanabazar_Square"       => 141,
1436     # Win10 1903
1437     "Dogra"                  => 142,
1438     "Gunjala_Gondi"          => 143,
1439     "Hanifi_Rohingya"        => 144,
1440     "Makasar"                => 145,
1441     "Medefaidrin"            => 146,
1442     "Old_Sogdian"            => 147,
1443     "Sogdian"                => 148,
1444     # Win10 2004
1445     "Elymaic"                => 149,
1446     "Nyiakeng_Puachue_Hmong" => 150,
1447     "Nandinagari"            => 151,
1448     "Wancho"                 => 152,
1449 );
1450
1451 ################################################################
1452 # dump Script IDs table
1453 sub dump_scripts($)
1454 {
1455     my $filename = shift;
1456     my $header = $filename;
1457     my @scripts_table;
1458     my $script_index;
1459     my $i;
1460
1461     my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1462     # Fill the table
1463     # Unknown script id is always 0, so undefined scripts are automatically treated as such
1464     while (<$INPUT>)
1465     {
1466         my $type = "";
1467
1468         next if /^\#/;  # skip comments
1469         next if /^\s*$/;  # skip empty lines
1470         next if /\x1a/;  # skip ^Z
1471         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1472         {
1473             $type = $2;
1474             if (defined $scripts{$type})
1475             {
1476                 $scripts_table[hex $1] = $scripts{$type};
1477             }
1478             next;
1479         }
1480         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1481         {
1482             $type = $3;
1483             if (defined $scripts{$type})
1484             {
1485                 foreach my $i (hex $1 .. hex $2)
1486                 {
1487                     $scripts_table[$i] = $scripts{$type};
1488                 }
1489             }
1490             next;
1491         }
1492     }
1493
1494     close $INPUT;
1495
1496     $header = "$filename.h";
1497     open OUTPUT,">$header.new" or die "Cannot create $header";
1498     print "Building $header\n";
1499     print OUTPUT "/* Unicode Script IDs */\n";
1500     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1501     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1502
1503     print OUTPUT "enum unicode_script_id {\n";
1504     foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1505     {
1506         print OUTPUT "    Script_$script = $scripts{$script},\n";
1507     }
1508     print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1509     print OUTPUT "};\n";
1510
1511     close OUTPUT;
1512     save_file($header);
1513
1514     $filename = "$filename.c";
1515     open OUTPUT,">$filename.new" or die "Cannot create $header";
1516     print "Building $filename\n";
1517     print OUTPUT "/* Unicode Script IDs */\n";
1518     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1519     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1520     print OUTPUT "#include \"windef.h\"\n\n";
1521
1522     dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1523     close OUTPUT;
1524     save_file($filename);
1525 }
1526
1527 ################################################################
1528 # dump the BiDi mirroring table
1529 sub dump_mirroring($)
1530 {
1531     my $filename = shift;
1532     my @mirror_table = ();
1533
1534     my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1535     while (<$INPUT>)
1536     {
1537         next if /^\#/;  # skip comments
1538         next if /^$/;  # skip empty lines
1539         next if /\x1a/;  # skip ^Z
1540         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1541         {
1542             $mirror_table[hex $1] = hex $2;
1543             next;
1544         }
1545         die "malformed line $_";
1546     }
1547     close $INPUT;
1548
1549     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1550     print "Building $filename\n";
1551     print OUTPUT "/* Unicode BiDi mirroring */\n";
1552     print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1553     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1554     print OUTPUT "#include \"windef.h\"\n\n";
1555     dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1556     close OUTPUT;
1557     save_file($filename);
1558 }
1559
1560 ################################################################
1561 # dump the Bidi Brackets
1562 sub dump_bracket($)
1563 {
1564     my $filename = shift;
1565     my @bracket_table;
1566
1567     my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1568     while (<$INPUT>)
1569     {
1570         next if /^\#/;  # skip comments
1571         next if /^\s*$/;  # skip empty lines
1572         next if /\x1a/;  # skip ^Z
1573         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1574         {
1575             my $type = $3;
1576             die "unknown bracket $type" unless defined $bracket_types{$type};
1577             die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1578             $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1579             $bracket_table[hex $1] += $bracket_types{$type} << 8;
1580             next;
1581         }
1582         die "malformed line $_";
1583     }
1584     close $INPUT;
1585
1586     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1587     print "Building $filename\n";
1588     print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1589     print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1590     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1591     print OUTPUT "#include \"windef.h\"\n\n";
1592
1593     dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1594
1595     close OUTPUT;
1596     save_file($filename);
1597 }
1598
1599 ################################################################
1600 # dump the Arabic shaping table
1601 sub dump_shaping($)
1602 {
1603     my $filename = shift;
1604     my %groups;
1605     my $next_group = 0;
1606
1607     $groups{"No_Joining_Group"} = $next_group++;
1608
1609     my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1610     while (<$INPUT>)
1611     {
1612         next if /^\#/;  # skip comments
1613         next if /^\s*$/;  # skip empty lines
1614         next if /\x1a/;  # skip ^Z
1615         if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1616         {
1617             my $type = $2;
1618             my $group = $3;
1619             $groups{$group} = $next_group++ unless defined $groups{$group};
1620             $joining_table[hex $1] = $joining_types{$type} | ($groups{$group} << 8);
1621             next;
1622         }
1623         die "malformed line $_";
1624     }
1625     close $INPUT;
1626
1627     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1628     print "Building $filename\n";
1629     print OUTPUT "/* Unicode Arabic shaping */\n";
1630     print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1631     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1632     print OUTPUT "#include \"windef.h\"\n\n";
1633
1634     dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1635
1636     print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1637     for (my $i = 0x600; $i <= 0x6ff; $i++)
1638     {
1639         printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1640             ${joining_forms{"isolated"}}[$i] || $i,
1641             ${joining_forms{"final"}}[$i] || $i,
1642             ${joining_forms{"initial"}}[$i] || $i,
1643             ${joining_forms{"medial"}}[$i] || $i;
1644     }
1645     print OUTPUT "};\n";
1646
1647     close OUTPUT;
1648     save_file($filename);
1649 }
1650
1651 ################################################################
1652 # dump the Vertical Orientation table
1653 sub dump_vertical($)
1654 {
1655     my $filename = shift;
1656     my @vertical_table;
1657
1658     my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1659     while (<$INPUT>)
1660     {
1661         next if /^\#/;  # skip comments
1662         next if /^\s*$/;  # skip empty lines
1663         next if /\x1a/;  # skip ^Z
1664         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1665         {
1666             my $type = $2;
1667             die "unknown vertical $type" unless defined $vertical_types{$type};
1668             if (hex $1 < 65536)
1669             {
1670                 $vertical_table[hex $1] = $vertical_types{$type};
1671             }
1672             next;
1673         }
1674         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1675         {
1676             my $type = $3;
1677             die "unknown vertical $type" unless defined $vertical_types{$type};
1678             foreach my $i (hex $1 .. hex $2)
1679             {
1680                 $vertical_table[$i] = $vertical_types{$type};
1681             }
1682             next;
1683         }
1684         die "malformed line $_";
1685     }
1686     close $INPUT;
1687
1688     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1689     print "Building $filename\n";
1690     print OUTPUT "/* Unicode Vertical Orientation */\n";
1691     print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1692     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1693     print OUTPUT "#include \"windef.h\"\n\n";
1694
1695     dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1696
1697     close OUTPUT;
1698     save_file($filename);
1699 }
1700
1701 ################################################################
1702 # dump the digit folding tables
1703 sub dump_digit_folding($)
1704 {
1705     my ($filename) = shift;
1706     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1707     print "Building $filename\n";
1708     print OUTPUT "/* Unicode digit folding mappings */\n";
1709     print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1710     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1711     print OUTPUT "#include \"windef.h\"\n\n";
1712
1713     dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1714     close OUTPUT;
1715     save_file($filename);
1716 }
1717
1718
1719 ################################################################
1720 # dump the case mapping tables
1721 sub dump_case_mappings($)
1722 {
1723     my $filename = shift;
1724     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1725     print "Building $filename\n";
1726     print OUTPUT "/* Unicode case mappings */\n";
1727     print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1728     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1729     print OUTPUT "#include \"windef.h\"\n\n";
1730
1731     my @upper = @toupper_table;
1732     my @lower = @tolower_table;
1733     remove_linguistic_mappings( \@upper, \@lower );
1734
1735     dump_case_table( "wine_casemap_lower", @lower );
1736     print OUTPUT "\n";
1737     dump_case_table( "wine_casemap_upper", @upper );
1738     close OUTPUT;
1739     save_file($filename);
1740 }
1741
1742
1743 ################################################################
1744 # dump a case mapping table
1745 sub dump_case_table($@)
1746 {
1747     my ($name,@table) = @_;
1748
1749     for (my $i = 0; $i < 65536; $i++)
1750     {
1751         next unless defined $table[$i];
1752         $table[$i] = ($table[$i] - $i) & 0xffff;
1753     }
1754
1755     my @array = compress_array( 256, 0, @table[0..65535] );
1756
1757     printf OUTPUT "const WCHAR %s[%d] =\n", $name, scalar @array;
1758     printf OUTPUT "{\n    /* index */\n";
1759     printf OUTPUT "%s,\n", dump_array( 16, 0, @array[0..255] );
1760     printf OUTPUT "    /* data */\n";
1761     printf OUTPUT "%s", dump_array( 16, 0, @array[256..$#array] );
1762     printf OUTPUT "\n};\n";
1763 }
1764
1765 ################################################################
1766 # compress a mapping table by removing identical rows
1767 sub compress_array($$@)
1768 {
1769     my $rows = shift;
1770     my $def = shift;
1771     my @table = @_;
1772     my $len = @table / $rows;
1773     my @array;
1774     my $data = "";
1775
1776     # try to merge table rows
1777     for (my $row = 0; $row < $rows; $row++)
1778     {
1779         my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1780         my $pos = index $data, $rowtxt;
1781         if ($pos == -1)
1782         {
1783             # check if the tail of the data can match the start of the new row
1784             my $first = substr( $rowtxt, 0, 1 );
1785             for (my $i = length($data) - 1; $i > 0; $i--)
1786             {
1787                 $pos = index( substr( $data, -$i ), $first );
1788                 last if $pos == -1;
1789                 $i -= $pos;
1790                 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1791                 substr( $data, -$i ) = "";
1792                 last;
1793             }
1794             $pos = length $data;
1795             $data .= $rowtxt;
1796         }
1797         $array[$row] = $rows + $pos;
1798     }
1799     return @array, unpack "U*", $data;
1800 }
1801
1802 ################################################################
1803 # dump a simple char -> 16-bit value mapping table
1804 sub dump_simple_mapping($$@)
1805 {
1806     my $name = shift;
1807     my $def = shift;
1808     my @array = compress_array( 256, $def, @_[0..65535] );
1809
1810     printf OUTPUT "const unsigned short %s[%d] =\n{\n", $name, $#array+1;
1811     printf OUTPUT "    /* offsets */\n%s,\n", dump_array( 16, 0, @array[0..255] );
1812     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( 16, 0, @array[256..$#array] );
1813 }
1814
1815 ################################################################
1816 # dump a char -> 16-bit value mapping table using two-level tables
1817 sub dump_two_level_mapping($$@)
1818 {
1819     my $name = shift;
1820     my $def = shift;
1821     my $size = shift;
1822     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1823     my @row_array = compress_array( 4096, $def, @_[0..65535] );
1824     my @array = compress_array( 256, 0, @row_array[0..4095] );
1825
1826     for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1827
1828     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1829     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1830     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1831     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1832 }
1833
1834 ################################################################
1835 # dump a char -> value mapping table using three-level tables
1836 sub dump_three_level_mapping($$@)
1837 {
1838     my $name = shift;
1839     my $def = shift;
1840     my $size = shift;
1841     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1842     my $level3 = ($MAX_CHAR + 1) / 16;
1843     my $level2 = $level3 / 16;
1844     my $level1 = $level2 / 16;
1845     my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1846     my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1847     my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1848
1849     for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1850     for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1851
1852     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1853     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1854     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1855     printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1856     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1857 }
1858
1859 ################################################################
1860 # dump a binary case mapping table in l_intl.nls format
1861 sub dump_binary_case_table(@)
1862 {
1863     my (@table) = @_;
1864     my $max_char = 0x10000;
1865     my $level1 = $max_char / 16;
1866     my $level2 = $level1 / 16;
1867
1868     my @difftable;
1869     for (my $i = 0; $i < @table; $i++)
1870     {
1871         next unless defined $table[$i];
1872         $difftable[$i] = ($table[$i] - $i) & 0xffff;
1873     }
1874
1875     my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1876     my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1877     my $offset = @array - $level1;
1878     for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1879     return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1880 }
1881
1882 ################################################################
1883 # dump case mappings for l_intl.nls
1884 sub dump_intl_nls($)
1885 {
1886     my @upper_table = @toupper_table;
1887     my @lower_table = @tolower_table;
1888     remove_linguistic_mappings( \@upper_table, \@lower_table );
1889
1890     my $upper = dump_binary_case_table( @upper_table );
1891     my $lower = dump_binary_case_table( @lower_table );
1892
1893     my $filename = shift;
1894     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1895     printf "Building $filename\n";
1896
1897     binmode OUTPUT;
1898     print OUTPUT pack "S<", 1;  # version
1899     print OUTPUT $upper;
1900     print OUTPUT $lower;
1901     close OUTPUT;
1902     save_file($filename);
1903 }
1904
1905
1906 ################################################################
1907 # dump the bidi direction table
1908 sub dump_bidi_dir_table($)
1909 {
1910     my $filename = shift;
1911     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1912     printf "Building $filename\n";
1913     printf OUTPUT "/* Unicode BiDi direction table */\n";
1914     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1915     printf OUTPUT "#include \"windef.h\"\n\n";
1916
1917     my @table;
1918
1919     for (my $i = 0; $i < 65536; $i++)
1920     {
1921         $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1922     }
1923
1924     dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1925
1926     close OUTPUT;
1927     save_file($filename);
1928 }
1929
1930
1931 ################################################################
1932 # dump the ctype tables
1933 sub dump_ctype_tables($)
1934 {
1935     my $filename = shift;
1936     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1937     printf "Building $filename\n";
1938     printf OUTPUT "/* Unicode ctype tables */\n";
1939     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1940     printf OUTPUT "#include \"windef.h\"\n\n";
1941
1942     my @table = map { ($_ || 0) & 0xffff; } @category_table;
1943
1944     # add the direction in the high 4 bits of the category
1945     for (my $i = 0; $i < 65536; $i++)
1946     {
1947         $table[$i] |= $directions{$direction_table[$i]} << 12 if defined $direction_table[$i];
1948     }
1949
1950     dump_simple_mapping( "wine_wctype_table", 0, @table );
1951
1952     close OUTPUT;
1953     save_file($filename);
1954 }
1955
1956
1957 sub rol($$)
1958 {
1959     my ($byte, $count) = @_;
1960     return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1961 }
1962
1963 ################################################################
1964 # compress the character properties table
1965 sub compress_char_props_table($@)
1966 {
1967     my $rows = shift;
1968     my @table = @_;
1969     my $len = @table / $rows;
1970     my $pos = 0;
1971     my @array = (0) x $rows;
1972     my %sequences;
1973
1974     # add some predefined sequences
1975     foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1976
1977     # try to merge table rows
1978     for (my $row = 0; $row < $rows; $row++)
1979     {
1980         my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1981         my $rowtxt = pack "L*", @table_row;
1982         if (defined($sequences{$rowtxt}))
1983         {
1984             # reuse an existing row
1985             $array[$row] = $sequences{$rowtxt};
1986         }
1987         else
1988         {
1989             # create a new row
1990             $sequences{$rowtxt} = $array[$row] = ++$pos;
1991             push @array, @table_row;
1992         }
1993     }
1994     return @array;
1995 }
1996
1997 ################################################################
1998 # dump a normalization table in binary format
1999 sub dump_norm_table($)
2000 {
2001     my $filename = shift;
2002
2003     my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2004     my %decomp = ( "nfc" => \@decomp_table,
2005                    "nfd" => \@decomp_table,
2006                    "nfkc" => \@decomp_compat_table,
2007                    "nfkd" => \@decomp_compat_table ,
2008                    "idna" => \@idna_decomp_table );
2009
2010     open OUTPUT,">$filename.new" or die "Cannot create $filename";
2011     print "Building $filename\n";
2012
2013     my $type = $filename;
2014     $type =~ s!.*/norm(\w+)\.nls!$1!;
2015
2016     my $compose = $forms{$type} & 1;
2017     my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2018
2019     my @version = split /\./, $UNIVERSION;
2020
2021     # combining classes
2022
2023     my @classes;
2024     my @class_values;
2025
2026     foreach my $c (grep defined, @combining_class_table)
2027     {
2028         $classes[$c] = 1 if $c < 0x100;
2029     }
2030     for (my $i = 0; $i < @classes; $i++)
2031     {
2032         next unless defined $classes[$i];
2033         $classes[$i] = @class_values;
2034         push @class_values, $i;
2035     }
2036     push @class_values, 0 if (@class_values % 2);
2037     die "too many classes" if @class_values >= 0x40;
2038
2039     # character properties
2040
2041     my @char_props;
2042     my @decomposed;
2043     my @comp_hash_table;
2044     my $comp_hash_size = $compose ? 254 : 0;
2045
2046     for (my $i = 0; $i <= $MAX_CHAR; $i++)
2047     {
2048         next unless defined $combining_class_table[$i];
2049         if (defined $decomp{$type}->[$i])
2050         {
2051             my @dec = get_decomposition( $i, $decomp{$type} );
2052             if ($compose && (my @comp = get_composition( $i, $compat )))
2053             {
2054                 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2055                 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2056
2057                 my $val = 0;
2058                 foreach my $d (@dec)
2059                 {
2060                     $val = $combining_class_table[$d];
2061                     last if $val;
2062                 }
2063                 $char_props[$i] = $classes[$val];
2064             }
2065             else
2066             {
2067                 $char_props[$i] = 0xbf;
2068             }
2069             @dec = compose_hangul( @dec ) if $compose;
2070             @dec = to_utf16( @dec );
2071             push @dec, 0 if @dec >= 7;
2072             $decomposed[$i] = \@dec;
2073         }
2074         else
2075         {
2076             if ($combining_class_table[$i] == 0x100)
2077             {
2078                 $char_props[$i] = 0x7f;
2079             }
2080             elsif ($combining_class_table[$i])
2081             {
2082                 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2083             }
2084             elsif ($type eq "idna" && defined $idna_disallowed[$i])
2085             {
2086                 $char_props[$i] = 0xff;
2087             }
2088             else
2089             {
2090                 $char_props[$i] = 0;
2091             }
2092         }
2093     }
2094
2095     if ($compose)
2096     {
2097         for (my $i = 0; $i <= $MAX_CHAR; $i++)
2098         {
2099             my @comp = get_composition( $i, $compat );
2100             next unless @comp;
2101             if ($combining_class_table[$comp[1]])
2102             {
2103                 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2104                 $char_props[$comp[1]] |= 0x40;
2105             }
2106             else
2107             {
2108                 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2109                 $char_props[$comp[1]] |= 0xc0;
2110             }
2111         }
2112     }
2113
2114     # surrogates
2115     foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2116     foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2117
2118     # Hangul
2119     if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2120     elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2121     foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2122
2123     # invalid chars
2124     if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2125     foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2126     foreach my $i (0x00..0x10)
2127     {
2128         $char_props[($i << 16) | 0xfffe] = 0xff;
2129         $char_props[($i << 16) | 0xffff] = 0xff;
2130     }
2131
2132     # decomposition hash table
2133
2134     my @decomp_hash_table;
2135     my @decomp_hash_index;
2136     my @decomp_hash_data;
2137     my $decomp_hash_size = 944;
2138
2139     # build string of character data, reusing substrings when possible
2140     my $decomp_char_data = "";
2141     foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2142     {
2143         my $str = pack "U*", @{$i};
2144         $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2145     }
2146     for (my $i = 0; $i < @decomposed; $i++)
2147     {
2148         next unless defined $decomposed[$i];
2149         my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2150         die "sequence not found" if $pos == -1;
2151         my $len = @{$decomposed[$i]};
2152         $len = 7 if $len > 7;
2153         my $hash = $i % $decomp_hash_size;
2154         push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2155     }
2156     for (my $i = 0; $i < $decomp_hash_size; $i++)
2157     {
2158         $decomp_hash_index[$i] = @decomp_hash_data / 2;
2159         next unless defined $decomp_hash_table[$i];
2160         if (@{$decomp_hash_table[$i]} == 1)
2161         {
2162             my $entry = $decomp_hash_table[$i]->[0];
2163             if ($char_props[$entry->[0]] == 0xbf)
2164             {
2165                 $decomp_hash_index[$i] = $entry->[1];
2166                 next;
2167             }
2168         }
2169         foreach my $entry (@{$decomp_hash_table[$i]})
2170         {
2171             push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2172         }
2173     }
2174     push @decomp_hash_data, 0, 0;
2175
2176     # composition hash table
2177
2178     my @comp_hash_index;
2179     my @comp_hash_data;
2180     if (@comp_hash_table)
2181     {
2182         for (my $i = 0; $i < $comp_hash_size; $i++)
2183         {
2184             $comp_hash_index[$i] = @comp_hash_data;
2185             push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2186         }
2187         $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2188         push @comp_hash_data, 0, 0, 0;
2189     }
2190
2191     my $level1 = ($MAX_CHAR + 1) / 128;
2192     my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2193
2194     my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2195                    0, $decomp_hash_size, $comp_hash_size, 0 );
2196     my @tables = (0) x 8;
2197
2198     $tables[0] = 16 + @header + @tables;
2199     $tables[1] = $tables[0] + @class_values / 2;
2200     $tables[2] = $tables[1] + $level1 / 2;
2201     $tables[3] = $tables[2] + (@rows - $level1) / 2;
2202     $tables[4] = $tables[3] + @decomp_hash_index;
2203     $tables[5] = $tables[4] + @decomp_hash_data;
2204     $tables[6] = $tables[5] + length $decomp_char_data;
2205     $tables[7] = $tables[6] + @comp_hash_index;
2206
2207     print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2208     print OUTPUT pack "S<*", @header;
2209     print OUTPUT pack "S<*", @tables;
2210     print OUTPUT pack "C*", @class_values;
2211
2212     print OUTPUT pack "C*", @rows[0..$level1-1];
2213     print OUTPUT pack "C*", @rows[$level1..$#rows];
2214     print OUTPUT pack "S<*", @decomp_hash_index;
2215     print OUTPUT pack "S<*", @decomp_hash_data;
2216     print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2217     print OUTPUT pack "S<*", @comp_hash_index;
2218     print OUTPUT pack "S<*", @comp_hash_data;
2219
2220     close OUTPUT;
2221     save_file($filename);
2222
2223     add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2224 }
2225
2226
2227 ################################################################
2228 # output a codepage definition file from the global tables
2229 sub output_codepage_file($)
2230 {
2231     my $codepage = shift;
2232
2233     my $output = sprintf "nls/c_%03d.nls", $codepage;
2234     open OUTPUT,">$output.new" or die "Cannot create $output";
2235
2236     printf "Building %s\n", $output;
2237     if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2238     else { dump_binary_dbcs_table( $codepage ); }
2239
2240     close OUTPUT;
2241     save_file($output);
2242
2243     add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2244 }
2245
2246 ################################################################
2247 # output a codepage table from a Microsoft-style mapping file
2248 sub dump_msdata_codepage($)
2249 {
2250     my $filename = shift;
2251
2252     my $state = "";
2253     my ($codepage, $width, $count);
2254     my ($lb_cur, $lb_end);
2255
2256     @cp2uni = ();
2257     @glyph2uni = ();
2258     @lead_bytes = ();
2259     @uni2cp = ();
2260     $default_char = $DEF_CHAR;
2261     $default_wchar = $DEF_CHAR;
2262
2263     my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2264
2265     while (<$INPUT>)
2266     {
2267         next if /^;/;  # skip comments
2268         next if /^\s*$/;  # skip empty lines
2269         next if /\x1a/;  # skip ^Z
2270         last if /^ENDCODEPAGE/;
2271
2272         if (/^CODEPAGE\s+(\d+)/)
2273         {
2274             $codepage = $1;
2275             next;
2276         }
2277         if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2278         {
2279             $width = $1;
2280             $default_char = hex $2;
2281             $default_wchar = hex $3;
2282             next;
2283         }
2284         if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2285         {
2286             $state = $1;
2287             $count = $2;
2288             next;
2289         }
2290         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2291         {
2292             if ($state eq "MBTABLE")
2293             {
2294                 my $cp = hex $1;
2295                 my $uni = hex $2;
2296                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2297                 next;
2298             }
2299             if ($state eq "GLYPHTABLE")
2300             {
2301                 my $cp = hex $1;
2302                 my $uni = hex $2;
2303                 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2304                 next;
2305             }
2306             if ($state eq "WCTABLE")
2307             {
2308                 my $uni = hex $1;
2309                 my $cp = hex $2;
2310                 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2311                 next;
2312             }
2313             if ($state eq "DBCSRANGE")
2314             {
2315                 my $start = hex $1;
2316                 my $end = hex $2;
2317                 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2318                 $lb_cur = $start;
2319                 $lb_end = $end;
2320                 next;
2321             }
2322             if ($state eq "DBCSTABLE")
2323             {
2324                 my $mb = hex $1;
2325                 my $uni = hex $2;
2326                 my $cp = ($lb_cur << 8) | $mb;
2327                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2328                 if (!--$count)
2329                 {
2330                     if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2331                 }
2332                 next;
2333             }
2334         }
2335         die "$filename: Unrecognized line $_\n";
2336     }
2337     close $INPUT;
2338
2339     output_codepage_file( $codepage );
2340 }
2341
2342 ################################################################
2343 # align a string length
2344 sub align_string($$)
2345 {
2346     my ($align, $str) = @_;
2347     $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2348     return $str;
2349 }
2350
2351 ################################################################
2352 # pack a GUID string
2353 sub pack_guid($)
2354 {
2355     $_ = shift;
2356     /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2357     return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2358 }
2359
2360 ################################################################
2361 # comparison function for compression sort
2362 sub cmp_compression
2363 {
2364     return scalar @{$a} <=> scalar @{$b} ||
2365         $a->[4] <=> $b->[4] ||
2366         $a->[5] <=> $b->[5] ||
2367         $a->[6] <=> $b->[6] ||
2368         $a->[7] <=> $b->[7] ||
2369         $a->[8] <=> $b->[8] ||
2370         $a->[9] <=> $b->[9] ||
2371         $a->[10] <=> $b->[10] ||
2372         $a->[11] <=> $b->[11] ||
2373         $a->[12] <=> $b->[12];
2374 }
2375
2376 ################################################################
2377 # build a binary sort keys table
2378 sub dump_sortkey_table($$)
2379 {
2380     my ($filename, $download) = @_;
2381
2382     my @keys;
2383     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2384     my @multiple_weights;
2385     my @expansions;
2386     my @compressions;
2387     my %exceptions;
2388     my %guids;
2389     my %compr_flags;
2390     my %locales;
2391     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2392     my $jamostr = "";
2393
2394     my $re_hex = '0x[0-9A-Fa-f]+';
2395     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2396     $guids{$default_guid} = { };
2397
2398     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2399
2400     my $KEYS = open_data_file( $MSDATA, $download );
2401
2402     printf "Building $filename\n";
2403
2404     while (<$KEYS>)
2405     {
2406         s/\s*;.*$//;
2407         next if /^\s*$/;  # skip empty lines
2408         if (/^\s*(SORTKEY|SORTTABLES)/)
2409         {
2410             $part = $1;
2411             next;
2412         }
2413         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2414         {
2415             $part = $section = "";
2416             next;
2417         }
2418         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2419         {
2420             $section = $1;
2421             $guid = undef;
2422             next;
2423         }
2424         next unless $part;
2425         if ("$part.$section" eq "SORTKEY.DEFAULT")
2426         {
2427             if (/^\s*($re_hex)\s+$re_key/)
2428             {
2429                 $keys[hex $1] = [ split(/\s+/,$2) ];
2430                 next;
2431             }
2432         }
2433         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2434         {
2435             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2436             {
2437                 $version = hex $1;
2438                 next;
2439             }
2440             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2441             {
2442                 # ignore for now
2443                 next;
2444             }
2445         }
2446         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2447                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2448                "$part.$section" eq "SORTTABLES.INVERSECASING")
2449         {
2450             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2451             {
2452                 $guid = lc $1;
2453                 $guids{$guid} = { } unless defined $guids{$guid};
2454                 $guids{$guid}->{flags} |= $flags{$section};
2455                 next;
2456             }
2457             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2458             {
2459                 $locales{$1} = $guid;
2460                 next;
2461             }
2462         }
2463         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2464         {
2465             if (/^\s*(\d+)\s+(\d+)/)
2466             {
2467                 push @multiple_weights, $1, $2;
2468                 next;
2469             }
2470         }
2471         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2472         {
2473             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2474             {
2475                 my $pos = scalar @expansions / 2;
2476                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2477                 push @expansions, hex $2, hex $3;
2478                 next;
2479             }
2480         }
2481         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2482         {
2483             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2484             {
2485                 $keys[hex $1] = $keys[hex $2];
2486                 next;
2487             }
2488         }
2489         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2490         {
2491             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2492             {
2493                 if ($subsection || !$guid)  # start a new one
2494                 {
2495                     $guid = lc $1;
2496                     $subsection = "";
2497                     $guids{$guid} = { } unless defined $guids{$guid};
2498                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2499                     $guids{$guid}->{compr} = @compressions;
2500                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2501                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2502                     push @compressions, [ ];
2503                 }
2504                 else  # merge with current one
2505                 {
2506                     $guids{lc $1} = { } unless defined $guids{lc $1};
2507                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2508                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2509                     $compr_flags{lc $1} = $compr_flags{$guid};
2510                 }
2511                 next;
2512             }
2513             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2514             {
2515                 $locales{$1} = $guid;
2516                 next;
2517             }
2518             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2519             {
2520                 $subsection = $1;
2521                 next;
2522             }
2523             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2524             {
2525                 my @comp = map { hex $_; } split(/\s+/,$1);
2526                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2527                 # add compression flags
2528                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2529                 next;
2530             }
2531         }
2532         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2533         {
2534             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2535             {
2536                 $guid = lc $1;
2537                 $guids{$guid} = { } unless defined $guids{lc $1};
2538                 $ling_flag = ($2 ? "+" : "-");
2539                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2540                 next;
2541             }
2542             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2543             {
2544                 $locales{$1} = $guid;
2545                 next;
2546             }
2547             if (/^\s*($re_hex)\s+$re_key/)
2548             {
2549                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2550                 next;
2551             }
2552         }
2553         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2554         {
2555             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2556             {
2557                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2558                 next;
2559             }
2560         }
2561         die "$download: $part.$section: unrecognized line $_\n";
2562     }
2563     close $KEYS;
2564
2565     # Sortkey table
2566
2567     my $table;
2568     for (my $i = 0; $i < 0x10000; $i++)
2569     {
2570         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2571         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2572     }
2573
2574     foreach my $id (sort keys %exceptions)
2575     {
2576         my $pos = length($table) / 4;
2577         my @exc = @{$exceptions{$id}};
2578         my @filled;
2579         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2580         my $guid = substr( $id, 0, -1 );
2581         $guids{$guid}->{$key} = $pos;
2582         $pos += 0x100;
2583         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2584         for (my $j = 0; $j < 0x10000; $j++)
2585         {
2586             next unless defined $exc[$j] || defined $flags[$j];
2587             $filled[$j >> 8] = 1;
2588             $j |= 0xff;
2589         }
2590         for (my $j = 0; $j < 0x100; $j++)
2591         {
2592             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2593             $pos += 0x100 if $filled[$j];
2594         }
2595         for (my $j = 0; $j < 0x10000; $j++)
2596         {
2597             next unless $filled[$j >> 8];
2598             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2599             $k[3] |= $flags[$j] || 0;
2600             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2601         }
2602     }
2603
2604     # Case mapping tables
2605
2606     # standard table
2607     my @casemaps;
2608     my @upper = @toupper_table;
2609     my @lower = @tolower_table;
2610     remove_linguistic_mappings( \@upper, \@lower );
2611     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2612
2613     # linguistic table
2614     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2615
2616     # Turkish table
2617     @upper = @toupper_table;
2618     @lower = @tolower_table;
2619     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2620     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2621     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2622     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2623
2624     # Char type table
2625
2626     my @table;
2627     my $types = "";
2628     my %typestr;
2629     for (my $i = 0; $i < 0x10000; $i++)
2630     {
2631         my $str = pack "S<3",
2632             ($category_table[$i] || 0) & 0xffff,
2633             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2634             ($category_table[$i] || 0) >> 16;
2635
2636         if (!defined($typestr{$str}))
2637         {
2638             $typestr{$str} = length($types) / 6;
2639             $types .= $str;
2640         }
2641         $table[$i] = $typestr{$str};
2642     }
2643
2644     my @rows = compress_array( 4096, 0, @table[0..65535] );
2645     my @array = compress_array( 256, 0, @rows[0..4095] );
2646     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2647     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2648
2649     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2650     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2651     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2652
2653     # Sort tables
2654
2655     # guids
2656     my $sorttables = pack "L<2", $version, scalar %guids;
2657     foreach my $id (sort keys %guids)
2658     {
2659         my %guid = %{$guids{$id}};
2660         my $flags = $guid{flags} || 0;
2661         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2662         $sorttables .= pack_guid($id) . pack "L<5",
2663             $flags,
2664             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2665             $guid{except} || 0,
2666             $guid{ling_except} || 0,
2667             $map / 2;
2668     }
2669
2670     # expansions
2671     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2672
2673     # compressions
2674     $sorttables .= pack "L<", scalar @compressions;
2675     my $rowstr = "";
2676     foreach my $c (@compressions)
2677     {
2678         my $pos = length($rowstr) / 2;
2679         my $min = 0xffff;
2680         my $max = 0;
2681         my @lengths = (0) x 8;
2682         foreach my $r (sort cmp_compression @{$c})
2683         {
2684             my @row = @{$r};
2685             $lengths[scalar @row - 6]++;
2686             foreach my $val (@row[4..$#row])
2687             {
2688                 $min = $val if $min > $val;
2689                 $max = $val if $max < $val;
2690             }
2691             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2692             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2693         }
2694         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2695     }
2696     $sorttables .= $rowstr;
2697
2698     # multiple weights
2699     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2700
2701     # jamo sort
2702     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2703
2704     # Locales
2705
2706     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2707     foreach my $loc (sort keys %locales)
2708     {
2709         # skip specific locales that match more general ones
2710         my @parts = split /[-_]/, $loc;
2711         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2712         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2713         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2714     }
2715
2716     # File header
2717
2718     my @header;
2719     $header[0] = 16;
2720     $header[1] = $header[0] + length $table;
2721     $header[2] = $header[1] + length $casemaps;
2722     $header[3] = $header[2] + length $chartypes;
2723
2724     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2725     print OUTPUT pack "L<*", @header;
2726     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2727     close OUTPUT;
2728     save_file($filename);
2729 }
2730
2731
2732 ################################################################
2733 # build the script to create registry keys
2734 sub dump_registry_script($%)
2735 {
2736     my ($filename, %keys) = @_;
2737     my $indent = 1;
2738
2739     printf "Building %s\n", $filename;
2740     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2741     print OUTPUT "HKLM\n{\n";
2742     foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2743     {
2744         printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2745         $indent++;
2746     }
2747     foreach my $k (sort keys %keys)
2748     {
2749         my @subkeys = split /\\/, $k;
2750         my ($def, @vals) = @{$keys{$k}};
2751         for (my $i = 0; $i < @subkeys; $i++)
2752         {
2753             printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2754                 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2755             $indent++;
2756         }
2757         foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2758         for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2759     }
2760     while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2761     close OUTPUT;
2762     save_file($filename);
2763 }
2764
2765
2766 ################################################################
2767 # save a file if modified
2768 sub save_file($)
2769 {
2770     my $file = shift;
2771     if (-f $file && !system "cmp $file $file.new >/dev/null")
2772     {
2773         unlink "$file.new";
2774     }
2775     else
2776     {
2777         rename "$file.new", "$file";
2778     }
2779 }
2780
2781
2782 ################################################################
2783 # main routine
2784
2785 chdir ".." if -f "./make_unicode";
2786 load_data();
2787 dump_case_mappings( "libs/port/casemap.c" );
2788 dump_sortkeys( "dlls/kernelbase/collation.c" );
2789 dump_ctype_tables( "libs/port/wctype.c" );
2790 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2791 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2792 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2793 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2794 dump_mirroring( "dlls/dwrite/mirror.c" );
2795 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2796 dump_bracket( "dlls/dwrite/bracket.c" );
2797 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2798 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2799 dump_linebreak( "dlls/dwrite/linebreak.c" );
2800 dump_scripts( "dlls/dwrite/scripts" );
2801 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2802 dump_vertical( "dlls/gdi32/vertical.c" );
2803 dump_vertical( "dlls/wineps.drv/vertical.c" );
2804 dump_intl_nls("nls/l_intl.nls");
2805 dump_norm_table( "nls/normnfc.nls" );
2806 dump_norm_table( "nls/normnfd.nls" );
2807 dump_norm_table( "nls/normnfkc.nls" );
2808 dump_norm_table( "nls/normnfkd.nls" );
2809 dump_norm_table( "nls/normidna.nls" );
2810 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2811 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2812 dump_eucjp_codepage();
2813 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2814
2815 exit 0;
2816
2817 # Local Variables:
2818 # compile-command: "./make_unicode"
2819 # End: