tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $KSCDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
  30 my $REPORTS = "http://www.unicode.org/reports";
  31 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  32 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  33
  34 # Sort keys file
  35 my $SORTKEYS = "tr10/allkeys.txt";
  36
  37 # Default char for undefined mappings
  38 my $DEF_CHAR = ord '?';
  39
  40 # Last valid Unicode character
  41 my $MAX_CHAR = 0x10ffff;
  42
  43 my @allfiles =
  44 (
  45     "CodpageFiles/037.txt",
  46     "CodpageFiles/437.txt",
  47     "CodpageFiles/500.txt",
  48     "CodpageFiles/708.txt",
  49     "CodpageFiles/720.txt",
  50     "CodpageFiles/737.txt",
  51     "CodpageFiles/775.txt",
  52     "CodpageFiles/850.txt",
  53     "CodpageFiles/852.txt",
  54     "CodpageFiles/855.txt",
  55     "CodpageFiles/857.txt",
  56     "CodpageFiles/860.txt",
  57     "CodpageFiles/861.txt",
  58     "CodpageFiles/862.txt",
  59     "CodpageFiles/863.txt",
  60     "CodpageFiles/864.txt",
  61     "CodpageFiles/865.txt",
  62     "CodpageFiles/866.txt",
  63     "CodpageFiles/869.txt",
  64     "CodpageFiles/874.txt",
  65     "CodpageFiles/875.txt",
  66     "CodpageFiles/932.txt",
  67     "CodpageFiles/936.txt",
  68     "CodpageFiles/949.txt",
  69     "CodpageFiles/950.txt",
  70     "CodpageFiles/1026.txt",
  71     "CodpageFiles/1250.txt",
  72     "CodpageFiles/1251.txt",
  73     "CodpageFiles/1252.txt",
  74     "CodpageFiles/1253.txt",
  75     "CodpageFiles/1254.txt",
  76     "CodpageFiles/1255.txt",
  77     "CodpageFiles/1256.txt",
  78     "CodpageFiles/1257.txt",
  79     "CodpageFiles/1258.txt",
  80     "CodpageFiles/1361.txt",
  81     "CodpageFiles/10000.txt",
  82     "CodpageFiles/10001.txt",
  83     "CodpageFiles/10002.txt",
  84     "CodpageFiles/10003.txt",
  85     "CodpageFiles/10004.txt",
  86     "CodpageFiles/10005.txt",
  87     "CodpageFiles/10006.txt",
  88     "CodpageFiles/10007.txt",
  89     "CodpageFiles/10008.txt",
  90     "CodpageFiles/10010.txt",
  91     "CodpageFiles/10017.txt",
  92     "CodpageFiles/10021.txt",
  93     "CodpageFiles/10029.txt",
  94     "CodpageFiles/10079.txt",
  95     "CodpageFiles/10081.txt",
  96     "CodpageFiles/10082.txt",
  97     "CodpageFiles/20127.txt",
  98     "CodpageFiles/20866.txt",
  99     "CodpageFiles/21866.txt",
 100     "CodpageFiles/28591.txt",
 101     "CodpageFiles/28592.txt",
 102     "CodpageFiles/28593.txt",
 103     "CodpageFiles/28594.txt",
 104     "CodpageFiles/28595.txt",
 105     "CodpageFiles/28596.txt",
 106     "CodpageFiles/28597.txt",
 107     "CodpageFiles/28598.txt",
 108     "CodpageFiles/28599.txt",
 109     "CodpageFiles/28603.txt",
 110     "CodpageFiles/28605.txt",
 111 );
 112
 113
 114 my %ctype =
 115 (
 116      # CT_CTYPE1
 117     "upper"  => 0x0001,
 118     "lower"  => 0x0002,
 119     "digit"  => 0x0004,
 120     "space"  => 0x0008,
 121     "punct"  => 0x0010,
 122     "cntrl"  => 0x0020,
 123     "blank"  => 0x0040,
 124     "xdigit" => 0x0080,
 125     "alpha"  => 0x0100 | 0x80000000,
 126     "defin"  => 0x0200,
 127      # CT_CTYPE3 in high 16 bits
 128     "nonspacing"    => 0x00010000,
 129     "diacritic"     => 0x00020000,
 130     "vowelmark"     => 0x00040000,
 131     "symbol"        => 0x00080000,
 132     "katakana"      => 0x00100000,
 133     "hiragana"      => 0x00200000,
 134     "halfwidth"     => 0x00400000,
 135     "fullwidth"     => 0x00800000,
 136     "ideograph"     => 0x01000000,
 137     "kashida"       => 0x02000000,
 138     "lexical"       => 0x04000000,
 139     "highsurrogate" => 0x08000000,
 140     "lowsurrogate"  => 0x10000000,
 141 );
 142
 143 my %bracket_types =
 144 (
 145     "o" => 0x0000,
 146     "c" => 0x0001,
 147 );
 148
 149 my %indic_types =
 150 (
 151     "Other"    => 0x0000,
 152     "Bindu"    => 0x0001,
 153     "Visarga"  => 0x0002,
 154     "Avagraha" => 0x0003,
 155     "Nukta"    => 0x0004,
 156     "Virama"   => 0x0005,
 157     "Vowel_Independent"  => 0x0006,
 158     "Vowel_Dependent"  => 0x0007,
 159     "Vowel"  => 0x0008,
 160     "Consonant_Placeholder"  => 0x0009,
 161     "Consonant"  => 0x000a,
 162     "Consonant_Dead"  => 0x000b,
 163     "Consonant_Succeeding_Repha" => 0x000c,
 164     "Consonant_Subjoined"  => 0x000d,
 165     "Consonant_Medial"  => 0x000e,
 166     "Consonant_Final"  => 0x000f,
 167     "Consonant_Head_Letter"  => 0x0010,
 168     "Modifying_Letter"  => 0x0011,
 169     "Tone_Letter"  => 0x0012,
 170     "Tone_Mark"  => 0x0013,
 171     "Register_Shifter"  => 0x0014,
 172     "Consonant_Preceding_Repha" => 0x0015,
 173     "Pure_Killer" => 0x0016,
 174     "Invisible_Stacker" => 0x0017,
 175     "Gemination_Mark" => 0x0018,
 176     "Cantillation_Mark" => 0x0019,
 177     "Non_Joiner" => 0x001a,
 178     "Joiner" => 0x001b,
 179     "Number_Joiner" => 0x001c,
 180     "Number" => 0x001d,
 181     "Brahmi_Joining_Number" => 0x001e,
 182     "Consonant_With_Stacker" => 0x001f,
 183     "Consonant_Prefixed" => 0x0020,
 184     "Syllable_Modifier" => 0x0021,
 185     "Consonant_Killer" => 0x0022,
 186     "Consonant_Initial_Postfixed" => 0x0023,
 187 );
 188
 189 my %matra_types =
 190 (
 191     "Right"    => 0x01,
 192     "Left"  => 0x02,
 193     "Visual_Order_Left" => 0x03,
 194     "Left_And_Right"    => 0x04,
 195     "Top"   => 0x05,
 196     "Bottom"  => 0x06,
 197     "Top_And_Bottom"  => 0x07,
 198     "Top_And_Right"  => 0x08,
 199     "Top_And_Left"  => 0x09,
 200     "Top_And_Left_And_Right"  => 0x0a,
 201     "Bottom_And_Right"  => 0x0b,
 202     "Top_And_Bottom_And_Right"  => 0x0c,
 203     "Overstruck"  => 0x0d,
 204     "Invisible"  => 0x0e,
 205     "Bottom_And_Left"  => 0x0f,
 206     "Top_And_Bottom_And_Left"  => 0x10,
 207 );
 208
 209 my %break_types =
 210 (
 211     "BK"  => 0x0001,
 212     "CR"  => 0x0002,
 213     "LF"  => 0x0003,
 214     "CM"  => 0x0004,
 215     "SG"  => 0x0005,
 216     "GL"  => 0x0006,
 217     "CB"  => 0x0007,
 218     "SP"  => 0x0008,
 219     "ZW"  => 0x0009,
 220     "NL"  => 0x000a,
 221     "WJ"  => 0x000b,
 222     "JL"  => 0x000c,
 223     "JV"  => 0x000d,
 224     "JT"  => 0x000e,
 225     "H2"  => 0x000f,
 226     "H3"  => 0x0010,
 227     "XX"  => 0x0011,
 228     "OP"  => 0x0012,
 229     "CL"  => 0x0013,
 230     "CP"  => 0x0014,
 231     "QU"  => 0x0015,
 232     "NS"  => 0x0016,
 233     "EX"  => 0x0017,
 234     "SY"  => 0x0018,
 235     "IS"  => 0x0019,
 236     "PR"  => 0x001a,
 237     "PO"  => 0x001b,
 238     "NU"  => 0x001c,
 239     "AL"  => 0x001d,
 240     "ID"  => 0x001e,
 241     "IN"  => 0x001f,
 242     "HY"  => 0x0020,
 243     "BB"  => 0x0021,
 244     "BA"  => 0x0022,
 245     "SA"  => 0x0023,
 246     "AI"  => 0x0024,
 247     "B2"  => 0x0025,
 248     "HL"  => 0x0026,
 249     "CJ"  => 0x0027,
 250     "RI"  => 0x0028,
 251     "EB"  => 0x0029,
 252     "EM"  => 0x002a,
 253     "ZWJ" => 0x002b,
 254 );
 255
 256 my %vertical_types =
 257 (
 258     "R"  => 0x0000,
 259     "U"  => 0x0001,
 260     "Tr" => 0x0002,
 261     "Tu" => 0x0003,
 262 );
 263
 264 my %categories =
 265 (
 266     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 267     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 268     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 269     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 270     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 271     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 272     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 273     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 274     "No" => $ctype{"defin"},                    # Number, Other
 275     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 276     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 277     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 278     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 279     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 280     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 281     "Co" => $ctype{"defin"},                    # Other, Private Use
 282     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 283     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 284     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 285     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 286     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 287     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 288     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 289     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 290     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 291     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 292     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 293     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 294     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 295     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 296 );
 297
 298 # a few characters need additional categories that cannot be determined automatically
 299 my %special_categories =
 300 (
 301     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 302                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 303     "space"  => [ 0x09..0x0d, 0x85 ],
 304     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 305     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 306                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 307                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 308                   0xfff9, 0xfffa, 0xfffb ],
 309     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 310                   0xd7, 0xf7 ],
 311     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 312     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 313     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 314                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 315     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 316     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 317                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 318                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 319     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 320     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 321                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 322                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 323                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 324                      0x3131..0x3164 ],
 325     "ideograph" => [ 0x3006..0x3007 ],
 326     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 327                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 328                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 329                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 330                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 331                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 332                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 333                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 334                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 335                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 336     "kashida" => [ 0x0640 ],
 337 );
 338
 339 my %directions =
 340 (
 341     "L"   => 1,    # Left-to-Right
 342     "R"   => 2,    # Right-to-Left
 343     "AL"  => 12,   # Right-to-Left Arabic
 344     "EN"  => 3,    # European Number
 345     "ES"  => 4,    # European Number Separator
 346     "ET"  => 5,    # European Number Terminator
 347     "AN"  => 6,    # Arabic Number
 348     "CS"  => 7,    # Common Number Separator
 349     "NSM" => 13,   # Non-Spacing Mark
 350     "BN"  => 14,   # Boundary Neutral
 351     "B"   => 8,    # Paragraph Separator
 352     "S"   => 9,    # Segment Separator
 353     "WS"  => 10,   # Whitespace
 354     "ON"  => 11,   # Other Neutrals
 355     "LRE" => 15,   # Left-to-Right Embedding
 356     "LRO" => 15,   # Left-to-Right Override
 357     "RLE" => 15,   # Right-to-Left Embedding
 358     "RLO" => 15,   # Right-to-Left Override
 359     "PDF" => 15,   # Pop Directional Format
 360     "LRI" => 15,   # Left-to-Right Isolate
 361     "RLI" => 15,   # Right-to-Left Isolate
 362     "FSI" => 15,   # First Strong Isolate
 363     "PDI" => 15    # Pop Directional Isolate
 364 );
 365
 366 my %c2_types =
 367 (
 368     "L"   => 1,    # C2_LEFTTORIGHT
 369     "R"   => 2,    # C2_RIGHTTOLEFT
 370     "AL"  => 2,    # C2_RIGHTTOLEFT
 371     "EN"  => 3,    # C2_EUROPENUMBER
 372     "ES"  => 4,    # C2_EUROPESEPARATOR
 373     "ET"  => 5,    # C2_EUROPETERMINATOR
 374     "AN"  => 6,    # C2_ARABICNUMBER
 375     "CS"  => 7,    # C2_COMMONSEPARATOR
 376     "NSM" => 11,   # C2_OTHERNEUTRAL
 377     "BN"  => 0,    # C2_NOTAPPLICABLE
 378     "B"   => 8,    # C2_BLOCKSEPARATOR
 379     "S"   => 9,    # C2_SEGMENTSEPARATOR
 380     "WS"  => 10,   # C2_WHITESPACE
 381     "ON"  => 11,   # C2_OTHERNEUTRAL
 382     "LRE" => 11,   # C2_OTHERNEUTRAL
 383     "LRO" => 11,   # C2_OTHERNEUTRAL
 384     "RLE" => 11,   # C2_OTHERNEUTRAL
 385     "RLO" => 11,   # C2_OTHERNEUTRAL
 386     "PDF" => 11,   # C2_OTHERNEUTRAL
 387     "LRI" => 11,   # C2_OTHERNEUTRAL
 388     "RLI" => 11,   # C2_OTHERNEUTRAL
 389     "FSI" => 11,   # C2_OTHERNEUTRAL
 390     "PDI" => 11    # C2_OTHERNEUTRAL
 391 );
 392
 393 my %bidi_types =
 394 (
 395     "ON"  => 0,    # Other Neutrals
 396     "L"   => 1,    # Left-to-Right
 397     "R"   => 2,    # Right-to-Left
 398     "AN"  => 3,    # Arabic Number
 399     "EN"  => 4,    # European Number
 400     "AL"  => 5,    # Right-to-Left Arabic
 401     "NSM" => 6,    # Non-Spacing Mark
 402     "CS"  => 7,    # Common Number Separator
 403     "ES"  => 8,    # European Number Separator
 404     "ET"  => 9,    # European Number Terminator
 405     "BN"  => 10,   # Boundary Neutral
 406     "S"   => 11,   # Segment Separator
 407     "WS"  => 12,   # Whitespace
 408     "B"   => 13,   # Paragraph Separator
 409     "RLO" => 14,   # Right-to-Left Override
 410     "RLE" => 15,   # Right-to-Left Embedding
 411     "LRO" => 16,   # Left-to-Right Override
 412     "LRE" => 17,   # Left-to-Right Embedding
 413     "PDF" => 18,   # Pop Directional Format
 414     "LRI" => 19,   # Left-to-Right Isolate
 415     "RLI" => 20,   # Right-to-Left Isolate
 416     "FSI" => 21,   # First Strong Isolate
 417     "PDI" => 22    # Pop Directional Isolate
 418 );
 419
 420 my %joining_types =
 421 (
 422    "U" => 0,           # Non_Joining
 423    "L" => 1,           # Left_Joining
 424    "R" => 2,           # Right_Joining
 425    "D" => 3,           # Dual_Joining
 426    "C" => 3,           # Join_Causing
 427    "ALAPH" => 4,       # Syriac ALAPH
 428    "DALATH RISH" => 5, # Syriac DALATH RISH group
 429    "T" => 6,           # Transparent
 430 );
 431
 432 my @cp2uni = ();
 433 my @glyph2uni = ();
 434 my @lead_bytes = ();
 435 my @uni2cp = ();
 436 my @tolower_table = ();
 437 my @toupper_table = ();
 438 my @digitmap_table = ();
 439 my @category_table = ();
 440 my @initial_joining_table = ();
 441 my @direction_table = ();
 442 my @decomp_table = ();
 443 my @combining_class_table = ();
 444 my @decomp_compat_table = ();
 445 my @comp_exclusions = ();
 446 my @idna_decomp_table = ();
 447 my @idna_disallowed = ();
 448 my %registry_keys;
 449 my $default_char;
 450 my $default_wchar;
 451
 452 my %joining_forms =
 453 (
 454    "isolated" => [],
 455    "final" => [],
 456    "initial" => [],
 457    "medial" => []
 458 );
 459
 460 sub to_utf16(@)
 461 {
 462     my @ret;
 463     foreach my $ch (@_)
 464     {
 465         if ($ch < 0x10000)
 466         {
 467             push @ret, $ch;
 468         }
 469         else
 470         {
 471             my $val = $ch - 0x10000;
 472             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 473         }
 474     }
 475     return @ret;
 476 }
 477
 478 ################################################################
 479 # fetch a unicode.org file and open it
 480 sub open_data_file($$)
 481 {
 482     my ($base, $name) = @_;
 483     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 484     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 485     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 486     local *FILE;
 487
 488     if ($base =~ /.*\/([^\/]+)\.zip$/)
 489     {
 490         my $zip = "$1$suffix.zip";
 491         unless (-f "$cache/$zip")
 492         {
 493             system "mkdir", "-p", $cache;
 494             print "Fetching $base...\n";
 495             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 496         }
 497         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 498     }
 499     else
 500     {
 501         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 502         unless (-f $dest)
 503         {
 504             system "mkdir", "-p", $dir;
 505             print "Fetching $base/$name...\n";
 506             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 507         }
 508         open FILE, "<$dest" or die "cannot open $dest";
 509     }
 510     return *FILE;
 511 }
 512
 513 ################################################################
 514 # recursively get the decomposition for a character
 515 sub get_decomposition($$);
 516 sub get_decomposition($$)
 517 {
 518     my ($char, $table) = @_;
 519     my @ret;
 520
 521     return $char unless defined $table->[$char];
 522     foreach my $ch (@{$table->[$char]})
 523     {
 524         push @ret, get_decomposition( $ch, $table );
 525     }
 526     return @ret;
 527 }
 528
 529 ################################################################
 530 # get the composition that results in a given character
 531 sub get_composition($$)
 532 {
 533     my ($ch, $compat) = @_;
 534     return () unless defined $decomp_table[$ch];  # no decomposition
 535     my @ret = @{$decomp_table[$ch]};
 536     return () if @ret < 2;                        # singleton decomposition
 537     return () if $comp_exclusions[$ch];           # composition exclusion
 538     return () if $combining_class_table[$ch];     # non-starter
 539     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 540     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 541         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 542     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 543         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 544     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 545         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 546     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 547     return @ret;
 548 }
 549
 550 ################################################################
 551 # recursively build decompositions
 552 sub build_decompositions(@)
 553 {
 554     my @src = @_;
 555     my @dst;
 556
 557     for (my $i = 0; $i < @src; $i++)
 558     {
 559         next unless defined $src[$i];
 560         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 561         $dst[$i] = \@decomp;
 562     }
 563     return @dst;
 564 }
 565
 566 ################################################################
 567 # compose Hangul sequences
 568 sub compose_hangul(@)
 569 {
 570     my $SBASE  = 0xac00;
 571     my $LBASE  = 0x1100;
 572     my $VBASE  = 0x1161;
 573     my $TBASE  = 0x11a7;
 574     my $LCOUNT = 19;
 575     my $VCOUNT = 21;
 576     my $TCOUNT = 28;
 577     my $NCOUNT = $VCOUNT * $TCOUNT;
 578     my $SCOUNT = $LCOUNT * $NCOUNT;
 579
 580     my @seq = @_;
 581     my @ret;
 582     my $i;
 583
 584     for ($i = 0; $i < @seq; $i++)
 585     {
 586         my $ch = $seq[$i];
 587         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 588             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 589         {
 590             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 591             $i++;
 592         }
 593         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 594             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 595         {
 596             $ch += $seq[$i+1] - $TBASE;
 597             $i++;
 598         }
 599         push @ret, $ch;
 600     }
 601     return @ret;
 602 }
 603
 604 ################################################################
 605 # remove linguistic-only mappings from the case table
 606 sub remove_linguistic_mappings($$)
 607 {
 608     my ($upper, $lower) = @_;
 609
 610     # remove case mappings that don't round-trip
 611
 612     for (my $i = 0; $i < @{$upper}; $i++)
 613     {
 614         next unless defined ${$upper}[$i];
 615         my $ch = ${$upper}[$i];
 616         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 617     }
 618     for (my $i = 0; $i < @{$lower}; $i++)
 619     {
 620         next unless defined ${$lower}[$i];
 621         my $ch = ${$lower}[$i];
 622         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 623     }
 624 }
 625
 626 ################################################################
 627 # read in the Unicode database files
 628 sub load_data()
 629 {
 630     my $start;
 631
 632     # now build mappings from the decomposition field of the Unicode database
 633
 634     my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
 635     while (<$UNICODE_DATA>)
 636     {
 637         # Decode the fields ...
 638         my ($code, $name, $cat, $comb, $bidi,
 639             $decomp, $dec, $dig, $num, $mirror,
 640             $oldname, $comment, $upper, $lower, $title) = split /;/;
 641         my $src = hex $code;
 642
 643         die "unknown category $cat" unless defined $categories{$cat};
 644         die "unknown directionality $bidi" unless defined $directions{$bidi};
 645
 646         $category_table[$src] = $categories{$cat};
 647         $direction_table[$src] = $bidi;
 648         if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
 649         {
 650             $initial_joining_table[$src] = $joining_types{"T"};
 651         }
 652         else
 653         {
 654             $initial_joining_table[$src] = $joining_types{"U"};
 655         }
 656
 657         if ($lower ne "")
 658         {
 659             $tolower_table[$src] = hex $lower;
 660         }
 661         if ($upper ne "")
 662         {
 663             $toupper_table[$src] = hex $upper;
 664         }
 665         if ($dec ne "")
 666         {
 667             $category_table[$src] |= $ctype{"digit"};
 668         }
 669         if ($dig ne "")
 670         {
 671             $digitmap_table[$src] = ord $dig;
 672         }
 673         $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
 674
 675         $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
 676         $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
 677         $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
 678         $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
 679         $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
 680         $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
 681         $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
 682         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
 683         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
 684         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
 685         $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
 686         $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;
 687
 688         # copy the category and direction for everything between First/Last pairs
 689         if ($name =~ /, First>/) { $start = $src; }
 690         if ($name =~ /, Last>/)
 691         {
 692             while ($start < $src)
 693             {
 694                 $category_table[$start] = $category_table[$src];
 695                 $direction_table[$start] = $direction_table[$src];
 696                 $combining_class_table[$start] = $combining_class_table[$src];
 697                 $start++;
 698             }
 699         }
 700
 701         next if $decomp eq "";  # no decomposition, skip it
 702
 703         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
 704         {
 705             my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
 706             $decomp_compat_table[$src] = \@seq;
 707         }
 708
 709         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 710         {
 711             # decomposition of the form "<foo> 1234" -> use char if type is known
 712             if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
 713             {
 714                 ${joining_forms{$1}}[hex $2] = $src;
 715             }
 716         }
 717         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 718         {
 719             # decomposition "<compat> 0020 1234" -> combining accent
 720         }
 721         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 722         {
 723             # store decomposition
 724             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 725             {
 726                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
 727             }
 728             elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
 729             {
 730                 # Single char decomposition
 731                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
 732             }
 733         }
 734     }
 735     close $UNICODE_DATA;
 736
 737     # patch the category of some special characters
 738
 739     for (my $i = 0; $i < @decomp_table; $i++)
 740     {
 741         next unless defined $decomp_table[$i];
 742         $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
 743     }
 744     foreach my $cat (keys %special_categories)
 745     {
 746         my $flag = $ctype{$cat};
 747         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 748     }
 749     for (my $i = 0; $i < @decomp_compat_table; $i++)
 750     {
 751         next unless defined $decomp_compat_table[$i];
 752         next unless @{$decomp_compat_table[$i]} == 2;
 753         $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
 754     }
 755
 756     # load the composition exclusions
 757
 758     my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
 759     while (<$EXCL>)
 760     {
 761         s/\#.*//;  # remove comments
 762         if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
 763         {
 764             foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
 765         }
 766         elsif (/^([0-9a-fA-F]+)\s*$/)
 767         {
 768             $comp_exclusions[hex $1] = 1;
 769         }
 770     }
 771     close $EXCL;
 772
 773     # load the IDNA mappings
 774
 775     @idna_decomp_table = @decomp_compat_table;
 776     my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
 777     while (<$IDNA>)
 778     {
 779         s/\#.*//;  # remove comments
 780         next if /^\s*$/;
 781         my ($char, $type, $mapping) = split /;/;
 782         my ($ch1, $ch2);
 783         if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
 784         {
 785             $ch1 = hex $1;
 786             $ch2 = hex $2;
 787         }
 788         elsif ($char =~ /([0-9a-fA-F]+)/)
 789         {
 790             $ch1 = $ch2 = hex $1;
 791         }
 792
 793         if ($type =~ /mapped/ || $type =~ /deviation/)
 794         {
 795             $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
 796             my @seq = map { hex $_; } split /\s+/, $mapping;
 797             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
 798         }
 799         elsif ($type =~ /valid/)
 800         {
 801         }
 802         elsif ($type =~ /ignored/)
 803         {
 804             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
 805         }
 806         elsif ($type =~ /disallowed/)
 807         {
 808             foreach my $i ($ch1 .. $ch2)
 809             {
 810                 $idna_decomp_table[$i] = undef;
 811                 $idna_disallowed[$i] = 1;
 812             }
 813         }
 814     }
 815     close $IDNA;
 816 }
 817
 818
 819 ################################################################
 820 # add a new registry key
 821 sub add_registry_key($$)
 822 {
 823     my ($key, $defval) = @_;
 824     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 825 }
 826
 827 ################################################################
 828 # add a new registry value
 829 sub add_registry_value($$$)
 830 {
 831     my ($key, $name, $value) = @_;
 832     add_registry_key( $key, undef );
 833     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 834 }
 835
 836 ################################################################
 837 # define a new lead byte
 838 sub add_lead_byte($)
 839 {
 840     my $ch = shift;
 841     return if defined $cp2uni[$ch];
 842     push @lead_bytes, $ch;
 843     $cp2uni[$ch] = 0;
 844 }
 845
 846 ################################################################
 847 # define a new char mapping
 848 sub add_mapping($$)
 849 {
 850     my ($cp, $uni) = @_;
 851     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 852     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 853     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 854 }
 855
 856 ################################################################
 857 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 858 sub get_glyphs_mapping(@)
 859 {
 860     my @table = @_;
 861
 862     for (my $i = 0; $i < @glyph2uni; $i++)
 863     {
 864         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 865     }
 866     return @table;
 867 }
 868
 869 ################################################################
 870 # build EUC-JP table from the JIS 0208/0212 files
 871 sub dump_eucjp_codepage()
 872 {
 873     @cp2uni = ();
 874     @glyph2uni = ();
 875     @lead_bytes = ();
 876     @uni2cp = ();
 877     $default_char = $DEF_CHAR;
 878     $default_wchar = 0x30fb;
 879
 880     # ASCII chars
 881     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 882
 883     # lead bytes
 884     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 885
 886     # JIS X 0201 right plane
 887     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 888
 889     # undefined chars
 890     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 891     $cp2uni[0xa0] = 0xf8f0;
 892     $cp2uni[0xff] = 0xf8f3;
 893
 894     # Fix backslash conversion
 895     add_mapping( 0xa1c0, 0xff3c );
 896
 897     # Add private mappings for rows undefined in JIS 0208/0212
 898     my $private = 0xe000;
 899     foreach my $hi (0xf5 .. 0xfe)
 900     {
 901         foreach my $lo (0xa1 .. 0xfe)
 902         {
 903             add_mapping( ($hi << 8) + $lo, $private++ );
 904         }
 905     }
 906     foreach my $hi (0xf5 .. 0xfe)
 907     {
 908         foreach my $lo (0x21 .. 0x7e)
 909         {
 910             add_mapping( ($hi << 8) + $lo, $private++ );
 911         }
 912     }
 913
 914     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 915     while (<$INPUT>)
 916     {
 917         next if /^\#/;  # skip comments
 918         next if /^$/;  # skip empty lines
 919         next if /\x1a/;  # skip ^Z
 920         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 921         {
 922             add_mapping( 0x8080 + hex $1, hex $2 );
 923             next;
 924         }
 925         die "Unrecognized line $_\n";
 926     }
 927     close $INPUT;
 928
 929     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 930     while (<$INPUT>)
 931     {
 932         next if /^\#/;  # skip comments
 933         next if /^$/;  # skip empty lines
 934         next if /\x1a/;  # skip ^Z
 935         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 936         {
 937             add_mapping( 0x8000 + hex $1, hex $2 );
 938             next;
 939         }
 940         die "Unrecognized line $_\n";
 941     }
 942     close $INPUT;
 943
 944     output_codepage_file( 20932 );
 945 }
 946
 947 ################################################################
 948 # build Korean Wansung table from the KSX1001 file
 949 sub dump_krwansung_codepage(@)
 950 {
 951     my @cp949 = @_;
 952     @cp2uni = ();
 953     @glyph2uni = ();
 954     @lead_bytes = ();
 955     @uni2cp = ();
 956     $default_char = 0x3f;
 957     $default_wchar = 0x003f;
 958
 959     # ASCII and undefined chars
 960     foreach my $i (0x00 .. 0x9f) { add_mapping( $i, $i ); }
 961     add_mapping( 0xa0, 0xf8e6 );
 962     add_mapping( 0xad, 0xf8e7 );
 963     add_mapping( 0xae, 0xf8e8 );
 964     add_mapping( 0xaf, 0xf8e9 );
 965     add_mapping( 0xfe, 0xf8ea );
 966     add_mapping( 0xff, 0xf8eb );
 967
 968     my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
 969     while (<$INPUT>)
 970     {
 971         next if /^\#/;  # skip comments
 972         next if /^$/;  # skip empty lines
 973         next if /\x1a/;  # skip ^Z
 974         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 975         {
 976             add_mapping( 0x8080 + hex $1, hex $2 );
 977             next;
 978         }
 979         die "Unrecognized line $_\n";
 980     }
 981     close $INPUT;
 982
 983     # get some extra mappings from cp 949
 984     my @defined_lb;
 985     map { $defined_lb[$_] = 1; } @lead_bytes;
 986     foreach my $i (0x0000 .. 0xffff)
 987     {
 988         next if ($i >= 0x1100 && $i <= 0x11ff);  # range not used in 20949
 989         next unless defined $cp949[$i];
 990         if ($cp949[$i] >= 0xff)
 991         {
 992             # only add chars for lead bytes that exist in 20949
 993             my $hi = $cp949[$i] >> 8;
 994             my $lo = $cp949[$i] & 0xff;
 995             next unless $defined_lb[$hi];
 996             next unless $lo >= 0xa1 && $lo <= 0xfe;
 997         }
 998         add_mapping( $cp949[$i], $i );
 999     }
1000
1001     output_codepage_file( 20949 );
1002 }
1003
1004 ################################################################
1005 # build the sort keys table
1006 sub dump_sortkeys($)
1007 {
1008     my $filename = shift;
1009     my @sortkeys = ();
1010
1011     my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
1012     while (<$INPUT>)
1013     {
1014         next if /^\#/;  # skip comments
1015         next if /^$/;  # skip empty lines
1016         next if /\x1a/;  # skip ^Z
1017         next if /^\@version/;  # skip @version header
1018         if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1019         {
1020             my ($uni,$variable) = (hex $1, $2);
1021             next if $uni > 65535;
1022             $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
1023             next;
1024         }
1025         if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
1026         {
1027             # multiple character sequence, ignored for now
1028             next;
1029         }
1030         die "$SORTKEYS: Unrecognized line $_\n";
1031     }
1032     close $INPUT;
1033
1034     # compress the keys to 32 bit:
1035     # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
1036
1037     @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
1038                        ${$a}[2] <=> ${$b}[2] or
1039                        ${$a}[3] <=> ${$b}[3] or
1040                        ${$a}[4] <=> ${$b}[4] or
1041                        $a cmp $b; } @sortkeys;
1042
1043     my ($n2, $n3) = (1, 1);
1044     my @keys = (-1, -1, -1, -1, -1 );
1045     my @flatkeys = ();
1046
1047     for (my $i = 0; $i < @sortkeys; $i++)
1048     {
1049         next unless defined $sortkeys[$i];
1050         my @current = @{$sortkeys[$i]};
1051         if ($current[1] == $keys[1])
1052         {
1053             if ($current[2] == $keys[2])
1054             {
1055                 if ($current[3] == $keys[3])
1056                 {
1057                     # nothing
1058                 }
1059                 else
1060                 {
1061                     $keys[3] = $current[3];
1062                     $n3++;
1063                     die if ($n3 >= 16);
1064                 }
1065             }
1066             else
1067             {
1068                 $keys[2] = $current[2];
1069                 $keys[3] = $current[3];
1070                 $n2++;
1071                 $n3 = 1;
1072                 die if ($n2 >= 256);
1073             }
1074         }
1075         else
1076         {
1077             $keys[1] = $current[1];
1078             $keys[2] = $current[2];
1079             $keys[3] = $current[3];
1080             $n2 = 1;
1081             $n3 = 1;
1082         }
1083
1084         if ($current[2]) { $current[2] = $n2; }
1085         if ($current[3]) { $current[3] = $n3; }
1086         if ($current[4]) { $current[4] = 1; }
1087
1088         $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1089     }
1090
1091     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1092     printf "Building $filename\n";
1093     printf OUTPUT "/* Unicode collation element table */\n";
1094     printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1095     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1096     print OUTPUT "#include \"windef.h\"\n\n";
1097
1098     dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1099
1100     close OUTPUT;
1101     save_file($filename);
1102 }
1103
1104
1105 ################################################################
1106 # dump an array of integers
1107 sub dump_array($$@)
1108 {
1109     my ($bit_width, $default, @array) = @_;
1110     my $format = sprintf "0x%%0%ux", $bit_width / 4;
1111     my $i;
1112     my $ret = "    ";
1113     for ($i = 0; $i < $#array; $i++)
1114     {
1115         $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1116         $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
1117     }
1118     $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1119     return $ret;
1120 }
1121
1122
1123 ################################################################
1124 # dump an SBCS mapping table in binary format
1125 sub dump_binary_sbcs_table($)
1126 {
1127     my $codepage = shift;
1128
1129     my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1130     my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1131
1132     print OUTPUT pack "S<*", @header;
1133     print OUTPUT pack "C12", (0) x 12;
1134     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1135
1136     if (@glyph2uni)
1137     {
1138         print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1139     }
1140     else
1141     {
1142         print OUTPUT pack "S<*", 0;
1143     }
1144
1145     print OUTPUT pack "S<*", 0, 0;
1146
1147     print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1148 }
1149
1150
1151 ################################################################
1152 # dump a DBCS mapping table in binary format
1153 sub dump_binary_dbcs_table($)
1154 {
1155     my $codepage = shift;
1156     my @lb_ranges = get_lb_ranges();
1157     my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1158
1159     my @offsets = (0) x 256;
1160     my $pos = 0;
1161     foreach my $i (@lead_bytes)
1162     {
1163         $offsets[$i] = ($pos += 256);
1164         $cp2uni[$i] = 0;
1165     }
1166
1167     my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1168
1169     print OUTPUT pack "S<*", @header;
1170     print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1171     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1172     print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1173
1174     foreach my $i (@lead_bytes)
1175     {
1176         my $base = $i << 8;
1177         print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1178     }
1179
1180     print OUTPUT pack "S<", 4;
1181     print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1182 }
1183
1184
1185 ################################################################
1186 # get the list of defined lead byte ranges
1187 sub get_lb_ranges()
1188 {
1189     my @list = ();
1190     my @ranges = ();
1191
1192     foreach my $i (@lead_bytes) { $list[$i] = 1; }
1193     my $on = 0;
1194     for (my $i = 0; $i < 256; $i++)
1195     {
1196         if ($on)
1197         {
1198             if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1199         }
1200         else
1201         {
1202             if ($list[$i]) { push @ranges, $i; $on = 1; }
1203         }
1204     }
1205     if ($on) { push @ranges, 0xff; }
1206     return @ranges;
1207 }
1208
1209 ################################################################
1210 # dump the Indic Syllabic Category table
1211 sub dump_indic($)
1212 {
1213     my $filename = shift;
1214     my @indic_table;
1215
1216     my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1217     while (<$INPUT>)
1218     {
1219         next if /^\#/;  # skip comments
1220         next if /^\s*$/;  # skip empty lines
1221         next if /\x1a/;  # skip ^Z
1222         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1223         {
1224             my $type = $2;
1225             die "unknown indic $type" unless defined $indic_types{$type};
1226             if (hex $1 < 65536)
1227             {
1228                 $indic_table[hex $1] = $indic_types{$type};
1229             }
1230             next;
1231         }
1232         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1233         {
1234             my $type = $3;
1235             die "unknown indic $type" unless defined $indic_types{$type};
1236             if (hex $1 < 65536 and hex $2 < 65536)
1237             {
1238                 foreach my $i (hex $1 .. hex $2)
1239                 {
1240                     $indic_table[$i] = $indic_types{$type};
1241                 }
1242             }
1243             next;
1244         }
1245         die "malformed line $_";
1246     }
1247     close $INPUT;
1248
1249     $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1250     while (<$INPUT>)
1251     {
1252         next if /^\#/;  # skip comments
1253         next if /^\s*$/;  # skip empty lines
1254         next if /\x1a/;  # skip ^Z
1255         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1256         {
1257             my $type = $2;
1258             die "unknown matra $type" unless defined $matra_types{$type};
1259             $indic_table[hex $1] |= $matra_types{$type} << 8;
1260             next;
1261         }
1262         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1263         {
1264             my $type = $3;
1265             die "unknown matra $type" unless defined $matra_types{$type};
1266             foreach my $i (hex $1 .. hex $2)
1267             {
1268                 $indic_table[$i] |= $matra_types{$type} << 8;
1269             }
1270             next;
1271         }
1272         die "malformed line $_";
1273     }
1274     close $INPUT;
1275
1276     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1277     print "Building $filename\n";
1278     print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1279     print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1280     print OUTPUT "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n";
1281     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1282     print OUTPUT "#include \"windef.h\"\n\n";
1283
1284     dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1285
1286     close OUTPUT;
1287     save_file($filename);
1288 }
1289
1290 ################################################################
1291 # dump the Line Break Properties table
1292 sub dump_linebreak($)
1293 {
1294     my $filename = shift;
1295     my @break_table;
1296
1297     my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1298     while (<$INPUT>)
1299     {
1300         next if /^\#/;  # skip comments
1301         next if /^\s*$/;  # skip empty lines
1302         next if /\x1a/;  # skip ^Z
1303         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1304         {
1305             my $type = $2;
1306             die "unknown breaktype $type" unless defined $break_types{$type};
1307             $break_table[hex $1] = $break_types{$type};
1308             next;
1309         }
1310         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1311         {
1312             my $type = $3;
1313             die "unknown breaktype $type" unless defined $break_types{$type};
1314             foreach my $i (hex $1 .. hex $2)
1315             {
1316                 $break_table[$i] = $break_types{$type};
1317             }
1318             next;
1319         }
1320         elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1321         {
1322             my $type = $2;
1323             die "unknown breaktype $type" unless defined $break_types{$type};
1324             $break_table[hex $1] = $break_types{$type};
1325             next;
1326         }
1327         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1328         {
1329             my $type = $3;
1330             die "unknown breaktype $type" unless defined $break_types{$type};
1331             foreach my $i (hex $1 .. hex $2)
1332             {
1333                 $break_table[$i] = $break_types{$type};
1334             }
1335             next;
1336         }
1337         die "malformed line $_";
1338     }
1339     close $INPUT;
1340
1341     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1342     print "Building $filename\n";
1343     print OUTPUT "/* Unicode Line Break Properties */\n";
1344     print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1345     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1346     print OUTPUT "#include \"windef.h\"\n\n";
1347
1348     dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1349
1350     close OUTPUT;
1351     save_file($filename);
1352 }
1353
1354 my %scripts =
1355 (
1356     "Unknown"                => 0,
1357     "Common"                 => 1,
1358     "Inherited"              => 2,
1359     "Arabic"                 => 3,
1360     "Armenian"               => 4,
1361     "Avestan"                => 5,
1362     "Balinese"               => 6,
1363     "Bamum"                  => 7,
1364     "Batak"                  => 8,
1365     "Bengali"                => 9,
1366     "Bopomofo"               => 10,
1367     "Brahmi"                 => 11,
1368     "Braille"                => 12,
1369     "Buginese"               => 13,
1370     "Buhid"                  => 14,
1371     "Canadian_Aboriginal"    => 15,
1372     "Carian"                 => 16,
1373     "Cham"                   => 17,
1374     "Cherokee"               => 18,
1375     "Coptic"                 => 19,
1376     "Cuneiform"              => 20,
1377     "Cypriot"                => 21,
1378     "Cyrillic"               => 22,
1379     "Deseret"                => 23,
1380     "Devanagari"             => 24,
1381     "Egyptian_Hieroglyphs"   => 25,
1382     "Ethiopic"               => 26,
1383     "Georgian"               => 27,
1384     "Glagolitic"             => 28,
1385     "Gothic"                 => 29,
1386     "Greek"                  => 30,
1387     "Gujarati"               => 31,
1388     "Gurmukhi"               => 32,
1389     "Han"                    => 33,
1390     "Hangul"                 => 34,
1391     "Hanunoo"                => 35,
1392     "Hebrew"                 => 36,
1393     "Hiragana"               => 37,
1394     "Imperial_Aramaic"       => 38,
1395     "Inscriptional_Pahlavi"  => 39,
1396     "Inscriptional_Parthian" => 40,
1397     "Javanese"               => 41,
1398     "Kaithi"                 => 42,
1399     "Kannada"                => 43,
1400     "Katakana"               => 44,
1401     "Kayah_Li"               => 45,
1402     "Kharoshthi"             => 46,
1403     "Khmer"                  => 47,
1404     "Lao"                    => 48,
1405     "Latin"                  => 49,
1406     "Lepcha"                 => 50,
1407     "Limbu"                  => 51,
1408     "Linear_B"               => 52,
1409     "Lisu"                   => 53,
1410     "Lycian"                 => 54,
1411     "Lydian"                 => 55,
1412     "Malayalam"              => 56,
1413     "Mandaic"                => 57,
1414     "Meetei_Mayek"           => 58,
1415     "Mongolian"              => 59,
1416     "Myanmar"                => 60,
1417     "New_Tai_Lue"            => 61,
1418     "Nko"                    => 62,
1419     "Ogham"                  => 63,
1420     "Ol_Chiki"               => 64,
1421     "Old_Italic"             => 65,
1422     "Old_Persian"            => 66,
1423     "Old_South_Arabian"      => 67,
1424     "Old_Turkic"             => 68,
1425     "Oriya"                  => 69,
1426     "Osmanya"                => 70,
1427     "Phags_Pa"               => 71,
1428     "Phoenician"             => 72,
1429     "Rejang"                 => 73,
1430     "Runic"                  => 74,
1431     "Samaritan"              => 75,
1432     "Saurashtra"             => 76,
1433     "Shavian"                => 77,
1434     "Sinhala"                => 78,
1435     "Sundanese"              => 79,
1436     "Syloti_Nagri"           => 80,
1437     "Syriac"                 => 81,
1438     "Tagalog"                => 82,
1439     "Tagbanwa"               => 83,
1440     "Tai_Le"                 => 84,
1441     "Tai_Tham"               => 85,
1442     "Tai_Viet"               => 86,
1443     "Tamil"                  => 87,
1444     "Telugu"                 => 88,
1445     "Thaana"                 => 89,
1446     "Thai"                   => 90,
1447     "Tibetan"                => 91,
1448     "Tifinagh"               => 92,
1449     "Ugaritic"               => 93,
1450     "Vai"                    => 94,
1451     "Yi"                     => 95,
1452     # Win8/Win8.1
1453     "Chakma"                 => 96,
1454     "Meroitic_Cursive"       => 97,
1455     "Meroitic_Hieroglyphs"   => 98,
1456     "Miao"                   => 99,
1457     "Sharada"                => 100,
1458     "Sora_Sompeng"           => 101,
1459     "Takri"                  => 102,
1460     # Win10
1461     "Bassa_Vah"              => 103,
1462     "Caucasian_Albanian"     => 104,
1463     "Duployan"               => 105,
1464     "Elbasan"                => 106,
1465     "Grantha"                => 107,
1466     "Khojki"                 => 108,
1467     "Khudawadi"              => 109,
1468     "Linear_A"               => 110,
1469     "Mahajani"               => 111,
1470     "Manichaean"             => 112,
1471     "Mende_Kikakui"          => 113,
1472     "Modi"                   => 114,
1473     "Mro"                    => 115,
1474     "Nabataean"              => 116,
1475     "Old_North_Arabian"      => 117,
1476     "Old_Permic"             => 118,
1477     "Pahawh_Hmong"           => 119,
1478     "Palmyrene"              => 120,
1479     "Pau_Cin_Hau"            => 121,
1480     "Psalter_Pahlavi"        => 122,
1481     "Siddham"                => 123,
1482     "Tirhuta"                => 124,
1483     "Warang_Citi"            => 125,
1484     # Win10 RS1
1485     "Adlam"                  => 126,
1486     "Ahom"                   => 127,
1487     "Anatolian_Hieroglyphs"  => 128,
1488     "Bhaiksuki"              => 129,
1489     "Hatran"                 => 130,
1490     "Marchen"                => 131,
1491     "Multani"                => 132,
1492     "Newa"                   => 133,
1493     "Old_Hungarian"          => 134,
1494     "Osage"                  => 135,
1495     "SignWriting"            => 136,
1496     "Tangut"                 => 137,
1497     # Win10 RS4
1498     "Masaram_Gondi"          => 138,
1499     "Nushu"                  => 139,
1500     "Soyombo"                => 140,
1501     "Zanabazar_Square"       => 141,
1502     # Win10 1903
1503     "Dogra"                  => 142,
1504     "Gunjala_Gondi"          => 143,
1505     "Hanifi_Rohingya"        => 144,
1506     "Makasar"                => 145,
1507     "Medefaidrin"            => 146,
1508     "Old_Sogdian"            => 147,
1509     "Sogdian"                => 148,
1510     # Win10 2004
1511     "Elymaic"                => 149,
1512     "Nyiakeng_Puachue_Hmong" => 150,
1513     "Nandinagari"            => 151,
1514     "Wancho"                 => 152,
1515 );
1516
1517 ################################################################
1518 # dump Script IDs table
1519 sub dump_scripts($)
1520 {
1521     my $filename = shift;
1522     my $header = $filename;
1523     my @scripts_table;
1524     my $script_index;
1525     my $i;
1526
1527     my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1528     # Fill the table
1529     # Unknown script id is always 0, so undefined scripts are automatically treated as such
1530     while (<$INPUT>)
1531     {
1532         my $type = "";
1533
1534         next if /^\#/;  # skip comments
1535         next if /^\s*$/;  # skip empty lines
1536         next if /\x1a/;  # skip ^Z
1537         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1538         {
1539             $type = $2;
1540             if (defined $scripts{$type})
1541             {
1542                 $scripts_table[hex $1] = $scripts{$type};
1543             }
1544             next;
1545         }
1546         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1547         {
1548             $type = $3;
1549             if (defined $scripts{$type})
1550             {
1551                 foreach my $i (hex $1 .. hex $2)
1552                 {
1553                     $scripts_table[$i] = $scripts{$type};
1554                 }
1555             }
1556             next;
1557         }
1558     }
1559
1560     close $INPUT;
1561
1562     $header = "$filename.h";
1563     open OUTPUT,">$header.new" or die "Cannot create $header";
1564     print "Building $header\n";
1565     print OUTPUT "/* Unicode Script IDs */\n";
1566     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1567     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1568
1569     print OUTPUT "enum unicode_script_id {\n";
1570     foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1571     {
1572         print OUTPUT "    Script_$script = $scripts{$script},\n";
1573     }
1574     print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1575     print OUTPUT "};\n";
1576
1577     close OUTPUT;
1578     save_file($header);
1579
1580     $filename = "$filename.c";
1581     open OUTPUT,">$filename.new" or die "Cannot create $header";
1582     print "Building $filename\n";
1583     print OUTPUT "/* Unicode Script IDs */\n";
1584     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1585     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1586     print OUTPUT "#include \"windef.h\"\n\n";
1587
1588     dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1589     close OUTPUT;
1590     save_file($filename);
1591 }
1592
1593 ################################################################
1594 # dump the BiDi mirroring table
1595 sub dump_mirroring($)
1596 {
1597     my $filename = shift;
1598     my @mirror_table = ();
1599
1600     my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1601     while (<$INPUT>)
1602     {
1603         next if /^\#/;  # skip comments
1604         next if /^$/;  # skip empty lines
1605         next if /\x1a/;  # skip ^Z
1606         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1607         {
1608             $mirror_table[hex $1] = hex $2;
1609             next;
1610         }
1611         die "malformed line $_";
1612     }
1613     close $INPUT;
1614
1615     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1616     print "Building $filename\n";
1617     print OUTPUT "/* Unicode BiDi mirroring */\n";
1618     print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1619     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1620     print OUTPUT "#include \"windef.h\"\n\n";
1621     dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1622     close OUTPUT;
1623     save_file($filename);
1624 }
1625
1626 ################################################################
1627 # dump the Bidi Brackets
1628 sub dump_bracket($)
1629 {
1630     my $filename = shift;
1631     my @bracket_table;
1632
1633     my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1634     while (<$INPUT>)
1635     {
1636         next if /^\#/;  # skip comments
1637         next if /^\s*$/;  # skip empty lines
1638         next if /\x1a/;  # skip ^Z
1639         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1640         {
1641             my $type = $3;
1642             die "unknown bracket $type" unless defined $bracket_types{$type};
1643             die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1644             $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1645             $bracket_table[hex $1] += $bracket_types{$type} << 8;
1646             next;
1647         }
1648         die "malformed line $_";
1649     }
1650     close $INPUT;
1651
1652     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1653     print "Building $filename\n";
1654     print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1655     print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1656     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1657     print OUTPUT "#include \"windef.h\"\n\n";
1658
1659     dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1660
1661     close OUTPUT;
1662     save_file($filename);
1663 }
1664
1665 ################################################################
1666 # dump the Arabic shaping table
1667 sub dump_shaping($)
1668 {
1669     my $filename = shift;
1670     my @joining_table = @initial_joining_table;
1671
1672     my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1673     while (<$INPUT>)
1674     {
1675         next if /^\#/;  # skip comments
1676         next if /^\s*$/;  # skip empty lines
1677         next if /\x1a/;  # skip ^Z
1678         if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1679         {
1680             my $type = $2;
1681             $joining_table[hex $1] = $joining_types{$type};
1682             next;
1683         }
1684         die "malformed line $_";
1685     }
1686     close $INPUT;
1687
1688     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1689     print "Building $filename\n";
1690     print OUTPUT "/* Unicode Arabic shaping */\n";
1691     print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1692     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1693     print OUTPUT "#include \"windef.h\"\n\n";
1694
1695     dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1696
1697     print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1698     for (my $i = 0x600; $i <= 0x6ff; $i++)
1699     {
1700         printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1701             ${joining_forms{"isolated"}}[$i] || $i,
1702             ${joining_forms{"final"}}[$i] || $i,
1703             ${joining_forms{"initial"}}[$i] || $i,
1704             ${joining_forms{"medial"}}[$i] || $i;
1705     }
1706     print OUTPUT "};\n";
1707
1708     close OUTPUT;
1709     save_file($filename);
1710 }
1711
1712 ################################################################
1713 # dump the Arabic shaping table
1714 sub dump_arabic_shaping($)
1715 {
1716     my $filename = shift;
1717     my @joining_table = @initial_joining_table;
1718
1719     my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1720     while (<$INPUT>)
1721     {
1722         next if /^\#/;  # skip comments
1723         next if /^\s*$/;  # skip empty lines
1724         next if /\x1a/;  # skip ^Z
1725         if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1726         {
1727             my $type = $2;
1728             my $group = $3;
1729
1730             if ($group eq "ALAPH" || $group eq "DALATH RISH")
1731             {
1732                 $joining_table[hex $1] = $joining_types{$group};
1733             }
1734             else
1735             {
1736                 $joining_table[hex $1] = $joining_types{$type};
1737             }
1738
1739             next;
1740         }
1741         die "malformed line $_";
1742     }
1743     close $INPUT;
1744
1745     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1746     print "Building $filename\n";
1747     print OUTPUT "/* Unicode Arabic shaping */\n";
1748     print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1749     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1750     print OUTPUT "#include \"windef.h\"\n\n";
1751
1752     dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );
1753
1754     close OUTPUT;
1755     save_file($filename);
1756 }
1757
1758 ################################################################
1759 # dump the Vertical Orientation table
1760 sub dump_vertical($)
1761 {
1762     my $filename = shift;
1763     my @vertical_table;
1764
1765     my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1766     while (<$INPUT>)
1767     {
1768         next if /^\#/;  # skip comments
1769         next if /^\s*$/;  # skip empty lines
1770         next if /\x1a/;  # skip ^Z
1771         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1772         {
1773             my $type = $2;
1774             die "unknown vertical $type" unless defined $vertical_types{$type};
1775             if (hex $1 < 65536)
1776             {
1777                 $vertical_table[hex $1] = $vertical_types{$type};
1778             }
1779             next;
1780         }
1781         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1782         {
1783             my $type = $3;
1784             die "unknown vertical $type" unless defined $vertical_types{$type};
1785             foreach my $i (hex $1 .. hex $2)
1786             {
1787                 $vertical_table[$i] = $vertical_types{$type};
1788             }
1789             next;
1790         }
1791         die "malformed line $_";
1792     }
1793     close $INPUT;
1794
1795     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1796     print "Building $filename\n";
1797     print OUTPUT "/* Unicode Vertical Orientation */\n";
1798     print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1799     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1800     print OUTPUT "#include \"windef.h\"\n\n";
1801
1802     dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1803
1804     close OUTPUT;
1805     save_file($filename);
1806 }
1807
1808 ################################################################
1809 # dump the digit folding tables
1810 sub dump_digit_folding($)
1811 {
1812     my ($filename) = shift;
1813     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1814     print "Building $filename\n";
1815     print OUTPUT "/* Unicode digit folding mappings */\n";
1816     print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1817     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1818     print OUTPUT "#include \"windef.h\"\n\n";
1819
1820     dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1821     close OUTPUT;
1822     save_file($filename);
1823 }
1824
1825
1826 ################################################################
1827 # compress a mapping table by removing identical rows
1828 sub compress_array($$@)
1829 {
1830     my $rows = shift;
1831     my $def = shift;
1832     my @table = @_;
1833     my $len = @table / $rows;
1834     my @array;
1835     my $data = "";
1836
1837     # try to merge table rows
1838     for (my $row = 0; $row < $rows; $row++)
1839     {
1840         my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1841         my $pos = index $data, $rowtxt;
1842         if ($pos == -1)
1843         {
1844             # check if the tail of the data can match the start of the new row
1845             my $first = substr( $rowtxt, 0, 1 );
1846             for (my $i = length($data) - 1; $i > 0; $i--)
1847             {
1848                 $pos = index( substr( $data, -$i ), $first );
1849                 last if $pos == -1;
1850                 $i -= $pos;
1851                 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1852                 substr( $data, -$i ) = "";
1853                 last;
1854             }
1855             $pos = length $data;
1856             $data .= $rowtxt;
1857         }
1858         $array[$row] = $rows + $pos;
1859     }
1860     return @array, unpack "U*", $data;
1861 }
1862
1863 ################################################################
1864 # dump a char -> 16-bit value mapping table using two-level tables
1865 sub dump_two_level_mapping($$@)
1866 {
1867     my $name = shift;
1868     my $def = shift;
1869     my $size = shift;
1870     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1871     my @row_array = compress_array( 4096, $def, @_[0..65535] );
1872     my @array = compress_array( 256, 0, @row_array[0..4095] );
1873
1874     for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1875
1876     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1877     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1878     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1879     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1880 }
1881
1882 ################################################################
1883 # dump a char -> value mapping table using three-level tables
1884 sub dump_three_level_mapping($$@)
1885 {
1886     my $name = shift;
1887     my $def = shift;
1888     my $size = shift;
1889     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1890     my $level3 = ($MAX_CHAR + 1) / 16;
1891     my $level2 = $level3 / 16;
1892     my $level1 = $level2 / 16;
1893     my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1894     my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1895     my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1896
1897     for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1898     for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1899
1900     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1901     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1902     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1903     printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1904     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1905 }
1906
1907 ################################################################
1908 # dump a binary case mapping table in l_intl.nls format
1909 sub dump_binary_case_table(@)
1910 {
1911     my (@table) = @_;
1912     my $max_char = 0x10000;
1913     my $level1 = $max_char / 16;
1914     my $level2 = $level1 / 16;
1915
1916     my @difftable;
1917     for (my $i = 0; $i < @table; $i++)
1918     {
1919         next unless defined $table[$i];
1920         $difftable[$i] = ($table[$i] - $i) & 0xffff;
1921     }
1922
1923     my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1924     my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1925     my $offset = @array - $level1;
1926     for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1927     return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1928 }
1929
1930 ################################################################
1931 # dump case mappings for l_intl.nls
1932 sub dump_intl_nls($)
1933 {
1934     my @upper_table = @toupper_table;
1935     my @lower_table = @tolower_table;
1936     remove_linguistic_mappings( \@upper_table, \@lower_table );
1937
1938     my $upper = dump_binary_case_table( @upper_table );
1939     my $lower = dump_binary_case_table( @lower_table );
1940
1941     my $filename = shift;
1942     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1943     printf "Building $filename\n";
1944
1945     binmode OUTPUT;
1946     print OUTPUT pack "S<", 1;  # version
1947     print OUTPUT $upper;
1948     print OUTPUT $lower;
1949     close OUTPUT;
1950     save_file($filename);
1951 }
1952
1953
1954 ################################################################
1955 # dump the bidi direction table
1956 sub dump_bidi_dir_table($)
1957 {
1958     my $filename = shift;
1959     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1960     printf "Building $filename\n";
1961     printf OUTPUT "/* Unicode BiDi direction table */\n";
1962     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1963     printf OUTPUT "#include \"windef.h\"\n\n";
1964
1965     my @table;
1966
1967     for (my $i = 0; $i < 65536; $i++)
1968     {
1969         $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1970     }
1971
1972     dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1973
1974     close OUTPUT;
1975     save_file($filename);
1976 }
1977
1978
1979 sub rol($$)
1980 {
1981     my ($byte, $count) = @_;
1982     return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1983 }
1984
1985 ################################################################
1986 # compress the character properties table
1987 sub compress_char_props_table($@)
1988 {
1989     my $rows = shift;
1990     my @table = @_;
1991     my $len = @table / $rows;
1992     my $pos = 0;
1993     my @array = (0) x $rows;
1994     my %sequences;
1995
1996     # add some predefined sequences
1997     foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1998
1999     # try to merge table rows
2000     for (my $row = 0; $row < $rows; $row++)
2001     {
2002         my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
2003         my $rowtxt = pack "L*", @table_row;
2004         if (defined($sequences{$rowtxt}))
2005         {
2006             # reuse an existing row
2007             $array[$row] = $sequences{$rowtxt};
2008         }
2009         else
2010         {
2011             # create a new row
2012             $sequences{$rowtxt} = $array[$row] = ++$pos;
2013             push @array, @table_row;
2014         }
2015     }
2016     return @array;
2017 }
2018
2019 ################################################################
2020 # dump a normalization table in binary format
2021 sub dump_norm_table($)
2022 {
2023     my $filename = shift;
2024
2025     my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2026     my %decomp = ( "nfc" => \@decomp_table,
2027                    "nfd" => \@decomp_table,
2028                    "nfkc" => \@decomp_compat_table,
2029                    "nfkd" => \@decomp_compat_table ,
2030                    "idna" => \@idna_decomp_table );
2031
2032     open OUTPUT,">$filename.new" or die "Cannot create $filename";
2033     print "Building $filename\n";
2034
2035     my $type = $filename;
2036     $type =~ s!.*/norm(\w+)\.nls!$1!;
2037
2038     my $compose = $forms{$type} & 1;
2039     my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2040
2041     my @version = split /\./, $UNIVERSION;
2042
2043     # combining classes
2044
2045     my @classes;
2046     my @class_values;
2047
2048     foreach my $c (grep defined, @combining_class_table)
2049     {
2050         $classes[$c] = 1 if $c < 0x100;
2051     }
2052     for (my $i = 0; $i < @classes; $i++)
2053     {
2054         next unless defined $classes[$i];
2055         $classes[$i] = @class_values;
2056         push @class_values, $i;
2057     }
2058     push @class_values, 0 if (@class_values % 2);
2059     die "too many classes" if @class_values >= 0x40;
2060
2061     # character properties
2062
2063     my @char_props;
2064     my @decomposed;
2065     my @comp_hash_table;
2066     my $comp_hash_size = $compose ? 254 : 0;
2067
2068     for (my $i = 0; $i <= $MAX_CHAR; $i++)
2069     {
2070         next unless defined $combining_class_table[$i];
2071         if (defined $decomp{$type}->[$i])
2072         {
2073             my @dec = get_decomposition( $i, $decomp{$type} );
2074             if ($compose && (my @comp = get_composition( $i, $compat )))
2075             {
2076                 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2077                 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2078
2079                 my $val = 0;
2080                 foreach my $d (@dec)
2081                 {
2082                     $val = $combining_class_table[$d];
2083                     last if $val;
2084                 }
2085                 $char_props[$i] = $classes[$val];
2086             }
2087             else
2088             {
2089                 $char_props[$i] = 0xbf;
2090             }
2091             @dec = compose_hangul( @dec ) if $compose;
2092             @dec = to_utf16( @dec );
2093             push @dec, 0 if @dec >= 7;
2094             $decomposed[$i] = \@dec;
2095         }
2096         else
2097         {
2098             if ($combining_class_table[$i] == 0x100)
2099             {
2100                 $char_props[$i] = 0x7f;
2101             }
2102             elsif ($combining_class_table[$i])
2103             {
2104                 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2105             }
2106             elsif ($type eq "idna" && defined $idna_disallowed[$i])
2107             {
2108                 $char_props[$i] = 0xff;
2109             }
2110             else
2111             {
2112                 $char_props[$i] = 0;
2113             }
2114         }
2115     }
2116
2117     if ($compose)
2118     {
2119         for (my $i = 0; $i <= $MAX_CHAR; $i++)
2120         {
2121             my @comp = get_composition( $i, $compat );
2122             next unless @comp;
2123             if ($combining_class_table[$comp[1]])
2124             {
2125                 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2126                 $char_props[$comp[1]] |= 0x40;
2127             }
2128             else
2129             {
2130                 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2131                 $char_props[$comp[1]] |= 0xc0;
2132             }
2133         }
2134     }
2135
2136     # surrogates
2137     foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2138     foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2139
2140     # Hangul
2141     if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2142     elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2143     foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2144
2145     # invalid chars
2146     if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2147     foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2148     foreach my $i (0x00..0x10)
2149     {
2150         $char_props[($i << 16) | 0xfffe] = 0xff;
2151         $char_props[($i << 16) | 0xffff] = 0xff;
2152     }
2153
2154     # decomposition hash table
2155
2156     my @decomp_hash_table;
2157     my @decomp_hash_index;
2158     my @decomp_hash_data;
2159     my $decomp_hash_size = 944;
2160
2161     # build string of character data, reusing substrings when possible
2162     my $decomp_char_data = "";
2163     foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2164     {
2165         my $str = pack "U*", @{$i};
2166         $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2167     }
2168     for (my $i = 0; $i < @decomposed; $i++)
2169     {
2170         next unless defined $decomposed[$i];
2171         my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2172         die "sequence not found" if $pos == -1;
2173         my $len = @{$decomposed[$i]};
2174         $len = 7 if $len > 7;
2175         my $hash = $i % $decomp_hash_size;
2176         push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2177     }
2178     for (my $i = 0; $i < $decomp_hash_size; $i++)
2179     {
2180         $decomp_hash_index[$i] = @decomp_hash_data / 2;
2181         next unless defined $decomp_hash_table[$i];
2182         if (@{$decomp_hash_table[$i]} == 1)
2183         {
2184             my $entry = $decomp_hash_table[$i]->[0];
2185             if ($char_props[$entry->[0]] == 0xbf)
2186             {
2187                 $decomp_hash_index[$i] = $entry->[1];
2188                 next;
2189             }
2190         }
2191         foreach my $entry (@{$decomp_hash_table[$i]})
2192         {
2193             push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2194         }
2195     }
2196     push @decomp_hash_data, 0, 0;
2197
2198     # composition hash table
2199
2200     my @comp_hash_index;
2201     my @comp_hash_data;
2202     if (@comp_hash_table)
2203     {
2204         for (my $i = 0; $i < $comp_hash_size; $i++)
2205         {
2206             $comp_hash_index[$i] = @comp_hash_data;
2207             push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2208         }
2209         $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2210         push @comp_hash_data, 0, 0, 0;
2211     }
2212
2213     my $level1 = ($MAX_CHAR + 1) / 128;
2214     my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2215
2216     my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2217                    0, $decomp_hash_size, $comp_hash_size, 0 );
2218     my @tables = (0) x 8;
2219
2220     $tables[0] = 16 + @header + @tables;
2221     $tables[1] = $tables[0] + @class_values / 2;
2222     $tables[2] = $tables[1] + $level1 / 2;
2223     $tables[3] = $tables[2] + (@rows - $level1) / 2;
2224     $tables[4] = $tables[3] + @decomp_hash_index;
2225     $tables[5] = $tables[4] + @decomp_hash_data;
2226     $tables[6] = $tables[5] + length $decomp_char_data;
2227     $tables[7] = $tables[6] + @comp_hash_index;
2228
2229     print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2230     print OUTPUT pack "S<*", @header;
2231     print OUTPUT pack "S<*", @tables;
2232     print OUTPUT pack "C*", @class_values;
2233
2234     print OUTPUT pack "C*", @rows[0..$level1-1];
2235     print OUTPUT pack "C*", @rows[$level1..$#rows];
2236     print OUTPUT pack "S<*", @decomp_hash_index;
2237     print OUTPUT pack "S<*", @decomp_hash_data;
2238     print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2239     print OUTPUT pack "S<*", @comp_hash_index;
2240     print OUTPUT pack "S<*", @comp_hash_data;
2241
2242     close OUTPUT;
2243     save_file($filename);
2244
2245     add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2246 }
2247
2248
2249 ################################################################
2250 # output a codepage definition file from the global tables
2251 sub output_codepage_file($)
2252 {
2253     my $codepage = shift;
2254
2255     my $output = sprintf "nls/c_%03d.nls", $codepage;
2256     open OUTPUT,">$output.new" or die "Cannot create $output";
2257
2258     printf "Building %s\n", $output;
2259     if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2260     else { dump_binary_dbcs_table( $codepage ); }
2261
2262     close OUTPUT;
2263     save_file($output);
2264
2265     add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2266 }
2267
2268 ################################################################
2269 # output a codepage table from a Microsoft-style mapping file
2270 sub dump_msdata_codepage($)
2271 {
2272     my $filename = shift;
2273
2274     my $state = "";
2275     my ($codepage, $width, $count);
2276     my ($lb_cur, $lb_end);
2277
2278     @cp2uni = ();
2279     @glyph2uni = ();
2280     @lead_bytes = ();
2281     @uni2cp = ();
2282     $default_char = $DEF_CHAR;
2283     $default_wchar = $DEF_CHAR;
2284
2285     my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2286
2287     while (<$INPUT>)
2288     {
2289         next if /^;/;  # skip comments
2290         next if /^\s*$/;  # skip empty lines
2291         next if /\x1a/;  # skip ^Z
2292         last if /^ENDCODEPAGE/;
2293
2294         if (/^CODEPAGE\s+(\d+)/)
2295         {
2296             $codepage = $1;
2297             next;
2298         }
2299         if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2300         {
2301             $width = $1;
2302             $default_char = hex $2;
2303             $default_wchar = hex $3;
2304             next;
2305         }
2306         if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2307         {
2308             $state = $1;
2309             $count = $2;
2310             next;
2311         }
2312         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2313         {
2314             if ($state eq "MBTABLE")
2315             {
2316                 my $cp = hex $1;
2317                 my $uni = hex $2;
2318                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2319                 next;
2320             }
2321             if ($state eq "GLYPHTABLE")
2322             {
2323                 my $cp = hex $1;
2324                 my $uni = hex $2;
2325                 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2326                 next;
2327             }
2328             if ($state eq "WCTABLE")
2329             {
2330                 my $uni = hex $1;
2331                 my $cp = hex $2;
2332                 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2333                 next;
2334             }
2335             if ($state eq "DBCSRANGE")
2336             {
2337                 my $start = hex $1;
2338                 my $end = hex $2;
2339                 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2340                 $lb_cur = $start;
2341                 $lb_end = $end;
2342                 next;
2343             }
2344             if ($state eq "DBCSTABLE")
2345             {
2346                 my $mb = hex $1;
2347                 my $uni = hex $2;
2348                 my $cp = ($lb_cur << 8) | $mb;
2349                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2350                 if (!--$count)
2351                 {
2352                     if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2353                 }
2354                 next;
2355             }
2356         }
2357         die "$filename: Unrecognized line $_\n";
2358     }
2359     close $INPUT;
2360
2361     output_codepage_file( $codepage );
2362
2363     if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
2364 }
2365
2366 ################################################################
2367 # align a string length
2368 sub align_string($$)
2369 {
2370     my ($align, $str) = @_;
2371     $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2372     return $str;
2373 }
2374
2375 ################################################################
2376 # pack a GUID string
2377 sub pack_guid($)
2378 {
2379     $_ = shift;
2380     /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2381     return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2382 }
2383
2384 ################################################################
2385 # comparison function for compression sort
2386 sub cmp_compression
2387 {
2388     return scalar @{$a} <=> scalar @{$b} ||
2389         $a->[4] <=> $b->[4] ||
2390         $a->[5] <=> $b->[5] ||
2391         $a->[6] <=> $b->[6] ||
2392         $a->[7] <=> $b->[7] ||
2393         $a->[8] <=> $b->[8] ||
2394         $a->[9] <=> $b->[9] ||
2395         $a->[10] <=> $b->[10] ||
2396         $a->[11] <=> $b->[11] ||
2397         $a->[12] <=> $b->[12];
2398 }
2399
2400 ################################################################
2401 # build a binary sort keys table
2402 sub dump_sortkey_table($$)
2403 {
2404     my ($filename, $download) = @_;
2405
2406     my @keys;
2407     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2408     my @multiple_weights;
2409     my @expansions;
2410     my @compressions;
2411     my %exceptions;
2412     my %guids;
2413     my %compr_flags;
2414     my %locales;
2415     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2416     my $jamostr = "";
2417
2418     my $re_hex = '0x[0-9A-Fa-f]+';
2419     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2420     $guids{$default_guid} = { };
2421
2422     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2423
2424     my $KEYS = open_data_file( $MSDATA, $download );
2425
2426     printf "Building $filename\n";
2427
2428     while (<$KEYS>)
2429     {
2430         s/\s*;.*$//;
2431         next if /^\s*$/;  # skip empty lines
2432         if (/^\s*(SORTKEY|SORTTABLES)/)
2433         {
2434             $part = $1;
2435             next;
2436         }
2437         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2438         {
2439             $part = $section = "";
2440             next;
2441         }
2442         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2443         {
2444             $section = $1;
2445             $guid = undef;
2446             next;
2447         }
2448         next unless $part;
2449         if ("$part.$section" eq "SORTKEY.DEFAULT")
2450         {
2451             if (/^\s*($re_hex)\s+$re_key/)
2452             {
2453                 $keys[hex $1] = [ split(/\s+/,$2) ];
2454                 next;
2455             }
2456         }
2457         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2458         {
2459             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2460             {
2461                 $version = hex $1;
2462                 next;
2463             }
2464             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2465             {
2466                 # ignore for now
2467                 next;
2468             }
2469         }
2470         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2471                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2472                "$part.$section" eq "SORTTABLES.INVERSECASING")
2473         {
2474             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2475             {
2476                 $guid = lc $1;
2477                 $guids{$guid} = { } unless defined $guids{$guid};
2478                 $guids{$guid}->{flags} |= $flags{$section};
2479                 next;
2480             }
2481             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2482             {
2483                 $locales{$1} = $guid;
2484                 next;
2485             }
2486         }
2487         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2488         {
2489             if (/^\s*(\d+)\s+(\d+)/)
2490             {
2491                 push @multiple_weights, $1, $2;
2492                 next;
2493             }
2494         }
2495         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2496         {
2497             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2498             {
2499                 my $pos = scalar @expansions / 2;
2500                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2501                 push @expansions, hex $2, hex $3;
2502                 next;
2503             }
2504         }
2505         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2506         {
2507             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2508             {
2509                 $keys[hex $1] = $keys[hex $2];
2510                 next;
2511             }
2512         }
2513         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2514         {
2515             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2516             {
2517                 if ($subsection || !$guid)  # start a new one
2518                 {
2519                     $guid = lc $1;
2520                     $subsection = "";
2521                     $guids{$guid} = { } unless defined $guids{$guid};
2522                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2523                     $guids{$guid}->{compr} = @compressions;
2524                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2525                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2526                     push @compressions, [ ];
2527                 }
2528                 else  # merge with current one
2529                 {
2530                     $guids{lc $1} = { } unless defined $guids{lc $1};
2531                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2532                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2533                     $compr_flags{lc $1} = $compr_flags{$guid};
2534                 }
2535                 next;
2536             }
2537             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2538             {
2539                 $locales{$1} = $guid;
2540                 next;
2541             }
2542             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2543             {
2544                 $subsection = $1;
2545                 next;
2546             }
2547             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2548             {
2549                 my @comp = map { hex $_; } split(/\s+/,$1);
2550                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2551                 # add compression flags
2552                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2553                 next;
2554             }
2555         }
2556         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2557         {
2558             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2559             {
2560                 $guid = lc $1;
2561                 $guids{$guid} = { } unless defined $guids{lc $1};
2562                 $ling_flag = ($2 ? "+" : "-");
2563                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2564                 next;
2565             }
2566             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2567             {
2568                 $locales{$1} = $guid;
2569                 next;
2570             }
2571             if (/^\s*($re_hex)\s+$re_key/)
2572             {
2573                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2574                 next;
2575             }
2576         }
2577         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2578         {
2579             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2580             {
2581                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2582                 next;
2583             }
2584         }
2585         die "$download: $part.$section: unrecognized line $_\n";
2586     }
2587     close $KEYS;
2588
2589     # Sortkey table
2590
2591     my $table;
2592     for (my $i = 0; $i < 0x10000; $i++)
2593     {
2594         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2595         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2596     }
2597
2598     foreach my $id (sort keys %exceptions)
2599     {
2600         my $pos = length($table) / 4;
2601         my @exc = @{$exceptions{$id}};
2602         my @filled;
2603         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2604         my $guid = substr( $id, 0, -1 );
2605         $guids{$guid}->{$key} = $pos;
2606         $pos += 0x100;
2607         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2608         for (my $j = 0; $j < 0x10000; $j++)
2609         {
2610             next unless defined $exc[$j] || defined $flags[$j];
2611             $filled[$j >> 8] = 1;
2612             $j |= 0xff;
2613         }
2614         for (my $j = 0; $j < 0x100; $j++)
2615         {
2616             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2617             $pos += 0x100 if $filled[$j];
2618         }
2619         for (my $j = 0; $j < 0x10000; $j++)
2620         {
2621             next unless $filled[$j >> 8];
2622             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2623             $k[3] |= $flags[$j] || 0;
2624             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2625         }
2626     }
2627
2628     # Case mapping tables
2629
2630     # standard table
2631     my @casemaps;
2632     my @upper = @toupper_table;
2633     my @lower = @tolower_table;
2634     remove_linguistic_mappings( \@upper, \@lower );
2635     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2636
2637     # linguistic table
2638     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2639
2640     # Turkish table
2641     @upper = @toupper_table;
2642     @lower = @tolower_table;
2643     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2644     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2645     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2646     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2647
2648     # Char type table
2649
2650     my @table;
2651     my $types = "";
2652     my %typestr;
2653     for (my $i = 0; $i < 0x10000; $i++)
2654     {
2655         my $str = pack "S<3",
2656             ($category_table[$i] || 0) & 0xffff,
2657             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2658             ($category_table[$i] || 0) >> 16;
2659
2660         if (!defined($typestr{$str}))
2661         {
2662             $typestr{$str} = length($types) / 6;
2663             $types .= $str;
2664         }
2665         $table[$i] = $typestr{$str};
2666     }
2667
2668     my @rows = compress_array( 4096, 0, @table[0..65535] );
2669     my @array = compress_array( 256, 0, @rows[0..4095] );
2670     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2671     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2672
2673     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2674     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2675     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2676
2677     # Sort tables
2678
2679     # guids
2680     my $sorttables = pack "L<2", $version, scalar %guids;
2681     foreach my $id (sort keys %guids)
2682     {
2683         my %guid = %{$guids{$id}};
2684         my $flags = $guid{flags} || 0;
2685         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2686         $sorttables .= pack_guid($id) . pack "L<5",
2687             $flags,
2688             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2689             $guid{except} || 0,
2690             $guid{ling_except} || 0,
2691             $map / 2;
2692     }
2693
2694     # expansions
2695     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2696
2697     # compressions
2698     $sorttables .= pack "L<", scalar @compressions;
2699     my $rowstr = "";
2700     foreach my $c (@compressions)
2701     {
2702         my $pos = length($rowstr) / 2;
2703         my $min = 0xffff;
2704         my $max = 0;
2705         my @lengths = (0) x 8;
2706         foreach my $r (sort cmp_compression @{$c})
2707         {
2708             my @row = @{$r};
2709             $lengths[scalar @row - 6]++;
2710             foreach my $val (@row[4..$#row])
2711             {
2712                 $min = $val if $min > $val;
2713                 $max = $val if $max < $val;
2714             }
2715             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2716             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2717         }
2718         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2719     }
2720     $sorttables .= $rowstr;
2721
2722     # multiple weights
2723     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2724
2725     # jamo sort
2726     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2727
2728     # Locales
2729
2730     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2731     foreach my $loc (sort keys %locales)
2732     {
2733         # skip specific locales that match more general ones
2734         my @parts = split /[-_]/, $loc;
2735         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2736         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2737         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2738     }
2739
2740     # File header
2741
2742     my @header;
2743     $header[0] = 16;
2744     $header[1] = $header[0] + length $table;
2745     $header[2] = $header[1] + length $casemaps;
2746     $header[3] = $header[2] + length $chartypes;
2747
2748     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2749     print OUTPUT pack "L<*", @header;
2750     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2751     close OUTPUT;
2752     save_file($filename);
2753 }
2754
2755
2756 ################################################################
2757 # build the script to create registry keys
2758 sub dump_registry_script($%)
2759 {
2760     my ($filename, %keys) = @_;
2761     my $indent = 1;
2762
2763     printf "Building %s\n", $filename;
2764     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2765     print OUTPUT "HKLM\n{\n";
2766     foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2767     {
2768         printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2769         $indent++;
2770     }
2771     foreach my $k (sort keys %keys)
2772     {
2773         my @subkeys = split /\\/, $k;
2774         my ($def, @vals) = @{$keys{$k}};
2775         for (my $i = 0; $i < @subkeys; $i++)
2776         {
2777             printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2778                 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2779             $indent++;
2780         }
2781         foreach my $v (sort @vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2782         for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2783     }
2784     while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2785     close OUTPUT;
2786     save_file($filename);
2787 }
2788
2789
2790 ################################################################
2791 # save a file if modified
2792 sub save_file($)
2793 {
2794     my $file = shift;
2795     if (-f $file && !system "cmp $file $file.new >/dev/null")
2796     {
2797         unlink "$file.new";
2798     }
2799     else
2800     {
2801         rename "$file.new", "$file";
2802     }
2803 }
2804
2805
2806 ################################################################
2807 # main routine
2808
2809 chdir ".." if -f "./make_unicode";
2810 load_data();
2811 dump_sortkeys( "dlls/kernelbase/collation.c" );
2812 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2813 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2814 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2815 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2816 dump_mirroring( "dlls/dwrite/mirror.c" );
2817 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2818 dump_bracket( "dlls/dwrite/bracket.c" );
2819 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2820 dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
2821 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2822 dump_linebreak( "dlls/dwrite/linebreak.c" );
2823 dump_scripts( "dlls/dwrite/scripts" );
2824 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2825 dump_vertical( "dlls/gdi32/vertical.c" );
2826 dump_vertical( "dlls/wineps.drv/vertical.c" );
2827 dump_intl_nls("nls/l_intl.nls");
2828 dump_norm_table( "nls/normnfc.nls" );
2829 dump_norm_table( "nls/normnfd.nls" );
2830 dump_norm_table( "nls/normnfkc.nls" );
2831 dump_norm_table( "nls/normnfkd.nls" );
2832 dump_norm_table( "nls/normidna.nls" );
2833 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2834 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2835 dump_eucjp_codepage();
2836 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2837
2838 exit 0;
2839
2840 # Local Variables:
2841 # compile-command: "./make_unicode"
2842 # End: