tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $KSCDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
  30 my $REPORTS = "http://www.unicode.org/reports";
  31 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  32 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  33
  34 # Sort keys file
  35 my $SORTKEYS = "tr10/allkeys.txt";
  36
  37 # Default char for undefined mappings
  38 my $DEF_CHAR = ord '?';
  39
  40 # Last valid Unicode character
  41 my $MAX_CHAR = 0x10ffff;
  42
  43 my @allfiles =
  44 (
  45     "CodpageFiles/037.txt",
  46     "CodpageFiles/437.txt",
  47     "CodpageFiles/500.txt",
  48     "CodpageFiles/708.txt",
  49     "CodpageFiles/737.txt",
  50     "CodpageFiles/775.txt",
  51     "CodpageFiles/850.txt",
  52     "CodpageFiles/852.txt",
  53     "CodpageFiles/855.txt",
  54     "CodpageFiles/857.txt",
  55     "CodpageFiles/860.txt",
  56     "CodpageFiles/861.txt",
  57     "CodpageFiles/862.txt",
  58     "CodpageFiles/863.txt",
  59     "CodpageFiles/864.txt",
  60     "CodpageFiles/865.txt",
  61     "CodpageFiles/866.txt",
  62     "CodpageFiles/869.txt",
  63     "CodpageFiles/874.txt",
  64     "CodpageFiles/875.txt",
  65     "CodpageFiles/932.txt",
  66     "CodpageFiles/936.txt",
  67     "CodpageFiles/949.txt",
  68     "CodpageFiles/950.txt",
  69     "CodpageFiles/1026.txt",
  70     "CodpageFiles/1250.txt",
  71     "CodpageFiles/1251.txt",
  72     "CodpageFiles/1252.txt",
  73     "CodpageFiles/1253.txt",
  74     "CodpageFiles/1254.txt",
  75     "CodpageFiles/1255.txt",
  76     "CodpageFiles/1256.txt",
  77     "CodpageFiles/1257.txt",
  78     "CodpageFiles/1258.txt",
  79     "CodpageFiles/1361.txt",
  80     "CodpageFiles/10000.txt",
  81     "CodpageFiles/10001.txt",
  82     "CodpageFiles/10002.txt",
  83     "CodpageFiles/10003.txt",
  84     "CodpageFiles/10004.txt",
  85     "CodpageFiles/10005.txt",
  86     "CodpageFiles/10006.txt",
  87     "CodpageFiles/10007.txt",
  88     "CodpageFiles/10008.txt",
  89     "CodpageFiles/10010.txt",
  90     "CodpageFiles/10017.txt",
  91     "CodpageFiles/10021.txt",
  92     "CodpageFiles/10029.txt",
  93     "CodpageFiles/10079.txt",
  94     "CodpageFiles/10081.txt",
  95     "CodpageFiles/10082.txt",
  96     "CodpageFiles/20127.txt",
  97     "CodpageFiles/20866.txt",
  98     "CodpageFiles/21866.txt",
  99     "CodpageFiles/28591.txt",
 100     "CodpageFiles/28592.txt",
 101     "CodpageFiles/28593.txt",
 102     "CodpageFiles/28594.txt",
 103     "CodpageFiles/28595.txt",
 104     "CodpageFiles/28596.txt",
 105     "CodpageFiles/28597.txt",
 106     "CodpageFiles/28598.txt",
 107     "CodpageFiles/28599.txt",
 108     "CodpageFiles/28603.txt",
 109     "CodpageFiles/28605.txt",
 110 );
 111
 112
 113 my %ctype =
 114 (
 115      # CT_CTYPE1
 116     "upper"  => 0x0001,
 117     "lower"  => 0x0002,
 118     "digit"  => 0x0004,
 119     "space"  => 0x0008,
 120     "punct"  => 0x0010,
 121     "cntrl"  => 0x0020,
 122     "blank"  => 0x0040,
 123     "xdigit" => 0x0080,
 124     "alpha"  => 0x0100 | 0x80000000,
 125     "defin"  => 0x0200,
 126      # CT_CTYPE3 in high 16 bits
 127     "nonspacing"    => 0x00010000,
 128     "diacritic"     => 0x00020000,
 129     "vowelmark"     => 0x00040000,
 130     "symbol"        => 0x00080000,
 131     "katakana"      => 0x00100000,
 132     "hiragana"      => 0x00200000,
 133     "halfwidth"     => 0x00400000,
 134     "fullwidth"     => 0x00800000,
 135     "ideograph"     => 0x01000000,
 136     "kashida"       => 0x02000000,
 137     "lexical"       => 0x04000000,
 138     "highsurrogate" => 0x08000000,
 139     "lowsurrogate"  => 0x10000000,
 140 );
 141
 142 my %bracket_types =
 143 (
 144     "o" => 0x0000,
 145     "c" => 0x0001,
 146 );
 147
 148 my %indic_types =
 149 (
 150     "Other"    => 0x0000,
 151     "Bindu"    => 0x0001,
 152     "Visarga"  => 0x0002,
 153     "Avagraha" => 0x0003,
 154     "Nukta"    => 0x0004,
 155     "Virama"   => 0x0005,
 156     "Vowel_Independent"  => 0x0006,
 157     "Vowel_Dependent"  => 0x0007,
 158     "Vowel"  => 0x0008,
 159     "Consonant_Placeholder"  => 0x0009,
 160     "Consonant"  => 0x000a,
 161     "Consonant_Dead"  => 0x000b,
 162     "Consonant_Succeeding_Repha" => 0x000c,
 163     "Consonant_Subjoined"  => 0x000d,
 164     "Consonant_Medial"  => 0x000e,
 165     "Consonant_Final"  => 0x000f,
 166     "Consonant_Head_Letter"  => 0x0010,
 167     "Modifying_Letter"  => 0x0011,
 168     "Tone_Letter"  => 0x0012,
 169     "Tone_Mark"  => 0x0013,
 170     "Register_Shifter"  => 0x0014,
 171     "Consonant_Preceding_Repha" => 0x0015,
 172     "Pure_Killer" => 0x0016,
 173     "Invisible_Stacker" => 0x0017,
 174     "Gemination_Mark" => 0x0018,
 175     "Cantillation_Mark" => 0x0019,
 176     "Non_Joiner" => 0x001a,
 177     "Joiner" => 0x001b,
 178     "Number_Joiner" => 0x001c,
 179     "Number" => 0x001d,
 180     "Brahmi_Joining_Number" => 0x001e,
 181     "Consonant_With_Stacker" => 0x001f,
 182     "Consonant_Prefixed" => 0x0020,
 183     "Syllable_Modifier" => 0x0021,
 184     "Consonant_Killer" => 0x0022,
 185     "Consonant_Initial_Postfixed" => 0x0023,
 186 );
 187
 188 my %matra_types =
 189 (
 190     "Right"    => 0x01,
 191     "Left"  => 0x02,
 192     "Visual_Order_Left" => 0x03,
 193     "Left_And_Right"    => 0x04,
 194     "Top"   => 0x05,
 195     "Bottom"  => 0x06,
 196     "Top_And_Bottom"  => 0x07,
 197     "Top_And_Right"  => 0x08,
 198     "Top_And_Left"  => 0x09,
 199     "Top_And_Left_And_Right"  => 0x0a,
 200     "Bottom_And_Right"  => 0x0b,
 201     "Top_And_Bottom_And_Right"  => 0x0c,
 202     "Overstruck"  => 0x0d,
 203     "Invisible"  => 0x0e,
 204     "Bottom_And_Left"  => 0x0f,
 205     "Top_And_Bottom_And_Left"  => 0x10,
 206 );
 207
 208 my %break_types =
 209 (
 210     "BK"  => 0x0001,
 211     "CR"  => 0x0002,
 212     "LF"  => 0x0003,
 213     "CM"  => 0x0004,
 214     "SG"  => 0x0005,
 215     "GL"  => 0x0006,
 216     "CB"  => 0x0007,
 217     "SP"  => 0x0008,
 218     "ZW"  => 0x0009,
 219     "NL"  => 0x000a,
 220     "WJ"  => 0x000b,
 221     "JL"  => 0x000c,
 222     "JV"  => 0x000d,
 223     "JT"  => 0x000e,
 224     "H2"  => 0x000f,
 225     "H3"  => 0x0010,
 226     "XX"  => 0x0011,
 227     "OP"  => 0x0012,
 228     "CL"  => 0x0013,
 229     "CP"  => 0x0014,
 230     "QU"  => 0x0015,
 231     "NS"  => 0x0016,
 232     "EX"  => 0x0017,
 233     "SY"  => 0x0018,
 234     "IS"  => 0x0019,
 235     "PR"  => 0x001a,
 236     "PO"  => 0x001b,
 237     "NU"  => 0x001c,
 238     "AL"  => 0x001d,
 239     "ID"  => 0x001e,
 240     "IN"  => 0x001f,
 241     "HY"  => 0x0020,
 242     "BB"  => 0x0021,
 243     "BA"  => 0x0022,
 244     "SA"  => 0x0023,
 245     "AI"  => 0x0024,
 246     "B2"  => 0x0025,
 247     "HL"  => 0x0026,
 248     "CJ"  => 0x0027,
 249     "RI"  => 0x0028,
 250     "EB"  => 0x0029,
 251     "EM"  => 0x002a,
 252     "ZWJ" => 0x002b,
 253 );
 254
 255 my %vertical_types =
 256 (
 257     "R"  => 0x0000,
 258     "U"  => 0x0001,
 259     "Tr" => 0x0002,
 260     "Tu" => 0x0003,
 261 );
 262
 263 my %categories =
 264 (
 265     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 266     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 267     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 268     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 269     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 270     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 271     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 272     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 273     "No" => $ctype{"defin"},                    # Number, Other
 274     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 275     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 276     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 277     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 278     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 279     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 280     "Co" => $ctype{"defin"},                    # Other, Private Use
 281     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 282     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 283     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 284     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 285     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 286     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 287     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 288     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 289     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 290     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 291     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 292     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 293     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 294     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 295 );
 296
 297 # a few characters need additional categories that cannot be determined automatically
 298 my %special_categories =
 299 (
 300     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 301                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 302     "space"  => [ 0x09..0x0d, 0x85 ],
 303     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 304     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 305                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 306                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 307                   0xfff9, 0xfffa, 0xfffb ],
 308     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 309                   0xd7, 0xf7 ],
 310     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 311     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 312     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 313                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 314     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 315     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 316                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 317                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 318     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 319     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 320                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 321                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 322                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 323                      0x3131..0x3164 ],
 324     "ideograph" => [ 0x3006..0x3007 ],
 325     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 326                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 327                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 328                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 329                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 330                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 331                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 332                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 333                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 334                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 335     "kashida" => [ 0x0640 ],
 336 );
 337
 338 my %directions =
 339 (
 340     "L"   => 1,    # Left-to-Right
 341     "R"   => 2,    # Right-to-Left
 342     "AL"  => 12,   # Right-to-Left Arabic
 343     "EN"  => 3,    # European Number
 344     "ES"  => 4,    # European Number Separator
 345     "ET"  => 5,    # European Number Terminator
 346     "AN"  => 6,    # Arabic Number
 347     "CS"  => 7,    # Common Number Separator
 348     "NSM" => 13,   # Non-Spacing Mark
 349     "BN"  => 14,   # Boundary Neutral
 350     "B"   => 8,    # Paragraph Separator
 351     "S"   => 9,    # Segment Separator
 352     "WS"  => 10,   # Whitespace
 353     "ON"  => 11,   # Other Neutrals
 354     "LRE" => 15,   # Left-to-Right Embedding
 355     "LRO" => 15,   # Left-to-Right Override
 356     "RLE" => 15,   # Right-to-Left Embedding
 357     "RLO" => 15,   # Right-to-Left Override
 358     "PDF" => 15,   # Pop Directional Format
 359     "LRI" => 15,   # Left-to-Right Isolate
 360     "RLI" => 15,   # Right-to-Left Isolate
 361     "FSI" => 15,   # First Strong Isolate
 362     "PDI" => 15    # Pop Directional Isolate
 363 );
 364
 365 my %c2_types =
 366 (
 367     "L"   => 1,    # C2_LEFTTORIGHT
 368     "R"   => 2,    # C2_RIGHTTOLEFT
 369     "AL"  => 2,    # C2_RIGHTTOLEFT
 370     "EN"  => 3,    # C2_EUROPENUMBER
 371     "ES"  => 4,    # C2_EUROPESEPARATOR
 372     "ET"  => 5,    # C2_EUROPETERMINATOR
 373     "AN"  => 6,    # C2_ARABICNUMBER
 374     "CS"  => 7,    # C2_COMMONSEPARATOR
 375     "NSM" => 11,   # C2_OTHERNEUTRAL
 376     "BN"  => 0,    # C2_NOTAPPLICABLE
 377     "B"   => 8,    # C2_BLOCKSEPARATOR
 378     "S"   => 9,    # C2_SEGMENTSEPARATOR
 379     "WS"  => 10,   # C2_WHITESPACE
 380     "ON"  => 11,   # C2_OTHERNEUTRAL
 381     "LRE" => 11,   # C2_OTHERNEUTRAL
 382     "LRO" => 11,   # C2_OTHERNEUTRAL
 383     "RLE" => 11,   # C2_OTHERNEUTRAL
 384     "RLO" => 11,   # C2_OTHERNEUTRAL
 385     "PDF" => 11,   # C2_OTHERNEUTRAL
 386     "LRI" => 11,   # C2_OTHERNEUTRAL
 387     "RLI" => 11,   # C2_OTHERNEUTRAL
 388     "FSI" => 11,   # C2_OTHERNEUTRAL
 389     "PDI" => 11    # C2_OTHERNEUTRAL
 390 );
 391
 392 my %bidi_types =
 393 (
 394     "ON"  => 0,    # Other Neutrals
 395     "L"   => 1,    # Left-to-Right
 396     "R"   => 2,    # Right-to-Left
 397     "AN"  => 3,    # Arabic Number
 398     "EN"  => 4,    # European Number
 399     "AL"  => 5,    # Right-to-Left Arabic
 400     "NSM" => 6,    # Non-Spacing Mark
 401     "CS"  => 7,    # Common Number Separator
 402     "ES"  => 8,    # European Number Separator
 403     "ET"  => 9,    # European Number Terminator
 404     "BN"  => 10,   # Boundary Neutral
 405     "S"   => 11,   # Segment Separator
 406     "WS"  => 12,   # Whitespace
 407     "B"   => 13,   # Paragraph Separator
 408     "RLO" => 14,   # Right-to-Left Override
 409     "RLE" => 15,   # Right-to-Left Embedding
 410     "LRO" => 16,   # Left-to-Right Override
 411     "LRE" => 17,   # Left-to-Right Embedding
 412     "PDF" => 18,   # Pop Directional Format
 413     "LRI" => 19,   # Left-to-Right Isolate
 414     "RLI" => 20,   # Right-to-Left Isolate
 415     "FSI" => 21,   # First Strong Isolate
 416     "PDI" => 22    # Pop Directional Isolate
 417 );
 418
 419 my %joining_types =
 420 (
 421    "U" => 0,           # Non_Joining
 422    "L" => 1,           # Left_Joining
 423    "R" => 2,           # Right_Joining
 424    "D" => 3,           # Dual_Joining
 425    "C" => 3,           # Join_Causing
 426    "ALAPH" => 4,       # Syriac ALAPH
 427    "DALATH RISH" => 5, # Syriac DALATH RISH group
 428    "T" => 6,           # Transparent
 429 );
 430
 431 my @cp2uni = ();
 432 my @glyph2uni = ();
 433 my @lead_bytes = ();
 434 my @uni2cp = ();
 435 my @tolower_table = ();
 436 my @toupper_table = ();
 437 my @digitmap_table = ();
 438 my @category_table = ();
 439 my @initial_joining_table = ();
 440 my @direction_table = ();
 441 my @decomp_table = ();
 442 my @combining_class_table = ();
 443 my @decomp_compat_table = ();
 444 my @comp_exclusions = ();
 445 my @idna_decomp_table = ();
 446 my @idna_disallowed = ();
 447 my %registry_keys;
 448 my $default_char;
 449 my $default_wchar;
 450
 451 my %joining_forms =
 452 (
 453    "isolated" => [],
 454    "final" => [],
 455    "initial" => [],
 456    "medial" => []
 457 );
 458
 459 sub to_utf16(@)
 460 {
 461     my @ret;
 462     foreach my $ch (@_)
 463     {
 464         if ($ch < 0x10000)
 465         {
 466             push @ret, $ch;
 467         }
 468         else
 469         {
 470             my $val = $ch - 0x10000;
 471             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 472         }
 473     }
 474     return @ret;
 475 }
 476
 477 ################################################################
 478 # fetch a unicode.org file and open it
 479 sub open_data_file($$)
 480 {
 481     my ($base, $name) = @_;
 482     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 483     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 484     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 485     local *FILE;
 486
 487     if ($base =~ /.*\/([^\/]+)\.zip$/)
 488     {
 489         my $zip = "$1$suffix.zip";
 490         unless (-f "$cache/$zip")
 491         {
 492             system "mkdir", "-p", $cache;
 493             print "Fetching $base...\n";
 494             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 495         }
 496         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 497     }
 498     else
 499     {
 500         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 501         unless (-f $dest)
 502         {
 503             system "mkdir", "-p", $dir;
 504             print "Fetching $base/$name...\n";
 505             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 506         }
 507         open FILE, "<$dest" or die "cannot open $dest";
 508     }
 509     return *FILE;
 510 }
 511
 512 ################################################################
 513 # recursively get the decomposition for a character
 514 sub get_decomposition($$);
 515 sub get_decomposition($$)
 516 {
 517     my ($char, $table) = @_;
 518     my @ret;
 519
 520     return $char unless defined $table->[$char];
 521     foreach my $ch (@{$table->[$char]})
 522     {
 523         push @ret, get_decomposition( $ch, $table );
 524     }
 525     return @ret;
 526 }
 527
 528 ################################################################
 529 # get the composition that results in a given character
 530 sub get_composition($$)
 531 {
 532     my ($ch, $compat) = @_;
 533     return () unless defined $decomp_table[$ch];  # no decomposition
 534     my @ret = @{$decomp_table[$ch]};
 535     return () if @ret < 2;                        # singleton decomposition
 536     return () if $comp_exclusions[$ch];           # composition exclusion
 537     return () if $combining_class_table[$ch];     # non-starter
 538     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 539     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 540         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 541     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 542         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 543     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 544         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 545     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 546     return @ret;
 547 }
 548
 549 ################################################################
 550 # recursively build decompositions
 551 sub build_decompositions(@)
 552 {
 553     my @src = @_;
 554     my @dst;
 555
 556     for (my $i = 0; $i < @src; $i++)
 557     {
 558         next unless defined $src[$i];
 559         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 560         $dst[$i] = \@decomp;
 561     }
 562     return @dst;
 563 }
 564
 565 ################################################################
 566 # compose Hangul sequences
 567 sub compose_hangul(@)
 568 {
 569     my $SBASE  = 0xac00;
 570     my $LBASE  = 0x1100;
 571     my $VBASE  = 0x1161;
 572     my $TBASE  = 0x11a7;
 573     my $LCOUNT = 19;
 574     my $VCOUNT = 21;
 575     my $TCOUNT = 28;
 576     my $NCOUNT = $VCOUNT * $TCOUNT;
 577     my $SCOUNT = $LCOUNT * $NCOUNT;
 578
 579     my @seq = @_;
 580     my @ret;
 581     my $i;
 582
 583     for ($i = 0; $i < @seq; $i++)
 584     {
 585         my $ch = $seq[$i];
 586         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 587             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 588         {
 589             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 590             $i++;
 591         }
 592         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 593             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 594         {
 595             $ch += $seq[$i+1] - $TBASE;
 596             $i++;
 597         }
 598         push @ret, $ch;
 599     }
 600     return @ret;
 601 }
 602
 603 ################################################################
 604 # remove linguistic-only mappings from the case table
 605 sub remove_linguistic_mappings($$)
 606 {
 607     my ($upper, $lower) = @_;
 608
 609     # remove case mappings that don't round-trip
 610
 611     for (my $i = 0; $i < @{$upper}; $i++)
 612     {
 613         next unless defined ${$upper}[$i];
 614         my $ch = ${$upper}[$i];
 615         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 616     }
 617     for (my $i = 0; $i < @{$lower}; $i++)
 618     {
 619         next unless defined ${$lower}[$i];
 620         my $ch = ${$lower}[$i];
 621         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 622     }
 623 }
 624
 625 ################################################################
 626 # read in the Unicode database files
 627 sub load_data()
 628 {
 629     my $start;
 630
 631     # now build mappings from the decomposition field of the Unicode database
 632
 633     my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
 634     while (<$UNICODE_DATA>)
 635     {
 636         # Decode the fields ...
 637         my ($code, $name, $cat, $comb, $bidi,
 638             $decomp, $dec, $dig, $num, $mirror,
 639             $oldname, $comment, $upper, $lower, $title) = split /;/;
 640         my $src = hex $code;
 641
 642         die "unknown category $cat" unless defined $categories{$cat};
 643         die "unknown directionality $bidi" unless defined $directions{$bidi};
 644
 645         $category_table[$src] = $categories{$cat};
 646         $direction_table[$src] = $bidi;
 647         if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
 648         {
 649             $initial_joining_table[$src] = $joining_types{"T"};
 650         }
 651         else
 652         {
 653             $initial_joining_table[$src] = $joining_types{"U"};
 654         }
 655
 656         if ($lower ne "")
 657         {
 658             $tolower_table[$src] = hex $lower;
 659         }
 660         if ($upper ne "")
 661         {
 662             $toupper_table[$src] = hex $upper;
 663         }
 664         if ($dec ne "")
 665         {
 666             $category_table[$src] |= $ctype{"digit"};
 667         }
 668         if ($dig ne "")
 669         {
 670             $digitmap_table[$src] = ord $dig;
 671         }
 672         $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
 673
 674         $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
 675         $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
 676         $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
 677         $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
 678         $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
 679         $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
 680         $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
 681         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
 682         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
 683         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
 684         $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
 685         $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;
 686
 687         # copy the category and direction for everything between First/Last pairs
 688         if ($name =~ /, First>/) { $start = $src; }
 689         if ($name =~ /, Last>/)
 690         {
 691             while ($start < $src)
 692             {
 693                 $category_table[$start] = $category_table[$src];
 694                 $direction_table[$start] = $direction_table[$src];
 695                 $combining_class_table[$start] = $combining_class_table[$src];
 696                 $start++;
 697             }
 698         }
 699
 700         next if $decomp eq "";  # no decomposition, skip it
 701
 702         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
 703         {
 704             my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
 705             $decomp_compat_table[$src] = \@seq;
 706         }
 707
 708         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 709         {
 710             # decomposition of the form "<foo> 1234" -> use char if type is known
 711             if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
 712             {
 713                 ${joining_forms{$1}}[hex $2] = $src;
 714             }
 715         }
 716         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 717         {
 718             # decomposition "<compat> 0020 1234" -> combining accent
 719         }
 720         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 721         {
 722             # store decomposition
 723             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 724             {
 725                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
 726             }
 727             elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
 728             {
 729                 # Single char decomposition
 730                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
 731             }
 732         }
 733     }
 734     close $UNICODE_DATA;
 735
 736     # patch the category of some special characters
 737
 738     for (my $i = 0; $i < @decomp_table; $i++)
 739     {
 740         next unless defined $decomp_table[$i];
 741         $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
 742     }
 743     foreach my $cat (keys %special_categories)
 744     {
 745         my $flag = $ctype{$cat};
 746         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 747     }
 748     for (my $i = 0; $i < @decomp_compat_table; $i++)
 749     {
 750         next unless defined $decomp_compat_table[$i];
 751         next unless @{$decomp_compat_table[$i]} == 2;
 752         $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
 753     }
 754
 755     # load the composition exclusions
 756
 757     my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
 758     while (<$EXCL>)
 759     {
 760         s/\#.*//;  # remove comments
 761         if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
 762         {
 763             foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
 764         }
 765         elsif (/^([0-9a-fA-F]+)\s*$/)
 766         {
 767             $comp_exclusions[hex $1] = 1;
 768         }
 769     }
 770     close $EXCL;
 771
 772     # load the IDNA mappings
 773
 774     @idna_decomp_table = @decomp_compat_table;
 775     my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
 776     while (<$IDNA>)
 777     {
 778         s/\#.*//;  # remove comments
 779         next if /^\s*$/;
 780         my ($char, $type, $mapping) = split /;/;
 781         my ($ch1, $ch2);
 782         if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
 783         {
 784             $ch1 = hex $1;
 785             $ch2 = hex $2;
 786         }
 787         elsif ($char =~ /([0-9a-fA-F]+)/)
 788         {
 789             $ch1 = $ch2 = hex $1;
 790         }
 791
 792         if ($type =~ /mapped/ || $type =~ /deviation/)
 793         {
 794             $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
 795             my @seq = map { hex $_; } split /\s+/, $mapping;
 796             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
 797         }
 798         elsif ($type =~ /valid/)
 799         {
 800         }
 801         elsif ($type =~ /ignored/)
 802         {
 803             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
 804         }
 805         elsif ($type =~ /disallowed/)
 806         {
 807             foreach my $i ($ch1 .. $ch2)
 808             {
 809                 $idna_decomp_table[$i] = undef;
 810                 $idna_disallowed[$i] = 1;
 811             }
 812         }
 813     }
 814     close $IDNA;
 815 }
 816
 817
 818 ################################################################
 819 # add a new registry key
 820 sub add_registry_key($$)
 821 {
 822     my ($key, $defval) = @_;
 823     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 824 }
 825
 826 ################################################################
 827 # add a new registry value
 828 sub add_registry_value($$$)
 829 {
 830     my ($key, $name, $value) = @_;
 831     add_registry_key( $key, undef );
 832     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 833 }
 834
 835 ################################################################
 836 # define a new lead byte
 837 sub add_lead_byte($)
 838 {
 839     my $ch = shift;
 840     return if defined $cp2uni[$ch];
 841     push @lead_bytes, $ch;
 842     $cp2uni[$ch] = 0;
 843 }
 844
 845 ################################################################
 846 # define a new char mapping
 847 sub add_mapping($$)
 848 {
 849     my ($cp, $uni) = @_;
 850     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 851     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 852     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 853 }
 854
 855 ################################################################
 856 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 857 sub get_glyphs_mapping(@)
 858 {
 859     my @table = @_;
 860
 861     for (my $i = 0; $i < @glyph2uni; $i++)
 862     {
 863         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 864     }
 865     return @table;
 866 }
 867
 868 ################################################################
 869 # build EUC-JP table from the JIS 0208/0212 files
 870 sub dump_eucjp_codepage()
 871 {
 872     @cp2uni = ();
 873     @glyph2uni = ();
 874     @lead_bytes = ();
 875     @uni2cp = ();
 876     $default_char = $DEF_CHAR;
 877     $default_wchar = 0x30fb;
 878
 879     # ASCII chars
 880     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 881
 882     # lead bytes
 883     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 884
 885     # JIS X 0201 right plane
 886     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 887
 888     # undefined chars
 889     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 890     $cp2uni[0xa0] = 0xf8f0;
 891     $cp2uni[0xff] = 0xf8f3;
 892
 893     # Fix backslash conversion
 894     add_mapping( 0xa1c0, 0xff3c );
 895
 896     # Add private mappings for rows undefined in JIS 0208/0212
 897     my $private = 0xe000;
 898     foreach my $hi (0xf5 .. 0xfe)
 899     {
 900         foreach my $lo (0xa1 .. 0xfe)
 901         {
 902             add_mapping( ($hi << 8) + $lo, $private++ );
 903         }
 904     }
 905     foreach my $hi (0xf5 .. 0xfe)
 906     {
 907         foreach my $lo (0x21 .. 0x7e)
 908         {
 909             add_mapping( ($hi << 8) + $lo, $private++ );
 910         }
 911     }
 912
 913     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 914     while (<$INPUT>)
 915     {
 916         next if /^\#/;  # skip comments
 917         next if /^$/;  # skip empty lines
 918         next if /\x1a/;  # skip ^Z
 919         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 920         {
 921             add_mapping( 0x8080 + hex $1, hex $2 );
 922             next;
 923         }
 924         die "Unrecognized line $_\n";
 925     }
 926     close $INPUT;
 927
 928     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 929     while (<$INPUT>)
 930     {
 931         next if /^\#/;  # skip comments
 932         next if /^$/;  # skip empty lines
 933         next if /\x1a/;  # skip ^Z
 934         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 935         {
 936             add_mapping( 0x8000 + hex $1, hex $2 );
 937             next;
 938         }
 939         die "Unrecognized line $_\n";
 940     }
 941     close $INPUT;
 942
 943     output_codepage_file( 20932 );
 944 }
 945
 946 ################################################################
 947 # build Korean Wansung table from the KSX1001 file
 948 sub dump_krwansung_codepage(@)
 949 {
 950     my @cp949 = @_;
 951     @cp2uni = ();
 952     @glyph2uni = ();
 953     @lead_bytes = ();
 954     @uni2cp = ();
 955     $default_char = 0x3f;
 956     $default_wchar = 0x003f;
 957
 958     # ASCII and undefined chars
 959     foreach my $i (0x00 .. 0x9f) { add_mapping( $i, $i ); }
 960     add_mapping( 0xa0, 0xf8e6 );
 961     add_mapping( 0xad, 0xf8e7 );
 962     add_mapping( 0xae, 0xf8e8 );
 963     add_mapping( 0xaf, 0xf8e9 );
 964     add_mapping( 0xfe, 0xf8ea );
 965     add_mapping( 0xff, 0xf8eb );
 966
 967     my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
 968     while (<$INPUT>)
 969     {
 970         next if /^\#/;  # skip comments
 971         next if /^$/;  # skip empty lines
 972         next if /\x1a/;  # skip ^Z
 973         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 974         {
 975             add_mapping( 0x8080 + hex $1, hex $2 );
 976             next;
 977         }
 978         die "Unrecognized line $_\n";
 979     }
 980     close $INPUT;
 981
 982     # get some extra mappings from cp 949
 983     my @defined_lb;
 984     map { $defined_lb[$_] = 1; } @lead_bytes;
 985     foreach my $i (0x0000 .. 0xffff)
 986     {
 987         next if ($i >= 0x1100 && $i <= 0x11ff);  # range not used in 20949
 988         next unless defined $cp949[$i];
 989         if ($cp949[$i] >= 0xff)
 990         {
 991             # only add chars for lead bytes that exist in 20949
 992             my $hi = $cp949[$i] >> 8;
 993             my $lo = $cp949[$i] & 0xff;
 994             next unless $defined_lb[$hi];
 995             next unless $lo >= 0xa1 && $lo <= 0xfe;
 996         }
 997         add_mapping( $cp949[$i], $i );
 998     }
 999
1000     output_codepage_file( 20949 );
1001 }
1002
################################################################
# build the sort keys table
#
# Reads the collation keys file, keeps the single-character entries up
# to 0xffff, packs the four weights of each entry into one 32-bit
# value and writes the resulting table to $filename.
sub dump_sortkeys($)
{
    my $filename = shift;
    my @sortkeys = ();

    my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
    while (defined(my $line = <$INPUT>))
    {
        next if $line =~ /^\#/;         # skip comments
        next if $line =~ /^$/;          # skip empty lines
        next if $line =~ /\x1a/;        # skip ^Z
        next if $line =~ /^\@version/;  # skip @version header
        if ($line =~ /^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            next if $uni > 65535;
            # entry layout: [ char, weight 1, weight 2, weight 3, weight 4 ]
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        # multiple character sequence, ignored for now
        next if $line =~ /^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/;
        die "$SORTKEYS: Unrecognized line $line\n";
    }
    close $INPUT;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    # order the entries by their four weights so that equal weights are
    # adjacent; the final string comparison only breaks remaining ties
    @sortkeys = sort { $a->[1] <=> $b->[1] or
                       $a->[2] <=> $b->[2] or
                       $a->[3] <=> $b->[3] or
                       $a->[4] <=> $b->[4] or
                       $a cmp $b; } @sortkeys;

    # $n2 / $n3 count the distinct second / third weights seen so far
    # below the current first / second weight
    my ($n2, $n3) = (1, 1);
    my ($prev1, $prev2, $prev3) = (-1, -1, -1);
    my @flatkeys = ();

    foreach my $entry (@sortkeys)
    {
        next unless defined $entry;
        my ($uni, $k1, $k2, $k3, $k4) = @$entry;

        if ($k1 != $prev1)
        {
            # new primary weight: restart both counters
            ($prev1, $prev2, $prev3) = ($k1, $k2, $k3);
            $n2 = 1;
            $n3 = 1;
        }
        elsif ($k2 != $prev2)
        {
            ($prev2, $prev3) = ($k2, $k3);
            $n2++;
            $n3 = 1;
            die if ($n2 >= 256);  # would not fit in 8 bits
        }
        elsif ($k3 != $prev3)
        {
            $prev3 = $k3;
            $n3++;
            die if ($n3 >= 16);  # would not fit in 4 bits
        }

        # non-zero weights are replaced by their rank, zero stays zero
        $k2 = $n2 if $k2;
        $k3 = $n3 if $k3;
        $k4 = 1 if $k4;

        $flatkeys[$uni] = ($k1 << 16) | ($k2 << 8) | ($k3 << 4) | $k4;
    }

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    printf "Building $filename\n";
    print OUTPUT "/* Unicode collation element table */\n";
    print OUTPUT "/* generated from $REPORTS/$SORTKEYS */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );

    close OUTPUT;
    save_file($filename);
}
1102
1103
################################################################
# dump an array of integers
#
# Returns the elements of @array formatted as zero-padded hex
# constants of $bit_width bits, eight per line, each line indented by
# four spaces.  Undefined elements are printed as $default.  An empty
# array still yields a single $default element.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;
    my $last = @array ? $#array : 0;
    my $ret = "    ";

    foreach my $idx (0 .. $last)
    {
        my $value = $array[$idx];
        $value = $default unless defined $value;
        $ret .= sprintf($format, $value);
        next if $idx == $last;  # no separator after the final element
        $ret .= ($idx % 8 == 7) ? ",\n    " : ", ";
    }
    return $ret;
}
1120
1121
################################################################
# dump an SBCS mapping table in binary format
#
# Writes to OUTPUT, in this order: the header words, 12 zero bytes,
# the offset of the Unicode->codepage data followed by the 256-entry
# codepage->Unicode table, the optional glyph table, two zero words
# and finally one byte per Unicode value 0 .. 65535.
sub dump_binary_sbcs_table($)
{
    my $codepage = shift;
    my $has_glyphs = scalar @glyph2uni;
    my @byte_table = @cp2uni[0 .. 255];

    my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # the glyph table, when present, adds another 256 words
    my $wc_offset = 256 + 3;
    $wc_offset += 256 if $has_glyphs;

    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "C12", (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @byte_table;

    # either a count of 256 followed by the glyph mapping, or a single 0
    my @glyph_section = $has_glyphs ? (256, get_glyphs_mapping(@byte_table)) : (0);
    print OUTPUT pack "S<*", @glyph_section;

    print OUTPUT pack "S<*", 0, 0;

    # unmapped Unicode values fall back to the default char
    print OUTPUT pack "C*", map { $_ // $default_char } @uni2cp[0 .. 65535];
}
1148
1149
################################################################
# dump a DBCS mapping table in binary format
#
# Writes to OUTPUT, in this order: the header words, 12 bytes holding
# the lead byte ranges (zero padded), the offset of the
# Unicode->codepage data followed by the 256-entry single byte table,
# the lead byte offsets, one 256-entry table per lead byte and finally
# one word per Unicode value 0 .. 65535.
#
# As a side effect the @cp2uni entry of every lead byte is reset to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own 256-entry table; the offsets count
    # from 256 upwards in the order of @lead_bytes
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # pad the ranges with a real list of zeros: "0 x 12" would be the
    # single string "000000000000" and leave the padding to pack's
    # handling of missing arguments
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        # unmapped trail bytes fall back to the default Unicode char
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    # unmapped Unicode values fall back to the default char
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
1182
1183
################################################################
# get the list of defined lead byte ranges
#
# Returns a flat list of (first, last) pairs, one pair for every run
# of consecutive byte values that are listed in @lead_bytes.
sub get_lb_ranges()
{
    my @is_lead;
    $is_lead[$_] = 1 for @lead_bytes;

    my @ranges;
    my $start;  # first byte of the run being scanned, if any
    foreach my $byte (0 .. 255)
    {
        if ($is_lead[$byte])
        {
            $start = $byte unless defined $start;
        }
        elsif (defined $start)
        {
            push @ranges, $start, $byte - 1;
            undef $start;
        }
    }
    # a run that reaches the last byte value is closed at 0xff
    push @ranges, $start, 0xff if defined $start;
    return @ranges;
}
1207
################################################################
# dump the Indic Syllabic Category table
#
# Each table entry holds the syllabic category in its low byte and the
# positional category shifted left by 8 bits.  Syllabic entries at or
# above 65536 are dropped.  The table is written to $filename.
sub dump_indic($)
{
    my $filename = shift;
    my @indic_table;

    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;    # skip comments
        next if $line =~ /^\s*$/;  # skip empty lines
        next if $line =~ /\x1a/;   # skip ^Z
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            # single character
            my ($code, $type) = (hex $1, $2);
            die "unknown indic $type" unless defined $indic_types{$type};
            $indic_table[$code] = $indic_types{$type} if $code < 65536;
            next;
        }
        if ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            # character range; skipped as a whole unless both ends are below 65536
            my ($first, $last, $type) = (hex $1, hex $2, $3);
            die "unknown indic $type" unless defined $indic_types{$type};
            next if $first >= 65536 || $last >= 65536;
            $indic_table[$_] = $indic_types{$type} for ($first .. $last);
            next;
        }
        die "malformed line $line";
    }
    close $INPUT;

    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;    # skip comments
        next if $line =~ /^\s*$/;  # skip empty lines
        next if $line =~ /\x1a/;   # skip ^Z
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            # single character
            my ($code, $type) = (hex $1, $2);
            die "unknown matra $type" unless defined $matra_types{$type};
            $indic_table[$code] |= $matra_types{$type} << 8;
            next;
        }
        if ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            # character range
            my ($first, $last, $type) = (hex $1, hex $2, $3);
            die "unknown matra $type" unless defined $matra_types{$type};
            $indic_table[$_] |= $matra_types{$type} << 8 for ($first .. $last);
            next;
        }
        die "malformed line $line";
    }
    close $INPUT;

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n",
                 "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
1288
################################################################
# dump the Line Break Properties table
#
# Reads LineBreak.txt, stores the break type of every character or
# character range and writes the table to $filename.  Three-character
# type names are tried before two-character ones.
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    # store the break type for every character from $first to $last
    my $set_range = sub
    {
        my ($first, $last, $type) = @_;
        die "unknown breaktype $type" unless defined $break_types{$type};
        $break_table[$_] = $break_types{$type} for ($first .. $last);
    };

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;    # skip comments
        next if $line =~ /^\s*$/;  # skip empty lines
        next if $line =~ /\x1a/;   # skip ^Z
        if ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            $set_range->( hex $1, hex $1, $2 );
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            $set_range->( hex $1, hex $2, $3 );
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            $set_range->( hex $1, hex $1, $2 );
        }
        elsif ($line =~ /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            $set_range->( hex $1, hex $2, $3 );
        }
        else
        {
            die "malformed line $line";
        }
    }
    close $INPUT;

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n",
                 "/* generated from $UNIDATA:LineBreak.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
1352
# Script name => script id.
# The id of a script is its position in the list below, starting at 0
# for "Unknown": new scripts must be appended at the end and existing
# names must never be reordered or removed.
my %scripts = do
{
    my @names =
    (
        # ids 0 .. 95
        qw(Unknown Common Inherited Arabic Armenian Avestan Balinese Bamum),
        qw(Batak Bengali Bopomofo Brahmi Braille Buginese Buhid Canadian_Aboriginal),
        qw(Carian Cham Cherokee Coptic Cuneiform Cypriot Cyrillic Deseret),
        qw(Devanagari Egyptian_Hieroglyphs Ethiopic Georgian Glagolitic Gothic Greek Gujarati),
        qw(Gurmukhi Han Hangul Hanunoo Hebrew Hiragana Imperial_Aramaic Inscriptional_Pahlavi),
        qw(Inscriptional_Parthian Javanese Kaithi Kannada Katakana Kayah_Li Kharoshthi Khmer),
        qw(Lao Latin Lepcha Limbu Linear_B Lisu Lycian Lydian),
        qw(Malayalam Mandaic Meetei_Mayek Mongolian Myanmar New_Tai_Lue Nko Ogham),
        qw(Ol_Chiki Old_Italic Old_Persian Old_South_Arabian Old_Turkic Oriya Osmanya Phags_Pa),
        qw(Phoenician Rejang Runic Samaritan Saurashtra Shavian Sinhala Sundanese),
        qw(Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tai_Tham Tai_Viet Tamil),
        qw(Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi),
        # Win8/Win8.1: ids 96 .. 102
        qw(Chakma Meroitic_Cursive Meroitic_Hieroglyphs Miao Sharada Sora_Sompeng Takri),
        # Win10: ids 103 .. 125
        qw(Bassa_Vah Caucasian_Albanian Duployan Elbasan Grantha Khojki Khudawadi Linear_A),
        qw(Mahajani Manichaean Mende_Kikakui Modi Mro Nabataean Old_North_Arabian Old_Permic),
        qw(Pahawh_Hmong Palmyrene Pau_Cin_Hau Psalter_Pahlavi Siddham Tirhuta Warang_Citi),
        # Win10 RS1: ids 126 .. 137
        qw(Adlam Ahom Anatolian_Hieroglyphs Bhaiksuki Hatran Marchen),
        qw(Multani Newa Old_Hungarian Osage SignWriting Tangut),
        # Win10 RS4: ids 138 .. 141
        qw(Masaram_Gondi Nushu Soyombo Zanabazar_Square),
        # Win10 1903: ids 142 .. 148
        qw(Dogra Gunjala_Gondi Hanifi_Rohingya Makasar Medefaidrin Old_Sogdian Sogdian),
        # Win10 2004: ids 149 .. 152
        qw(Elymaic Nyiakeng_Puachue_Hmong Nandinagari Wancho),
    );
    map { $names[$_] => $_ } 0 .. $#names;
};
1515
################################################################
# dump Script IDs table
#
# Generates two files from Scripts.txt and %scripts:
#   $filename.h  the unicode_script_id enum, one member per script
#   $filename.c  the table mapping characters to script ids
sub dump_scripts($)
{
    my $filename = shift;
    my $header = "$filename.h";
    my $source = "$filename.c";
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # single character
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # character range
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        else
        {
            # lines in any other format are ignored
            next;
        }

        # scripts missing from %scripts keep the Unknown id
        next unless defined $scripts{$type};
        foreach my $ch ($first .. $last)
        {
            $scripts_table[$ch] = $scripts{$type};
        }
    }

    close $INPUT;

    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT "    Script_$script = $scripts{$script},\n";
    }
    # the ids start at 0, so the last id is the number of scripts minus one
    print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    # report the file that actually failed to open (the .c file, not the header)
    open OUTPUT,">$source.new" or die "Cannot create $source";
    print "Building $source\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($source);
}
1591
################################################################
# dump the BiDi mirroring table
#
# Reads the "char; mirrored char" pairs from BidiMirroring.txt and
# writes them to $filename as a table indexed by character.
sub dump_mirroring($)
{
    my $filename = shift;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;   # skip comments
        next if $line =~ /^$/;    # skip empty lines
        next if $line =~ /\x1a/;  # skip ^Z
        $line =~ /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/
            or die "malformed line $line";
        $mirror_table[hex $1] = hex $2;
    }
    close $INPUT;

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $UNIDATA:BidiMirroring.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
1624
################################################################
# dump the Bidi Brackets
#
# For every bracket listed in BidiBrackets.txt the table entry holds
#   low byte:  distance to the paired bracket, as a signed 8-bit value
#              in two's complement (e.g. -1 is stored as 0xff)
#   high byte: the bracket type from %bracket_types
# The table is written to $filename.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my ($char, $pair, $type) = (hex $1, hex $2, $3);
            die "unknown bracket $type" unless defined $bracket_types{$type};
            my $distance = $pair - $char;
            # the distance has to fit in a signed byte
            die "characters too distant $1 and $2" if abs($distance) >= 128;
            # Fold the signed distance into one byte.  The modulus must be
            # 256: with 255 a negative distance ends up one too low (-1
            # would become 0xfe, which reads back as -2).  Perl's % takes
            # the sign of the right operand, so the result is 0 .. 255.
            $bracket_table[$char] = ($distance % 256) + ($bracket_types{$type} << 8);
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
1663
################################################################
# dump the Arabic shaping table
#
# Writes two tables to $filename: the joining type of every character
# (starting from @initial_joining_table and updated from
# ArabicShaping.txt) and the four presentation forms of the
# characters 0x600 .. 0x6ff.
sub dump_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (my $line = <$INPUT>)
    {
        next if $line =~ /^\#/;    # skip comments
        next if $line =~ /^\s*$/;  # skip empty lines
        next if $line =~ /\x1a/;   # skip ^Z
        $line =~ /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/
            or die "malformed line $line";
        $joining_table[hex $1] = $joining_types{$2};
    }
    close $INPUT;

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $UNIDATA:ArabicShaping.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        # a character without a given form stands for itself
        printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
            map { $joining_forms{$_}[$ch] || $ch } qw(isolated final initial medial);
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
1710
################################################################
# dump the Arabic shaping table
#
# Like dump_shaping, but characters of the joining groups "ALAPH" and
# "DALATH RISH" get the entry of their group in %joining_types instead
# of the entry of their joining type.  Only the joining table is
# written to $filename.
sub dump_arabic_shaping($)
{
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        # The joining group can consist of several words ("DALATH RISH"),
        # so capture all of them: a plain \w+ stops at the first space
        # and would never match a multi-word group name.
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?:[ \t]+\w+)*)/)
        {
            my ($char, $type, $group) = (hex $1, $2, $3);

            # NOTE(review): %joining_types is defined outside this block;
            # the group entry is only used when it really exists, otherwise
            # the plain joining type is kept.
            my $special = ($group eq "ALAPH" || $group eq "DALATH RISH")
                          && defined $joining_types{$group};

            $joining_table[$char] = $joining_types{$special ? $group : $type};
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    # OUTPUT stays a bareword handle: dump_two_level_mapping is called
    # without a handle argument while it is open
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
1756
################################################################
# dump the Vertical Orientation table
#
# Reads VerticalOrientation.txt from the UCD archive and writes the
# per-character values of %vertical_types to "$filename.new" as the
# two-level C table vertical_orientation_table, then hands $filename to
# save_file().  Characters not listed in the data file get the value of
# type 'R'.
#
# Data lines are either "code ; type" or "first..last ; type".  Only the
# first 65536 characters are dumped, so both forms are clamped to that
# range instead of filling table slots that are never written out.
sub dump_vertical($)
{
    my $filename = shift;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # single character
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            # character range
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown vertical $type" unless defined $vertical_types{$type};

        # ignore everything above 0xffff; the range is empty when $first
        # is already beyond that limit
        $last = 0xffff if $last > 0xffff;
        foreach my $i ($first .. $last)
        {
            $vertical_table[$i] = $vertical_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n";
    print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
1806
################################################################
# dump the digit folding tables
#
# Writes @digitmap_table to "$filename.new" as the two-level C table
# wine_digitmap and then hands $filename to save_file().
sub dump_digit_folding($)
{
    my $filename = $_[0];

    # file banner, emitted in one go once the output file is open
    my $banner = join "",
        "/* Unicode digit folding mappings */\n",
        "/* generated from $UNIDATA:UnicodeData.txt */\n",
        "/* DO NOT EDIT!! */\n\n",
        "#include \"windef.h\"\n\n";

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT $banner;

    dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );

    close OUTPUT;
    save_file($filename);
}
1823
1824
################################################################
# compress a mapping table by removing identical rows
#
# Arguments: number of rows, default value for undefined entries, then
# the table itself (its size divided by the row count is the row length).
#
# Returns a list made of one offset per row followed by the shared data.
# Each offset already includes the row count, so it is the index in the
# returned list at which that row's values start.  A row that occurs
# anywhere in the data gathered so far is not stored again, and a new row
# whose beginning matches the end of the data only appends the missing
# part.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $len = @table / $rows;
    my @offsets;
    my $packed = "";

    foreach my $row (0 .. $rows - 1)
    {
        # one character per table entry, so index() can search whole rows
        my @cells = @table[($row * $len)..(($row + 1) * $len - 1)];
        my $rowtxt = pack "U*", map { defined($_) ? $_ : $def } @cells;

        my $pos = index $packed, $rowtxt;
        if ($pos == -1)
        {
            # look for the longest tail of the data that is also the
            # start of the new row, and drop it before appending the row
            my $first = substr( $rowtxt, 0, 1 );
            my $tail = length($packed) - 1;
            while ($tail > 0)
            {
                my $hit = index( substr( $packed, -$tail ), $first );
                last if $hit == -1;
                $tail -= $hit;
                if (substr( $packed, -$tail ) eq substr( $rowtxt, 0, $tail ))
                {
                    substr( $packed, -$tail, $tail, "" );
                    last;
                }
                $tail--;
            }
            $pos = length $packed;
            $packed .= $rowtxt;
        }
        push @offsets, $rows + $pos;
    }
    return ( @offsets, unpack( "U*", $packed ) );
}
1861
################################################################
# dump a char -> 16-bit value mapping table using two-level tables
#
# Arguments: C array name, default value for undefined entries, element
# size in bits (16 selects "unsigned short", anything else "unsigned
# int"), then the table; only its first 65536 entries are used.
#
# The table is compressed twice with compress_array(): first as 4096 rows
# of 16 values, then the 4096 row offsets as 256 rows of 16.  All three
# parts are printed to OUTPUT as one array: level 1 offsets, level 2
# offsets and values.
sub dump_two_level_mapping($$@)
{
    my ($name, $def, $size, @values) = @_;
    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    my @row_array = compress_array( 4096, $def, @values[0..65535] );
    my @array = compress_array( 256, 0, @row_array[0..4095] );

    # the 4096 row offsets are replaced by @array in the output, so the
    # level 2 offsets have to be shifted by the difference in size
    my $adjust = @array - 4096;
    $_ += $adjust foreach @array[256..$#array];

    my $total = @array + @row_array - 4096;
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, $total;
    printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
    printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
}
1880
################################################################
# dump a char -> value mapping table using three-level tables
#
# Arguments: C array name, default value for undefined entries, element
# size in bits (16 selects "unsigned short", anything else "unsigned
# int"), then the table, of which entries 0..$MAX_CHAR are used.
#
# The table is compressed three times with compress_array(), each level
# using rows of 16 entries, and printed to OUTPUT as one array: level 1,
# level 2 and level 3 offsets followed by the values.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size) = splice @_, 0, 3;
    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    # number of rows at each level
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;

    my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # rebase the offsets on the layout of the final array, where the
    # offset part of each lower level is replaced by the level above it
    my $adjust2 = @array1 + @array2 - $level2 - $level3;
    my $adjust1 = @array1 - $level2;
    $_ += $adjust2 foreach @array2[$level2..$#array2];
    $_ += $adjust1 foreach @array1[$level1..$#array1];

    my $total = @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, $total;
    printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
1905
################################################################
# dump a binary case mapping table in l_intl.nls format
#
# Takes a char -> char mapping table and returns a string of 16-bit
# little-endian words: a leading count word (one more than the number of
# words that follow it), then the two levels of offsets and the values
# produced by compressing the first 0x10000 entries of the table.  Each
# stored value is the 16-bit difference between a character and its
# mapping; unmapped characters get 0.
sub dump_binary_case_table(@)
{
    my @table = @_;
    my $max_char = 0x10000;
    my $level1 = $max_char / 16;
    my $level2 = $level1 / 16;

    # store differences rather than the mapped characters themselves
    my @difftable;
    foreach my $ch (grep { defined $table[$_] } 0 .. $#table)
    {
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffff;
    }

    my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
    my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );

    # the row offsets are replaced by @array in the output, so shift the
    # second level offsets by the difference in size
    my $offset = @array - $level1;
    $_ += $offset foreach @array[$level2..$#array];

    my @words = ( 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array] );
    return pack "S<*", @words;
}
1928
################################################################
# dump case mappings for l_intl.nls
#
# Writes "$filename.new" in binary mode: a 16-bit version word (1)
# followed by the upper case and lower case tables as produced by
# dump_binary_case_table(), then hands $filename to save_file().  The
# tables are copies of @toupper_table and @tolower_table that have been
# passed through remove_linguistic_mappings().
sub dump_intl_nls($)
{
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
1951
1952
################################################################
# dump the bidi direction table
#
# Translates the first 65536 entries of @direction_table through
# %bidi_types and writes them to "$filename.new" as the two-level C table
# bidi_direction_table, then hands $filename to save_file().  Characters
# without a direction get the value of type "L".
sub dump_bidi_dir_table($)
{
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: none of these strings is meant to be a printf format,
    # and the file name in particular must not be interpreted as one
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
1976
1977
################################################################
# rotate the 8-bit value $byte left by $count bits
sub rol($$)
{
    my ($byte, $count) = @_;
    my $shifted_out = $byte >> (8 - $count);  # bits that wrap around
    my $shifted_up = $byte << $count;
    return ($shifted_up | $shifted_out) & 0xff;
}
1983
################################################################
# compress the character properties table
#
# Arguments: number of rows, then the table (its size divided by the row
# count is the row length); undefined entries count as 0x7f.
#
# Returns one row index per input row followed by the contents of every
# distinct row that had to be stored.  Stored rows are numbered from 1 in
# order of first appearance.  Rows made entirely of the value
# rol($i, 5) for $i in 0, 0xfb..0xff are not stored at all and get $i
# itself as their index.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my $stored = 0;
    my @array = (0) x $rows;
    my %sequences;

    # add some predefined sequences
    foreach my $i (0, 0xfb .. 0xff)
    {
        my $fill = rol( $i, 5 );
        my $key = pack "L*", ($fill) x $len;
        $sequences{$key} = $i;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my @cells = @table[($row * $len)..(($row + 1) * $len - 1)];
        my @table_row = map { defined $_ ? $_ : 0x7f } @cells;
        my $rowtxt = pack "L*", @table_row;

        unless (defined $sequences{$rowtxt})
        {
            # first time this row is seen: store it under a new index
            $sequences{$rowtxt} = ++$stored;
            push @array, @table_row;
        }
        $array[$row] = $sequences{$rowtxt};
    }
    return @array;
}
2017
################################################################
# dump a normalization table in binary format
#
# $filename must have the form ".../norm<form>.nls", where <form> is one
# of nfc, nfd, nfkc, nfkd or idna; anything else is rejected before the
# output file is touched.  The data is written to "$filename.new",
# $filename is handed to save_file() and a "Normalization" registry value
# is recorded with add_registry_value().
#
# The file consists of 16-bit little-endian words unless noted otherwise:
#   - 16 words holding the name "norm<form>.nlp"
#   - a 10 word header (Unicode version, form number, hash table sizes...)
#   - 8 words giving the position, in words, of the tables that follow
#   - combining class values (bytes)
#   - character property table: one row index per 128 characters, then
#     the stored rows (bytes)
#   - decomposition hash index, hash data and character data
#   - composition hash index and data (composing forms only)
sub dump_norm_table($)
{
    my $filename = shift;

    my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
    my %decomp = ( "nfc" => \@decomp_table,
                   "nfd" => \@decomp_table,
                   "nfkc" => \@decomp_compat_table,
                   "nfkd" => \@decomp_compat_table ,
                   "idna" => \@idna_decomp_table );

    # The form is taken from the file name.  If the name does not match,
    # the substitution leaves it unchanged and the lookup below fails, so
    # complain instead of silently building a bogus table.
    my $type = $filename;
    $type =~ s!.*/norm(\w+)\.nls!$1!;
    die "$filename: unknown normalization form\n" unless defined $forms{$type};

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";

    # bit 0 of the form number selects composition, bit 2 compatibility
    # decomposition; idna adds one more to $compat
    my $compose = $forms{$type} & 1;
    my $compat = !!($forms{$type} & 4) + ($type eq "idna");

    my @version = split /\./, $UNIVERSION;

    # combining classes

    # @classes maps a combining class (below 0x100) to its position in
    # @class_values, which lists the classes in use in increasing order
    my @classes;
    my @class_values;

    foreach my $c (grep defined, @combining_class_table)
    {
        $classes[$c] = 1 if $c < 0x100;
    }
    for (my $i = 0; $i < @classes; $i++)
    {
        next unless defined $classes[$i];
        $classes[$i] = @class_values;
        push @class_values, $i;
    }
    # keep the number of bytes even so the next table stays word-aligned
    push @class_values, 0 if (@class_values % 2);
    die "too many classes" if @class_values >= 0x40;

    # character properties

    my @char_props;
    my @decomposed;
    my @comp_hash_table;
    my $comp_hash_size = $compose ? 254 : 0;

    for (my $i = 0; $i <= $MAX_CHAR; $i++)
    {
        next unless defined $combining_class_table[$i];
        if (defined $decomp{$type}->[$i])
        {
            my @dec = get_decomposition( $i, $decomp{$type} );
            if ($compose && (my @comp = get_composition( $i, $compat )))
            {
                # composable character: remember the pair in the
                # composition hash and use the class index of the first
                # decomposed character with a non-zero class as property
                my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
                push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );

                my $val = 0;
                foreach my $d (@dec)
                {
                    $val = $combining_class_table[$d];
                    last if $val;
                }
                $char_props[$i] = $classes[$val];
            }
            else
            {
                # decomposition only
                $char_props[$i] = 0xbf;
            }
            @dec = compose_hangul( @dec ) if $compose;
            @dec = to_utf16( @dec );
            # the stored length is capped at 7 below, so sequences that
            # long get a terminating 0
            push @dec, 0 if @dec >= 7;
            $decomposed[$i] = \@dec;
        }
        else
        {
            if ($combining_class_table[$i] == 0x100)
            {
                $char_props[$i] = 0x7f;
            }
            elsif ($combining_class_table[$i])
            {
                $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
            }
            elsif ($type eq "idna" && defined $idna_disallowed[$i])
            {
                $char_props[$i] = 0xff;
            }
            else
            {
                $char_props[$i] = 0;
            }
        }
    }

    if ($compose)
    {
        # flag both halves of every composition pair; the flags used
        # depend on whether the second character has a combining class
        for (my $i = 0; $i <= $MAX_CHAR; $i++)
        {
            my @comp = get_composition( $i, $compat );
            next unless @comp;
            if ($combining_class_table[$comp[1]])
            {
                $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
                $char_props[$comp[1]] |= 0x40;
            }
            else
            {
                $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
                $char_props[$comp[1]] |= 0xc0;
            }
        }
    }

    # surrogates
    foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
    foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }

    # Hangul
    if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
    elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
    foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }

    # invalid chars
    if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
    foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
    foreach my $i (0x00..0x10)
    {
        # the last two code points of every plane
        $char_props[($i << 16) | 0xfffe] = 0xff;
        $char_props[($i << 16) | 0xffff] = 0xff;
    }

    # decomposition hash table

    my @decomp_hash_table;
    my @decomp_hash_index;
    my @decomp_hash_data;
    my $decomp_hash_size = 944;

    # build string of character data, reusing substrings when possible
    # (longest sequences first, so that shorter ones can be found in them)
    my $decomp_char_data = "";
    foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
    {
        my $str = pack "U*", @{$i};
        $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
    }
    for (my $i = 0; $i < @decomposed; $i++)
    {
        next unless defined $decomposed[$i];
        my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
        die "sequence not found" if $pos == -1;
        my $len = @{$decomposed[$i]};
        $len = 7 if $len > 7;
        # each entry is the character and its length/position word
        my $hash = $i % $decomp_hash_size;
        push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
    }
    for (my $i = 0; $i < $decomp_hash_size; $i++)
    {
        # by default the index is the number of the bucket's first entry
        $decomp_hash_index[$i] = @decomp_hash_data / 2;
        next unless defined $decomp_hash_table[$i];
        if (@{$decomp_hash_table[$i]} == 1)
        {
            # a bucket holding a single decomposition-only character
            # stores the length/position word directly in the index
            my $entry = $decomp_hash_table[$i]->[0];
            if ($char_props[$entry->[0]] == 0xbf)
            {
                $decomp_hash_index[$i] = $entry->[1];
                next;
            }
        }
        foreach my $entry (@{$decomp_hash_table[$i]})
        {
            push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
        }
    }
    push @decomp_hash_data, 0, 0;

    # composition hash table

    my @comp_hash_index;
    my @comp_hash_data;
    if (@comp_hash_table)
    {
        for (my $i = 0; $i < $comp_hash_size; $i++)
        {
            $comp_hash_index[$i] = @comp_hash_data;
            push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
        }
        # one extra index entry marks the end of the last bucket
        $comp_hash_index[$comp_hash_size] = @comp_hash_data;
        push @comp_hash_data, 0, 0, 0;
    }

    # one property row per 128 characters
    my $level1 = ($MAX_CHAR + 1) / 128;
    my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );

    my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
                   0, $decomp_hash_size, $comp_hash_size, 0 );
    my @tables = (0) x 8;

    # table positions are counted in 16-bit words from the start of the
    # file, hence the division by 2 for the tables made of bytes
    $tables[0] = 16 + @header + @tables;
    $tables[1] = $tables[0] + @class_values / 2;
    $tables[2] = $tables[1] + $level1 / 2;
    $tables[3] = $tables[2] + (@rows - $level1) / 2;
    $tables[4] = $tables[3] + @decomp_hash_index;
    $tables[5] = $tables[4] + @decomp_hash_data;
    $tables[6] = $tables[5] + length $decomp_char_data;
    $tables[7] = $tables[6] + @comp_hash_index;

    print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "S<*", @tables;
    print OUTPUT pack "C*", @class_values;

    print OUTPUT pack "C*", @rows[0..$level1-1];
    print OUTPUT pack "C*", @rows[$level1..$#rows];
    print OUTPUT pack "S<*", @decomp_hash_index;
    print OUTPUT pack "S<*", @decomp_hash_data;
    print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
    print OUTPUT pack "S<*", @comp_hash_index;
    print OUTPUT pack "S<*", @comp_hash_data;

    close OUTPUT;
    save_file($filename);

    add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
}
2246
2247
################################################################
# output a codepage definition file from the global tables
#
# Writes "nls/c_<codepage>.nls.new" (the number is zero-padded to three
# digits) using the double-byte dumper when @lead_bytes is not empty and
# the single-byte one otherwise, hands the file to save_file() and
# records a "Codepage" registry value for it.
sub output_codepage_file($)
{
    my ($codepage) = @_;

    my $nls_name = sprintf "c_%03d.nls", $codepage;
    my $output = "nls/$nls_name";
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s\n", $output;
    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), $nls_name );
}
2266
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Resets the global codepage tables (@cp2uni, @glyph2uni, @lead_bytes,
# @uni2cp, $default_char, $default_wchar), fills them from $filename in
# the $MSCODEPAGES archive and calls output_codepage_file() for the
# codepage named in the file.  Codepage 949 additionally gets
# dump_krwansung_codepage().
#
# The file is a sequence of sections.  A section header (MBTABLE,
# GLYPHTABLE, WCTABLE, DBCSRANGE or DBCSTABLE, followed by a count)
# selects how the "0x.. 0x.." lines after it are interpreted; parsing
# stops at ENDCODEPAGE.  The first mapping seen for a given key wins.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";
    my ($codepage, $count);
    my ($lb_cur, $lb_end);

    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # CPINFO <width> <default char> <default wide char>; the width
        # is not needed here
        if (/^CPINFO\s+\d+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $default_char = hex $1;
            $default_wchar = hex $2;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                # range of lead bytes; the DBCSTABLE sections that follow
                # describe these lead bytes in order, one table each
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # after the last entry of this table move on to the next
                # lead byte, or expect a new range when all are done
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    die "$filename: missing CODEPAGE line\n" unless defined $codepage;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
2364
################################################################
# align a string length
#
# Returns $str padded with NUL bytes up to the next multiple of $align;
# a string whose length already is such a multiple is returned unchanged.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . ("\0" x ($align - $excess));
}
2373
################################################################
# pack a GUID string
#
# Takes a GUID in the usual "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" hex
# notation and returns its 16 byte binary form: the first group as a
# little-endian 32-bit value, the next two as little-endian 16-bit values
# and the remaining eight bytes in the order they are written.
#
# Dies if the string does not contain a GUID.  The string is matched
# through a lexical variable so that the caller's $_ is left alone.
sub pack_guid($)
{
    my $guid = shift;
    my $h2 = '([0-9A-Fa-f]{2})';
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-$h2$h2-$h2$h2$h2$h2$h2$h2/
        or die "malformed GUID $guid\n";
    return pack "L<S<2C8", map { hex } @fields;
}
2382
################################################################
# comparison function for compression sort
#
# Sort callback working on the array references in $a and $b: shorter
# arrays come first, arrays of equal length are ordered numerically by
# their elements 4 to 12, the lower indices being the most significant.
sub cmp_compression
{
    my $order = @{$a} <=> @{$b};
    return $order if $order;

    foreach my $idx (4 .. 11)
    {
        $order = $a->[$idx] <=> $b->[$idx];
        return $order if $order;
    }
    return $a->[12] <=> $b->[12];
}
2398
2399 ################################################################
2400 # build a binary sort keys table
2401 sub dump_sortkey_table($$)
2402 {
2403     my ($filename, $download) = @_;
2404
2405     my @keys;
2406     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2407     my @multiple_weights;
2408     my @expansions;
2409     my @compressions;
2410     my %exceptions;
2411     my %guids;
2412     my %compr_flags;
2413     my %locales;
2414     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2415     my $jamostr = "";
2416
2417     my $re_hex = '0x[0-9A-Fa-f]+';
2418     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2419     $guids{$default_guid} = { };
2420
2421     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2422
2423     my $KEYS = open_data_file( $MSDATA, $download );
2424
2425     printf "Building $filename\n";
2426
2427     while (<$KEYS>)
2428     {
2429         s/\s*;.*$//;
2430         next if /^\s*$/;  # skip empty lines
2431         if (/^\s*(SORTKEY|SORTTABLES)/)
2432         {
2433             $part = $1;
2434             next;
2435         }
2436         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2437         {
2438             $part = $section = "";
2439             next;
2440         }
2441         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2442         {
2443             $section = $1;
2444             $guid = undef;
2445             next;
2446         }
2447         next unless $part;
2448         if ("$part.$section" eq "SORTKEY.DEFAULT")
2449         {
2450             if (/^\s*($re_hex)\s+$re_key/)
2451             {
2452                 $keys[hex $1] = [ split(/\s+/,$2) ];
2453                 next;
2454             }
2455         }
2456         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2457         {
2458             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2459             {
2460                 $version = hex $1;
2461                 next;
2462             }
2463             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2464             {
2465                 # ignore for now
2466                 next;
2467             }
2468         }
2469         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2470                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2471                "$part.$section" eq "SORTTABLES.INVERSECASING")
2472         {
2473             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2474             {
2475                 $guid = lc $1;
2476                 $guids{$guid} = { } unless defined $guids{$guid};
2477                 $guids{$guid}->{flags} |= $flags{$section};
2478                 next;
2479             }
2480             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2481             {
2482                 $locales{$1} = $guid;
2483                 next;
2484             }
2485         }
2486         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2487         {
2488             if (/^\s*(\d+)\s+(\d+)/)
2489             {
2490                 push @multiple_weights, $1, $2;
2491                 next;
2492             }
2493         }
2494         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2495         {
2496             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2497             {
2498                 my $pos = scalar @expansions / 2;
2499                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2500                 push @expansions, hex $2, hex $3;
2501                 next;
2502             }
2503         }
2504         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2505         {
2506             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2507             {
2508                 $keys[hex $1] = $keys[hex $2];
2509                 next;
2510             }
2511         }
2512         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2513         {
2514             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2515             {
2516                 if ($subsection || !$guid)  # start a new one
2517                 {
2518                     $guid = lc $1;
2519                     $subsection = "";
2520                     $guids{$guid} = { } unless defined $guids{$guid};
2521                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2522                     $guids{$guid}->{compr} = @compressions;
2523                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2524                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2525                     push @compressions, [ ];
2526                 }
2527                 else  # merge with current one
2528                 {
2529                     $guids{lc $1} = { } unless defined $guids{lc $1};
2530                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2531                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2532                     $compr_flags{lc $1} = $compr_flags{$guid};
2533                 }
2534                 next;
2535             }
2536             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2537             {
2538                 $locales{$1} = $guid;
2539                 next;
2540             }
2541             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2542             {
2543                 $subsection = $1;
2544                 next;
2545             }
2546             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2547             {
2548                 my @comp = map { hex $_; } split(/\s+/,$1);
2549                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2550                 # add compression flags
2551                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2552                 next;
2553             }
2554         }
2555         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2556         {
2557             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2558             {
2559                 $guid = lc $1;
2560                 $guids{$guid} = { } unless defined $guids{lc $1};
2561                 $ling_flag = ($2 ? "+" : "-");
2562                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2563                 next;
2564             }
2565             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2566             {
2567                 $locales{$1} = $guid;
2568                 next;
2569             }
2570             if (/^\s*($re_hex)\s+$re_key/)
2571             {
2572                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2573                 next;
2574             }
2575         }
2576         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2577         {
2578             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2579             {
2580                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2581                 next;
2582             }
2583         }
2584         die "$download: $part.$section: unrecognized line $_\n";
2585     }
2586     close $KEYS;
2587
2588     # Sortkey table
2589
2590     my $table;
2591     for (my $i = 0; $i < 0x10000; $i++)
2592     {
2593         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2594         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2595     }
2596
2597     foreach my $id (sort keys %exceptions)
2598     {
2599         my $pos = length($table) / 4;
2600         my @exc = @{$exceptions{$id}};
2601         my @filled;
2602         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2603         my $guid = substr( $id, 0, -1 );
2604         $guids{$guid}->{$key} = $pos;
2605         $pos += 0x100;
2606         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2607         for (my $j = 0; $j < 0x10000; $j++)
2608         {
2609             next unless defined $exc[$j] || defined $flags[$j];
2610             $filled[$j >> 8] = 1;
2611             $j |= 0xff;
2612         }
2613         for (my $j = 0; $j < 0x100; $j++)
2614         {
2615             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2616             $pos += 0x100 if $filled[$j];
2617         }
2618         for (my $j = 0; $j < 0x10000; $j++)
2619         {
2620             next unless $filled[$j >> 8];
2621             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2622             $k[3] |= $flags[$j] || 0;
2623             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2624         }
2625     }
2626
2627     # Case mapping tables
2628
2629     # standard table
2630     my @casemaps;
2631     my @upper = @toupper_table;
2632     my @lower = @tolower_table;
2633     remove_linguistic_mappings( \@upper, \@lower );
2634     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2635
2636     # linguistic table
2637     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2638
2639     # Turkish table
2640     @upper = @toupper_table;
2641     @lower = @tolower_table;
2642     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2643     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2644     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2645     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2646
2647     # Char type table
2648
2649     my @table;
2650     my $types = "";
2651     my %typestr;
2652     for (my $i = 0; $i < 0x10000; $i++)
2653     {
2654         my $str = pack "S<3",
2655             ($category_table[$i] || 0) & 0xffff,
2656             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2657             ($category_table[$i] || 0) >> 16;
2658
2659         if (!defined($typestr{$str}))
2660         {
2661             $typestr{$str} = length($types) / 6;
2662             $types .= $str;
2663         }
2664         $table[$i] = $typestr{$str};
2665     }
2666
2667     my @rows = compress_array( 4096, 0, @table[0..65535] );
2668     my @array = compress_array( 256, 0, @rows[0..4095] );
2669     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2670     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2671
2672     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2673     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2674     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2675
2676     # Sort tables
2677
2678     # guids
2679     my $sorttables = pack "L<2", $version, scalar %guids;
2680     foreach my $id (sort keys %guids)
2681     {
2682         my %guid = %{$guids{$id}};
2683         my $flags = $guid{flags} || 0;
2684         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2685         $sorttables .= pack_guid($id) . pack "L<5",
2686             $flags,
2687             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2688             $guid{except} || 0,
2689             $guid{ling_except} || 0,
2690             $map / 2;
2691     }
2692
2693     # expansions
2694     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2695
2696     # compressions
2697     $sorttables .= pack "L<", scalar @compressions;
2698     my $rowstr = "";
2699     foreach my $c (@compressions)
2700     {
2701         my $pos = length($rowstr) / 2;
2702         my $min = 0xffff;
2703         my $max = 0;
2704         my @lengths = (0) x 8;
2705         foreach my $r (sort cmp_compression @{$c})
2706         {
2707             my @row = @{$r};
2708             $lengths[scalar @row - 6]++;
2709             foreach my $val (@row[4..$#row])
2710             {
2711                 $min = $val if $min > $val;
2712                 $max = $val if $max < $val;
2713             }
2714             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2715             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2716         }
2717         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2718     }
2719     $sorttables .= $rowstr;
2720
2721     # multiple weights
2722     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2723
2724     # jamo sort
2725     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2726
2727     # Locales
2728
2729     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2730     foreach my $loc (sort keys %locales)
2731     {
2732         # skip specific locales that match more general ones
2733         my @parts = split /[-_]/, $loc;
2734         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2735         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2736         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2737     }
2738
2739     # File header
2740
2741     my @header;
2742     $header[0] = 16;
2743     $header[1] = $header[0] + length $table;
2744     $header[2] = $header[1] + length $casemaps;
2745     $header[3] = $header[2] + length $chartypes;
2746
2747     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2748     print OUTPUT pack "L<*", @header;
2749     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2750     close OUTPUT;
2751     save_file($filename);
2752 }
2753
2754
################################################################
# build the script to create registry keys
#
# Parameters:
#   $filename - path of the script to generate; the text is first written
#               to "$filename.new" and then handed to save_file()
#   %keys     - maps a backslash-separated key path (nested below
#               SYSTEM\CurrentControlSet\Control\Nls) to an array ref
#               holding the key's default value followed by its value lines
#
# Dies if the temporary file cannot be created or completely written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;

    printf "Building %s\n", $filename;
    # 3-arg open with a lexical handle: the file name is never parsed for a mode
    open my $out, ">", "$filename.new" or die "Cannot create $filename: $!";
    print $out "HKLM\n{\n";

    # fixed outer keys, each one opening a new nesting level
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf $out "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }

    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};

        # open every path component; only the last one carries the default value
        for (my $i = 0; $i < @subkeys; $i++)
        {
            printf $out "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }

        # the value is passed as a printf argument, not spliced into the
        # format, so a '%' inside a value is written out literally
        foreach my $v (sort @vals) { printf $out "%*sval %s\n", 4 * $indent, "", $v; }

        # close the components opened for this key
        for (my $i = 0; $i < @subkeys; $i++) { printf $out "%*s}\n", 4 * --$indent, ""; }
    }

    # close the fixed outer keys and the HKLM block itself
    while ($indent) { printf $out "%*s}\n", 4 * --$indent, ""; }

    # buffered write errors only show up at close time
    close $out or die "Cannot write $filename.new: $!";
    save_file($filename);
}
2787
2788
################################################################
# save a file if modified
#
# Expects the freshly generated content in "$file.new".  If $file already
# exists as a plain file with identical content, the new copy is removed
# and $file is left untouched; otherwise "$file.new" is renamed over $file.
#
# Dies if the rename fails; warns if the redundant copy cannot be removed.
sub save_file($)
{
    my $file = shift;

    # Compare in-process with the core File::Compare module instead of
    # building a "cmp" shell command, so names with spaces or shell
    # metacharacters are handled correctly.
    require File::Compare;

    # compare() returns 0 only when both files could be read and are equal
    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new" or warn "Cannot remove $file.new: $!\n";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!\n";
    }
}
2803
2804
################################################################
# main routine
#
# Loads the input data, then regenerates every output file in turn.  The
# registry script is written last, from %registry_keys.

# All output paths below are relative, so when started from the directory
# that contains this script, move one level up first.  A failed chdir would
# silently send every generated file to the wrong place, so it is fatal.
if (-f "./make_unicode")
{
    chdir ".." or die "Cannot chdir to parent directory: $!\n";
}

load_data();
dump_sortkeys( "dlls/kernelbase/collation.c" );
dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
dump_bidi_dir_table( "dlls/dwrite/direction.c" );
dump_digit_folding( "dlls/kernelbase/digitmap.c" );
dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
dump_mirroring( "dlls/dwrite/mirror.c" );
dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
dump_bracket( "dlls/dwrite/bracket.c" );
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
dump_linebreak( "dlls/dwrite/linebreak.c" );
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/gdi32/vertical.c" );
dump_vertical( "dlls/wineps.drv/vertical.c" );
dump_intl_nls("nls/l_intl.nls");
dump_norm_table( "nls/normnfc.nls" );
dump_norm_table( "nls/normnfd.nls" );
dump_norm_table( "nls/normnfkc.nls" );
dump_norm_table( "nls/normnfkd.nls" );
dump_norm_table( "nls/normidna.nls" );
dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
dump_eucjp_codepage();
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
2838
2839 # Local Variables:
2840 # compile-command: "./make_unicode"
2841 # End: