tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $REPORTS = "http://www.unicode.org/reports";
  30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  32
  33 # Sort keys file
  34 my $SORTKEYS = "tr10/allkeys.txt";
  35
  36 # Default char for undefined mappings
  37 my $DEF_CHAR = ord '?';
  38
  39 # Last valid Unicode character
  40 my $MAX_CHAR = 0x10ffff;
  41
  42 my @allfiles =
  43 (
  44     "CodpageFiles/037.txt",
  45     "CodpageFiles/437.txt",
  46     "CodpageFiles/500.txt",
  47     "CodpageFiles/708.txt",
  48     "CodpageFiles/737.txt",
  49     "CodpageFiles/775.txt",
  50     "CodpageFiles/850.txt",
  51     "CodpageFiles/852.txt",
  52     "CodpageFiles/855.txt",
  53     "CodpageFiles/857.txt",
  54     "CodpageFiles/860.txt",
  55     "CodpageFiles/861.txt",
  56     "CodpageFiles/862.txt",
  57     "CodpageFiles/863.txt",
  58     "CodpageFiles/864.txt",
  59     "CodpageFiles/865.txt",
  60     "CodpageFiles/866.txt",
  61     "CodpageFiles/869.txt",
  62     "CodpageFiles/874.txt",
  63     "CodpageFiles/875.txt",
  64     "CodpageFiles/932.txt",
  65     "CodpageFiles/936.txt",
  66     "CodpageFiles/949.txt",
  67     "CodpageFiles/950.txt",
  68     "CodpageFiles/1026.txt",
  69     "CodpageFiles/1250.txt",
  70     "CodpageFiles/1251.txt",
  71     "CodpageFiles/1252.txt",
  72     "CodpageFiles/1253.txt",
  73     "CodpageFiles/1254.txt",
  74     "CodpageFiles/1255.txt",
  75     "CodpageFiles/1256.txt",
  76     "CodpageFiles/1257.txt",
  77     "CodpageFiles/1258.txt",
  78     "CodpageFiles/1361.txt",
  79     "CodpageFiles/10000.txt",
  80     "CodpageFiles/10001.txt",
  81     "CodpageFiles/10002.txt",
  82     "CodpageFiles/10003.txt",
  83     "CodpageFiles/10004.txt",
  84     "CodpageFiles/10005.txt",
  85     "CodpageFiles/10006.txt",
  86     "CodpageFiles/10007.txt",
  87     "CodpageFiles/10008.txt",
  88     "CodpageFiles/10010.txt",
  89     "CodpageFiles/10017.txt",
  90     "CodpageFiles/10021.txt",
  91     "CodpageFiles/10029.txt",
  92     "CodpageFiles/10079.txt",
  93     "CodpageFiles/10081.txt",
  94     "CodpageFiles/10082.txt",
  95     "CodpageFiles/20127.txt",
  96     "CodpageFiles/20866.txt",
  97     "CodpageFiles/21866.txt",
  98     "CodpageFiles/28591.txt",
  99     "CodpageFiles/28592.txt",
 100     "CodpageFiles/28593.txt",
 101     "CodpageFiles/28594.txt",
 102     "CodpageFiles/28595.txt",
 103     "CodpageFiles/28596.txt",
 104     "CodpageFiles/28597.txt",
 105     "CodpageFiles/28598.txt",
 106     "CodpageFiles/28599.txt",
 107     "CodpageFiles/28603.txt",
 108     "CodpageFiles/28605.txt",
 109 );
 110
 111
 112 my %ctype =
 113 (
 114      # CT_CTYPE1
 115     "upper"  => 0x0001,
 116     "lower"  => 0x0002,
 117     "digit"  => 0x0004,
 118     "space"  => 0x0008,
 119     "punct"  => 0x0010,
 120     "cntrl"  => 0x0020,
 121     "blank"  => 0x0040,
 122     "xdigit" => 0x0080,
 123     "alpha"  => 0x0100 | 0x80000000,
 124     "defin"  => 0x0200,
 125      # CT_CTYPE3 in high 16 bits
 126     "nonspacing"    => 0x00010000,
 127     "diacritic"     => 0x00020000,
 128     "vowelmark"     => 0x00040000,
 129     "symbol"        => 0x00080000,
 130     "katakana"      => 0x00100000,
 131     "hiragana"      => 0x00200000,
 132     "halfwidth"     => 0x00400000,
 133     "fullwidth"     => 0x00800000,
 134     "ideograph"     => 0x01000000,
 135     "kashida"       => 0x02000000,
 136     "lexical"       => 0x04000000,
 137     "highsurrogate" => 0x08000000,
 138     "lowsurrogate"  => 0x10000000,
 139 );
 140
 141 my %bracket_types =
 142 (
 143     "o" => 0x0000,
 144     "c" => 0x0001,
 145 );
 146
 147 my %indic_types =
 148 (
 149     "Other"    => 0x0000,
 150     "Bindu"    => 0x0001,
 151     "Visarga"  => 0x0002,
 152     "Avagraha" => 0x0003,
 153     "Nukta"    => 0x0004,
 154     "Virama"   => 0x0005,
 155     "Vowel_Independent"  => 0x0006,
 156     "Vowel_Dependent"  => 0x0007,
 157     "Vowel"  => 0x0008,
 158     "Consonant_Placeholder"  => 0x0009,
 159     "Consonant"  => 0x000a,
 160     "Consonant_Dead"  => 0x000b,
 161     "Consonant_Succeeding_Repha" => 0x000c,
 162     "Consonant_Subjoined"  => 0x000d,
 163     "Consonant_Medial"  => 0x000e,
 164     "Consonant_Final"  => 0x000f,
 165     "Consonant_Head_Letter"  => 0x0010,
 166     "Modifying_Letter"  => 0x0011,
 167     "Tone_Letter"  => 0x0012,
 168     "Tone_Mark"  => 0x0013,
 169     "Register_Shifter"  => 0x0014,
 170     "Consonant_Preceding_Repha" => 0x0015,
 171     "Pure_Killer" => 0x0016,
 172     "Invisible_Stacker" => 0x0017,
 173     "Gemination_Mark" => 0x0018,
 174     "Cantillation_Mark" => 0x0019,
 175     "Non_Joiner" => 0x001a,
 176     "Joiner" => 0x001b,
 177     "Number_Joiner" => 0x001c,
 178     "Number" => 0x001d,
 179     "Brahmi_Joining_Number" => 0x001e,
 180     "Consonant_With_Stacker" => 0x001f,
 181     "Consonant_Prefixed" => 0x0020,
 182     "Syllable_Modifier" => 0x0021,
 183     "Consonant_Killer" => 0x0022,
 184     "Consonant_Initial_Postfixed" => 0x0023,
 185 );
 186
 187 my %matra_types =
 188 (
 189     "Right"    => 0x01,
 190     "Left"  => 0x02,
 191     "Visual_Order_Left" => 0x03,
 192     "Left_And_Right"    => 0x04,
 193     "Top"   => 0x05,
 194     "Bottom"  => 0x06,
 195     "Top_And_Bottom"  => 0x07,
 196     "Top_And_Right"  => 0x08,
 197     "Top_And_Left"  => 0x09,
 198     "Top_And_Left_And_Right"  => 0x0a,
 199     "Bottom_And_Right"  => 0x0b,
 200     "Top_And_Bottom_And_Right"  => 0x0c,
 201     "Overstruck"  => 0x0d,
 202     "Invisible"  => 0x0e,
 203     "Bottom_And_Left"  => 0x0f,
 204     "Top_And_Bottom_And_Left"  => 0x10,
 205 );
 206
 207 my %break_types =
 208 (
 209     "BK"  => 0x0001,
 210     "CR"  => 0x0002,
 211     "LF"  => 0x0003,
 212     "CM"  => 0x0004,
 213     "SG"  => 0x0005,
 214     "GL"  => 0x0006,
 215     "CB"  => 0x0007,
 216     "SP"  => 0x0008,
 217     "ZW"  => 0x0009,
 218     "NL"  => 0x000a,
 219     "WJ"  => 0x000b,
 220     "JL"  => 0x000c,
 221     "JV"  => 0x000d,
 222     "JT"  => 0x000e,
 223     "H2"  => 0x000f,
 224     "H3"  => 0x0010,
 225     "XX"  => 0x0011,
 226     "OP"  => 0x0012,
 227     "CL"  => 0x0013,
 228     "CP"  => 0x0014,
 229     "QU"  => 0x0015,
 230     "NS"  => 0x0016,
 231     "EX"  => 0x0017,
 232     "SY"  => 0x0018,
 233     "IS"  => 0x0019,
 234     "PR"  => 0x001a,
 235     "PO"  => 0x001b,
 236     "NU"  => 0x001c,
 237     "AL"  => 0x001d,
 238     "ID"  => 0x001e,
 239     "IN"  => 0x001f,
 240     "HY"  => 0x0020,
 241     "BB"  => 0x0021,
 242     "BA"  => 0x0022,
 243     "SA"  => 0x0023,
 244     "AI"  => 0x0024,
 245     "B2"  => 0x0025,
 246     "HL"  => 0x0026,
 247     "CJ"  => 0x0027,
 248     "RI"  => 0x0028,
 249     "EB"  => 0x0029,
 250     "EM"  => 0x002a,
 251     "ZWJ" => 0x002b,
 252 );
 253
 254 my %vertical_types =
 255 (
 256     "R"  => 0x0000,
 257     "U"  => 0x0001,
 258     "Tr" => 0x0002,
 259     "Tu" => 0x0003,
 260 );
 261
 262 my %categories =
 263 (
 264     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 265     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 266     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 267     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 268     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 269     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 270     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 271     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 272     "No" => $ctype{"defin"},                    # Number, Other
 273     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 274     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 275     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 276     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 277     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 278     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 279     "Co" => $ctype{"defin"},                    # Other, Private Use
 280     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 281     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 282     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 283     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 284     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 285     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 286     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 287     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 288     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 289     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 290     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 291     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 292     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 293     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 294 );
 295
 296 # a few characters need additional categories that cannot be determined automatically
 297 my %special_categories =
 298 (
 299     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 300                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 301     "space"  => [ 0x09..0x0d, 0x85 ],
 302     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 303     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 304                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 305                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 306                   0xfff9, 0xfffa, 0xfffb ],
 307     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 308                   0xd7, 0xf7 ],
 309     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 310     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 311     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 312                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 313     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 314     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 315                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 316                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 317     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 318     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 319                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 320                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 321                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 322                      0x3131..0x3164 ],
 323     "ideograph" => [ 0x3006..0x3007 ],
 324     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 325                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 326                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 327                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 328                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 329                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 330                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 331                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 332                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 333                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 334     "kashida" => [ 0x0640 ],
 335 );
 336
 337 my %directions =
 338 (
 339     "L"   => 1,    # Left-to-Right
 340     "R"   => 2,    # Right-to-Left
 341     "AL"  => 12,   # Right-to-Left Arabic
 342     "EN"  => 3,    # European Number
 343     "ES"  => 4,    # European Number Separator
 344     "ET"  => 5,    # European Number Terminator
 345     "AN"  => 6,    # Arabic Number
 346     "CS"  => 7,    # Common Number Separator
 347     "NSM" => 13,   # Non-Spacing Mark
 348     "BN"  => 14,   # Boundary Neutral
 349     "B"   => 8,    # Paragraph Separator
 350     "S"   => 9,    # Segment Separator
 351     "WS"  => 10,   # Whitespace
 352     "ON"  => 11,   # Other Neutrals
 353     "LRE" => 15,   # Left-to-Right Embedding
 354     "LRO" => 15,   # Left-to-Right Override
 355     "RLE" => 15,   # Right-to-Left Embedding
 356     "RLO" => 15,   # Right-to-Left Override
 357     "PDF" => 15,   # Pop Directional Format
 358     "LRI" => 15,   # Left-to-Right Isolate
 359     "RLI" => 15,   # Right-to-Left Isolate
 360     "FSI" => 15,   # First Strong Isolate
 361     "PDI" => 15    # Pop Directional Isolate
 362 );
 363
 364 my %c2_types =
 365 (
 366     "L"   => 1,    # C2_LEFTTORIGHT
 367     "R"   => 2,    # C2_RIGHTTOLEFT
 368     "AL"  => 2,    # C2_RIGHTTOLEFT
 369     "EN"  => 3,    # C2_EUROPENUMBER
 370     "ES"  => 4,    # C2_EUROPESEPARATOR
 371     "ET"  => 5,    # C2_EUROPETERMINATOR
 372     "AN"  => 6,    # C2_ARABICNUMBER
 373     "CS"  => 7,    # C2_COMMONSEPARATOR
 374     "NSM" => 11,   # C2_OTHERNEUTRAL
 375     "BN"  => 0,    # C2_NOTAPPLICABLE
 376     "B"   => 8,    # C2_BLOCKSEPARATOR
 377     "S"   => 9,    # C2_SEGMENTSEPARATOR
 378     "WS"  => 10,   # C2_WHITESPACE
 379     "ON"  => 11,   # C2_OTHERNEUTRAL
 380     "LRE" => 11,   # C2_OTHERNEUTRAL
 381     "LRO" => 11,   # C2_OTHERNEUTRAL
 382     "RLE" => 11,   # C2_OTHERNEUTRAL
 383     "RLO" => 11,   # C2_OTHERNEUTRAL
 384     "PDF" => 11,   # C2_OTHERNEUTRAL
 385     "LRI" => 11,   # C2_OTHERNEUTRAL
 386     "RLI" => 11,   # C2_OTHERNEUTRAL
 387     "FSI" => 11,   # C2_OTHERNEUTRAL
 388     "PDI" => 11    # C2_OTHERNEUTRAL
 389 );
 390
 391 my %bidi_types =
 392 (
 393     "ON"  => 0,    # Other Neutrals
 394     "L"   => 1,    # Left-to-Right
 395     "R"   => 2,    # Right-to-Left
 396     "AN"  => 3,    # Arabic Number
 397     "EN"  => 4,    # European Number
 398     "AL"  => 5,    # Right-to-Left Arabic
 399     "NSM" => 6,    # Non-Spacing Mark
 400     "CS"  => 7,    # Common Number Separator
 401     "ES"  => 8,    # European Number Separator
 402     "ET"  => 9,    # European Number Terminator
 403     "BN"  => 10,   # Boundary Neutral
 404     "S"   => 11,   # Segment Separator
 405     "WS"  => 12,   # Whitespace
 406     "B"   => 13,   # Paragraph Separator
 407     "RLO" => 14,   # Right-to-Left Override
 408     "RLE" => 15,   # Right-to-Left Embedding
 409     "LRO" => 16,   # Left-to-Right Override
 410     "LRE" => 17,   # Left-to-Right Embedding
 411     "PDF" => 18,   # Pop Directional Format
 412     "LRI" => 19,   # Left-to-Right Isolate
 413     "RLI" => 20,   # Right-to-Left Isolate
 414     "FSI" => 21,   # First Strong Isolate
 415     "PDI" => 22    # Pop Directional Isolate
 416 );
 417
 418 my %joining_types =
 419 (
 420    "U" => 0,           # Non_Joining
 421    "L" => 1,           # Left_Joining
 422    "R" => 2,           # Right_Joining
 423    "D" => 3,           # Dual_Joining
 424    "C" => 3,           # Join_Causing
 425    "ALAPH" => 4,       # Syriac ALAPH
 426    "DALATH RISH" => 5, # Syriac DALATH RISH group
 427    "T" => 6,           # Transparent
 428 );
 429
 430 my @cp2uni = ();
 431 my @glyph2uni = ();
 432 my @lead_bytes = ();
 433 my @uni2cp = ();
 434 my @tolower_table = ();
 435 my @toupper_table = ();
 436 my @digitmap_table = ();
 437 my @category_table = ();
 438 my @initial_joining_table = ();
 439 my @direction_table = ();
 440 my @decomp_table = ();
 441 my @combining_class_table = ();
 442 my @decomp_compat_table = ();
 443 my @comp_exclusions = ();
 444 my @idna_decomp_table = ();
 445 my @idna_disallowed = ();
 446 my %registry_keys;
 447 my $default_char;
 448 my $default_wchar;
 449
 450 my %joining_forms =
 451 (
 452    "isolated" => [],
 453    "final" => [],
 454    "initial" => [],
 455    "medial" => []
 456 );
 457
 458 sub to_utf16(@)
 459 {
 460     my @ret;
 461     foreach my $ch (@_)
 462     {
 463         if ($ch < 0x10000)
 464         {
 465             push @ret, $ch;
 466         }
 467         else
 468         {
 469             my $val = $ch - 0x10000;
 470             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 471         }
 472     }
 473     return @ret;
 474 }
 475
 476 ################################################################
 477 # fetch a unicode.org file and open it
 478 sub open_data_file($$)
 479 {
 480     my ($base, $name) = @_;
 481     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 482     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 483     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 484     local *FILE;
 485
 486     if ($base =~ /.*\/([^\/]+)\.zip$/)
 487     {
 488         my $zip = "$1$suffix.zip";
 489         unless (-f "$cache/$zip")
 490         {
 491             system "mkdir", "-p", $cache;
 492             print "Fetching $base...\n";
 493             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 494         }
 495         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 496     }
 497     else
 498     {
 499         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 500         unless (-f $dest)
 501         {
 502             system "mkdir", "-p", $dir;
 503             print "Fetching $base/$name...\n";
 504             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 505         }
 506         open FILE, "<$dest" or die "cannot open $dest";
 507     }
 508     return *FILE;
 509 }
 510
 511 ################################################################
 512 # recursively get the decomposition for a character
 513 sub get_decomposition($$);
 514 sub get_decomposition($$)
 515 {
 516     my ($char, $table) = @_;
 517     my @ret;
 518
 519     return $char unless defined $table->[$char];
 520     foreach my $ch (@{$table->[$char]})
 521     {
 522         push @ret, get_decomposition( $ch, $table );
 523     }
 524     return @ret;
 525 }
 526
 527 ################################################################
 528 # get the composition that results in a given character
 529 sub get_composition($$)
 530 {
 531     my ($ch, $compat) = @_;
 532     return () unless defined $decomp_table[$ch];  # no decomposition
 533     my @ret = @{$decomp_table[$ch]};
 534     return () if @ret < 2;                        # singleton decomposition
 535     return () if $comp_exclusions[$ch];           # composition exclusion
 536     return () if $combining_class_table[$ch];     # non-starter
 537     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 538     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 539         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 540     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 541         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 542     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 543         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 544     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 545     return @ret;
 546 }
 547
 548 ################################################################
 549 # recursively build decompositions
 550 sub build_decompositions(@)
 551 {
 552     my @src = @_;
 553     my @dst;
 554
 555     for (my $i = 0; $i < @src; $i++)
 556     {
 557         next unless defined $src[$i];
 558         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 559         $dst[$i] = \@decomp;
 560     }
 561     return @dst;
 562 }
 563
 564 ################################################################
 565 # compose Hangul sequences
 566 sub compose_hangul(@)
 567 {
 568     my $SBASE  = 0xac00;
 569     my $LBASE  = 0x1100;
 570     my $VBASE  = 0x1161;
 571     my $TBASE  = 0x11a7;
 572     my $LCOUNT = 19;
 573     my $VCOUNT = 21;
 574     my $TCOUNT = 28;
 575     my $NCOUNT = $VCOUNT * $TCOUNT;
 576     my $SCOUNT = $LCOUNT * $NCOUNT;
 577
 578     my @seq = @_;
 579     my @ret;
 580     my $i;
 581
 582     for ($i = 0; $i < @seq; $i++)
 583     {
 584         my $ch = $seq[$i];
 585         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 586             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 587         {
 588             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 589             $i++;
 590         }
 591         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 592             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 593         {
 594             $ch += $seq[$i+1] - $TBASE;
 595             $i++;
 596         }
 597         push @ret, $ch;
 598     }
 599     return @ret;
 600 }
 601
 602 ################################################################
 603 # remove linguistic-only mappings from the case table
 604 sub remove_linguistic_mappings($$)
 605 {
 606     my ($upper, $lower) = @_;
 607
 608     # remove case mappings that don't round-trip
 609
 610     for (my $i = 0; $i < @{$upper}; $i++)
 611     {
 612         next unless defined ${$upper}[$i];
 613         my $ch = ${$upper}[$i];
 614         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 615     }
 616     for (my $i = 0; $i < @{$lower}; $i++)
 617     {
 618         next unless defined ${$lower}[$i];
 619         my $ch = ${$lower}[$i];
 620         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 621     }
 622 }
 623
 624 ################################################################
 625 # read in the Unicode database files
 626 sub load_data()
 627 {
 628     my $start;
 629
 630     # now build mappings from the decomposition field of the Unicode database
 631
 632     my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
 633     while (<$UNICODE_DATA>)
 634     {
 635         # Decode the fields ...
 636         my ($code, $name, $cat, $comb, $bidi,
 637             $decomp, $dec, $dig, $num, $mirror,
 638             $oldname, $comment, $upper, $lower, $title) = split /;/;
 639         my $src = hex $code;
 640
 641         die "unknown category $cat" unless defined $categories{$cat};
 642         die "unknown directionality $bidi" unless defined $directions{$bidi};
 643
 644         $category_table[$src] = $categories{$cat};
 645         $direction_table[$src] = $bidi;
 646         if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
 647         {
 648             $initial_joining_table[$src] = $joining_types{"T"};
 649         }
 650         else
 651         {
 652             $initial_joining_table[$src] = $joining_types{"U"};
 653         }
 654
 655         if ($lower ne "")
 656         {
 657             $tolower_table[$src] = hex $lower;
 658         }
 659         if ($upper ne "")
 660         {
 661             $toupper_table[$src] = hex $upper;
 662         }
 663         if ($dec ne "")
 664         {
 665             $category_table[$src] |= $ctype{"digit"};
 666         }
 667         if ($dig ne "")
 668         {
 669             $digitmap_table[$src] = ord $dig;
 670         }
 671         $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
 672
 673         $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
 674         $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
 675         $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
 676         $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
 677         $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
 678         $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
 679         $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
 680         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
 681         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
 682         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
 683         $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
 684         $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;
 685
 686         # copy the category and direction for everything between First/Last pairs
 687         if ($name =~ /, First>/) { $start = $src; }
 688         if ($name =~ /, Last>/)
 689         {
 690             while ($start < $src)
 691             {
 692                 $category_table[$start] = $category_table[$src];
 693                 $direction_table[$start] = $direction_table[$src];
 694                 $combining_class_table[$start] = $combining_class_table[$src];
 695                 $start++;
 696             }
 697         }
 698
 699         next if $decomp eq "";  # no decomposition, skip it
 700
 701         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
 702         {
 703             my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
 704             $decomp_compat_table[$src] = \@seq;
 705         }
 706
 707         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 708         {
 709             # decomposition of the form "<foo> 1234" -> use char if type is known
 710             if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
 711             {
 712                 ${joining_forms{$1}}[hex $2] = $src;
 713             }
 714         }
 715         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 716         {
 717             # decomposition "<compat> 0020 1234" -> combining accent
 718         }
 719         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 720         {
 721             # store decomposition
 722             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 723             {
 724                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
 725             }
 726             elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
 727             {
 728                 # Single char decomposition
 729                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
 730             }
 731         }
 732     }
 733     close $UNICODE_DATA;
 734
 735     # patch the category of some special characters
 736
 737     for (my $i = 0; $i < @decomp_table; $i++)
 738     {
 739         next unless defined $decomp_table[$i];
 740         $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
 741     }
 742     foreach my $cat (keys %special_categories)
 743     {
 744         my $flag = $ctype{$cat};
 745         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 746     }
 747     for (my $i = 0; $i < @decomp_compat_table; $i++)
 748     {
 749         next unless defined $decomp_compat_table[$i];
 750         next unless @{$decomp_compat_table[$i]} == 2;
 751         $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
 752     }
 753
 754     # load the composition exclusions
 755
 756     my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
 757     while (<$EXCL>)
 758     {
 759         s/\#.*//;  # remove comments
 760         if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
 761         {
 762             foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
 763         }
 764         elsif (/^([0-9a-fA-F]+)\s*$/)
 765         {
 766             $comp_exclusions[hex $1] = 1;
 767         }
 768     }
 769     close $EXCL;
 770
 771     # load the IDNA mappings
 772
 773     @idna_decomp_table = @decomp_compat_table;
 774     my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
 775     while (<$IDNA>)
 776     {
 777         s/\#.*//;  # remove comments
 778         next if /^\s*$/;
 779         my ($char, $type, $mapping) = split /;/;
 780         my ($ch1, $ch2);
 781         if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
 782         {
 783             $ch1 = hex $1;
 784             $ch2 = hex $2;
 785         }
 786         elsif ($char =~ /([0-9a-fA-F]+)/)
 787         {
 788             $ch1 = $ch2 = hex $1;
 789         }
 790
 791         if ($type =~ /mapped/ || $type =~ /deviation/)
 792         {
 793             $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
 794             my @seq = map { hex $_; } split /\s+/, $mapping;
 795             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
 796         }
 797         elsif ($type =~ /valid/)
 798         {
 799         }
 800         elsif ($type =~ /ignored/)
 801         {
 802             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
 803         }
 804         elsif ($type =~ /disallowed/)
 805         {
 806             foreach my $i ($ch1 .. $ch2)
 807             {
 808                 $idna_decomp_table[$i] = undef;
 809                 $idna_disallowed[$i] = 1;
 810             }
 811         }
 812     }
 813     close $IDNA;
 814 }
 815
 816
 817 ################################################################
 818 # add a new registry key
 819 sub add_registry_key($$)
 820 {
 821     my ($key, $defval) = @_;
 822     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 823 }
 824
 825 ################################################################
 826 # add a new registry value
 827 sub add_registry_value($$$)
 828 {
 829     my ($key, $name, $value) = @_;
 830     add_registry_key( $key, undef );
 831     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 832 }
 833
 834 ################################################################
 835 # define a new lead byte
 836 sub add_lead_byte($)
 837 {
 838     my $ch = shift;
 839     return if defined $cp2uni[$ch];
 840     push @lead_bytes, $ch;
 841     $cp2uni[$ch] = 0;
 842 }
 843
 844 ################################################################
 845 # define a new char mapping
 846 sub add_mapping($$)
 847 {
 848     my ($cp, $uni) = @_;
 849     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 850     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 851     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 852 }
 853
 854 ################################################################
 855 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 856 sub get_glyphs_mapping(@)
 857 {
 858     my @table = @_;
 859
 860     for (my $i = 0; $i < @glyph2uni; $i++)
 861     {
 862         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 863     }
 864     return @table;
 865 }
 866
 867 ################################################################
 868 # build EUC-JP table from the JIS 0208/0212 files
 869 sub dump_eucjp_codepage()
 870 {
 871     @cp2uni = ();
 872     @glyph2uni = ();
 873     @lead_bytes = ();
 874     @uni2cp = ();
 875     $default_char = $DEF_CHAR;
 876     $default_wchar = 0x30fb;
 877
 878     # ASCII chars
 879     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 880
 881     # lead bytes
 882     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 883
 884     # JIS X 0201 right plane
 885     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 886
 887     # undefined chars
 888     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 889     $cp2uni[0xa0] = 0xf8f0;
 890     $cp2uni[0xff] = 0xf8f3;
 891
 892     # Fix backslash conversion
 893     add_mapping( 0xa1c0, 0xff3c );
 894
 895     # Add private mappings for rows undefined in JIS 0208/0212
 896     my $private = 0xe000;
 897     foreach my $hi (0xf5 .. 0xfe)
 898     {
 899         foreach my $lo (0xa1 .. 0xfe)
 900         {
 901             add_mapping( ($hi << 8) + $lo, $private++ );
 902         }
 903     }
 904     foreach my $hi (0xf5 .. 0xfe)
 905     {
 906         foreach my $lo (0x21 .. 0x7e)
 907         {
 908             add_mapping( ($hi << 8) + $lo, $private++ );
 909         }
 910     }
 911
 912     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 913     while (<$INPUT>)
 914     {
 915         next if /^\#/;  # skip comments
 916         next if /^$/;  # skip empty lines
 917         next if /\x1a/;  # skip ^Z
 918         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 919         {
 920             add_mapping( 0x8080 + hex $1, hex $2 );
 921             next;
 922         }
 923         die "Unrecognized line $_\n";
 924     }
 925     close $INPUT;
 926
 927     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 928     while (<$INPUT>)
 929     {
 930         next if /^\#/;  # skip comments
 931         next if /^$/;  # skip empty lines
 932         next if /\x1a/;  # skip ^Z
 933         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 934         {
 935             add_mapping( 0x8000 + hex $1, hex $2 );
 936             next;
 937         }
 938         die "Unrecognized line $_\n";
 939     }
 940     close $INPUT;
 941
 942     output_codepage_file( 20932 );
 943 }
 944
 945
 946 ################################################################
 947 # build the sort keys table
 948 sub dump_sortkeys($)
 949 {
 950     my $filename = shift;
 951     my @sortkeys = ();
 952
 953     my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
 954     while (<$INPUT>)
 955     {
 956         next if /^\#/;  # skip comments
 957         next if /^$/;  # skip empty lines
 958         next if /\x1a/;  # skip ^Z
 959         next if /^\@version/;  # skip @version header
 960         if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 961         {
 962             my ($uni,$variable) = (hex $1, $2);
 963             next if $uni > 65535;
 964             $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
 965             next;
 966         }
 967         if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 968         {
 969             # multiple character sequence, ignored for now
 970             next;
 971         }
 972         die "$SORTKEYS: Unrecognized line $_\n";
 973     }
 974     close $INPUT;
 975
 976     # compress the keys to 32 bit:
 977     # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
 978
 979     @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
 980                        ${$a}[2] <=> ${$b}[2] or
 981                        ${$a}[3] <=> ${$b}[3] or
 982                        ${$a}[4] <=> ${$b}[4] or
 983                        $a cmp $b; } @sortkeys;
 984
 985     my ($n2, $n3) = (1, 1);
 986     my @keys = (-1, -1, -1, -1, -1 );
 987     my @flatkeys = ();
 988
 989     for (my $i = 0; $i < @sortkeys; $i++)
 990     {
 991         next unless defined $sortkeys[$i];
 992         my @current = @{$sortkeys[$i]};
 993         if ($current[1] == $keys[1])
 994         {
 995             if ($current[2] == $keys[2])
 996             {
 997                 if ($current[3] == $keys[3])
 998                 {
 999                     # nothing
1000                 }
1001                 else
1002                 {
1003                     $keys[3] = $current[3];
1004                     $n3++;
1005                     die if ($n3 >= 16);
1006                 }
1007             }
1008             else
1009             {
1010                 $keys[2] = $current[2];
1011                 $keys[3] = $current[3];
1012                 $n2++;
1013                 $n3 = 1;
1014                 die if ($n2 >= 256);
1015             }
1016         }
1017         else
1018         {
1019             $keys[1] = $current[1];
1020             $keys[2] = $current[2];
1021             $keys[3] = $current[3];
1022             $n2 = 1;
1023             $n3 = 1;
1024         }
1025
1026         if ($current[2]) { $current[2] = $n2; }
1027         if ($current[3]) { $current[3] = $n3; }
1028         if ($current[4]) { $current[4] = 1; }
1029
1030         $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1031     }
1032
1033     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1034     printf "Building $filename\n";
1035     printf OUTPUT "/* Unicode collation element table */\n";
1036     printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1037     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1038     print OUTPUT "#include \"windef.h\"\n\n";
1039
1040     dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1041
1042     close OUTPUT;
1043     save_file($filename);
1044 }
1045
1046
1047 ################################################################
1048 # dump an array of integers
1049 sub dump_array($$@)
1050 {
1051     my ($bit_width, $default, @array) = @_;
1052     my $format = sprintf "0x%%0%ux", $bit_width / 4;
1053     my $i;
1054     my $ret = "    ";
1055     for ($i = 0; $i < $#array; $i++)
1056     {
1057         $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1058         $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
1059     }
1060     $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1061     return $ret;
1062 }
1063
1064
1065 ################################################################
1066 # dump an SBCS mapping table in binary format
1067 sub dump_binary_sbcs_table($)
1068 {
1069     my $codepage = shift;
1070
1071     my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1072     my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1073
1074     print OUTPUT pack "S<*", @header;
1075     print OUTPUT pack "C12", (0) x 12;
1076     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1077
1078     if (@glyph2uni)
1079     {
1080         print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1081     }
1082     else
1083     {
1084         print OUTPUT pack "S<*", 0;
1085     }
1086
1087     print OUTPUT pack "S<*", 0, 0;
1088
1089     print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1090 }
1091
1092
1093 ################################################################
1094 # dump a DBCS mapping table in binary format
1095 sub dump_binary_dbcs_table($)
1096 {
1097     my $codepage = shift;
1098     my @lb_ranges = get_lb_ranges();
1099     my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1100
1101     my @offsets = (0) x 256;
1102     my $pos = 0;
1103     foreach my $i (@lead_bytes)
1104     {
1105         $offsets[$i] = ($pos += 256);
1106         $cp2uni[$i] = 0;
1107     }
1108
1109     my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1110
1111     print OUTPUT pack "S<*", @header;
1112     print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1113     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1114     print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1115
1116     foreach my $i (@lead_bytes)
1117     {
1118         my $base = $i << 8;
1119         print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1120     }
1121
1122     print OUTPUT pack "S<", 4;
1123     print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1124 }
1125
1126
1127 ################################################################
1128 # get the list of defined lead byte ranges
1129 sub get_lb_ranges()
1130 {
1131     my @list = ();
1132     my @ranges = ();
1133
1134     foreach my $i (@lead_bytes) { $list[$i] = 1; }
1135     my $on = 0;
1136     for (my $i = 0; $i < 256; $i++)
1137     {
1138         if ($on)
1139         {
1140             if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1141         }
1142         else
1143         {
1144             if ($list[$i]) { push @ranges, $i; $on = 1; }
1145         }
1146     }
1147     if ($on) { push @ranges, 0xff; }
1148     return @ranges;
1149 }
1150
1151 ################################################################
1152 # dump the Indic Syllabic Category table
1153 sub dump_indic($)
1154 {
1155     my $filename = shift;
1156     my @indic_table;
1157
1158     my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1159     while (<$INPUT>)
1160     {
1161         next if /^\#/;  # skip comments
1162         next if /^\s*$/;  # skip empty lines
1163         next if /\x1a/;  # skip ^Z
1164         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1165         {
1166             my $type = $2;
1167             die "unknown indic $type" unless defined $indic_types{$type};
1168             if (hex $1 < 65536)
1169             {
1170                 $indic_table[hex $1] = $indic_types{$type};
1171             }
1172             next;
1173         }
1174         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1175         {
1176             my $type = $3;
1177             die "unknown indic $type" unless defined $indic_types{$type};
1178             if (hex $1 < 65536 and hex $2 < 65536)
1179             {
1180                 foreach my $i (hex $1 .. hex $2)
1181                 {
1182                     $indic_table[$i] = $indic_types{$type};
1183                 }
1184             }
1185             next;
1186         }
1187         die "malformed line $_";
1188     }
1189     close $INPUT;
1190
1191     $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1192     while (<$INPUT>)
1193     {
1194         next if /^\#/;  # skip comments
1195         next if /^\s*$/;  # skip empty lines
1196         next if /\x1a/;  # skip ^Z
1197         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1198         {
1199             my $type = $2;
1200             die "unknown matra $type" unless defined $matra_types{$type};
1201             $indic_table[hex $1] |= $matra_types{$type} << 8;
1202             next;
1203         }
1204         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1205         {
1206             my $type = $3;
1207             die "unknown matra $type" unless defined $matra_types{$type};
1208             foreach my $i (hex $1 .. hex $2)
1209             {
1210                 $indic_table[$i] |= $matra_types{$type} << 8;
1211             }
1212             next;
1213         }
1214         die "malformed line $_";
1215     }
1216     close $INPUT;
1217
1218     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1219     print "Building $filename\n";
1220     print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1221     print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1222     print OUTPUT "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n";
1223     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1224     print OUTPUT "#include \"windef.h\"\n\n";
1225
1226     dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1227
1228     close OUTPUT;
1229     save_file($filename);
1230 }
1231
1232 ################################################################
1233 # dump the Line Break Properties table
1234 sub dump_linebreak($)
1235 {
1236     my $filename = shift;
1237     my @break_table;
1238
1239     my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1240     while (<$INPUT>)
1241     {
1242         next if /^\#/;  # skip comments
1243         next if /^\s*$/;  # skip empty lines
1244         next if /\x1a/;  # skip ^Z
1245         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1246         {
1247             my $type = $2;
1248             die "unknown breaktype $type" unless defined $break_types{$type};
1249             $break_table[hex $1] = $break_types{$type};
1250             next;
1251         }
1252         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1253         {
1254             my $type = $3;
1255             die "unknown breaktype $type" unless defined $break_types{$type};
1256             foreach my $i (hex $1 .. hex $2)
1257             {
1258                 $break_table[$i] = $break_types{$type};
1259             }
1260             next;
1261         }
1262         elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1263         {
1264             my $type = $2;
1265             die "unknown breaktype $type" unless defined $break_types{$type};
1266             $break_table[hex $1] = $break_types{$type};
1267             next;
1268         }
1269         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1270         {
1271             my $type = $3;
1272             die "unknown breaktype $type" unless defined $break_types{$type};
1273             foreach my $i (hex $1 .. hex $2)
1274             {
1275                 $break_table[$i] = $break_types{$type};
1276             }
1277             next;
1278         }
1279         die "malformed line $_";
1280     }
1281     close $INPUT;
1282
1283     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1284     print "Building $filename\n";
1285     print OUTPUT "/* Unicode Line Break Properties */\n";
1286     print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1287     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1288     print OUTPUT "#include \"windef.h\"\n\n";
1289
1290     dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1291
1292     close OUTPUT;
1293     save_file($filename);
1294 }
1295
1296 my %scripts =
1297 (
1298     "Unknown"                => 0,
1299     "Common"                 => 1,
1300     "Inherited"              => 2,
1301     "Arabic"                 => 3,
1302     "Armenian"               => 4,
1303     "Avestan"                => 5,
1304     "Balinese"               => 6,
1305     "Bamum"                  => 7,
1306     "Batak"                  => 8,
1307     "Bengali"                => 9,
1308     "Bopomofo"               => 10,
1309     "Brahmi"                 => 11,
1310     "Braille"                => 12,
1311     "Buginese"               => 13,
1312     "Buhid"                  => 14,
1313     "Canadian_Aboriginal"    => 15,
1314     "Carian"                 => 16,
1315     "Cham"                   => 17,
1316     "Cherokee"               => 18,
1317     "Coptic"                 => 19,
1318     "Cuneiform"              => 20,
1319     "Cypriot"                => 21,
1320     "Cyrillic"               => 22,
1321     "Deseret"                => 23,
1322     "Devanagari"             => 24,
1323     "Egyptian_Hieroglyphs"   => 25,
1324     "Ethiopic"               => 26,
1325     "Georgian"               => 27,
1326     "Glagolitic"             => 28,
1327     "Gothic"                 => 29,
1328     "Greek"                  => 30,
1329     "Gujarati"               => 31,
1330     "Gurmukhi"               => 32,
1331     "Han"                    => 33,
1332     "Hangul"                 => 34,
1333     "Hanunoo"                => 35,
1334     "Hebrew"                 => 36,
1335     "Hiragana"               => 37,
1336     "Imperial_Aramaic"       => 38,
1337     "Inscriptional_Pahlavi"  => 39,
1338     "Inscriptional_Parthian" => 40,
1339     "Javanese"               => 41,
1340     "Kaithi"                 => 42,
1341     "Kannada"                => 43,
1342     "Katakana"               => 44,
1343     "Kayah_Li"               => 45,
1344     "Kharoshthi"             => 46,
1345     "Khmer"                  => 47,
1346     "Lao"                    => 48,
1347     "Latin"                  => 49,
1348     "Lepcha"                 => 50,
1349     "Limbu"                  => 51,
1350     "Linear_B"               => 52,
1351     "Lisu"                   => 53,
1352     "Lycian"                 => 54,
1353     "Lydian"                 => 55,
1354     "Malayalam"              => 56,
1355     "Mandaic"                => 57,
1356     "Meetei_Mayek"           => 58,
1357     "Mongolian"              => 59,
1358     "Myanmar"                => 60,
1359     "New_Tai_Lue"            => 61,
1360     "Nko"                    => 62,
1361     "Ogham"                  => 63,
1362     "Ol_Chiki"               => 64,
1363     "Old_Italic"             => 65,
1364     "Old_Persian"            => 66,
1365     "Old_South_Arabian"      => 67,
1366     "Old_Turkic"             => 68,
1367     "Oriya"                  => 69,
1368     "Osmanya"                => 70,
1369     "Phags_Pa"               => 71,
1370     "Phoenician"             => 72,
1371     "Rejang"                 => 73,
1372     "Runic"                  => 74,
1373     "Samaritan"              => 75,
1374     "Saurashtra"             => 76,
1375     "Shavian"                => 77,
1376     "Sinhala"                => 78,
1377     "Sundanese"              => 79,
1378     "Syloti_Nagri"           => 80,
1379     "Syriac"                 => 81,
1380     "Tagalog"                => 82,
1381     "Tagbanwa"               => 83,
1382     "Tai_Le"                 => 84,
1383     "Tai_Tham"               => 85,
1384     "Tai_Viet"               => 86,
1385     "Tamil"                  => 87,
1386     "Telugu"                 => 88,
1387     "Thaana"                 => 89,
1388     "Thai"                   => 90,
1389     "Tibetan"                => 91,
1390     "Tifinagh"               => 92,
1391     "Ugaritic"               => 93,
1392     "Vai"                    => 94,
1393     "Yi"                     => 95,
1394     # Win8/Win8.1
1395     "Chakma"                 => 96,
1396     "Meroitic_Cursive"       => 97,
1397     "Meroitic_Hieroglyphs"   => 98,
1398     "Miao"                   => 99,
1399     "Sharada"                => 100,
1400     "Sora_Sompeng"           => 101,
1401     "Takri"                  => 102,
1402     # Win10
1403     "Bassa_Vah"              => 103,
1404     "Caucasian_Albanian"     => 104,
1405     "Duployan"               => 105,
1406     "Elbasan"                => 106,
1407     "Grantha"                => 107,
1408     "Khojki"                 => 108,
1409     "Khudawadi"              => 109,
1410     "Linear_A"               => 110,
1411     "Mahajani"               => 111,
1412     "Manichaean"             => 112,
1413     "Mende_Kikakui"          => 113,
1414     "Modi"                   => 114,
1415     "Mro"                    => 115,
1416     "Nabataean"              => 116,
1417     "Old_North_Arabian"      => 117,
1418     "Old_Permic"             => 118,
1419     "Pahawh_Hmong"           => 119,
1420     "Palmyrene"              => 120,
1421     "Pau_Cin_Hau"            => 121,
1422     "Psalter_Pahlavi"        => 122,
1423     "Siddham"                => 123,
1424     "Tirhuta"                => 124,
1425     "Warang_Citi"            => 125,
1426     # Win10 RS1
1427     "Adlam"                  => 126,
1428     "Ahom"                   => 127,
1429     "Anatolian_Hieroglyphs"  => 128,
1430     "Bhaiksuki"              => 129,
1431     "Hatran"                 => 130,
1432     "Marchen"                => 131,
1433     "Multani"                => 132,
1434     "Newa"                   => 133,
1435     "Old_Hungarian"          => 134,
1436     "Osage"                  => 135,
1437     "SignWriting"            => 136,
1438     "Tangut"                 => 137,
1439     # Win10 RS4
1440     "Masaram_Gondi"          => 138,
1441     "Nushu"                  => 139,
1442     "Soyombo"                => 140,
1443     "Zanabazar_Square"       => 141,
1444     # Win10 1903
1445     "Dogra"                  => 142,
1446     "Gunjala_Gondi"          => 143,
1447     "Hanifi_Rohingya"        => 144,
1448     "Makasar"                => 145,
1449     "Medefaidrin"            => 146,
1450     "Old_Sogdian"            => 147,
1451     "Sogdian"                => 148,
1452     # Win10 2004
1453     "Elymaic"                => 149,
1454     "Nyiakeng_Puachue_Hmong" => 150,
1455     "Nandinagari"            => 151,
1456     "Wancho"                 => 152,
1457 );
1458
1459 ################################################################
1460 # dump Script IDs table
1461 sub dump_scripts($)
1462 {
1463     my $filename = shift;
1464     my $header = $filename;
1465     my @scripts_table;
1466     my $script_index;
1467     my $i;
1468
1469     my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1470     # Fill the table
1471     # Unknown script id is always 0, so undefined scripts are automatically treated as such
1472     while (<$INPUT>)
1473     {
1474         my $type = "";
1475
1476         next if /^\#/;  # skip comments
1477         next if /^\s*$/;  # skip empty lines
1478         next if /\x1a/;  # skip ^Z
1479         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1480         {
1481             $type = $2;
1482             if (defined $scripts{$type})
1483             {
1484                 $scripts_table[hex $1] = $scripts{$type};
1485             }
1486             next;
1487         }
1488         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1489         {
1490             $type = $3;
1491             if (defined $scripts{$type})
1492             {
1493                 foreach my $i (hex $1 .. hex $2)
1494                 {
1495                     $scripts_table[$i] = $scripts{$type};
1496                 }
1497             }
1498             next;
1499         }
1500     }
1501
1502     close $INPUT;
1503
1504     $header = "$filename.h";
1505     open OUTPUT,">$header.new" or die "Cannot create $header";
1506     print "Building $header\n";
1507     print OUTPUT "/* Unicode Script IDs */\n";
1508     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1509     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1510
1511     print OUTPUT "enum unicode_script_id {\n";
1512     foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1513     {
1514         print OUTPUT "    Script_$script = $scripts{$script},\n";
1515     }
1516     print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1517     print OUTPUT "};\n";
1518
1519     close OUTPUT;
1520     save_file($header);
1521
1522     $filename = "$filename.c";
1523     open OUTPUT,">$filename.new" or die "Cannot create $header";
1524     print "Building $filename\n";
1525     print OUTPUT "/* Unicode Script IDs */\n";
1526     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1527     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1528     print OUTPUT "#include \"windef.h\"\n\n";
1529
1530     dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1531     close OUTPUT;
1532     save_file($filename);
1533 }
1534
1535 ################################################################
1536 # dump the BiDi mirroring table
1537 sub dump_mirroring($)
1538 {
1539     my $filename = shift;
1540     my @mirror_table = ();
1541
1542     my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1543     while (<$INPUT>)
1544     {
1545         next if /^\#/;  # skip comments
1546         next if /^$/;  # skip empty lines
1547         next if /\x1a/;  # skip ^Z
1548         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1549         {
1550             $mirror_table[hex $1] = hex $2;
1551             next;
1552         }
1553         die "malformed line $_";
1554     }
1555     close $INPUT;
1556
1557     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1558     print "Building $filename\n";
1559     print OUTPUT "/* Unicode BiDi mirroring */\n";
1560     print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1561     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1562     print OUTPUT "#include \"windef.h\"\n\n";
1563     dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1564     close OUTPUT;
1565     save_file($filename);
1566 }
1567
1568 ################################################################
1569 # dump the Bidi Brackets
1570 sub dump_bracket($)
1571 {
1572     my $filename = shift;
1573     my @bracket_table;
1574
1575     my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1576     while (<$INPUT>)
1577     {
1578         next if /^\#/;  # skip comments
1579         next if /^\s*$/;  # skip empty lines
1580         next if /\x1a/;  # skip ^Z
1581         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1582         {
1583             my $type = $3;
1584             die "unknown bracket $type" unless defined $bracket_types{$type};
1585             die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1586             $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1587             $bracket_table[hex $1] += $bracket_types{$type} << 8;
1588             next;
1589         }
1590         die "malformed line $_";
1591     }
1592     close $INPUT;
1593
1594     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1595     print "Building $filename\n";
1596     print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1597     print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1598     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1599     print OUTPUT "#include \"windef.h\"\n\n";
1600
1601     dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1602
1603     close OUTPUT;
1604     save_file($filename);
1605 }
1606
1607 ################################################################
1608 # dump the Arabic shaping table
1609 sub dump_shaping($)
1610 {
1611     my $filename = shift;
1612     my @joining_table = @initial_joining_table;
1613
1614     my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1615     while (<$INPUT>)
1616     {
1617         next if /^\#/;  # skip comments
1618         next if /^\s*$/;  # skip empty lines
1619         next if /\x1a/;  # skip ^Z
1620         if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1621         {
1622             my $type = $2;
1623             $joining_table[hex $1] = $joining_types{$type};
1624             next;
1625         }
1626         die "malformed line $_";
1627     }
1628     close $INPUT;
1629
1630     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1631     print "Building $filename\n";
1632     print OUTPUT "/* Unicode Arabic shaping */\n";
1633     print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1634     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1635     print OUTPUT "#include \"windef.h\"\n\n";
1636
1637     dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1638
1639     print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1640     for (my $i = 0x600; $i <= 0x6ff; $i++)
1641     {
1642         printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1643             ${joining_forms{"isolated"}}[$i] || $i,
1644             ${joining_forms{"final"}}[$i] || $i,
1645             ${joining_forms{"initial"}}[$i] || $i,
1646             ${joining_forms{"medial"}}[$i] || $i;
1647     }
1648     print OUTPUT "};\n";
1649
1650     close OUTPUT;
1651     save_file($filename);
1652 }
1653
1654 ################################################################
1655 # dump the Arabic shaping table
1656 sub dump_arabic_shaping($)
1657 {
1658     my $filename = shift;
1659     my @joining_table = @initial_joining_table;
1660
1661     my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1662     while (<$INPUT>)
1663     {
1664         next if /^\#/;  # skip comments
1665         next if /^\s*$/;  # skip empty lines
1666         next if /\x1a/;  # skip ^Z
1667         if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1668         {
1669             my $type = $2;
1670             my $group = $3;
1671
1672             if ($group eq "ALAPH" || $group eq "DALATH RISH")
1673             {
1674                 $joining_table[hex $1] = $joining_types{$group};
1675             }
1676             else
1677             {
1678                 $joining_table[hex $1] = $joining_types{$type};
1679             }
1680
1681             next;
1682         }
1683         die "malformed line $_";
1684     }
1685     close $INPUT;
1686
1687     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1688     print "Building $filename\n";
1689     print OUTPUT "/* Unicode Arabic shaping */\n";
1690     print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1691     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1692     print OUTPUT "#include \"windef.h\"\n\n";
1693
1694     dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );
1695
1696     close OUTPUT;
1697     save_file($filename);
1698 }
1699
1700 ################################################################
1701 # dump the Vertical Orientation table
1702 sub dump_vertical($)
1703 {
1704     my $filename = shift;
1705     my @vertical_table;
1706
1707     my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1708     while (<$INPUT>)
1709     {
1710         next if /^\#/;  # skip comments
1711         next if /^\s*$/;  # skip empty lines
1712         next if /\x1a/;  # skip ^Z
1713         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1714         {
1715             my $type = $2;
1716             die "unknown vertical $type" unless defined $vertical_types{$type};
1717             if (hex $1 < 65536)
1718             {
1719                 $vertical_table[hex $1] = $vertical_types{$type};
1720             }
1721             next;
1722         }
1723         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1724         {
1725             my $type = $3;
1726             die "unknown vertical $type" unless defined $vertical_types{$type};
1727             foreach my $i (hex $1 .. hex $2)
1728             {
1729                 $vertical_table[$i] = $vertical_types{$type};
1730             }
1731             next;
1732         }
1733         die "malformed line $_";
1734     }
1735     close $INPUT;
1736
1737     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1738     print "Building $filename\n";
1739     print OUTPUT "/* Unicode Vertical Orientation */\n";
1740     print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1741     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1742     print OUTPUT "#include \"windef.h\"\n\n";
1743
1744     dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1745
1746     close OUTPUT;
1747     save_file($filename);
1748 }
1749
1750 ################################################################
1751 # dump the digit folding tables
1752 sub dump_digit_folding($)
1753 {
1754     my ($filename) = shift;
1755     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1756     print "Building $filename\n";
1757     print OUTPUT "/* Unicode digit folding mappings */\n";
1758     print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1759     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1760     print OUTPUT "#include \"windef.h\"\n\n";
1761
1762     dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1763     close OUTPUT;
1764     save_file($filename);
1765 }
1766
1767
1768 ################################################################
1769 # compress a mapping table by removing identical rows
1770 sub compress_array($$@)
1771 {
1772     my $rows = shift;
1773     my $def = shift;
1774     my @table = @_;
1775     my $len = @table / $rows;
1776     my @array;
1777     my $data = "";
1778
1779     # try to merge table rows
1780     for (my $row = 0; $row < $rows; $row++)
1781     {
1782         my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1783         my $pos = index $data, $rowtxt;
1784         if ($pos == -1)
1785         {
1786             # check if the tail of the data can match the start of the new row
1787             my $first = substr( $rowtxt, 0, 1 );
1788             for (my $i = length($data) - 1; $i > 0; $i--)
1789             {
1790                 $pos = index( substr( $data, -$i ), $first );
1791                 last if $pos == -1;
1792                 $i -= $pos;
1793                 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1794                 substr( $data, -$i ) = "";
1795                 last;
1796             }
1797             $pos = length $data;
1798             $data .= $rowtxt;
1799         }
1800         $array[$row] = $rows + $pos;
1801     }
1802     return @array, unpack "U*", $data;
1803 }
1804
1805 ################################################################
1806 # dump a char -> 16-bit value mapping table using two-level tables
1807 sub dump_two_level_mapping($$@)
1808 {
1809     my $name = shift;
1810     my $def = shift;
1811     my $size = shift;
1812     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1813     my @row_array = compress_array( 4096, $def, @_[0..65535] );
1814     my @array = compress_array( 256, 0, @row_array[0..4095] );
1815
1816     for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1817
1818     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1819     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1820     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1821     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1822 }
1823
1824 ################################################################
1825 # dump a char -> value mapping table using three-level tables
1826 sub dump_three_level_mapping($$@)
1827 {
1828     my $name = shift;
1829     my $def = shift;
1830     my $size = shift;
1831     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1832     my $level3 = ($MAX_CHAR + 1) / 16;
1833     my $level2 = $level3 / 16;
1834     my $level1 = $level2 / 16;
1835     my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1836     my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1837     my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1838
1839     for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1840     for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1841
1842     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1843     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1844     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1845     printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1846     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1847 }
1848
1849 ################################################################
1850 # dump a binary case mapping table in l_intl.nls format
1851 sub dump_binary_case_table(@)
1852 {
1853     my (@table) = @_;
1854     my $max_char = 0x10000;
1855     my $level1 = $max_char / 16;
1856     my $level2 = $level1 / 16;
1857
1858     my @difftable;
1859     for (my $i = 0; $i < @table; $i++)
1860     {
1861         next unless defined $table[$i];
1862         $difftable[$i] = ($table[$i] - $i) & 0xffff;
1863     }
1864
1865     my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1866     my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1867     my $offset = @array - $level1;
1868     for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1869     return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1870 }
1871
1872 ################################################################
1873 # dump case mappings for l_intl.nls
1874 sub dump_intl_nls($)
1875 {
1876     my @upper_table = @toupper_table;
1877     my @lower_table = @tolower_table;
1878     remove_linguistic_mappings( \@upper_table, \@lower_table );
1879
1880     my $upper = dump_binary_case_table( @upper_table );
1881     my $lower = dump_binary_case_table( @lower_table );
1882
1883     my $filename = shift;
1884     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1885     printf "Building $filename\n";
1886
1887     binmode OUTPUT;
1888     print OUTPUT pack "S<", 1;  # version
1889     print OUTPUT $upper;
1890     print OUTPUT $lower;
1891     close OUTPUT;
1892     save_file($filename);
1893 }
1894
1895
1896 ################################################################
1897 # dump the bidi direction table
1898 sub dump_bidi_dir_table($)
1899 {
1900     my $filename = shift;
1901     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1902     printf "Building $filename\n";
1903     printf OUTPUT "/* Unicode BiDi direction table */\n";
1904     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1905     printf OUTPUT "#include \"windef.h\"\n\n";
1906
1907     my @table;
1908
1909     for (my $i = 0; $i < 65536; $i++)
1910     {
1911         $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1912     }
1913
1914     dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1915
1916     close OUTPUT;
1917     save_file($filename);
1918 }
1919
1920
1921 sub rol($$)
1922 {
1923     my ($byte, $count) = @_;
1924     return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1925 }
1926
1927 ################################################################
1928 # compress the character properties table
1929 sub compress_char_props_table($@)
1930 {
1931     my $rows = shift;
1932     my @table = @_;
1933     my $len = @table / $rows;
1934     my $pos = 0;
1935     my @array = (0) x $rows;
1936     my %sequences;
1937
1938     # add some predefined sequences
1939     foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1940
1941     # try to merge table rows
1942     for (my $row = 0; $row < $rows; $row++)
1943     {
1944         my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1945         my $rowtxt = pack "L*", @table_row;
1946         if (defined($sequences{$rowtxt}))
1947         {
1948             # reuse an existing row
1949             $array[$row] = $sequences{$rowtxt};
1950         }
1951         else
1952         {
1953             # create a new row
1954             $sequences{$rowtxt} = $array[$row] = ++$pos;
1955             push @array, @table_row;
1956         }
1957     }
1958     return @array;
1959 }
1960
1961 ################################################################
1962 # dump a normalization table in binary format
1963 sub dump_norm_table($)
1964 {
1965     my $filename = shift;
1966
1967     my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
1968     my %decomp = ( "nfc" => \@decomp_table,
1969                    "nfd" => \@decomp_table,
1970                    "nfkc" => \@decomp_compat_table,
1971                    "nfkd" => \@decomp_compat_table ,
1972                    "idna" => \@idna_decomp_table );
1973
1974     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1975     print "Building $filename\n";
1976
1977     my $type = $filename;
1978     $type =~ s!.*/norm(\w+)\.nls!$1!;
1979
1980     my $compose = $forms{$type} & 1;
1981     my $compat = !!($forms{$type} & 4) + ($type eq "idna");
1982
1983     my @version = split /\./, $UNIVERSION;
1984
1985     # combining classes
1986
1987     my @classes;
1988     my @class_values;
1989
1990     foreach my $c (grep defined, @combining_class_table)
1991     {
1992         $classes[$c] = 1 if $c < 0x100;
1993     }
1994     for (my $i = 0; $i < @classes; $i++)
1995     {
1996         next unless defined $classes[$i];
1997         $classes[$i] = @class_values;
1998         push @class_values, $i;
1999     }
2000     push @class_values, 0 if (@class_values % 2);
2001     die "too many classes" if @class_values >= 0x40;
2002
2003     # character properties
2004
2005     my @char_props;
2006     my @decomposed;
2007     my @comp_hash_table;
2008     my $comp_hash_size = $compose ? 254 : 0;
2009
2010     for (my $i = 0; $i <= $MAX_CHAR; $i++)
2011     {
2012         next unless defined $combining_class_table[$i];
2013         if (defined $decomp{$type}->[$i])
2014         {
2015             my @dec = get_decomposition( $i, $decomp{$type} );
2016             if ($compose && (my @comp = get_composition( $i, $compat )))
2017             {
2018                 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2019                 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2020
2021                 my $val = 0;
2022                 foreach my $d (@dec)
2023                 {
2024                     $val = $combining_class_table[$d];
2025                     last if $val;
2026                 }
2027                 $char_props[$i] = $classes[$val];
2028             }
2029             else
2030             {
2031                 $char_props[$i] = 0xbf;
2032             }
2033             @dec = compose_hangul( @dec ) if $compose;
2034             @dec = to_utf16( @dec );
2035             push @dec, 0 if @dec >= 7;
2036             $decomposed[$i] = \@dec;
2037         }
2038         else
2039         {
2040             if ($combining_class_table[$i] == 0x100)
2041             {
2042                 $char_props[$i] = 0x7f;
2043             }
2044             elsif ($combining_class_table[$i])
2045             {
2046                 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2047             }
2048             elsif ($type eq "idna" && defined $idna_disallowed[$i])
2049             {
2050                 $char_props[$i] = 0xff;
2051             }
2052             else
2053             {
2054                 $char_props[$i] = 0;
2055             }
2056         }
2057     }
2058
2059     if ($compose)
2060     {
2061         for (my $i = 0; $i <= $MAX_CHAR; $i++)
2062         {
2063             my @comp = get_composition( $i, $compat );
2064             next unless @comp;
2065             if ($combining_class_table[$comp[1]])
2066             {
2067                 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2068                 $char_props[$comp[1]] |= 0x40;
2069             }
2070             else
2071             {
2072                 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2073                 $char_props[$comp[1]] |= 0xc0;
2074             }
2075         }
2076     }
2077
2078     # surrogates
2079     foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2080     foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2081
2082     # Hangul
2083     if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2084     elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2085     foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2086
2087     # invalid chars
2088     if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2089     foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2090     foreach my $i (0x00..0x10)
2091     {
2092         $char_props[($i << 16) | 0xfffe] = 0xff;
2093         $char_props[($i << 16) | 0xffff] = 0xff;
2094     }
2095
2096     # decomposition hash table
2097
2098     my @decomp_hash_table;
2099     my @decomp_hash_index;
2100     my @decomp_hash_data;
2101     my $decomp_hash_size = 944;
2102
2103     # build string of character data, reusing substrings when possible
2104     my $decomp_char_data = "";
2105     foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2106     {
2107         my $str = pack "U*", @{$i};
2108         $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2109     }
2110     for (my $i = 0; $i < @decomposed; $i++)
2111     {
2112         next unless defined $decomposed[$i];
2113         my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2114         die "sequence not found" if $pos == -1;
2115         my $len = @{$decomposed[$i]};
2116         $len = 7 if $len > 7;
2117         my $hash = $i % $decomp_hash_size;
2118         push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2119     }
2120     for (my $i = 0; $i < $decomp_hash_size; $i++)
2121     {
2122         $decomp_hash_index[$i] = @decomp_hash_data / 2;
2123         next unless defined $decomp_hash_table[$i];
2124         if (@{$decomp_hash_table[$i]} == 1)
2125         {
2126             my $entry = $decomp_hash_table[$i]->[0];
2127             if ($char_props[$entry->[0]] == 0xbf)
2128             {
2129                 $decomp_hash_index[$i] = $entry->[1];
2130                 next;
2131             }
2132         }
2133         foreach my $entry (@{$decomp_hash_table[$i]})
2134         {
2135             push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2136         }
2137     }
2138     push @decomp_hash_data, 0, 0;
2139
2140     # composition hash table
2141
2142     my @comp_hash_index;
2143     my @comp_hash_data;
2144     if (@comp_hash_table)
2145     {
2146         for (my $i = 0; $i < $comp_hash_size; $i++)
2147         {
2148             $comp_hash_index[$i] = @comp_hash_data;
2149             push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2150         }
2151         $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2152         push @comp_hash_data, 0, 0, 0;
2153     }
2154
2155     my $level1 = ($MAX_CHAR + 1) / 128;
2156     my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2157
2158     my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2159                    0, $decomp_hash_size, $comp_hash_size, 0 );
2160     my @tables = (0) x 8;
2161
2162     $tables[0] = 16 + @header + @tables;
2163     $tables[1] = $tables[0] + @class_values / 2;
2164     $tables[2] = $tables[1] + $level1 / 2;
2165     $tables[3] = $tables[2] + (@rows - $level1) / 2;
2166     $tables[4] = $tables[3] + @decomp_hash_index;
2167     $tables[5] = $tables[4] + @decomp_hash_data;
2168     $tables[6] = $tables[5] + length $decomp_char_data;
2169     $tables[7] = $tables[6] + @comp_hash_index;
2170
2171     print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2172     print OUTPUT pack "S<*", @header;
2173     print OUTPUT pack "S<*", @tables;
2174     print OUTPUT pack "C*", @class_values;
2175
2176     print OUTPUT pack "C*", @rows[0..$level1-1];
2177     print OUTPUT pack "C*", @rows[$level1..$#rows];
2178     print OUTPUT pack "S<*", @decomp_hash_index;
2179     print OUTPUT pack "S<*", @decomp_hash_data;
2180     print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2181     print OUTPUT pack "S<*", @comp_hash_index;
2182     print OUTPUT pack "S<*", @comp_hash_data;
2183
2184     close OUTPUT;
2185     save_file($filename);
2186
2187     add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2188 }
2189
2190
2191 ################################################################
2192 # output a codepage definition file from the global tables
2193 sub output_codepage_file($)
2194 {
2195     my $codepage = shift;
2196
2197     my $output = sprintf "nls/c_%03d.nls", $codepage;
2198     open OUTPUT,">$output.new" or die "Cannot create $output";
2199
2200     printf "Building %s\n", $output;
2201     if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2202     else { dump_binary_dbcs_table( $codepage ); }
2203
2204     close OUTPUT;
2205     save_file($output);
2206
2207     add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2208 }
2209
2210 ################################################################
2211 # output a codepage table from a Microsoft-style mapping file
2212 sub dump_msdata_codepage($)
2213 {
2214     my $filename = shift;
2215
2216     my $state = "";
2217     my ($codepage, $width, $count);
2218     my ($lb_cur, $lb_end);
2219
2220     @cp2uni = ();
2221     @glyph2uni = ();
2222     @lead_bytes = ();
2223     @uni2cp = ();
2224     $default_char = $DEF_CHAR;
2225     $default_wchar = $DEF_CHAR;
2226
2227     my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2228
2229     while (<$INPUT>)
2230     {
2231         next if /^;/;  # skip comments
2232         next if /^\s*$/;  # skip empty lines
2233         next if /\x1a/;  # skip ^Z
2234         last if /^ENDCODEPAGE/;
2235
2236         if (/^CODEPAGE\s+(\d+)/)
2237         {
2238             $codepage = $1;
2239             next;
2240         }
2241         if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2242         {
2243             $width = $1;
2244             $default_char = hex $2;
2245             $default_wchar = hex $3;
2246             next;
2247         }
2248         if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2249         {
2250             $state = $1;
2251             $count = $2;
2252             next;
2253         }
2254         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2255         {
2256             if ($state eq "MBTABLE")
2257             {
2258                 my $cp = hex $1;
2259                 my $uni = hex $2;
2260                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2261                 next;
2262             }
2263             if ($state eq "GLYPHTABLE")
2264             {
2265                 my $cp = hex $1;
2266                 my $uni = hex $2;
2267                 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2268                 next;
2269             }
2270             if ($state eq "WCTABLE")
2271             {
2272                 my $uni = hex $1;
2273                 my $cp = hex $2;
2274                 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2275                 next;
2276             }
2277             if ($state eq "DBCSRANGE")
2278             {
2279                 my $start = hex $1;
2280                 my $end = hex $2;
2281                 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2282                 $lb_cur = $start;
2283                 $lb_end = $end;
2284                 next;
2285             }
2286             if ($state eq "DBCSTABLE")
2287             {
2288                 my $mb = hex $1;
2289                 my $uni = hex $2;
2290                 my $cp = ($lb_cur << 8) | $mb;
2291                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2292                 if (!--$count)
2293                 {
2294                     if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2295                 }
2296                 next;
2297             }
2298         }
2299         die "$filename: Unrecognized line $_\n";
2300     }
2301     close $INPUT;
2302
2303     output_codepage_file( $codepage );
2304 }
2305
2306 ################################################################
2307 # align a string length
2308 sub align_string($$)
2309 {
2310     my ($align, $str) = @_;
2311     $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2312     return $str;
2313 }
2314
2315 ################################################################
2316 # pack a GUID string
2317 sub pack_guid($)
2318 {
2319     $_ = shift;
2320     /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2321     return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2322 }
2323
2324 ################################################################
2325 # comparison function for compression sort
2326 sub cmp_compression
2327 {
2328     return scalar @{$a} <=> scalar @{$b} ||
2329         $a->[4] <=> $b->[4] ||
2330         $a->[5] <=> $b->[5] ||
2331         $a->[6] <=> $b->[6] ||
2332         $a->[7] <=> $b->[7] ||
2333         $a->[8] <=> $b->[8] ||
2334         $a->[9] <=> $b->[9] ||
2335         $a->[10] <=> $b->[10] ||
2336         $a->[11] <=> $b->[11] ||
2337         $a->[12] <=> $b->[12];
2338 }
2339
2340 ################################################################
2341 # build a binary sort keys table
2342 sub dump_sortkey_table($$)
2343 {
2344     my ($filename, $download) = @_;
2345
2346     my @keys;
2347     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2348     my @multiple_weights;
2349     my @expansions;
2350     my @compressions;
2351     my %exceptions;
2352     my %guids;
2353     my %compr_flags;
2354     my %locales;
2355     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2356     my $jamostr = "";
2357
2358     my $re_hex = '0x[0-9A-Fa-f]+';
2359     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2360     $guids{$default_guid} = { };
2361
2362     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2363
2364     my $KEYS = open_data_file( $MSDATA, $download );
2365
2366     printf "Building $filename\n";
2367
2368     while (<$KEYS>)
2369     {
2370         s/\s*;.*$//;
2371         next if /^\s*$/;  # skip empty lines
2372         if (/^\s*(SORTKEY|SORTTABLES)/)
2373         {
2374             $part = $1;
2375             next;
2376         }
2377         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2378         {
2379             $part = $section = "";
2380             next;
2381         }
2382         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2383         {
2384             $section = $1;
2385             $guid = undef;
2386             next;
2387         }
2388         next unless $part;
2389         if ("$part.$section" eq "SORTKEY.DEFAULT")
2390         {
2391             if (/^\s*($re_hex)\s+$re_key/)
2392             {
2393                 $keys[hex $1] = [ split(/\s+/,$2) ];
2394                 next;
2395             }
2396         }
2397         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2398         {
2399             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2400             {
2401                 $version = hex $1;
2402                 next;
2403             }
2404             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2405             {
2406                 # ignore for now
2407                 next;
2408             }
2409         }
2410         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2411                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2412                "$part.$section" eq "SORTTABLES.INVERSECASING")
2413         {
2414             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2415             {
2416                 $guid = lc $1;
2417                 $guids{$guid} = { } unless defined $guids{$guid};
2418                 $guids{$guid}->{flags} |= $flags{$section};
2419                 next;
2420             }
2421             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2422             {
2423                 $locales{$1} = $guid;
2424                 next;
2425             }
2426         }
2427         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2428         {
2429             if (/^\s*(\d+)\s+(\d+)/)
2430             {
2431                 push @multiple_weights, $1, $2;
2432                 next;
2433             }
2434         }
2435         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2436         {
2437             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2438             {
2439                 my $pos = scalar @expansions / 2;
2440                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2441                 push @expansions, hex $2, hex $3;
2442                 next;
2443             }
2444         }
2445         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2446         {
2447             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2448             {
2449                 $keys[hex $1] = $keys[hex $2];
2450                 next;
2451             }
2452         }
2453         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2454         {
2455             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2456             {
2457                 if ($subsection || !$guid)  # start a new one
2458                 {
2459                     $guid = lc $1;
2460                     $subsection = "";
2461                     $guids{$guid} = { } unless defined $guids{$guid};
2462                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2463                     $guids{$guid}->{compr} = @compressions;
2464                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2465                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2466                     push @compressions, [ ];
2467                 }
2468                 else  # merge with current one
2469                 {
2470                     $guids{lc $1} = { } unless defined $guids{lc $1};
2471                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2472                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2473                     $compr_flags{lc $1} = $compr_flags{$guid};
2474                 }
2475                 next;
2476             }
2477             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2478             {
2479                 $locales{$1} = $guid;
2480                 next;
2481             }
2482             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2483             {
2484                 $subsection = $1;
2485                 next;
2486             }
2487             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2488             {
2489                 my @comp = map { hex $_; } split(/\s+/,$1);
2490                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2491                 # add compression flags
2492                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2493                 next;
2494             }
2495         }
2496         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2497         {
2498             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2499             {
2500                 $guid = lc $1;
2501                 $guids{$guid} = { } unless defined $guids{lc $1};
2502                 $ling_flag = ($2 ? "+" : "-");
2503                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2504                 next;
2505             }
2506             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2507             {
2508                 $locales{$1} = $guid;
2509                 next;
2510             }
2511             if (/^\s*($re_hex)\s+$re_key/)
2512             {
2513                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2514                 next;
2515             }
2516         }
2517         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2518         {
2519             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2520             {
2521                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2522                 next;
2523             }
2524         }
2525         die "$download: $part.$section: unrecognized line $_\n";
2526     }
2527     close $KEYS;
2528
2529     # Sortkey table
2530
2531     my $table;
2532     for (my $i = 0; $i < 0x10000; $i++)
2533     {
2534         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2535         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2536     }
2537
2538     foreach my $id (sort keys %exceptions)
2539     {
2540         my $pos = length($table) / 4;
2541         my @exc = @{$exceptions{$id}};
2542         my @filled;
2543         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2544         my $guid = substr( $id, 0, -1 );
2545         $guids{$guid}->{$key} = $pos;
2546         $pos += 0x100;
2547         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2548         for (my $j = 0; $j < 0x10000; $j++)
2549         {
2550             next unless defined $exc[$j] || defined $flags[$j];
2551             $filled[$j >> 8] = 1;
2552             $j |= 0xff;
2553         }
2554         for (my $j = 0; $j < 0x100; $j++)
2555         {
2556             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2557             $pos += 0x100 if $filled[$j];
2558         }
2559         for (my $j = 0; $j < 0x10000; $j++)
2560         {
2561             next unless $filled[$j >> 8];
2562             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2563             $k[3] |= $flags[$j] || 0;
2564             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2565         }
2566     }
2567
2568     # Case mapping tables
2569
2570     # standard table
2571     my @casemaps;
2572     my @upper = @toupper_table;
2573     my @lower = @tolower_table;
2574     remove_linguistic_mappings( \@upper, \@lower );
2575     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2576
2577     # linguistic table
2578     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2579
2580     # Turkish table
2581     @upper = @toupper_table;
2582     @lower = @tolower_table;
2583     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2584     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2585     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2586     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2587
2588     # Char type table
2589
2590     my @table;
2591     my $types = "";
2592     my %typestr;
2593     for (my $i = 0; $i < 0x10000; $i++)
2594     {
2595         my $str = pack "S<3",
2596             ($category_table[$i] || 0) & 0xffff,
2597             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2598             ($category_table[$i] || 0) >> 16;
2599
2600         if (!defined($typestr{$str}))
2601         {
2602             $typestr{$str} = length($types) / 6;
2603             $types .= $str;
2604         }
2605         $table[$i] = $typestr{$str};
2606     }
2607
2608     my @rows = compress_array( 4096, 0, @table[0..65535] );
2609     my @array = compress_array( 256, 0, @rows[0..4095] );
2610     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2611     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2612
2613     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2614     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2615     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2616
2617     # Sort tables
2618
2619     # guids
2620     my $sorttables = pack "L<2", $version, scalar %guids;
2621     foreach my $id (sort keys %guids)
2622     {
2623         my %guid = %{$guids{$id}};
2624         my $flags = $guid{flags} || 0;
2625         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2626         $sorttables .= pack_guid($id) . pack "L<5",
2627             $flags,
2628             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2629             $guid{except} || 0,
2630             $guid{ling_except} || 0,
2631             $map / 2;
2632     }
2633
2634     # expansions
2635     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2636
2637     # compressions
2638     $sorttables .= pack "L<", scalar @compressions;
2639     my $rowstr = "";
2640     foreach my $c (@compressions)
2641     {
2642         my $pos = length($rowstr) / 2;
2643         my $min = 0xffff;
2644         my $max = 0;
2645         my @lengths = (0) x 8;
2646         foreach my $r (sort cmp_compression @{$c})
2647         {
2648             my @row = @{$r};
2649             $lengths[scalar @row - 6]++;
2650             foreach my $val (@row[4..$#row])
2651             {
2652                 $min = $val if $min > $val;
2653                 $max = $val if $max < $val;
2654             }
2655             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2656             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2657         }
2658         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2659     }
2660     $sorttables .= $rowstr;
2661
2662     # multiple weights
2663     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2664
2665     # jamo sort
2666     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2667
2668     # Locales
2669
2670     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2671     foreach my $loc (sort keys %locales)
2672     {
2673         # skip specific locales that match more general ones
2674         my @parts = split /[-_]/, $loc;
2675         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2676         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2677         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2678     }
2679
2680     # File header
2681
2682     my @header;
2683     $header[0] = 16;
2684     $header[1] = $header[0] + length $table;
2685     $header[2] = $header[1] + length $casemaps;
2686     $header[3] = $header[2] + length $chartypes;
2687
2688     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2689     print OUTPUT pack "L<*", @header;
2690     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2691     close OUTPUT;
2692     save_file($filename);
2693 }
2694
2695
2696 ################################################################
2697 # build the script to create registry keys
2698 sub dump_registry_script($%)
2699 {
2700     my ($filename, %keys) = @_;
2701     my $indent = 1;
2702
2703     printf "Building %s\n", $filename;
2704     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2705     print OUTPUT "HKLM\n{\n";
2706     foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2707     {
2708         printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2709         $indent++;
2710     }
2711     foreach my $k (sort keys %keys)
2712     {
2713         my @subkeys = split /\\/, $k;
2714         my ($def, @vals) = @{$keys{$k}};
2715         for (my $i = 0; $i < @subkeys; $i++)
2716         {
2717             printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2718                 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2719             $indent++;
2720         }
2721         foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2722         for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2723     }
2724     while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2725     close OUTPUT;
2726     save_file($filename);
2727 }
2728
2729
2730 ################################################################
2731 # save a file if modified
2732 sub save_file($)
2733 {
2734     my $file = shift;
2735     if (-f $file && !system "cmp $file $file.new >/dev/null")
2736     {
2737         unlink "$file.new";
2738     }
2739     else
2740     {
2741         rename "$file.new", "$file";
2742     }
2743 }
2744
2745
2746 ################################################################
2747 # main routine
2748
2749 chdir ".." if -f "./make_unicode";
2750 load_data();
2751 dump_sortkeys( "dlls/kernelbase/collation.c" );
2752 dump_bidi_dir_table( "dlls/gdi32/uniscribe/direction.c" );
2753 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2754 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2755 dump_mirroring( "dlls/gdi32/uniscribe/mirror.c" );
2756 dump_mirroring( "dlls/dwrite/mirror.c" );
2757 dump_bracket( "dlls/gdi32/uniscribe/bracket.c" );
2758 dump_bracket( "dlls/dwrite/bracket.c" );
2759 dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
2760 dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
2761 dump_linebreak( "dlls/gdi32/uniscribe/linebreak.c" );
2762 dump_linebreak( "dlls/dwrite/linebreak.c" );
2763 dump_scripts( "dlls/dwrite/scripts" );
2764 dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
2765 dump_vertical( "dlls/gdi32/vertical.c" );
2766 dump_vertical( "dlls/wineps.drv/vertical.c" );
2767 dump_intl_nls("nls/l_intl.nls");
2768 dump_norm_table( "nls/normnfc.nls" );
2769 dump_norm_table( "nls/normnfd.nls" );
2770 dump_norm_table( "nls/normnfkc.nls" );
2771 dump_norm_table( "nls/normnfkd.nls" );
2772 dump_norm_table( "nls/normidna.nls" );
2773 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2774 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2775 dump_eucjp_codepage();
2776 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2777
2778 exit 0;
2779
2780 # Local Variables:
2781 # compile-command: "./make_unicode"
2782 # End: