tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $REPORTS = "http://www.unicode.org/reports";
  30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  32
  33 # Sort keys file
  34 my $SORTKEYS = "tr10/allkeys.txt";
  35
  36 # Default char for undefined mappings
  37 my $DEF_CHAR = ord '?';
  38
  39 # Last valid Unicode character
  40 my $MAX_CHAR = 0x10ffff;
  41
  42 my @allfiles =
  43 (
  44     "CodpageFiles/037.txt",
  45     "CodpageFiles/437.txt",
  46     "CodpageFiles/500.txt",
  47     "CodpageFiles/737.txt",
  48     "CodpageFiles/775.txt",
  49     "CodpageFiles/850.txt",
  50     "CodpageFiles/852.txt",
  51     "CodpageFiles/855.txt",
  52     "CodpageFiles/857.txt",
  53     "CodpageFiles/860.txt",
  54     "CodpageFiles/861.txt",
  55     "CodpageFiles/862.txt",
  56     "CodpageFiles/863.txt",
  57     "CodpageFiles/864.txt",
  58     "CodpageFiles/865.txt",
  59     "CodpageFiles/866.txt",
  60     "CodpageFiles/869.txt",
  61     "CodpageFiles/874.txt",
  62     "CodpageFiles/875.txt",
  63     "CodpageFiles/932.txt",
  64     "CodpageFiles/936.txt",
  65     "CodpageFiles/949.txt",
  66     "CodpageFiles/950.txt",
  67     "CodpageFiles/1026.txt",
  68     "CodpageFiles/1250.txt",
  69     "CodpageFiles/1251.txt",
  70     "CodpageFiles/1252.txt",
  71     "CodpageFiles/1253.txt",
  72     "CodpageFiles/1254.txt",
  73     "CodpageFiles/1255.txt",
  74     "CodpageFiles/1256.txt",
  75     "CodpageFiles/1257.txt",
  76     "CodpageFiles/1258.txt",
  77     "CodpageFiles/1361.txt",
  78     "CodpageFiles/10000.txt",
  79     "CodpageFiles/10001.txt",
  80     "CodpageFiles/10002.txt",
  81     "CodpageFiles/10003.txt",
  82     "CodpageFiles/10004.txt",
  83     "CodpageFiles/10005.txt",
  84     "CodpageFiles/10006.txt",
  85     "CodpageFiles/10007.txt",
  86     "CodpageFiles/10008.txt",
  87     "CodpageFiles/10010.txt",
  88     "CodpageFiles/10017.txt",
  89     "CodpageFiles/10021.txt",
  90     "CodpageFiles/10029.txt",
  91     "CodpageFiles/10079.txt",
  92     "CodpageFiles/10081.txt",
  93     "CodpageFiles/10082.txt",
  94     "CodpageFiles/20127.txt",
  95     "CodpageFiles/20866.txt",
  96     "CodpageFiles/21866.txt",
  97     "CodpageFiles/28591.txt",
  98     "CodpageFiles/28592.txt",
  99     "CodpageFiles/28593.txt",
 100     "CodpageFiles/28594.txt",
 101     "CodpageFiles/28595.txt",
 102     "CodpageFiles/28596.txt",
 103     "CodpageFiles/28597.txt",
 104     "CodpageFiles/28598.txt",
 105     "CodpageFiles/28599.txt",
 106     "CodpageFiles/28603.txt",
 107     "CodpageFiles/28605.txt",
 108 );
 109
 110
 111 my %ctype =
 112 (
 113      # CT_CTYPE1
 114     "upper"  => 0x0001,
 115     "lower"  => 0x0002,
 116     "digit"  => 0x0004,
 117     "space"  => 0x0008,
 118     "punct"  => 0x0010,
 119     "cntrl"  => 0x0020,
 120     "blank"  => 0x0040,
 121     "xdigit" => 0x0080,
 122     "alpha"  => 0x0100 | 0x80000000,
 123     "defin"  => 0x0200,
 124      # CT_CTYPE3 in high 16 bits
 125     "nonspacing"    => 0x00010000,
 126     "diacritic"     => 0x00020000,
 127     "vowelmark"     => 0x00040000,
 128     "symbol"        => 0x00080000,
 129     "katakana"      => 0x00100000,
 130     "hiragana"      => 0x00200000,
 131     "halfwidth"     => 0x00400000,
 132     "fullwidth"     => 0x00800000,
 133     "ideograph"     => 0x01000000,
 134     "kashida"       => 0x02000000,
 135     "lexical"       => 0x04000000,
 136     "highsurrogate" => 0x08000000,
 137     "lowsurrogate"  => 0x10000000,
 138 );
 139
 140 my %bracket_types =
 141 (
 142     "o" => 0x0000,
 143     "c" => 0x0001,
 144 );
 145
 146 my %indic_types =
 147 (
 148     "Other"    => 0x0000,
 149     "Bindu"    => 0x0001,
 150     "Visarga"  => 0x0002,
 151     "Avagraha" => 0x0003,
 152     "Nukta"    => 0x0004,
 153     "Virama"   => 0x0005,
 154     "Vowel_Independent"  => 0x0006,
 155     "Vowel_Dependent"  => 0x0007,
 156     "Vowel"  => 0x0008,
 157     "Consonant_Placeholder"  => 0x0009,
 158     "Consonant"  => 0x000a,
 159     "Consonant_Dead"  => 0x000b,
 160     "Consonant_Succeeding_Repha" => 0x000c,
 161     "Consonant_Subjoined"  => 0x000d,
 162     "Consonant_Medial"  => 0x000e,
 163     "Consonant_Final"  => 0x000f,
 164     "Consonant_Head_Letter"  => 0x0010,
 165     "Modifying_Letter"  => 0x0011,
 166     "Tone_Letter"  => 0x0012,
 167     "Tone_Mark"  => 0x0013,
 168     "Register_Shifter"  => 0x0014,
 169     "Consonant_Preceding_Repha" => 0x0015,
 170     "Pure_Killer" => 0x0016,
 171     "Invisible_Stacker" => 0x0017,
 172     "Gemination_Mark" => 0x0018,
 173     "Cantillation_Mark" => 0x0019,
 174     "Non_Joiner" => 0x001a,
 175     "Joiner" => 0x001b,
 176     "Number_Joiner" => 0x001c,
 177     "Number" => 0x001d,
 178     "Brahmi_Joining_Number" => 0x001e,
 179     "Consonant_With_Stacker" => 0x001f,
 180     "Consonant_Prefixed" => 0x0020,
 181     "Syllable_Modifier" => 0x0021,
 182     "Consonant_Killer" => 0x0022,
 183     "Consonant_Initial_Postfixed" => 0x0023,
 184 );
 185
 186 my %matra_types =
 187 (
 188     "Right"    => 0x01,
 189     "Left"  => 0x02,
 190     "Visual_Order_Left" => 0x03,
 191     "Left_And_Right"    => 0x04,
 192     "Top"   => 0x05,
 193     "Bottom"  => 0x06,
 194     "Top_And_Bottom"  => 0x07,
 195     "Top_And_Right"  => 0x08,
 196     "Top_And_Left"  => 0x09,
 197     "Top_And_Left_And_Right"  => 0x0a,
 198     "Bottom_And_Right"  => 0x0b,
 199     "Top_And_Bottom_And_Right"  => 0x0c,
 200     "Overstruck"  => 0x0d,
 201     "Invisible"  => 0x0e,
 202     "Bottom_And_Left"  => 0x0f,
 203     "Top_And_Bottom_And_Left"  => 0x10,
 204 );
 205
 206 my %break_types =
 207 (
 208     "BK"  => 0x0001,
 209     "CR"  => 0x0002,
 210     "LF"  => 0x0003,
 211     "CM"  => 0x0004,
 212     "SG"  => 0x0005,
 213     "GL"  => 0x0006,
 214     "CB"  => 0x0007,
 215     "SP"  => 0x0008,
 216     "ZW"  => 0x0009,
 217     "NL"  => 0x000a,
 218     "WJ"  => 0x000b,
 219     "JL"  => 0x000c,
 220     "JV"  => 0x000d,
 221     "JT"  => 0x000e,
 222     "H2"  => 0x000f,
 223     "H3"  => 0x0010,
 224     "XX"  => 0x0011,
 225     "OP"  => 0x0012,
 226     "CL"  => 0x0013,
 227     "CP"  => 0x0014,
 228     "QU"  => 0x0015,
 229     "NS"  => 0x0016,
 230     "EX"  => 0x0017,
 231     "SY"  => 0x0018,
 232     "IS"  => 0x0019,
 233     "PR"  => 0x001a,
 234     "PO"  => 0x001b,
 235     "NU"  => 0x001c,
 236     "AL"  => 0x001d,
 237     "ID"  => 0x001e,
 238     "IN"  => 0x001f,
 239     "HY"  => 0x0020,
 240     "BB"  => 0x0021,
 241     "BA"  => 0x0022,
 242     "SA"  => 0x0023,
 243     "AI"  => 0x0024,
 244     "B2"  => 0x0025,
 245     "HL"  => 0x0026,
 246     "CJ"  => 0x0027,
 247     "RI"  => 0x0028,
 248     "EB"  => 0x0029,
 249     "EM"  => 0x002a,
 250     "ZWJ" => 0x002b,
 251 );
 252
 253 my %vertical_types =
 254 (
 255     "R"  => 0x0000,
 256     "U"  => 0x0001,
 257     "Tr" => 0x0002,
 258     "Tu" => 0x0003,
 259 );
 260
 261 my %categories =
 262 (
 263     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 264     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 265     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 266     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 267     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 268     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 269     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 270     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 271     "No" => $ctype{"defin"},                    # Number, Other
 272     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 273     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 274     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 275     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 276     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 277     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 278     "Co" => $ctype{"defin"},                    # Other, Private Use
 279     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 280     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 281     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 282     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 283     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 284     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 285     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 286     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 287     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 288     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 289     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 290     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 291     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 292     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 293 );
 294
 295 # a few characters need additional categories that cannot be determined automatically
 296 my %special_categories =
 297 (
 298     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 299                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 300     "space"  => [ 0x09..0x0d, 0x85 ],
 301     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 302     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 303                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 304                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 305                   0xfff9, 0xfffa, 0xfffb ],
 306     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 307                   0xd7, 0xf7 ],
 308     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 309     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 310     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 311                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 312     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 313     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 314                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 315                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 316     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 317     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 318                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 319                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 320                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 321                      0x3131..0x3164 ],
 322     "ideograph" => [ 0x3006..0x3007 ],
 323     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 324                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 325                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 326                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 327                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 328                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 329                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 330                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 331                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 332                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 333     "kashida" => [ 0x0640 ],
 334 );
 335
 336 my %directions =
 337 (
 338     "L"   => 1,    # Left-to-Right
 339     "R"   => 2,    # Right-to-Left
 340     "AL"  => 12,   # Right-to-Left Arabic
 341     "EN"  => 3,    # European Number
 342     "ES"  => 4,    # European Number Separator
 343     "ET"  => 5,    # European Number Terminator
 344     "AN"  => 6,    # Arabic Number
 345     "CS"  => 7,    # Common Number Separator
 346     "NSM" => 13,   # Non-Spacing Mark
 347     "BN"  => 14,   # Boundary Neutral
 348     "B"   => 8,    # Paragraph Separator
 349     "S"   => 9,    # Segment Separator
 350     "WS"  => 10,   # Whitespace
 351     "ON"  => 11,   # Other Neutrals
 352     "LRE" => 15,   # Left-to-Right Embedding
 353     "LRO" => 15,   # Left-to-Right Override
 354     "RLE" => 15,   # Right-to-Left Embedding
 355     "RLO" => 15,   # Right-to-Left Override
 356     "PDF" => 15,   # Pop Directional Format
 357     "LRI" => 15,   # Left-to-Right Isolate
 358     "RLI" => 15,   # Right-to-Left Isolate
 359     "FSI" => 15,   # First Strong Isolate
 360     "PDI" => 15    # Pop Directional Isolate
 361 );
 362
 363 my %c2_types =
 364 (
 365     "L"   => 1,    # C2_LEFTTORIGHT
 366     "R"   => 2,    # C2_RIGHTTOLEFT
 367     "AL"  => 2,    # C2_RIGHTTOLEFT
 368     "EN"  => 3,    # C2_EUROPENUMBER
 369     "ES"  => 4,    # C2_EUROPESEPARATOR
 370     "ET"  => 5,    # C2_EUROPETERMINATOR
 371     "AN"  => 6,    # C2_ARABICNUMBER
 372     "CS"  => 7,    # C2_COMMONSEPARATOR
 373     "NSM" => 11,   # C2_OTHERNEUTRAL
 374     "BN"  => 0,    # C2_NOTAPPLICABLE
 375     "B"   => 8,    # C2_BLOCKSEPARATOR
 376     "S"   => 9,    # C2_SEGMENTSEPARATOR
 377     "WS"  => 10,   # C2_WHITESPACE
 378     "ON"  => 11,   # C2_OTHERNEUTRAL
 379     "LRE" => 11,   # C2_OTHERNEUTRAL
 380     "LRO" => 11,   # C2_OTHERNEUTRAL
 381     "RLE" => 11,   # C2_OTHERNEUTRAL
 382     "RLO" => 11,   # C2_OTHERNEUTRAL
 383     "PDF" => 11,   # C2_OTHERNEUTRAL
 384     "LRI" => 11,   # C2_OTHERNEUTRAL
 385     "RLI" => 11,   # C2_OTHERNEUTRAL
 386     "FSI" => 11,   # C2_OTHERNEUTRAL
 387     "PDI" => 11    # C2_OTHERNEUTRAL
 388 );
 389
 390 my %bidi_types =
 391 (
 392     "ON"  => 0,    # Other Neutrals
 393     "L"   => 1,    # Left-to-Right
 394     "R"   => 2,    # Right-to-Left
 395     "AN"  => 3,    # Arabic Number
 396     "EN"  => 4,    # European Number
 397     "AL"  => 5,    # Right-to-Left Arabic
 398     "NSM" => 6,    # Non-Spacing Mark
 399     "CS"  => 7,    # Common Number Separator
 400     "ES"  => 8,    # European Number Separator
 401     "ET"  => 9,    # European Number Terminator
 402     "BN"  => 10,   # Boundary Neutral
 403     "S"   => 11,   # Segment Separator
 404     "WS"  => 12,   # Whitespace
 405     "B"   => 13,   # Paragraph Separator
 406     "RLO" => 14,   # Right-to-Left Override
 407     "RLE" => 15,   # Right-to-Left Embedding
 408     "LRO" => 16,   # Left-to-Right Override
 409     "LRE" => 17,   # Left-to-Right Embedding
 410     "PDF" => 18,   # Pop Directional Format
 411     "LRI" => 19,   # Left-to-Right Isolate
 412     "RLI" => 20,   # Right-to-Left Isolate
 413     "FSI" => 21,   # First Strong Isolate
 414     "PDI" => 22    # Pop Directional Isolate
 415 );
 416
 417 my %joining_types =
 418 (
 419    "U" => 0,    # Non_Joining
 420    "T" => 1,    # Transparent
 421    "R" => 2,    # Right_Joining
 422    "L" => 3,    # Left_Joining
 423    "D" => 4,    # Dual_Joining
 424    "C" => 5,    # Join_Causing
 425 );
 426
 427 my @cp2uni = ();
 428 my @glyph2uni = ();
 429 my @lead_bytes = ();
 430 my @uni2cp = ();
 431 my @tolower_table = ();
 432 my @toupper_table = ();
 433 my @digitmap_table = ();
 434 my @category_table = ();
 435 my @joining_table = ();
 436 my @direction_table = ();
 437 my @decomp_table = ();
 438 my @combining_class_table = ();
 439 my @decomp_compat_table = ();
 440 my @comp_exclusions = ();
 441 my @idna_decomp_table = ();
 442 my @idna_disallowed = ();
 443 my %registry_keys;
 444 my $default_char;
 445 my $default_wchar;
 446
 447 my %joining_forms =
 448 (
 449    "isolated" => [],
 450    "final" => [],
 451    "initial" => [],
 452    "medial" => []
 453 );
 454
 455 sub to_utf16(@)
 456 {
 457     my @ret;
 458     foreach my $ch (@_)
 459     {
 460         if ($ch < 0x10000)
 461         {
 462             push @ret, $ch;
 463         }
 464         else
 465         {
 466             my $val = $ch - 0x10000;
 467             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 468         }
 469     }
 470     return @ret;
 471 }
 472
 473 ################################################################
 474 # fetch a unicode.org file and open it
 475 sub open_data_file($$)
 476 {
 477     my ($base, $name) = @_;
 478     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 479     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 480     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 481     local *FILE;
 482
 483     if ($base =~ /.*\/([^\/]+)\.zip$/)
 484     {
 485         my $zip = "$1$suffix.zip";
 486         unless (-f "$cache/$zip")
 487         {
 488             system "mkdir", "-p", $cache;
 489             print "Fetching $base...\n";
 490             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 491         }
 492         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 493     }
 494     else
 495     {
 496         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 497         unless (-f $dest)
 498         {
 499             system "mkdir", "-p", $dir;
 500             print "Fetching $base/$name...\n";
 501             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 502         }
 503         open FILE, "<$dest" or die "cannot open $dest";
 504     }
 505     return *FILE;
 506 }
 507
 508 ################################################################
 509 # recursively get the decomposition for a character
 510 sub get_decomposition($$);
 511 sub get_decomposition($$)
 512 {
 513     my ($char, $table) = @_;
 514     my @ret;
 515
 516     return $char unless defined $table->[$char];
 517     foreach my $ch (@{$table->[$char]})
 518     {
 519         push @ret, get_decomposition( $ch, $table );
 520     }
 521     return @ret;
 522 }
 523
 524 ################################################################
 525 # get the composition that results in a given character
 526 sub get_composition($$)
 527 {
 528     my ($ch, $compat) = @_;
 529     return () unless defined $decomp_table[$ch];  # no decomposition
 530     my @ret = @{$decomp_table[$ch]};
 531     return () if @ret < 2;                        # singleton decomposition
 532     return () if $comp_exclusions[$ch];           # composition exclusion
 533     return () if $combining_class_table[$ch];     # non-starter
 534     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 535     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 536         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 537     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 538         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 539     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 540         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 541     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 542     return @ret;
 543 }
 544
 545 ################################################################
 546 # recursively build decompositions
 547 sub build_decompositions(@)
 548 {
 549     my @src = @_;
 550     my @dst;
 551
 552     for (my $i = 0; $i < @src; $i++)
 553     {
 554         next unless defined $src[$i];
 555         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 556         $dst[$i] = \@decomp;
 557     }
 558     return @dst;
 559 }
 560
 561 ################################################################
 562 # compose Hangul sequences
 563 sub compose_hangul(@)
 564 {
 565     my $SBASE  = 0xac00;
 566     my $LBASE  = 0x1100;
 567     my $VBASE  = 0x1161;
 568     my $TBASE  = 0x11a7;
 569     my $LCOUNT = 19;
 570     my $VCOUNT = 21;
 571     my $TCOUNT = 28;
 572     my $NCOUNT = $VCOUNT * $TCOUNT;
 573     my $SCOUNT = $LCOUNT * $NCOUNT;
 574
 575     my @seq = @_;
 576     my @ret;
 577     my $i;
 578
 579     for ($i = 0; $i < @seq; $i++)
 580     {
 581         my $ch = $seq[$i];
 582         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 583             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 584         {
 585             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 586             $i++;
 587         }
 588         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 589             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 590         {
 591             $ch += $seq[$i+1] - $TBASE;
 592             $i++;
 593         }
 594         push @ret, $ch;
 595     }
 596     return @ret;
 597 }
 598
 599 ################################################################
 600 # remove linguistic-only mappings from the case table
 601 sub remove_linguistic_mappings($$)
 602 {
 603     my ($upper, $lower) = @_;
 604
 605     # remove case mappings that don't round-trip
 606
 607     for (my $i = 0; $i < @{$upper}; $i++)
 608     {
 609         next unless defined ${$upper}[$i];
 610         my $ch = ${$upper}[$i];
 611         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 612     }
 613     for (my $i = 0; $i < @{$lower}; $i++)
 614     {
 615         next unless defined ${$lower}[$i];
 616         my $ch = ${$lower}[$i];
 617         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 618     }
 619 }
 620
 621 ################################################################
 622 # read in the Unicode database files
 623 sub load_data()
 624 {
 625     my $start;
 626
 627     # now build mappings from the decomposition field of the Unicode database
 628
 629     my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
 630     while (<$UNICODE_DATA>)
 631     {
 632         # Decode the fields ...
 633         my ($code, $name, $cat, $comb, $bidi,
 634             $decomp, $dec, $dig, $num, $mirror,
 635             $oldname, $comment, $upper, $lower, $title) = split /;/;
 636         my $src = hex $code;
 637
 638         die "unknown category $cat" unless defined $categories{$cat};
 639         die "unknown directionality $bidi" unless defined $directions{$bidi};
 640
 641         $category_table[$src] = $categories{$cat};
 642         $direction_table[$src] = $bidi;
 643         $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";
 644
 645         if ($lower ne "")
 646         {
 647             $tolower_table[$src] = hex $lower;
 648         }
 649         if ($upper ne "")
 650         {
 651             $toupper_table[$src] = hex $upper;
 652         }
 653         if ($dec ne "")
 654         {
 655             $category_table[$src] |= $ctype{"digit"};
 656         }
 657         if ($dig ne "")
 658         {
 659             $digitmap_table[$src] = ord $dig;
 660         }
 661         $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use
 662
 663         $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
 664         $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
 665         $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
 666         $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
 667         $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
 668         $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
 669         $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
 670         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
 671         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
 672         $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
 673         $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
 674         $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;
 675
 676         # copy the category and direction for everything between First/Last pairs
 677         if ($name =~ /, First>/) { $start = $src; }
 678         if ($name =~ /, Last>/)
 679         {
 680             while ($start < $src)
 681             {
 682                 $category_table[$start] = $category_table[$src];
 683                 $direction_table[$start] = $direction_table[$src];
 684                 $combining_class_table[$start] = $combining_class_table[$src];
 685                 $start++;
 686             }
 687         }
 688
 689         next if $decomp eq "";  # no decomposition, skip it
 690
 691         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
 692         {
 693             my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
 694             $decomp_compat_table[$src] = \@seq;
 695         }
 696
 697         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 698         {
 699             # decomposition of the form "<foo> 1234" -> use char if type is known
 700             if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
 701             {
 702                 ${joining_forms{$1}}[hex $2] = $src;
 703             }
 704         }
 705         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 706         {
 707             # decomposition "<compat> 0020 1234" -> combining accent
 708         }
 709         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 710         {
 711             # store decomposition
 712             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 713             {
 714                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
 715             }
 716             elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
 717             {
 718                 # Single char decomposition
 719                 $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
 720             }
 721         }
 722     }
 723     close $UNICODE_DATA;
 724
 725     # patch the category of some special characters
 726
 727     for (my $i = 0; $i < @decomp_table; $i++)
 728     {
 729         next unless defined $decomp_table[$i];
 730         $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
 731     }
 732     foreach my $cat (keys %special_categories)
 733     {
 734         my $flag = $ctype{$cat};
 735         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 736     }
 737     for (my $i = 0; $i < @decomp_compat_table; $i++)
 738     {
 739         next unless defined $decomp_compat_table[$i];
 740         next unless @{$decomp_compat_table[$i]} == 2;
 741         $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
 742     }
 743
 744     # load the composition exclusions
 745
 746     my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
 747     while (<$EXCL>)
 748     {
 749         s/\#.*//;  # remove comments
 750         if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
 751         {
 752             foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
 753         }
 754         elsif (/^([0-9a-fA-F]+)\s*$/)
 755         {
 756             $comp_exclusions[hex $1] = 1;
 757         }
 758     }
 759     close $EXCL;
 760
 761     # load the IDNA mappings
 762
 763     @idna_decomp_table = @decomp_compat_table;
 764     my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
 765     while (<$IDNA>)
 766     {
 767         s/\#.*//;  # remove comments
 768         next if /^\s*$/;
 769         my ($char, $type, $mapping) = split /;/;
 770         my ($ch1, $ch2);
 771         if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
 772         {
 773             $ch1 = hex $1;
 774             $ch2 = hex $2;
 775         }
 776         elsif ($char =~ /([0-9a-fA-F]+)/)
 777         {
 778             $ch1 = $ch2 = hex $1;
 779         }
 780
 781         if ($type =~ /mapped/ || $type =~ /deviation/)
 782         {
 783             $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
 784             my @seq = map { hex $_; } split /\s+/, $mapping;
 785             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
 786         }
 787         elsif ($type =~ /valid/)
 788         {
 789         }
 790         elsif ($type =~ /ignored/)
 791         {
 792             foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
 793         }
 794         elsif ($type =~ /disallowed/)
 795         {
 796             foreach my $i ($ch1 .. $ch2)
 797             {
 798                 $idna_decomp_table[$i] = undef;
 799                 $idna_disallowed[$i] = 1;
 800             }
 801         }
 802     }
 803     close $IDNA;
 804 }
 805
 806
 807 ################################################################
 808 # add a new registry key
 809 sub add_registry_key($$)
 810 {
 811     my ($key, $defval) = @_;
 812     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 813 }
 814
 815 ################################################################
 816 # add a new registry value
 817 sub add_registry_value($$$)
 818 {
 819     my ($key, $name, $value) = @_;
 820     add_registry_key( $key, undef );
 821     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 822 }
 823
 824 ################################################################
 825 # define a new lead byte
 826 sub add_lead_byte($)
 827 {
 828     my $ch = shift;
 829     return if defined $cp2uni[$ch];
 830     push @lead_bytes, $ch;
 831     $cp2uni[$ch] = 0;
 832 }
 833
 834 ################################################################
 835 # define a new char mapping
 836 sub add_mapping($$)
 837 {
 838     my ($cp, $uni) = @_;
 839     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 840     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 841     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 842 }
 843
 844 ################################################################
 845 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 846 sub get_glyphs_mapping(@)
 847 {
 848     my @table = @_;
 849
 850     for (my $i = 0; $i < @glyph2uni; $i++)
 851     {
 852         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 853     }
 854     return @table;
 855 }
 856
 857 ################################################################
 858 # build EUC-JP table from the JIS 0208/0212 files
 859 sub dump_eucjp_codepage()
 860 {
 861     @cp2uni = ();
 862     @glyph2uni = ();
 863     @lead_bytes = ();
 864     @uni2cp = ();
 865     $default_char = $DEF_CHAR;
 866     $default_wchar = 0x30fb;
 867
 868     # ASCII chars
 869     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 870
 871     # lead bytes
 872     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 873
 874     # JIS X 0201 right plane
 875     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 876
 877     # undefined chars
 878     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 879     $cp2uni[0xa0] = 0xf8f0;
 880     $cp2uni[0xff] = 0xf8f3;
 881
 882     # Fix backslash conversion
 883     add_mapping( 0xa1c0, 0xff3c );
 884
 885     # Add private mappings for rows undefined in JIS 0208/0212
 886     my $private = 0xe000;
 887     foreach my $hi (0xf5 .. 0xfe)
 888     {
 889         foreach my $lo (0xa1 .. 0xfe)
 890         {
 891             add_mapping( ($hi << 8) + $lo, $private++ );
 892         }
 893     }
 894     foreach my $hi (0xf5 .. 0xfe)
 895     {
 896         foreach my $lo (0x21 .. 0x7e)
 897         {
 898             add_mapping( ($hi << 8) + $lo, $private++ );
 899         }
 900     }
 901
 902     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 903     while (<$INPUT>)
 904     {
 905         next if /^\#/;  # skip comments
 906         next if /^$/;  # skip empty lines
 907         next if /\x1a/;  # skip ^Z
 908         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 909         {
 910             add_mapping( 0x8080 + hex $1, hex $2 );
 911             next;
 912         }
 913         die "Unrecognized line $_\n";
 914     }
 915     close $INPUT;
 916
 917     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 918     while (<$INPUT>)
 919     {
 920         next if /^\#/;  # skip comments
 921         next if /^$/;  # skip empty lines
 922         next if /\x1a/;  # skip ^Z
 923         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 924         {
 925             add_mapping( 0x8000 + hex $1, hex $2 );
 926             next;
 927         }
 928         die "Unrecognized line $_\n";
 929     }
 930     close $INPUT;
 931
 932     output_codepage_file( 20932 );
 933 }
 934
 935
 936 ################################################################
 937 # build the sort keys table
 938 sub dump_sortkeys($)
 939 {
 940     my $filename = shift;
 941     my @sortkeys = ();
 942
 943     my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
 944     while (<$INPUT>)
 945     {
 946         next if /^\#/;  # skip comments
 947         next if /^$/;  # skip empty lines
 948         next if /\x1a/;  # skip ^Z
 949         next if /^\@version/;  # skip @version header
 950         if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 951         {
 952             my ($uni,$variable) = (hex $1, $2);
 953             next if $uni > 65535;
 954             $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
 955             next;
 956         }
 957         if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 958         {
 959             # multiple character sequence, ignored for now
 960             next;
 961         }
 962         die "$SORTKEYS: Unrecognized line $_\n";
 963     }
 964     close $INPUT;
 965
 966     # compress the keys to 32 bit:
 967     # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
 968
 969     @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
 970                        ${$a}[2] <=> ${$b}[2] or
 971                        ${$a}[3] <=> ${$b}[3] or
 972                        ${$a}[4] <=> ${$b}[4] or
 973                        $a cmp $b; } @sortkeys;
 974
 975     my ($n2, $n3) = (1, 1);
 976     my @keys = (-1, -1, -1, -1, -1 );
 977     my @flatkeys = ();
 978
 979     for (my $i = 0; $i < @sortkeys; $i++)
 980     {
 981         next unless defined $sortkeys[$i];
 982         my @current = @{$sortkeys[$i]};
 983         if ($current[1] == $keys[1])
 984         {
 985             if ($current[2] == $keys[2])
 986             {
 987                 if ($current[3] == $keys[3])
 988                 {
 989                     # nothing
 990                 }
 991                 else
 992                 {
 993                     $keys[3] = $current[3];
 994                     $n3++;
 995                     die if ($n3 >= 16);
 996                 }
 997             }
 998             else
 999             {
1000                 $keys[2] = $current[2];
1001                 $keys[3] = $current[3];
1002                 $n2++;
1003                 $n3 = 1;
1004                 die if ($n2 >= 256);
1005             }
1006         }
1007         else
1008         {
1009             $keys[1] = $current[1];
1010             $keys[2] = $current[2];
1011             $keys[3] = $current[3];
1012             $n2 = 1;
1013             $n3 = 1;
1014         }
1015
1016         if ($current[2]) { $current[2] = $n2; }
1017         if ($current[3]) { $current[3] = $n3; }
1018         if ($current[4]) { $current[4] = 1; }
1019
1020         $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1021     }
1022
1023     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1024     printf "Building $filename\n";
1025     printf OUTPUT "/* Unicode collation element table */\n";
1026     printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1027     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1028     print OUTPUT "#include \"windef.h\"\n\n";
1029
1030     dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1031
1032     close OUTPUT;
1033     save_file($filename);
1034 }
1035
1036
1037 ################################################################
1038 # dump an array of integers
1039 sub dump_array($$@)
1040 {
1041     my ($bit_width, $default, @array) = @_;
1042     my $format = sprintf "0x%%0%ux", $bit_width / 4;
1043     my $i;
1044     my $ret = "    ";
1045     for ($i = 0; $i < $#array; $i++)
1046     {
1047         $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1048         $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
1049     }
1050     $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
1051     return $ret;
1052 }
1053
1054
1055 ################################################################
1056 # dump an SBCS mapping table in binary format
1057 sub dump_binary_sbcs_table($)
1058 {
1059     my $codepage = shift;
1060
1061     my @header = ( 13, $codepage, 1, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1062     my $wc_offset = 256 + 3 + (@glyph2uni ? 256 : 0);
1063
1064     print OUTPUT pack "S<*", @header;
1065     print OUTPUT pack "C12", (0) x 12;
1066     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1067
1068     if (@glyph2uni)
1069     {
1070         print OUTPUT pack "S<*", 256, get_glyphs_mapping(@cp2uni[0 .. 255]);
1071     }
1072     else
1073     {
1074         print OUTPUT pack "S<*", 0;
1075     }
1076
1077     print OUTPUT pack "S<*", 0, 0;
1078
1079     print OUTPUT pack "C*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1080 }
1081
1082
1083 ################################################################
1084 # dump a DBCS mapping table in binary format
1085 sub dump_binary_dbcs_table($)
1086 {
1087     my $codepage = shift;
1088     my @lb_ranges = get_lb_ranges();
1089     my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );
1090
1091     my @offsets = (0) x 256;
1092     my $pos = 0;
1093     foreach my $i (@lead_bytes)
1094     {
1095         $offsets[$i] = ($pos += 256);
1096         $cp2uni[$i] = 0;
1097     }
1098
1099     my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);
1100
1101     print OUTPUT pack "S<*", @header;
1102     print OUTPUT pack "C12", @lb_ranges, 0 x 12;
1103     print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
1104     print OUTPUT pack "S<*", 0, scalar @lb_ranges / 2, @offsets;
1105
1106     foreach my $i (@lead_bytes)
1107     {
1108         my $base = $i << 8;
1109         print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
1110     }
1111
1112     print OUTPUT pack "S<", 4;
1113     print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
1114 }
1115
1116
1117 ################################################################
1118 # get the list of defined lead byte ranges
1119 sub get_lb_ranges()
1120 {
1121     my @list = ();
1122     my @ranges = ();
1123
1124     foreach my $i (@lead_bytes) { $list[$i] = 1; }
1125     my $on = 0;
1126     for (my $i = 0; $i < 256; $i++)
1127     {
1128         if ($on)
1129         {
1130             if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
1131         }
1132         else
1133         {
1134             if ($list[$i]) { push @ranges, $i; $on = 1; }
1135         }
1136     }
1137     if ($on) { push @ranges, 0xff; }
1138     return @ranges;
1139 }
1140
1141 ################################################################
1142 # dump the Indic Syllabic Category table
1143 sub dump_indic($)
1144 {
1145     my $filename = shift;
1146     my @indic_table;
1147
1148     my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
1149     while (<$INPUT>)
1150     {
1151         next if /^\#/;  # skip comments
1152         next if /^\s*$/;  # skip empty lines
1153         next if /\x1a/;  # skip ^Z
1154         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1155         {
1156             my $type = $2;
1157             die "unknown indic $type" unless defined $indic_types{$type};
1158             if (hex $1 < 65536)
1159             {
1160                 $indic_table[hex $1] = $indic_types{$type};
1161             }
1162             next;
1163         }
1164         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1165         {
1166             my $type = $3;
1167             die "unknown indic $type" unless defined $indic_types{$type};
1168             if (hex $1 < 65536 and hex $2 < 65536)
1169             {
1170                 foreach my $i (hex $1 .. hex $2)
1171                 {
1172                     $indic_table[$i] = $indic_types{$type};
1173                 }
1174             }
1175             next;
1176         }
1177         die "malformed line $_";
1178     }
1179     close $INPUT;
1180
1181     $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
1182     while (<$INPUT>)
1183     {
1184         next if /^\#/;  # skip comments
1185         next if /^\s*$/;  # skip empty lines
1186         next if /\x1a/;  # skip ^Z
1187         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
1188         {
1189             my $type = $2;
1190             die "unknown matra $type" unless defined $matra_types{$type};
1191             $indic_table[hex $1] |= $matra_types{$type} << 8;
1192             next;
1193         }
1194         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
1195         {
1196             my $type = $3;
1197             die "unknown matra $type" unless defined $matra_types{$type};
1198             foreach my $i (hex $1 .. hex $2)
1199             {
1200                 $indic_table[$i] |= $matra_types{$type} << 8;
1201             }
1202             next;
1203         }
1204         die "malformed line $_";
1205     }
1206     close $INPUT;
1207
1208     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1209     print "Building $filename\n";
1210     print OUTPUT "/* Unicode Indic Syllabic Category */\n";
1211     print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
1212     print OUTPUT "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n";
1213     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1214     print OUTPUT "#include \"windef.h\"\n\n";
1215
1216     dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );
1217
1218     close OUTPUT;
1219     save_file($filename);
1220 }
1221
1222 ################################################################
1223 # dump the Line Break Properties table
1224 sub dump_linebreak($)
1225 {
1226     my $filename = shift;
1227     my @break_table;
1228     my $next_group = 0;
1229
1230     my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
1231     while (<$INPUT>)
1232     {
1233         next if /^\#/;  # skip comments
1234         next if /^\s*$/;  # skip empty lines
1235         next if /\x1a/;  # skip ^Z
1236         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1237         {
1238             my $type = $2;
1239             die "unknown breaktype $type" unless defined $break_types{$type};
1240             $break_table[hex $1] = $break_types{$type};
1241             next;
1242         }
1243         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
1244         {
1245             my $type = $3;
1246             die "unknown breaktype $type" unless defined $break_types{$type};
1247             foreach my $i (hex $1 .. hex $2)
1248             {
1249                 $break_table[$i] = $break_types{$type};
1250             }
1251             next;
1252         }
1253         elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1254         {
1255             my $type = $2;
1256             die "unknown breaktype $type" unless defined $break_types{$type};
1257             $break_table[hex $1] = $break_types{$type};
1258             next;
1259         }
1260         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
1261         {
1262             my $type = $3;
1263             die "unknown breaktype $type" unless defined $break_types{$type};
1264             foreach my $i (hex $1 .. hex $2)
1265             {
1266                 $break_table[$i] = $break_types{$type};
1267             }
1268             next;
1269         }
1270         die "malformed line $_";
1271     }
1272     close $INPUT;
1273
1274     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1275     print "Building $filename\n";
1276     print OUTPUT "/* Unicode Line Break Properties */\n";
1277     print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
1278     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1279     print OUTPUT "#include \"windef.h\"\n\n";
1280
1281     dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );
1282
1283     close OUTPUT;
1284     save_file($filename);
1285 }
1286
1287 my %scripts =
1288 (
1289     "Unknown"                => 0,
1290     "Common"                 => 1,
1291     "Inherited"              => 2,
1292     "Arabic"                 => 3,
1293     "Armenian"               => 4,
1294     "Avestan"                => 5,
1295     "Balinese"               => 6,
1296     "Bamum"                  => 7,
1297     "Batak"                  => 8,
1298     "Bengali"                => 9,
1299     "Bopomofo"               => 10,
1300     "Brahmi"                 => 11,
1301     "Braille"                => 12,
1302     "Buginese"               => 13,
1303     "Buhid"                  => 14,
1304     "Canadian_Aboriginal"    => 15,
1305     "Carian"                 => 16,
1306     "Cham"                   => 17,
1307     "Cherokee"               => 18,
1308     "Coptic"                 => 19,
1309     "Cuneiform"              => 20,
1310     "Cypriot"                => 21,
1311     "Cyrillic"               => 22,
1312     "Deseret"                => 23,
1313     "Devanagari"             => 24,
1314     "Egyptian_Hieroglyphs"   => 25,
1315     "Ethiopic"               => 26,
1316     "Georgian"               => 27,
1317     "Glagolitic"             => 28,
1318     "Gothic"                 => 29,
1319     "Greek"                  => 30,
1320     "Gujarati"               => 31,
1321     "Gurmukhi"               => 32,
1322     "Han"                    => 33,
1323     "Hangul"                 => 34,
1324     "Hanunoo"                => 35,
1325     "Hebrew"                 => 36,
1326     "Hiragana"               => 37,
1327     "Imperial_Aramaic"       => 38,
1328     "Inscriptional_Pahlavi"  => 39,
1329     "Inscriptional_Parthian" => 40,
1330     "Javanese"               => 41,
1331     "Kaithi"                 => 42,
1332     "Kannada"                => 43,
1333     "Katakana"               => 44,
1334     "Kayah_Li"               => 45,
1335     "Kharoshthi"             => 46,
1336     "Khmer"                  => 47,
1337     "Lao"                    => 48,
1338     "Latin"                  => 49,
1339     "Lepcha"                 => 50,
1340     "Limbu"                  => 51,
1341     "Linear_B"               => 52,
1342     "Lisu"                   => 53,
1343     "Lycian"                 => 54,
1344     "Lydian"                 => 55,
1345     "Malayalam"              => 56,
1346     "Mandaic"                => 57,
1347     "Meetei_Mayek"           => 58,
1348     "Mongolian"              => 59,
1349     "Myanmar"                => 60,
1350     "New_Tai_Lue"            => 61,
1351     "Nko"                    => 62,
1352     "Ogham"                  => 63,
1353     "Ol_Chiki"               => 64,
1354     "Old_Italic"             => 65,
1355     "Old_Persian"            => 66,
1356     "Old_South_Arabian"      => 67,
1357     "Old_Turkic"             => 68,
1358     "Oriya"                  => 69,
1359     "Osmanya"                => 70,
1360     "Phags_Pa"               => 71,
1361     "Phoenician"             => 72,
1362     "Rejang"                 => 73,
1363     "Runic"                  => 74,
1364     "Samaritan"              => 75,
1365     "Saurashtra"             => 76,
1366     "Shavian"                => 77,
1367     "Sinhala"                => 78,
1368     "Sundanese"              => 79,
1369     "Syloti_Nagri"           => 80,
1370     "Syriac"                 => 81,
1371     "Tagalog"                => 82,
1372     "Tagbanwa"               => 83,
1373     "Tai_Le"                 => 84,
1374     "Tai_Tham"               => 85,
1375     "Tai_Viet"               => 86,
1376     "Tamil"                  => 87,
1377     "Telugu"                 => 88,
1378     "Thaana"                 => 89,
1379     "Thai"                   => 90,
1380     "Tibetan"                => 91,
1381     "Tifinagh"               => 92,
1382     "Ugaritic"               => 93,
1383     "Vai"                    => 94,
1384     "Yi"                     => 95,
1385     # Win8/Win8.1
1386     "Chakma"                 => 96,
1387     "Meroitic_Cursive"       => 97,
1388     "Meroitic_Hieroglyphs"   => 98,
1389     "Miao"                   => 99,
1390     "Sharada"                => 100,
1391     "Sora_Sompeng"           => 101,
1392     "Takri"                  => 102,
1393     # Win10
1394     "Bassa_Vah"              => 103,
1395     "Caucasian_Albanian"     => 104,
1396     "Duployan"               => 105,
1397     "Elbasan"                => 106,
1398     "Grantha"                => 107,
1399     "Khojki"                 => 108,
1400     "Khudawadi"              => 109,
1401     "Linear_A"               => 110,
1402     "Mahajani"               => 111,
1403     "Manichaean"             => 112,
1404     "Mende_Kikakui"          => 113,
1405     "Modi"                   => 114,
1406     "Mro"                    => 115,
1407     "Nabataean"              => 116,
1408     "Old_North_Arabian"      => 117,
1409     "Old_Permic"             => 118,
1410     "Pahawh_Hmong"           => 119,
1411     "Palmyrene"              => 120,
1412     "Pau_Cin_Hau"            => 121,
1413     "Psalter_Pahlavi"        => 122,
1414     "Siddham"                => 123,
1415     "Tirhuta"                => 124,
1416     "Warang_Citi"            => 125,
1417     # Win10 RS1
1418     "Adlam"                  => 126,
1419     "Ahom"                   => 127,
1420     "Anatolian_Hieroglyphs"  => 128,
1421     "Bhaiksuki"              => 129,
1422     "Hatran"                 => 130,
1423     "Marchen"                => 131,
1424     "Multani"                => 132,
1425     "Newa"                   => 133,
1426     "Old_Hungarian"          => 134,
1427     "Osage"                  => 135,
1428     "SignWriting"            => 136,
1429     "Tangut"                 => 137,
1430     # Win10 RS4
1431     "Masaram_Gondi"          => 138,
1432     "Nushu"                  => 139,
1433     "Soyombo"                => 140,
1434     "Zanabazar_Square"       => 141,
1435     # Win10 1903
1436     "Dogra"                  => 142,
1437     "Gunjala_Gondi"          => 143,
1438     "Hanifi_Rohingya"        => 144,
1439     "Makasar"                => 145,
1440     "Medefaidrin"            => 146,
1441     "Old_Sogdian"            => 147,
1442     "Sogdian"                => 148,
1443     # Win10 2004
1444     "Elymaic"                => 149,
1445     "Nyiakeng_Puachue_Hmong" => 150,
1446     "Nandinagari"            => 151,
1447     "Wancho"                 => 152,
1448 );
1449
1450 ################################################################
1451 # dump Script IDs table
1452 sub dump_scripts($)
1453 {
1454     my $filename = shift;
1455     my $header = $filename;
1456     my @scripts_table;
1457     my $script_index;
1458     my $i;
1459
1460     my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
1461     # Fill the table
1462     # Unknown script id is always 0, so undefined scripts are automatically treated as such
1463     while (<$INPUT>)
1464     {
1465         my $type = "";
1466
1467         next if /^\#/;  # skip comments
1468         next if /^\s*$/;  # skip empty lines
1469         next if /\x1a/;  # skip ^Z
1470         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1471         {
1472             $type = $2;
1473             if (defined $scripts{$type})
1474             {
1475                 $scripts_table[hex $1] = $scripts{$type};
1476             }
1477             next;
1478         }
1479         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1480         {
1481             $type = $3;
1482             if (defined $scripts{$type})
1483             {
1484                 foreach my $i (hex $1 .. hex $2)
1485                 {
1486                     $scripts_table[$i] = $scripts{$type};
1487                 }
1488             }
1489             next;
1490         }
1491     }
1492
1493     close $INPUT;
1494
1495     $header = "$filename.h";
1496     open OUTPUT,">$header.new" or die "Cannot create $header";
1497     print "Building $header\n";
1498     print OUTPUT "/* Unicode Script IDs */\n";
1499     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1500     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1501
1502     print OUTPUT "enum unicode_script_id {\n";
1503     foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
1504     {
1505         print OUTPUT "    Script_$script = $scripts{$script},\n";
1506     }
1507     print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
1508     print OUTPUT "};\n";
1509
1510     close OUTPUT;
1511     save_file($header);
1512
1513     $filename = "$filename.c";
1514     open OUTPUT,">$filename.new" or die "Cannot create $header";
1515     print "Building $filename\n";
1516     print OUTPUT "/* Unicode Script IDs */\n";
1517     print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
1518     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1519     print OUTPUT "#include \"windef.h\"\n\n";
1520
1521     dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
1522     close OUTPUT;
1523     save_file($filename);
1524 }
1525
1526 ################################################################
1527 # dump the BiDi mirroring table
1528 sub dump_mirroring($)
1529 {
1530     my $filename = shift;
1531     my @mirror_table = ();
1532
1533     my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
1534     while (<$INPUT>)
1535     {
1536         next if /^\#/;  # skip comments
1537         next if /^$/;  # skip empty lines
1538         next if /\x1a/;  # skip ^Z
1539         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/)
1540         {
1541             $mirror_table[hex $1] = hex $2;
1542             next;
1543         }
1544         die "malformed line $_";
1545     }
1546     close $INPUT;
1547
1548     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1549     print "Building $filename\n";
1550     print OUTPUT "/* Unicode BiDi mirroring */\n";
1551     print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
1552     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1553     print OUTPUT "#include \"windef.h\"\n\n";
1554     dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
1555     close OUTPUT;
1556     save_file($filename);
1557 }
1558
1559 ################################################################
1560 # dump the Bidi Brackets
1561 sub dump_bracket($)
1562 {
1563     my $filename = shift;
1564     my @bracket_table;
1565
1566     my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
1567     while (<$INPUT>)
1568     {
1569         next if /^\#/;  # skip comments
1570         next if /^\s*$/;  # skip empty lines
1571         next if /\x1a/;  # skip ^Z
1572         if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
1573         {
1574             my $type = $3;
1575             die "unknown bracket $type" unless defined $bracket_types{$type};
1576             die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
1577             $bracket_table[hex $1] = (hex($2) - hex($1)) % 255;
1578             $bracket_table[hex $1] += $bracket_types{$type} << 8;
1579             next;
1580         }
1581         die "malformed line $_";
1582     }
1583     close $INPUT;
1584
1585     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1586     print "Building $filename\n";
1587     print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
1588     print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
1589     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1590     print OUTPUT "#include \"windef.h\"\n\n";
1591
1592     dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );
1593
1594     close OUTPUT;
1595     save_file($filename);
1596 }
1597
1598 ################################################################
1599 # dump the Arabic shaping table
1600 sub dump_shaping($)
1601 {
1602     my $filename = shift;
1603     my %groups;
1604     my $next_group = 0;
1605
1606     $groups{"No_Joining_Group"} = $next_group++;
1607
1608     my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
1609     while (<$INPUT>)
1610     {
1611         next if /^\#/;  # skip comments
1612         next if /^\s*$/;  # skip empty lines
1613         next if /\x1a/;  # skip ^Z
1614         if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/)
1615         {
1616             my $type = $2;
1617             my $group = $3;
1618             $groups{$group} = $next_group++ unless defined $groups{$group};
1619             $joining_table[hex $1] = $joining_types{$type} | ($groups{$group} << 8);
1620             next;
1621         }
1622         die "malformed line $_";
1623     }
1624     close $INPUT;
1625
1626     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1627     print "Building $filename\n";
1628     print OUTPUT "/* Unicode Arabic shaping */\n";
1629     print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
1630     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1631     print OUTPUT "#include \"windef.h\"\n\n";
1632
1633     dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );
1634
1635     print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
1636     for (my $i = 0x600; $i <= 0x6ff; $i++)
1637     {
1638         printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n",
1639             ${joining_forms{"isolated"}}[$i] || $i,
1640             ${joining_forms{"final"}}[$i] || $i,
1641             ${joining_forms{"initial"}}[$i] || $i,
1642             ${joining_forms{"medial"}}[$i] || $i;
1643     }
1644     print OUTPUT "};\n";
1645
1646     close OUTPUT;
1647     save_file($filename);
1648 }
1649
1650 ################################################################
1651 # dump the Vertical Orientation table
1652 sub dump_vertical($)
1653 {
1654     my $filename = shift;
1655     my @vertical_table;
1656
1657     my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
1658     while (<$INPUT>)
1659     {
1660         next if /^\#/;  # skip comments
1661         next if /^\s*$/;  # skip empty lines
1662         next if /\x1a/;  # skip ^Z
1663         if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
1664         {
1665             my $type = $2;
1666             die "unknown vertical $type" unless defined $vertical_types{$type};
1667             if (hex $1 < 65536)
1668             {
1669                 $vertical_table[hex $1] = $vertical_types{$type};
1670             }
1671             next;
1672         }
1673         elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
1674         {
1675             my $type = $3;
1676             die "unknown vertical $type" unless defined $vertical_types{$type};
1677             foreach my $i (hex $1 .. hex $2)
1678             {
1679                 $vertical_table[$i] = $vertical_types{$type};
1680             }
1681             next;
1682         }
1683         die "malformed line $_";
1684     }
1685     close $INPUT;
1686
1687     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1688     print "Building $filename\n";
1689     print OUTPUT "/* Unicode Vertical Orientation */\n";
1690     print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
1691     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1692     print OUTPUT "#include \"windef.h\"\n\n";
1693
1694     dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );
1695
1696     close OUTPUT;
1697     save_file($filename);
1698 }
1699
1700 ################################################################
1701 # dump the digit folding tables
1702 sub dump_digit_folding($)
1703 {
1704     my ($filename) = shift;
1705     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1706     print "Building $filename\n";
1707     print OUTPUT "/* Unicode digit folding mappings */\n";
1708     print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1709     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1710     print OUTPUT "#include \"windef.h\"\n\n";
1711
1712     dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
1713     close OUTPUT;
1714     save_file($filename);
1715 }
1716
1717
1718 ################################################################
1719 # dump the case mapping tables
1720 sub dump_case_mappings($)
1721 {
1722     my $filename = shift;
1723     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1724     print "Building $filename\n";
1725     print OUTPUT "/* Unicode case mappings */\n";
1726     print OUTPUT "/* generated from $UNIDATA:UnicodeData.txt */\n";
1727     print OUTPUT "/* DO NOT EDIT!! */\n\n";
1728     print OUTPUT "#include \"windef.h\"\n\n";
1729
1730     my @upper = @toupper_table;
1731     my @lower = @tolower_table;
1732     remove_linguistic_mappings( \@upper, \@lower );
1733
1734     dump_case_table( "wine_casemap_lower", @lower );
1735     print OUTPUT "\n";
1736     dump_case_table( "wine_casemap_upper", @upper );
1737     close OUTPUT;
1738     save_file($filename);
1739 }
1740
1741
1742 ################################################################
1743 # dump a case mapping table
1744 sub dump_case_table($@)
1745 {
1746     my ($name,@table) = @_;
1747
1748     for (my $i = 0; $i < 65536; $i++)
1749     {
1750         next unless defined $table[$i];
1751         $table[$i] = ($table[$i] - $i) & 0xffff;
1752     }
1753
1754     my @array = compress_array( 256, 0, @table[0..65535] );
1755
1756     printf OUTPUT "const WCHAR %s[%d] =\n", $name, scalar @array;
1757     printf OUTPUT "{\n    /* index */\n";
1758     printf OUTPUT "%s,\n", dump_array( 16, 0, @array[0..255] );
1759     printf OUTPUT "    /* data */\n";
1760     printf OUTPUT "%s", dump_array( 16, 0, @array[256..$#array] );
1761     printf OUTPUT "\n};\n";
1762 }
1763
1764 ################################################################
1765 # compress a mapping table by removing identical rows
1766 sub compress_array($$@)
1767 {
1768     my $rows = shift;
1769     my $def = shift;
1770     my @table = @_;
1771     my $len = @table / $rows;
1772     my @array;
1773     my $data = "";
1774
1775     # try to merge table rows
1776     for (my $row = 0; $row < $rows; $row++)
1777     {
1778         my $rowtxt = pack "U*", map { defined($_) ? $_ : $def; } @table[($row * $len)..(($row + 1) * $len - 1)];
1779         my $pos = index $data, $rowtxt;
1780         if ($pos == -1)
1781         {
1782             # check if the tail of the data can match the start of the new row
1783             my $first = substr( $rowtxt, 0, 1 );
1784             for (my $i = length($data) - 1; $i > 0; $i--)
1785             {
1786                 $pos = index( substr( $data, -$i ), $first );
1787                 last if $pos == -1;
1788                 $i -= $pos;
1789                 next unless substr( $data, -$i ) eq substr( $rowtxt, 0, $i );
1790                 substr( $data, -$i ) = "";
1791                 last;
1792             }
1793             $pos = length $data;
1794             $data .= $rowtxt;
1795         }
1796         $array[$row] = $rows + $pos;
1797     }
1798     return @array, unpack "U*", $data;
1799 }
1800
1801 ################################################################
1802 # dump a simple char -> 16-bit value mapping table
1803 sub dump_simple_mapping($$@)
1804 {
1805     my $name = shift;
1806     my $def = shift;
1807     my @array = compress_array( 256, $def, @_[0..65535] );
1808
1809     printf OUTPUT "const unsigned short %s[%d] =\n{\n", $name, $#array+1;
1810     printf OUTPUT "    /* offsets */\n%s,\n", dump_array( 16, 0, @array[0..255] );
1811     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( 16, 0, @array[256..$#array] );
1812 }
1813
1814 ################################################################
1815 # dump a char -> 16-bit value mapping table using two-level tables
1816 sub dump_two_level_mapping($$@)
1817 {
1818     my $name = shift;
1819     my $def = shift;
1820     my $size = shift;
1821     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1822     my @row_array = compress_array( 4096, $def, @_[0..65535] );
1823     my @array = compress_array( 256, 0, @row_array[0..4095] );
1824
1825     for (my $i = 256; $i < @array; $i++) { $array[$i] += @array - 4096; }
1826
1827     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, @array + @row_array - 4096;
1828     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array[0..255] );
1829     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array[256..$#array] );
1830     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @row_array[4096..$#row_array] );
1831 }
1832
1833 ################################################################
1834 # dump a char -> value mapping table using three-level tables
1835 sub dump_three_level_mapping($$@)
1836 {
1837     my $name = shift;
1838     my $def = shift;
1839     my $size = shift;
1840     my $type = $size == 16 ? "unsigned short" : "unsigned int";
1841     my $level3 = ($MAX_CHAR + 1) / 16;
1842     my $level2 = $level3 / 16;
1843     my $level1 = $level2 / 16;
1844     my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
1845     my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
1846     my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );
1847
1848     for (my $i = $level2; $i < @array2; $i++) { $array2[$i] += @array1 + @array2 - $level2 - $level3; }
1849     for (my $i = $level1; $i < @array1; $i++) { $array1[$i] += @array1 - $level2; }
1850
1851     printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
1852     printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
1853     printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
1854     printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
1855     printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
1856 }
1857
1858 ################################################################
1859 # dump a binary case mapping table in l_intl.nls format
1860 sub dump_binary_case_table(@)
1861 {
1862     my (@table) = @_;
1863     my $max_char = 0x10000;
1864     my $level1 = $max_char / 16;
1865     my $level2 = $level1 / 16;
1866
1867     my @difftable;
1868     for (my $i = 0; $i < @table; $i++)
1869     {
1870         next unless defined $table[$i];
1871         $difftable[$i] = ($table[$i] - $i) & 0xffff;
1872     }
1873
1874     my @row_array = compress_array( $level1, 0, @difftable[0..$max_char-1] );
1875     my @array = compress_array( $level2, 0, @row_array[0..$level1-1] );
1876     my $offset = @array - $level1;
1877     for (my $i = $level2; $i < @array; $i++) { $array[$i] += $offset; }
1878     return pack "S<*", 1 + $offset + @row_array, @array, @row_array[$level1..$#row_array];
1879 }
1880
1881 ################################################################
1882 # dump case mappings for l_intl.nls
1883 sub dump_intl_nls($)
1884 {
1885     my @upper_table = @toupper_table;
1886     my @lower_table = @tolower_table;
1887     remove_linguistic_mappings( \@upper_table, \@lower_table );
1888
1889     my $upper = dump_binary_case_table( @upper_table );
1890     my $lower = dump_binary_case_table( @lower_table );
1891
1892     my $filename = shift;
1893     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1894     printf "Building $filename\n";
1895
1896     binmode OUTPUT;
1897     print OUTPUT pack "S<", 1;  # version
1898     print OUTPUT $upper;
1899     print OUTPUT $lower;
1900     close OUTPUT;
1901     save_file($filename);
1902 }
1903
1904
1905 ################################################################
1906 # dump the bidi direction table
1907 sub dump_bidi_dir_table($)
1908 {
1909     my $filename = shift;
1910     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1911     printf "Building $filename\n";
1912     printf OUTPUT "/* Unicode BiDi direction table */\n";
1913     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1914     printf OUTPUT "#include \"windef.h\"\n\n";
1915
1916     my @table;
1917
1918     for (my $i = 0; $i < 65536; $i++)
1919     {
1920         $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
1921     }
1922
1923     dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );
1924
1925     close OUTPUT;
1926     save_file($filename);
1927 }
1928
1929
1930 ################################################################
1931 # dump the ctype tables
1932 sub dump_ctype_tables($)
1933 {
1934     my $filename = shift;
1935     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1936     printf "Building $filename\n";
1937     printf OUTPUT "/* Unicode ctype tables */\n";
1938     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
1939     printf OUTPUT "#include \"windef.h\"\n\n";
1940
1941     my @table = map { ($_ || 0) & 0xffff; } @category_table;
1942
1943     # add the direction in the high 4 bits of the category
1944     for (my $i = 0; $i < 65536; $i++)
1945     {
1946         $table[$i] |= $directions{$direction_table[$i]} << 12 if defined $direction_table[$i];
1947     }
1948
1949     dump_simple_mapping( "wine_wctype_table", 0, @table );
1950
1951     close OUTPUT;
1952     save_file($filename);
1953 }
1954
1955
1956 sub rol($$)
1957 {
1958     my ($byte, $count) = @_;
1959     return (($byte << $count) | ($byte >> (8 - $count))) & 0xff;
1960 }
1961
1962 ################################################################
1963 # compress the character properties table
1964 sub compress_char_props_table($@)
1965 {
1966     my $rows = shift;
1967     my @table = @_;
1968     my $len = @table / $rows;
1969     my $pos = 0;
1970     my @array = (0) x $rows;
1971     my %sequences;
1972
1973     # add some predefined sequences
1974     foreach my $i (0, 0xfb .. 0xff) { $sequences{pack "L*", (rol($i,5)) x $len} = $i; }
1975
1976     # try to merge table rows
1977     for (my $row = 0; $row < $rows; $row++)
1978     {
1979         my @table_row = map { defined $_ ? $_ : 0x7f; } @table[($row * $len)..(($row + 1) * $len - 1)];
1980         my $rowtxt = pack "L*", @table_row;
1981         if (defined($sequences{$rowtxt}))
1982         {
1983             # reuse an existing row
1984             $array[$row] = $sequences{$rowtxt};
1985         }
1986         else
1987         {
1988             # create a new row
1989             $sequences{$rowtxt} = $array[$row] = ++$pos;
1990             push @array, @table_row;
1991         }
1992     }
1993     return @array;
1994 }
1995
1996 ################################################################
1997 # dump a normalization table in binary format
1998 sub dump_norm_table($)
1999 {
2000     my $filename = shift;
2001
2002     my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
2003     my %decomp = ( "nfc" => \@decomp_table,
2004                    "nfd" => \@decomp_table,
2005                    "nfkc" => \@decomp_compat_table,
2006                    "nfkd" => \@decomp_compat_table ,
2007                    "idna" => \@idna_decomp_table );
2008
2009     open OUTPUT,">$filename.new" or die "Cannot create $filename";
2010     print "Building $filename\n";
2011
2012     my $type = $filename;
2013     $type =~ s!.*/norm(\w+)\.nls!$1!;
2014
2015     my $compose = $forms{$type} & 1;
2016     my $compat = !!($forms{$type} & 4) + ($type eq "idna");
2017
2018     my @version = split /\./, $UNIVERSION;
2019
2020     # combining classes
2021
2022     my @classes;
2023     my @class_values;
2024
2025     foreach my $c (grep defined, @combining_class_table)
2026     {
2027         $classes[$c] = 1 if $c < 0x100;
2028     }
2029     for (my $i = 0; $i < @classes; $i++)
2030     {
2031         next unless defined $classes[$i];
2032         $classes[$i] = @class_values;
2033         push @class_values, $i;
2034     }
2035     push @class_values, 0 if (@class_values % 2);
2036     die "too many classes" if @class_values >= 0x40;
2037
2038     # character properties
2039
2040     my @char_props;
2041     my @decomposed;
2042     my @comp_hash_table;
2043     my $comp_hash_size = $compose ? 254 : 0;
2044
2045     for (my $i = 0; $i <= $MAX_CHAR; $i++)
2046     {
2047         next unless defined $combining_class_table[$i];
2048         if (defined $decomp{$type}->[$i])
2049         {
2050             my @dec = get_decomposition( $i, $decomp{$type} );
2051             if ($compose && (my @comp = get_composition( $i, $compat )))
2052             {
2053                 my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
2054                 push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );
2055
2056                 my $val = 0;
2057                 foreach my $d (@dec)
2058                 {
2059                     $val = $combining_class_table[$d];
2060                     last if $val;
2061                 }
2062                 $char_props[$i] = $classes[$val];
2063             }
2064             else
2065             {
2066                 $char_props[$i] = 0xbf;
2067             }
2068             @dec = compose_hangul( @dec ) if $compose;
2069             @dec = to_utf16( @dec );
2070             push @dec, 0 if @dec >= 7;
2071             $decomposed[$i] = \@dec;
2072         }
2073         else
2074         {
2075             if ($combining_class_table[$i] == 0x100)
2076             {
2077                 $char_props[$i] = 0x7f;
2078             }
2079             elsif ($combining_class_table[$i])
2080             {
2081                 $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
2082             }
2083             elsif ($type eq "idna" && defined $idna_disallowed[$i])
2084             {
2085                 $char_props[$i] = 0xff;
2086             }
2087             else
2088             {
2089                 $char_props[$i] = 0;
2090             }
2091         }
2092     }
2093
2094     if ($compose)
2095     {
2096         for (my $i = 0; $i <= $MAX_CHAR; $i++)
2097         {
2098             my @comp = get_composition( $i, $compat );
2099             next unless @comp;
2100             if ($combining_class_table[$comp[1]])
2101             {
2102                 $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
2103                 $char_props[$comp[1]] |= 0x40;
2104             }
2105             else
2106             {
2107                 $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
2108                 $char_props[$comp[1]] |= 0xc0;
2109             }
2110         }
2111     }
2112
2113     # surrogates
2114     foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
2115     foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }
2116
2117     # Hangul
2118     if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
2119     elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
2120     foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }
2121
2122     # invalid chars
2123     if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
2124     foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
2125     foreach my $i (0x00..0x10)
2126     {
2127         $char_props[($i << 16) | 0xfffe] = 0xff;
2128         $char_props[($i << 16) | 0xffff] = 0xff;
2129     }
2130
2131     # decomposition hash table
2132
2133     my @decomp_hash_table;
2134     my @decomp_hash_index;
2135     my @decomp_hash_data;
2136     my $decomp_hash_size = 944;
2137
2138     # build string of character data, reusing substrings when possible
2139     my $decomp_char_data = "";
2140     foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
2141     {
2142         my $str = pack "U*", @{$i};
2143         $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
2144     }
2145     for (my $i = 0; $i < @decomposed; $i++)
2146     {
2147         next unless defined $decomposed[$i];
2148         my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
2149         die "sequence not found" if $pos == -1;
2150         my $len = @{$decomposed[$i]};
2151         $len = 7 if $len > 7;
2152         my $hash = $i % $decomp_hash_size;
2153         push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
2154     }
2155     for (my $i = 0; $i < $decomp_hash_size; $i++)
2156     {
2157         $decomp_hash_index[$i] = @decomp_hash_data / 2;
2158         next unless defined $decomp_hash_table[$i];
2159         if (@{$decomp_hash_table[$i]} == 1)
2160         {
2161             my $entry = $decomp_hash_table[$i]->[0];
2162             if ($char_props[$entry->[0]] == 0xbf)
2163             {
2164                 $decomp_hash_index[$i] = $entry->[1];
2165                 next;
2166             }
2167         }
2168         foreach my $entry (@{$decomp_hash_table[$i]})
2169         {
2170             push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
2171         }
2172     }
2173     push @decomp_hash_data, 0, 0;
2174
2175     # composition hash table
2176
2177     my @comp_hash_index;
2178     my @comp_hash_data;
2179     if (@comp_hash_table)
2180     {
2181         for (my $i = 0; $i < $comp_hash_size; $i++)
2182         {
2183             $comp_hash_index[$i] = @comp_hash_data;
2184             push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
2185         }
2186         $comp_hash_index[$comp_hash_size] = @comp_hash_data;
2187         push @comp_hash_data, 0, 0, 0;
2188     }
2189
2190     my $level1 = ($MAX_CHAR + 1) / 128;
2191     my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );
2192
2193     my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
2194                    0, $decomp_hash_size, $comp_hash_size, 0 );
2195     my @tables = (0) x 8;
2196
2197     $tables[0] = 16 + @header + @tables;
2198     $tables[1] = $tables[0] + @class_values / 2;
2199     $tables[2] = $tables[1] + $level1 / 2;
2200     $tables[3] = $tables[2] + (@rows - $level1) / 2;
2201     $tables[4] = $tables[3] + @decomp_hash_index;
2202     $tables[5] = $tables[4] + @decomp_hash_data;
2203     $tables[6] = $tables[5] + length $decomp_char_data;
2204     $tables[7] = $tables[6] + @comp_hash_index;
2205
2206     print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
2207     print OUTPUT pack "S<*", @header;
2208     print OUTPUT pack "S<*", @tables;
2209     print OUTPUT pack "C*", @class_values;
2210
2211     print OUTPUT pack "C*", @rows[0..$level1-1];
2212     print OUTPUT pack "C*", @rows[$level1..$#rows];
2213     print OUTPUT pack "S<*", @decomp_hash_index;
2214     print OUTPUT pack "S<*", @decomp_hash_data;
2215     print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
2216     print OUTPUT pack "S<*", @comp_hash_index;
2217     print OUTPUT pack "S<*", @comp_hash_data;
2218
2219     close OUTPUT;
2220     save_file($filename);
2221
2222     add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
2223 }
2224
2225
2226 ################################################################
2227 # output a codepage definition file from the global tables
2228 sub output_codepage_file($)
2229 {
2230     my $codepage = shift;
2231
2232     my $output = sprintf "nls/c_%03d.nls", $codepage;
2233     open OUTPUT,">$output.new" or die "Cannot create $output";
2234
2235     printf "Building %s\n", $output;
2236     if (!@lead_bytes) { dump_binary_sbcs_table( $codepage ); }
2237     else { dump_binary_dbcs_table( $codepage ); }
2238
2239     close OUTPUT;
2240     save_file($output);
2241
2242     add_registry_value( "Codepage", sprintf( "%d", $codepage ), sprintf( "c_%03d.nls", $codepage ));
2243 }
2244
2245 ################################################################
2246 # output a codepage table from a Microsoft-style mapping file
2247 sub dump_msdata_codepage($)
2248 {
2249     my $filename = shift;
2250
2251     my $state = "";
2252     my ($codepage, $width, $count);
2253     my ($lb_cur, $lb_end);
2254
2255     @cp2uni = ();
2256     @glyph2uni = ();
2257     @lead_bytes = ();
2258     @uni2cp = ();
2259     $default_char = $DEF_CHAR;
2260     $default_wchar = $DEF_CHAR;
2261
2262     my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
2263
2264     while (<$INPUT>)
2265     {
2266         next if /^;/;  # skip comments
2267         next if /^\s*$/;  # skip empty lines
2268         next if /\x1a/;  # skip ^Z
2269         last if /^ENDCODEPAGE/;
2270
2271         if (/^CODEPAGE\s+(\d+)/)
2272         {
2273             $codepage = $1;
2274             next;
2275         }
2276         if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-f]+)\s+0x([0-9a-fA-F]+)/)
2277         {
2278             $width = $1;
2279             $default_char = hex $2;
2280             $default_wchar = hex $3;
2281             next;
2282         }
2283         if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
2284         {
2285             $state = $1;
2286             $count = $2;
2287             next;
2288         }
2289         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
2290         {
2291             if ($state eq "MBTABLE")
2292             {
2293                 my $cp = hex $1;
2294                 my $uni = hex $2;
2295                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2296                 next;
2297             }
2298             if ($state eq "GLYPHTABLE")
2299             {
2300                 my $cp = hex $1;
2301                 my $uni = hex $2;
2302                 $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
2303                 next;
2304             }
2305             if ($state eq "WCTABLE")
2306             {
2307                 my $uni = hex $1;
2308                 my $cp = hex $2;
2309                 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
2310                 next;
2311             }
2312             if ($state eq "DBCSRANGE")
2313             {
2314                 my $start = hex $1;
2315                 my $end = hex $2;
2316                 for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
2317                 $lb_cur = $start;
2318                 $lb_end = $end;
2319                 next;
2320             }
2321             if ($state eq "DBCSTABLE")
2322             {
2323                 my $mb = hex $1;
2324                 my $uni = hex $2;
2325                 my $cp = ($lb_cur << 8) | $mb;
2326                 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
2327                 if (!--$count)
2328                 {
2329                     if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
2330                 }
2331                 next;
2332             }
2333         }
2334         die "$filename: Unrecognized line $_\n";
2335     }
2336     close $INPUT;
2337
2338     output_codepage_file( $codepage );
2339 }
2340
2341 ################################################################
2342 # align a string length
2343 sub align_string($$)
2344 {
2345     my ($align, $str) = @_;
2346     $str .= pack "C*", (0) x ($align - length($str) % $align) if length($str) % $align;
2347     return $str;
2348 }
2349
2350 ################################################################
2351 # pack a GUID string
2352 sub pack_guid($)
2353 {
2354     $_ = shift;
2355     /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/;
2356     return pack "L<S<2C8", hex $1, hex $2, hex $3, hex $4, hex $5, hex $6, hex $7, hex $8, hex $9, hex $10, hex $11;
2357 }
2358
2359 ################################################################
2360 # comparison function for compression sort
2361 sub cmp_compression
2362 {
2363     return scalar @{$a} <=> scalar @{$b} ||
2364         $a->[4] <=> $b->[4] ||
2365         $a->[5] <=> $b->[5] ||
2366         $a->[6] <=> $b->[6] ||
2367         $a->[7] <=> $b->[7] ||
2368         $a->[8] <=> $b->[8] ||
2369         $a->[9] <=> $b->[9] ||
2370         $a->[10] <=> $b->[10] ||
2371         $a->[11] <=> $b->[11] ||
2372         $a->[12] <=> $b->[12];
2373 }
2374
2375 ################################################################
2376 # build a binary sort keys table
2377 sub dump_sortkey_table($$)
2378 {
2379     my ($filename, $download) = @_;
2380
2381     my @keys;
2382     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2383     my @multiple_weights;
2384     my @expansions;
2385     my @compressions;
2386     my %exceptions;
2387     my %guids;
2388     my %compr_flags;
2389     my %locales;
2390     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2391     my $jamostr = "";
2392
2393     my $re_hex = '0x[0-9A-Fa-f]+';
2394     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2395     $guids{$default_guid} = { };
2396
2397     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2398
2399     my $KEYS = open_data_file( $MSDATA, $download );
2400
2401     printf "Building $filename\n";
2402
2403     while (<$KEYS>)
2404     {
2405         s/\s*;.*$//;
2406         next if /^\s*$/;  # skip empty lines
2407         if (/^\s*(SORTKEY|SORTTABLES)/)
2408         {
2409             $part = $1;
2410             next;
2411         }
2412         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2413         {
2414             $part = $section = "";
2415             next;
2416         }
2417         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2418         {
2419             $section = $1;
2420             $guid = undef;
2421             next;
2422         }
2423         next unless $part;
2424         if ("$part.$section" eq "SORTKEY.DEFAULT")
2425         {
2426             if (/^\s*($re_hex)\s+$re_key/)
2427             {
2428                 $keys[hex $1] = [ split(/\s+/,$2) ];
2429                 next;
2430             }
2431         }
2432         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2433         {
2434             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2435             {
2436                 $version = hex $1;
2437                 next;
2438             }
2439             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2440             {
2441                 # ignore for now
2442                 next;
2443             }
2444         }
2445         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2446                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2447                "$part.$section" eq "SORTTABLES.INVERSECASING")
2448         {
2449             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2450             {
2451                 $guid = lc $1;
2452                 $guids{$guid} = { } unless defined $guids{$guid};
2453                 $guids{$guid}->{flags} |= $flags{$section};
2454                 next;
2455             }
2456             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2457             {
2458                 $locales{$1} = $guid;
2459                 next;
2460             }
2461         }
2462         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2463         {
2464             if (/^\s*(\d+)\s+(\d+)/)
2465             {
2466                 push @multiple_weights, $1, $2;
2467                 next;
2468             }
2469         }
2470         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2471         {
2472             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2473             {
2474                 my $pos = scalar @expansions / 2;
2475                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2476                 push @expansions, hex $2, hex $3;
2477                 next;
2478             }
2479         }
2480         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2481         {
2482             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2483             {
2484                 $keys[hex $1] = $keys[hex $2];
2485                 next;
2486             }
2487         }
2488         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2489         {
2490             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2491             {
2492                 if ($subsection || !$guid)  # start a new one
2493                 {
2494                     $guid = lc $1;
2495                     $subsection = "";
2496                     $guids{$guid} = { } unless defined $guids{$guid};
2497                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2498                     $guids{$guid}->{compr} = @compressions;
2499                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2500                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2501                     push @compressions, [ ];
2502                 }
2503                 else  # merge with current one
2504                 {
2505                     $guids{lc $1} = { } unless defined $guids{lc $1};
2506                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2507                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2508                     $compr_flags{lc $1} = $compr_flags{$guid};
2509                 }
2510                 next;
2511             }
2512             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2513             {
2514                 $locales{$1} = $guid;
2515                 next;
2516             }
2517             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2518             {
2519                 $subsection = $1;
2520                 next;
2521             }
2522             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2523             {
2524                 my @comp = map { hex $_; } split(/\s+/,$1);
2525                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2526                 # add compression flags
2527                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2528                 next;
2529             }
2530         }
2531         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2532         {
2533             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2534             {
2535                 $guid = lc $1;
2536                 $guids{$guid} = { } unless defined $guids{lc $1};
2537                 $ling_flag = ($2 ? "+" : "-");
2538                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2539                 next;
2540             }
2541             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2542             {
2543                 $locales{$1} = $guid;
2544                 next;
2545             }
2546             if (/^\s*($re_hex)\s+$re_key/)
2547             {
2548                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2549                 next;
2550             }
2551         }
2552         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2553         {
2554             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2555             {
2556                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2557                 next;
2558             }
2559         }
2560         die "$download: $part.$section: unrecognized line $_\n";
2561     }
2562     close $KEYS;
2563
2564     # Sortkey table
2565
2566     my $table;
2567     for (my $i = 0; $i < 0x10000; $i++)
2568     {
2569         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2570         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2571     }
2572
2573     foreach my $id (sort keys %exceptions)
2574     {
2575         my $pos = length($table) / 4;
2576         my @exc = @{$exceptions{$id}};
2577         my @filled;
2578         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2579         my $guid = substr( $id, 0, -1 );
2580         $guids{$guid}->{$key} = $pos;
2581         $pos += 0x100;
2582         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2583         for (my $j = 0; $j < 0x10000; $j++)
2584         {
2585             next unless defined $exc[$j] || defined $flags[$j];
2586             $filled[$j >> 8] = 1;
2587             $j |= 0xff;
2588         }
2589         for (my $j = 0; $j < 0x100; $j++)
2590         {
2591             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2592             $pos += 0x100 if $filled[$j];
2593         }
2594         for (my $j = 0; $j < 0x10000; $j++)
2595         {
2596             next unless $filled[$j >> 8];
2597             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2598             $k[3] |= $flags[$j] || 0;
2599             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2600         }
2601     }
2602
2603     # Case mapping tables
2604
2605     # standard table
2606     my @casemaps;
2607     my @upper = @toupper_table;
2608     my @lower = @tolower_table;
2609     remove_linguistic_mappings( \@upper, \@lower );
2610     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2611
2612     # linguistic table
2613     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2614
2615     # Turkish table
2616     @upper = @toupper_table;
2617     @lower = @tolower_table;
2618     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2619     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2620     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2621     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2622
2623     # Char type table
2624
2625     my @table;
2626     my $types = "";
2627     my %typestr;
2628     for (my $i = 0; $i < 0x10000; $i++)
2629     {
2630         my $str = pack "S<3",
2631             ($category_table[$i] || 0) & 0xffff,
2632             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2633             ($category_table[$i] || 0) >> 16;
2634
2635         if (!defined($typestr{$str}))
2636         {
2637             $typestr{$str} = length($types) / 6;
2638             $types .= $str;
2639         }
2640         $table[$i] = $typestr{$str};
2641     }
2642
2643     my @rows = compress_array( 4096, 0, @table[0..65535] );
2644     my @array = compress_array( 256, 0, @rows[0..4095] );
2645     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2646     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2647
2648     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2649     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2650     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2651
2652     # Sort tables
2653
2654     # guids
2655     my $sorttables = pack "L<2", $version, scalar %guids;
2656     foreach my $id (sort keys %guids)
2657     {
2658         my %guid = %{$guids{$id}};
2659         my $flags = $guid{flags} || 0;
2660         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2661         $sorttables .= pack_guid($id) . pack "L<5",
2662             $flags,
2663             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2664             $guid{except} || 0,
2665             $guid{ling_except} || 0,
2666             $map / 2;
2667     }
2668
2669     # expansions
2670     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2671
2672     # compressions
2673     $sorttables .= pack "L<", scalar @compressions;
2674     my $rowstr = "";
2675     foreach my $c (@compressions)
2676     {
2677         my $pos = length($rowstr) / 2;
2678         my $min = 0xffff;
2679         my $max = 0;
2680         my @lengths = (0) x 8;
2681         foreach my $r (sort cmp_compression @{$c})
2682         {
2683             my @row = @{$r};
2684             $lengths[scalar @row - 6]++;
2685             foreach my $val (@row[4..$#row])
2686             {
2687                 $min = $val if $min > $val;
2688                 $max = $val if $max < $val;
2689             }
2690             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2691             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2692         }
2693         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2694     }
2695     $sorttables .= $rowstr;
2696
2697     # multiple weights
2698     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2699
2700     # jamo sort
2701     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2702
2703     # Locales
2704
2705     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2706     foreach my $loc (sort keys %locales)
2707     {
2708         # skip specific locales that match more general ones
2709         my @parts = split /[-_]/, $loc;
2710         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2711         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2712         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2713     }
2714
2715     # File header
2716
2717     my @header;
2718     $header[0] = 16;
2719     $header[1] = $header[0] + length $table;
2720     $header[2] = $header[1] + length $casemaps;
2721     $header[3] = $header[2] + length $chartypes;
2722
2723     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2724     print OUTPUT pack "L<*", @header;
2725     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2726     close OUTPUT;
2727     save_file($filename);
2728 }
2729
2730
2731 ################################################################
2732 # build the script to create registry keys
2733 sub dump_registry_script($%)
2734 {
2735     my ($filename, %keys) = @_;
2736     my $indent = 1;
2737
2738     printf "Building %s\n", $filename;
2739     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2740     print OUTPUT "HKLM\n{\n";
2741     foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
2742     {
2743         printf OUTPUT "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
2744         $indent++;
2745     }
2746     foreach my $k (sort keys %keys)
2747     {
2748         my @subkeys = split /\\/, $k;
2749         my ($def, @vals) = @{$keys{$k}};
2750         for (my $i = 0; $i < @subkeys; $i++)
2751         {
2752             printf OUTPUT "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
2753                 $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
2754             $indent++;
2755         }
2756         foreach my $v (@vals) { printf OUTPUT "%*sval $v\n", 4 * $indent, ""; }
2757         for (my $i = 0; $i < @subkeys; $i++) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2758     }
2759     while ($indent) { printf OUTPUT "%*s}\n", 4 * --$indent, ""; }
2760     close OUTPUT;
2761     save_file($filename);
2762 }
2763
2764
2765 ################################################################
2766 # save a file if modified
2767 sub save_file($)
2768 {
2769     my $file = shift;
2770     if (-f $file && !system "cmp $file $file.new >/dev/null")
2771     {
2772         unlink "$file.new";
2773     }
2774     else
2775     {
2776         rename "$file.new", "$file";
2777     }
2778 }
2779
2780
2781 ################################################################
2782 # main routine
2783
2784 chdir ".." if -f "./make_unicode";
2785 load_data();
2786 dump_case_mappings( "libs/port/casemap.c" );
2787 dump_sortkeys( "dlls/kernelbase/collation.c" );
2788 dump_ctype_tables( "libs/port/wctype.c" );
2789 dump_bidi_dir_table( "dlls/gdi32/direction.c" );
2790 dump_bidi_dir_table( "dlls/usp10/direction.c" );
2791 dump_bidi_dir_table( "dlls/dwrite/direction.c" );
2792 dump_digit_folding( "dlls/kernelbase/digitmap.c" );
2793 dump_mirroring( "dlls/usp10/mirror.c" );
2794 dump_mirroring( "dlls/dwrite/mirror.c" );
2795 dump_bracket( "dlls/usp10/bracket.c" );
2796 dump_bracket( "dlls/dwrite/bracket.c" );
2797 dump_shaping( "dlls/usp10/shaping.c" );
2798 dump_linebreak( "dlls/usp10/linebreak.c" );
2799 dump_linebreak( "dlls/dwrite/linebreak.c" );
2800 dump_scripts( "dlls/dwrite/scripts" );
2801 dump_indic( "dlls/usp10/indicsyllable.c" );
2802 dump_vertical( "dlls/gdi32/vertical.c" );
2803 dump_vertical( "dlls/wineps.drv/vertical.c" );
2804 dump_intl_nls("nls/l_intl.nls");
2805 dump_norm_table( "nls/normnfc.nls" );
2806 dump_norm_table( "nls/normnfd.nls" );
2807 dump_norm_table( "nls/normnfkc.nls" );
2808 dump_norm_table( "nls/normnfkd.nls" );
2809 dump_norm_table( "nls/normidna.nls" );
2810 dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
2811 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
2812 dump_eucjp_codepage();
2813 dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );
2814
2815 exit 0;
2816
2817 # Local Variables:
2818 # compile-command: "./make_unicode"
2819 # End: