tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "13.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $REPORTS = "http://www.unicode.org/reports";
  30 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  31 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  32
  33 # Sort keys file
  34 my $SORTKEYS = "tr10/allkeys.txt";
  35
  36 # Default char for undefined mappings
  37 my $DEF_CHAR = ord '?';
  38
  39 # Last valid Unicode character
  40 my $MAX_CHAR = 0x10ffff;
  41
  42 my @allfiles =
  43 (
  44     "CodpageFiles/037.txt",
  45     "CodpageFiles/437.txt",
  46     "CodpageFiles/500.txt",
  47     "CodpageFiles/737.txt",
  48     "CodpageFiles/775.txt",
  49     "CodpageFiles/850.txt",
  50     "CodpageFiles/852.txt",
  51     "CodpageFiles/855.txt",
  52     "CodpageFiles/857.txt",
  53     "CodpageFiles/860.txt",
  54     "CodpageFiles/861.txt",
  55     "CodpageFiles/862.txt",
  56     "CodpageFiles/863.txt",
  57     "CodpageFiles/864.txt",
  58     "CodpageFiles/865.txt",
  59     "CodpageFiles/866.txt",
  60     "CodpageFiles/869.txt",
  61     "CodpageFiles/874.txt",
  62     "CodpageFiles/875.txt",
  63     "CodpageFiles/932.txt",
  64     "CodpageFiles/936.txt",
  65     "CodpageFiles/949.txt",
  66     "CodpageFiles/950.txt",
  67     "CodpageFiles/1026.txt",
  68     "CodpageFiles/1250.txt",
  69     "CodpageFiles/1251.txt",
  70     "CodpageFiles/1252.txt",
  71     "CodpageFiles/1253.txt",
  72     "CodpageFiles/1254.txt",
  73     "CodpageFiles/1255.txt",
  74     "CodpageFiles/1256.txt",
  75     "CodpageFiles/1257.txt",
  76     "CodpageFiles/1258.txt",
  77     "CodpageFiles/1361.txt",
  78     "CodpageFiles/10000.txt",
  79     "CodpageFiles/10001.txt",
  80     "CodpageFiles/10002.txt",
  81     "CodpageFiles/10003.txt",
  82     "CodpageFiles/10004.txt",
  83     "CodpageFiles/10005.txt",
  84     "CodpageFiles/10006.txt",
  85     "CodpageFiles/10007.txt",
  86     "CodpageFiles/10008.txt",
  87     "CodpageFiles/10010.txt",
  88     "CodpageFiles/10017.txt",
  89     "CodpageFiles/10021.txt",
  90     "CodpageFiles/10029.txt",
  91     "CodpageFiles/10079.txt",
  92     "CodpageFiles/10081.txt",
  93     "CodpageFiles/10082.txt",
  94     "CodpageFiles/20127.txt",
  95     "CodpageFiles/20866.txt",
  96     "CodpageFiles/21866.txt",
  97     "CodpageFiles/28591.txt",
  98     "CodpageFiles/28592.txt",
  99     "CodpageFiles/28593.txt",
 100     "CodpageFiles/28594.txt",
 101     "CodpageFiles/28595.txt",
 102     "CodpageFiles/28596.txt",
 103     "CodpageFiles/28597.txt",
 104     "CodpageFiles/28598.txt",
 105     "CodpageFiles/28599.txt",
 106     "CodpageFiles/28603.txt",
 107     "CodpageFiles/28605.txt",
 108 );
 109
 110
 111 my %ctype =
 112 (
 113      # CT_CTYPE1
 114     "upper"  => 0x0001,
 115     "lower"  => 0x0002,
 116     "digit"  => 0x0004,
 117     "space"  => 0x0008,
 118     "punct"  => 0x0010,
 119     "cntrl"  => 0x0020,
 120     "blank"  => 0x0040,
 121     "xdigit" => 0x0080,
 122     "alpha"  => 0x0100 | 0x80000000,
 123     "defin"  => 0x0200,
 124      # CT_CTYPE3 in high 16 bits
 125     "nonspacing"    => 0x00010000,
 126     "diacritic"     => 0x00020000,
 127     "vowelmark"     => 0x00040000,
 128     "symbol"        => 0x00080000,
 129     "katakana"      => 0x00100000,
 130     "hiragana"      => 0x00200000,
 131     "halfwidth"     => 0x00400000,
 132     "fullwidth"     => 0x00800000,
 133     "ideograph"     => 0x01000000,
 134     "kashida"       => 0x02000000,
 135     "lexical"       => 0x04000000,
 136     "highsurrogate" => 0x08000000,
 137     "lowsurrogate"  => 0x10000000,
 138 );
 139
 140 my %bracket_types =
 141 (
 142     "o" => 0x0000,
 143     "c" => 0x0001,
 144 );
 145
 146 my %indic_types =
 147 (
 148     "Other"    => 0x0000,
 149     "Bindu"    => 0x0001,
 150     "Visarga"  => 0x0002,
 151     "Avagraha" => 0x0003,
 152     "Nukta"    => 0x0004,
 153     "Virama"   => 0x0005,
 154     "Vowel_Independent"  => 0x0006,
 155     "Vowel_Dependent"  => 0x0007,
 156     "Vowel"  => 0x0008,
 157     "Consonant_Placeholder"  => 0x0009,
 158     "Consonant"  => 0x000a,
 159     "Consonant_Dead"  => 0x000b,
 160     "Consonant_Succeeding_Repha" => 0x000c,
 161     "Consonant_Subjoined"  => 0x000d,
 162     "Consonant_Medial"  => 0x000e,
 163     "Consonant_Final"  => 0x000f,
 164     "Consonant_Head_Letter"  => 0x0010,
 165     "Modifying_Letter"  => 0x0011,
 166     "Tone_Letter"  => 0x0012,
 167     "Tone_Mark"  => 0x0013,
 168     "Register_Shifter"  => 0x0014,
 169     "Consonant_Preceding_Repha" => 0x0015,
 170     "Pure_Killer" => 0x0016,
 171     "Invisible_Stacker" => 0x0017,
 172     "Gemination_Mark" => 0x0018,
 173     "Cantillation_Mark" => 0x0019,
 174     "Non_Joiner" => 0x001a,
 175     "Joiner" => 0x001b,
 176     "Number_Joiner" => 0x001c,
 177     "Number" => 0x001d,
 178     "Brahmi_Joining_Number" => 0x001e,
 179     "Consonant_With_Stacker" => 0x001f,
 180     "Consonant_Prefixed" => 0x0020,
 181     "Syllable_Modifier" => 0x0021,
 182     "Consonant_Killer" => 0x0022,
 183     "Consonant_Initial_Postfixed" => 0x0023,
 184 );
 185
 186 my %matra_types =
 187 (
 188     "Right"    => 0x01,
 189     "Left"  => 0x02,
 190     "Visual_Order_Left" => 0x03,
 191     "Left_And_Right"    => 0x04,
 192     "Top"   => 0x05,
 193     "Bottom"  => 0x06,
 194     "Top_And_Bottom"  => 0x07,
 195     "Top_And_Right"  => 0x08,
 196     "Top_And_Left"  => 0x09,
 197     "Top_And_Left_And_Right"  => 0x0a,
 198     "Bottom_And_Right"  => 0x0b,
 199     "Top_And_Bottom_And_Right"  => 0x0c,
 200     "Overstruck"  => 0x0d,
 201     "Invisible"  => 0x0e,
 202     "Bottom_And_Left"  => 0x0f,
 203     "Top_And_Bottom_And_Left"  => 0x10,
 204 );
 205
 206 my %break_types =
 207 (
 208     "BK"  => 0x0001,
 209     "CR"  => 0x0002,
 210     "LF"  => 0x0003,
 211     "CM"  => 0x0004,
 212     "SG"  => 0x0005,
 213     "GL"  => 0x0006,
 214     "CB"  => 0x0007,
 215     "SP"  => 0x0008,
 216     "ZW"  => 0x0009,
 217     "NL"  => 0x000a,
 218     "WJ"  => 0x000b,
 219     "JL"  => 0x000c,
 220     "JV"  => 0x000d,
 221     "JT"  => 0x000e,
 222     "H2"  => 0x000f,
 223     "H3"  => 0x0010,
 224     "XX"  => 0x0011,
 225     "OP"  => 0x0012,
 226     "CL"  => 0x0013,
 227     "CP"  => 0x0014,
 228     "QU"  => 0x0015,
 229     "NS"  => 0x0016,
 230     "EX"  => 0x0017,
 231     "SY"  => 0x0018,
 232     "IS"  => 0x0019,
 233     "PR"  => 0x001a,
 234     "PO"  => 0x001b,
 235     "NU"  => 0x001c,
 236     "AL"  => 0x001d,
 237     "ID"  => 0x001e,
 238     "IN"  => 0x001f,
 239     "HY"  => 0x0020,
 240     "BB"  => 0x0021,
 241     "BA"  => 0x0022,
 242     "SA"  => 0x0023,
 243     "AI"  => 0x0024,
 244     "B2"  => 0x0025,
 245     "HL"  => 0x0026,
 246     "CJ"  => 0x0027,
 247     "RI"  => 0x0028,
 248     "EB"  => 0x0029,
 249     "EM"  => 0x002a,
 250     "ZWJ" => 0x002b,
 251 );
 252
 253 my %vertical_types =
 254 (
 255     "R"  => 0x0000,
 256     "U"  => 0x0001,
 257     "Tr" => 0x0002,
 258     "Tu" => 0x0003,
 259 );
 260
 261 my %categories =
 262 (
 263     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 264     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 265     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 266     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 267     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 268     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 269     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 270     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 271     "No" => $ctype{"defin"},                    # Number, Other
 272     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 273     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 274     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 275     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 276     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 277     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 278     "Co" => $ctype{"defin"},                    # Other, Private Use
 279     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 280     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 281     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 282     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 283     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 284     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 285     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 286     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 287     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 288     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 289     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 290     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 291     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 292     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 293 );
 294
 295 # a few characters need additional categories that cannot be determined automatically
 296 my %special_categories =
 297 (
 298     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 299                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 300     "space"  => [ 0x09..0x0d, 0x85 ],
 301     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 302     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 303                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 304                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 305                   0xfff9, 0xfffa, 0xfffb ],
 306     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 307                   0xd7, 0xf7 ],
 308     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 309     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 310     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 311                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 312     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 313     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 314                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 315                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 316     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 317     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 318                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 319                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 320                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 321                      0x3131..0x3164 ],
 322     "ideograph" => [ 0x3006..0x3007 ],
 323     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 324                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 325                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 326                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 327                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 328                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 329                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 330                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 331                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 332                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 333     "kashida" => [ 0x0640 ],
 334 );
 335
 336 my %directions =
 337 (
 338     "L"   => 1,    # Left-to-Right
 339     "R"   => 2,    # Right-to-Left
 340     "AL"  => 12,   # Right-to-Left Arabic
 341     "EN"  => 3,    # European Number
 342     "ES"  => 4,    # European Number Separator
 343     "ET"  => 5,    # European Number Terminator
 344     "AN"  => 6,    # Arabic Number
 345     "CS"  => 7,    # Common Number Separator
 346     "NSM" => 13,   # Non-Spacing Mark
 347     "BN"  => 14,   # Boundary Neutral
 348     "B"   => 8,    # Paragraph Separator
 349     "S"   => 9,    # Segment Separator
 350     "WS"  => 10,   # Whitespace
 351     "ON"  => 11,   # Other Neutrals
 352     "LRE" => 15,   # Left-to-Right Embedding
 353     "LRO" => 15,   # Left-to-Right Override
 354     "RLE" => 15,   # Right-to-Left Embedding
 355     "RLO" => 15,   # Right-to-Left Override
 356     "PDF" => 15,   # Pop Directional Format
 357     "LRI" => 15,   # Left-to-Right Isolate
 358     "RLI" => 15,   # Right-to-Left Isolate
 359     "FSI" => 15,   # First Strong Isolate
 360     "PDI" => 15    # Pop Directional Isolate
 361 );
 362
 363 my %c2_types =
 364 (
 365     "L"   => 1,    # C2_LEFTTORIGHT
 366     "R"   => 2,    # C2_RIGHTTOLEFT
 367     "AL"  => 2,    # C2_RIGHTTOLEFT
 368     "EN"  => 3,    # C2_EUROPENUMBER
 369     "ES"  => 4,    # C2_EUROPESEPARATOR
 370     "ET"  => 5,    # C2_EUROPETERMINATOR
 371     "AN"  => 6,    # C2_ARABICNUMBER
 372     "CS"  => 7,    # C2_COMMONSEPARATOR
 373     "NSM" => 11,   # C2_OTHERNEUTRAL
 374     "BN"  => 0,    # C2_NOTAPPLICABLE
 375     "B"   => 8,    # C2_BLOCKSEPARATOR
 376     "S"   => 9,    # C2_SEGMENTSEPARATOR
 377     "WS"  => 10,   # C2_WHITESPACE
 378     "ON"  => 11,   # C2_OTHERNEUTRAL
 379     "LRE" => 11,   # C2_OTHERNEUTRAL
 380     "LRO" => 11,   # C2_OTHERNEUTRAL
 381     "RLE" => 11,   # C2_OTHERNEUTRAL
 382     "RLO" => 11,   # C2_OTHERNEUTRAL
 383     "PDF" => 11,   # C2_OTHERNEUTRAL
 384     "LRI" => 11,   # C2_OTHERNEUTRAL
 385     "RLI" => 11,   # C2_OTHERNEUTRAL
 386     "FSI" => 11,   # C2_OTHERNEUTRAL
 387     "PDI" => 11    # C2_OTHERNEUTRAL
 388 );
 389
 390 my %bidi_types =
 391 (
 392     "ON"  => 0,    # Other Neutrals
 393     "L"   => 1,    # Left-to-Right
 394     "R"   => 2,    # Right-to-Left
 395     "AN"  => 3,    # Arabic Number
 396     "EN"  => 4,    # European Number
 397     "AL"  => 5,    # Right-to-Left Arabic
 398     "NSM" => 6,    # Non-Spacing Mark
 399     "CS"  => 7,    # Common Number Separator
 400     "ES"  => 8,    # European Number Separator
 401     "ET"  => 9,    # European Number Terminator
 402     "BN"  => 10,   # Boundary Neutral
 403     "S"   => 11,   # Segment Separator
 404     "WS"  => 12,   # Whitespace
 405     "B"   => 13,   # Paragraph Separator
 406     "RLO" => 14,   # Right-to-Left Override
 407     "RLE" => 15,   # Right-to-Left Embedding
 408     "LRO" => 16,   # Left-to-Right Override
 409     "LRE" => 17,   # Left-to-Right Embedding
 410     "PDF" => 18,   # Pop Directional Format
 411     "LRI" => 19,   # Left-to-Right Isolate
 412     "RLI" => 20,   # Right-to-Left Isolate
 413     "FSI" => 21,   # First Strong Isolate
 414     "PDI" => 22    # Pop Directional Isolate
 415 );
 416
 417 my %joining_types =
 418 (
 419    "U" => 0,    # Non_Joining
 420    "T" => 1,    # Transparent
 421    "R" => 2,    # Right_Joining
 422    "L" => 3,    # Left_Joining
 423    "D" => 4,    # Dual_Joining
 424    "C" => 5,    # Join_Causing
 425 );
 426
 427 my @cp2uni = ();
 428 my @glyph2uni = ();
 429 my @lead_bytes = ();
 430 my @uni2cp = ();
 431 my @tolower_table = ();
 432 my @toupper_table = ();
 433 my @digitmap_table = ();
 434 my @category_table = ();
 435 my @joining_table = ();
 436 my @direction_table = ();
 437 my @decomp_table = ();
 438 my @combining_class_table = ();
 439 my @decomp_compat_table = ();
 440 my @comp_exclusions = ();
 441 my @idna_decomp_table = ();
 442 my @idna_disallowed = ();
 443 my %registry_keys;
 444 my $default_char;
 445 my $default_wchar;
 446
 447 my %joining_forms =
 448 (
 449    "isolated" => [],
 450    "final" => [],
 451    "initial" => [],
 452    "medial" => []
 453 );
 454
 455 sub to_utf16(@)
 456 {
 457     my @ret;
 458     foreach my $ch (@_)
 459     {
 460         if ($ch < 0x10000)
 461         {
 462             push @ret, $ch;
 463         }
 464         else
 465         {
 466             my $val = $ch - 0x10000;
 467             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 468         }
 469     }
 470     return @ret;
 471 }
 472
 473 ################################################################
 474 # fetch a unicode.org file and open it
 475 sub open_data_file($$)
 476 {
 477     my ($base, $name) = @_;
 478     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 479     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 480     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 481     local *FILE;
 482
 483     if ($base =~ /.*\/([^\/]+)\.zip$/)
 484     {
 485         my $zip = "$1$suffix.zip";
 486         unless (-f "$cache/$zip")
 487         {
 488             system "mkdir", "-p", $cache;
 489             print "Fetching $base...\n";
 490             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 491         }
 492         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 493     }
 494     else
 495     {
 496         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 497         unless (-f $dest)
 498         {
 499             system "mkdir", "-p", $dir;
 500             print "Fetching $base/$name...\n";
 501             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 502         }
 503         open FILE, "<$dest" or die "cannot open $dest";
 504     }
 505     return *FILE;
 506 }
 507
 508 ################################################################
 509 # recursively get the decomposition for a character
 510 sub get_decomposition($$);
 511 sub get_decomposition($$)
 512 {
 513     my ($char, $table) = @_;
 514     my @ret;
 515
 516     return $char unless defined $table->[$char];
 517     foreach my $ch (@{$table->[$char]})
 518     {
 519         push @ret, get_decomposition( $ch, $table );
 520     }
 521     return @ret;
 522 }
 523
 524 ################################################################
 525 # get the composition that results in a given character
 526 sub get_composition($$)
 527 {
 528     my ($ch, $compat) = @_;
 529     return () unless defined $decomp_table[$ch];  # no decomposition
 530     my @ret = @{$decomp_table[$ch]};
 531     return () if @ret < 2;                        # singleton decomposition
 532     return () if $comp_exclusions[$ch];           # composition exclusion
 533     return () if $combining_class_table[$ch];     # non-starter
 534     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 535     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 536         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 537     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 538         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 539     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 540         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 541     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 542     return @ret;
 543 }
 544
 545 ################################################################
 546 # recursively build decompositions
 547 sub build_decompositions(@)
 548 {
 549     my @src = @_;
 550     my @dst;
 551
 552     for (my $i = 0; $i < @src; $i++)
 553     {
 554         next unless defined $src[$i];
 555         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 556         $dst[$i] = \@decomp;
 557     }
 558     return @dst;
 559 }
 560
 561 ################################################################
 562 # compose Hangul sequences
 563 sub compose_hangul(@)
 564 {
 565     my $SBASE  = 0xac00;
 566     my $LBASE  = 0x1100;
 567     my $VBASE  = 0x1161;
 568     my $TBASE  = 0x11a7;
 569     my $LCOUNT = 19;
 570     my $VCOUNT = 21;
 571     my $TCOUNT = 28;
 572     my $NCOUNT = $VCOUNT * $TCOUNT;
 573     my $SCOUNT = $LCOUNT * $NCOUNT;
 574
 575     my @seq = @_;
 576     my @ret;
 577     my $i;
 578
 579     for ($i = 0; $i < @seq; $i++)
 580     {
 581         my $ch = $seq[$i];
 582         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 583             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 584         {
 585             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 586             $i++;
 587         }
 588         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 589             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 590         {
 591             $ch += $seq[$i+1] - $TBASE;
 592             $i++;
 593         }
 594         push @ret, $ch;
 595     }
 596     return @ret;
 597 }
 598
 599 ################################################################
 600 # remove linguistic-only mappings from the case table
 601 sub remove_linguistic_mappings($$)
 602 {
 603     my ($upper, $lower) = @_;
 604
 605     # remove case mappings that don't round-trip
 606
 607     for (my $i = 0; $i < @{$upper}; $i++)
 608     {
 609         next unless defined ${$upper}[$i];
 610         my $ch = ${$upper}[$i];
 611         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 612     }
 613     for (my $i = 0; $i < @{$lower}; $i++)
 614     {
 615         next unless defined ${$lower}[$i];
 616         my $ch = ${$lower}[$i];
 617         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 618     }
 619 }
 620
################################################################
# read in the Unicode database files
#
# Parses UnicodeData.txt and CompositionExclusions.txt (from $UNIDATA) and
# IdnaMappingTable.txt (from $IDNADATA) and fills the file-level tables:
# category, direction, joining, case mapping, digit, combining class,
# canonical/compatibility/IDNA decomposition, composition exclusion and
# IDNA disallowed tables, plus %joining_forms.  Takes no arguments; dies on
# an unknown general category or bidi class.
sub load_data()
{
    # first code point of the "<..., First>" / "<..., Last>" range being read
    my $start;

    # now build mappings from the decomposition field of the Unicode database

    my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
    while (<$UNICODE_DATA>)
    {
        # Decode the fields ...
        # (the line is not chomped, so the last field keeps its newline)
        my ($code, $name, $cat, $comb, $bidi,
            $decomp, $dec, $dig, $num, $mirror,
            $oldname, $comment, $upper, $lower, $title) = split /;/;
        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $category_table[$src] = $categories{$cat};
        # the direction table stores the bidi class name, not a numeric code
        $direction_table[$src] = $bidi;
        # marks and format characters start out as joining type Transparent
        $joining_table[$src] = $joining_types{"T"} if $cat eq "Mn" || $cat eq "Me" || $cat eq "Cf";

        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            # stores the character code of the first character of the field
            # (e.g. 0x31 for "1"), not its numeric value
            $digitmap_table[$src] = ord $dig;
        }
        # private use characters get the out-of-range class 0x100
        $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use

        # additional flags derived from the bidi class and the character name
        $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
        # NOTE(review): the alternation binds looser than ^ and \W, so this
        # matches names that start with COMBINING, or that contain
        # "MODIFIER LETTER" followed by a non-word character anywhere.
        # /^(COMBINING|MODIFIER LETTER)\W/ may have been intended; confirm
        # before changing, since it alters the generated tables.
        $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
        $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
        $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
        $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
        # a name containing the separate word KANA sets both kana flags
        $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
        $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
        $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
        $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
        $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
        $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
        $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;

        # copy the category and direction for everything between First/Last pairs
        # (the combining class is copied as well; the values are taken from
        # the Last entry, and $start is left equal to $src afterwards)
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $combining_class_table[$start] = $combining_class_table[$src];
                $start++;
            }
        }

        next if $decomp eq "";  # no decomposition, skip it

        # tagged ("<tag> ...") decomposition: record the whole sequence after
        # the tag as the compatibility decomposition
        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
        {
            my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
            $decomp_compat_table[$src] = \@seq;
        }

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
            {
                # index: base character, value: its presentation form
                ${joining_forms{$1}}[hex $2] = $src;
            }
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            # (nothing more to do: the sequence was stored in
            # @decomp_compat_table above)
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # store decomposition
            # (untagged, i.e. canonical; it is entered in both tables, and
            # only sequences of one or two characters are recognized)
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
            }
            elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
            {
                # Single char decomposition
                $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
            }
        }
    }
    close $UNICODE_DATA;

    # patch the category of some special characters

    # a character with a canonical decomposition inherits all flags of the
    # first character of that decomposition
    for (my $i = 0; $i < @decomp_table; $i++)
    {
        next unless defined $decomp_table[$i];
        $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
    }
    # add the hand-maintained flags from %special_categories
    foreach my $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
    # a two-character decomposition passes the diacritic flag of its second
    # character on to the composed character
    for (my $i = 0; $i < @decomp_compat_table; $i++)
    {
        next unless defined $decomp_compat_table[$i];
        next unless @{$decomp_compat_table[$i]} == 2;
        $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
    }

    # load the composition exclusions

    my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
    while (<$EXCL>)
    {
        s/\#.*//;  # remove comments
        # a line holds either a "first..last" range or a single code point
        if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
        {
            foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
        }
        elsif (/^([0-9a-fA-F]+)\s*$/)
        {
            $comp_exclusions[hex $1] = 1;
        }
    }
    close $EXCL;

    # load the IDNA mappings

    # start from the compatibility decompositions (the per-character
    # sequences are shared between both tables, not copied) and override
    # them with the entries of the mapping table
    @idna_decomp_table = @decomp_compat_table;
    my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
    while (<$IDNA>)
    {
        s/\#.*//;  # remove comments
        next if /^\s*$/;
        my ($char, $type, $mapping) = split /;/;
        # first and last code point the line applies to
        my ($ch1, $ch2);
        if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
        {
            $ch1 = hex $1;
            $ch2 = hex $2;
        }
        elsif ($char =~ /([0-9a-fA-F]+)/)
        {
            $ch1 = $ch2 = hex $1;
        }

        # The order of the tests matters: the types are matched as
        # substrings, so a type such as disallowed_STD3_mapped or
        # disallowed_STD3_valid is caught by the mapped/valid branches
        # before the disallowed branch is reached.
        if ($type =~ /mapped/ || $type =~ /deviation/)
        {
            $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
            my @seq = map { hex $_; } split /\s+/, $mapping;
            # an empty mapping is stored as [ 0 ]; all characters of a range
            # share the same sequence
            foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
        }
        elsif ($type =~ /valid/)
        {
            # valid characters keep the entry copied from the compatibility table
        }
        elsif ($type =~ /ignored/)
        {
            foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
        }
        elsif ($type =~ /disallowed/)
        {
            foreach my $i ($ch1 .. $ch2)
            {
                $idna_decomp_table[$i] = undef;
                $idna_disallowed[$i] = 1;
            }
        }
    }
    close $IDNA;
}
 805
 806
 807 ################################################################
 808 # add a new registry key
 809 sub add_registry_key($$)
 810 {
 811     my ($key, $defval) = @_;
 812     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 813 }
 814
 815 ################################################################
 816 # add a new registry value
 817 sub add_registry_value($$$)
 818 {
 819     my ($key, $name, $value) = @_;
 820     add_registry_key( $key, undef );
 821     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 822 }
 823
 824 ################################################################
 825 # define a new lead byte
 826 sub add_lead_byte($)
 827 {
 828     my $ch = shift;
 829     return if defined $cp2uni[$ch];
 830     push @lead_bytes, $ch;
 831     $cp2uni[$ch] = 0;
 832 }
 833
 834 ################################################################
 835 # define a new char mapping
 836 sub add_mapping($$)
 837 {
 838     my ($cp, $uni) = @_;
 839     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 840     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 841     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 842 }
 843
 844 ################################################################
 845 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 846 sub get_glyphs_mapping(@)
 847 {
 848     my @table = @_;
 849
 850     for (my $i = 0; $i < @glyph2uni; $i++)
 851     {
 852         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 853     }
 854     return @table;
 855 }
 856
 857 ################################################################
 858 # build EUC-JP table from the JIS 0208/0212 files
 859 sub dump_eucjp_codepage()
 860 {
 861     @cp2uni = ();
 862     @glyph2uni = ();
 863     @lead_bytes = ();
 864     @uni2cp = ();
 865     $default_char = $DEF_CHAR;
 866     $default_wchar = 0x30fb;
 867
 868     # ASCII chars
 869     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 870
 871     # lead bytes
 872     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 873
 874     # JIS X 0201 right plane
 875     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 876
 877     # undefined chars
 878     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 879     $cp2uni[0xa0] = 0xf8f0;
 880     $cp2uni[0xff] = 0xf8f3;
 881
 882     # Fix backslash conversion
 883     add_mapping( 0xa1c0, 0xff3c );
 884
 885     # Add private mappings for rows undefined in JIS 0208/0212
 886     my $private = 0xe000;
 887     foreach my $hi (0xf5 .. 0xfe)
 888     {
 889         foreach my $lo (0xa1 .. 0xfe)
 890         {
 891             add_mapping( ($hi << 8) + $lo, $private++ );
 892         }
 893     }
 894     foreach my $hi (0xf5 .. 0xfe)
 895     {
 896         foreach my $lo (0x21 .. 0x7e)
 897         {
 898             add_mapping( ($hi << 8) + $lo, $private++ );
 899         }
 900     }
 901
 902     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 903     while (<$INPUT>)
 904     {
 905         next if /^\#/;  # skip comments
 906         next if /^$/;  # skip empty lines
 907         next if /\x1a/;  # skip ^Z
 908         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 909         {
 910             add_mapping( 0x8080 + hex $1, hex $2 );
 911             next;
 912         }
 913         die "Unrecognized line $_\n";
 914     }
 915     close $INPUT;
 916
 917     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 918     while (<$INPUT>)
 919     {
 920         next if /^\#/;  # skip comments
 921         next if /^$/;  # skip empty lines
 922         next if /\x1a/;  # skip ^Z
 923         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 924         {
 925             add_mapping( 0x8000 + hex $1, hex $2 );
 926             next;
 927         }
 928         die "Unrecognized line $_\n";
 929     }
 930     close $INPUT;
 931
 932     output_codepage_file( 20932 );
 933 }
 934
 935
 936 ################################################################
 937 # build the sort keys table
 938 sub dump_sortkeys($)
 939 {
 940     my $filename = shift;
 941     my @sortkeys = ();
 942
 943     my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
 944     while (<$INPUT>)
 945     {
 946         next if /^\#/;  # skip comments
 947         next if /^$/;  # skip empty lines
 948         next if /\x1a/;  # skip ^Z
 949         next if /^\@version/;  # skip @version header
 950         if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 951         {
 952             my ($uni,$variable) = (hex $1, $2);
 953             next if $uni > 65535;
 954             $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
 955             next;
 956         }
 957         if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
 958         {
 959             # multiple character sequence, ignored for now
 960             next;
 961         }
 962         die "$SORTKEYS: Unrecognized line $_\n";
 963     }
 964     close $INPUT;
 965
 966     # compress the keys to 32 bit:
 967     # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit
 968
 969     @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
 970                        ${$a}[2] <=> ${$b}[2] or
 971                        ${$a}[3] <=> ${$b}[3] or
 972                        ${$a}[4] <=> ${$b}[4] or
 973                        $a cmp $b; } @sortkeys;
 974
 975     my ($n2, $n3) = (1, 1);
 976     my @keys = (-1, -1, -1, -1, -1 );
 977     my @flatkeys = ();
 978
 979     for (my $i = 0; $i < @sortkeys; $i++)
 980     {
 981         next unless defined $sortkeys[$i];
 982         my @current = @{$sortkeys[$i]};
 983         if ($current[1] == $keys[1])
 984         {
 985             if ($current[2] == $keys[2])
 986             {
 987                 if ($current[3] == $keys[3])
 988                 {
 989                     # nothing
 990                 }
 991                 else
 992                 {
 993                     $keys[3] = $current[3];
 994                     $n3++;
 995                     die if ($n3 >= 16);
 996                 }
 997             }
 998             else
 999             {
1000                 $keys[2] = $current[2];
1001                 $keys[3] = $current[3];
1002                 $n2++;
1003                 $n3 = 1;
1004                 die if ($n2 >= 256);
1005             }
1006         }
1007         else
1008         {
1009             $keys[1] = $current[1];
1010             $keys[2] = $current[2];
1011             $keys[3] = $current[3];
1012             $n2 = 1;
1013             $n3 = 1;
1014         }
1015
1016         if ($current[2]) { $current[2] = $n2; }
1017         if ($current[3]) { $current[3] = $n3; }
1018         if ($current[4]) { $current[4] = 1; }
1019
1020         $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
1021     }
1022
1023     open OUTPUT,">$filename.new" or die "Cannot create $filename";
1024     printf "Building $filename\n";
1025     printf OUTPUT "/* Unicode collation element table */\n";
1026     printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
1027     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
1028     print OUTPUT "#include \"windef.h\"\n\n";
1029
1030     dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );
1031
1032     close OUTPUT;
1033     save_file($filename);
1034 }
1035
1036
################################################################
# dump an array of integers
#
# Returns the values as comma-separated hexadecimal literals, eight
# per line, each line indented by four spaces.  Every literal is
# zero-padded to $bit_width / 4 digits and undefined elements are
# printed as $default.  There is no trailing comma or newline.  An
# empty array yields a single $default literal.
sub dump_array($$@)
{
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;

    # at least one element is always emitted, even for an empty list
    my $last = $#array > 0 ? $#array : 0;

    my $ret = "    ";
    foreach my $idx (0 .. $last)
    {
        my $value = defined $array[$idx] ? $array[$idx] : $default;
        $ret .= sprintf($format, $value);
        next if $idx == $last;
        $ret .= ($idx % 8 == 7) ? ",\n    " : ", ";
    }
    return $ret;
}
1053
1054
################################################################
# dump an SBCS mapping table in binary format
#
# Writes to the OUTPUT handle, in this order: seven little-endian
# 16-bit header values, 12 zero bytes, the offset of the
# Unicode->byte table followed by the 256-entry byte->Unicode table,
# an optional 256-entry glyph table (count word 256, or a single 0
# when there are no glyph mappings), two zero words, and finally one
# byte for each of the 65536 BMP characters.
sub dump_binary_sbcs_table($)
{
    my ($codepage) = @_;

    # the Unicode->byte table starts 256 words later when a glyph table is present
    my $wc_offset = 256 + 3;
    $wc_offset += 256 if @glyph2uni;

    print OUTPUT pack( "S<*",
                       13, $codepage, 1, $default_char, $default_wchar,
                       $cp2uni[$default_char], $uni2cp[$default_wchar] );
    print OUTPUT pack( "C12", (0) x 12 );

    # unmapped bytes are written as 0
    my @to_unicode = map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack( "S<*", $wc_offset, @to_unicode );

    my @glyph_section = @glyph2uni ? ( 256, get_glyphs_mapping(@cp2uni[0 .. 255]) ) : ( 0 );
    print OUTPUT pack( "S<*", @glyph_section );

    print OUTPUT pack( "S<*", 0, 0 );

    # unmapped characters are written as the default byte
    my @to_byte = map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
    print OUTPUT pack( "C*", @to_byte );
}
1081
1082
################################################################
# dump a DBCS mapping table in binary format
#
# Writes to the OUTPUT handle, in this order: seven little-endian
# 16-bit header values, a 12-byte lead byte range field, the offset
# of the Unicode->multibyte table followed by the 256-entry
# single-byte table, a zero word, the number of lead byte ranges and
# the 256 per-byte table offsets, one 256-entry table per lead byte,
# the word 4, and finally one 16-bit value for each of the 65536 BMP
# characters.
#
# Side effect: the single-byte slot of every lead byte in @cp2uni is
# reset to 0.
sub dump_binary_dbcs_table($)
{
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own 256-entry table; offsets are in words
    # and offset 0 means "not a lead byte"
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    # The range field is always exactly 12 bytes: the ranges, padded with
    # zeros.  Note the list repetition "(0) x 12"; a bare "0 x 12" would
    # build the single string "000000000000" instead of twelve zeros.
    my @lb_field = ( @lb_ranges, (0) x 12 )[0 .. 11];

    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "C12", @lb_field;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        # unmapped trail bytes are written as the default Unicode char
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    # unmapped characters are written as the default char
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
1115
1116
################################################################
# get the list of defined lead byte ranges
#
# Returns a flat list of (first, last) pairs, one pair per run of
# consecutive values in @lead_bytes, in ascending order.  Only the
# values 0..255 are considered.
sub get_lb_ranges()
{
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @ranges;
    my $inside = 0;  # true while scanning through a run of lead bytes

    foreach my $byte (0 .. 255)
    {
        my $lead = exists $is_lead{$byte};
        if ($inside and !$lead)
        {
            # the run ended on the previous byte
            push @ranges, $byte - 1;
            $inside = 0;
        }
        elsif (!$inside and $lead)
        {
            push @ranges, $byte;
            $inside = 1;
        }
    }
    # close a run that reaches the end of the byte range
    push @ranges, 0xff if $inside;
    return @ranges;
}
1140
################################################################
# dump the Indic Syllabic Category table
#
# Each table entry combines two properties: the syllabic category
# (from %indic_types) in the low bits and the positional category
# (from %matra_types) shifted left by 8.  Dies on an unparsable line
# or on a category name missing from the lookup hashes.
sub dump_indic($)
{
    my ($filename) = @_;
    my @indic_table;

    # first pass: syllabic categories, stored only for code points below 65536
    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        if (my ($code, $kind) = /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            die "unknown indic $kind" unless defined $indic_types{$kind};
            $indic_table[hex($code)] = $indic_types{$kind} if hex($code) < 65536;
        }
        elsif (my ($from, $to, $range_kind) = /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            die "unknown indic $range_kind" unless defined $indic_types{$range_kind};
            # a range is stored only when both of its ends are below 65536
            next unless hex($from) < 65536 and hex($to) < 65536;
            foreach my $cp (hex($from) .. hex($to))
            {
                $indic_table[$cp] = $indic_types{$range_kind};
            }
        }
        else
        {
            die "malformed line $_";
        }
    }
    close $INPUT;

    # second pass: positional categories, or'ed into bits 8 and up
    # NOTE(review): unlike the first pass there is no 65536 limit here
    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        if (my ($code, $kind) = /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            die "unknown matra $kind" unless defined $matra_types{$kind};
            $indic_table[hex($code)] |= $matra_types{$kind} << 8;
        }
        elsif (my ($from, $to, $range_kind) = /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            die "unknown matra $range_kind" unless defined $matra_types{$range_kind};
            foreach my $cp (hex($from) .. hex($to))
            {
                $indic_table[$cp] |= $matra_types{$range_kind} << 8;
            }
        }
        else
        {
            die "malformed line $_";
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n",
                 "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
1221
################################################################
# dump the Line Break Properties table
#
# Reads LineBreak.txt, translates every break class through
# %break_types and writes the result to $filename as the two-level
# "wine_linebreak_table", with class 'XX' as the default.  Dies on an
# unparsable line or on a class missing from %break_types.
sub dump_linebreak($)
{
    my $filename = shift;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        # A line names either one code point or a first..last range, and
        # the class is three or two characters long.  The three-character
        # patterns are tried before the two-character ones.
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }

        die "unknown breaktype $type" unless defined $break_types{$type};
        foreach my $i ($first .. $last)
        {
            $break_table[$i] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n";
    print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
1286
# Script name (as spelled in Scripts.txt) => numeric script id.
# dump_scripts() emits each pair as an enum value Script_<name> and
# computes Script_LastId as the number of entries minus one, so the
# ids must stay contiguous starting at 0.  Script names found in
# Scripts.txt but missing here are left out of the generated table.
my %scripts =
(
    "Unknown"                => 0,
    "Common"                 => 1,
    "Inherited"              => 2,
    "Arabic"                 => 3,
    "Armenian"               => 4,
    "Avestan"                => 5,
    "Balinese"               => 6,
    "Bamum"                  => 7,
    "Batak"                  => 8,
    "Bengali"                => 9,
    "Bopomofo"               => 10,
    "Brahmi"                 => 11,
    "Braille"                => 12,
    "Buginese"               => 13,
    "Buhid"                  => 14,
    "Canadian_Aboriginal"    => 15,
    "Carian"                 => 16,
    "Cham"                   => 17,
    "Cherokee"               => 18,
    "Coptic"                 => 19,
    "Cuneiform"              => 20,
    "Cypriot"                => 21,
    "Cyrillic"               => 22,
    "Deseret"                => 23,
    "Devanagari"             => 24,
    "Egyptian_Hieroglyphs"   => 25,
    "Ethiopic"               => 26,
    "Georgian"               => 27,
    "Glagolitic"             => 28,
    "Gothic"                 => 29,
    "Greek"                  => 30,
    "Gujarati"               => 31,
    "Gurmukhi"               => 32,
    "Han"                    => 33,
    "Hangul"                 => 34,
    "Hanunoo"                => 35,
    "Hebrew"                 => 36,
    "Hiragana"               => 37,
    "Imperial_Aramaic"       => 38,
    "Inscriptional_Pahlavi"  => 39,
    "Inscriptional_Parthian" => 40,
    "Javanese"               => 41,
    "Kaithi"                 => 42,
    "Kannada"                => 43,
    "Katakana"               => 44,
    "Kayah_Li"               => 45,
    "Kharoshthi"             => 46,
    "Khmer"                  => 47,
    "Lao"                    => 48,
    "Latin"                  => 49,
    "Lepcha"                 => 50,
    "Limbu"                  => 51,
    "Linear_B"               => 52,
    "Lisu"                   => 53,
    "Lycian"                 => 54,
    "Lydian"                 => 55,
    "Malayalam"              => 56,
    "Mandaic"                => 57,
    "Meetei_Mayek"           => 58,
    "Mongolian"              => 59,
    "Myanmar"                => 60,
    "New_Tai_Lue"            => 61,
    "Nko"                    => 62,
    "Ogham"                  => 63,
    "Ol_Chiki"               => 64,
    "Old_Italic"             => 65,
    "Old_Persian"            => 66,
    "Old_South_Arabian"      => 67,
    "Old_Turkic"             => 68,
    "Oriya"                  => 69,
    "Osmanya"                => 70,
    "Phags_Pa"               => 71,
    "Phoenician"             => 72,
    "Rejang"                 => 73,
    "Runic"                  => 74,
    "Samaritan"              => 75,
    "Saurashtra"             => 76,
    "Shavian"                => 77,
    "Sinhala"                => 78,
    "Sundanese"              => 79,
    "Syloti_Nagri"           => 80,
    "Syriac"                 => 81,
    "Tagalog"                => 82,
    "Tagbanwa"               => 83,
    "Tai_Le"                 => 84,
    "Tai_Tham"               => 85,
    "Tai_Viet"               => 86,
    "Tamil"                  => 87,
    "Telugu"                 => 88,
    "Thaana"                 => 89,
    "Thai"                   => 90,
    "Tibetan"                => 91,
    "Tifinagh"               => 92,
    "Ugaritic"               => 93,
    "Vai"                    => 94,
    "Yi"                     => 95,
    # Win8/Win8.1
    "Chakma"                 => 96,
    "Meroitic_Cursive"       => 97,
    "Meroitic_Hieroglyphs"   => 98,
    "Miao"                   => 99,
    "Sharada"                => 100,
    "Sora_Sompeng"           => 101,
    "Takri"                  => 102,
    # Win10
    "Bassa_Vah"              => 103,
    "Caucasian_Albanian"     => 104,
    "Duployan"               => 105,
    "Elbasan"                => 106,
    "Grantha"                => 107,
    "Khojki"                 => 108,
    "Khudawadi"              => 109,
    "Linear_A"               => 110,
    "Mahajani"               => 111,
    "Manichaean"             => 112,
    "Mende_Kikakui"          => 113,
    "Modi"                   => 114,
    "Mro"                    => 115,
    "Nabataean"              => 116,
    "Old_North_Arabian"      => 117,
    "Old_Permic"             => 118,
    "Pahawh_Hmong"           => 119,
    "Palmyrene"              => 120,
    "Pau_Cin_Hau"            => 121,
    "Psalter_Pahlavi"        => 122,
    "Siddham"                => 123,
    "Tirhuta"                => 124,
    "Warang_Citi"            => 125,
    # Win10 RS1
    "Adlam"                  => 126,
    "Ahom"                   => 127,
    "Anatolian_Hieroglyphs"  => 128,
    "Bhaiksuki"              => 129,
    "Hatran"                 => 130,
    "Marchen"                => 131,
    "Multani"                => 132,
    "Newa"                   => 133,
    "Old_Hungarian"          => 134,
    "Osage"                  => 135,
    "SignWriting"            => 136,
    "Tangut"                 => 137,
    # Win10 RS4
    "Masaram_Gondi"          => 138,
    "Nushu"                  => 139,
    "Soyombo"                => 140,
    "Zanabazar_Square"       => 141,
    # Win10 1903
    "Dogra"                  => 142,
    "Gunjala_Gondi"          => 143,
    "Hanifi_Rohingya"        => 144,
    "Makasar"                => 145,
    "Medefaidrin"            => 146,
    "Old_Sogdian"            => 147,
    "Sogdian"                => 148,
);
1444
################################################################
# dump Script IDs table
#
# Writes two files derived from Scripts.txt and %scripts:
#   $filename.h - the unicode_script_id enum, one value per %scripts
#                 entry plus Script_LastId
#   $filename.c - the two-level "wine_scripts_table" lookup table
# Script names that are not in %scripts, and lines that match neither
# supported syntax, are skipped without an error.
sub dump_scripts($)
{
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        # a line names either one code point or a first..last range
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            next;
        }

        next unless defined $scripts{$type};
        foreach my $cp ($first .. $last)
        {
            $scripts_table[$cp] = $scripts{$type};
        }
    }

    close $INPUT;

    # the header with the enum
    my $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT "    Script_$script = $scripts{$script},\n";
    }
    print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    # the source file with the lookup table
    my $source = "$filename.c";
    # report the file that actually failed, not the header written above
    open OUTPUT,">$source.new" or die "Cannot create $source";
    print "Building $source\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($source);
}
1520
################################################################
# dump the BiDi mirroring table
#
# Reads "code; mirrored code" pairs from BidiMirroring.txt and writes
# them to $filename as the two-level "wine_mirror_map" table, with 0
# for characters that have no mirror.  Dies on an unparsable line.
sub dump_mirroring($)
{
    my ($filename) = @_;
    my @mirror_table = ();

    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        my ($code, $mirror) = /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/
            or die "malformed line $_";
        $mirror_table[hex($code)] = hex($mirror);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $UNIDATA:BidiMirroring.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
1553
################################################################
# dump the Bidi Brackets
#
# Reads BidiBrackets.txt and writes the two-level
# "bidi_bracket_table" to $filename.  Each entry holds the distance
# from the character to its paired bracket in the low byte and the
# bracket type (from %bracket_types) shifted left by 8; characters
# that are not brackets get 0.  Dies on an unparsable line, an
# unknown type, or a pair 128 or more code points apart.
sub dump_bracket($)
{
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my $type = $3;
            my $char = hex $1;
            my $delta = hex($2) - $char;
            die "unknown bracket $type" unless defined $bracket_types{$type};
            die "characters too distant $1 and $2" if abs($delta) >= 128;
            # The distance can be negative, so store it as an 8-bit two's
            # complement value (-1 => 0xff).  A modulus of 255 would turn
            # -1 into 0xfe, which reads back as -2.
            # NOTE(review): assumes the consumer reads the low byte as a
            # signed char - confirm against the code using this table.
            $bracket_table[$char] = ($delta & 0xff) | ($bracket_types{$type} << 8);
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
1592
################################################################
# dump the Arabic shaping table
#
# Reads ArabicShaping.txt into the global @joining_table: each entry
# is the joining type (from %joining_types) or'ed with the joining
# group number shifted left by 8.  Group numbers are handed out in
# order of first appearance, "No_Joining_Group" always being 0.
# Writes "wine_shaping_table" plus the "wine_shaping_forms" array for
# U+0600..U+06FF to $filename.
sub dump_shaping($)
{
    my ($filename) = @_;

    my $group_count = 0;
    my %group_ids = ( "No_Joining_Group" => $group_count++ );

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        my ($code, $type, $group) = /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/
            or die "malformed line $_";

        unless (defined $group_ids{$group})
        {
            $group_ids{$group} = $group_count++;
        }
        $joining_table[hex($code)] = $joining_types{$type} | ($group_ids{$group} << 8);
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $UNIDATA:ArabicShaping.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    # one row per character: isolated, final, initial and medial form,
    # falling back to the character itself when a form is missing
    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        my @forms = map { $joining_forms{$_}[$ch] || $ch } ("isolated", "final", "initial", "medial");
        printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
1644
################################################################
# dump the Vertical Orientation table
#
# Reads VerticalOrientation.txt, translates every orientation through
# %vertical_types and writes the two-level
# "vertical_orientation_table" to $filename, with orientation 'R' as
# the default.  Dies on an unparsable line or an unknown orientation.
sub dump_vertical($)
{
    my ($filename) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        if (my ($code, $kind) = /^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            die "unknown vertical $kind" unless defined $vertical_types{$kind};
            # single code points are stored only below 65536
            $vertical_table[hex($code)] = $vertical_types{$kind} if hex($code) < 65536;
        }
        elsif (my ($from, $to, $range_kind) = /^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            die "unknown vertical $range_kind" unless defined $vertical_types{$range_kind};
            # NOTE(review): ranges are stored in full, without the 65536
            # limit applied to single code points - confirm this is intended
            foreach my $cp (hex($from) .. hex($to))
            {
                $vertical_table[$cp] = $vertical_types{$range_kind};
            }
        }
        else
        {
            die "malformed line $_";
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n",
                 "/* generated from $UNIDATA:VerticalOrientation.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
1694
################################################################
# dump the digit folding tables
#
# Writes the global @digitmap_table to $filename as the two-level
# "wine_digitmap" table.
sub dump_digit_folding($)
{
    my ($filename) = @_;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode digit folding mappings */\n",
                 "/* generated from $UNIDATA:UnicodeData.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );
    close OUTPUT;
    save_file($filename);
}
1711
1712
################################################################
# dump the case mapping tables
#
# Works on copies of the global @tolower_table and @toupper_table,
# passes them through remove_linguistic_mappings() and writes them to
# $filename as "wine_casemap_lower" followed by "wine_casemap_upper".
sub dump_case_mappings($)
{
    my ($filename) = @_;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode case mappings */\n",
                 "/* generated from $UNIDATA:UnicodeData.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    # copies, so the global tables are not modified
    my @to_upper = @toupper_table;
    my @to_lower = @tolower_table;
    remove_linguistic_mappings( \@to_upper, \@to_lower );

    dump_case_table( "wine_casemap_lower", @to_lower );
    print OUTPUT "\n";
    dump_case_table( "wine_casemap_upper", @to_upper );
    close OUTPUT;
    save_file($filename);
}
1735
1736
################################################################
# dump a case mapping table
#
# Prints to OUTPUT a WCHAR array called $name: 256 index entries followed
# by the compressed data.  Each mapping is stored as the 16-bit difference
# between the target and the source character; only the first 65536
# entries of the table are used.
sub dump_case_table($@)
{
    my $name = shift;
    my @mapping = @_;

    # turn absolute targets into 16-bit deltas, unmapped entries stay undefined
    my @deltas = map { defined $mapping[$_] ? ($mapping[$_] - $_) & 0xffff : undef } 0 .. 65535;

    my @packed = compress_array( 256, 0, @deltas );
    my $last = $#packed;

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, scalar @packed;
    print OUTPUT "{\n    /* index */\n";
    printf OUTPUT "%s,\n", dump_array( 16, 0, @packed[0..255] );
    print OUTPUT "    /* data */\n";
    printf OUTPUT "%s", dump_array( 16, 0, @packed[256..$last] );
    print OUTPUT "\n};\n";
}
1758
################################################################
# compress a mapping table by removing identical rows
#
# The table is cut into $rows rows of equal length, undefined entries being
# replaced by $def.  Returns a list made of $rows offsets followed by the
# merged row data; each offset is the position of its row inside the
# returned list (i.e. it already includes the $rows leading entries).
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $row_len = @table / $rows;
    my @offsets;
    my $packed = "";

    foreach my $row (0 .. $rows - 1)
    {
        my $start = $row * $row_len;
        my @cells = map { defined($_) ? $_ : $def } @table[$start .. $start + $row_len - 1];
        my $rowtxt = pack "U*", @cells;

        # reuse the row if it already appears somewhere in the data
        my $where = index $packed, $rowtxt;
        if ($where == -1)
        {
            # check if the tail of the data can match the start of the new row
            my $head = substr( $rowtxt, 0, 1 );
            my $tail_len = length($packed) - 1;
            while ($tail_len > 0)
            {
                my $skip = index( substr( $packed, -$tail_len ), $head );
                last if $skip == -1;
                $tail_len -= $skip;
                if (substr( $packed, -$tail_len ) eq substr( $rowtxt, 0, $tail_len ))
                {
                    # drop the overlapping tail, it is appended again with the row
                    substr( $packed, -$tail_len ) = "";
                    last;
                }
                $tail_len--;
            }
            $where = length $packed;
            $packed .= $rowtxt;
        }
        $offsets[$row] = $rows + $where;
    }
    return @offsets, unpack "U*", $packed;
}
1795
################################################################
# dump a simple char -> 16-bit value mapping table
#
# Prints to OUTPUT an "unsigned short" array called $name holding 256
# offsets followed by the compressed values of the first 65536 table
# entries; undefined entries are replaced by $def.
sub dump_simple_mapping($$@)
{
    my ($name, $def, @values) = @_;
    my @packed = compress_array( 256, $def, @values[0..65535] );
    my $total = @packed;

    printf OUTPUT "const unsigned short %s[%d] =\n{\n", $name, $total;
    printf OUTPUT "    /* offsets */\n%s,\n", dump_array( 16, 0, @packed[0..255] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( 16, 0, @packed[256..$#packed] );
}
1808
################################################################
# dump a char -> value mapping table using two-level tables
#
# Arguments are the C array name, the default for undefined entries, the
# element size in bits (16 selects "unsigned short", anything else
# "unsigned int") and the table itself, of which the first 65536 entries
# are used.  The array is printed to OUTPUT.
sub dump_two_level_mapping($$@)
{
    my ($name, $def, $size, @values) = @_;

    my $type = "unsigned int";
    $type = "unsigned short" if $size == 16;

    # 4096 rows of values, then 256 rows of row offsets
    my @leaf = compress_array( 4096, $def, @values[0..65535] );
    my @index = compress_array( 256, 0, @leaf[0..4095] );

    # the level 2 offsets are relative to @leaf; rebase them onto the final
    # array, where the values come right after all the offsets
    my $rebase = @index - 4096;
    $_ += $rebase foreach @index[256..$#index];

    my $total = @index + @leaf - 4096;
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $type, $name, $total;
    printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @index[0..255] );
    printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @index[256..$#index] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @leaf[4096..$#leaf] );
}
1827
################################################################
# dump a char -> value mapping table using three-level tables
#
# Same arguments as the two-level variant (name, default, element size in
# bits, table), but the whole 0..$MAX_CHAR range is covered, split in
# groups of 16 at every level.  The array is printed to OUTPUT.
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size, @values) = @_;

    my $type;
    if ($size == 16) { $type = "unsigned short"; }
    else { $type = "unsigned int"; }

    # number of rows at each level
    my $count3 = ($MAX_CHAR + 1) / 16;
    my $count2 = $count3 / 16;
    my $count1 = $count2 / 16;

    my @tier3 = compress_array( $count3, $def, @values[0..$MAX_CHAR] );
    my @tier2 = compress_array( $count2, 0, @tier3[0..$count3-1] );
    my @tier1 = compress_array( $count1, 0, @tier2[0..$count2-1] );

    # rebase the stored offsets onto the final layout:
    # level 1 offsets, level 2 offsets, level 3 offsets, values
    my $shift3 = @tier1 + @tier2 - $count2 - $count3;
    my $shift2 = @tier1 - $count2;
    $_ += $shift3 foreach @tier2[$count2..$#tier2];
    $_ += $shift2 foreach @tier1[$count1..$#tier1];

    my $total = @tier1 + (@tier2 - $count2) + (@tier3 - $count3);
    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $type, $name, $total;
    printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @tier1[0..$count1-1] );
    printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @tier1[$count1..$#tier1] );
    printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @tier2[$count2..$#tier2] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @tier3[$count3..$#tier3] );
}
1852
################################################################
# dump a binary case mapping table in l_intl.nls format
#
# Returns a string of little-endian 16-bit words: a size word, the two
# levels of offsets and the compressed values.  Mappings are stored as
# 16-bit differences from the source character and only the first 0x10000
# table entries are used.
sub dump_binary_case_table(@)
{
    my @mapping = @_;
    my $char_count = 0x10000;
    my $row_count = $char_count / 16;
    my $index_count = $row_count / 16;

    # absolute targets -> 16-bit deltas, unmapped entries stay undefined
    my @deltas = map { defined $mapping[$_] ? ($mapping[$_] - $_) & 0xffff : undef } 0 .. $char_count - 1;

    my @leaf = compress_array( $row_count, 0, @deltas );
    my @index = compress_array( $index_count, 0, @leaf[0..$row_count-1] );

    # second-level offsets are relative to @leaf; rebase them onto the
    # position the values take after the offsets
    my $rebase = @index - $row_count;
    $_ += $rebase foreach @index[$index_count..$#index];

    my $total = 1 + $rebase + @leaf;
    return pack "S<*", $total, @index, @leaf[$row_count..$#leaf];
}
1875
################################################################
# dump case mappings for l_intl.nls
#
# Writes a version word followed by the binary upper-case and lower-case
# tables to "$filename.new" and then hands $filename to save_file().  The
# tables are copies of the global case tables passed through
# remove_linguistic_mappings().
sub dump_intl_nls($)
{
    my $filename = shift;

    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
1898
1899
################################################################
# dump the bidi direction table
#
# Writes a C source file defining "bidi_direction_table" to "$filename.new"
# and then hands $filename to save_file().  The table maps the first 65536
# characters to their %bidi_types value, with the value for "L" as default.
sub dump_bidi_dir_table($)
{
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    # translate the direction names into their numeric bidi types
    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
1923
1924
################################################################
# dump the ctype tables
#
# Writes a C source file defining "wine_wctype_table" to "$filename.new"
# and then hands $filename to save_file().  Each entry holds the low 16
# bits of the character category, combined with the direction value
# shifted left by 12 bits.
sub dump_ctype_tables($)
{
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";
    print OUTPUT "/* Unicode ctype tables */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    # undefined categories count as 0
    my @table = map { ($_ || 0) & 0xffff; } @category_table;

    # add the direction in the high 4 bits of the category
    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] |= $directions{$direction_table[$i]} << 12 if defined $direction_table[$i];
    }

    dump_simple_mapping( "wine_wctype_table", 0, @table );

    close OUTPUT;
    save_file($filename);
}
1949
1950
# rotate an 8-bit value left by $count bits
sub rol($$)
{
    my ($byte, $count) = @_;
    my $high = $byte << $count;         # bits moved towards the top
    my $low = $byte >> (8 - $count);    # bits wrapping around to the bottom
    return ($high | $low) & 0xff;
}
1956
################################################################
# compress the character properties table
#
# The table is cut into $rows rows of equal length, undefined entries
# counting as 0x7f.  Returns a list made of one row id per row, followed by
# the contents of every distinct row that was given a new id.  New ids are
# handed out from 1 upwards; rows uniformly filled with one of a few
# predefined values get a fixed id and are not stored.
sub compress_char_props_table($@)
{
    my ($rows, @props) = @_;
    my $row_len = @props / $rows;
    my @result = (0) x $rows;
    my %known;
    my $next_id = 0;

    # add some predefined sequences: a row filled with the id rotated left
    # by 5 bits (within a byte) maps straight to that id
    foreach my $id (0, 0xfb .. 0xff)
    {
        my $fill = (($id << 5) | ($id >> 3)) & 0xff;
        my $key = pack "L*", ($fill) x $row_len;
        $known{$key} = $id;
    }

    # try to merge table rows
    foreach my $row (0 .. $rows - 1)
    {
        my $start = $row * $row_len;
        my @cells = map { defined $_ ? $_ : 0x7f } @props[$start .. $start + $row_len - 1];
        my $key = pack "L*", @cells;
        unless (defined $known{$key})
        {
            # first occurrence: give it the next id and store its contents
            $known{$key} = ++$next_id;
            push @result, @cells;
        }
        $result[$row] = $known{$key};
    }
    return @result;
}
1990
################################################################
# dump a normalization table in binary format
#
# The normalization form is taken from the file name, which must look like
# ".../norm<form>.nls" with <form> one of nfc, nfd, nfkc, nfkd or idna;
# anything else is a fatal error.  The data is written to "$filename.new",
# $filename is handed to save_file(), and the file is registered under the
# "Normalization" registry key.
sub dump_norm_table($)
{
    my $filename = shift;

    my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
    my %decomp = ( "nfc" => \@decomp_table,
                   "nfd" => \@decomp_table,
                   "nfkc" => \@decomp_compat_table,
                   "nfkd" => \@decomp_compat_table ,
                   "idna" => \@idna_decomp_table );

    # work out the form before creating anything, so that a bad file name
    # fails cleanly instead of producing a table from undefined values
    my $type = $filename;
    $type =~ s!.*/norm(\w+)\.nls!$1!;
    die "Unknown normalization form in $filename" unless defined $forms{$type};

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    binmode OUTPUT;  # the file only contains packed binary data
    print "Building $filename\n";

    # bit 0 of the form number selects composition; $compat is 0 for the
    # canonical forms, 1 for the forms with bit 2 set and 2 for idna
    my $compose = $forms{$type} & 1;
    my $compat = !!($forms{$type} & 4) + ($type eq "idna");

    my @version = split /\./, $UNIVERSION;

    # combining classes

    my @classes;
    my @class_values;

    # map every combining class in use (below 0x100) to a small index;
    # @class_values is the reverse table, padded to an even number of entries
    foreach my $c (grep defined, @combining_class_table)
    {
        $classes[$c] = 1 if $c < 0x100;
    }
    for (my $i = 0; $i < @classes; $i++)
    {
        next unless defined $classes[$i];
        $classes[$i] = @class_values;
        push @class_values, $i;
    }
    push @class_values, 0 if (@class_values % 2);
    die "too many classes" if @class_values >= 0x40;

    # character properties

    my @char_props;
    my @decomposed;
    my @comp_hash_table;
    my $comp_hash_size = $compose ? 254 : 0;

    # only characters with an entry in the combining class table are handled
    for (my $i = 0; $i <= $MAX_CHAR; $i++)
    {
        next unless defined $combining_class_table[$i];
        if (defined $decomp{$type}->[$i])
        {
            my @dec = get_decomposition( $i, $decomp{$type} );
            if ($compose && (my @comp = get_composition( $i, $compat )))
            {
                # composable character: add it to the composition hash table
                my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
                push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );

                # use the class index of the first decomposed character that
                # has a non-zero combining class
                my $val = 0;
                foreach my $d (@dec)
                {
                    $val = $combining_class_table[$d];
                    last if $val;
                }
                $char_props[$i] = $classes[$val];
            }
            else
            {
                # decomposable character without a composition
                $char_props[$i] = 0xbf;
            }
            @dec = compose_hangul( @dec ) if $compose;
            @dec = to_utf16( @dec );
            # long sequences get a terminating 0, since the stored length is
            # capped at 7 further down
            push @dec, 0 if @dec >= 7;
            $decomposed[$i] = \@dec;
        }
        else
        {
            if ($combining_class_table[$i] == 0x100)
            {
                $char_props[$i] = 0x7f;
            }
            elsif ($combining_class_table[$i])
            {
                $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
            }
            elsif ($type eq "idna" && defined $idna_disallowed[$i])
            {
                $char_props[$i] = 0xff;
            }
            else
            {
                $char_props[$i] = 0;
            }
        }
    }

    if ($compose)
    {
        # flag both members of every composition pair; the flags depend on
        # whether the second member has a non-zero combining class
        for (my $i = 0; $i <= $MAX_CHAR; $i++)
        {
            my @comp = get_composition( $i, $compat );
            next unless @comp;
            if ($combining_class_table[$comp[1]])
            {
                $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
                $char_props[$comp[1]] |= 0x40;
            }
            else
            {
                $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
                $char_props[$comp[1]] |= 0xc0;
            }
        }
    }

    # surrogates
    foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
    foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }

    # Hangul
    if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
    elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
    foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }

    # invalid chars
    if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
    foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
    foreach my $i (0x00..0x10)
    {
        # the last two code points of every plane
        $char_props[($i << 16) | 0xfffe] = 0xff;
        $char_props[($i << 16) | 0xffff] = 0xff;
    }

    # decomposition hash table

    my @decomp_hash_table;
    my @decomp_hash_index;
    my @decomp_hash_data;
    my $decomp_hash_size = 944;

    # build string of character data, reusing substrings when possible
    # (longest sequences first, so that shorter ones can be found inside them)
    my $decomp_char_data = "";
    foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
    {
        my $str = pack "U*", @{$i};
        $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
    }
    for (my $i = 0; $i < @decomposed; $i++)
    {
        next unless defined $decomposed[$i];
        my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
        die "sequence not found" if $pos == -1;
        my $len = @{$decomposed[$i]};
        $len = 7 if $len > 7;
        my $hash = $i % $decomp_hash_size;
        # each entry is the character and its length/position word
        push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
    }
    for (my $i = 0; $i < $decomp_hash_size; $i++)
    {
        $decomp_hash_index[$i] = @decomp_hash_data / 2;
        next unless defined $decomp_hash_table[$i];
        if (@{$decomp_hash_table[$i]} == 1)
        {
            # a bucket holding a single 0xbf character stores the
            # length/position word directly in the index
            my $entry = $decomp_hash_table[$i]->[0];
            if ($char_props[$entry->[0]] == 0xbf)
            {
                $decomp_hash_index[$i] = $entry->[1];
                next;
            }
        }
        foreach my $entry (@{$decomp_hash_table[$i]})
        {
            push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
        }
    }
    push @decomp_hash_data, 0, 0;

    # composition hash table

    my @comp_hash_index;
    my @comp_hash_data;
    if (@comp_hash_table)
    {
        for (my $i = 0; $i < $comp_hash_size; $i++)
        {
            $comp_hash_index[$i] = @comp_hash_data;
            push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
        }
        $comp_hash_index[$comp_hash_size] = @comp_hash_data;
        push @comp_hash_data, 0, 0, 0;
    }

    # one row of properties per 128 characters
    my $level1 = ($MAX_CHAR + 1) / 128;
    my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );

    my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
                   0, $decomp_hash_size, $comp_hash_size, 0 );
    my @tables = (0) x 8;

    # start of each table; the byte-sized tables (classes, property rows)
    # count for half their length, so offsets appear to be in 16-bit words
    $tables[0] = 16 + @header + @tables;
    $tables[1] = $tables[0] + @class_values / 2;
    $tables[2] = $tables[1] + $level1 / 2;
    $tables[3] = $tables[2] + (@rows - $level1) / 2;
    $tables[4] = $tables[3] + @decomp_hash_index;
    $tables[5] = $tables[4] + @decomp_hash_data;
    $tables[6] = $tables[5] + length $decomp_char_data;
    $tables[7] = $tables[6] + @comp_hash_index;

    # 16-word name field, header, table offsets, then the tables themselves
    print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "S<*", @tables;
    print OUTPUT pack "C*", @class_values;

    print OUTPUT pack "C*", @rows[0..$level1-1];
    print OUTPUT pack "C*", @rows[$level1..$#rows];
    print OUTPUT pack "S<*", @decomp_hash_index;
    print OUTPUT pack "S<*", @decomp_hash_data;
    print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
    print OUTPUT pack "S<*", @comp_hash_index;
    print OUTPUT pack "S<*", @comp_hash_data;

    close OUTPUT;
    save_file($filename);

    add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
}
2219
2220
################################################################
# output a codepage definition file from the global tables
#
# Writes "nls/c_<codepage>.nls" (through a ".new" file and save_file()),
# using the DBCS dumper when lead bytes are defined and the SBCS one
# otherwise, then registers the file under the "Codepage" registry key.
sub output_codepage_file($)
{
    my ($codepage) = @_;

    my $nls_name = sprintf "c_%03d.nls", $codepage;
    my $output = "nls/$nls_name";
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s\n", $output;
    if (@lead_bytes) { dump_binary_dbcs_table( $codepage ); }
    else { dump_binary_sbcs_table( $codepage ); }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), $nls_name );
}
2239
################################################################
# output a codepage table from a Microsoft-style mapping file
#
# Parses $filename from the $MSCODEPAGES archive into the global tables
# (@cp2uni, @glyph2uni, @uni2cp, the lead bytes and the default chars),
# then calls output_codepage_file() for the codepage number found in the
# file.  Dies on any line it does not recognize, and if the file does not
# contain a CODEPAGE line.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";             # table currently being parsed
    my ($codepage, $count);
    my ($lb_cur, $lb_end);      # current and last lead byte of the DBCS range

    # start from empty tables and default characters
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # the first field (character width) is not needed here
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # a pair of values, interpreted according to the current table;
            # in every table the first mapping seen for an entry wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                if (!--$count)
                {
                    # table complete: move on to the next lead byte, or back
                    # to expecting a range once the current one is exhausted
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    die "$filename: no CODEPAGE line found\n" unless defined $codepage;
    output_codepage_file( $codepage );
}
2335
################################################################
# align a string length
#
# Returns $str padded with zero bytes up to the next multiple of $align;
# a string whose length is already a multiple is returned unchanged.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . pack( "C*", (0) x ($align - $excess) );
}
2344
################################################################
# pack a GUID string
#
# Takes a GUID in the usual 8-4-4-4-12 hex digit notation and returns its
# 16-byte binary form: a little-endian 32-bit value, two little-endian
# 16-bit values and 8 single bytes.  Dies if the string does not contain a
# GUID.  The caller's $_ is left untouched.
sub pack_guid($)
{
    my $guid = shift;
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})-([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})([0-9A-Fa-f]{2})/
        or die "malformed GUID '$guid'";
    return pack "L<S<2C8", map { hex $_ } @fields;
}
2353
################################################################
# comparison function for compression sort
#
# Sort callback working on the array references in $a and $b: shorter
# arrays come first, arrays of equal length are ordered numerically by
# their elements 4 to 12, in that order.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    foreach my $idx (4 .. 12)
    {
        # stop at the first difference, later elements are not looked at
        return $order if $order;
        $order = $a->[$idx] <=> $b->[$idx];
    }
    return $order;
}
2369
2370 ################################################################
2371 # build a binary sort keys table
2372 sub dump_sortkey_table($$)
2373 {
2374     my ($filename, $download) = @_;
2375
2376     my @keys;
2377     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2378     my @multiple_weights;
2379     my @expansions;
2380     my @compressions;
2381     my %exceptions;
2382     my %guids;
2383     my %compr_flags;
2384     my %locales;
2385     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2386     my $jamostr = "";
2387
2388     my $re_hex = '0x[0-9A-Fa-f]+';
2389     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2390     $guids{$default_guid} = { };
2391
2392     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2393
2394     my $KEYS = open_data_file( $MSDATA, $download );
2395
2396     printf "Building $filename\n";
2397
2398     while (<$KEYS>)
2399     {
2400         s/\s*;.*$//;
2401         next if /^\s*$/;  # skip empty lines
2402         if (/^\s*(SORTKEY|SORTTABLES)/)
2403         {
2404             $part = $1;
2405             next;
2406         }
2407         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2408         {
2409             $part = $section = "";
2410             next;
2411         }
2412         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2413         {
2414             $section = $1;
2415             $guid = undef;
2416             next;
2417         }
2418         next unless $part;
2419         if ("$part.$section" eq "SORTKEY.DEFAULT")
2420         {
2421             if (/^\s*($re_hex)\s+$re_key/)
2422             {
2423                 $keys[hex $1] = [ split(/\s+/,$2) ];
2424                 next;
2425             }
2426         }
2427         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2428         {
2429             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2430             {
2431                 $version = hex $1;
2432                 next;
2433             }
2434             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2435             {
2436                 # ignore for now
2437                 next;
2438             }
2439         }
2440         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2441                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2442                "$part.$section" eq "SORTTABLES.INVERSECASING")
2443         {
2444             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2445             {
2446                 $guid = lc $1;
2447                 $guids{$guid} = { } unless defined $guids{$guid};
2448                 $guids{$guid}->{flags} |= $flags{$section};
2449                 next;
2450             }
2451             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2452             {
2453                 $locales{$1} = $guid;
2454                 next;
2455             }
2456         }
2457         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2458         {
2459             if (/^\s*(\d+)\s+(\d+)/)
2460             {
2461                 push @multiple_weights, $1, $2;
2462                 next;
2463             }
2464         }
2465         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2466         {
2467             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2468             {
2469                 my $pos = scalar @expansions / 2;
2470                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2471                 push @expansions, hex $2, hex $3;
2472                 next;
2473             }
2474         }
2475         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2476         {
2477             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2478             {
2479                 $keys[hex $1] = $keys[hex $2];
2480                 next;
2481             }
2482         }
2483         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2484         {
2485             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2486             {
2487                 if ($subsection || !$guid)  # start a new one
2488                 {
2489                     $guid = lc $1;
2490                     $subsection = "";
2491                     $guids{$guid} = { } unless defined $guids{$guid};
2492                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2493                     $guids{$guid}->{compr} = @compressions;
2494                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2495                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2496                     push @compressions, [ ];
2497                 }
2498                 else  # merge with current one
2499                 {
2500                     $guids{lc $1} = { } unless defined $guids{lc $1};
2501                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2502                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2503                     $compr_flags{lc $1} = $compr_flags{$guid};
2504                 }
2505                 next;
2506             }
2507             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2508             {
2509                 $locales{$1} = $guid;
2510                 next;
2511             }
2512             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2513             {
2514                 $subsection = $1;
2515                 next;
2516             }
2517             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2518             {
2519                 my @comp = map { hex $_; } split(/\s+/,$1);
2520                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2521                 # add compression flags
2522                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2523                 next;
2524             }
2525         }
2526         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2527         {
2528             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2529             {
2530                 $guid = lc $1;
2531                 $guids{$guid} = { } unless defined $guids{lc $1};
2532                 $ling_flag = ($2 ? "+" : "-");
2533                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2534                 next;
2535             }
2536             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2537             {
2538                 $locales{$1} = $guid;
2539                 next;
2540             }
2541             if (/^\s*($re_hex)\s+$re_key/)
2542             {
2543                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2544                 next;
2545             }
2546         }
2547         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2548         {
2549             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2550             {
2551                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2552                 next;
2553             }
2554         }
2555         die "$download: $part.$section: unrecognized line $_\n";
2556     }
2557     close $KEYS;
2558
2559     # Sortkey table
2560
2561     my $table;
2562     for (my $i = 0; $i < 0x10000; $i++)
2563     {
2564         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2565         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2566     }
2567
2568     foreach my $id (sort keys %exceptions)
2569     {
2570         my $pos = length($table) / 4;
2571         my @exc = @{$exceptions{$id}};
2572         my @filled;
2573         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2574         my $guid = substr( $id, 0, -1 );
2575         $guids{$guid}->{$key} = $pos;
2576         $pos += 0x100;
2577         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2578         for (my $j = 0; $j < 0x10000; $j++)
2579         {
2580             next unless defined $exc[$j] || defined $flags[$j];
2581             $filled[$j >> 8] = 1;
2582             $j |= 0xff;
2583         }
2584         for (my $j = 0; $j < 0x100; $j++)
2585         {
2586             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2587             $pos += 0x100 if $filled[$j];
2588         }
2589         for (my $j = 0; $j < 0x10000; $j++)
2590         {
2591             next unless $filled[$j >> 8];
2592             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2593             $k[3] |= $flags[$j] || 0;
2594             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2595         }
2596     }
2597
2598     # Case mapping tables
2599
2600     # standard table
2601     my @casemaps;
2602     my @upper = @toupper_table;
2603     my @lower = @tolower_table;
2604     remove_linguistic_mappings( \@upper, \@lower );
2605     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2606
2607     # linguistic table
2608     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2609
2610     # Turkish table
2611     @upper = @toupper_table;
2612     @lower = @tolower_table;
2613     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2614     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2615     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2616     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2617
2618     # Char type table
2619
2620     my @table;
2621     my $types = "";
2622     my %typestr;
2623     for (my $i = 0; $i < 0x10000; $i++)
2624     {
2625         my $str = pack "S<3",
2626             ($category_table[$i] || 0) & 0xffff,
2627             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2628             ($category_table[$i] || 0) >> 16;
2629
2630         if (!defined($typestr{$str}))
2631         {
2632             $typestr{$str} = length($types) / 6;
2633             $types .= $str;
2634         }
2635         $table[$i] = $typestr{$str};
2636     }
2637
2638     my @rows = compress_array( 4096, 0, @table[0..65535] );
2639     my @array = compress_array( 256, 0, @rows[0..4095] );
2640     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2641     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2642
2643     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2644     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2645     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2646
2647     # Sort tables
2648
2649     # guids
2650     my $sorttables = pack "L<2", $version, scalar %guids;
2651     foreach my $id (sort keys %guids)
2652     {
2653         my %guid = %{$guids{$id}};
2654         my $flags = $guid{flags} || 0;
2655         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2656         $sorttables .= pack_guid($id) . pack "L<5",
2657             $flags,
2658             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2659             $guid{except} || 0,
2660             $guid{ling_except} || 0,
2661             $map / 2;
2662     }
2663
2664     # expansions
2665     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2666
2667     # compressions
2668     $sorttables .= pack "L<", scalar @compressions;
2669     my $rowstr = "";
2670     foreach my $c (@compressions)
2671     {
2672         my $pos = length($rowstr) / 2;
2673         my $min = 0xffff;
2674         my $max = 0;
2675         my @lengths = (0) x 8;
2676         foreach my $r (sort cmp_compression @{$c})
2677         {
2678             my @row = @{$r};
2679             $lengths[scalar @row - 6]++;
2680             foreach my $val (@row[4..$#row])
2681             {
2682                 $min = $val if $min > $val;
2683                 $max = $val if $max < $val;
2684             }
2685             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2686             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2687         }
2688         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2689     }
2690     $sorttables .= $rowstr;
2691
2692     # multiple weights
2693     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2694
2695     # jamo sort
2696     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2697
2698     # Locales
2699
2700     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2701     foreach my $loc (sort keys %locales)
2702     {
2703         # skip specific locales that match more general ones
2704         my @parts = split /[-_]/, $loc;
2705         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2706         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2707         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2708     }
2709
2710     # File header
2711
2712     my @header;
2713     $header[0] = 16;
2714     $header[1] = $header[0] + length $table;
2715     $header[2] = $header[1] + length $casemaps;
2716     $header[3] = $header[2] + length $chartypes;
2717
2718     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2719     print OUTPUT pack "L<*", @header;
2720     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2721     close OUTPUT;
2722     save_file($filename);
2723 }
2724
2725
################################################################
# build the script to create registry keys
#
# Parameters:
#   $filename - path of the script to generate; the text is written to
#               "$filename.new" and then handed to save_file()
#   %keys     - maps a backslash-separated key path to an array ref; the
#               first element is the default value of the key (a false
#               value means none), every following element is printed
#               verbatim after "val "
#
# Every key is nested below HKLM\SYSTEM\CurrentControlSet\Control\Nls; the
# components of that prefix are written with the "NoRemove" keyword.
# Dies if the output file cannot be created or written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;

    printf "Building %s\n", $filename;
    open my $output, ">", "$filename.new" or die "Cannot create $filename: $!";
    print $output "HKLM\n{\n";
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf $output "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }
    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            # only the innermost subkey carries the default value
            printf $output "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # the value goes in as an argument, never into the format string,
        # so a '%' inside a value is written out literally
        foreach my $v (@vals) { printf $output "%*sval %s\n", 4 * $indent, "", $v; }
        # close the braces opened for this key path
        for (my $i = 0; $i < @subkeys; $i++) { printf $output "%*s}\n", 4 * --$indent, ""; }
    }
    # close the Nls prefix and the HKLM block itself
    while ($indent) { printf $output "%*s}\n", 4 * --$indent, ""; }
    # buffered write errors are only reported when the handle is closed
    close $output or die "Cannot write $filename.new: $!";
    save_file($filename);
}
2758
2759
################################################################
# save a file if modified
#
# Parameter:
#   $file - path of the final file; the new content is read from "$file.new"
#
# If $file already exists as a plain file with the same content as
# "$file.new", the new copy is deleted and $file is left alone.  Otherwise
# "$file.new" is renamed over $file.  Dies if that rename fails.
sub save_file($)
{
    my $file = shift;
    my $new = "$file.new";

    # core module, loaded here so that no shell and no external cmp tool is
    # involved; paths with spaces or shell metacharacters are compared safely
    require File::Compare;

    # compare() returns 0 only when both files could be read and are equal
    if (-f $file && File::Compare::compare( $file, $new ) == 0)
    {
        unlink $new;
    }
    else
    {
        rename( $new, $file ) or die "Cannot rename $new to $file: $!\n";
    }
}
2774
2775
2776 ################################################################
2777 # main routine
2778
# step up one directory when the script itself sits in the current one
if (-f "./make_unicode") { chdir ".."; }

load_data();

# generated C sources; generators are run in a fixed order
dump_case_mappings( "libs/port/casemap.c" );
dump_sortkeys( "dlls/kernelbase/collation.c" );
dump_ctype_tables( "libs/port/wctype.c" );
foreach my $target ("dlls/gdi32/direction.c", "dlls/usp10/direction.c", "dlls/dwrite/direction.c")
{
    dump_bidi_dir_table( $target );
}
dump_digit_folding( "dlls/kernelbase/digitmap.c" );
foreach my $target ("dlls/usp10/mirror.c", "dlls/dwrite/mirror.c")
{
    dump_mirroring( $target );
}
foreach my $target ("dlls/usp10/bracket.c", "dlls/dwrite/bracket.c")
{
    dump_bracket( $target );
}
dump_shaping( "dlls/usp10/shaping.c" );
foreach my $target ("dlls/usp10/linebreak.c", "dlls/dwrite/linebreak.c")
{
    dump_linebreak( $target );
}
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/usp10/indicsyllable.c" );
foreach my $target ("dlls/gdi32/vertical.c", "dlls/wineps.drv/vertical.c")
{
    dump_vertical( $target );
}

# NLS data files
dump_intl_nls("nls/l_intl.nls");
foreach my $target ("nls/normnfc.nls", "nls/normnfd.nls", "nls/normnfkc.nls",
                    "nls/normnfkd.nls", "nls/normidna.nls")
{
    dump_norm_table( $target );
}
dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );

# code pages
foreach my $file (@allfiles)
{
    dump_msdata_codepage( $file );
}
dump_eucjp_codepage();

# written last; presumably the generators above fill %registry_keys through
# add_registry_key/add_registry_value - verify before reordering
dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit 0;
2811
2812 # Local Variables:
2813 # compile-command: "./make_unicode"
2814 # End: