unicode/cpmap.pl

   1 #!/usr/bin/perl
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20 #
  21
  22 # base directory for ftp.unicode.org files
  23 $BASEDIR = "ftp.unicode.org/Public/";
  24 $MAPPREFIX = $BASEDIR . "MAPPINGS/";
  25
  26 # UnicodeData file
  27 $UNICODEDATA = $BASEDIR . "UNIDATA/UnicodeData.txt";
  28
  29 # Defaults mapping
  30 $DEFAULTS = "./defaults";
  31
  32 # Default char for undefined mappings
  33 $DEF_CHAR = ord '?';
  34
  35 @allfiles =
  36 (
  37     [ 37,    "VENDORS/MICSFT/EBCDIC/CP037.TXT",   "IBM EBCDIC US Canada" ],
  38     [ 42,    "VENDORS/ADOBE/symbol.txt",          "Symbol" ],
  39     [ 424,   "VENDORS/MISC/CP424.TXT",            "IBM EBCDIC Hebrew" ],
  40     [ 437,   "VENDORS/MICSFT/PC/CP437.TXT",       "OEM United States" ],
  41     [ 500,   "VENDORS/MICSFT/EBCDIC/CP500.TXT",   "IBM EBCDIC International" ],
  42     [ 737,   "VENDORS/MICSFT/PC/CP737.TXT",       "OEM Greek 437G" ],
  43     [ 775,   "VENDORS/MICSFT/PC/CP775.TXT",       "OEM Baltic" ],
  44     [ 850,   "VENDORS/MICSFT/PC/CP850.TXT",       "OEM Multilingual Latin 1" ],
  45     [ 852,   "VENDORS/MICSFT/PC/CP852.TXT",       "OEM Slovak Latin 2" ],
  46     [ 855,   "VENDORS/MICSFT/PC/CP855.TXT",       "OEM Cyrillic" ],
  47     [ 856,   "VENDORS/MISC/CP856.TXT",            "Hebrew PC" ],
  48     [ 857,   "VENDORS/MICSFT/PC/CP857.TXT",       "OEM Turkish" ],
  49     [ 860,   "VENDORS/MICSFT/PC/CP860.TXT",       "OEM Portuguese" ],
  50     [ 861,   "VENDORS/MICSFT/PC/CP861.TXT",       "OEM Icelandic" ],
  51     [ 862,   "VENDORS/MICSFT/PC/CP862.TXT",       "OEM Hebrew" ],
  52     [ 863,   "VENDORS/MICSFT/PC/CP863.TXT",       "OEM Canadian French" ],
  53     [ 864,   "VENDORS/MICSFT/PC/CP864.TXT",       "OEM Arabic" ],
  54     [ 865,   "VENDORS/MICSFT/PC/CP865.TXT",       "OEM Nordic" ],
  55     [ 866,   "VENDORS/MICSFT/PC/CP866.TXT",       "OEM Russian" ],
  56     [ 869,   "VENDORS/MICSFT/PC/CP869.TXT",       "OEM Greek" ],
  57     [ 874,   "VENDORS/MICSFT/PC/CP874.TXT",       "ANSI/OEM Thai" ],
  58     [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",   "IBM EBCDIC Greek" ],
  59     [ 878,   "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
  60     [ 932,   "VENDORS/MICSFT/WINDOWS/CP932.TXT",  "ANSI/OEM Japanese Shift-JIS" ],
  61     [ 936,   "VENDORS/MICSFT/WINDOWS/CP936.TXT",  "ANSI/OEM Simplified Chinese GBK" ],
  62     [ 949,   "VENDORS/MICSFT/WINDOWS/CP949.TXT",  "ANSI/OEM Korean Unified Hangul" ],
  63     [ 950,   "VENDORS/MICSFT/WINDOWS/CP950.TXT",  "ANSI/OEM Traditional Chinese Big5" ],
  64     [ 1006,  "VENDORS/MISC/CP1006.TXT",           "IBM Arabic" ],
  65     [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",  "IBM EBCDIC Latin 5 Turkish" ],
  66     [ 1250,  "VENDORS/MICSFT/WINDOWS/CP1250.TXT", "ANSI Eastern Europe" ],
  67     [ 1251,  "VENDORS/MICSFT/WINDOWS/CP1251.TXT", "ANSI Cyrillic" ],
  68     [ 1252,  "VENDORS/MICSFT/WINDOWS/CP1252.TXT", "ANSI Latin 1" ],
  69     [ 1253,  "VENDORS/MICSFT/WINDOWS/CP1253.TXT", "ANSI Greek" ],
  70     [ 1254,  "VENDORS/MICSFT/WINDOWS/CP1254.TXT", "ANSI Turkish" ],
  71     [ 1255,  "VENDORS/MICSFT/WINDOWS/CP1255.TXT", "ANSI Hebrew" ],
  72     [ 1256,  "VENDORS/MICSFT/WINDOWS/CP1256.TXT", "ANSI Arabic" ],
  73     [ 1257,  "VENDORS/MICSFT/WINDOWS/CP1257.TXT", "ANSI Baltic" ],
  74     [ 1258,  "VENDORS/MICSFT/WINDOWS/CP1258.TXT", "ANSI/OEM Viet Nam" ],
  75     [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT",      "Mac Roman" ],
  76     [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT",      "Mac Greek" ],
  77     [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT",   "Mac Cyrillic" ],
  78     [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT",     "Mac Latin 2" ],
  79     [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT",    "Mac Icelandic" ],
  80     [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT",    "Mac Turkish" ],
  81     [ 20866, "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
  82     [ 28591, "ISO8859/8859-1.TXT",                "ISO 8859-1 Latin 1" ],
  83     [ 28592, "ISO8859/8859-2.TXT",                "ISO 8859-2 Latin 2 (East European)" ],
  84     [ 28593, "ISO8859/8859-3.TXT",                "ISO 8859-3 Latin 3 (South European)" ],
  85     [ 28594, "ISO8859/8859-4.TXT",                "ISO 8859-4 Latin 4 (Baltic old)" ],
  86     [ 28595, "ISO8859/8859-5.TXT",                "ISO 8859-5 Cyrillic" ],
  87     [ 28596, "ISO8859/8859-6.TXT",                "ISO 8859-6 Arabic" ],
  88     [ 28597, "ISO8859/8859-7.TXT",                "ISO 8859-7 Greek" ],
  89     [ 28598, "ISO8859/8859-8.TXT",                "ISO 8859-8 Hebrew" ],
  90     [ 28599, "ISO8859/8859-9.TXT",                "ISO 8859-9 Latin 5 (Turkish)" ],
  91     [ 28600, "ISO8859/8859-10.TXT",               "ISO 8859-10 Latin 6 (Nordic)" ],
  92     [ 28603, "ISO8859/8859-13.TXT",               "ISO 8859-13 Latin 7 (Baltic)" ],
  93     [ 28604, "ISO8859/8859-14.TXT",               "ISO 8859-14 Latin 8 (Celtic)" ],
  94     [ 28605, "ISO8859/8859-15.TXT",               "ISO 8859-15 Latin 9 (Euro)" ],
  95     [ 28606, "ISO8859/8859-16.TXT",               "ISO 8859-16 Latin 10 (Balkan)" ]
  96 );
  97
  98
  99 %ctype =
 100 (
 101     "upper"  => 0x0001,
 102     "lower"  => 0x0002,
 103     "digit"  => 0x0004,
 104     "space"  => 0x0008,
 105     "punct"  => 0x0010,
 106     "cntrl"  => 0x0020,
 107     "blank"  => 0x0040,
 108     "xdigit" => 0x0080,
 109     "alpha"  => 0x0100
 110 );
 111
 112 %categories =
 113 (
 114     "Lu" => $ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 115     "Ll" => $ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 116     "Lt" => $ctype{"alpha"},    # Letter, Titlecase
 117     "Mn" => $ctype{"punct"},    # Mark, Non-Spacing
 118     "Mc" => $ctype{"punct"},    # Mark, Spacing Combining
 119     "Me" => $ctype{"punct"},    # Mark, Enclosing
 120     "Nd" => $ctype{"digit"},    # Number, Decimal Digit
 121     "Nl" => $ctype{"punct"},    # Number, Letter
 122     "No" => $ctype{"punct"},    # Number, Other
 123     "Zs" => $ctype{"space"},    # Separator, Space
 124     "Zl" => 0,                  # Separator, Line
 125     "Zp" => 0,                  # Separator, Paragraph
 126     "Cc" => $ctype{"cntrl"},    # Other, Control
 127     "Cf" => 0,                  # Other, Format
 128     "Cs" => 0,                  # Other, Surrogate
 129     "Co" => 0,                  # Other, Private Use
 130     "Cn" => 0,                  # Other, Not Assigned
 131     "Lm" => $ctype{"punct"},    # Letter, Modifier
 132     "Lo" => $ctype{"alpha"},    # Letter, Other
 133     "Pc" => $ctype{"punct"},    # Punctuation, Connector
 134     "Pd" => $ctype{"punct"},    # Punctuation, Dash
 135     "Ps" => $ctype{"punct"},    # Punctuation, Open
 136     "Pe" => $ctype{"punct"},    # Punctuation, Close
 137     "Pi" => $ctype{"punct"},    # Punctuation, Initial quote
 138     "Pf" => $ctype{"punct"},    # Punctuation, Final quote
 139     "Po" => $ctype{"punct"},    # Punctuation, Other
 140     "Sm" => $ctype{"punct"},    # Symbol, Math
 141     "Sc" => $ctype{"punct"},    # Symbol, Currency
 142     "Sk" => $ctype{"punct"},    # Symbol, Modifier
 143     "So" => $ctype{"punct"}     # Symbol, Other
 144 );
 145
 146 # a few characters need additional categories that cannot be determined automatically
 147 %special_categories =
 148 (
 149     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 150                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 151     "space"  => [ 0x09..0x0d, 0xfeff ],
 152     "blank"  => [ 0x09, 0x20, 0xa0, 0xfeff ]
 153 );
 154
 155 %directions =
 156 (
 157     "L"   => 1,    # Left-to-Right
 158     "LRE" => 11,   # Left-to-Right Embedding
 159     "LRO" => 11,   # Left-to-Right Override
 160     "R"   => 2,    # Right-to-Left
 161     "AL"  => 2,    # Right-to-Left Arabic
 162     "RLE" => 11,   # Right-to-Left Embedding
 163     "RLO" => 11,   # Right-to-Left Override
 164     "PDF" => 11,   # Pop Directional Format
 165     "EN"  => 3,    # European Number
 166     "ES"  => 4,    # European Number Separator
 167     "ET"  => 5,    # European Number Terminator
 168     "AN"  => 6,    # Arabic Number
 169     "CS"  => 7,    # Common Number Separator
 170     "NSM" => 0,    # Non-Spacing Mark
 171     "BN"  => 0,    # Boundary Neutral
 172     "B"   => 8,    # Paragraph Separator
 173     "S"   => 9,    # Segment Separator
 174     "WS"  => 10,   # Whitespace
 175     "ON"  => 11    # Other Neutrals
 176 );
 177
 178
 179 ################################################################
 180 # main routine
 181
 182 READ_DEFAULTS();
 183 DUMP_CASE_MAPPINGS();
 184 DUMP_COMPOSE_TABLES();
 185 DUMP_CTYPE_TABLES();
 186
 187 foreach $file (@allfiles) { HANDLE_FILE( @$file ); }
 188
 189 OUTPUT_CPTABLE();
 190
 191 exit(0);
 192
 193
 194 ################################################################
 195 # read in the defaults file
 196 sub READ_DEFAULTS
 197 {
 198     @unicode_defaults = ();
 199     @unicode_aliases = ();
 200     @tolower_table = ();
 201     @toupper_table = ();
 202     @category_table = ();
 203     @direction_table = ();
 204     @decomp_table = ();
 205     @compose_table = ();
 206
 207     # first setup a few default mappings
 208
 209     open DEFAULTS or die "Cannot open $DEFAULTS";
 210     print "Loading $DEFAULTS\n";
 211     while (<DEFAULTS>)
 212     {
 213         next if /^\#/;  # skip comments
 214         next if /^$/;  # skip empty lines
 215         if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
 216         {
 217             my @src = map hex, split /,/,$1;
 218             my $dst = $4;
 219             my $comment = $5;
 220             if ($#src > 0) { push @unicode_aliases, \@src; }
 221             next if ($dst eq "none");
 222             $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
 223             foreach $src (@src)
 224             {
 225                 die "Duplicate value" if defined($unicode_defaults[$src]);
 226                 $unicode_defaults[$src] = $dst;
 227             }
 228             next;
 229         }
 230         die "Unrecognized line $_\n";
 231     }
 232
 233     # now build mappings from the decomposition field of the Unicode database
 234
 235     open UNICODEDATA or die "Cannot open $UNICODEDATA";
 236     print "Loading $UNICODEDATA\n";
 237     while (<UNICODEDATA>)
 238     {
 239         # Decode the fields ...
 240         ($code, $name, $cat, $comb, $bidi,
 241          $decomp, $dec, $dig, $num, $mirror,
 242          $oldname, $comment, $upper, $lower, $title) = split /;/;
 243
 244         my $src = hex $code;
 245
 246         die "unknown category $cat" unless defined $categories{$cat};
 247         die "unknown directionality $bidi" unless defined $directions{$bidi};
 248
 249         $uniname[$src] = $name;
 250         $category_table[$src] = $categories{$cat};
 251         $direction_table[$src] = $directions{$bidi};
 252
 253         if ($lower ne "")
 254         {
 255             $tolower_table[$src] = hex $lower;
 256             $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
 257         }
 258         if ($upper ne "")
 259         {
 260             $toupper_table[$src] = hex $upper;
 261             $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
 262         }
 263         if ($dec ne "")
 264         {
 265             $category_table[$src] |= $ctype{"digit"};
 266         }
 267
 268         # copy the category and direction for everything between First/Last pairs
 269         if ($name =~ /, First>/) { $start = $src; }
 270         if ($name =~ /, Last>/)
 271         {
 272             while ($start < $src)
 273             {
 274                 $category_table[$start] = $category_table[$src];
 275                 $direction_table[$start] = $direction_table[$src];
 276                 $start++;
 277             }
 278         }
 279
 280         next if $decomp eq "";  # no decomposition, skip it
 281
 282         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 283         {
 284             # decomposition of the form "<foo> 1234" -> use char if type is known
 285             next unless ($1 eq "font" ||
 286                          $1 eq "noBreak" ||
 287                          $1 eq "circle" ||
 288                          $1 eq "super" ||
 289                          $1 eq "sub" ||
 290                          $1 eq "wide" ||
 291                          $1 eq "narrow" ||
 292                          $1 eq "compat" ||
 293                          $1 eq "small");
 294             $dst = hex $2;
 295         }
 296         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 297         {
 298             # decomposition "<compat> 0020 1234" -> combining accent
 299             $dst = hex $1;
 300         }
 301         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 302         {
 303             # decomposition contains only char values without prefix -> use first char
 304             $dst = hex $1;
 305             $category_table[$src] |= $category_table[$dst];
 306             # store decomposition if it contains two chars
 307             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
 308             {
 309                 $decomp_table[$src] = [ hex $1, hex $2 ];
 310                 push @compose_table, [ hex $1, hex $2, $src ];
 311             }
 312         }
 313         else
 314         {
 315             next;
 316         }
 317
 318         next if defined($unicode_defaults[$src]);  # may have been set in the defaults file
 319
 320         # check for loops
 321         for ($i = $dst; ; $i = $unicode_defaults[$i])
 322         {
 323             die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
 324             last unless defined($unicode_defaults[$i]);
 325         }
 326         $unicode_defaults[$src] = $dst;
 327     }
 328
 329     # patch the category of some special characters
 330
 331     foreach $cat (keys %special_categories)
 332     {
 333         my $flag = $ctype{$cat};
 334         foreach $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 335     }
 336 }
 337
 338
 339 ################################################################
 340 # parse the input file
 341 sub READ_FILE
 342 {
 343     my $name = shift;
 344     open INPUT,$name or die "Cannot open $name";
 345     @cp2uni = ();
 346     @lead_bytes = ();
 347     @uni2cp = ();
 348
 349     while (<INPUT>)
 350     {
 351         next if /^\#/;  # skip comments
 352         next if /^$/;  # skip empty lines
 353         next if /\x1a/;  # skip ^Z
 354         next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char
 355
 356         if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
 357         {
 358             $cp = hex $1;
 359             push @lead_bytes,$cp;
 360             $cp2uni[$cp] = 0;
 361             next;
 362         }
 363         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 364         {
 365             $cp = hex $1;
 366             $uni = hex $2;
 367             $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 368             $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 369             next;
 370         }
 371         die "$name: Unrecognized line $_\n";
 372     }
 373 }
 374
 375
 376 ################################################################
 377 # parse the symbol.txt file, since its syntax is different from the other ones
 378 sub READ_SYMBOL_FILE
 379 {
 380     my $name = shift;
 381     open INPUT,$name or die "Cannot open $name";
 382     @cp2uni = ();
 383     @lead_bytes = ();
 384     @uni2cp = ();
 385
 386     while (<INPUT>)
 387     {
 388         next if /^\#/;  # skip comments
 389         next if /^$/;  # skip empty lines
 390         next if /\x1a/;  # skip ^Z
 391         if (/^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\#.*)?/)
 392         {
 393             $uni = hex $1;
 394             $cp = hex $2;
 395             $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 396             $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 397             next;
 398         }
 399         die "$name: Unrecognized line $_\n";
 400     }
 401 }
 402
 403
 404 ################################################################
 405 # add default mappings once the file had been read
 406 sub ADD_DEFAULT_MAPPINGS
 407 {
 408     # Apply aliases
 409
 410     foreach $alias (@unicode_aliases)
 411     {
 412         my $target = undef;
 413         foreach $src (@$alias)
 414         {
 415             if (defined($uni2cp[$src]))
 416             {
 417                 $target = $uni2cp[$src];
 418                 last;
 419             }
 420         }
 421         next unless defined($target);
 422
 423         # At least one char of the alias set is defined, set the others to the same value
 424         foreach $src (@$alias)
 425         {
 426             $uni2cp[$src] = $target unless defined($uni2cp[$src]);
 427         }
 428     }
 429
 430     # For every src -> target mapping in the defaults table,
 431     # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined
 432
 433     for ($src = 0; $src < 65536; $src++)
 434     {
 435         next if defined($uni2cp[$src]);  # source has a definition already
 436         next unless defined($unicode_defaults[$src]);  # no default for this char
 437         my $target = $unicode_defaults[$src];
 438
 439         # do a recursive mapping until we find a target char that is defined
 440         while (!defined($uni2cp[$target]) &&
 441                defined($unicode_defaults[$target])) { $target = $unicode_defaults[$target]; }
 442
 443         if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
 444     }
 445
 446     # Add an identity mapping for all undefined chars
 447
 448     for ($i = 0; $i < 256; $i++)
 449     {
 450         next if defined($cp2uni[$i]);
 451         next if defined($uni2cp[$i]);
 452         $cp2uni[$i] = $uni2cp[$i] = $i;
 453     }
 454 }
 455
 456 ################################################################
 457 # dump an array of integers
 458 sub DUMP_ARRAY
 459 {
 460     my ($format,$default,@array) = @_;
 461     my $i, $ret = "    ";
 462     for ($i = 0; $i < $#array; $i++)
 463     {
 464         $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
 465         $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
 466     }
 467     $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
 468     return $ret;
 469 }
 470
 471 ################################################################
 472 # dump an SBCS mapping table
 473 sub DUMP_SBCS_TABLE
 474 {
 475     my ($codepage, $name) = @_;
 476     my $i;
 477
 478     # output the ascii->unicode table
 479
 480     printf OUTPUT "static const WCHAR cp2uni[256] =\n";
 481     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
 482
 483     # count the number of unicode->ascii subtables that contain something
 484
 485     my @filled = ();
 486     my $subtables = 1;
 487     for ($i = 0; $i < 65536; $i++)
 488     {
 489         next unless defined $uni2cp[$i];
 490         $filled[$i >> 8] = 1;
 491         $subtables++;
 492         $i |= 255;
 493     }
 494
 495     # output all the subtables into a single array
 496
 497     printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
 498     for ($i = 0; $i < 256; $i++)
 499     {
 500         next unless $filled[$i];
 501         printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
 502         printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[($i<<8) .. ($i<<8)+255] );
 503     }
 504     printf OUTPUT "    /* defaults */\n";
 505     printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );
 506
 507     # output a table of the offsets of the subtables in the previous array
 508
 509     my $pos = 0;
 510     my @offsets = ();
 511     for ($i = 0; $i < 256; $i++)
 512     {
 513         if ($filled[$i]) { push @offsets, $pos; $pos += 256; }
 514         else { push @offsets, ($subtables-1) * 256; }
 515     }
 516     printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
 517     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );
 518
 519     # output the code page descriptor
 520
 521     printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
 522     printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
 523                   $codepage, $DEF_CHAR, $DEF_CHAR, $name;
 524     printf OUTPUT "    cp2uni,\n";
 525     printf OUTPUT "    uni2cp_low,\n";
 526     printf OUTPUT "    uni2cp_high\n};\n";
 527 }
 528
 529
 530 ################################################################
 531 # dump a DBCS mapping table
 532 sub DUMP_DBCS_TABLE
 533 {
 534     my ($codepage, $name) = @_;
 535     my $i, $x, $y;
 536
 537     # build a list of lead bytes that are actually used
 538
 539     my @lblist = ();
 540     LBLOOP: for ($y = 0; $y <= $#lead_bytes; $y++)
 541     {
 542         my $base = $lead_bytes[$y] << 8;
 543         for ($x = 0; $x < 256; $x++)
 544         {
 545             if (defined $cp2uni[$base+$x])
 546             {
 547                 push @lblist,$lead_bytes[$y];
 548                 next LBLOOP;
 549             }
 550         }
 551     }
 552     my $unused = ($#lead_bytes > $#lblist);
 553
 554     # output the ascii->unicode table for the single byte chars
 555
 556     printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
 557     printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
 558
 559     # output the default table for unused lead bytes
 560
 561     if ($unused)
 562     {
 563         printf OUTPUT "    /* unused lead bytes */\n";
 564         printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
 565     }
 566
 567     # output the ascii->unicode table for each DBCS lead byte
 568
 569     for ($y = 0; $y <= $#lblist; $y++)
 570     {
 571         my $base = $lblist[$y] << 8;
 572         printf OUTPUT "    /* lead byte %02x */\n", $lblist[$y];
 573         printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
 574         printf OUTPUT ($y < $#lblist) ? ",\n" : "\n};\n\n";
 575     }
 576
 577     # output the lead byte subtables offsets
 578
 579     my @offsets = ();
 580     for ($x = 0; $x < 256; $x++) { $offsets[$x] = 0; }
 581     for ($x = 0; $x <= $#lblist; $x++) { $offsets[$lblist[$x]] = $x + 1; }
 582     if ($unused)
 583     {
 584         # increment all lead bytes offset to take into account the unused table
 585         for ($x = 0; $x <= $#lead_bytes; $x++) { $offsets[$lead_bytes[$x]]++; }
 586     }
 587     printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
 588     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @offsets );
 589
 590     # count the number of unicode->ascii subtables that contain something
 591
 592     my @filled = ();
 593     my $subtables = 1;
 594     for ($i = 0; $i < 65536; $i++)
 595     {
 596         next unless defined $uni2cp[$i];
 597         $filled[$i >> 8] = 1;
 598         $subtables++;
 599         $i |= 255;
 600     }
 601
 602     # output all the subtables into a single array
 603
 604     printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
 605     for ($y = 0; $y < 256; $y++)
 606     {
 607         next unless $filled[$y];
 608         printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
 609         printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($y<<8) .. ($y<<8)+255] );
 610     }
 611     printf OUTPUT "    /* defaults */\n";
 612     printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
 613
 614     # output a table of the offsets of the subtables in the previous array
 615
 616     my $pos = 0;
 617     my @offsets = ();
 618     for ($y = 0; $y < 256; $y++)
 619     {
 620         if ($filled[$y]) { push @offsets, $pos; $pos += 256; }
 621         else { push @offsets, ($subtables-1) * 256; }
 622     }
 623     printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
 624     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );
 625
 626     # output the code page descriptor
 627
 628     printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
 629     printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
 630                   $codepage, $DEF_CHAR, $DEF_CHAR, $name;
 631     printf OUTPUT "    cp2uni,\n";
 632     printf OUTPUT "    cp2uni_leadbytes,\n";
 633     printf OUTPUT "    uni2cp_low,\n";
 634     printf OUTPUT "    uni2cp_high,\n";
 635     DUMP_LB_RANGES();
 636     printf OUTPUT "};\n";
 637 }
 638
 639
 640 ################################################################
 641 # dump the list of defined lead byte ranges
 642 sub DUMP_LB_RANGES
 643 {
 644     my @list = ();
 645     my $i = 0;
 646     foreach $i (@lead_bytes) { $list[$i] = 1; }
 647     my $on = 0;
 648     printf OUTPUT "    { ";
 649     for ($i = 0; $i < 256; $i++)
 650     {
 651         if ($on)
 652         {
 653             if (!defined $list[$i]) { printf OUTPUT "0x%02x, ", $i-1; $on = 0; }
 654         }
 655         else
 656         {
 657             if ($list[$i]) { printf OUTPUT "0x%02x, ", $i; $on = 1; }
 658         }
 659     }
 660     if ($on) { printf OUTPUT "0xff, "; }
 661     printf OUTPUT "0x00, 0x00 }\n";
 662 }
 663
 664
 665 ################################################################
 666 # dump the case mapping tables
 667 sub DUMP_CASE_MAPPINGS
 668 {
 669     open OUTPUT,">casemap.c" or die "Cannot create casemap.c";
 670     printf "Building casemap.c\n";
 671     printf OUTPUT "/* Unicode case mappings */\n";
 672     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
 673     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 674
 675     DUMP_CASE_TABLE( "casemap_lower", @tolower_table );
 676     DUMP_CASE_TABLE( "casemap_upper", @toupper_table );
 677     close OUTPUT;
 678 }
 679
 680
 681 ################################################################
 682 # dump a case mapping table
 683 sub DUMP_CASE_TABLE
 684 {
 685     my ($name,@table) = @_;
 686
 687     # count the number of sub tables that contain something
 688
 689     my @filled = ();
 690     my $pos = 512;
 691     for ($i = 0; $i < 65536; $i++)
 692     {
 693         next unless defined $table[$i];
 694         $filled[$i >> 8] = $pos;
 695         $pos += 256;
 696         $i |= 255;
 697     }
 698     for ($i = 0; $i < 65536; $i++)
 699     {
 700         next unless defined $table[$i];
 701         $table[$i] = ($table[$i] - $i) & 0xffff;
 702     }
 703
 704     # dump the table
 705
 706     printf OUTPUT "const WCHAR %s[%d] =\n", $name, $pos;
 707     printf OUTPUT "{\n    /* index */\n";
 708     printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @filled );
 709     printf OUTPUT "    /* defaults */\n";
 710     printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
 711     for ($i = 0; $i < 256; $i++)
 712     {
 713         next unless $filled[$i];
 714         printf OUTPUT ",\n    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
 715         printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table[($i<<8) .. ($i<<8)+255] );
 716     }
 717     printf OUTPUT "\n};\n";
 718 }
 719
 720
 721 ################################################################
 722 # dump the ctype tables
 723 sub DUMP_CTYPE_TABLES
 724 {
 725     open OUTPUT,">wctype.c" or die "Cannot create casemap.c";
 726     printf "Building wctype.c\n";
 727     printf OUTPUT "/* Unicode ctype tables */\n";
 728     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
 729     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 730
 731     my $i;
 732     my @array = (0) x 256;
 733
 734     # add the direction in the high 4 bits of the category
 735     for ($i = 0; $i < 65536; $i++)
 736     {
 737         $category_table[$i] |= $direction_table[$i] << 12;
 738     }
 739
 740     # try to merge table rows
 741     for ($row = 0; $row < 256; $row++)
 742     {
 743         my $rowtxt = sprintf "%04x" x 256, @category_table[($row<<8)..($row<<8)+255];
 744         if (defined($sequences{$rowtxt}))
 745         {
 746             # reuse an existing row
 747             $array[$row] = $sequences{$rowtxt};
 748         }
 749         else
 750         {
 751             # create a new row
 752             $sequences{$rowtxt} = $array[$row] = $#array + 1;
 753             push @array, @category_table[($row<<8)..($row<<8)+255];
 754         }
 755     }
 756
 757     printf OUTPUT "const unsigned short wctype_table[%d] =\n{\n", $#array+1;
 758     printf OUTPUT "    /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
 759     printf OUTPUT "    /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );
 760
 761     close OUTPUT;
 762 }
 763
 764
 765 ################################################################
 766 # dump the char composition tables
 767 sub DUMP_COMPOSE_TABLES
 768 {
 769     open OUTPUT,">compose.c" or die "Cannot create compose.c";
 770     printf "Building compose.c\n";
 771     printf OUTPUT "/* Unicode char composition */\n";
 772     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
 773     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 774
 775     ######### composition table
 776
 777     my @filled = ();
 778     foreach $i (@compose_table)
 779     {
 780         my @comp = @$i;
 781         push @{$filled[$comp[1]]}, [ $comp[0], $comp[2] ];
 782     }
 783
 784     # count how many different second chars we have
 785
 786     for ($i = $count = 0; $i < 65536; $i++)
 787     {
 788         next unless defined $filled[$i];
 789         $count++;
 790     }
 791
 792     # build the table of second chars and offsets
 793
 794     my $pos = $count + 1;
 795     for ($i = 0; $i < 65536; $i++)
 796     {
 797         next unless defined $filled[$i];
 798         push @table, $i, $pos;
 799         $pos += @{$filled[$i]};
 800     }
 801     # terminator with last position
 802     push @table, 0, $pos;
 803     printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
 804     printf OUTPUT "    /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @table );
 805
 806     # build the table of first chars and mappings
 807
 808     for ($i = 0; $i < 65536; $i++)
 809     {
 810         next unless defined $filled[$i];
 811         my @table = ();
 812         my @list = sort { $a->[0] <=> $b->[0] } @{$filled[$i]};
 813         for ($j = 0; $j <= $#list; $j++)
 814         {
 815             push @table, $list[$j][0], $list[$j][1];
 816         }
 817         printf OUTPUT ",\n    /* 0x%04x */\n%s", $i, DUMP_ARRAY( "0x%04x", 0, @table );
 818     }
 819     printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;
 820
 821     ######### decomposition table
 822
 823     # first determine all the 16-char subsets that contain something
 824
 825     my @filled = (0) x 4096;
 826     my $pos = 16*2;  # for the null subset
 827     for ($i = 0; $i < 65536; $i++)
 828     {
 829         next unless defined $decomp_table[$i];
 830         $filled[$i >> 4] = $pos;
 831         $pos += 16*2;
 832         $i |= 15;
 833     }
 834     my $total = $pos;
 835
 836     # now count the 256-char subsets that contain something
 837
 838     my @filled_idx = (256) x 256;
 839     $pos = 256 + 16;
 840     for ($i = 0; $i < 4096; $i++)
 841     {
 842         next unless $filled[$i];
 843         $filled_idx[$i >> 4] = $pos;
 844         $pos += 16;
 845         $i |= 15;
 846     }
 847     my $null_offset = $pos;  # null mapping
 848     $total += $pos;
 849
 850     # add the index offsets to the subsets positions
 851
 852     for ($i = 0; $i < 4096; $i++)
 853     {
 854         next unless $filled[$i];
 855         $filled[$i] += $null_offset;
 856     }
 857
 858     # dump the main index
 859
 860     printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
 861     printf OUTPUT "{\n    /* index */\n";
 862     printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
 863     printf OUTPUT ",\n    /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );
 864
 865     # dump the second-level indexes
 866
 867     for ($i = 0; $i < 256; $i++)
 868     {
 869         next unless ($filled_idx[$i] > 256);
 870         my @table = @filled[($i<<4)..($i<<4)+15];
 871         for ($j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
 872         printf OUTPUT ",\n    /* sub-index %02x */\n", $i;
 873         printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
 874     }
 875
 876     # dump the 16-char subsets
 877
 878     printf OUTPUT ",\n    /* null mapping */\n";
 879     printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );
 880
 881     for ($i = 0; $i < 4096; $i++)
 882     {
 883         next unless $filled[$i];
 884         my @table = (0) x 32;
 885         for ($j = 0; $j < 16; $j++)
 886         {
 887             if (defined $decomp_table[($i<<4) + $j])
 888             {
 889                 $table[2 * $j] = ${$decomp_table[($i << 4) + $j]}[0];
 890                 $table[2 * $j + 1] = ${$decomp_table[($i << 4) + $j]}[1];
 891             }
 892         }
 893         printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
 894         printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
 895     }
 896
 897     printf OUTPUT "\n};\n";
 898     close OUTPUT;
 899 }
 900
 901
 902 ################################################################
 903 # read an input file and generate the corresponding .c file
 904 sub HANDLE_FILE
 905 {
 906     my ($codepage,$filename,$comment) = @_;
 907
 908     # symbol codepage file is special
 909     if ($codepage == 42) { READ_SYMBOL_FILE($MAPPREFIX . $filename); }
 910     else { READ_FILE($MAPPREFIX . $filename); }
 911
 912     ADD_DEFAULT_MAPPINGS();
 913
 914     my $output = sprintf "c_%03d.c", $codepage;
 915     open OUTPUT,">$output" or die "Cannot create $output";
 916
 917     printf "Building %s from %s (%s)\n", $output, $filename, $comment;
 918
 919     # dump all tables
 920
 921     printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
 922     printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
 923     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
 924     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 925
 926     if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $comment ); }
 927     else { DUMP_DBCS_TABLE( $codepage, $comment ); }
 928     close OUTPUT;
 929 }
 930
 931
 932 ################################################################
 933 # output the list of codepage tables into the cptable.c file
 934 sub OUTPUT_CPTABLE
 935 {
 936     @tables_decl = ();
 937
 938     foreach $file (@allfiles)
 939     {
 940         my ($codepage,$filename,$comment) = @$file;
 941         push @tables_decl, sprintf("extern union cptable cptable_%03d;\n",$codepage);
 942     }
 943
 944     push @tables_decl, sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n",$#allfiles+1);
 945     foreach $file (@allfiles)
 946     {
 947         my ($codepage,$filename,$comment) = @$file;
 948         push @tables_decl, sprintf("    &cptable_%03d,\n", $codepage);
 949     }
 950     push @tables_decl, "};";
 951     REPLACE_IN_FILE( "cptable.c", @tables_decl );
 952 }
 953
 954 ################################################################
 955 # replace the contents of a file between ### cpmap ### marks
 956
 957 sub REPLACE_IN_FILE
 958 {
 959     my $name = shift;
 960     my @data = @_;
 961     my @lines = ();
 962     open(FILE,$name) or die "Can't open $name";
 963     while (<FILE>)
 964     {
 965         push @lines, $_;
 966         last if /\#\#\# cpmap begin \#\#\#/;
 967     }
 968     push @lines, @data;
 969     while (<FILE>)
 970     {
 971         if (/\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $_; last; }
 972     }
 973     push @lines, <FILE>;
 974     open(FILE,">$name") or die "Can't modify $name";
 975     print FILE @lines;
 976     close(FILE);
 977 }