unicode/cpmap.pl

   1 #!/usr/bin/perl
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7
   8 # base directory for ftp.unicode.org files
   9 $BASEDIR = "ftp.unicode.org/Public/";
  10 $MAPPREFIX = $BASEDIR . "MAPPINGS/";
  11
  12 # UnicodeData file
  13 $UNICODEDATA = $BASEDIR . "UNIDATA/UnicodeData.txt";
  14
  15 # Defaults mapping
  16 $DEFAULTS = "./defaults";
  17
  18 # Default char for undefined mappings
  19 $DEF_CHAR = ord '?';
  20
# List of the code pages to generate.  Each entry has the form
#   [ code page number, mapping file, description ]
# where the mapping file name is relative to $MAPPREFIX and the
# description is copied into the generated table and its header comment.
# Code page 42 (Symbol) is parsed by READ_SYMBOL_FILE because its file
# syntax differs; all the other entries go through READ_FILE.
# Note that 878 and 20866 are both built from the same KOI8-R file.
@allfiles =
(
    [ 37,    "VENDORS/MICSFT/EBCDIC/CP037.TXT",   "IBM EBCDIC US Canada" ],
    [ 42,    "VENDORS/ADOBE/symbol.txt",          "Symbol" ],
    [ 424,   "VENDORS/MISC/CP424.TXT",            "IBM EBCDIC Hebrew" ],
    [ 437,   "VENDORS/MICSFT/PC/CP437.TXT",       "OEM United States" ],
    [ 500,   "VENDORS/MICSFT/EBCDIC/CP500.TXT",   "IBM EBCDIC International" ],
    [ 737,   "VENDORS/MICSFT/PC/CP737.TXT",       "OEM Greek 437G" ],
    [ 775,   "VENDORS/MICSFT/PC/CP775.TXT",       "OEM Baltic" ],
    [ 850,   "VENDORS/MICSFT/PC/CP850.TXT",       "OEM Multilingual Latin 1" ],
    [ 852,   "VENDORS/MICSFT/PC/CP852.TXT",       "OEM Slovak Latin 2" ],
    [ 855,   "VENDORS/MICSFT/PC/CP855.TXT",       "OEM Cyrillic" ],
    [ 856,   "VENDORS/MISC/CP856.TXT",            "Hebrew PC" ],
    [ 857,   "VENDORS/MICSFT/PC/CP857.TXT",       "OEM Turkish" ],
    [ 860,   "VENDORS/MICSFT/PC/CP860.TXT",       "OEM Portuguese" ],
    [ 861,   "VENDORS/MICSFT/PC/CP861.TXT",       "OEM Icelandic" ],
    [ 862,   "VENDORS/MICSFT/PC/CP862.TXT",       "OEM Hebrew" ],
    [ 863,   "VENDORS/MICSFT/PC/CP863.TXT",       "OEM Canadian French" ],
    [ 864,   "VENDORS/MICSFT/PC/CP864.TXT",       "OEM Arabic" ],
    [ 865,   "VENDORS/MICSFT/PC/CP865.TXT",       "OEM Nordic" ],
    [ 866,   "VENDORS/MICSFT/PC/CP866.TXT",       "OEM Russian" ],
    [ 869,   "VENDORS/MICSFT/PC/CP869.TXT",       "OEM Greek" ],
    [ 874,   "VENDORS/MICSFT/PC/CP874.TXT",       "ANSI/OEM Thai" ],
    [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",   "IBM EBCDIC Greek" ],
    [ 878,   "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
    [ 932,   "VENDORS/MICSFT/WINDOWS/CP932.TXT",  "ANSI/OEM Japanese Shift-JIS" ],
    [ 936,   "VENDORS/MICSFT/WINDOWS/CP936.TXT",  "ANSI/OEM Simplified Chinese GBK" ],
    [ 949,   "VENDORS/MICSFT/WINDOWS/CP949.TXT",  "ANSI/OEM Korean Unified Hangul" ],
    [ 950,   "VENDORS/MICSFT/WINDOWS/CP950.TXT",  "ANSI/OEM Traditional Chinese Big5" ],
    [ 1006,  "VENDORS/MISC/CP1006.TXT",           "IBM Arabic" ],
    [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",  "IBM EBCDIC Latin 5 Turkish" ],
    [ 1250,  "VENDORS/MICSFT/WINDOWS/CP1250.TXT", "ANSI Eastern Europe" ],
    [ 1251,  "VENDORS/MICSFT/WINDOWS/CP1251.TXT", "ANSI Cyrillic" ],
    [ 1252,  "VENDORS/MICSFT/WINDOWS/CP1252.TXT", "ANSI Latin 1" ],
    [ 1253,  "VENDORS/MICSFT/WINDOWS/CP1253.TXT", "ANSI Greek" ],
    [ 1254,  "VENDORS/MICSFT/WINDOWS/CP1254.TXT", "ANSI Turkish" ],
    [ 1255,  "VENDORS/MICSFT/WINDOWS/CP1255.TXT", "ANSI Hebrew" ],
    [ 1256,  "VENDORS/MICSFT/WINDOWS/CP1256.TXT", "ANSI Arabic" ],
    [ 1257,  "VENDORS/MICSFT/WINDOWS/CP1257.TXT", "ANSI Baltic" ],
    [ 1258,  "VENDORS/MICSFT/WINDOWS/CP1258.TXT", "ANSI/OEM Viet Nam" ],
    [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT",      "Mac Roman" ],
    [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT",      "Mac Greek" ],
    [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT",   "Mac Cyrillic" ],
    [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT",     "Mac Latin 2" ],
    [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT",    "Mac Icelandic" ],
    [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT",    "Mac Turkish" ],
    [ 20866, "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
    [ 28591, "ISO8859/8859-1.TXT",                "ISO 8859-1 Latin 1" ],
    [ 28592, "ISO8859/8859-2.TXT",                "ISO 8859-2 Latin 2 (East European)" ],
    [ 28593, "ISO8859/8859-3.TXT",                "ISO 8859-3 Latin 3 (South European)" ],
    [ 28594, "ISO8859/8859-4.TXT",                "ISO 8859-4 Latin 4 (Baltic old)" ],
    [ 28595, "ISO8859/8859-5.TXT",                "ISO 8859-5 Cyrillic" ],
    [ 28596, "ISO8859/8859-6.TXT",                "ISO 8859-6 Arabic" ],
    [ 28597, "ISO8859/8859-7.TXT",                "ISO 8859-7 Greek" ],
    [ 28598, "ISO8859/8859-8.TXT",                "ISO 8859-8 Hebrew" ],
    [ 28599, "ISO8859/8859-9.TXT",                "ISO 8859-9 Latin 5 (Turkish)" ],
    [ 28600, "ISO8859/8859-10.TXT",               "ISO 8859-10 Latin 6 (Nordic)" ],
    [ 28603, "ISO8859/8859-13.TXT",               "ISO 8859-13 Latin 7 (Baltic)" ],
    [ 28604, "ISO8859/8859-14.TXT",               "ISO 8859-14 Latin 8 (Celtic)" ],
    [ 28605, "ISO8859/8859-15.TXT",               "ISO 8859-15 Latin 9 (Euro)" ]
);
  82
  83
  84 %ctype =
  85 (
  86     "upper"  => 0x0001,
  87     "lower"  => 0x0002,
  88     "digit"  => 0x0004,
  89     "space"  => 0x0008,
  90     "punct"  => 0x0010,
  91     "cntrl"  => 0x0020,
  92     "blank"  => 0x0040,
  93     "xdigit" => 0x0080,
  94     "alpha"  => 0x0100
  95 );
  96
# Map each general category code found in the third field of the
# UnicodeData file to the %ctype flags it implies.  READ_DEFAULTS dies
# on a category that has no entry here, so even categories that carry
# no flag must be listed with a value of 0.
%categories =
(
    "Lu" => $ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"alpha"},    # Letter, Titlecase
    "Mn" => $ctype{"punct"},    # Mark, Non-Spacing
    "Mc" => $ctype{"punct"},    # Mark, Spacing Combining
    "Me" => $ctype{"punct"},    # Mark, Enclosing
    "Nd" => $ctype{"digit"},    # Number, Decimal Digit
    "Nl" => $ctype{"punct"},    # Number, Letter
    "No" => $ctype{"punct"},    # Number, Other
    "Zs" => $ctype{"space"},    # Separator, Space
    "Zl" => 0,                  # Separator, Line
    "Zp" => 0,                  # Separator, Paragraph
    "Cc" => $ctype{"cntrl"},    # Other, Control
    "Cf" => 0,                  # Other, Format
    "Cs" => 0,                  # Other, Surrogate
    "Co" => 0,                  # Other, Private Use
    "Cn" => 0,                  # Other, Not Assigned
    "Lm" => $ctype{"punct"},    # Letter, Modifier
    "Lo" => $ctype{"alpha"},    # Letter, Other
    "Pc" => $ctype{"punct"},    # Punctuation, Connector
    "Pd" => $ctype{"punct"},    # Punctuation, Dash
    "Ps" => $ctype{"punct"},    # Punctuation, Open
    "Pe" => $ctype{"punct"},    # Punctuation, Close
    "Pi" => $ctype{"punct"},    # Punctuation, Initial quote
    "Pf" => $ctype{"punct"},    # Punctuation, Final quote
    "Po" => $ctype{"punct"},    # Punctuation, Other
    "Sm" => $ctype{"punct"},    # Symbol, Math
    "Sc" => $ctype{"punct"},    # Symbol, Currency
    "Sk" => $ctype{"punct"},    # Symbol, Modifier
    "So" => $ctype{"punct"}     # Symbol, Other
);
 130
 131 # a few characters need additional categories that cannot be determined automatically
 132 %special_categories =
 133 (
 134     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 135                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 136     "space"  => [ 0x09..0x0d, 0xfeff ],
 137     "blank"  => [ 0x09, 0x20, 0xa0, 0xfeff ]
 138 );
 139
# Map each bidirectional category code found in the fifth field of the
# UnicodeData file to a small direction code.  READ_DEFAULTS dies on a
# code that has no entry here.  DUMP_CTYPE_TABLES shifts these values
# left by 12 bits and ORs them into the category table, so the output
# table (unsigned short) only has room for values up to 15.
%directions =
(
    "L"   => 1,    # Left-to-Right
    "LRE" => 11,   # Left-to-Right Embedding
    "LRO" => 11,   # Left-to-Right Override
    "R"   => 2,    # Right-to-Left
    "AL"  => 2,    # Right-to-Left Arabic
    "RLE" => 11,   # Right-to-Left Embedding
    "RLO" => 11,   # Right-to-Left Override
    "PDF" => 11,   # Pop Directional Format
    "EN"  => 3,    # European Number
    "ES"  => 4,    # European Number Separator
    "ET"  => 5,    # European Number Terminator
    "AN"  => 6,    # Arabic Number
    "CS"  => 7,    # Common Number Separator
    "NSM" => 0,    # Non-Spacing Mark
    "BN"  => 0,    # Boundary Neutral
    "B"   => 8,    # Paragraph Separator
    "S"   => 9,    # Segment Separator
    "WS"  => 10,   # Whitespace
    "ON"  => 11    # Other Neutrals
);
 162
 163
 164 ################################################################
 165 # main routine
 166
 167 READ_DEFAULTS();
 168 DUMP_CASE_MAPPINGS();
 169 DUMP_CTYPE_TABLES();
 170
 171 foreach $file (@allfiles) { HANDLE_FILE( @$file ); }
 172
 173 OUTPUT_CPTABLE();
 174
 175 exit(0);
 176
 177
 178 ################################################################
 179 # read in the defaults file
 180 sub READ_DEFAULTS
 181 {
 182     @unicode_defaults = ();
 183     @unicode_aliases = ();
 184     @tolower_table = ();
 185     @toupper_table = ();
 186     @category_table = ();
 187     @direction_table = ();
 188
 189     # first setup a few default mappings
 190
 191     open DEFAULTS or die "Cannot open $DEFAULTS";
 192     print "Loading $DEFAULTS\n";
 193     while (<DEFAULTS>)
 194     {
 195         next if /^\#/;  # skip comments
 196         next if /^$/;  # skip empty lines
 197         if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
 198         {
 199             my @src = map hex, split /,/,$1;
 200             my $dst = $4;
 201             my $comment = $5;
 202             if ($#src > 0) { push @unicode_aliases, \@src; }
 203             next if ($dst eq "none");
 204             $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
 205             foreach $src (@src)
 206             {
 207                 die "Duplicate value" if defined($unicode_defaults[$src]);
 208                 $unicode_defaults[$src] = $dst;
 209             }
 210             next;
 211         }
 212         die "Unrecognized line $_\n";
 213     }
 214
 215     # now build mappings from the decomposition field of the Unicode database
 216
 217     open UNICODEDATA or die "Cannot open $UNICODEDATA";
 218     print "Loading $UNICODEDATA\n";
 219     while (<UNICODEDATA>)
 220     {
 221         # Decode the fields ...
 222         ($code, $name, $cat, $comb, $bidi,
 223          $decomp, $dec, $dig, $num, $mirror,
 224          $oldname, $comment, $upper, $lower, $title) = split /;/;
 225
 226         my $src = hex $code;
 227
 228         die "unknown category $cat" unless defined $categories{$cat};
 229         die "unknown directionality $bidi" unless defined $directions{$bidi};
 230
 231         $uniname[$src] = $name;
 232         $category_table[$src] = $categories{$cat};
 233         $direction_table[$src] = $directions{$bidi};
 234
 235         if ($lower ne "")
 236         {
 237             $tolower_table[$src] = hex $lower;
 238             $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
 239         }
 240         if ($upper ne "")
 241         {
 242             $toupper_table[$src] = hex $upper;
 243             $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
 244         }
 245         if ($dec ne "")
 246         {
 247             $category_table[$src] |= $ctype{"digit"};
 248         }
 249
 250         # copy the category and direction for everything between First/Last pairs
 251         if ($name =~ /, First>/) { $start = $src; }
 252         if ($name =~ /, Last>/)
 253         {
 254             while ($start < $src)
 255             {
 256                 $category_table[$start] = $category_table[$src];
 257                 $direction_table[$start] = $direction_table[$src];
 258                 $start++;
 259             }
 260         }
 261
 262         next if $decomp eq "";  # no decomposition, skip it
 263
 264         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 265         {
 266             # decomposition of the form "<foo> 1234" -> use char if type is known
 267             next unless ($1 eq "font" ||
 268                          $1 eq "noBreak" ||
 269                          $1 eq "circle" ||
 270                          $1 eq "super" ||
 271                          $1 eq "sub" ||
 272                          $1 eq "wide" ||
 273                          $1 eq "narrow" ||
 274                          $1 eq "compat" ||
 275                          $1 eq "small");
 276             $dst = hex $2;
 277         }
 278         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 279         {
 280             # decomposition "<compat> 0020 1234" -> combining accent
 281             $dst = hex $1;
 282         }
 283         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 284         {
 285             # decomposition contains only char values without prefix -> use first char
 286             $dst = hex $1;
 287             $category_table[$src] |= $category_table[$dst];
 288         }
 289         else
 290         {
 291             next;
 292         }
 293
 294         next if defined($unicode_defaults[$src]);  # may have been set in the defaults file
 295
 296         # check for loops
 297         for ($i = $dst; ; $i = $unicode_defaults[$i])
 298         {
 299             die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
 300             last unless defined($unicode_defaults[$i]);
 301         }
 302         $unicode_defaults[$src] = $dst;
 303     }
 304
 305     # patch the category of some special characters
 306
 307     foreach $cat (keys %special_categories)
 308     {
 309         my $flag = $ctype{$cat};
 310         foreach $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
 311     }
 312 }
 313
 314
 315 ################################################################
 316 # parse the input file
 317 sub READ_FILE
 318 {
 319     my $name = shift;
 320     open INPUT,$name or die "Cannot open $name";
 321     @cp2uni = ();
 322     @lead_bytes = ();
 323     @uni2cp = ();
 324
 325     while (<INPUT>)
 326     {
 327         next if /^\#/;  # skip comments
 328         next if /^$/;  # skip empty lines
 329         next if /\x1a/;  # skip ^Z
 330         next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char
 331
 332         if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
 333         {
 334             $cp = hex $1;
 335             push @lead_bytes,$cp;
 336             $cp2uni[$cp] = 0;
 337             next;
 338         }
 339         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 340         {
 341             $cp = hex $1;
 342             $uni = hex $2;
 343             $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 344             $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 345             next;
 346         }
 347         die "$name: Unrecognized line $_\n";
 348     }
 349 }
 350
 351
 352 ################################################################
 353 # parse the symbol.txt file, since its syntax is different from the other ones
 354 sub READ_SYMBOL_FILE
 355 {
 356     my $name = shift;
 357     open INPUT,$name or die "Cannot open $name";
 358     @cp2uni = ();
 359     @lead_bytes = ();
 360     @uni2cp = ();
 361
 362     while (<INPUT>)
 363     {
 364         next if /^\#/;  # skip comments
 365         next if /^$/;  # skip empty lines
 366         next if /\x1a/;  # skip ^Z
 367         if (/^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\#.*)?/)
 368         {
 369             $uni = hex $1;
 370             $cp = hex $2;
 371             $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 372             $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 373             next;
 374         }
 375         die "$name: Unrecognized line $_\n";
 376     }
 377 }
 378
 379
 380 ################################################################
 381 # add default mappings once the file had been read
 382 sub ADD_DEFAULT_MAPPINGS
 383 {
 384     # Apply aliases
 385
 386     foreach $alias (@unicode_aliases)
 387     {
 388         my $target = undef;
 389         foreach $src (@$alias)
 390         {
 391             if (defined($uni2cp[$src]))
 392             {
 393                 $target = $uni2cp[$src];
 394                 last;
 395             }
 396         }
 397         next unless defined($target);
 398
 399         # At least one char of the alias set is defined, set the others to the same value
 400         foreach $src (@$alias)
 401         {
 402             $uni2cp[$src] = $target unless defined($uni2cp[$src]);
 403         }
 404     }
 405
 406     # For every src -> target mapping in the defaults table,
 407     # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined
 408
 409     for ($src = 0; $src < 65536; $src++)
 410     {
 411         next if defined($uni2cp[$src]);  # source has a definition already
 412         next unless defined($unicode_defaults[$src]);  # no default for this char
 413         my $target = $unicode_defaults[$src];
 414
 415         # do a recursive mapping until we find a target char that is defined
 416         while (!defined($uni2cp[$target]) &&
 417                defined($unicode_defaults[$target])) { $target = $unicode_defaults[$target]; }
 418
 419         if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
 420     }
 421
 422     # Add an identity mapping for all undefined chars
 423
 424     for ($i = 0; $i < 256; $i++)
 425     {
 426         next if defined($cp2uni[$i]);
 427         next if defined($uni2cp[$i]);
 428         $cp2uni[$i] = $uni2cp[$i] = $i;
 429     }
 430 }
 431
 432 ################################################################
 433 # dump an array of integers
 434 sub DUMP_ARRAY
 435 {
 436     my ($format,$default,@array) = @_;
 437     my $i, $ret = "    ";
 438     for ($i = 0; $i < $#array; $i++)
 439     {
 440         $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
 441         $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
 442     }
 443     $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
 444     return $ret;
 445 }
 446
 447 ################################################################
 448 # dump an SBCS mapping table
 449 sub DUMP_SBCS_TABLE
 450 {
 451     my ($codepage, $name) = @_;
 452     my $i;
 453
 454     # output the ascii->unicode table
 455
 456     printf OUTPUT "static const WCHAR cp2uni[256] =\n";
 457     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
 458
 459     # count the number of unicode->ascii subtables that contain something
 460
 461     my @filled = ();
 462     my $subtables = 1;
 463     for ($i = 0; $i < 65536; $i++)
 464     {
 465         next unless defined $uni2cp[$i];
 466         $filled[$i >> 8] = 1;
 467         $subtables++;
 468         $i = ($i & ~255) + 256;
 469     }
 470
 471     # output all the subtables into a single array
 472
 473     printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
 474     for ($i = 0; $i < 256; $i++)
 475     {
 476         next unless $filled[$i];
 477         printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
 478         printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[($i<<8) .. ($i<<8)+255] );
 479     }
 480     printf OUTPUT "    /* defaults */\n";
 481     printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );
 482
 483     # output a table of the offsets of the subtables in the previous array
 484
 485     my $pos = 0;
 486     my @offsets = ();
 487     for ($i = 0; $i < 256; $i++)
 488     {
 489         if ($filled[$i]) { push @offsets, $pos; $pos += 256; }
 490         else { push @offsets, ($subtables-1) * 256; }
 491     }
 492     printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
 493     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );
 494
 495     # output the code page descriptor
 496
 497     printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
 498     printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
 499                   $codepage, $DEF_CHAR, $DEF_CHAR, $name;
 500     printf OUTPUT "    cp2uni,\n";
 501     printf OUTPUT "    uni2cp_low,\n";
 502     printf OUTPUT "    uni2cp_high\n};\n";
 503 }
 504
 505
 506 ################################################################
 507 # dump a DBCS mapping table
 508 sub DUMP_DBCS_TABLE
 509 {
 510     my ($codepage, $name) = @_;
 511     my $i, $x, $y;
 512
 513     # build a list of lead bytes that are actually used
 514
 515     my @lblist = ();
 516     LBLOOP: for ($y = 0; $y <= $#lead_bytes; $y++)
 517     {
 518         my $base = $lead_bytes[$y] << 8;
 519         for ($x = 0; $x < 256; $x++)
 520         {
 521             if (defined $cp2uni[$base+$x])
 522             {
 523                 push @lblist,$lead_bytes[$y];
 524                 next LBLOOP;
 525             }
 526         }
 527     }
 528     my $unused = ($#lead_bytes > $#lblist);
 529
 530     # output the ascii->unicode table for the single byte chars
 531
 532     printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
 533     printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
 534
 535     # output the default table for unused lead bytes
 536
 537     if ($unused)
 538     {
 539         printf OUTPUT "    /* unused lead bytes */\n";
 540         printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
 541     }
 542
 543     # output the ascii->unicode table for each DBCS lead byte
 544
 545     for ($y = 0; $y <= $#lblist; $y++)
 546     {
 547         my $base = $lblist[$y] << 8;
 548         printf OUTPUT "    /* lead byte %02x */\n", $lblist[$y];
 549         printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
 550         printf OUTPUT ($y < $#lblist) ? ",\n" : "\n};\n\n";
 551     }
 552
 553     # output the lead byte subtables offsets
 554
 555     my @offsets = ();
 556     for ($x = 0; $x < 256; $x++) { $offsets[$x] = 0; }
 557     for ($x = 0; $x <= $#lblist; $x++) { $offsets[$lblist[$x]] = $x + 1; }
 558     if ($unused)
 559     {
 560         # increment all lead bytes offset to take into account the unused table
 561         for ($x = 0; $x <= $#lead_bytes; $x++) { $offsets[$lead_bytes[$x]]++; }
 562     }
 563     printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
 564     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @offsets );
 565
 566     # count the number of unicode->ascii subtables that contain something
 567
 568     my @filled = ();
 569     my $subtables = 1;
 570     for ($i = 0; $i < 65536; $i++)
 571     {
 572         next unless defined $uni2cp[$i];
 573         $filled[$i >> 8] = 1;
 574         $subtables++;
 575         $i = ($i & ~255) + 256;
 576     }
 577
 578     # output all the subtables into a single array
 579
 580     printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
 581     for ($y = 0; $y < 256; $y++)
 582     {
 583         next unless $filled[$y];
 584         printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
 585         printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($y<<8) .. ($y<<8)+255] );
 586     }
 587     printf OUTPUT "    /* defaults */\n";
 588     printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
 589
 590     # output a table of the offsets of the subtables in the previous array
 591
 592     my $pos = 0;
 593     my @offsets = ();
 594     for ($y = 0; $y < 256; $y++)
 595     {
 596         if ($filled[$y]) { push @offsets, $pos; $pos += 256; }
 597         else { push @offsets, ($subtables-1) * 256; }
 598     }
 599     printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
 600     printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );
 601
 602     # output the code page descriptor
 603
 604     printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
 605     printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
 606                   $codepage, $DEF_CHAR, $DEF_CHAR, $name;
 607     printf OUTPUT "    cp2uni,\n";
 608     printf OUTPUT "    cp2uni_leadbytes,\n";
 609     printf OUTPUT "    uni2cp_low,\n";
 610     printf OUTPUT "    uni2cp_high,\n";
 611     DUMP_LB_RANGES();
 612     printf OUTPUT "};\n";
 613 }
 614
 615
 616 ################################################################
 617 # dump the list of defined lead byte ranges
 618 sub DUMP_LB_RANGES
 619 {
 620     my @list = ();
 621     my $i = 0;
 622     foreach $i (@lead_bytes) { $list[$i] = 1; }
 623     my $on = 0;
 624     printf OUTPUT "    { ";
 625     for ($i = 0; $i < 256; $i++)
 626     {
 627         if ($on)
 628         {
 629             if (!defined $list[$i]) { printf OUTPUT "0x%02x, ", $i-1; $on = 0; }
 630         }
 631         else
 632         {
 633             if ($list[$i]) { printf OUTPUT "0x%02x, ", $i; $on = 1; }
 634         }
 635     }
 636     if ($on) { printf OUTPUT "0xff, "; }
 637     printf OUTPUT "0x00, 0x00 }\n";
 638 }
 639
 640
 641 ################################################################
 642 # dump the case mapping tables
 643 sub DUMP_CASE_MAPPINGS
 644 {
 645     open OUTPUT,">casemap.c" or die "Cannot create casemap.c";
 646     printf "Building casemap.c\n";
 647     printf OUTPUT "/* Unicode case mappings */\n";
 648     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
 649     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 650
 651     DUMP_CASE_TABLE( "casemap_lower", @tolower_table );
 652     DUMP_CASE_TABLE( "casemap_upper", @toupper_table );
 653     close OUTPUT;
 654 }
 655
 656
 657 ################################################################
 658 # dump a case mapping table
 659 sub DUMP_CASE_TABLE
 660 {
 661     my ($name,@table) = @_;
 662
 663     # count the number of sub tables that contain something
 664
 665     my @filled = ();
 666     my $pos = 512;
 667     for ($i = 0; $i < 65536; $i++)
 668     {
 669         next unless defined $table[$i];
 670         $filled[$i >> 8] = $pos;
 671         $pos += 256;
 672         $i = ($i & ~255) + 256;
 673     }
 674     for ($i = 0; $i < 65536; $i++)
 675     {
 676         next unless defined $table[$i];
 677         $table[$i] = ($table[$i] - $i) & 0xffff;
 678     }
 679
 680     # dump the table
 681
 682     printf OUTPUT "const WCHAR %s[%d] =\n", $name, $pos;
 683     printf OUTPUT "{\n    /* index */\n";
 684     printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 256, @filled );
 685     printf OUTPUT "    /* defaults */\n";
 686     printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 256 );
 687     for ($i = 0; $i < 256; $i++)
 688     {
 689         next unless $filled[$i];
 690         printf OUTPUT ",\n    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
 691         printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table[($i<<8) .. ($i<<8)+255] );
 692     }
 693     printf OUTPUT "\n};\n";
 694 }
 695
 696
 697 ################################################################
 698 # dump the ctype tables
 699 sub DUMP_CTYPE_TABLES
 700 {
 701     open OUTPUT,">wctype.c" or die "Cannot create casemap.c";
 702     printf "Building wctype.c\n";
 703     printf OUTPUT "/* Unicode ctype tables */\n";
 704     printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
 705     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 706
 707     my $i;
 708     my @array = (0) x 256;
 709
 710     # add the direction in the high 4 bits of the category
 711     for ($i = 0; $i < 65536; $i++)
 712     {
 713         $category_table[$i] |= $direction_table[$i] << 12;
 714     }
 715
 716     # try to merge table rows
 717     for ($row = 0; $row < 256; $row++)
 718     {
 719         my $rowtxt = sprintf "%04x" x 256, @category_table[($row<<8)..($row<<8)+255];
 720         if (defined($sequences{$rowtxt}))
 721         {
 722             # reuse an existing row
 723             $array[$row] = $sequences{$rowtxt};
 724         }
 725         else
 726         {
 727             # create a new row
 728             $sequences{$rowtxt} = $array[$row] = $#array + 1;
 729             push @array, @category_table[($row<<8)..($row<<8)+255];
 730         }
 731     }
 732
 733     printf OUTPUT "const unsigned short wctype_table[%d] =\n{\n", $#array+1;
 734     printf OUTPUT "    /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
 735     printf OUTPUT "    /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );
 736
 737     close OUTPUT;
 738 }
 739
 740 ################################################################
 741 # read an input file and generate the corresponding .c file
 742 sub HANDLE_FILE
 743 {
 744     my ($codepage,$filename,$comment) = @_;
 745
 746     # symbol codepage file is special
 747     if ($codepage == 42) { READ_SYMBOL_FILE($MAPPREFIX . $filename); }
 748     else { READ_FILE($MAPPREFIX . $filename); }
 749
 750     ADD_DEFAULT_MAPPINGS();
 751
 752     my $output = sprintf "c_%03d.c", $codepage;
 753     open OUTPUT,">$output" or die "Cannot create $output";
 754
 755     printf "Building %s from %s (%s)\n", $output, $filename, $comment;
 756
 757     # dump all tables
 758
 759     printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
 760     printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
 761     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
 762     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 763
 764     if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $comment ); }
 765     else { DUMP_DBCS_TABLE( $codepage, $comment ); }
 766     close OUTPUT;
 767 }
 768
 769
 770 ################################################################
 771 # output the list of codepage tables into the cptable.c file
 772 sub OUTPUT_CPTABLE
 773 {
 774     @tables_decl = ();
 775
 776     foreach $file (@allfiles)
 777     {
 778         my ($codepage,$filename,$comment) = @$file;
 779         push @tables_decl, sprintf("extern union cptable cptable_%03d;\n",$codepage);
 780     }
 781
 782     push @tables_decl, sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n",$#allfiles+1);
 783     foreach $file (@allfiles)
 784     {
 785         my ($codepage,$filename,$comment) = @$file;
 786         push @tables_decl, sprintf("    &cptable_%03d,\n", $codepage);
 787     }
 788     push @tables_decl, "};";
 789     REPLACE_IN_FILE( "cptable.c", @tables_decl );
 790 }
 791
 792 ################################################################
 793 # replace the contents of a file between ### cpmap ### marks
 794
 795 sub REPLACE_IN_FILE
 796 {
 797     my $name = shift;
 798     my @data = @_;
 799     my @lines = ();
 800     open(FILE,$name) or die "Can't open $name";
 801     while (<FILE>)
 802     {
 803         push @lines, $_;
 804         last if /\#\#\# cpmap begin \#\#\#/;
 805     }
 806     push @lines, @data;
 807     while (<FILE>)
 808     {
 809         if (/\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $_; last; }
 810     }
 811     push @lines, <FILE>;
 812     open(FILE,">$name") or die "Can't modify $name";
 813     print FILE @lines;
 814     close(FILE);
 815 }