unicode/cpmap.pl

   1 #!/usr/bin/perl
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7
   8 # base directory for ftp.unicode.org files
   9 $BASEDIR = "ftp.unicode.org/Public/";
  10 $MAPPREFIX = $BASEDIR . "MAPPINGS/";
  11
  12 # UnicodeData file
  13 $UNICODEDATA = $BASEDIR . "UNIDATA/UnicodeData.txt";
  14
  15 # Defaults mapping
  16 $DEFAULTS = "./defaults";
  17
  18 # Default char for undefined mappings
  19 $DEF_CHAR = ord '?';
  20
  21 @allfiles =
  22 (
  23     [ 37,    "VENDORS/MICSFT/EBCDIC/CP037.TXT",   "IBM EBCDIC US Canada" ],
  24     [ 42,    "VENDORS/ADOBE/symbol.txt",          "Symbol" ],
  25     [ 424,   "VENDORS/MISC/CP424.TXT",            "IBM EBCDIC Hebrew" ],
  26     [ 437,   "VENDORS/MICSFT/PC/CP437.TXT",       "OEM United States" ],
  27     [ 500,   "VENDORS/MICSFT/EBCDIC/CP500.TXT",   "IBM EBCDIC International" ],
  28     [ 737,   "VENDORS/MICSFT/PC/CP737.TXT",       "OEM Greek 437G" ],
  29     [ 775,   "VENDORS/MICSFT/PC/CP775.TXT",       "OEM Baltic" ],
  30     [ 850,   "VENDORS/MICSFT/PC/CP850.TXT",       "OEM Multilingual Latin 1" ],
  31     [ 852,   "VENDORS/MICSFT/PC/CP852.TXT",       "OEM Slovak Latin 2" ],
  32     [ 855,   "VENDORS/MICSFT/PC/CP855.TXT",       "OEM Cyrillic" ],
  33     [ 856,   "VENDORS/MISC/CP856.TXT",            "Hebrew PC" ],
  34     [ 857,   "VENDORS/MICSFT/PC/CP857.TXT",       "OEM Turkish" ],
  35     [ 860,   "VENDORS/MICSFT/PC/CP860.TXT",       "OEM Portuguese" ],
  36     [ 861,   "VENDORS/MICSFT/PC/CP861.TXT",       "OEM Icelandic" ],
  37     [ 862,   "VENDORS/MICSFT/PC/CP862.TXT",       "OEM Hebrew" ],
  38     [ 863,   "VENDORS/MICSFT/PC/CP863.TXT",       "OEM Canadian French" ],
  39     [ 864,   "VENDORS/MICSFT/PC/CP864.TXT",       "OEM Arabic" ],
  40     [ 865,   "VENDORS/MICSFT/PC/CP865.TXT",       "OEM Nordic" ],
  41     [ 866,   "VENDORS/MICSFT/PC/CP866.TXT",       "OEM Russian" ],
  42     [ 869,   "VENDORS/MICSFT/PC/CP869.TXT",       "OEM Greek" ],
  43     [ 874,   "VENDORS/MICSFT/PC/CP874.TXT",       "ANSI/OEM Thai" ],
  44     [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",   "IBM EBCDIC Greek" ],
  45     [ 878,   "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
  46     [ 932,   "VENDORS/MICSFT/WINDOWS/CP932.TXT",  "ANSI/OEM Japanese Shift-JIS" ],
  47     [ 936,   "VENDORS/MICSFT/WINDOWS/CP936.TXT",  "ANSI/OEM Simplified Chinese GBK" ],
  48     [ 949,   "VENDORS/MICSFT/WINDOWS/CP949.TXT",  "ANSI/OEM Korean Unified Hangul" ],
  49     [ 950,   "VENDORS/MICSFT/WINDOWS/CP950.TXT",  "ANSI/OEM Traditional Chinese Big5" ],
  50     [ 1006,  "VENDORS/MISC/CP1006.TXT",           "IBM Arabic" ],
  51     [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",  "IBM EBCDIC Latin 5 Turkish" ],
  52     [ 1250,  "VENDORS/MICSFT/WINDOWS/CP1250.TXT", "ANSI Eastern Europe" ],
  53     [ 1251,  "VENDORS/MICSFT/WINDOWS/CP1251.TXT", "ANSI Cyrillic" ],
  54     [ 1252,  "VENDORS/MICSFT/WINDOWS/CP1252.TXT", "ANSI Latin 1" ],
  55     [ 1253,  "VENDORS/MICSFT/WINDOWS/CP1253.TXT", "ANSI Greek" ],
  56     [ 1254,  "VENDORS/MICSFT/WINDOWS/CP1254.TXT", "ANSI Turkish" ],
  57     [ 1255,  "VENDORS/MICSFT/WINDOWS/CP1255.TXT", "ANSI Hebrew" ],
  58     [ 1256,  "VENDORS/MICSFT/WINDOWS/CP1256.TXT", "ANSI Arabic" ],
  59     [ 1257,  "VENDORS/MICSFT/WINDOWS/CP1257.TXT", "ANSI Baltic" ],
  60     [ 1258,  "VENDORS/MICSFT/WINDOWS/CP1258.TXT", "ANSI/OEM Viet Nam" ],
  61     [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT",      "Mac Roman" ],
  62     [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT",      "Mac Greek" ],
  63     [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT",   "Mac Cyrillic" ],
  64     [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT",     "Mac Latin 2" ],
  65     [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT",    "Mac Icelandic" ],
  66     [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT",    "Mac Turkish" ],
  67     [ 20866, "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
  68     [ 28591, "ISO8859/8859-1.TXT",                "ISO 8859-1 Latin 1" ],
  69     [ 28592, "ISO8859/8859-2.TXT",                "ISO 8859-2 Eastern Europe" ],
  70     [ 28593, "ISO8859/8859-3.TXT",                "ISO 8859-3 Turkish" ],
  71     [ 28594, "ISO8859/8859-4.TXT",                "ISO 8859-4 Baltic" ],
  72     [ 28595, "ISO8859/8859-5.TXT",                "ISO 8859-5 Cyrillic" ],
  73     [ 28596, "ISO8859/8859-6.TXT",                "ISO 8859-6 Arabic" ],
  74     [ 28597, "ISO8859/8859-7.TXT",                "ISO 8859-7 Greek" ],
  75     [ 28598, "ISO8859/8859-8.TXT",                "ISO 8859-8 Hebrew" ],
  76     [ 28599, "ISO8859/8859-9.TXT",                "ISO 8859-9 Latin 5" ]
  77 );
  78
  79 ################################################################
  80 # main routine
  81
  82 READ_DEFAULTS();
  83
  84 foreach $file (@allfiles) { HANDLE_FILE( @$file ); }
  85
  86 OUTPUT_CPTABLE();
  87
  88 exit(0);
  89
  90
  91 ################################################################
  92 # read in the defaults file
  93 sub READ_DEFAULTS
  94 {
  95     @unicode_defaults = ();
  96     @unicode_aliases = ();
  97
  98     # first setup a few default mappings
  99
 100     open DEFAULTS or die "Cannot open $DEFAULTS";
 101     print "Loading $DEFAULTS\n";
 102     while (<DEFAULTS>)
 103     {
 104         next if /^\#/;  # skip comments
 105         next if /^$/;  # skip empty lines
 106         if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
 107         {
 108             my @src = map hex, split /,/,$1;
 109             my $dst = $4;
 110             my $comment = $5;
 111             if ($#src > 0) { push @unicode_aliases, \@src; }
 112             next if ($dst eq "none");
 113             $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
 114             foreach $src (@src)
 115             {
 116                 die "Duplicate value" if defined($unicode_defaults[$src]);
 117                 $unicode_defaults[$src] = $dst;
 118             }
 119             next;
 120         }
 121         die "Unrecognized line $_\n";
 122     }
 123
 124     # now build mappings from the decomposition field of the Unicode database
 125
 126     open UNICODEDATA or die "Cannot open $UNICODEDATA";
 127     while (<UNICODEDATA>)
 128     {
 129         # Decode the fields ...
 130         ($code, $name, $cat, $comb, $bidi,
 131          $decomp, $dec, $dig, $num, $mirror,
 132          $oldname, $comment, $upper, $lower, $title) = split /;/;
 133
 134         next if $decomp eq "";  # no decomposition, skip it
 135
 136         $src = hex $code;
 137
 138         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
 139         {
 140             # decomposition of the form "<foo> 1234" -> use char if type is known
 141             next unless ($1 eq "font" ||
 142                          $1 eq "noBreak" ||
 143                          $1 eq "circle" ||
 144                          $1 eq "super" ||
 145                          $1 eq "sub" ||
 146                          $1 eq "wide" ||
 147                          $1 eq "narrow" ||
 148                          $1 eq "compat" ||
 149                          $1 eq "small");
 150             $dst = hex $2;
 151         }
 152         elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
 153         {
 154             # decomposition "<compat> 0020 1234" -> combining accent
 155             $dst = hex $1;
 156         }
 157         elsif ($decomp =~ /^([0-9a-fA-F]+)/)
 158         {
 159             # decomposition contains only char values without prefix -> use first char
 160             $dst = hex $1;
 161         }
 162         else
 163         {
 164             next;
 165         }
 166
 167         next if defined($unicode_defaults[$src]);  # may have been set in the defaults file
 168
 169         # check for loops
 170         for ($i = $dst; ; $i = $unicode_defaults[$i])
 171         {
 172             die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
 173             last unless defined($unicode_defaults[$i]);
 174         }
 175         $unicode_defaults[$src] = $dst;
 176     }
 177 }
 178
 179
 180 ################################################################
 181 # parse the input file
 182 sub READ_FILE
 183 {
 184     my $name = shift;
 185     open INPUT,$name or die "Cannot open $name";
 186     @cp2uni = ();
 187     @lead_bytes = ();
 188     @uni2cp = ();
 189
 190     while (<INPUT>)
 191     {
 192         next if /^\#/;  # skip comments
 193         next if /^$/;  # skip empty lines
 194         next if /\x1a/;  # skip ^Z
 195         next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char
 196
 197         if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
 198         {
 199             $cp = hex $1;
 200             push @lead_bytes,$cp;
 201             $cp2uni[$cp] = 0;
 202             next;
 203         }
 204         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 205         {
 206             $cp = hex $1;
 207             $uni = hex $2;
 208             $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 209             $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 210             next;
 211         }
 212         die "$name: Unrecognized line $_\n";
 213     }
 214 }
 215
 216
 217 ################################################################
 218 # parse the symbol.txt file, since its syntax is different from the other ones
 219 sub READ_SYMBOL_FILE
 220 {
 221     my $name = shift;
 222     open INPUT,$name or die "Cannot open $name";
 223     @cp2uni = ();
 224     @lead_bytes = ();
 225     @uni2cp = ();
 226
 227     while (<INPUT>)
 228     {
 229         next if /^\#/;  # skip comments
 230         next if /^$/;  # skip empty lines
 231         next if /\x1a/;  # skip ^Z
 232         if (/^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\#.*)?/)
 233         {
 234             $uni = hex $1;
 235             $cp = hex $2;
 236             $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 237             $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 238             next;
 239         }
 240         die "$name: Unrecognized line $_\n";
 241     }
 242 }
 243
 244
 245 ################################################################
  246 # add default mappings once the file has been read
 247 sub ADD_DEFAULT_MAPPINGS
 248 {
 249     # Apply aliases
 250
 251     foreach $alias (@unicode_aliases)
 252     {
 253         my $target = undef;
 254         foreach $src (@$alias)
 255         {
 256             if (defined($uni2cp[$src]))
 257             {
 258                 $target = $uni2cp[$src];
 259                 last;
 260             }
 261         }
 262         next unless defined($target);
 263
 264         # At least one char of the alias set is defined, set the others to the same value
 265         foreach $src (@$alias)
 266         {
 267             $uni2cp[$src] = $target unless defined($uni2cp[$src]);
 268         }
 269     }
 270
 271     # For every src -> target mapping in the defaults table,
 272     # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined
 273
 274     for ($src = 0; $src < 65536; $src++)
 275     {
 276         next if defined($uni2cp[$src]);  # source has a definition already
 277         next unless defined($unicode_defaults[$src]);  # no default for this char
 278         my $target = $unicode_defaults[$src];
 279
 280         # do a recursive mapping until we find a target char that is defined
 281         while (!defined($uni2cp[$target]) &&
 282                defined($unicode_defaults[$target])) { $target = $unicode_defaults[$target]; }
 283
 284         if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
 285     }
 286
 287     # Add an identity mapping for all undefined chars
 288
 289     for ($i = 0; $i < 256; $i++)
 290     {
 291         next if defined($cp2uni[$i]);
 292         next if defined($uni2cp[$i]);
 293         $cp2uni[$i] = $uni2cp[$i] = $i;
 294     }
 295 }
 296
 297
 298 ################################################################
 299 # dump an SBCS mapping table
 300 sub DUMP_SBCS_TABLE
 301 {
 302     my ($codepage, $name) = @_;
 303     my $x, $y;
 304
 305     # output the ascii->unicode table
 306
 307     printf OUTPUT "static const unsigned short cp2uni[256] =\n{\n    ";
 308     my $i = 0;
 309     for ($i = 0; $i < 256; $i++)
 310     {
 311         printf OUTPUT "0x%04x", (defined $cp2uni[$i] ? $cp2uni[$i] : $DEF_CHAR);
 312         if (($i % 8) != 7) { printf OUTPUT ", "; }
 313         else { print OUTPUT (($i < 255) ? ",\n    " : "\n};\n\n"); }
 314     }
 315
 316     # count the number of unicode->ascii subtables that contain something
 317
 318     my @filled = ();
 319     my $subtables = 1;
 320     for ($i = 0; $i < 65536; $i++)
 321     {
 322         next unless defined $uni2cp[$i];
 323         $filled[$i >> 8] = 1;
 324         $subtables++;
 325         $i = ($i & ~255) + 256;
 326     }
 327
 328     # output all the subtables into a single array
 329
 330     printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n   ", $subtables*256;
 331     for ($y = 0; $y < 256; $y++)
 332     {
 333         next unless $filled[$y];
 334         printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n   ", $y, $y;
 335         for ($x = 0; $x < 256; $x++)
 336         {
 337             printf OUTPUT " 0x%02x,", (defined $uni2cp[($y<<8)+$x] ?
 338                                      $uni2cp[($y<<8)+$x] : $DEF_CHAR);
 339             if (($x % 8) == 7) { printf OUTPUT "\n   "; }
 340         }
 341     }
 342     printf OUTPUT " /* defaults */\n   ";
 343     for ($x = 0; $x < 256; $x++)
 344     {
 345         printf OUTPUT " 0x%02x", $DEF_CHAR;
 346         if (($x % 8) != 7) { printf OUTPUT ","; }
 347         else { print OUTPUT (($x < 255) ? ",\n   " : "\n};\n\n"); }
 348     }
 349
 350     # output a table of the offsets of the subtables in the previous array
 351
 352     printf OUTPUT "static const unsigned short uni2cp_high[256] =\n{\n    ";
 353     my $pos = 0;
 354     for ($y = 0; $y < 256; $y++)
 355     {
 356         if ($filled[$y])
 357         {
 358             printf OUTPUT "0x%04x", $pos;
 359             $pos += 256;
 360         }
 361         else { printf OUTPUT "0x%04x", ($subtables-1) * 256; }
 362         if (($y % 8) != 7) { printf OUTPUT ", "; }
 363         else { print OUTPUT (($y < 255) ? ",\n    " : "\n};\n\n"); }
 364     }
 365
 366     # output the code page descriptor
 367
 368     printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
 369     printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
 370                   $codepage, $DEF_CHAR, $DEF_CHAR, $name;
 371     printf OUTPUT "    cp2uni,\n";
 372     printf OUTPUT "    uni2cp_low,\n";
 373     printf OUTPUT "    uni2cp_high\n};\n";
 374 }
 375
 376
 377 ################################################################
 378 # dump a DBCS mapping table
 379 sub DUMP_DBCS_TABLE
 380 {
 381     my ($codepage, $name) = @_;
 382     my $i, $x, $y;
 383
 384     # build a list of lead bytes that are actually used
 385
 386     my @lblist = ();
 387     LBLOOP: for ($y = 0; $y <= $#lead_bytes; $y++)
 388     {
 389         my $base = $lead_bytes[$y] << 8;
 390         for ($x = 0; $x < 256; $x++)
 391         {
 392             if (defined $cp2uni[$base+$x])
 393             {
 394                 push @lblist,$lead_bytes[$y];
 395                 next LBLOOP;
 396             }
 397         }
 398     }
 399     my $unused = ($#lead_bytes > $#lblist);
 400
 401     # output the ascii->unicode table for the single byte chars
 402
 403     printf OUTPUT "static const unsigned short cp2uni[%d] =\n{\n    ",
 404                   256 * ($#lblist + 2 + $unused);
 405     for ($x = 0; $x < 256; $x++)
 406     {
 407         printf OUTPUT "0x%04x", (defined $cp2uni[$x] ? $cp2uni[$x] : $DEF_CHAR);
 408         if (($x % 8) != 7) { printf OUTPUT ", "; }
 409         else { print OUTPUT ",\n    "; }
 410     }
 411
 412     # output the default table for unused lead bytes
 413
 414     if ($unused)
 415     {
 416         printf OUTPUT "/* unused lead bytes */\n    ";
 417         for ($x = 0; $x < 256; $x++)
 418         {
 419             printf OUTPUT "0x%04x", $DEF_CHAR;
 420             if (($x % 8) != 7) { printf OUTPUT ", "; }
 421             else { print OUTPUT ",\n    "; }
 422         }
 423     }
 424
 425     # output the ascii->unicode table for each DBCS lead byte
 426
 427     for ($y = 0; $y <= $#lblist; $y++)
 428     {
 429         my $base = $lblist[$y] << 8;
 430         printf OUTPUT "/* lead byte %02x */\n    ", $lblist[$y];
 431         for ($x = 0; $x < 256; $x++)
 432         {
 433             printf OUTPUT "0x%04x", (defined $cp2uni[$base+$x] ? $cp2uni[$base+$x] : $DEF_CHAR);
 434             if (($x % 8) != 7) { printf OUTPUT ", "; }
 435             else { print OUTPUT (($x < 255 || $y < $#lblist) ? ",\n    " : "\n};\n\n"); }
 436         }
 437     }
 438
 439     # output the lead byte subtables offsets
 440
 441     my @offsets = ();
 442     for ($x = 0; $x < 256; $x++) { $offsets[$x] = 0; }
 443     for ($x = 0; $x <= $#lblist; $x++) { $offsets[$lblist[$x]] = $x + 1; }
 444     if ($unused)
 445     {
 446         # increment all lead bytes offset to take into account the unused table
 447         for ($x = 0; $x <= $#lead_bytes; $x++) { $offsets[$lead_bytes[$x]]++; }
 448     }
 449
 450     printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n{\n    ";
 451     for ($x = 0; $x < 256; $x++)
 452     {
 453         printf OUTPUT "0x%02x", $offsets[$x];
 454         if (($x % 8) != 7) { printf OUTPUT ", "; }
 455         else { print OUTPUT (($x < 255) ? ",\n    " : "\n};\n\n"); }
 456     }
 457
 458     # count the number of unicode->ascii subtables that contain something
 459
 460     my @filled = ();
 461     my $subtables = 1;
 462     for ($i = 0; $i < 65536; $i++)
 463     {
 464         next unless defined $uni2cp[$i];
 465         $filled[$i >> 8] = 1;
 466         $subtables++;
 467         $i = ($i & ~255) + 256;
 468     }
 469
 470     # output all the subtables into a single array
 471
 472     printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n   ", $subtables*256;
 473     for ($y = 0; $y < 256; $y++)
 474     {
 475         next unless $filled[$y];
 476         printf OUTPUT " /* 0x%02x00 .. 0x%02xff */\n   ", $y, $y;
 477         for ($x = 0; $x < 256; $x++)
 478         {
 479             printf OUTPUT " 0x%04x,", (defined $uni2cp[($y<<8)+$x] ?
 480                                      $uni2cp[($y<<8)+$x] : $DEF_CHAR);
 481             if (($x % 8) == 7) { printf OUTPUT "\n   "; }
 482         }
 483     }
 484     printf OUTPUT " /* defaults */\n   ";
 485     for ($x = 0; $x < 256; $x++)
 486     {
 487         printf OUTPUT " 0x%04x", $DEF_CHAR;
 488         if (($x % 8) != 7) { printf OUTPUT ","; }
 489         else { print OUTPUT (($x < 255) ? ",\n   " : "\n};\n\n"); }
 490     }
 491
 492     # output a table of the offsets of the subtables in the previous array
 493
 494     printf OUTPUT "static const unsigned short uni2cp_high[256] =\n{\n    ";
 495     my $pos = 0;
 496     for ($y = 0; $y < 256; $y++)
 497     {
 498         if ($filled[$y])
 499         {
 500             printf OUTPUT "0x%04x", $pos;
 501             $pos += 256;
 502         }
 503         else { printf OUTPUT "0x%04x", ($subtables-1) * 256; }
 504
 505         if (($y % 8) != 7) { printf OUTPUT ", "; }
 506         else { print OUTPUT (($y < 255) ? ",\n    " : "\n};\n\n"); }
 507     }
 508
 509     # output the code page descriptor
 510
 511     printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
 512     printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
 513                   $codepage, $DEF_CHAR, $DEF_CHAR, $name;
 514     printf OUTPUT "    cp2uni,\n";
 515     printf OUTPUT "    cp2uni_leadbytes,\n";
 516     printf OUTPUT "    uni2cp_low,\n";
 517     printf OUTPUT "    uni2cp_high,\n";
 518     DUMP_LB_RANGES();
 519     printf OUTPUT "};\n";
 520 }
 521
 522
 523 ################################################################
 524 # dump the list of defined lead byte ranges
 525 sub DUMP_LB_RANGES
 526 {
 527     my @list = ();
 528     my $i = 0;
 529     foreach $i (@lead_bytes) { $list[$i] = 1; }
 530     my $on = 0;
 531     printf OUTPUT "    { ";
 532     for ($i = 0; $i < 256; $i++)
 533     {
 534         if ($on)
 535         {
 536             if (!defined $list[$i]) { printf OUTPUT "0x%02x, ", $i-1; $on = 0; }
 537         }
 538         else
 539         {
 540             if ($list[$i]) { printf OUTPUT "0x%02x, ", $i; $on = 1; }
 541         }
 542     }
 543     if ($on) { printf OUTPUT "0xff, "; }
 544     printf OUTPUT "0x00, 0x00 }\n";
 545 }
 546
 547
 548 ################################################################
 549 # read an input file and generate the corresponding .c file
 550 sub HANDLE_FILE
 551 {
 552     my ($codepage,$filename,$comment) = @_;
 553
 554     # symbol codepage file is special
 555     if ($codepage == 42) { READ_SYMBOL_FILE($MAPPREFIX . $filename); }
 556     else { READ_FILE($MAPPREFIX . $filename); }
 557
 558     ADD_DEFAULT_MAPPINGS();
 559
 560     my $output = sprintf "c_%03d.c", $codepage;
 561     open OUTPUT,">$output" or die "Cannot create $output";
 562
 563     printf "Building %s from %s (%s)\n", $output, $filename, $comment;
 564
 565     # dump all tables
 566
 567     printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
 568     printf OUTPUT "/* generated from %s */\n", $MAPPREFIX . $filename;
 569     printf OUTPUT "/* DO NOT EDIT!! */\n\n";
 570     printf OUTPUT "#include \"wine/unicode.h\"\n\n";
 571
 572     if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $comment ); }
 573     else { DUMP_DBCS_TABLE( $codepage, $comment ); }
 574 }
 575
 576
 577 ################################################################
 578 # output the list of codepage tables into the cptable.c file
 579 sub OUTPUT_CPTABLE
 580 {
 581     @tables_decl = ();
 582
 583     foreach $file (@allfiles)
 584     {
 585         my ($codepage,$filename,$comment) = @$file;
 586         push @tables_decl, sprintf("extern union cptable cptable_%03d;\n",$codepage);
 587     }
 588
 589     push @tables_decl, sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n",$#allfiles+1);
 590     foreach $file (@allfiles)
 591     {
 592         my ($codepage,$filename,$comment) = @$file;
 593         push @tables_decl, sprintf("    &cptable_%03d,\n", $codepage);
 594     }
 595     push @tables_decl, "};";
 596     REPLACE_IN_FILE( "cptable.c", @tables_decl );
 597 }
 598
 599 ################################################################
 600 # replace the contents of a file between ### cpmap ### marks
 601
 602 sub REPLACE_IN_FILE
 603 {
 604     my $name = shift;
 605     my @data = @_;
 606     my @lines = ();
 607     open(FILE,$name) or die "Can't open $name";
 608     while (<FILE>)
 609     {
 610         push @lines, $_;
 611         last if /\#\#\# cpmap begin \#\#\#/;
 612     }
 613     push @lines, @data;
 614     while (<FILE>)
 615     {
 616         if (/\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $_; last; }
 617     }
 618     push @lines, <FILE>;
 619     open(FILE,">$name") or die "Can't modify $name";
 620     print FILE @lines;
 621     close(FILE);
 622 }