3 # Generate code page .c files from ftp.unicode.org descriptions
5 # Copyright 2000 Alexandre Julliard
8 # base directory for ftp.unicode.org files
# (a relative path, so it is resolved against the current working directory)
9 $BASEDIR = "ftp.unicode.org/Public/";
# directory of the code page mapping files; prepended to each mapping file name before it is read
10 $MAPPREFIX = $BASEDIR . "MAPPINGS/";
# Unicode character database file, read for the decomposition field of each character
13 $UNICODEDATA = $BASEDIR . "UNIDATA/UnicodeData.txt";
# defaults file, loaded before the Unicode database when the default mappings are built
16 $DEFAULTS = "./defaults";
18 # Default char for undefined mappings
# Code page table entries (presumably the elements of @allfiles; the assignment
# line itself is not visible in this listing).
# Each entry is unpacked elsewhere in this file as ($codepage, $filename, $comment):
#   - code page number, used to name the generated c_NNN.c file and cptable_NNN symbol
#   - mapping file path, appended to $MAPPREFIX when the file is read
#   - human-readable description, written into the generated file
# Code page 42 is read with the symbol-file parser instead of the regular one.
# Note that 878 and 20866 are both generated from the same KOI8-R mapping file.
23 [ 37, "VENDORS/MICSFT/EBCDIC/CP037.TXT", "IBM EBCDIC US Canada" ],
24 [ 42, "VENDORS/ADOBE/symbol.txt", "Symbol" ],
25 [ 424, "VENDORS/MISC/CP424.TXT", "IBM EBCDIC Hebrew" ],
26 [ 437, "VENDORS/MICSFT/PC/CP437.TXT", "OEM United States" ],
27 [ 500, "VENDORS/MICSFT/EBCDIC/CP500.TXT", "IBM EBCDIC International" ],
28 [ 737, "VENDORS/MICSFT/PC/CP737.TXT", "OEM Greek 437G" ],
29 [ 775, "VENDORS/MICSFT/PC/CP775.TXT", "OEM Baltic" ],
30 [ 850, "VENDORS/MICSFT/PC/CP850.TXT", "OEM Multilingual Latin 1" ],
31 [ 852, "VENDORS/MICSFT/PC/CP852.TXT", "OEM Slovak Latin 2" ],
32 [ 855, "VENDORS/MICSFT/PC/CP855.TXT", "OEM Cyrillic" ],
33 [ 856, "VENDORS/MISC/CP856.TXT", "Hebrew PC" ],
34 [ 857, "VENDORS/MICSFT/PC/CP857.TXT", "OEM Turkish" ],
35 [ 860, "VENDORS/MICSFT/PC/CP860.TXT", "OEM Portuguese" ],
36 [ 861, "VENDORS/MICSFT/PC/CP861.TXT", "OEM Icelandic" ],
37 [ 862, "VENDORS/MICSFT/PC/CP862.TXT", "OEM Hebrew" ],
38 [ 863, "VENDORS/MICSFT/PC/CP863.TXT", "OEM Canadian French" ],
39 [ 864, "VENDORS/MICSFT/PC/CP864.TXT", "OEM Arabic" ],
40 [ 865, "VENDORS/MICSFT/PC/CP865.TXT", "OEM Nordic" ],
41 [ 866, "VENDORS/MICSFT/PC/CP866.TXT", "OEM Russian" ],
42 [ 869, "VENDORS/MICSFT/PC/CP869.TXT", "OEM Greek" ],
43 [ 874, "VENDORS/MICSFT/PC/CP874.TXT", "ANSI/OEM Thai" ],
44 [ 875, "VENDORS/MICSFT/EBCDIC/CP875.TXT", "IBM EBCDIC Greek" ],
45 [ 878, "VENDORS/MISC/KOI8-R.TXT", "Russian KOI8" ],
46 [ 932, "VENDORS/MICSFT/WINDOWS/CP932.TXT", "ANSI/OEM Japanese Shift-JIS" ],
47 [ 936, "VENDORS/MICSFT/WINDOWS/CP936.TXT", "ANSI/OEM Simplified Chinese GBK" ],
48 [ 949, "VENDORS/MICSFT/WINDOWS/CP949.TXT", "ANSI/OEM Korean Unified Hangul" ],
49 [ 950, "VENDORS/MICSFT/WINDOWS/CP950.TXT", "ANSI/OEM Traditional Chinese Big5" ],
50 [ 1006, "VENDORS/MISC/CP1006.TXT", "IBM Arabic" ],
51 [ 1026, "VENDORS/MICSFT/EBCDIC/CP1026.TXT", "IBM EBCDIC Latin 5 Turkish" ],
52 [ 1250, "VENDORS/MICSFT/WINDOWS/CP1250.TXT", "ANSI Eastern Europe" ],
53 [ 1251, "VENDORS/MICSFT/WINDOWS/CP1251.TXT", "ANSI Cyrillic" ],
54 [ 1252, "VENDORS/MICSFT/WINDOWS/CP1252.TXT", "ANSI Latin 1" ],
55 [ 1253, "VENDORS/MICSFT/WINDOWS/CP1253.TXT", "ANSI Greek" ],
56 [ 1254, "VENDORS/MICSFT/WINDOWS/CP1254.TXT", "ANSI Turkish" ],
57 [ 1255, "VENDORS/MICSFT/WINDOWS/CP1255.TXT", "ANSI Hebrew" ],
58 [ 1256, "VENDORS/MICSFT/WINDOWS/CP1256.TXT", "ANSI Arabic" ],
59 [ 1257, "VENDORS/MICSFT/WINDOWS/CP1257.TXT", "ANSI Baltic" ],
60 [ 1258, "VENDORS/MICSFT/WINDOWS/CP1258.TXT", "ANSI/OEM Viet Nam" ],
61 [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT", "Mac Roman" ],
62 [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT", "Mac Greek" ],
63 [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT", "Mac Cyrillic" ],
64 [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT", "Mac Latin 2" ],
65 [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT", "Mac Icelandic" ],
66 [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT", "Mac Turkish" ],
67 [ 20866, "VENDORS/MISC/KOI8-R.TXT", "Russian KOI8" ],
68 [ 28591, "ISO8859/8859-1.TXT", "ISO 8859-1 Latin 1" ],
69 [ 28592, "ISO8859/8859-2.TXT", "ISO 8859-2 Eastern Europe" ],
70 [ 28593, "ISO8859/8859-3.TXT", "ISO 8859-3 Turkish" ],
71 [ 28594, "ISO8859/8859-4.TXT", "ISO 8859-4 Baltic" ],
72 [ 28595, "ISO8859/8859-5.TXT", "ISO 8859-5 Cyrillic" ],
73 [ 28596, "ISO8859/8859-6.TXT", "ISO 8859-6 Arabic" ],
74 [ 28597, "ISO8859/8859-7.TXT", "ISO 8859-7 Greek" ],
75 [ 28598, "ISO8859/8859-8.TXT", "ISO 8859-8 Hebrew" ],
76 [ 28599, "ISO8859/8859-9.TXT", "ISO 8859-9 Latin 5" ]
79 ################################################################
# main loop: process every entry of @allfiles, passing the entry's elements
# (code page number, mapping file name, description) as the argument list
84 foreach $file (@allfiles) { HANDLE_FILE
( @
$file ); }
91 ################################################################
92 # read in the defaults file
# Fills two global tables:
#   @unicode_defaults - indexed by Unicode value; holds the Unicode value to fall back to
#   @unicode_aliases  - list of references to groups of Unicode values that share one mapping
# The tables are first loaded from $DEFAULTS, then completed from the
# decomposition field of $UNICODEDATA without overriding entries already set.
95 @unicode_defaults = ();
96 @unicode_aliases = ();
98 # first setup a few default mappings
100 open DEFAULTS
or die "Cannot open $DEFAULTS";
101 print "Loading $DEFAULTS\n";
104 next if /^\#/; # skip comments
105 next if /^$/; # skip empty lines
# accepted line: one or more hex source values separated by commas, whitespace,
# then the destination as a hex value, a quoted single character, or the word
# "none", then whitespace and an optional trailing comment
106 if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
108 my @src = map hex, split /,/,$1;
# more than one source value: remember the whole group as an alias set
111 if ($#src > 0) { push @unicode_aliases, \
@src; }
# "none" declares an alias set only, without any default target
112 next if ($dst eq "none");
# a quoted character is converted to its character code, anything else is parsed as hex
113 $dst = ($dst =~ /\'.\'/) ?
ord substr($dst,1,1) : hex $dst;
# a source value may be given a default only once
116 die "Duplicate value" if defined($unicode_defaults[$src]);
117 $unicode_defaults[$src] = $dst;
# anything that did not match the pattern above is a fatal error
121 die "Unrecognized line $_\n";
124 # now build mappings from the decomposition field of the Unicode database
126 open UNICODEDATA
or die "Cannot open $UNICODEDATA";
127 while (<UNICODEDATA
>)
129 # Decode the fields ... (each record is split on ';' into 15 named fields)
130 ($code, $name, $cat, $comb, $bidi,
131 $decomp, $dec, $dig, $num, $mirror,
132 $oldname, $comment, $upper, $lower, $title) = split /;/;
134 next if $decomp eq ""; # no decomposition, skip it
138 if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
140 # decomposition of the form "<foo> 1234" -> use char if type is known
141 next unless ($1 eq "font" ||
152 elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
154 # decomposition "<compat> 0020 1234" -> combining accent
157 elsif ($decomp =~ /^([0-9a-fA-F]+)/)
159 # decomposition contains only char values without prefix -> use first char
167 next if defined($unicode_defaults[$src]); # may have been set in the defaults file
# follow the chain of existing defaults starting at $dst and abort if it leads
# back to $src, so that the new entry can never create a cycle
170 for ($i = $dst; ; $i = $unicode_defaults[$i])
172 die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
173 last unless defined($unicode_defaults[$i]);
175 $unicode_defaults[$src] = $dst;
180 ################################################################
181 # parse the input file
# Recognized lines (anything else is a fatal error):
#   comment lines, empty lines and lines containing ^Z  - ignored
#   0xNN #UNDEFINED                                     - ignored
#   0xNN #DBCS LEAD BYTE                                - recorded in @lead_bytes
#   0xNN 0xNNNN followed by an optional comment         - code page <-> Unicode mapping
# NOTE(review): the two-argument open below lets mode characters in $name
# change how the file is opened; $name comes from the built-in table here.
185 open INPUT
,$name or die "Cannot open $name";
192 next if /^\#/; # skip comments
193 next if /^$/; # skip empty lines
194 next if /\x1a/; # skip ^Z
195 next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/); # undefined char
197 if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
200 push @lead_bytes,$cp;
204 if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
# the first mapping seen wins in each direction; later duplicates are ignored
208 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
209 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
212 die "$name: Unrecognized line $_\n";
217 ################################################################
218 # parse the symbol.txt file, since its syntax is different from the other ones
# Mapping lines here are two bare hex values (no "0x" prefix) followed by an
# optional comment; comment lines, empty lines and lines containing ^Z are
# ignored and anything else is a fatal error.
# NOTE(review): which of the two columns is the code page value and which is
# the Unicode value is decided by lines not visible here - confirm.
222 open INPUT
,$name or die "Cannot open $name";
229 next if /^\#/; # skip comments
230 next if /^$/; # skip empty lines
231 next if /\x1a/; # skip ^Z
232 if (/^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+(\#.*)?/)
# the first mapping seen wins in each direction; later duplicates are ignored
236 $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
237 $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
240 die "$name: Unrecognized line $_\n";
245 ################################################################
246 # add default mappings once the file has been read
# Completes the global @uni2cp / @cp2uni tables in three passes:
#   1. alias sets from @unicode_aliases
#   2. fallback chains from @unicode_defaults
#   3. identity mappings for still-undefined values below 256
# Existing entries are never overwritten.
247 sub ADD_DEFAULT_MAPPINGS
# pass 1: look for a member of each alias set that already has a code page value
251 foreach $alias (@unicode_aliases)
254 foreach $src (@
$alias)
256 if (defined($uni2cp[$src]))
258 $target = $uni2cp[$src];
# no member of this alias set is mapped in this code page: nothing to propagate
262 next unless defined($target);
264 # At least one char of the alias set is defined, set the others to the same value
265 foreach $src (@
$alias)
267 $uni2cp[$src] = $target unless defined($uni2cp[$src]);
271 # For every src -> target mapping in the defaults table,
272 # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined
274 for ($src = 0; $src < 65536; $src++)
276 next if defined($uni2cp[$src]); # source has a definition already
277 next unless defined($unicode_defaults[$src]); # no default for this char
278 my $target = $unicode_defaults[$src];
280 # follow the chain of defaults until we find a target char that is defined
281 while (!defined($uni2cp[$target]) &&
282 defined($unicode_defaults[$target])) { $target = $unicode_defaults[$target]; }
# the chain may end on a char that is still unmapped; then $src stays unmapped too
284 if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
287 # Add an identity mapping for all undefined chars
# (only values 0..255, and only when the value is unused in both directions)
289 for ($i = 0; $i < 256; $i++)
291 next if defined($cp2uni[$i]);
292 next if defined($uni2cp[$i]);
293 $cp2uni[$i] = $uni2cp[$i] = $i;
298 ################################################################
299 # dump an SBCS mapping table
# Arguments: ($codepage, $name) - the code page number and the description
# string stored in the generated descriptor.
# Writes C source to the OUTPUT handle: a 256-entry cp2uni array, the
# two-level uni2cp_low / uni2cp_high lookup arrays and the sbcs_table
# descriptor. Reads the global @cp2uni / @uni2cp tables and $DEF_CHAR.
302 my ($codepage, $name) = @_;
305 # output the ascii->unicode table
307 printf OUTPUT
"static const unsigned short cp2uni[256] =\n{\n ";
# eight values per output line; undefined entries are written as $DEF_CHAR
309 for ($i = 0; $i < 256; $i++)
311 printf OUTPUT
"0x%04x", (defined $cp2uni[$i] ?
$cp2uni[$i] : $DEF_CHAR);
312 if (($i % 8) != 7) { printf OUTPUT
", "; }
313 else { print OUTPUT
(($i < 255) ?
",\n " : "\n};\n\n"); }
316 # count the number of unicode->ascii subtables that contain something
# a subtable covers the 256 Unicode values sharing the same high byte
320 for ($i = 0; $i < 65536; $i++)
322 next unless defined $uni2cp[$i];
323 $filled[$i >> 8] = 1;
# jump to the start of the next 256-value block once this one is known to be used
# NOTE(review): if this assignment runs inside the for loop above (the loop
# delimiters are not visible here), the loop's own $i++ then lands on offset 1
# of the next block, so offset 0 of that block is never tested - confirm.
325 $i = ($i & ~255) + 256;
328 # output all the subtables into a single array
330 printf OUTPUT
"static const unsigned char uni2cp_low[%d] =\n{\n ", $subtables*256;
# one 256-entry subtable per used high byte, in increasing high byte order
331 for ($y = 0; $y < 256; $y++)
333 next unless $filled[$y];
334 printf OUTPUT
" /* 0x%02x00 .. 0x%02xff */\n ", $y, $y;
335 for ($x = 0; $x < 256; $x++)
337 printf OUTPUT
" 0x%02x,", (defined $uni2cp[($y<<8)+$x] ?
338 $uni2cp[($y<<8)+$x] : $DEF_CHAR);
339 if (($x % 8) == 7) { printf OUTPUT
"\n "; }
# last subtable: 256 copies of $DEF_CHAR, shared by all unused high bytes
342 printf OUTPUT
" /* defaults */\n ";
343 for ($x = 0; $x < 256; $x++)
345 printf OUTPUT
" 0x%02x", $DEF_CHAR;
346 if (($x % 8) != 7) { printf OUTPUT
","; }
347 else { print OUTPUT
(($x < 255) ?
",\n " : "\n};\n\n"); }
350 # output a table of the offsets of the subtables in the previous array
352 printf OUTPUT
"static const unsigned short uni2cp_high[256] =\n{\n ";
354 for ($y = 0; $y < 256; $y++)
358 printf OUTPUT
"0x%04x", $pos;
# ($subtables-1) * 256 is the offset of the final "defaults" subtable
361 else { printf OUTPUT
"0x%04x", ($subtables-1) * 256; }
362 if (($y % 8) != 7) { printf OUTPUT
", "; }
363 else { print OUTPUT
(($y < 255) ?
",\n " : "\n};\n\n"); }
366 # output the code page descriptor
# header fields: code page number, the constant 1, $DEF_CHAR twice, description
368 printf OUTPUT
"const struct sbcs_table cptable_%03d =\n{\n", $codepage;
369 printf OUTPUT
" { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
370 $codepage, $DEF_CHAR, $DEF_CHAR, $name;
371 printf OUTPUT
" cp2uni,\n";
372 printf OUTPUT
" uni2cp_low,\n";
373 printf OUTPUT
" uni2cp_high\n};\n";
377 ################################################################
378 # dump a DBCS mapping table
# Arguments: ($codepage, $name) - the code page number and the description
# string stored in the generated descriptor.
# Writes C source to the OUTPUT handle: the cp2uni array (single-byte table
# followed by one 256-entry table per lead byte), the cp2uni_leadbytes index,
# the two-level uni2cp_low / uni2cp_high arrays and the dbcs_table descriptor.
# Reads the global @cp2uni, @uni2cp and @lead_bytes tables and $DEF_CHAR.
381 my ($codepage, $name) = @_;
384 # build a list of lead bytes that are actually used
# (a lead byte counts as used when at least one of its 256 trail values is mapped)
387 LBLOOP
: for ($y = 0; $y <= $#lead_bytes; $y++)
389 my $base = $lead_bytes[$y] << 8;
390 for ($x = 0; $x < 256; $x++)
392 if (defined $cp2uni[$base+$x])
394 push @lblist,$lead_bytes[$y];
# true when some declared lead bytes have no mapping at all
399 my $unused = ($#lead_bytes > $#lblist);
401 # output the ascii->unicode table for the single byte chars
# array size: one 256-entry table for single bytes, one per used lead byte
# ($#lblist + 1 of them), plus one shared table if there are unused lead bytes
403 printf OUTPUT
"static const unsigned short cp2uni[%d] =\n{\n ",
404 256 * ($#lblist + 2 + $unused);
405 for ($x = 0; $x < 256; $x++)
407 printf OUTPUT
"0x%04x", (defined $cp2uni[$x] ?
$cp2uni[$x] : $DEF_CHAR);
408 if (($x % 8) != 7) { printf OUTPUT
", "; }
409 else { print OUTPUT
",\n "; }
412 # output the default table for unused lead bytes
416 printf OUTPUT
"/* unused lead bytes */\n ";
417 for ($x = 0; $x < 256; $x++)
419 printf OUTPUT
"0x%04x", $DEF_CHAR;
420 if (($x % 8) != 7) { printf OUTPUT
", "; }
421 else { print OUTPUT
",\n "; }
425 # output the ascii->unicode table for each DBCS lead byte
427 for ($y = 0; $y <= $#lblist; $y++)
429 my $base = $lblist[$y] << 8;
430 printf OUTPUT
"/* lead byte %02x */\n ", $lblist[$y];
431 for ($x = 0; $x < 256; $x++)
433 printf OUTPUT
"0x%04x", (defined $cp2uni[$base+$x] ?
$cp2uni[$base+$x] : $DEF_CHAR);
434 if (($x % 8) != 7) { printf OUTPUT
", "; }
# the array is closed only after the last value of the last lead byte table
435 else { print OUTPUT
(($x < 255 || $y < $#lblist) ?
",\n " : "\n};\n\n"); }
439 # output the lead byte subtables offsets
# offset 0 means "not a lead byte"; used lead bytes are numbered from 1 in @lblist order
442 for ($x = 0; $x < 256; $x++) { $offsets[$x] = 0; }
443 for ($x = 0; $x <= $#lblist; $x++) { $offsets[$lblist[$x]] = $x + 1; }
446 # increment the offset of every lead byte to take the unused table into account
# NOTE(review): presumably done only when $unused is set; the guarding line is not visible here
447 for ($x = 0; $x <= $#lead_bytes; $x++) { $offsets[$lead_bytes[$x]]++; }
450 printf OUTPUT
"static const unsigned char cp2uni_leadbytes[256] =\n{\n ";
451 for ($x = 0; $x < 256; $x++)
453 printf OUTPUT
"0x%02x", $offsets[$x];
454 if (($x % 8) != 7) { printf OUTPUT
", "; }
455 else { print OUTPUT
(($x < 255) ?
",\n " : "\n};\n\n"); }
458 # count the number of unicode->ascii subtables that contain something
# a subtable covers the 256 Unicode values sharing the same high byte
462 for ($i = 0; $i < 65536; $i++)
464 next unless defined $uni2cp[$i];
465 $filled[$i >> 8] = 1;
# jump to the start of the next 256-value block once this one is known to be used
# NOTE(review): if this assignment runs inside the for loop above (the loop
# delimiters are not visible here), the loop's own $i++ then lands on offset 1
# of the next block, so offset 0 of that block is never tested - confirm.
467 $i = ($i & ~255) + 256;
470 # output all the subtables into a single array
# entries are 16 bits wide here because a DBCS code page value can take two bytes
472 printf OUTPUT
"static const unsigned short uni2cp_low[%d] =\n{\n ", $subtables*256;
473 for ($y = 0; $y < 256; $y++)
475 next unless $filled[$y];
476 printf OUTPUT
" /* 0x%02x00 .. 0x%02xff */\n ", $y, $y;
477 for ($x = 0; $x < 256; $x++)
479 printf OUTPUT
" 0x%04x,", (defined $uni2cp[($y<<8)+$x] ?
480 $uni2cp[($y<<8)+$x] : $DEF_CHAR);
481 if (($x % 8) == 7) { printf OUTPUT
"\n "; }
# last subtable: 256 copies of $DEF_CHAR, shared by all unused high bytes
484 printf OUTPUT
" /* defaults */\n ";
485 for ($x = 0; $x < 256; $x++)
487 printf OUTPUT
" 0x%04x", $DEF_CHAR;
488 if (($x % 8) != 7) { printf OUTPUT
","; }
489 else { print OUTPUT
(($x < 255) ?
",\n " : "\n};\n\n"); }
492 # output a table of the offsets of the subtables in the previous array
494 printf OUTPUT
"static const unsigned short uni2cp_high[256] =\n{\n ";
496 for ($y = 0; $y < 256; $y++)
500 printf OUTPUT
"0x%04x", $pos;
# ($subtables-1) * 256 is the offset of the final "defaults" subtable
503 else { printf OUTPUT
"0x%04x", ($subtables-1) * 256; }
505 if (($y % 8) != 7) { printf OUTPUT
", "; }
506 else { print OUTPUT
(($y < 255) ?
",\n " : "\n};\n\n"); }
509 # output the code page descriptor
# header fields: code page number, the constant 2, $DEF_CHAR twice, description
511 printf OUTPUT
"const struct dbcs_table cptable_%03d =\n{\n", $codepage;
512 printf OUTPUT
" { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
513 $codepage, $DEF_CHAR, $DEF_CHAR, $name;
514 printf OUTPUT
" cp2uni,\n";
515 printf OUTPUT
" cp2uni_leadbytes,\n";
516 printf OUTPUT
" uni2cp_low,\n";
517 printf OUTPUT
" uni2cp_high,\n";
519 printf OUTPUT
"};\n";
523 ################################################################
524 # dump the list of defined lead byte ranges
# Output is a sequence of "first, last" byte pairs, one pair per run of
# consecutive lead bytes, closed by a "0x00, 0x00" pair.
# mark every lead byte in a lookup table indexed by byte value
529 foreach $i (@lead_bytes) { $list[$i] = 1; }
532 for ($i = 0; $i < 256; $i++)
# inside a run: the first byte that is not a lead byte ends it at $i-1
536 if (!defined $list[$i]) { printf OUTPUT
"0x%02x, ", $i-1; $on = 0; }
# outside a run: a lead byte starts a new one
540 if ($list[$i]) { printf OUTPUT
"0x%02x, ", $i; $on = 1; }
# a run still open after the loop extends to 0xff
543 if ($on) { printf OUTPUT
"0xff, "; }
544 printf OUTPUT
"0x00, 0x00 }\n";
548 ################################################################
549 # read an input file and generate the corresponding .c file
# Arguments: ($codepage, $filename, $comment) - the code page number, the
# mapping file path relative to $MAPPREFIX, and the description string.
# Creates c_NNN.c (NNN = code page number, at least three digits) in the
# current directory and dies if the file cannot be created.
552 my ($codepage,$filename,$comment) = @_;
554 # symbol codepage file is special
555 if ($codepage == 42) { READ_SYMBOL_FILE
($MAPPREFIX . $filename); }
556 else { READ_FILE
($MAPPREFIX . $filename); }
# complete the tables with alias, default and identity mappings before dumping them
558 ADD_DEFAULT_MAPPINGS
();
560 my $output = sprintf "c_%03d.c", $codepage;
561 open OUTPUT
,">$output" or die "Cannot create $output";
563 printf "Building %s from %s (%s)\n", $output, $filename, $comment;
# file header of the generated C source
567 printf OUTPUT
"/* code page %03d (%s) */\n", $codepage, $comment;
568 printf OUTPUT
"/* generated from %s */\n", $MAPPREFIX . $filename;
569 printf OUTPUT
"/* DO NOT EDIT!! */\n\n";
570 printf OUTPUT
"#include \"wine/unicode.h\"\n\n";
# no lead bytes recorded while reading the file means a single-byte code page
572 if ($#lead_bytes == -1) { DUMP_SBCS_TABLE
( $codepage, $comment ); }
573 else { DUMP_DBCS_TABLE
( $codepage, $comment ); }
577 ################################################################
578 # output the list of codepage tables into the cptable.c file
# Builds the replacement text in @tables_decl: one extern declaration per code
# page, then the cptables[] array of pointers to them in @allfiles order, and
# hands it to REPLACE_IN_FILE.
583 foreach $file (@allfiles)
585 my ($codepage,$filename,$comment) = @
$file;
586 push @tables_decl, sprintf("extern union cptable cptable_%03d;\n",$codepage);
# the array is sized to the number of entries in @allfiles
589 push @tables_decl, sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n",$#allfiles+1);
590 foreach $file (@allfiles)
592 my ($codepage,$filename,$comment) = @
$file;
593 push @tables_decl, sprintf(" &cptable_%03d,\n", $codepage);
595 push @tables_decl, "};";
596 REPLACE_IN_FILE
( "cptable.c", @tables_decl );
599 ################################################################
600 # replace the contents of a file between ### cpmap ### marks
# The file is read first and then reopened for writing under the same name.
# NOTE(review): the rest of this routine lies beyond the visible lines; how
# the collected lines are written back cannot be confirmed from here.
607 open(FILE
,$name) or die "Can't open $name";
# stop the first scan at the line carrying the "cpmap begin" marker
611 last if /\#\#\# cpmap begin \#\#\#/;
# on the "cpmap end" marker, keep a newline and the marker line itself, then stop scanning
616 if (/\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $_; last; }
619 open(FILE
,">$name") or die "Can't modify $name";