3 # Produce a codepage matching table. For each 8-bit character, list
4 # a primary and an alternate match (the latter used for case-insensitive
8 # cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
10 # Note: for the format of the UnicodeData file, see:
11 # http://www.unicode.org/Public/UNIDATA/UCD.html
14 ($ucd, $cpco, $cpfs, $cpout) = @ARGV;
16 if (!defined($cpout)) {
17 die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
26 or die "$0: could not open unicode data: $ucd: $!\n";
27 while (defined($line = <UCD
>)) {
29 @f = split(/;/, $line);
31 $ucase{$n} = ($f[12] ne '') ?
hex $f[12] : $n;
32 $lcase{$n} = ($f[13] ne '') ?
hex $f[13] : $n;
33 $tcase{$n} = ($f[14] ne '') ?
hex $f[14] : $n;
34 if ($f[5] =~ /^[0-9A-F\s]+$/) {
35 # This character has a canonical decomposition.
36 # The regular expression rejects angle brackets, so other
37 # decompositions aren't permitted.
39 foreach my $dch (split(' ', $f[5])) {
40 push(@
{$decomp{$n}}, hex $dch);
47 # Filesystem and console codepages. The filesystem codepage is used
48 # for FAT shortnames, whereas the console codepage is whatever is used
49 # on the screen and keyboard.
51 @xtab = (undef) x
256;
53 open(CPFS
, '<', $cpfs)
54 or die "$0: could not open fs codepage: $cpfs: $!\n";
55 while (defined($line = <CPFS
>)) {
56 $line =~ s/\s*(\#.*|)$//;
57 @f = split(/\s+/, $line);
58 next if (scalar @f != 2);
59 next if (hex $f[0] > 255);
60 $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
61 $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
65 @ytab = (undef) x
256;
67 open(CPCO
, '<', $cpco)
68 or die "$0: could not open console codepage: $cpco: $!\n";
69 while (defined($line = <CPCO
>)) {
70 $line =~ s/\s*(\#.*|)$//;
71 @f = split(/\s+/, $line);
72 next if (scalar @f != 2);
73 next if (hex $f[0] > 255);
74 $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
75 $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
79 open(CPOUT
, '>', $cpout)
80 or die "$0: could not open output file: $cpout: $!\n";
82 # Magic number, in anticipation of being able to load these
83 # files dynamically...
85 print CPOUT
pack("VV", 0x58a8b3d4, 0x51d21eb1);
87 # Header fields available for future use...
88 print CPOUT
pack("VVVVVV", 0, 0, 0, 0, 0, 0);
91 # Self (shortname) uppercase table.
92 # This depends both on the console codepage and the filesystem codepage;
93 # the logical transcoding operation is:
95 # $tabx{$ucase{$ytab[$i]}}
97 # ... where @ytab is console codepage -> Unicode and
98 # %tabx is Unicode -> filesystem codepage.
100 @uctab = (undef) x
256;
101 for ($i = 0; $i < 256; $i++) {
102 $uuc = $ucase{$ytab[$i]}; # Unicode upper case
103 if (defined($tabx{$uuc})) {
104 # Straight-forward conversion
106 } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
107 # Upper case equivalent stripped of accents
108 $u = $tabx{${$decomp{$uuc}}[0]};
110 # No equivalent at all found. Assume it is a lower-case-only
111 # character, like greek alpha in CP437.
115 print CPOUT
pack("C", $u);
119 # Self (shortname) lowercase table.
120 # This depends both on the console codepage and the filesystem codepage;
121 # the logical transcoding operation is:
123 # $taby{$lcase{$xtab[$i]}}
125 # ... where @ytab is console codepage -> Unicode and
126 # %tabx is Unicode -> filesystem codepage.
128 @lctab = (undef) x
256;
129 for ($i = 0; $i < 256; $i++) {
130 $llc = $lcase{$xtab[$i]}; # Unicode lower case
131 if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
132 # Straight-forward conversion
133 } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
134 # Lower case equivalent stripped of accents
136 # No equivalent at all found. Find *anything* that matches the
137 # bijection criterion...
138 for ($l = 0; $l < 256; $l++) {
139 last if ($uctab[$l] == $i);
141 $l = $i if ($l == 256); # If nothing, we're screwed anyway...
144 print CPOUT
pack("C", $l);
148 # Unicode (longname) matching table.
149 # This only depends on the console codepage.
151 $pp0 = ''; $pp1 = '';
152 for ($i = 0; $i < 256; $i++) {
153 if (!defined($ytab[$i])) {
157 if ($ucase{$p0} != $p0) {
159 } elsif ($lcase{$p0} != $p0) {
161 } elsif ($tcase{$p0} != $p0) {
167 # Only the BMP is supported...
168 $p0 = 0xffff if ($p0 > 0xffff);
169 $p1 = 0xffff if ($p1 > 0xffff);
170 $pp0 .= pack("v", $p0);
171 $pp1 .= pack("v", $p1);
173 print CPOUT
$pp0, $pp1;