codepage/cptable.pl

   1 #!/usr/bin/perl
   2 #
   3 # Produce a codepage matching table.  For each 8-bit character, list
   4 # a primary and an alternate match (the latter used for case-insensitive
   5 # matching.)
   6 #
   7 # Usage:
   8 #       cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
   9 #
  10 # Note: for the format of the UnicodeData file, see:
  11 # http://www.unicode.org/Public/UNIDATA/UCD.html
  12 #
  13
  14 ($ucd, $cpco, $cpfs, $cpout) = @ARGV;
  15
  16 if (!defined($cpout)) {
  17     die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
  18 }
  19
  20 %ucase   = ();
  21 %lcase   = ();
  22 %tcase   = ();
  23 %decomp  = ();
  24
  25 open(UCD, '<', $ucd)
  26     or die "$0: could not open unicode data: $ucd: $!\n";
  27 while (defined($line = <UCD>)) {
  28     chomp $line;
  29     @f = split(/;/, $line);
  30     $n = hex $f[0];
  31     $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
  32     $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
  33     $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
  34     if ($f[5] =~ /^[0-9A-F\s]+$/) {
  35         # This character has a canonical decomposition.
  36         # The regular expression rejects angle brackets, so other
  37         # decompositions aren't permitted.
  38         $decomp{$n} = [];
  39         foreach my $dch (split(' ', $f[5])) {
  40             push(@{$decomp{$n}}, hex $dch);
  41         }
  42     }
  43 }
  44 close(UCD);
  45
  46 #
  47 # Filesystem and console codepages.  The filesystem codepage is used
  48 # for FAT shortnames, whereas the console codepage is whatever is used
  49 # on the screen and keyboard.
  50 #
  51 @xtab = (undef) x 256;
  52 %tabx = ();
  53 open(CPFS, '<', $cpfs)
  54     or die "$0: could not open fs codepage: $cpfs: $!\n";
  55 while (defined($line = <CPFS>)) {
  56     $line =~ s/\s*(\#.*|)$//;
  57     @f = split(/\s+/, $line);
  58     next if (scalar @f != 2);
  59     next if (hex $f[0] > 255);
  60     $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
  61     $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
  62 }
  63 close(CPFS);
  64
  65 @ytab = (undef) x 256;
  66 %taby = ();
  67 open(CPCO, '<', $cpco)
  68     or die "$0: could not open console codepage: $cpco: $!\n";
  69 while (defined($line = <CPCO>)) {
  70     $line =~ s/\s*(\#.*|)$//;
  71     @f = split(/\s+/, $line);
  72     next if (scalar @f != 2);
  73     next if (hex $f[0] > 255);
  74     $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
  75     $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
  76 }
  77 close(CPCO);
  78
  79 open(CPOUT, '>', $cpout)
  80     or die "$0: could not open output file: $cpout: $!\n";
  81 #
  82 # Magic number, in anticipation of being able to load these
  83 # files dynamically...
  84 #
  85 print CPOUT pack("VV", 0x8fad232b, 0x9c295319);
  86
  87 # Header fields available for future use...
  88 print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
  89
  90 #
  91 # Self (shortname) uppercase table.
  92 # This depends both on the console codepage and the filesystem codepage;
  93 # the logical transcoding operation is:
  94 #
  95 # $tabx{$ucase{$ytab[$i]}}
  96 #
  97 # ... where @ytab is console codepage -> Unicode and
  98 # %tabx is Unicode -> filesystem codepage.
  99 #
 100 for ($i = 0; $i < 256; $i++) {
 101     $uuc = $ucase{$ytab[$i]};   # Unicode upper case
 102     if (defined($tabx{$uuc})) {
 103         # Straight-forward conversion
 104         $u = $tabx{$uuc};
 105     } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
 106         # Upper case equivalent stripped of accents
 107         $u = $tabx{${$decomp{$uuc}}[0]};
 108     } else {
 109         # No equivalent at all found.  Set this to zero, which should
 110         # prevent shortname matching altogether (still making longname
 111         # matching possible, of course.)
 112         $u = 0;
 113     }
 114     print CPOUT pack("C", $u);
 115 }
 116
 117 #
 118 # Unicode (longname) matching table.
 119 # This only depends on the console codepage.
 120 #
 121 for ($i = 0; $i < 256; $i++) {
 122     if (!defined($ytab[$i])) {
 123         $p0 = $p1 = 0xffff;
 124     } else {
 125         $p0 = $ytab[$i];
 126         if ($ucase{$p0} != $p0) {
 127             $p1 = $ucase{$p0};
 128         } elsif ($lcase{$p0} != $p0) {
 129             $p1 = $lcase{$p0};
 130         } elsif ($tcase{$p0} != $p0) {
 131             $p1 = $tcase{$p0};
 132         } else {
 133             $p1 = $p0;
 134         }
 135     }
 136     # Only the BMP is supported...
 137     $p0 = 0xffff if ($p0 > 0xffff);
 138     $p1 = 0xffff if ($p1 > 0xffff);
 139     print CPOUT pack("vv", $p0, $p1);
 140 }
 141 close (CPOUT);
 142
 143