bin/enc2xs

   1 #!/usr/bin/perl
   2     eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
   3         if $running_under_some_shell;
   4 #!./perl
   5 BEGIN {
   6     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   7     # with $ENV{PERL_CORE} set
   8     # In case we need it in future...
   9     require Config; import Config;
  10 }
  11 use strict;
  12 use warnings;
  13 use Getopt::Std;
  14 my @orig_ARGV = @ARGV;
  15 our $VERSION  = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  16
  17 # These may get re-ordered.
  18 # RAW is a do_now as inserted by &enter
  19 # AGG is an aggreagated do_now, as built up by &process
  20
  21 use constant {
  22   RAW_NEXT => 0,
  23   RAW_IN_LEN => 1,
  24   RAW_OUT_BYTES => 2,
  25   RAW_FALLBACK => 3,
  26
  27   AGG_MIN_IN => 0,
  28   AGG_MAX_IN => 1,
  29   AGG_OUT_BYTES => 2,
  30   AGG_NEXT => 3,
  31   AGG_IN_LEN => 4,
  32   AGG_OUT_LEN => 5,
  33   AGG_FALLBACK => 6,
  34 };
  35
  36 # (See the algorithm in encengine.c - we're building structures for it)
  37
  38 # There are two sorts of structures.
  39 # "do_now" (an array, two variants of what needs storing) is whatever we need
  40 # to do now we've read an input byte.
  41 # It's housed in a "do_next" (which is how we got to it), and in turn points
  42 # to a "do_next" which contains all the "do_now"s for the next input byte.
  43
  44 # There will be a "do_next" which is the start state.
  45 # For a single byte encoding it's the only "do_next" - each "do_now" points
  46 # back to it, and each "do_now" will cause bytes. There is no state.
  47
  48 # For a multi-byte encoding where all characters in the input are the same
  49 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  50 # branching out from the start state, one step for each input byte.
  51 # The leaf "do_now"s will all be at the same distance from the start state,
  52 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  53 # the start state.
  54
  55 # For an encoding where there are varaible length input byte sequences, you
  56 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  57 # as before the leaves will point back to the start state.
  58
  59 # The system will cope with escape encodings (imagine them as a mostly
  60 # self-contained tree for each escape state, and cross links between trees
  61 # at the state-switching characters) but so far no input format defines these.
  62
  63 # The system will also cope with having output "leaves" in the middle of
  64 # the bifurcating branches, not just at the extremities, but again no
  65 # input format does this yet.
  66
  67 # There are two variants of the "do_now" structure. The first, smaller variant
  68 # is generated by &enter as the input file is read. There is one structure
  69 # for each input byte. Say we are mapping a single byte encoding to a
  70 # single byte encoding, with  "ABCD" going "abcd". There will be
  71 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  72
  73 # &process then walks the tree, building aggregate "do_now" structres for
  74 # adjacent bytes where possible. The aggregate is for a contiguous range of
  75 # bytes which each produce the same length of output, each move to the
  76 # same next state, and each have the same fallback flag.
  77 # So our 4 RAW "do_now"s above become replaced by a single structure
  78 # containing:
  79 # ["A", "D", "abcd", 1, ...]
  80 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  81 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  82 # which maps very nicely into pointer arithmetic in C for encengine.c
  83
  84 sub encode_U
  85 {
  86  # UTF-8 encode long hand - only covers part of perl's range
  87  ## my $uv = shift;
  88  # chr() works in native space so convert value from table
  89  # into that space before using chr().
  90  my $ch = chr(utf8::unicode_to_native($_[0]));
  91  # Now get core perl to encode that the way it likes.
  92  utf8::encode($ch);
  93  return $ch;
  94 }
  95
  96 sub encode_S
  97 {
  98  # encode single byte
  99  ## my ($ch,$page) = @_; return chr($ch);
 100  return chr $_[0];
 101 }
 102
 103 sub encode_D
 104 {
 105  # encode double byte MS byte first
 106  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 107  return chr ($_[1]) . chr $_[0];
 108 }
 109
 110 sub encode_M
 111 {
 112  # encode Multi-byte - single for 0..255 otherwise double
 113  ## my ($ch,$page) = @_;
 114  ## return &encode_D if $page;
 115  ## return &encode_S;
 116  return chr ($_[1]) . chr $_[0] if $_[1];
 117  return chr $_[0];
 118 }
 119
 120 my %encode_types = (U => \&encode_U,
 121                     S => \&encode_S,
 122                     D => \&encode_D,
 123                     M => \&encode_M,
 124                    );
 125
 126 # Win32 does not expand globs on command line
 127 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 128
 129 my %opt;
 130 # I think these are:
 131 # -Q to disable the duplicate codepoint test
 132 # -S make mapping errors fatal
 133 # -q to remove comments written to output files
 134 # -O to enable the (brute force) substring optimiser
 135 # -o <output> to specify the output file name (else it's the first arg)
 136 # -f <inlist> to give a file with a list of input files (else use the args)
 137 # -n <name> to name the encoding (else use the basename of the input file.
 138 getopts('CM:SQqOo:f:n:',\%opt);
 139
 140 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 141 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 142
 143 # This really should go first, else the die here causes empty (non-erroneous)
 144 # output files to be written.
 145 my @encfiles;
 146 if (exists $opt{'f'}) {
 147     # -F is followed by name of file containing list of filenames
 148     my $flist = $opt{'f'};
 149     open(FLIST,$flist) || die "Cannot open $flist:$!";
 150     chomp(@encfiles = <FLIST>);
 151     close(FLIST);
 152 } else {
 153     @encfiles = @ARGV;
 154 }
 155
 156 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 157 chmod(0666,$cname) if -f $cname && !-w $cname;
 158 open(C,">$cname") || die "Cannot open $cname:$!";
 159
 160 my $dname = $cname;
 161 my $hname = $cname;
 162
 163 my ($doC,$doEnc,$doUcm,$doPet);
 164
 165 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
 166  {
 167   $doC = 1;
 168   $dname =~ s/(\.[^\.]*)?$/.exh/;
 169   chmod(0666,$dname) if -f $cname && !-w $dname;
 170   open(D,">$dname") || die "Cannot open $dname:$!";
 171   $hname =~ s/(\.[^\.]*)?$/.h/;
 172   chmod(0666,$hname) if -f $cname && !-w $hname;
 173   open(H,">$hname") || die "Cannot open $hname:$!";
 174
 175   foreach my $fh (\*C,\*D,\*H)
 176   {
 177    print $fh <<"END" unless $opt{'q'};
 178 /*
 179  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 180  This file was autogenerated by:
 181  $^X $0 @orig_ARGV
 182 */
 183 END
 184   }
 185
 186   if ($cname =~ /(\w+)\.xs$/)
 187    {
 188     print C "#include <EXTERN.h>\n";
 189     print C "#include <perl.h>\n";
 190     print C "#include <XSUB.h>\n";
 191     print C "#define U8 U8\n";
 192    }
 193   print C "#include \"encode.h\"\n\n";
 194
 195  }
 196 elsif ($cname =~ /\.enc$/)
 197  {
 198   $doEnc = 1;
 199  }
 200 elsif ($cname =~ /\.ucm$/)
 201  {
 202   $doUcm = 1;
 203  }
 204 elsif ($cname =~ /\.pet$/)
 205  {
 206   $doPet = 1;
 207  }
 208
 209 my %encoding;
 210 my %strings;
 211 my $string_acc;
 212 my %strings_in_acc;
 213
 214 my $saved = 0;
 215 my $subsave = 0;
 216 my $strings = 0;
 217
 218 sub cmp_name
 219 {
 220  if ($a =~ /^.*-(\d+)/)
 221   {
 222    my $an = $1;
 223    if ($b =~ /^.*-(\d+)/)
 224     {
 225      my $r = $an <=> $1;
 226      return $r if $r;
 227     }
 228   }
 229  return $a cmp $b;
 230 }
 231
 232
 233 foreach my $enc (sort cmp_name @encfiles)
 234  {
 235   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 236   $name = $opt{'n'} if exists $opt{'n'};
 237   if (open(E,$enc))
 238    {
 239     if ($sfx eq 'enc')
 240      {
 241       compile_enc(\*E,lc($name));
 242      }
 243     else
 244      {
 245       compile_ucm(\*E,lc($name));
 246      }
 247    }
 248   else
 249    {
 250     warn "Cannot open $enc for $name:$!";
 251    }
 252  }
 253
 254 if ($doC)
 255  {
 256   print STDERR "Writing compiled form\n";
 257   foreach my $name (sort cmp_name keys %encoding)
 258    {
 259     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 260     process($name.'_utf8',$e2u);
 261     addstrings(\*C,$e2u);
 262
 263     process('utf8_'.$name,$u2e);
 264     addstrings(\*C,$u2e);
 265    }
 266   outbigstring(\*C,"enctable");
 267   foreach my $name (sort cmp_name keys %encoding)
 268    {
 269     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 270     outtable(\*C,$e2u, "enctable");
 271     outtable(\*C,$u2e, "enctable");
 272
 273     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 274    }
 275   foreach my $enc (sort cmp_name keys %encoding)
 276    {
 277     # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 278     my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
 279     #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 280     my $replen = 0;
 281     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
 282     my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
 283     my $sym = "${enc}_encoding";
 284     $sym =~ s/\W+/_/g;
 285     print C "encode_t $sym = \n";
 286     # This is to make null encoding work -- dankogai
 287     for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
 288         $info[$i] ||= 1;
 289     }
 290     # end of null tweak -- dankogai
 291     print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
 292    }
 293
 294   foreach my $enc (sort cmp_name keys %encoding)
 295    {
 296     my $sym = "${enc}_encoding";
 297     $sym =~ s/\W+/_/g;
 298     print H "extern encode_t $sym;\n";
 299     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 300    }
 301
 302   if ($cname =~ /(\w+)\.xs$/)
 303    {
 304     my $mod = $1;
 305     print C <<'END';
 306
 307 static void
 308 Encode_XSEncoding(pTHX_ encode_t *enc)
 309 {
 310  dSP;
 311  HV *stash = gv_stashpv("Encode::XS", TRUE);
 312  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 313  int i = 0;
 314  PUSHMARK(sp);
 315  XPUSHs(sv);
 316  while (enc->name[i])
 317   {
 318    const char *name = enc->name[i++];
 319    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 320   }
 321  PUTBACK;
 322  call_pv("Encode::define_encoding",G_DISCARD);
 323  SvREFCNT_dec(sv);
 324 }
 325
 326 END
 327
 328     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 329     print C "BOOT:\n{\n";
 330     print C "#include \"$dname\"\n";
 331     print C "}\n";
 332    }
 333   # Close in void context is bad, m'kay
 334   close(D) or warn "Error closing '$dname': $!";
 335   close(H) or warn "Error closing '$hname': $!";
 336
 337   my $perc_saved    = $saved/($strings + $saved) * 100;
 338   my $perc_subsaved = $subsave/($strings + $subsave) * 100;
 339   printf STDERR "%d bytes in string tables\n",$strings;
 340   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 341     $saved, $perc_saved              if $saved;
 342   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 343     $subsave, $perc_subsaved         if $subsave;
 344  }
 345 elsif ($doEnc)
 346  {
 347   foreach my $name (sort cmp_name keys %encoding)
 348    {
 349     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 350     output_enc(\*C,$name,$e2u);
 351    }
 352  }
 353 elsif ($doUcm)
 354  {
 355   foreach my $name (sort cmp_name keys %encoding)
 356    {
 357     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 358     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 359    }
 360  }
 361
 362 # writing half meg files and then not checking to see if you just filled the
 363 # disk is bad, m'kay
 364 close(C) or die "Error closing '$cname': $!";
 365
 366 # End of the main program.
 367
 368 sub compile_ucm
 369 {
 370  my ($fh,$name) = @_;
 371  my $e2u = {};
 372  my $u2e = {};
 373  my $cs;
 374  my %attr;
 375  while (<$fh>)
 376   {
 377    s/#.*$//;
 378    last if /^\s*CHARMAP\s*$/i;
 379    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 380     {
 381      $attr{$1} = $2;
 382     }
 383   }
 384  if (!defined($cs =  $attr{'code_set_name'}))
 385   {
 386    warn "No <code_set_name> in $name\n";
 387   }
 388  else
 389   {
 390    $name = $cs unless exists $opt{'n'};
 391   }
 392  my $erep;
 393  my $urep;
 394  my $max_el;
 395  my $min_el;
 396  if (exists $attr{'subchar'})
 397   {
 398    #my @byte;
 399    #$attr{'subchar'} =~ /^\s*/cg;
 400    #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 401    #$erep = join('',map(chr(hex($_)),@byte));
 402    $erep = $attr{'subchar'};
 403    $erep =~ s/^\s+//; $erep =~ s/\s+$//;
 404   }
 405  print "Reading $name ($cs)\n";
 406  my $nfb = 0;
 407  my $hfb = 0;
 408  while (<$fh>)
 409   {
 410    s/#.*$//;
 411    last if /^\s*END\s+CHARMAP\s*$/i;
 412    next if /^\s*$/;
 413    my (@uni, @byte) = ();
 414    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 415        or die "Bad line: $_";
 416    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 417        push @uni, map { substr($_, 1) } split(/\+/, $1);
 418    }
 419    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 420        push @byte, $1;
 421    }
 422    if (@uni)
 423     {
 424      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 425      my $ech = join('',map(chr(hex($_)),@byte));
 426      my $el  = length($ech);
 427      $max_el = $el if (!defined($max_el) || $el > $max_el);
 428      $min_el = $el if (!defined($min_el) || $el < $min_el);
 429      if (length($fb))
 430       {
 431        $fb = substr($fb,1);
 432        $hfb++;
 433       }
 434      else
 435       {
 436        $nfb++;
 437        $fb = '0';
 438       }
 439      # $fb is fallback flag
 440      # 0 - round trip safe
 441      # 1 - fallback for unicode -> enc
 442      # 2 - skip sub-char mapping
 443      # 3 - fallback enc -> unicode
 444      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 445      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 446     }
 447    else
 448     {
 449      warn $_;
 450     }
 451   }
 452  if ($nfb && $hfb)
 453   {
 454    die "$nfb entries without fallback, $hfb entries with\n";
 455   }
 456  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 457 }
 458
 459
 460
 461 sub compile_enc
 462 {
 463  my ($fh,$name) = @_;
 464  my $e2u = {};
 465  my $u2e = {};
 466
 467  my $type;
 468  while ($type = <$fh>)
 469   {
 470    last if $type !~ /^\s*#/;
 471   }
 472  chomp($type);
 473  return if $type eq 'E';
 474  # Do the hash lookup once, rather than once per function call. 4% speedup.
 475  my $type_func = $encode_types{$type};
 476  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 477  warn "$type encoded $name\n";
 478  my $rep = '';
 479  # Save a defined test by setting these to defined values.
 480  my $min_el = ~0; # A very big integer
 481  my $max_el = 0;  # Anything must be longer than 0
 482  {
 483   my $v = hex($def);
 484   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 485  }
 486  my $errors;
 487  my $seen;
 488  # use -Q to silence the seen test. Makefile.PL uses this by default.
 489  $seen = {} unless $opt{Q};
 490  do
 491   {
 492    my $line = <$fh>;
 493    chomp($line);
 494    my $page = hex($line);
 495    my $ch = 0;
 496    my $i = 16;
 497    do
 498     {
 499      # So why is it 1% faster to leave the my here?
 500      my $line = <$fh>;
 501      $line =~ s/\r\n$/\n/;
 502      die "$.:${line}Line should be exactly 65 characters long including
 503      newline (".length($line).")" unless length ($line) == 65;
 504      # Split line into groups of 4 hex digits, convert groups to ints
 505      # This takes 65.35
 506      # map {hex $_} $line =~ /(....)/g
 507      # This takes 63.75 (2.5% less time)
 508      # unpack "n*", pack "H*", $line
 509      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 510      # Doing it as while ($line =~ /(....)/g) took 74.63
 511      foreach my $val (unpack "n*", pack "H*", $line)
 512       {
 513        next if $val == 0xFFFD;
 514        my $ech = &$type_func($ch,$page);
 515        if ($val || (!$ch && !$page))
 516         {
 517          my $el  = length($ech);
 518          $max_el = $el if $el > $max_el;
 519          $min_el = $el if $el < $min_el;
 520          my $uch = encode_U($val);
 521          if ($seen) {
 522            # We're doing the test.
 523            # We don't need to read this quickly, so storing it as a scalar,
 524            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 525            # RAM and may make us faster on low RAM systems. [see __END__]
 526            if (exists $seen->{$uch})
 527              {
 528                warn sprintf("U%04X is %02X%02X and %04X\n",
 529                             $val,$page,$ch,$seen->{$uch});
 530                $errors++;
 531              }
 532            else
 533              {
 534                $seen->{$uch} = $page << 8 | $ch;
 535              }
 536          }
 537          # Passing 2 extra args each time is 3.6% slower!
 538          # Even with having to add $fallback ||= 0 later
 539          enter_fb0($e2u,$ech,$uch);
 540          enter_fb0($u2e,$uch,$ech);
 541         }
 542        else
 543         {
 544          # No character at this position
 545          # enter($e2u,$ech,undef,$e2u);
 546         }
 547        $ch++;
 548       }
 549     } while --$i;
 550   } while --$pages;
 551  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 552    if $min_el > $max_el;
 553  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 554  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 555 }
 556
 557 # my ($a,$s,$d,$t,$fb) = @_;
 558 sub enter {
 559   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 560   # state we shift to after this (multibyte) input character defaults to same
 561   # as current state.
 562   $next ||= $current;
 563   # Making sure it is defined seems to be faster than {no warnings;} in
 564   # &process, or passing it in as 0 explicity.
 565   # XXX $fallback ||= 0;
 566
 567   # Start at the beginning and work forwards through the string to zero.
 568   # effectively we are removing 1 character from the front each time
 569   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 570   # Hence -$pos is the length of the remaining string.
 571   my $pos = -length $inbytes;
 572   while (1) {
 573     my $byte = substr $inbytes, $pos, 1;
 574     #  RAW_NEXT => 0,
 575     #  RAW_IN_LEN => 1,
 576     #  RAW_OUT_BYTES => 2,
 577     #  RAW_FALLBACK => 3,
 578     # to unicode an array would seem to be better, because the pages are dense.
 579     # from unicode can be very sparse, favouring a hash.
 580     # hash using the bytes (all length 1) as keys rather than ord value,
 581     # as it's easier to sort these in &process.
 582
 583     # It's faster to always add $fallback even if it's undef, rather than
 584     # choosing between 3 and 4 element array. (hence why we set it defined
 585     # above)
 586     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 587     # When $pos was -1 we were at the last input character.
 588     unless (++$pos) {
 589       $do_now->[RAW_OUT_BYTES] = $outbytes;
 590       $do_now->[RAW_NEXT] = $next;
 591       return;
 592     }
 593     # Tail recursion. The intermdiate state may not have a name yet.
 594     $current = $do_now->[RAW_NEXT];
 595   }
 596 }
 597
 598 # This is purely for optimistation. It's just &enter hard coded for $fallback
 599 # of 0, using only a 3 entry array ref to save memory for every entry.
 600 sub enter_fb0 {
 601   my ($current,$inbytes,$outbytes,$next) = @_;
 602   $next ||= $current;
 603
 604   my $pos = -length $inbytes;
 605   while (1) {
 606     my $byte = substr $inbytes, $pos, 1;
 607     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 608     unless (++$pos) {
 609       $do_now->[RAW_OUT_BYTES] = $outbytes;
 610       $do_now->[RAW_NEXT] = $next;
 611       return;
 612     }
 613     $current = $do_now->[RAW_NEXT];
 614   }
 615 }
 616
 617 sub process
 618 {
 619   my ($name,$a) = @_;
 620   $name =~ s/\W+/_/g;
 621   $a->{Cname} = $name;
 622   my $raw = $a->{Raw};
 623   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 624   my @ent;
 625   $agg_max_in = 0;
 626   foreach my $key (sort keys %$raw) {
 627     #  RAW_NEXT => 0,
 628     #  RAW_IN_LEN => 1,
 629     #  RAW_OUT_BYTES => 2,
 630     #  RAW_FALLBACK => 3,
 631     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 632     # Now we are converting from raw to aggregate, switch from 1 byte strings
 633     # to numbers
 634     my $b = ord $key;
 635     $fallback ||= 0;
 636     if ($l &&
 637         # If this == fails, we're going to reset $agg_max_in below anyway.
 638         $b == ++$agg_max_in &&
 639         # References in numeric context give the pointer as an int.
 640         $agg_next == $next &&
 641         $agg_in_len == $in_len &&
 642         $agg_out_len == length $out_bytes &&
 643         $agg_fallback == $fallback
 644         # && length($l->[AGG_OUT_BYTES]) < 16
 645        ) {
 646       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 647       # we can aggregate this byte onto the end.
 648       $l->[AGG_MAX_IN] = $b;
 649       $l->[AGG_OUT_BYTES] .= $out_bytes;
 650     } else {
 651       # AGG_MIN_IN => 0,
 652       # AGG_MAX_IN => 1,
 653       # AGG_OUT_BYTES => 2,
 654       # AGG_NEXT => 3,
 655       # AGG_IN_LEN => 4,
 656       # AGG_OUT_LEN => 5,
 657       # AGG_FALLBACK => 6,
 658       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 659       # (only gains .6% on euc-jp  -- is it worth it?)
 660       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 661                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 662                        $agg_fallback = $fallback];
 663     }
 664     if (exists $next->{Cname}) {
 665       $next->{'Forward'} = 1 if $next != $a;
 666     } else {
 667       process(sprintf("%s_%02x",$name,$b),$next);
 668     }
 669   }
 670   # encengine.c rules say that last entry must be for 255
 671   if ($agg_max_in < 255) {
 672     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 673   }
 674   $a->{'Entries'} = \@ent;
 675 }
 676
 677
 678 sub addstrings
 679 {
 680  my ($fh,$a) = @_;
 681  my $name = $a->{'Cname'};
 682  # String tables
 683  foreach my $b (@{$a->{'Entries'}})
 684   {
 685    next unless $b->[AGG_OUT_LEN];
 686    $strings{$b->[AGG_OUT_BYTES]} = undef;
 687   }
 688  if ($a->{'Forward'})
 689   {
 690    my $var = $^O eq 'MacOS' ? 'extern' : 'static';
 691    print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 692   }
 693  $a->{'DoneStrings'} = 1;
 694  foreach my $b (@{$a->{'Entries'}})
 695   {
 696    my ($s,$e,$out,$t,$end,$l) = @$b;
 697    addstrings($fh,$t) unless $t->{'DoneStrings'};
 698   }
 699 }
 700
 701 sub outbigstring
 702 {
 703   my ($fh,$name) = @_;
 704
 705   $string_acc = '';
 706
 707   # Make the big string in the string accumulator. Longest first, on the hope
 708   # that this makes it more likely that we find the short strings later on.
 709   # Not sure if it helps sorting strings of the same length lexcically.
 710   foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
 711     my $index = index $string_acc, $s;
 712     if ($index >= 0) {
 713       $saved += length($s);
 714       $strings_in_acc{$s} = $index;
 715     } else {
 716     OPTIMISER: {
 717         if ($opt{'O'}) {
 718           my $sublength = length $s;
 719           while (--$sublength > 0) {
 720             # progressively lop characters off the end, to see if the start of
 721             # the new string overlaps the end of the accumulator.
 722             if (substr ($string_acc, -$sublength)
 723                 eq substr ($s, 0, $sublength)) {
 724               $subsave += $sublength;
 725               $strings_in_acc{$s} = length ($string_acc) - $sublength;
 726               # append the last bit on the end.
 727               $string_acc .= substr ($s, $sublength);
 728               last OPTIMISER;
 729             }
 730             # or if the end of the new string overlaps the start of the
 731             # accumulator
 732             next unless substr ($string_acc, 0, $sublength)
 733               eq substr ($s, -$sublength);
 734             # well, the last $sublength characters of the accumulator match.
 735             # so as we're prepending to the accumulator, need to shift all our
 736             # existing offsets forwards
 737             $_ += $sublength foreach values %strings_in_acc;
 738             $subsave += $sublength;
 739             $strings_in_acc{$s} = 0;
 740             # append the first bit on the start.
 741             $string_acc = substr ($s, 0, -$sublength) . $string_acc;
 742             last OPTIMISER;
 743           }
 744         }
 745         # Optimiser (if it ran) found nothing, so just going have to tack the
 746         # whole thing on the end.
 747         $strings_in_acc{$s} = length $string_acc;
 748         $string_acc .= $s;
 749       };
 750     }
 751   }
 752
 753   $strings = length $string_acc;
 754   my $definition = "\nstatic const U8 $name\[$strings] = { " .
 755     join(',',unpack "C*",$string_acc);
 756   # We have a single long line. Split it at convenient commas.
 757   print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
 758   print $fh substr ($definition, pos $definition), " };\n";
 759 }
 760
 761 sub findstring {
 762   my ($name,$s) = @_;
 763   my $offset = $strings_in_acc{$s};
 764   die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
 765     unless defined $offset;
 766   "$name + $offset";
 767 }
 768
 769 sub outtable
 770 {
 771  my ($fh,$a,$bigname) = @_;
 772  my $name = $a->{'Cname'};
 773  $a->{'Done'} = 1;
 774  foreach my $b (@{$a->{'Entries'}})
 775   {
 776    my ($s,$e,$out,$t,$end,$l) = @$b;
 777    outtable($fh,$t,$bigname) unless $t->{'Done'};
 778   }
 779  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
 780  foreach my $b (@{$a->{'Entries'}})
 781   {
 782    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 783    # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
 784    print  $fh "{";
 785    if ($l)
 786     {
 787      printf $fh findstring($bigname,$out);
 788     }
 789    else
 790     {
 791      print  $fh "0";
 792     }
 793    print  $fh ",",$t->{Cname};
 794    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 795   }
 796  print $fh "};\n";
 797 }
 798
 799 sub output_enc
 800 {
 801  my ($fh,$name,$a) = @_;
 802  die "Changed - fix me for new structure";
 803  foreach my $b (sort keys %$a)
 804   {
 805    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 806   }
 807 }
 808
 809 sub decode_U
 810 {
 811  my $s = shift;
 812 }
 813
 814 my @uname;
 815 sub char_names
 816 {
 817  my $s = do "unicore/Name.pl";
 818  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 819  pos($s) = 0;
 820  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 821   {
 822    my $name = $3;
 823    my $s = hex($1);
 824    last if $s >= 0x10000;
 825    my $e = length($2) ? hex($2) : $s;
 826    for (my $i = $s; $i <= $e; $i++)
 827     {
 828      $uname[$i] = $name;
 829 #    print sprintf("U%04X $name\n",$i);
 830     }
 831   }
 832 }
 833
 834 sub output_ucm_page
 835 {
 836   my ($cmap,$a,$t,$pre) = @_;
 837   # warn sprintf("Page %x\n",$pre);
 838   my $raw = $t->{Raw};
 839   foreach my $key (sort keys %$raw) {
 840     #  RAW_NEXT => 0,
 841     #  RAW_IN_LEN => 1,
 842     #  RAW_OUT_BYTES => 2,
 843     #  RAW_FALLBACK => 3,
 844     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 845     my $u = ord $key;
 846     $fallback ||= 0;
 847
 848     if ($next != $a && $next != $t) {
 849       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 850     } elsif (length $out_bytes) {
 851       if ($pre) {
 852         $u = $pre|($u &0x3f);
 853       }
 854       my $s = sprintf "<U%04X> ",$u;
 855       #foreach my $c (split(//,$out_bytes)) {
 856       #  $s .= sprintf "\\x%02X",ord($c);
 857       #}
 858       # 9.5% faster changing that loop to this:
 859       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 860       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 861       push(@$cmap,$s);
 862     } else {
 863       warn join(',',$u, @{$raw->{$key}},$a,$t);
 864     }
 865   }
 866 }
 867
 868 sub output_ucm
 869 {
 870  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 871  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 872  print $fh "<code_set_name> \"$name\"\n";
 873  char_names();
 874  if (defined $min_el)
 875   {
 876    print $fh "<mb_cur_min> $min_el\n";
 877   }
 878  if (defined $max_el)
 879   {
 880    print $fh "<mb_cur_max> $max_el\n";
 881   }
 882  if (defined $rep)
 883   {
 884    print $fh "<subchar> ";
 885    foreach my $c (split(//,$rep))
 886     {
 887      printf $fh "\\x%02X",ord($c);
 888     }
 889    print $fh "\n";
 890   }
 891  my @cmap;
 892  output_ucm_page(\@cmap,$h,$h,0);
 893  print $fh "#\nCHARMAP\n";
 894  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 895   {
 896    print $fh $line;
 897   }
 898  print $fh "END CHARMAP\n";
 899 }
 900
 901 use vars qw(
 902     $_Enc2xs
 903     $_Version
 904     $_Inc
 905     $_E2X
 906     $_Name
 907     $_TableFiles
 908     $_Now
 909 );
 910
 911 sub find_e2x{
 912     eval { require File::Find; };
 913     my (@inc, %e2x_dir);
 914     for my $inc (@INC){
 915         push @inc, $inc unless $inc eq '.'; #skip current dir
 916     }
 917     File::Find::find(
 918              sub {
 919                  my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 920                      $atime,$mtime,$ctime,$blksize,$blocks)
 921                      = lstat($_) or return;
 922                  -f _ or return;
 923                  if (/^.*\.e2x$/o){
 924                      no warnings 'once';
 925                      $e2x_dir{$File::Find::dir} ||= $mtime;
 926                  }
 927                  return;
 928              }, @inc);
 929     warn join("\n", keys %e2x_dir), "\n";
 930     for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
 931         $_E2X = $d;
 932         # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
 933         return $_E2X;
 934     }
 935 }
 936
 937 sub make_makefile_pl
 938 {
 939     eval { require Encode; };
 940     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 941     # our used for variable expanstion
 942     $_Enc2xs = $0;
 943     $_Version = $VERSION;
 944     $_E2X = find_e2x();
 945     $_Name = shift;
 946     $_TableFiles = join(",", map {qq('$_')} @_);
 947     $_Now = scalar localtime();
 948
 949     eval { require File::Spec; };
 950     _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
 951     _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
 952     _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
 953     _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
 954     _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
 955     exit;
 956 }
 957
 958 use vars qw(
 959             $_ModLines
 960             $_LocalVer
 961             );
 962
 963 sub make_configlocal_pm
 964 {
 965     eval { require Encode; };
 966     $@ and die "Unable to require Encode: $@\n";
 967     eval { require File::Spec; };
 968     # our used for variable expanstion
 969     my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
 970     my %LocalMod = ();
 971     for my $d (@INC){
 972         my $inc = File::Spec->catfile($d, "Encode");
 973         -d $inc or next;
 974         opendir my $dh, $inc or die "$inc:$!";
 975         warn "Checking $inc...\n";
 976         for my $f (grep /\.pm$/o, readdir($dh)){
 977             -f File::Spec->catfile($inc, "$f") or next;
 978             $INC{"Encode/$f"} and next;
 979             warn "require Encode/$f;\n";
 980             eval { require "Encode/$f"; };
 981             $@ and die "Can't require Encode/$f: $@\n";
 982             for my $enc (Encode->encodings()){
 983                 no warnings 'once';
 984                 $in_core{$enc} and next;
 985                 $Encode::Config::ExtModule{$enc} and next;
 986                 my $mod = "Encode/$f";
 987                 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
 988                 $LocalMod{$enc} ||= $mod;
 989             }
 990         }
 991     }
 992     $_ModLines = "";
 993     for my $enc (sort keys %LocalMod){
 994         $_ModLines .=
 995             qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
 996     }
 997     warn $_ModLines;
 998     $_LocalVer = _mkversion();
 999     $_E2X = find_e2x();
1000     $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
1001     _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
1002                   File::Spec->catfile($_Inc,"ConfigLocal.pm"),
1003                   1);
1004     exit;
1005 }
1006
1007 sub _mkversion{
1008     my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1009     $yyyy += 1900, $mo +=1;
1010     return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1011 }
1012
1013 sub _print_expand{
1014     eval { require File::Basename; };
1015     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1016     File::Basename->import();
1017     my ($src, $dst, $clobber) = @_;
1018     if (!$clobber and -e $dst){
1019         warn "$dst exists. skipping\n";
1020         return;
1021     }
1022     warn "Generating $dst...\n";
1023     open my $in, $src or die "$src : $!";
1024     if ((my $d = dirname($dst)) ne '.'){
1025         -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1026     }
1027     open my $out, ">$dst" or die "$!";
1028     my $asis = 0;
1029     while (<$in>){
1030         if (/^#### END_OF_HEADER/){
1031             $asis = 1; next;
1032         }
1033         s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1034         print $out $_;
1035     }
1036 }
1037 __END__
1038
1039 =head1 NAME
1040
1041 enc2xs -- Perl Encode Module Generator
1042
1043 =head1 SYNOPSIS
1044
1045   enc2xs -[options]
1046   enc2xs -M ModName mapfiles...
1047   enc2xs -C
1048
1049 =head1 DESCRIPTION
1050
1051 F<enc2xs> builds a Perl extension for use by Encode from either
1052 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1053 Besides being used internally during the build process of the Encode
1054 module, you can use F<enc2xs> to add your own encoding to perl.
1055 No knowledge of XS is necessary.
1056
1057 =head1 Quick Guide
1058
1059 If you want to know as little about Perl as possible but need to
1060 add a new encoding, just read this chapter and forget the rest.
1061
1062 =over 4
1063
1064 =item 0.
1065
1066 Have a .ucm file ready.  You can get it from somewhere or you can write
1067 your own from scratch or you can grab one from the Encode distribution
1068 and customize it.  For the UCM format, see the next Chapter.  In the
1069 example below, I'll call my theoretical encoding myascii, defined
1070 in I<my.ucm>.  C<$> is a shell prompt.
1071
1072   $ ls -F
1073   my.ucm
1074
1075 =item 1.
1076
1077 Issue a command as follows;
1078
1079   $ enc2xs -M My my.ucm
1080   generating Makefile.PL
1081   generating My.pm
1082   generating README
1083   generating Changes
1084
1085 Now take a look at your current directory.  It should look like this.
1086
1087   $ ls -F
1088   Makefile.PL   My.pm         my.ucm        t/
1089
1090 The following files were created.
1091
1092   Makefile.PL - MakeMaker script
1093   My.pm       - Encode submodule
1094   t/My.t      - test file
1095
1096 =over 4
1097
1098 =item 1.1.
1099
1100 If you want *.ucm installed together with the modules, do as follows;
1101
1102   $ mkdir Encode
1103   $ mv *.ucm Encode
1104   $ enc2xs -M My Encode/*ucm
1105
1106 =back
1107
1108 =item 2.
1109
1110 Edit the files generated.  You don't have to if you have no time AND no
1111 intention to give it to someone else.  But it is a good idea to edit
1112 the pod and to add more tests.
1113
1114 =item 3.
1115
1116 Now issue a command all Perl Mongers love:
1117
1118   $ perl Makefile.PL
1119   Writing Makefile for Encode::My
1120
1121 =item 4.
1122
1123 Now all you have to do is make.
1124
1125   $ make
1126   cp My.pm blib/lib/Encode/My.pm
1127   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1128     -o encode_t.c -f encode_t.fnm
1129   Reading myascii (myascii)
1130   Writing compiled form
1131   128 bytes in string tables
1132   384 bytes (75%) saved spotting duplicates
1133   1 bytes (0.775%) saved using substrings
1134   ....
1135   chmod 644 blib/arch/auto/Encode/My/My.bs
1136   $
1137
1138 The time it takes varies depending on how fast your machine is and
1139 how large your encoding is.  Unless you are working on something big
1140 like euc-tw, it won't take too long.
1141
1142 =item 5.
1143
1144 You can "make install" already but you should test first.
1145
1146   $ make test
1147   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1148     -e 'use Test::Harness  qw(&runtests $verbose); \
1149     $verbose=0; runtests @ARGV;' t/*.t
1150   t/My....ok
1151   All tests successful.
1152   Files=1, Tests=2,  0 wallclock secs
1153    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1154
1155 =item 6.
1156
1157 If you are content with the test result, just "make install"
1158
1159 =item 7.
1160
1161 If you want to add your encoding to Encode's demand-loading list
1162 (so you don't have to "use Encode::YourEncoding"), run
1163
1164   enc2xs -C
1165
1166 to update Encode::ConfigLocal, a module that controls local settings.
1167 After that, "use Encode;" is enough to load your encodings on demand.
1168
1169 =back
1170
1171 =head1 The Unicode Character Map
1172
1173 Encode uses the Unicode Character Map (UCM) format for source character
1174 mappings.  This format is used by IBM's ICU package and was adopted
1175 by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1176 more flexible than Tcl's Encoding Map and far more user-friendly,
1177 this is the recommended formet for Encode now.
1178
1179 A UCM file looks like this.
1180
1181   #
1182   # Comments
1183   #
1184   <code_set_name> "US-ascii" # Required
1185   <code_set_alias> "ascii"   # Optional
1186   <mb_cur_min> 1             # Required; usually 1
1187   <mb_cur_max> 1             # Max. # of bytes/char
1188   <subchar> \x3F             # Substitution char
1189   #
1190   CHARMAP
1191   <U0000> \x00 |0 # <control>
1192   <U0001> \x01 |0 # <control>
1193   <U0002> \x02 |0 # <control>
1194   ....
1195   <U007C> \x7C |0 # VERTICAL LINE
1196   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1197   <U007E> \x7E |0 # TILDE
1198   <U007F> \x7F |0 # <control>
1199   END CHARMAP
1200
1201 =over 4
1202
1203 =item *
1204
1205 Anything that follows C<#> is treated as a comment.
1206
1207 =item *
1208
1209 The header section continues until a line containing the word
1210 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1211 pair per line.  Strings used as values must be quoted. Barewords are
1212 treated as numbers.  I<\xXX> represents a byte.
1213
1214 Most of the keywords are self-explanatory. I<subchar> means
1215 substitution character, not subcharacter.  When you decode a Unicode
1216 sequence to this encoding but no matching character is found, the byte
1217 sequence defined here will be used.  For most cases, the value here is
1218 \x3F; in ASCII, this is a question mark.
1219
1220 =item *
1221
1222 CHARMAP starts the character map section.  Each line has a form as
1223 follows:
1224
1225   <UXXXX> \xXX.. |0 # comment
1226     ^     ^      ^
1227     |     |      +- Fallback flag
1228     |     +-------- Encoded byte sequence
1229     +-------------- Unicode Character ID in hex
1230
1231 The format is roughly the same as a header section except for the
1232 fallback flag: | followed by 0..3.   The meaning of the possible
1233 values is as follows:
1234
1235 =over 4
1236
1237 =item |0
1238
1239 Round trip safe.  A character decoded to Unicode encodes back to the
1240 same byte sequence.  Most characters have this flag.
1241
1242 =item |1
1243
1244 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1245 character for the encode map only.
1246
1247 =item |2
1248
1249 Skip sub-char mapping should there be no code point.
1250
1251 =item |3
1252
1253 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1254 character for the decode map only.
1255
1256 =back
1257
1258 =item *
1259
1260 And finally, END OF CHARMAP ends the section.
1261
1262 =back
1263
1264 When you are manually creating a UCM file, you should copy ascii.ucm
1265 or an existing encoding which is close to yours, rather than write
1266 your own from scratch.
1267
1268 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1269 is, unless your environment is EBCDIC.
1270
1271 B<CAVEAT>: not all features in UCM are implemented.  For example,
1272 icu:state is not used.  Because of that, you need to write a perl
1273 module if you want to support algorithmical encodings, notably
1274 the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1275 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1276
1277 =head2 Coping with duplicate mappings
1278
1279 When you create a map, you SHOULD make your mappings round-trip safe.
1280 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1281 $data> stands for all characters that are marked as C<|0>.  Here is
1282 how to make sure:
1283
1284 =over 4
1285
1286 =item *
1287
1288 Sort your map in Unicode order.
1289
1290 =item *
1291
1292 When you have a duplicate entry, mark either one with '|1' or '|3'.
1293
1294 =item *
1295
1296 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1297
1298 =back
1299
1300 Here is an example from big5-eten.
1301
1302   <U2550> \xF9\xF9 |0
1303   <U2550> \xA2\xA4 |3
1304
1305 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1306 this;
1307
1308   E to U               U to E
1309   --------------------------------------
1310   \xF9\xF9 => U2550    U2550 => \xF9\xF9
1311   \xA2\xA4 => U2550
1312
1313 So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1314 down, here is what happens.
1315
1316   E to U               U to E
1317   --------------------------------------
1318   \xA2\xA4 => U2550    U2550 => \xF9\xF9
1319   (\xF9\xF9 => U2550 is now overwritten!)
1320
1321 The Encode package comes with F<ucmlint>, a crude but sufficient
1322 utility to check the integrity of a UCM file.  Check under the
1323 Encode/bin directory for this.
1324
1325 When in doubt, you can use F<ucmsort>, yet another utility under
1326 Encode/bin directory.
1327
1328 =head1 Bookmarks
1329
1330 =over 4
1331
1332 =item *
1333
1334 ICU Home Page
1335 L<http://oss.software.ibm.com/icu/>
1336
1337 =item *
1338
1339 ICU Character Mapping Tables
1340 L<http://oss.software.ibm.com/icu/charset/>
1341
1342 =item *
1343
1344 ICU:Conversion Data
1345 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1346
1347 =back
1348
1349 =head1 SEE ALSO
1350
1351 L<Encode>,
1352 L<perlmod>,
1353 L<perlpod>
1354
1355 =cut
1356
1357 # -Q to disable the duplicate codepoint test
1358 # -S make mapping errors fatal
1359 # -q to remove comments written to output files
1360 # -O to enable the (brute force) substring optimiser
1361 # -o <output> to specify the output file name (else it's the first arg)
1362 # -f <inlist> to give a file with a list of input files (else use the args)
1363 # -n <name> to name the encoding (else use the basename of the input file.
1364
1365 With %seen holding array refs:
1366
1367       865.66 real        28.80 user         8.79 sys
1368       7904  maximum resident set size
1369       1356  average shared memory size
1370      18566  average unshared data size
1371        229  average unshared stack size
1372      46080  page reclaims
1373      33373  page faults
1374
1375 With %seen holding simple scalars:
1376
1377       342.16 real        27.11 user         3.54 sys
1378       8388  maximum resident set size
1379       1394  average shared memory size
1380      14969  average unshared data size
1381        236  average unshared stack size
1382      28159  page reclaims
1383       9839  page faults
1384
1385 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1386 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1387 RAM machine, but it's going to help even on modern machines.
1388 Swapping is bad, m'kay :-)