libjava/scripts/unicode-muncher.pl

   1 #!/usr/bin/perl -w
   2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
   3 # Copyright (C) 1998, 2002 Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20 # 02111-1307 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39 # Code for reading UnicodeData.txt and generating the code for
  40 # gnu.java.lang.CharData.  For now, the relevant Unicode definition files
  41 # are found in libjava/gnu/gcj/convert/.
  42 #
  43 # Inspired by code from Jochen Hoenicke.
  44 # author Eric Blake <ebb9@email.byu.edu>
  45 #
  46 # Usage: ./unicode-muncher <UnicodeData.txt> <CharData.java>
  47 #   where <UnicodeData.txt> is obtained from www.unicode.org (named
  48 #   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>
  49 #   is the final location for the Java interface gnu.java.lang.CharData.
  50 #   As of JDK 1.4, use Unicode version 3.0.0 for best results.
  51
  52 ##
  53 ## Convert a 16-bit integer to a Java source code String literal character
  54 ##
  55 sub javaChar($) {
  56     my ($char) = @_;
  57     die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
  58     $char += 0x10000 if $char < 0;
  59     # Special case characters that must be escaped, or are shorter as ASCII
  60     return sprintf("\\%03o", $char) if $char < 0x20;
  61     return "\\\"" if $char == 0x22;
  62     return "\\\\" if $char == 0x5c;
  63     return pack("C", $char) if $char < 0x7f;
  64     return sprintf("\\u%04x", $char);
  65 }
  66
  67 ##
  68 ## Convert the text UnicodeData file from www.unicode.org into a Java
  69 ## interface with string constants holding the compressed information.
  70 ##
  71 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
  72                    SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
  73 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
  74
  75 my $NOBREAK_FLAG  = 32;
  76 my $MIRRORED_FLAG = 64;
  77
  78 my @info = ();
  79 my $titlecase = "";
  80 my $count = 0;
  81 my $range = 0;
  82
  83 die "Usage: $0 <UnicodeData.txt> <CharData.java>" unless @ARGV == 2;
  84 open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
  85
  86 # Stage 1: Parse the attribute file
  87 $| = 1;
  88 print "GNU Classpath Unicode Attribute Database Generator 2.0\n";
  89 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
  90 print "Parsing attributes file";
  91 while(<UNICODE>) {
  92     print "." unless $count++ % 1000;
  93     chomp;
  94     s/\r//g;
  95     my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
  96         $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
  97     $ch = hex($ch);
  98     next if $ch > 0xffff; # Ignore surrogate pairs, since Java does
  99
 100     my ($type, $numValue, $upperchar, $lowerchar, $direction);
 101
 102     $type = 0;
 103     while ($category !~ /^$TYPECODES[$type]$/) {
 104         if (++$type == @TYPECODES) {
 105             die "$ch: Unknown type: $category";
 106         }
 107     }
 108     $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
 109     $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
 110
 111     if ($numeric =~ /^[0-9]+$/) {
 112         $numValue = $numeric;
 113         die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
 114     } elsif ($numeric eq "") {
 115         # Special case sequences of 'a'-'z'
 116         if ($ch >= 0x0041 && $ch <= 0x005a) {
 117             $numValue = $ch - 0x0037;
 118         } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
 119             $numValue = $ch - 0x0057;
 120         } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
 121             $numValue = $ch - 0xff17;
 122         } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
 123             $numValue = $ch - 0xff37;
 124         } else {
 125             $numValue = -1;
 126         }
 127     } else {
 128         $numValue = -2;
 129     }
 130
 131     $upperchar = $upcase ? hex($upcase) - $ch : 0;
 132     $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
 133     if ($title ne $upcase) {
 134         my $titlechar = $title ? hex($title) : $ch;
 135         $titlecase .= pack("n2", $ch, $titlechar);
 136     }
 137
 138     $direction = 0;
 139     while ($bidir !~ /^$DIRCODES[$direction]$/) {
 140         if (++$direction == @DIRCODES) {
 141             $direction = -1;
 142             last;
 143         }
 144     }
 145
 146     if ($range) {
 147         die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
 148         for ($range + 1 .. $ch - 1) {
 149             $info[$_] = pack("n5", $type, $numValue, $upperchar,
 150                              $lowerchar, $direction);
 151         }
 152         $range = 0;
 153     } elsif ($name =~ /First>$/) {
 154         $range = $ch;
 155     }
 156     $info[$ch] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
 157                       $direction);
 158 }
 159 close UNICODE;
 160
 161 # Stage 2: Compress the data structures
 162 printf "\nCompressing data structures";
 163 $count = 0;
 164 my $info = ();
 165 my %charhash = ();
 166 my @charinfo = ();
 167
 168 for my $ch (0 .. 0xffff) {
 169     print "." unless $count++ % 0x1000;
 170     if (! defined $info[$ch]) {
 171         $info[$ch] = pack("n5", 0, -1, 0, 0, -1);
 172     }
 173
 174     my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
 175     if (! exists $charhash{$info[$ch]}) {
 176         push @charinfo, [ $numVal, $upper, $lower, $direction ];
 177         $charhash{$info[$ch]} = $#charinfo;
 178     }
 179     $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
 180 }
 181
 182 my $charlen = @charinfo;
 183 my $bestshift;
 184 my $bestest = 1000000;
 185 my $bestblkstr;
 186 die "Too many unique character entries: $charlen\n" if $charlen > 512;
 187 print "\nUnique character entries: $charlen\n";
 188
 189 for my $i (3 .. 8) {
 190     my $blksize = 1 << $i;
 191     my %blocks = ();
 192     my @blkarray = ();
 193     my ($j, $k);
 194     print "shift: $i";
 195
 196     for ($j = 0; $j < 0x10000; $j += $blksize) {
 197         my $blkkey = substr $info, 2 * $j, 2 * $blksize;
 198         if (! exists $blocks{$blkkey}) {
 199             push @blkarray, $blkkey;
 200             $blocks{$blkkey} = $#blkarray;
 201         }
 202     }
 203     my $blknum = @blkarray;
 204     my $blocklen = $blknum * $blksize;
 205     printf " before %5d", $blocklen;
 206
 207     # Now we try to pack the blkarray as tight as possible by finding matching
 208     # heads and tails.
 209     for ($j = $blksize - 1; $j > 0; $j--) {
 210         my %tails = ();
 211         for $k (0 .. $#blkarray) {
 212             next if ! defined $blkarray[$k];
 213             my $len = length $blkarray[$k];
 214             my $tail = substr $blkarray[$k], $len - $j * 2;
 215             if (exists $tails{$tail}) {
 216                 push @{$tails{$tail}}, $k;
 217             } else {
 218                 $tails{$tail} = [ $k ];
 219             }
 220         }
 221
 222         # tails are calculated, now calculate the heads and merge.
 223       BLOCK:
 224         for $k (0 .. $#blkarray) {
 225             next if ! defined $blkarray[$k];
 226             my $tomerge = $k;
 227             while (1) {
 228                 my $head = substr($blkarray[$tomerge], 0, $j * 2);
 229                 my $entry = $tails{$head};
 230                 next BLOCK if ! defined $entry;
 231
 232                 my $other = shift @{$entry};
 233                 if ($other == $tomerge) {
 234                     if (@{$entry}) {
 235                         push @{$entry}, $other;
 236                         $other = shift @{$entry};
 237                     } else {
 238                         push @{$entry}, $other;
 239                         next BLOCK;
 240                     }
 241                 }
 242                 if (@{$entry} == 0) {
 243                     delete $tails{$head};
 244                 }
 245
 246                 # a match was found
 247                 my $merge = $blkarray[$other]
 248                     . substr($blkarray[$tomerge], $j * 2);
 249                 $blocklen -= $j;
 250                 $blknum--;
 251
 252                 if ($other < $tomerge) {
 253                     $blkarray[$tomerge] = undef;
 254                     $blkarray[$other] = $merge;
 255                     my $len = length $merge;
 256                     my $tail = substr $merge, $len - $j * 2;
 257                     $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
 258                                       @{$tails{$tail}} ];
 259                     next BLOCK;
 260                 }
 261                 $blkarray[$tomerge] = $merge;
 262                 $blkarray[$other] = undef;
 263             }
 264         }
 265     }
 266     my $blockstr;
 267     for $k (0 .. $#blkarray) {
 268         $blockstr .= $blkarray[$k] if defined $blkarray[$k];
 269     }
 270
 271     die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
 272     my $estimate = 2 * $blocklen + (0x20000 >> $i);
 273
 274     printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
 275     if ($estimate < $bestest) {
 276         $bestest = $estimate;
 277         $bestshift = $i;
 278         $bestblkstr = $blockstr;
 279     }
 280 }
 281
 282 my @blocks;
 283 my $blksize = 1 << $bestshift;
 284 for (my $j = 0; $j < 0x10000; $j += $blksize) {
 285     my $blkkey = substr $info, 2 * $j, 2 * $blksize;
 286     my $index = index $bestblkstr, $blkkey;
 287     while ($index & 1) {
 288         die "not found: $j" if $index == -1;
 289         $index = index $bestblkstr, $blkkey, $index + 1;
 290     }
 291     push @blocks, ($index / 2 - $j) & 0xffff;
 292 }
 293
 294 # Phase 3: Generate the file
 295 die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
 296     if @blocks > 0xffff / 3;
 297 die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
 298     if length($bestblkstr) > 0xffff / 3;
 299 {
 300     print "Generating $ARGV[1] with shift of $bestshift";
 301     my ($i, $j);
 302
 303     open OUTPUT, "> $ARGV[1]" or die "Failed creating output file: $!\n";
 304     print OUTPUT <<EOF;
 305 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
 306    Copyright (C) 2002 Free Software Foundation, Inc.
 307    *** This file is generated by scripts/unicode-muncher.pl ***
 308
 309 This file is part of GNU Classpath.
 310
 311 GNU Classpath is free software; you can redistribute it and/or modify
 312 it under the terms of the GNU General Public License as published by
 313 the Free Software Foundation; either version 2, or (at your option)
 314 any later version.
 315
 316 GNU Classpath is distributed in the hope that it will be useful, but
 317 WITHOUT ANY WARRANTY; without even the implied warranty of
 318 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 319 General Public License for more details.
 320
 321 You should have received a copy of the GNU General Public License
 322 along with GNU Classpath; see the file COPYING.  If not, write to the
 323 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 324 02111-1307 USA.
 325
 326 Linking this library statically or dynamically with other modules is
 327 making a combined work based on this library.  Thus, the terms and
 328 conditions of the GNU General Public License cover the whole
 329 combination.
 330
 331 As a special exception, the copyright holders of this library give you
 332 permission to link this library with independent modules to produce an
 333 executable, regardless of the license terms of these independent
 334 modules, and to copy and distribute the resulting executable under
 335 terms of your choice, provided that you also meet, for each linked
 336 independent module, the terms and conditions of the license of that
 337 module.  An independent module is a module which is not derived from
 338 or based on this library.  If you modify this library, you may extend
 339 this exception to your version of the library, but you are not
 340 obligated to do so.  If you do not wish to do so, delete this
 341 exception statement from your version. */
 342
 343 package gnu.java.lang;
 344
 345 /**
 346  * This contains the info about the unicode characters, that
 347  * java.lang.Character needs.  It is generated automatically from
 348  * <code>$ARGV[0]</code>, by some
 349  * perl scripts. This Unicode definition file can be found on the
 350  * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 351  * JDK 1.4 uses Unicode version 3.0.0.
 352  *
 353  * The data is stored as string constants, but Character will convert these
 354  * Strings to their respective <code>char[]</code> components.  The field
 355  * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
 356  * characters within <code>DATA</code>.  The DATA field, in turn, stores
 357  * information about each character in the low order bits, and an offset
 358  * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
 359  * <code>NUM_VALUE</code>, and <code>DIRECTION</code>.  Notice that the
 360  * attribute tables are much smaller than 0xffff entries; as many characters
 361  * in Unicode share common attributes.  Finally, there is a listing for
 362  * <code>TITLE</code> exceptions (most characters just have the same
 363  * title case as upper case).
 364  *
 365  * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
 366  *         Eric Blake)
 367  * \@see Character
 368  */
 369 public interface CharData
 370 {
 371   /**
 372    * The Unicode definition file that was parsed to build this database.
 373    */
 374   String SOURCE = \"$ARGV[0]\";
 375
 376   /**
 377    * The character shift amount to look up the block offset. In other words,
 378    * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
 379    * <code>ch</code> is described in <code>DATA</code>.
 380    */
 381   int SHIFT = $bestshift;
 382
 383   /**
 384    * The mapping of character blocks to their location in <code>DATA</code>.
 385    * Each entry has been adjusted so that the 16-bit sum with the desired
 386    * character gives the actual index into <code>DATA</code>.
 387    */
 388   String BLOCKS
 389 EOF
 390
 391     for ($i = 0; $i < @blocks / 11; $i++) {
 392         print OUTPUT $i ? "\n    + \"" : "    = \"";
 393         for $j (0 .. 10) {
 394             last if @blocks <= $i * 11 + $j;
 395             my $val = $blocks[$i * 11 + $j];
 396             print OUTPUT javaChar($val);
 397         }
 398         print OUTPUT "\"";
 399     }
 400
 401     print OUTPUT <<EOF;
 402 ;
 403
 404   /**
 405    * Information about each character.  The low order 5 bits form the
 406    * character type, the next bit is a flag for non-breaking spaces, and the
 407    * next bit is a flag for mirrored directionality.  The high order 9 bits
 408    * form the offset into the attribute tables.  Note that this limits the
 409    * number of unique character attributes to 512, which is not a problem
 410    * as of Unicode version 3.2.0, but may soon become one.
 411    */
 412   String DATA
 413 EOF
 414
 415     my $len = length($bestblkstr) / 2;
 416     for ($i = 0; $i < $len / 11; $i++) {
 417         print OUTPUT $i ? "\n    + \"" : "    = \"";
 418         for $j (0 .. 10) {
 419             last if $len <= $i * 11 + $j;
 420             my $val = unpack "n", substr($bestblkstr, 2 * ($i*11 + $j), 2);
 421             print OUTPUT javaChar($val);
 422         }
 423         print OUTPUT "\"";
 424     }
 425
 426     print OUTPUT <<EOF;
 427 ;
 428
 429   /**
 430    * This is the attribute table for computing the numeric value of a
 431    * character.  The value is -1 if Unicode does not define a value, -2
 432    * if the value is not a positive integer, otherwise it is the value.
 433    * Note that this is a signed value, but stored as an unsigned char
 434    * since this is a String literal.
 435    */
 436   String NUM_VALUE
 437 EOF
 438
 439     $len = @charinfo;
 440     for ($i = 0; $i < $len / 11; $i++) {
 441         print OUTPUT $i ? "\n    + \"" : "    = \"";
 442         for $j (0 .. 10) {
 443             last if $len <= $i * 11 + $j;
 444             my $val = $charinfo[$i * 11 + $j][0];
 445             print OUTPUT javaChar($val);
 446         }
 447         print OUTPUT "\"";
 448     }
 449
 450     print OUTPUT <<EOF;
 451 ;
 452
 453   /**
 454    * This is the attribute table for computing the uppercase representation
 455    * of a character.  The value is the signed difference between the
 456    * character and its uppercase version.  Note that this is stored as an
 457    * unsigned char since this is a String literal.
 458    */
 459   String UPPER
 460 EOF
 461
 462     $len = @charinfo;
 463     for ($i = 0; $i < $len / 11; $i++) {
 464         print OUTPUT $i ? "\n    + \"" : "    = \"";
 465         for $j (0 .. 10) {
 466             last if $len <= $i * 11 + $j;
 467             my $val = $charinfo[$i * 11 + $j][1];
 468             print OUTPUT javaChar($val);
 469         }
 470         print OUTPUT "\"";
 471     }
 472
 473     print OUTPUT <<EOF;
 474 ;
 475
 476   /**
 477    * This is the attribute table for computing the lowercase representation
 478    * of a character.  The value is the signed difference between the
 479    * character and its lowercase version.  Note that this is stored as an
 480    * unsigned char since this is a String literal.
 481    */
 482   String LOWER
 483 EOF
 484
 485     $len = @charinfo;
 486     for ($i = 0; $i < $len / 11; $i++) {
 487         print OUTPUT $i ? "\n    + \"" : "    = \"";
 488         for $j (0 .. 10) {
 489             last if $len <= $i * 11 + $j;
 490             my $val = $charinfo[$i * 11 + $j][2];
 491             print OUTPUT javaChar($val);
 492         }
 493         print OUTPUT "\"";
 494     }
 495
 496     print OUTPUT <<EOF;
 497 ;
 498
 499   /**
 500    * This is the attribute table for computing the directionality class
 501    * of a character.  At present, the value is in the range 0 - 18 if the
 502    * character has a direction, otherwise it is -1.  Note that this is
 503    * stored as an unsigned char since this is a String literal.
 504    */
 505   String DIRECTION
 506 EOF
 507
 508     $len = @charinfo;
 509     for ($i = 0; $i < $len / 11; $i++) {
 510         print OUTPUT $i ? "\n    + \"" : "    = \"";
 511         for $j (0 .. 10) {
 512             last if $len <= $i * 11 + $j;
 513             my $val = $charinfo[$i * 11 + $j][3];
 514             print OUTPUT javaChar($val);
 515         }
 516         print OUTPUT "\"";
 517     }
 518
 519     print OUTPUT <<EOF;
 520 ;
 521
 522   /**
 523    * This is the listing of titlecase special cases (all other character
 524    * can use <code>UPPER</code> to determine their titlecase).  The listing
 525    * is a sequence of character pairs; converting the first character of the
 526    * pair to titlecase produces the second character.
 527    */
 528   String TITLE
 529 EOF
 530
 531     $len = length($titlecase) / 2;
 532     for ($i = 0; $i < $len / 11; $i++) {
 533         print OUTPUT $i ? "\n    + \"" : "    = \"";
 534         for $j (0 .. 10) {
 535             last if $len <= $i * 11 + $j;
 536             my $val = unpack "n", substr($titlecase, 2 * ($i*11 + $j), 2);
 537             print OUTPUT javaChar($val);
 538         }
 539         print OUTPUT "\"";
 540     }
 541
 542     print OUTPUT ";\n}\n";
 543     close OUTPUT;
 544 }
 545 print "\nDone.\n";