libjava/classpath/scripts/unicode-muncher.pl

   1 #!/usr/bin/perl -w
   2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
   3 # Copyright (C) 1998, 2002, 2004  Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20 # 02110-1301 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39 # Code for reading UnicodeData.txt and generating the code for
  40 # gnu.java.lang.CharData.  For now, the relevant Unicode definition files
  41 # are found in doc/unicode/.
  42 #
  43 # Inspired by code from Jochen Hoenicke.
  44 # author Eric Blake <ebb9@email.byu.edu>
  45 # updated to Unicode 4.0.0 by Anthony Balkissoon <abalkiss@redhat.com>
  46 #
  47 # Usage: ./unicode-muncher <UnicodeData> <SpecialCasing> <CharData.java>
  48 #   where <UnicodeData> and <SpecialCasing> are .txt files obtained from
  49 #   www.unicode.org (named UnicodeData-4.0.0.txt and SpecialCasing-4.0.0.txt for
  50 #   Unicode version 4.0.0), and <CharData.java> is the final location for the
  51 #   Java interface gnu.java.lang.CharData.
  52 #   As of JDK 1.5, use Unicode version 4.0.0 for best results.
  53
  54 ## Convert a 16-bit integer to a Java source code String literal character.
  55 ## Accepts -0x8000 .. 0xffff (negative values wrap to their unsigned 16-bit
  56 ## form) and returns a compact escaped form; dies on anything out of range.
  57 sub javaChar($) {
  58     my ($char) = @_;
  59     # \uXXXX holds 16 bits only; wider values would spill digits into the text.
  60     die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
  61     $char += 0x10000 if $char < 0;
  62     return sprintf("\\%03o", $char) if $char < 0x20;   # control: octal escape
  63     return "\\\"" if $char == 0x22;                    # double quote
  64     return "\\\\" if $char == 0x5c;                    # backslash
  65     return pack("C", $char) if $char < 0x7f;           # printable ASCII as-is
  66     return sprintf("\\u%04x", $char);                  # everything else
  67 }
  68
  69 ##
  70 ## Lookup tables and flags used to turn the text UnicodeData file from
  71 ## www.unicode.org into a Java interface of compressed string constants.
  72 ##
  73 my @TYPECODES = (qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf),
  74                  qw(SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf));  # index = type code
  75 my @DIRCODES = (qw(L R AL EN ES ET AN CS NSM), qw(BN B S WS ON LRE LRO RLE RLO PDF));
  76
  77 my $NOBREAK_FLAG  = 1 << 5;    # or'ed into the type for <noBreak> decompositions
  78 my $MIRRORED_FLAG = 1 << 6;    # or'ed into the type when the mirrored field is Y
  79
  80 my %special;    # code point => [ code points of its multi-char uppercase form ]
  81
  82 # infoArray is an array where each element is a reference to a list of
  83 # character information for the characters in one plane.  The index of each
  84 # list is the plane it corresponds to, even though most of these lists will
  85 # currently be empty.  This is done so that this script can be easily modified
  86 # to accommodate future versions of Unicode.
  87 # One fresh array ref per plane; \((), ...) flattens to an empty list instead.
  88 my @infoArray = map { [] } 0 .. 0x10;
  89
  90 # $info refers to whichever per-plane list in infoArray belongs to the plane
  91 # that is currently being parsed or compressed.
  92 my $info = undef;
  93
  94 # $titlecase collects packed (character, titlecase) pairs for the characters
  95 # whose titlecase mapping differs from their uppercase mapping.
  96 my $titlecase = '';
  97
  98 # $count paces the progress dots that are printed while working.
  99 my $count = 0;
 100
 101 # $range remembers the first code point of a "First>" .. "Last>" range (else 0).
 102 my $range = 0;
 103
 104 # @largeNums lists the numeric values that do not fit into the 16 bit char
 105 # used for ordinary numeric values.  Such a character stores a number N in
 106 # that char instead, chosen so that (-N - 3) is the index into largeNums at
 107 # which its real numeric value can be found.
 108 my @largeNums;
 109
 110 die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
 111     if @ARGV != 3;
 112 $| = 1;    # flush STDOUT after every print so progress shows immediately
 113 print "GNU Classpath Unicode Attribute Database Generator 2.1\n",
 114       "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
 115
 116 ################################################################################
 117 ################################################################################
 118 ## Stage 0: Parse the special casing file
 119 print "Parsing special casing file\n";
 120 # Three-argument open: the file name is never parsed for mode characters.
 121 open (SPECIAL, "<", $ARGV[1]) || die "Can't open special casing file: $!\n";
 122 while (<SPECIAL>) {
 123     next if /^\#/;
 124     my ($ch, undef, undef, $upper) = split / *; */;
 125     # This grabs only the special casing for multi-char uppercase. Note that
 126     # there are no multi-char lowercase, and that Sun ignores multi-char
 127     # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
 128     # which must be hardcoded in java.lang.String:
 129     #  \u03a3 (Sun ignores this special case)
 130     #  \u0049 - lowercases to \u0131, but only in Turkish locale
 131     #  \u0069 - uppercases to \u0130, but only in Turkish locale
 132     next unless defined $upper and $upper =~ / /;    # keep multi-char mappings only
 133     $special{hex $ch} = [map {hex} split ' ', $upper];
 134 }
 135 close SPECIAL;
 136
 137 ################################################################################
 138 ################################################################################
 139 ## Stage 1: Parse the attribute file
 140 print "Parsing attributes file";
 141 open (UNICODE, "<", $ARGV[0]) || die "Can't open Unicode attribute file: $!\n";
 142 while (<UNICODE>) {
 143     print "." unless $count++ % 1000;
 144     tr/\r\n//d;    # drop the line terminator, whether LF or CRLF
 145     # LIMIT -1 keeps trailing empty fields as "" (not undef), so -w stays quiet.
 146     my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
 147         $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';', $_, -1;
 148     $ch = hex($ch);
 149
 150     # plane tells us which Unicode code plane we're currently in and is an
 151     # index into infoArray.
 152     my $plane = int($ch / 0x10000);
 153     my $planeBase = $plane * 0x10000;
 154     $info = \@{$infoArray[$plane]};
 155
 156     my ($type, $numValue, $upperchar, $lowerchar, $direction);
 157
 158     # Set the value of the $type variable, checking to make sure that it's valid
 159     # and setting the mirrored and nobreak bits if necessary.
 160     $type = 0;
 161     while ($category !~ /^$TYPECODES[$type]$/) {
 162         if (++$type == @TYPECODES) {
 163             die "$ch: Unknown type: $category";
 164         }
 165     }
 166     $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
 167     $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
 168
 169     # Set the value of the $numValue variable, checking the special cases of
 170     # large numbers or 'a' - 'z' values.
 171     if ($numeric =~ /^[0-9]+$/) {
 172         $numValue = $numeric;
 173         # If numeric takes more than 16 bits to store we want to store that
 174         # number in a separate array and store a number N in numValue such
 175         # that (-N - 3) is the offset into the separate array containing the
 176         # large numerical value.
 177         if ($numValue >= 0x7fff) {
 178             $numValue = -3 - @largeNums;
 179             push @largeNums, $numeric;
 180         }
 181     } elsif ($numeric eq "") {
 182         # Letters count as the digits 10 .. 35 (ASCII and fullwidth forms)
 183         if ($ch >= 0x0041 && $ch <= 0x005a) {
 184             $numValue = $ch - 0x0037;
 185         } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
 186             $numValue = $ch - 0x0057;
 187         } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
 188             $numValue = $ch - 0xff17;
 189         } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
 190             $numValue = $ch - 0xff37;
 191         } else {
 192             $numValue = -1;
 193         }
 194     } else {
 195         $numValue = -2;    # a numeric field that is not a plain non-negative integer
 196     }
 197
 198     # Set the uppercase and lowercase expansions for the character.
 199     $upperchar = $upcase ? hex($upcase) - $ch : 0;
 200     $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
 201
 202     # If this character has a special titlecase expansion then append it to
 203     # the titlecase String.
 204     if ($title ne $upcase) {
 205         my $titlechar = $title ? hex($title) : $ch;
 206         $titlecase .= pack("n2", $ch, $titlechar);
 207     }
 208
 209     # Set the direction variable, use the lower 2 bits as a count of how many
 210     # characters will be added to the String if this character undergoes an
 211     # uppercase expansion.
 212     $direction = 0;
 213     while ($bidir !~ /^$DIRCODES[$direction]$/) {
 214         if (++$direction == @DIRCODES) {
 215             $direction = -1;    # no known direction code matched
 216             last;
 217         }
 218     }
 219     $direction <<= 2;
 220     $direction += $#{$special{$ch}} if defined $special{$ch};
 221
 222     # If the UnicodeData file blocks off ranges of code points give them all
 223     # the same character information.
 224     if ($range) {
 225         die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
 226         for ($range + 1 .. $ch - 1) {
 227             $info->[$_ - $planeBase] = pack("n5", $type, $numValue, $upperchar,
 228                              $lowerchar, $direction);
 229         }
 230         $range = 0;
 231     } elsif ($name =~ /First>$/) {
 232         $range = $ch;
 233     }
 234
 235     # Store all this parsed information into the element in infoArray that info
 236     # points to.
 237     $info->[$ch - $planeBase] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
 238                       $direction);
 239 }
 240 close UNICODE;
 241
 242 ################################################################################
 243 ################################################################################
 244 ## Stage 2: Compress the data structures
 245 printf "\nCompressing data structures";
 246 $count = 0;
 247
 248 # data is an array of Strings, one per plane, that will be used to create the
 249 # DATA Strings containing character information and attribute table offsets.
 250 my @data = ();
 251
 252 # charhashArray is an array of hashtables used so that we can reuse character
 253 # attributes when characters share the same attributes ... this makes our
 254 # attribute tables smaller.  charhash is a pointer into this array.
 255 my @charhashArray = ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
 256 my $charhash = ();
 257
 258 # charinfoArray is an array of arrays, one per plane, for storing character
 259 # information.  charinfo is a pointer into this array.  (Built with map,
 260 # since \((), ...) flattens to an empty list instead of 17 array refs.)
 261 my @charinfoArray = map { [] } 0 .. 0x10;
 262 my $charinfo;
 263
 264 # charlen is an array, one element per plane, that tells us how many unique
 265 # character attributes there are for that plane.
 266 my @charlen = ();
 267
 268 for my $plane (0 .. 0x10) {
 269     $info = \@{$infoArray[$plane]};
 270     my $planeBase = $plane * 0x10000;
 271     $charhash = \%{$charhashArray[$plane]};
 272     $charinfo = \@{$charinfoArray[$plane]};
 273
 274     for my $ch ($planeBase .. $planeBase + 0xffff) {
 275         my $index = $ch - $planeBase;
 276         print "." unless $count++ % 0x1000;
 277         $info->[$index] = pack("n5", 0, -1, 0, 0, -4) unless defined $info->[$index];
 278
 279         my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info->[$index]);
 280         if (! exists $charhash->{$info->[$index]}) {
 281             # This attribute set has not been seen in this plane before, so
 282             # append it to charinfo and record its offset in charhash.  When
 283             # the file is generated that offset becomes the upper 9 bits of
 284             # the DATA entry, which leaves room for only 512 distinct sets;
 285             # a 513th would silently wrap when packed below, so stop instead.
 286             push @{$charinfo}, [ $numVal, $upper, $lower, $direction ];
 287             $charhash->{$info->[$index]} = @{$charinfo} - 1;
 288             die "Too many unique character attributes in plane $plane: "
 289                 . scalar(@{$charinfo}) . " (DATA offsets are limited to 512)\n"
 290                 if @{$charinfo} > 512;
 291         }
 292         $data[$plane] .= pack("n", ($charhash->{$info->[$index]} << 7) | $type);
 293     }
 294     $charlen[$plane] = scalar(@{$charinfoArray[$plane]});
 295 }
 296
 297 # bestshift holds, per plane, the shift that results in the best compression of
 298 # the table; different shifts are better for the different tables of each plane.
 299 my @bestshift;
 300
 301 # bestest is the smallest size estimate seen so far; it starts as a large guess.
 302 my $bestest = 1000000;
 303 my @bestblkstr;      # per plane: the merged block data produced by the best shift
 304 my @blksize = ();    # per plane: 1 << bestshift
 305
 306 for my $plane (0 .. 0x10) {
 307     print "\n\nplane: $plane\n";
 308     print "Unique character entries: $charlen[$plane]\n";
 309     $bestest = 1000000;    # start over for every plane
 310     for my $i (3 .. 8) {    # try every candidate shift
 311         my $blksize = 1 << $i;
 312         my %blocks = ();      # block contents => index into blkarray
 313         my @blkarray = ();    # the distinct blocks, in order of first appearance
 314         my ($j, $k);
 315         print "shift: $i";
 316
 317         for ($j = 0; $j < 0x10000; $j += $blksize) {
 318             my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize;    # 2 bytes per entry
 319             if (! exists $blocks{$blkkey}) {
 320                 push @blkarray, $blkkey;
 321                 $blocks{$blkkey} = $#blkarray;
 322             }
 323         }
 324
 325         my $blknum = @blkarray;
 326         my $blocklen = $blknum * $blksize;    # total entries before merging
 327         printf " before %5d", $blocklen;
 328
 329         # Now we try to pack the blkarray as tight as possible by finding matching
 330         # heads and tails, trying the longest overlap ($j entries) first.
 331         for ($j = $blksize - 1; $j > 0; $j--) {
 332             my %tails = ();    # last $j entries of a block => list of block indices
 333             for $k (0 .. $#blkarray) {
 334                 next unless defined $blkarray[$k];    # already merged into another block
 335                 my $len = length $blkarray[$k];
 336                 my $tail = substr $blkarray[$k], $len - $j * 2;
 337                 if (exists $tails{$tail}) {
 338                     push @{$tails{$tail}}, $k;
 339                 } else {
 340                     $tails{$tail} = [ $k ];
 341                 }
 342             }
 343
 344             # tails are calculated, now calculate the heads and merge.
 345           BLOCK:
 346             for $k (0 .. $#blkarray) {
 347                 next unless defined $blkarray[$k];
 348                 my $tomerge = $k;
 349                 while (1) {
 350                     my $head = substr($blkarray[$tomerge], 0, $j * 2);
 351                     my $entry = $tails{$head};
 352                     next BLOCK unless defined $entry;    # no block ends with this head
 353
 354                     my $other = shift @{$entry};
 355                     if ($other == $tomerge) {    # a block cannot be merged with itself
 356                         if (@{$entry}) {
 357                             push @{$entry}, $other;
 358                             $other = shift @{$entry};
 359                         } else {
 360                             push @{$entry}, $other;
 361                             next BLOCK;
 362                         }
 363                     }
 364                     if (@{$entry} == 0) {
 365                         delete $tails{$head};
 366                     }
 367
 368                     # a match was found: append tomerge, minus the shared head, to other
 369                     my $merge = $blkarray[$other]
 370                         . substr($blkarray[$tomerge], $j * 2);
 371                     $blocklen -= $j;    # the overlapping entries are now stored once
 372                     $blknum--;
 373
 374                     if ($other < $tomerge) {    # keep the result in the lower slot
 375                         $blkarray[$tomerge] = undef;
 376                         $blkarray[$other] = $merge;
 377                         my $len = length $merge;
 378                         my $tail = substr $merge, $len - $j * 2;
 379                         $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
 380                                           @{$tails{$tail}} ];
 381                         next BLOCK;
 382                     }
 383                     $blkarray[$tomerge] = $merge;
 384                     $blkarray[$other] = undef;
 385                 }
 386             }
 387         }
 388         my $blockstr;    # concatenation of all blocks that survived merging
 389         for $k (0 .. $#blkarray) {
 390             $blockstr .= $blkarray[$k] if defined $blkarray[$k];
 391         }
 392
 393         die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
 394         my $estimate = 2 * $blocklen + (0x20000 >> $i);    # data bytes + 2 per block entry
 395
 396         printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
 397         if ($estimate < $bestest) {
 398             $bestest = $estimate;
 399             $bestshift[$plane] = $i;
 400             $bestblkstr[$plane] = $blockstr;
 401         }
 402     }
 403     $blksize[$plane] = 1 << $bestshift[$plane];
 404     print "best shift: ", $bestshift[$plane];
 405     print "     blksize: ", $blksize[$plane];
 406 }
 407 # Per plane, the adjusted DATA offset of every block (one array ref per plane).
 408 my @blocksArray = map { [] } 0 .. 0x10;
 409
 410 for my $plane (0 .. 0x10) {
 411     for (my $j = 0; $j < 0x10000; $j += $blksize[$plane]) {
 412         my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize[$plane];
 413         my $index = index $bestblkstr[$plane], $blkkey;
 414         while ($index & 1) {    # odd byte offset (or -1): not on an entry boundary
 415             die "not found: $j" if $index == -1;
 416             $index = index $bestblkstr[$plane], $blkkey, $index + 1;
 417         }
 418         push @{$blocksArray[$plane]}, ($index / 2 - $j) & 0xffff;    # wraps to 16 bits
 419     }
 420 }
 421
 422 ################################################################################
 423 ################################################################################
 424 ## Stage 3: Generate the file (after checking the per-plane size limits)
 425 foreach my $plane (0 .. 0x10) {
 426     my $blockCount = scalar @{$blocksArray[$plane]};
 427     my $dataLength = length $bestblkstr[$plane];
 428     die "UTF-8 limit of blocks may be exceeded for plane $plane: " . $blockCount . "\n" unless $blockCount <= 0xffff / 3;
 429     die "UTF-8 limit of data may be exceeded for plane $plane: " . $dataLength . "\n" unless $dataLength <= 0xffff / 3;
 430 }
 431
 432 {
 433     print "\nGenerating $ARGV[2].";
 434     my ($i, $j);
 435
 436     open OUTPUT, "> $ARGV[2]" or die "Failed creating output file: $!\n";
 437     print OUTPUT <<EOF;
 438 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
 439    Copyright (C) 2002 Free Software Foundation, Inc.
 440    *** This file is generated by scripts/unicode-muncher.pl ***
 441
 442 This file is part of GNU Classpath.
 443
 444 GNU Classpath is free software; you can redistribute it and/or modify
 445 it under the terms of the GNU General Public License as published by
 446 the Free Software Foundation; either version 2, or (at your option)
 447 any later version.
 448
 449 GNU Classpath is distributed in the hope that it will be useful, but
 450 WITHOUT ANY WARRANTY; without even the implied warranty of
 451 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 452 General Public License for more details.
 453
 454 You should have received a copy of the GNU General Public License
 455 along with GNU Classpath; see the file COPYING.  If not, write to the
 456 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 457 02110-1301 USA.
 458
 459 Linking this library statically or dynamically with other modules is
 460 making a combined work based on this library.  Thus, the terms and
 461 conditions of the GNU General Public License cover the whole
 462 combination.
 463
 464 As a special exception, the copyright holders of this library give you
 465 permission to link this library with independent modules to produce an
 466 executable, regardless of the license terms of these independent
 467 modules, and to copy and distribute the resulting executable under
 468 terms of your choice, provided that you also meet, for each linked
 469 independent module, the terms and conditions of the license of that
 470 module.  An independent module is a module which is not derived from
 471 or based on this library.  If you modify this library, you may extend
 472 this exception to your version of the library, but you are not
 473 obligated to do so.  If you do not wish to do so, delete this
 474 exception statement from your version. */
 475
 476 package gnu.java.lang;
 477
 478 /**
 479  * This contains the info about the unicode characters, that
 480  * java.lang.Character needs.  It is generated automatically from
 481  * <code>$ARGV[0]</code> and
 482  * <code>$ARGV[1]</code>, by some
 483  * perl scripts. These Unicode definition files can be found on the
 484  * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 485  * JDK 1.5 uses Unicode version 4.0.0.
 486  *
 487  * The data is stored as string constants, but Character will convert these
 488  * Strings to their respective <code>char[]</code> components.  The fields
 489  * are stored in arrays of 17 elements each, one element per Unicode plane.
 490  * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
 491  * characters within <code>DATA</code>.  The DATA field, in turn, stores
 492  * information about each character in the low order bits, and an offset
 493  * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
 494  * <code>NUM_VALUE</code>, and <code>DIRECTION</code>.  Notice that the
 495  * attribute tables are much smaller than 0xffff entries; as many characters
 496  * in Unicode share common attributes.  Numbers that are too large to fit
 497  * into NUM_VALUE as 16 bit chars are stored in LARGENUMS and a number N is
 498  * stored in NUM_VALUE such that (-N - 3) is the offset into LARGENUMS for
 499  * the particular character. The DIRECTION table also contains a field for
 500  * detecting characters with multi-character uppercase expansions.
 501  * Next, there is a listing for <code>TITLE</code> exceptions (most characters
 502  * just have the same title case as upper case).  Finally, there are two
 503  * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
 504  * which lists the characters which are special cased, and
 505  * <code>UPPER_EXPAND</code>, which lists their expansion.
 506  *
 507  * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
 508  *         Eric Blake)
 509  * \@see Character
 510  * \@see String
 511  */
 512 public interface CharData
 513 {
 514   /**
 515    * The Unicode definition file that was parsed to build this database.
 516    */
 517   String SOURCE = \"$ARGV[0]\";
 518
 519   /**
 520    * The character shift amount to look up the block offset. In other words,
 521    * <code>(char) (BLOCKS.value[ch >> SHIFT[p]] + ch)</code> is the index
 522    * where <code>ch</code> is described in <code>DATA</code> if <code>ch</code>
 523    * is in Unicode plane <code>p</code>.  Note that <code>p</code> is simply
 524    * the integer division of ch and 0x10000.
 525    */
 526   int[] SHIFT
 527 EOF
 528   for ($i = 0; $i < @bestshift - 1; $i++) {
 529       if ($i == 0){
 530           print OUTPUT "    = new int[] {";
 531       }
 532       print OUTPUT $bestshift[$i], ", ";
 533   }
 534   if (scalar(@bestshift) > 0){
 535     print OUTPUT $bestshift[-1], "}";
 536   }
 537   else {
 538     print OUTPUT "    = null";
 539   }
 540   print OUTPUT <<EOF;
 541 ;
 542
 543   /**
 544    * The mapping of character blocks to their location in <code>DATA</code>.
 545    * Each entry has been adjusted so that the 16-bit sum with the desired
 546    * character gives the actual index into <code>DATA</code>.
 547    */
 548    String[] BLOCKS = new String[]{
 549 EOF
 550     for ($plane = 0; $plane <= 0x10; $plane++) {
 551         # The following if statement handles the cases of unassigned planes
 552         # specially so we don't waste space with unused Strings.  As of
 553         # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
 554         # you are updating this script to work with a later version of
 555         # Unicode you may have to alter this if statement.
 556         if ($plane > 2 && $plane != 14) {
 557             print OUTPUT ($plane == 0x10) ? "    \"\"}" : "    \"\",\n\n";
 558         }
 559         else {
 560             for ($i = 0; $i < @{$blocksArray[$plane]} / 11; $i++) {
 561                 print OUTPUT $i ? "\n    + " : "    ";
 562                 print OUTPUT "\"";
 563                 for $j (0 .. 10) {
 564                     last if @{$blocksArray[$plane]} <= $i * 11 + $j;
 565                     my $val = $blocksArray[$plane]->[$i * 11 + $j];
 566                     print OUTPUT javaChar($val);
 567                 }
 568                 print OUTPUT "\"";
 569             }
 570             print OUTPUT ",\n\n";
 571         }
 572     }
 573     print OUTPUT <<EOF;
 574 ;
 575
 576   /**
 577    * The array containing the numeric values that are too large to be stored as
 578    * chars in NUM_VALUE.  NUM_VALUE in this case will contain a negative integer
 579    * N such that LARGENUMS[-N - 3] contains the correct numeric value.
 580    */
 581   int[] LARGENUMS
 582 EOF
 583   for ($i = 0; $i < @largeNums - 1; $i++) {
 584       if ($i == 0){
 585           print OUTPUT "    = new int[] {";
 586       }
 587       print OUTPUT $largeNums[$i], ", ";
 588   }
 589   if (scalar(@largeNums) > 0){
 590     print OUTPUT $largeNums[-1], "}";
 591   }
 592   else {
 593     print OUTPUT "    = null";
 594   }
 595   print OUTPUT <<EOF;
 596 ;
 597
 598   /**
 599    * Information about each character.  The low order 5 bits form the
 600    * character type, the next bit is a flag for non-breaking spaces, and the
 601    * next bit is a flag for mirrored directionality.  The high order 9 bits
 602    * form the offset into the attribute tables.  Note that this limits the
 603    * number of unique character attributes to 512, which is not a problem
 604    * as of Unicode version 4.0.0, but may soon become one.
 605    */
 606    String[] DATA = new String[]{
 607 EOF
 608     for ($plane = 0; $plane <= 0x10; $plane++) {
 609         # The following if statement handles the cases of unassigned planes
 610         # specially so we don't waste space with unused Strings.  As of
 611         # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
 612         # you are updating this script to work with a later version of
 613         # Unicode you may have to alter this if statement.
 614         if ($plane > 2 && $plane != 14) {
 615             print OUTPUT ($plane == 0x10) ? "    \"\"}" : "    \"\",\n\n";
 616         }
 617         else {
 618             my $len = length($bestblkstr[$plane]) / 2;
 619             for ($i = 0; $i < $len / 11; $i++) {
 620                 print OUTPUT $i ? "\n    + " : "    ";
 621                 print OUTPUT "\"";
 622                 for $j (0 .. 10) {
 623                     last if $len <= $i * 11 + $j;
 624                     my $val = unpack "n", substr($bestblkstr[$plane], 2 * ($i * 11 + $j), 2);
 625                     print OUTPUT javaChar($val);
 626                 }
 627                 print OUTPUT "\"";
 628             }
 629             print OUTPUT ",\n\n";
 630         }
 631     }
 632     print OUTPUT <<EOF;
 633 ;
 634
 635   /**
 636    * This is the attribute table for computing the numeric value of a
 637    * character.  The value is -1 if Unicode does not define a value, -2
 638    * if the value is not a positive integer, otherwise it is the value.
 639    * Note that this is a signed value, but stored as an unsigned char
 640    * since this is a String literal.
 641    */
 642    String[] NUM_VALUE = new String[]{
 643 EOF
 644
 645     for ($plane = 0; $plane <= 0x10; $plane++) {
 646         # The following if statement handles the cases of unassigned planes
 647         # specially so we don't waste space with unused Strings.  As of
 648         # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
 649         # you are updating this script to work with a later version of
 650         # Unicode you may have to alter this if statement.
 651         if ($plane > 2 && $plane != 14) {
 652             print OUTPUT ($plane == 0x10) ? "    \"\"}" : "    \"\",\n\n";
 653         }
 654         else {
 655             $len = @{$charinfoArray[$plane]};
 656             for ($i = 0; $i < $len / 11; $i++) {
 657                 print OUTPUT $i ? "\n    + " : "   ";
 658                 print OUTPUT "\"";
 659                 for $j (0 .. 10) {
 660                     last if $len <= $i * 11 + $j;
 661                     my $val = $charinfoArray[$plane]->[$i * 11 + $j][0];
 662                     print OUTPUT javaChar($val);
 663                 }
 664                 print OUTPUT "\"";
 665             }
 666             print OUTPUT ",\n\n";
 667         }
 668     }
 669     print OUTPUT <<EOF;
 670 ;
 671
 672   /**
 673    * This is the attribute table for computing the single-character uppercase
 674    * representation of a character.  The value is the signed difference
 675    * between the character and its uppercase version.  Note that this is
 676    * stored as an unsigned char since this is a String literal.  When
 677    * capitalizing a String, you must first check if a multi-character uppercase
 678    * sequence exists before using this character.
 679    */
 680   String[] UPPER = new String[]{
 681 EOF
 682
 683     for ($plane = 0; $plane <= 0x10; $plane++) {
 684         # Planes without assigned characters become empty Strings so no
 685         # space is wasted on them.  As of Unicode version 4.0.0 only planes
 686         # 0, 1, 2, and 14 are used; revisit this test when updating the
 687         # script to a later version of Unicode.
 688         my $in_use = $plane <= 2 || $plane == 14;
 689         if (!$in_use) {
 690             print OUTPUT $plane == 0x10 ? "    \"\"}" : "    \"\",\n\n";
 691             next;
 692         }
 693         my $entries = $charinfoArray[$plane];
 694         my $count = @{$entries};
 695         for (my $base = 0; $base < $count; $base += 11) {
 696             print OUTPUT $base ? "\n    + " : "   ";
 697             print OUTPUT "\"";
 698             for my $idx ($base .. $base + 10) {
 699                 last if $idx >= $count;
 700                 my $val = $entries->[$idx][1];
 701                 print OUTPUT javaChar($val);
 702             }
 703             print OUTPUT "\"";
 704         }
 705         print OUTPUT ",\n\n";
 706     }
 707     print OUTPUT <<EOF;
 708 ;
 709
 710   /**
 711    * This is the attribute table for computing the lowercase representation
 712    * of a character.  The value is the signed difference between the
 713    * character and its lowercase version.  Note that this is stored as an
 714    * unsigned char since this is a String literal.
 715    */
 716    String[] LOWER = new String[]{
 717 EOF
 718
 719     for ($plane = 0; $plane <= 0x10; $plane++) {
 720         # Planes without assigned characters become empty Strings so no
 721         # space is wasted on them.  As of Unicode version 4.0.0 only planes
 722         # 0, 1, 2, and 14 are used; revisit this test when updating the
 723         # script to a later version of Unicode.
 724         my $in_use = $plane <= 2 || $plane == 14;
 725         if (!$in_use) {
 726             print OUTPUT $plane == 0x10 ? "    \"\"}" : "    \"\",\n\n";
 727             next;
 728         }
 729         my $entries = $charinfoArray[$plane];
 730         my $count = @{$entries};
 731         for (my $base = 0; $base < $count; $base += 11) {
 732             print OUTPUT $base ? "\n    + " : "   ";
 733             print OUTPUT "\"";
 734             for my $idx ($base .. $base + 10) {
 735                 last if $idx >= $count;
 736                 my $val = $entries->[$idx][2];
 737                 print OUTPUT javaChar($val);
 738             }
 739             print OUTPUT "\"";
 740         }
 741         print OUTPUT ",\n\n";
 742     }
 743     print OUTPUT <<EOF;
 744 ;
 745
 746   /**
 747    * This is the attribute table for computing the directionality class
 748    * of a character, as well as a marker of characters with a multi-character
 749    * capitalization.  The direction is taken by performing a signed shift
 750    * right by 2 (where a result of -1 means an unknown direction, such as
 751    * for undefined characters). The lower 2 bits form a count of the
 752    * additional characters that will be added to a String when performing
 753    * multi-character uppercase expansion. This count is also used, along with
 754    * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
 755    * when performing the case conversion. Note that this information is stored
 756    * as an unsigned char since this is a String literal.
 757    */
 758   String[] DIRECTION = new String[]{
 759 EOF
 760
 761     for ($plane = 0; $plane <= 0x10; $plane++) {
 762         # Planes without assigned characters become empty Strings so no
 763         # space is wasted on them.  As of Unicode version 4.0.0 only planes
 764         # 0, 1, 2, and 14 are used; revisit this test when updating the
 765         # script to a later version of Unicode.
 766         my $in_use = $plane <= 2 || $plane == 14;
 767         if (!$in_use) {
 768             print OUTPUT $plane == 0x10 ? "    \"\"}" : "    \"\",\n\n";
 769             next;
 770         }
 771         my $entries = $charinfoArray[$plane];
 772         my $count = @{$entries};
 773         for (my $base = 0; $base < $count; $base += 11) {
 774             print OUTPUT $base ? "\n    + " : "   ";
 775             print OUTPUT "\"";
 776             for my $idx ($base .. $base + 10) {
 777                 last if $idx >= $count;
 778                 my $val = $entries->[$idx][3];
 779                 print OUTPUT javaChar($val);
 780             }
 781             print OUTPUT "\"";
 782         }
 783         print OUTPUT ",\n\n";
 784     }
 785     print OUTPUT <<EOF;
 786 ;
 787
 788   /**
 789    * This is the listing of titlecase special cases (all other characters
 790    * can use <code>UPPER</code> to determine their titlecase).  The listing
 791    * is a sorted sequence of character pairs; converting the first character
 792    * of the pair to titlecase produces the second character.
 793    */
 794   String TITLE
 795 EOF
 796     # $titlecase is read as a run of packed big-endian 16-bit values.
 797     my $title_count = length($titlecase) / 2;
 798     for (my $base = 0; $base < $title_count; $base += 11) {
 799         print OUTPUT $base ? "\n    + \"" : "    = \"";
 800         for my $idx ($base .. $base + 10) {
 801             last if $idx >= $title_count;
 802             my $val = unpack "n", substr($titlecase, 2 * $idx, 2);
 803             print OUTPUT javaChar($val);
 804         }
 805         print OUTPUT "\"";
 806     }
 807
 808     print OUTPUT <<EOF;
 809 ;
 810
 811   /**
 812    * This is a listing of characters with multi-character uppercase sequences.
 813    * A character appears in this list exactly when it has a non-zero entry
 814    * in the low-order 2-bit field of DIRECTION.  The listing is a sorted
 815    * sequence of pairs (hence a binary search on the even elements is an
 816    * efficient way to lookup a character). The first element of a pair is the
 817    * character with the expansion, and the second is the index into
 818    * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
 819    * DIRECTION to determine where the expansion ends.
 820    */
 821   String UPPER_SPECIAL
 822 EOF
 823     # Five (character, offset) pairs per line; $expansion grows as we go.
 824     my @special_chars = sort { $a <=> $b } keys %special;
 825     my $expansion = "";
 826     my $offset = 0;
 827     for (my $base = 0; $base < @special_chars; $base += 5) {
 828         print OUTPUT $base ? "\n    + \"" : "    = \"";
 829         for my $idx ($base .. $base + 4) {
 830             last if $idx > $#special_chars;
 831             my $ch = $special_chars[$idx];
 832             my @upper = @{$special{$ch}};
 833             print OUTPUT javaChar($ch);
 834             print OUTPUT javaChar($offset);
 835             $offset += @upper;
 836             $expansion .= pack "n*", @upper;
 837         }
 838         print OUTPUT "\"";
 839     }
 840
 841     print OUTPUT <<EOF;
 842 ;
 843
 844   /**
 845    * This is the listing of special case multi-character uppercase sequences.
 846    * Characters listed in UPPER_SPECIAL index into this table to find their
 847    * uppercase expansion. Remember that you must also perform special-casing
 848    * on two single-character sequences in the Turkish locale, which are not
 849    * covered here in CharData.
 850    */
 851   String UPPER_EXPAND
 852 EOF
 853     # $expansion is read as a run of packed big-endian 16-bit values.
 854     my $expand_count = length($expansion) / 2;
 855     for (my $base = 0; $base < $expand_count; $base += 11) {
 856         print OUTPUT $base ? "\n    + \"" : "    = \"";
 857         for my $idx ($base .. $base + 10) {
 858             last if $idx >= $expand_count;
 859             my $val = unpack "n", substr($expansion, 2 * $idx, 2);
 860             print OUTPUT javaChar($val);
 861         }
 862         print OUTPUT "\"";
 863     }
 864     # Write the trailer; close is where buffered write errors surface.
 865     print OUTPUT ";\n}\n";
 866     close OUTPUT or die "Can't close the generated file: $!\n";
 867 }
 868 print "\nDone.\n";  # tell the user the run has finished