gomp-20050608-branch/libjava/classpath/scripts/unicode-muncher.pl

   1 #!/usr/bin/perl -w
   2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
   3 # Copyright (C) 1998, 2002, 2004  Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20 # 02110-1301 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39 # Code for reading UnicodeData.txt and generating the code for
  40 # gnu.java.lang.CharData.  For now, the relevant Unicode definition files
  41 # are found in doc/unicode/.
  42 #
  43 # Inspired by code from Jochen Hoenicke.
  44 # author Eric Blake <ebb9@email.byu.edu>
  45 #
  46 # Usage: ./unicode-muncher <UnicodeData> <SpecialCasing> <CharData.java>
  47 #   where <UnicodeData> and <SpecialCasing> are .txt files obtained from
  48 #   www.unicode.org (named UnicodeData-3.0.0.txt and SpecialCasing-2.txt for
  49 #   Unicode version 3.0.0), and <CharData.java> is the final location for the
  50 #   Java interface gnu.java.lang.CharData.
  51 #   As of JDK 1.4, use Unicode version 3.0.0 for best results.
  52
  53 ##
  54 ## Convert a 16-bit integer to a Java source code String literal character
  55 ##
  56 sub javaChar($) {
  57     my ($char) = @_;
  58     die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
  59     $char += 0x10000 if $char < 0;
  60     # Special case characters that must be escaped, or are shorter as ASCII
  61     return sprintf("\\%03o", $char) if $char < 0x20;
  62     return "\\\"" if $char == 0x22;
  63     return "\\\\" if $char == 0x5c;
  64     return pack("C", $char) if $char < 0x7f;
  65     return sprintf("\\u%04x", $char);
  66 }
  67
  68 ##
  69 ## Convert the text UnicodeData file from www.unicode.org into a Java
  70 ## interface with string constants holding the compressed information.
  71 ##
  72 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
  73                    SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
  74 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
  75
  76 my $NOBREAK_FLAG  = 32;
  77 my $MIRRORED_FLAG = 64;
  78
  79 my %special = ();
  80 my @info = ();
  81 my $titlecase = "";
  82 my $count = 0;
  83 my $range = 0;
  84
  85 die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
  86     unless @ARGV == 3;
  87 $| = 1;
  88 print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
  89 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
  90
  91 # Stage 0: Parse the special casing file
  92 print "Parsing special casing file\n";
  93 open (SPECIAL, "< $ARGV[1]") || die "Can't open special casing file: $!\n";
  94 while (<SPECIAL>) {
  95     next if /^\#/;
  96     my ($ch, undef, undef, $upper) = split / *; */;
  97
  98     # This grabs only the special casing for multi-char uppercase. Note that
  99     # there are no multi-char lowercase, and that Sun ignores multi-char
 100     # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
 101     # which must be hardcoded in java.lang.String:
 102     #  \u03a3 (Sun ignores this special case)
 103     #  \u0049 - lowercases to \u0131, but only in Turkish locale
 104     #  \u0069 - uppercases to \u0130, but only in Turkish locale
 105     next unless defined $upper and $upper =~ / /;
 106     $special{hex $ch} = [map {hex} split ' ', $upper];
 107 }
 108
 109 close SPECIAL;
 110
 111 # Stage 1: Parse the attribute file
 112 print "Parsing attributes file";
 113 open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
 114 while (<UNICODE>) {
 115     print "." unless $count++ % 1000;
 116     chomp;
 117     s/\r//g;
 118     my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
 119         $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
 120     $ch = hex($ch);
 121     next if $ch > 0xffff; # Ignore surrogate pairs, since Java does
 122
 123     my ($type, $numValue, $upperchar, $lowerchar, $direction);
 124
 125     $type = 0;
 126     while ($category !~ /^$TYPECODES[$type]$/) {
 127         if (++$type == @TYPECODES) {
 128             die "$ch: Unknown type: $category";
 129         }
 130     }
 131     $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
 132     $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
 133
 134     if ($numeric =~ /^[0-9]+$/) {
 135         $numValue = $numeric;
 136         die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
 137     } elsif ($numeric eq "") {
 138         # Special case sequences of 'a'-'z'
 139         if ($ch >= 0x0041 && $ch <= 0x005a) {
 140             $numValue = $ch - 0x0037;
 141         } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
 142             $numValue = $ch - 0x0057;
 143         } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
 144             $numValue = $ch - 0xff17;
 145         } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
 146             $numValue = $ch - 0xff37;
 147         } else {
 148             $numValue = -1;
 149         }
 150     } else {
 151         $numValue = -2;
 152     }
 153
 154     $upperchar = $upcase ? hex($upcase) - $ch : 0;
 155     $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
 156     if ($title ne $upcase) {
 157         my $titlechar = $title ? hex($title) : $ch;
 158         $titlecase .= pack("n2", $ch, $titlechar);
 159     }
 160
 161     $direction = 0;
 162     while ($bidir !~ /^$DIRCODES[$direction]$/) {
 163         if (++$direction == @DIRCODES) {
 164             $direction = -1;
 165             last;
 166         }
 167     }
 168     $direction <<= 2;
 169     $direction += $#{$special{$ch}} if defined $special{$ch};
 170
 171     if ($range) {
 172         die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
 173         for ($range + 1 .. $ch - 1) {
 174             $info[$_] = pack("n5", $type, $numValue, $upperchar,
 175                              $lowerchar, $direction);
 176         }
 177         $range = 0;
 178     } elsif ($name =~ /First>$/) {
 179         $range = $ch;
 180     }
 181     $info[$ch] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
 182                       $direction);
 183 }
 184 close UNICODE;
 185
 186 # Stage 2: Compress the data structures
 187 printf "\nCompressing data structures";
 188 $count = 0;
 189 my $info = ();
 190 my %charhash = ();
 191 my @charinfo = ();
 192
 193 for my $ch (0 .. 0xffff) {
 194     print "." unless $count++ % 0x1000;
 195     $info[$ch] = pack("n5", 0, -1, 0, 0, -4) unless defined $info[$ch];
 196
 197     my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
 198     if (! exists $charhash{$info[$ch]}) {
 199         push @charinfo, [ $numVal, $upper, $lower, $direction ];
 200         $charhash{$info[$ch]} = $#charinfo;
 201     }
 202     $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
 203 }
 204
 205 my $charlen = @charinfo;
 206 my $bestshift;
 207 my $bestest = 1000000;
 208 my $bestblkstr;
 209 die "Too many unique character entries: $charlen\n" if $charlen > 512;
 210 print "\nUnique character entries: $charlen\n";
 211
 212 for my $i (3 .. 8) {
 213     my $blksize = 1 << $i;
 214     my %blocks = ();
 215     my @blkarray = ();
 216     my ($j, $k);
 217     print "shift: $i";
 218
 219     for ($j = 0; $j < 0x10000; $j += $blksize) {
 220         my $blkkey = substr $info, 2 * $j, 2 * $blksize;
 221         if (! exists $blocks{$blkkey}) {
 222             push @blkarray, $blkkey;
 223             $blocks{$blkkey} = $#blkarray;
 224         }
 225     }
 226     my $blknum = @blkarray;
 227     my $blocklen = $blknum * $blksize;
 228     printf " before %5d", $blocklen;
 229
 230     # Now we try to pack the blkarray as tight as possible by finding matching
 231     # heads and tails.
 232     for ($j = $blksize - 1; $j > 0; $j--) {
 233         my %tails = ();
 234         for $k (0 .. $#blkarray) {
 235             next unless defined $blkarray[$k];
 236             my $len = length $blkarray[$k];
 237             my $tail = substr $blkarray[$k], $len - $j * 2;
 238             if (exists $tails{$tail}) {
 239                 push @{$tails{$tail}}, $k;
 240             } else {
 241                 $tails{$tail} = [ $k ];
 242             }
 243         }
 244
 245         # tails are calculated, now calculate the heads and merge.
 246       BLOCK:
 247         for $k (0 .. $#blkarray) {
 248             next unless defined $blkarray[$k];
 249             my $tomerge = $k;
 250             while (1) {
 251                 my $head = substr($blkarray[$tomerge], 0, $j * 2);
 252                 my $entry = $tails{$head};
 253                 next BLOCK unless defined $entry;
 254
 255                 my $other = shift @{$entry};
 256                 if ($other == $tomerge) {
 257                     if (@{$entry}) {
 258                         push @{$entry}, $other;
 259                         $other = shift @{$entry};
 260                     } else {
 261                         push @{$entry}, $other;
 262                         next BLOCK;
 263                     }
 264                 }
 265                 if (@{$entry} == 0) {
 266                     delete $tails{$head};
 267                 }
 268
 269                 # a match was found
 270                 my $merge = $blkarray[$other]
 271                     . substr($blkarray[$tomerge], $j * 2);
 272                 $blocklen -= $j;
 273                 $blknum--;
 274
 275                 if ($other < $tomerge) {
 276                     $blkarray[$tomerge] = undef;
 277                     $blkarray[$other] = $merge;
 278                     my $len = length $merge;
 279                     my $tail = substr $merge, $len - $j * 2;
 280                     $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
 281                                       @{$tails{$tail}} ];
 282                     next BLOCK;
 283                 }
 284                 $blkarray[$tomerge] = $merge;
 285                 $blkarray[$other] = undef;
 286             }
 287         }
 288     }
 289     my $blockstr;
 290     for $k (0 .. $#blkarray) {
 291         $blockstr .= $blkarray[$k] if defined $blkarray[$k];
 292     }
 293
 294     die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
 295     my $estimate = 2 * $blocklen + (0x20000 >> $i);
 296
 297     printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
 298     if ($estimate < $bestest) {
 299         $bestest = $estimate;
 300         $bestshift = $i;
 301         $bestblkstr = $blockstr;
 302     }
 303 }
 304
 305 my @blocks;
 306 my $blksize = 1 << $bestshift;
 307 for (my $j = 0; $j < 0x10000; $j += $blksize) {
 308     my $blkkey = substr $info, 2 * $j, 2 * $blksize;
 309     my $index = index $bestblkstr, $blkkey;
 310     while ($index & 1) {
 311         die "not found: $j" if $index == -1;
 312         $index = index $bestblkstr, $blkkey, $index + 1;
 313     }
 314     push @blocks, ($index / 2 - $j) & 0xffff;
 315 }
 316
 317 # Phase 3: Generate the file
 318 die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
 319     if @blocks > 0xffff / 3;
 320 die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
 321     if length($bestblkstr) > 0xffff / 3;
 322 {
 323     print "Generating $ARGV[2] with shift of $bestshift";
 324     my ($i, $j);
 325
 326     open OUTPUT, "> $ARGV[2]" or die "Failed creating output file: $!\n";
 327     print OUTPUT <<EOF;
 328 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
 329    Copyright (C) 2002 Free Software Foundation, Inc.
 330    *** This file is generated by scripts/unicode-muncher.pl ***
 331
 332 This file is part of GNU Classpath.
 333
 334 GNU Classpath is free software; you can redistribute it and/or modify
 335 it under the terms of the GNU General Public License as published by
 336 the Free Software Foundation; either version 2, or (at your option)
 337 any later version.
 338
 339 GNU Classpath is distributed in the hope that it will be useful, but
 340 WITHOUT ANY WARRANTY; without even the implied warranty of
 341 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 342 General Public License for more details.
 343
 344 You should have received a copy of the GNU General Public License
 345 along with GNU Classpath; see the file COPYING.  If not, write to the
 346 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 347 02110-1301 USA.
 348
 349 Linking this library statically or dynamically with other modules is
 350 making a combined work based on this library.  Thus, the terms and
 351 conditions of the GNU General Public License cover the whole
 352 combination.
 353
 354 As a special exception, the copyright holders of this library give you
 355 permission to link this library with independent modules to produce an
 356 executable, regardless of the license terms of these independent
 357 modules, and to copy and distribute the resulting executable under
 358 terms of your choice, provided that you also meet, for each linked
 359 independent module, the terms and conditions of the license of that
 360 module.  An independent module is a module which is not derived from
 361 or based on this library.  If you modify this library, you may extend
 362 this exception to your version of the library, but you are not
 363 obligated to do so.  If you do not wish to do so, delete this
 364 exception statement from your version. */
 365
 366 package gnu.java.lang;
 367
 368 /**
 369  * This contains the info about the unicode characters, that
 370  * java.lang.Character needs.  It is generated automatically from
 371  * <code>$ARGV[0]</code> and
 372  * <code>$ARGV[1]</code>, by some
 373  * perl scripts. These Unicode definition files can be found on the
 374  * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 375  * JDK 1.4 uses Unicode version 3.0.0.
 376  *
 377  * The data is stored as string constants, but Character will convert these
 378  * Strings to their respective <code>char[]</code> components.  The field
 379  * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
 380  * characters within <code>DATA</code>.  The DATA field, in turn, stores
 381  * information about each character in the low order bits, and an offset
 382  * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
 383  * <code>NUM_VALUE</code>, and <code>DIRECTION</code>.  Notice that the
 384  * attribute tables are much smaller than 0xffff entries; as many characters
 385  * in Unicode share common attributes.  The DIRECTION table also contains
 386  * a field for detecting characters with multi-character uppercase expansions.
 387  * Next, there is a listing for <code>TITLE</code> exceptions (most characters
 388  * just have the same title case as upper case).  Finally, there are two
 389  * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
 390  * which lists the characters which are special cased, and
 391  * <code>UPPER_EXPAND</code>, which lists their expansion.
 392  *
 393  * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
 394  *         Eric Blake)
 395  * \@see Character
 396  * \@see String
 397  */
 398 public interface CharData
 399 {
 400   /**
 401    * The Unicode definition file that was parsed to build this database.
 402    */
 403   String SOURCE = \"$ARGV[0]\";
 404
 405   /**
 406    * The character shift amount to look up the block offset. In other words,
 407    * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
 408    * <code>ch</code> is described in <code>DATA</code>.
 409    */
 410   int SHIFT = $bestshift;
 411
 412   /**
 413    * The mapping of character blocks to their location in <code>DATA</code>.
 414    * Each entry has been adjusted so that the 16-bit sum with the desired
 415    * character gives the actual index into <code>DATA</code>.
 416    */
 417   String BLOCKS
 418 EOF
 419
 420     for ($i = 0; $i < @blocks / 11; $i++) {
 421         print OUTPUT $i ? "\n    + \"" : "    = \"";
 422         for $j (0 .. 10) {
 423             last if @blocks <= $i * 11 + $j;
 424             my $val = $blocks[$i * 11 + $j];
 425             print OUTPUT javaChar($val);
 426         }
 427         print OUTPUT "\"";
 428     }
 429
 430     print OUTPUT <<EOF;
 431 ;
 432
 433   /**
 434    * Information about each character.  The low order 5 bits form the
 435    * character type, the next bit is a flag for non-breaking spaces, and the
 436    * next bit is a flag for mirrored directionality.  The high order 9 bits
 437    * form the offset into the attribute tables.  Note that this limits the
 438    * number of unique character attributes to 512, which is not a problem
 439    * as of Unicode version 3.2.0, but may soon become one.
 440    */
 441   String DATA
 442 EOF
 443
 444     my $len = length($bestblkstr) / 2;
 445     for ($i = 0; $i < $len / 11; $i++) {
 446         print OUTPUT $i ? "\n    + \"" : "    = \"";
 447         for $j (0 .. 10) {
 448             last if $len <= $i * 11 + $j;
 449             my $val = unpack "n", substr($bestblkstr, 2 * ($i * 11 + $j), 2);
 450             print OUTPUT javaChar($val);
 451         }
 452         print OUTPUT "\"";
 453     }
 454
 455     print OUTPUT <<EOF;
 456 ;
 457
 458   /**
 459    * This is the attribute table for computing the numeric value of a
 460    * character.  The value is -1 if Unicode does not define a value, -2
 461    * if the value is not a positive integer, otherwise it is the value.
 462    * Note that this is a signed value, but stored as an unsigned char
 463    * since this is a String literal.
 464    */
 465   String NUM_VALUE
 466 EOF
 467
 468     $len = @charinfo;
 469     for ($i = 0; $i < $len / 11; $i++) {
 470         print OUTPUT $i ? "\n    + \"" : "    = \"";
 471         for $j (0 .. 10) {
 472             last if $len <= $i * 11 + $j;
 473             my $val = $charinfo[$i * 11 + $j][0];
 474             print OUTPUT javaChar($val);
 475         }
 476         print OUTPUT "\"";
 477     }
 478
 479     print OUTPUT <<EOF;
 480 ;
 481
 482   /**
 483    * This is the attribute table for computing the single-character uppercase
 484    * representation of a character.  The value is the signed difference
 485    * between the character and its uppercase version.  Note that this is
 486    * stored as an unsigned char since this is a String literal.  When
 487    * capitalizing a String, you must first check if a multi-character uppercase
 488    * sequence exists before using this character.
 489    */
 490   String UPPER
 491 EOF
 492
 493     $len = @charinfo;
 494     for ($i = 0; $i < $len / 11; $i++) {
 495         print OUTPUT $i ? "\n    + \"" : "    = \"";
 496         for $j (0 .. 10) {
 497             last if $len <= $i * 11 + $j;
 498             my $val = $charinfo[$i * 11 + $j][1];
 499             print OUTPUT javaChar($val);
 500         }
 501         print OUTPUT "\"";
 502     }
 503
 504     print OUTPUT <<EOF;
 505 ;
 506
 507   /**
 508    * This is the attribute table for computing the lowercase representation
 509    * of a character.  The value is the signed difference between the
 510    * character and its lowercase version.  Note that this is stored as an
 511    * unsigned char since this is a String literal.
 512    */
 513   String LOWER
 514 EOF
 515
 516     $len = @charinfo;
 517     for ($i = 0; $i < $len / 13; $i++) {
 518         print OUTPUT $i ? "\n    + \"" : "    = \"";
 519         for $j (0 .. 12) {
 520             last if $len <= $i * 13 + $j;
 521             my $val = $charinfo[$i * 13 + $j][2];
 522             print OUTPUT javaChar($val);
 523         }
 524         print OUTPUT "\"";
 525     }
 526
 527     print OUTPUT <<EOF;
 528 ;
 529
 530   /**
 531    * This is the attribute table for computing the directionality class
 532    * of a character, as well as a marker of characters with a multi-character
 533    * capitalization.  The direction is taken by performing a signed shift
 534    * right by 2 (where a result of -1 means an unknown direction, such as
 535    * for undefined characters). The lower 2 bits form a count of the
 536    * additional characters that will be added to a String when performing
 537    * multi-character uppercase expansion. This count is also used, along with
 538    * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
 539    * when performing the case conversion. Note that this information is stored
 540    * as an unsigned char since this is a String literal.
 541    */
 542   String DIRECTION
 543 EOF
 544
 545     $len = @charinfo;
 546     for ($i = 0; $i < $len / 17; $i++) {
 547         print OUTPUT $i ? "\n    + \"" : "    = \"";
 548         for $j (0 .. 16) {
 549             last if $len <= $i * 17 + $j;
 550             my $val = $charinfo[$i * 17 + $j][3];
 551             print OUTPUT javaChar($val);
 552         }
 553         print OUTPUT "\"";
 554     }
 555
 556     print OUTPUT <<EOF;
 557 ;
 558
 559   /**
 560    * This is the listing of titlecase special cases (all other characters
 561    * can use <code>UPPER</code> to determine their titlecase).  The listing
 562    * is a sorted sequence of character pairs; converting the first character
 563    * of the pair to titlecase produces the second character.
 564    */
 565   String TITLE
 566 EOF
 567
 568     $len = length($titlecase) / 2;
 569     for ($i = 0; $i < $len / 11; $i++) {
 570         print OUTPUT $i ? "\n    + \"" : "    = \"";
 571         for $j (0 .. 10) {
 572             last if $len <= $i * 11 + $j;
 573             my $val = unpack "n", substr($titlecase, 2 * ($i * 11 + $j), 2);
 574             print OUTPUT javaChar($val);
 575         }
 576         print OUTPUT "\"";
 577     }
 578
 579     print OUTPUT <<EOF;
 580 ;
 581
 582   /**
 583    * This is a listing of characters with multi-character uppercase sequences.
 584    * A character appears in this list exactly when it has a non-zero entry
 585    * in the low-order 2-bit field of DIRECTION.  The listing is a sorted
 586    * sequence of pairs (hence a binary search on the even elements is an
 587    * efficient way to lookup a character). The first element of a pair is the
 588    * character with the expansion, and the second is the index into
 589    * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
 590    * DIRECTION to determine where the expansion ends.
 591    */
 592   String UPPER_SPECIAL
 593 EOF
 594
 595     my @list = sort {$a <=> $b} keys %special;
 596     my $expansion = "";
 597     my $offset = 0;
 598     $len = @list;
 599     for ($i = 0; $i < $len / 5; $i++) {
 600         print OUTPUT $i ? "\n    + \"" : "    = \"";
 601         for $j (0 .. 4) {
 602             last if $len <= $i * 5 + $j;
 603             my $ch = $list[$i * 5 + $j];
 604             print OUTPUT javaChar($ch);
 605             print OUTPUT javaChar($offset);
 606             $offset += @{$special{$ch}};
 607             $expansion .= pack "n*", @{$special{$ch}};
 608         }
 609         print OUTPUT "\"";
 610     }
 611
 612     print OUTPUT <<EOF;
 613 ;
 614
 615   /**
 616    * This is the listing of special case multi-character uppercase sequences.
 617    * Characters listed in UPPER_SPECIAL index into this table to find their
 618    * uppercase expansion. Remember that you must also perform special-casing
 619    * on two single-character sequences in the Turkish locale, which are not
 620    * covered here in CharData.
 621    */
 622   String UPPER_EXPAND
 623 EOF
 624
 625     $len = length($expansion) / 2;
 626     for ($i = 0; $i < $len / 11; $i++) {
 627         print OUTPUT $i ? "\n    + \"" : "    = \"";
 628         for $j (0 .. 10) {
 629             last if $len <= $i * 11 + $j;
 630             my $val = unpack "n", substr($expansion, 2 * ($i * 11 + $j), 2);
 631             print OUTPUT javaChar($val);
 632         }
 633         print OUTPUT "\"";
 634     }
 635
 636     print OUTPUT ";\n}\n";
 637     close OUTPUT;
 638 }
 639 print "\nDone.\n";