Merge from mainline.
[official-gcc.git] / libjava / classpath / scripts / unicode-muncher.pl
blobdb2b89a728c01dc9d52b18bae2ecd69344d0a508
1 #!/usr/bin/perl -w
2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
3 # Copyright (C) 1998, 2002, 2004 Free Software Foundation, Inc.
5 # This file is part of GNU Classpath.
7 # GNU Classpath is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
12 # GNU Classpath is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GNU Classpath; see the file COPYING. If not, write to the
19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 # 02110-1301 USA.
22 # Linking this library statically or dynamically with other modules is
23 # making a combined work based on this library. Thus, the terms and
24 # conditions of the GNU General Public License cover the whole
25 # combination.
27 # As a special exception, the copyright holders of this library give you
28 # permission to link this library with independent modules to produce an
29 # executable, regardless of the license terms of these independent
30 # modules, and to copy and distribute the resulting executable under
31 # terms of your choice, provided that you also meet, for each linked
32 # independent module, the terms and conditions of the license of that
33 # module. An independent module is a module which is not derived from
34 # or based on this library. If you modify this library, you may extend
35 # this exception to your version of the library, but you are not
36 # obligated to do so. If you do not wish to do so, delete this
37 # exception statement from your version.
39 # Code for reading UnicodeData.txt and generating the code for
40 # gnu.java.lang.CharData. For now, the relevant Unicode definition files
41 # are found in doc/unicode/.
43 # Inspired by code from Jochen Hoenicke.
44 # author Eric Blake <ebb9@email.byu.edu>
45 # updated to Unicode 4.0.0 by Anthony Balkissoon <abalkiss@redhat.com>
47 # Usage: ./unicode-muncher <UnicodeData> <SpecialCasing> <CharData.java>
48 # where <UnicodeData> and <SpecialCasing> are .txt files obtained from
49 # www.unicode.org (named UnicodeData-4.0.0.txt and SpecialCasing-4.0.0.txt for
50 # Unicode version 4.0.0), and <CharData.java> is the final location for the
51 # Java interface gnu.java.lang.CharData.
52 # As of JDK 1.5, use Unicode version 4.0.0 for best results.
## Convert an integer to the text of a Java source code String literal
## character.
##
## Arguments:
##   $char - integer in the range -0x8000 .. 0x10ffff.  Negative values are
##           taken as signed 16-bit quantities and wrapped to 0x8000 .. 0xffff.
## Returns the escaped text: an octal escape below 0x20, a backslash escape
## for '"' and '\', the plain character for other values below 0x7f, a
## \uXXXX escape up to 0xffff, and two \uXXXX escapes (a UTF-16 surrogate
## pair) above that.
## Dies with "Out of range" when $char lies outside the accepted range.
sub javaChar {
    my ($char) = @_;
    die "Out of range: $char\n" if $char < -0x8000 or $char > 0x10ffff;
    $char += 0x10000 if $char < 0;
    # Special case characters that must be escaped, or are shorter as ASCII
    return sprintf("\\%03o", $char) if $char < 0x20;
    return "\\\"" if $char == 0x22;
    return "\\\\" if $char == 0x5c;
    return pack("C", $char) if $char < 0x7f;
    return sprintf("\\u%04x", $char) if $char <= 0xffff;
    # A Java \u escape consists of exactly four hex digits, so a
    # supplementary code point must be spelled as a surrogate pair.
    my $offset = $char - 0x10000;
    return sprintf("\\u%04x\\u%04x",
                   0xd800 + ($offset >> 10), 0xdc00 + ($offset & 0x3ff));
}
70 ## Convert the text UnicodeData file from www.unicode.org into a Java
71 ## interface with string constants holding the compressed information.
# General category codes.  The position of a code in this list is the
# numeric character type stored for a character.
# NOTE(review): SKIPPED presumably reserves an index that has no Unicode
# category -- confirm against java.lang.Character.
my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
                   SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
# Bidirectional class codes; the position in the list is the direction number.
my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);

# Flag bits that are OR-ed into the character type.
my $NOBREAK_FLAG  = 32;
my $MIRRORED_FLAG = 64;

# Maps a code point to an array of the code points forming its
# multi-character uppercase expansion.
my %special = ();

# infoArray is an array where each element is a list of character information
# for characters in a plane.  The index of each list is equal to the plane
# that it corresponds to even though most of these lists will currently be
# empty.  This is done so that this script can be easily modified to
# accommodate future versions of Unicode.
# (A reference to a parenthesised list of empty lists is itself an empty
# list, so the per-plane arrays are created explicitly.)
my @infoArray = map { [] } 0 .. 0x10;

# info is a reference to one of the lists in infoArray, depending on which
# plane we're currently parsing.
my $info;

# titlecase is a string of ordered pairs of characters to store the titlecase
# conversions of characters that have them
my $titlecase = "";

# count is simply used to print "." to the screen every so often
my $count = 0;

# range is used when the UnicodeData file blocks out ranges of code points
my $range = 0;

# largeNums is an array of numerical values that are too large to fit
# into the 16 bit char where most numerical values are stored.
# What is stored in the char then is a number N such that (-N - 3) is
# the index into largeNums where the numerical value can be found.
my @largeNums = ();

die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
    unless @ARGV == 3;
# Unbuffered STDOUT so the progress dots show up as they are printed.
$| = 1;
print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
116 ################################################################################
117 ################################################################################
## Stage 0: Parse the special casing file
## Fills %special with code point => [expansion code points] for every
## entry whose uppercase mapping consists of more than one character.
print "Parsing special casing file\n";
# Three-argument open: the file name is never interpreted as a mode.
open(my $special_fh, '<', $ARGV[1])
    or die "Can't open special casing file: $!\n";
while (my $line = <$special_fh>) {
    next if $line =~ /^\#/;
    # Fields are: code; lower; title; upper; ...
    my ($ch, undef, undef, $upper) = split / *; */, $line;

    # This grabs only the special casing for multi-char uppercase. Note that
    # there are no multi-char lowercase, and that Sun ignores multi-char
    # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
    # which must be hardcoded in java.lang.String:
    #  \u03a3 (Sun ignores this special case)
    #  \u0049 - lowercases to \u0131, but only in Turkish locale
    #  \u0069 - uppercases to \u0130, but only in Turkish locale
    next unless defined $upper and $upper =~ / /;
    $special{hex $ch} = [ map { hex } split ' ', $upper ];
}
close $special_fh;
137 ################################################################################
138 ################################################################################
## Stage 1: Parse the attribute file
## Every line of UnicodeData is reduced to five 16-bit values (type,
## numeric value, uppercase delta, lowercase delta, direction) that are
## packed and stored in the per-plane list of infoArray.
print "Parsing attributes file";
# Three-argument open: the file name is never interpreted as a mode.
open(my $unicode_fh, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";

# Position of each category / bidi code within its table, for direct lookup.
my %type_index;
@type_index{@TYPECODES} = 0 .. $#TYPECODES;
my %dir_index;
@dir_index{@DIRCODES} = 0 .. $#DIRCODES;

while (my $line = <$unicode_fh>) {
    print "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;
    # The limit of -1 keeps trailing empty fields, so the case mapping
    # fields are always defined (possibly empty) strings.
    my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
        $mirrored, undef, undef, $upcase, $lowcase, $title)
        = split /;/, $line, -1;
    $ch = hex($ch);

    # plane tells us which Unicode code plane we're currently in and is an
    # index into infoArray.
    my $plane = int($ch / 0x10000);
    my $planeBase = $plane * 0x10000;
    $info = \@{$infoArray[$plane]};

    my ($type, $numValue, $upperchar, $lowerchar, $direction);

    # Set the value of the $type variable, checking to make sure that it's valid
    # and setting the mirrored and nobreak bits if necessary.
    die "$ch: Unknown type: $category" unless exists $type_index{$category};
    $type = $type_index{$category};
    $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
    $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

    # Set the value of the $numeric variable checking the special cases of
    # large numbers or 'a' - 'z' values.
    if ($numeric =~ /^[0-9]+$/) {
        $numValue = $numeric;
        # If numeric takes more than 16 bits to store we want to store that
        # number in a separate array and store a number N in numValue such
        # that (-N - 3) is the offset into the separate array containing the
        # large numerical value.
        if ($numValue >= 0x7fff) {
            $numValue = -3 - @largeNums;
            push @largeNums, $numeric;
        }
    } elsif ($numeric eq "") {
        # Special case sequences of 'a'-'z'
        if ($ch >= 0x0041 && $ch <= 0x005a) {
            $numValue = $ch - 0x0037;
        } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
            $numValue = $ch - 0x0057;
        } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
            $numValue = $ch - 0xff17;
        } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
            $numValue = $ch - 0xff37;
        } else {
            $numValue = -1;
        }
    } else {
        # Present but not a plain non-negative integer (e.g. a fraction).
        $numValue = -2;
    }

    # Set the uppercase and lowercase expansions for the character.
    $upperchar = $upcase ? hex($upcase) - $ch : 0;
    $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;

    # If this character has a special titlecase expansion then append it to
    # the titlecase String.
    if ($title ne $upcase) {
        my $titlechar = $title ? hex($title) : $ch;
        $titlecase .= pack("n2", $ch, $titlechar);
    }

    # Set the direction variable, use the lower 2 bits as a count of how many
    # characters will be added to the String if this character undergoes an
    # uppercase expansion.  An unknown bidi class is recorded as -1.
    $direction = exists $dir_index{$bidir} ? $dir_index{$bidir} : -1;
    $direction <<= 2;
    $direction += $#{$special{$ch}} if defined $special{$ch};

    # If the UnicodeData file blocks off ranges of code points give them all
    # the same character information.
    if ($range) {
        die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
        for ($range + 1 .. $ch - 1) {
            $info->[$_ - $planeBase] = pack("n5", $type, $numValue, $upperchar,
                                            $lowerchar, $direction);
        }
        $range = 0;
    } elsif ($name =~ /First>$/) {
        $range = $ch;
    }

    # Store all this parsed information into the element in infoArray that info
    # points to.
    $info->[$ch - $planeBase] = pack("n5", $type, $numValue, $upperchar,
                                     $lowerchar, $direction);
}
close $unicode_fh;
# A "First>" entry with no matching "Last>" leaves its range unfilled.
die sprintf("Range starting at %04X is never closed\n", $range) if $range;
242 ################################################################################
243 ################################################################################
## Stage 2: Compress the data structures
print "\nCompressing data structures";
$count = 0;

# data holds, per plane, the string that will be used to create the DATA
# String containing character information and offsets into the attribute
# tables.
my @data = ();

# charhashArray is an array of hashtables used so that we can reuse character
# attributes when characters share the same attributes ... this makes our
# attribute tables smaller.  charhash is a pointer into this array.
my @charhashArray = map { +{} } 0 .. 0x10;
my $charhash;

# charinfoArray is an array of arrays, one per plane, for storing character
# information.  charinfo is a pointer into this array.
# (A reference to a parenthesised list of empty lists is itself an empty
# list, so the per-plane arrays are created explicitly.)
my @charinfoArray = map { [] } 0 .. 0x10;
my $charinfo;

# charlen is an array, one element per plane, that tells us how many unique
# character attributes there are for that plane.
my @charlen = ();

for my $plane (0 .. 0x10) {
    $info = \@{$infoArray[$plane]};
    my $planeBase = $plane * 0x10000;
    $charhash = \%{$charhashArray[$plane]};
    $charinfo = \@{$charinfoArray[$plane]};

    for my $ch ($planeBase .. $planeBase + 0xffff) {
        my $index = $ch - $planeBase;
        print "." unless $count++ % 0x1000;
        # Code points absent from the attribute file get the "unassigned"
        # record: type 0, no numeric value, no case mapping, direction -1.
        $info->[$index] = pack("n5", 0, -1, 0, 0, -4)
            unless defined $info->[$index];

        my $record = $info->[$index];
        my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $record);
        if (! exists $charhash->{$record}) {
            # This character has attributes that are unique from those that
            # we've looked at so far for this plane.  So we push its
            # attributes into charinfo and store in charhash the offset into
            # charinfo where these attributes can later be found.
            push @{$charinfo}, [ $numVal, $upper, $lower, $direction ];
            $charhash->{$record} = @{$charinfo} - 1;
            # The offset occupies the upper 9 bits of a DATA entry; a larger
            # value would be truncated by pack and corrupt the table.
            die "Plane $plane has more than 512 unique attribute sets; "
                . "the 9-bit offset in DATA would overflow\n"
                if @{$charinfo} > 0x200;
        }
        # Upper 9 bits: offset into the attribute tables.  Lower 7: type/flags.
        $data[$plane] .= pack("n", ($charhash->{$record} << 7) | $type);
    }
    $charlen[$plane] = scalar(@{$charinfoArray[$plane]});
}
# bestshift: per plane, the shift that results in the best compression of
# the table.  This is an array because different shifts are better for the
# different tables for each plane.
my @bestshift;

# bestest: smallest size estimate seen so far; starts from an initial guess.
my $bestest = 1000000;
# bestblkstr: per plane, the packed block data produced by the best shift.
my @bestblkstr;
# blksize: per plane, the block size (1 << bestshift).
my @blksize = ();

for my $plane (0 .. 0x10) {
    print "\n\nplane: $plane\n";
    print "Unique character entries: $charlen[$plane]\n";
    $bestest = 1000000;
    for my $shift (3 .. 8) {
        my $size = 1 << $shift;
        my %seen;
        my @chunks;
        print "shift: $shift";

        # Cut the plane into blocks of $size characters (two bytes each)
        # and keep one copy of every distinct block, in first-seen order.
        for (my $start = 0; $start < 0x10000; $start += $size) {
            my $chunk = substr $data[$plane], 2 * $start, 2 * $size;
            next if exists $seen{$chunk};
            push @chunks, $chunk;
            $seen{$chunk} = $#chunks;
        }

        # Total length, in characters, of all blocks still to be stored.
        my $total = @chunks * $size;
        printf " before %5d", $total;

        # Now we try to pack the blocks as tight as possible by finding
        # matching heads and tails, longest overlap first.
        for (my $overlap = $size - 1; $overlap > 0; $overlap--) {
            # Map each tail of $overlap characters to the blocks ending in it.
            my %tails;
            for my $idx (0 .. $#chunks) {
                next unless defined $chunks[$idx];
                my $tail = substr $chunks[$idx],
                                  length($chunks[$idx]) - $overlap * 2;
                push @{ $tails{$tail} }, $idx;
            }

            # tails are calculated, now calculate the heads and merge.
          CHUNK:
            for my $idx (0 .. $#chunks) {
                next CHUNK unless defined $chunks[$idx];
                my $cur = $idx;
                while (1) {
                    my $head = substr($chunks[$cur], 0, $overlap * 2);
                    my $candidates = $tails{$head};
                    next CHUNK unless defined $candidates;

                    my $prev = shift @{$candidates};
                    if ($prev == $cur) {
                        # A block cannot be merged with itself: put it back
                        # and take the next candidate, if there is one.
                        push @{$candidates}, $prev;
                        next CHUNK unless @{$candidates} > 1;
                        $prev = shift @{$candidates};
                    }
                    delete $tails{$head} if @{$candidates} == 0;

                    # a match was found
                    my $merged = $chunks[$prev]
                        . substr($chunks[$cur], $overlap * 2);
                    $total -= $overlap;

                    if ($prev < $cur) {
                        # Keep the merged block in the earlier slot and make
                        # the tail index point there instead of at $cur.
                        $chunks[$cur]  = undef;
                        $chunks[$prev] = $merged;
                        my $tail = substr $merged,
                                          length($merged) - $overlap * 2;
                        $tails{$tail} = [ map { $_ == $cur ? $prev : $_ }
                                          @{ $tails{$tail} } ];
                        next CHUNK;
                    }
                    $chunks[$cur]  = $merged;
                    $chunks[$prev] = undef;
                }
            }
        }

        my $packed = join '', grep { defined } @chunks;

        die "Unexpected $total" if length($packed) != 2 * $total;
        # Bytes for the block data plus bytes for the block offset table.
        my $estimate = 2 * $total + (0x20000 >> $shift);

        printf " after merge %5d: %6d bytes\n", $total, $estimate;
        if ($estimate < $bestest) {
            $bestest            = $estimate;
            $bestshift[$plane]  = $shift;
            $bestblkstr[$plane] = $packed;
        }
    }
    $blksize[$plane] = 1 << $bestshift[$plane];
    print "best shift: ", $bestshift[$plane];
    print " blksize: ", $blksize[$plane];
}
# blocksArray holds, per plane, one entry for every block of the plane: the
# 16-bit value that, added to a character of that block, gives the index of
# the character inside the packed block data of the plane.
# (A reference to a parenthesised list of empty lists is itself an empty
# list, so the per-plane arrays are created explicitly.)
my @blocksArray = map { [] } 0 .. 0x10;

for my $plane (0 .. 0x10) {
    for (my $j = 0; $j < 0x10000; $j += $blksize[$plane]) {
        my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize[$plane];
        my $index = index $bestblkstr[$plane], $blkkey;
        # Entries are two bytes wide, so only a match at an even byte offset
        # is valid.  -1 ("not found") is odd as well and ends up in the die.
        while ($index & 1) {
            die "not found: $j" if $index == -1;
            $index = index $bestblkstr[$plane], $blkkey, $index + 1;
        }
        push @{$blocksArray[$plane]}, ($index / 2 - $j) & 0xffff;
    }
}
422 ################################################################################
423 ################################################################################
## Stage 3: Generate the file
# Refuse to generate tables whose String constants might be too long.
for my $plane (0 .. 0x10) {
    my $limit = 0xffff / 3;
    my $block_count = scalar(@{$blocksArray[$plane]});
    die "UTF-8 limit of blocks may be exceeded for plane $plane: " . $block_count . "\n"
        if $block_count > $limit;
    my $data_length = length($bestblkstr[$plane]);
    die "UTF-8 limit of data may be exceeded for plane $plane: " . $data_length . "\n"
        if $data_length > $limit;
}
print "\nGenerating $ARGV[2].";
# Scratch loop counters shared by the generation code below.
my ($i, $j);

# Three-argument open: the file name is never interpreted as a mode.
open(OUTPUT, '>', $ARGV[2]) or die "Failed creating output file: $!\n";
437 print OUTPUT <<EOF;
438 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
439 Copyright (C) 2002 Free Software Foundation, Inc.
440 *** This file is generated by scripts/unicode-muncher.pl ***
442 This file is part of GNU Classpath.
444 GNU Classpath is free software; you can redistribute it and/or modify
445 it under the terms of the GNU General Public License as published by
446 the Free Software Foundation; either version 2, or (at your option)
447 any later version.
449 GNU Classpath is distributed in the hope that it will be useful, but
450 WITHOUT ANY WARRANTY; without even the implied warranty of
451 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
452 General Public License for more details.
454 You should have received a copy of the GNU General Public License
455 along with GNU Classpath; see the file COPYING. If not, write to the
456 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
457 02110-1301 USA.
459 Linking this library statically or dynamically with other modules is
460 making a combined work based on this library. Thus, the terms and
461 conditions of the GNU General Public License cover the whole
462 combination.
464 As a special exception, the copyright holders of this library give you
465 permission to link this library with independent modules to produce an
466 executable, regardless of the license terms of these independent
467 modules, and to copy and distribute the resulting executable under
468 terms of your choice, provided that you also meet, for each linked
469 independent module, the terms and conditions of the license of that
470 module. An independent module is a module which is not derived from
471 or based on this library. If you modify this library, you may extend
472 this exception to your version of the library, but you are not
473 obligated to do so. If you do not wish to do so, delete this
474 exception statement from your version. */
476 package gnu.java.lang;
479 * This contains the info about the unicode characters, that
480 * java.lang.Character needs. It is generated automatically from
481 * <code>$ARGV[0]</code> and
482 * <code>$ARGV[1]</code>, by some
483 * perl scripts. These Unicode definition files can be found on the
484 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
485 * JDK 1.5 uses Unicode version 4.0.0.
487 * The data is stored as string constants, but Character will convert these
488 * Strings to their respective <code>char[]</code> components. The fields
489 * are stored in arrays of 17 elements each, one element per Unicode plane.
490 * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
491 * characters within <code>DATA</code>. The DATA field, in turn, stores
492 * information about each character in the low order bits, and an offset
493 * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
494 * <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
495 * attribute tables are much smaller than 0xffff entries; as many characters
496 * in Unicode share common attributes. Numbers that are too large to fit
497 * into NUM_VALUE as 16 bit chars are stored in LARGENUMS and a number N is
498 * stored in NUM_VALUE such that (-N - 3) is the offset into LARGENUMS for
499 * the particular character. The DIRECTION table also contains a field for
500 * detecting characters with multi-character uppercase expansions.
501 * Next, there is a listing for <code>TITLE</code> exceptions (most characters
502 * just have the same title case as upper case). Finally, there are two
503 * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
504 * which lists the characters which are special cased, and
505 * <code>UPPER_EXPAND</code>, which lists their expansion.
507 * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
508 * Eric Blake)
509 * \@see Character
510 * \@see String
512 public interface CharData
515 * The Unicode definition file that was parsed to build this database.
517 String SOURCE = \"$ARGV[0]\";
520 * The character shift amount to look up the block offset. In other words,
521 * <code>(char) (BLOCKS.value[ch >> SHIFT[p]] + ch)</code> is the index
522 * where <code>ch</code> is described in <code>DATA</code> if <code>ch</code>
523 * is in Unicode plane <code>p</code>. Note that <code>p</code> is simply
524 * the integer division of ch and 0x10000.
526 int[] SHIFT
# Emit the initializer of SHIFT: the best shift of every plane, or null when
# there is none.  Building it with join also yields a well-formed array
# when there is exactly one element.
my $shift_initializer = @bestshift
    ? " = new int[] {" . join(", ", @bestshift) . "}"
    : " = null";
print OUTPUT $shift_initializer;
540 print OUTPUT <<EOF;
544 * The mapping of character blocks to their location in <code>DATA</code>.
545 * Each entry has been adjusted so that the 16-bit sum with the desired
546 * character gives the actual index into <code>DATA</code>.
548 String[] BLOCKS = new String[]{
# Emit one BLOCKS string per plane, eleven characters per source line.
for my $plane (0 .. 0x10) {
    # Unassigned planes are handled specially so we don't waste space with
    # unused Strings.  As of Unicode version 4.0.0 only planes 0, 1, 2, and
    # 14 are used.  If you are updating this script to work with a later
    # version of Unicode you may have to alter this test.
    if ($plane > 2 && $plane != 14) {
        my $placeholder = ($plane == 0x10) ? " \"\"}" : " \"\",\n\n";
        print OUTPUT $placeholder;
        next;
    }

    my @values = @{$blocksArray[$plane]};
    for (my $start = 0; $start < @values; $start += 11) {
        my $stop = $start + 10;
        $stop = $#values if $stop > $#values;
        my $lead = $start ? "\n + " : " ";
        my $chars = join '', map { javaChar($_) } @values[$start .. $stop];
        print OUTPUT $lead, "\"", $chars, "\"";
    }
    print OUTPUT ",\n\n";
}
573 print OUTPUT <<EOF;
577 * The array containing the numeric values that are too large to be stored as
578 * chars in NUM_VALUE. NUM_VALUE in this case will contain a negative integer
579 * N such that LARGENUMS[-N - 3] contains the correct numeric value.
581 int[] LARGENUMS
# Emit the initializer of LARGENUMS: every numeric value that did not fit
# into NUM_VALUE, or null when there is none.  Building it with join also
# yields a well-formed array when there is exactly one element.
my $largenums_initializer = @largeNums
    ? " = new int[] {" . join(", ", @largeNums) . "}"
    : " = null";
print OUTPUT $largenums_initializer;
595 print OUTPUT <<EOF;
599 * Information about each character. The low order 5 bits form the
600 * character type, the next bit is a flag for non-breaking spaces, and the
601 * next bit is a flag for mirrored directionality. The high order 9 bits
602 * form the offset into the attribute tables. Note that this limits the
603 * number of unique character attributes to 512, which is not a problem
604 * as of Unicode version 4.0.0, but may soon become one.
606 String[] DATA = new String[]{
# Emit one DATA string per plane, eleven characters per source line.
for my $plane (0 .. 0x10) {
    # Unassigned planes are handled specially so we don't waste space with
    # unused Strings.  As of Unicode version 4.0.0 only planes 0, 1, 2, and
    # 14 are used.  If you are updating this script to work with a later
    # version of Unicode you may have to alter this test.
    if ($plane > 2 && $plane != 14) {
        my $placeholder = ($plane == 0x10) ? " \"\"}" : " \"\",\n\n";
        print OUTPUT $placeholder;
        next;
    }

    # The packed block data is a sequence of big-endian 16-bit values.
    my @values = unpack("n*", $bestblkstr[$plane]);
    for (my $start = 0; $start < @values; $start += 11) {
        my $stop = $start + 10;
        $stop = $#values if $stop > $#values;
        my $lead = $start ? "\n + " : " ";
        my $chars = join '', map { javaChar($_) } @values[$start .. $stop];
        print OUTPUT $lead, "\"", $chars, "\"";
    }
    print OUTPUT ",\n\n";
}
632 print OUTPUT <<EOF;
636 * This is the attribute table for computing the numeric value of a
637 * character. The value is -1 if Unicode does not define a value, -2
638 * if the value is not a positive integer, otherwise it is the value.
639 * Note that this is a signed value, but stored as an unsigned char
640 * since this is a String literal.
642 String[] NUM_VALUE = new String[]{
# Emit one NUM_VALUE string per plane, eleven characters per source line.
for my $plane (0 .. 0x10) {
    # Unassigned planes are handled specially so we don't waste space with
    # unused Strings.  As of Unicode version 4.0.0 only planes 0, 1, 2, and
    # 14 are used.  If you are updating this script to work with a later
    # version of Unicode you may have to alter this test.
    if ($plane > 2 && $plane != 14) {
        my $placeholder = ($plane == 0x10) ? " \"\"}" : " \"\",\n\n";
        print OUTPUT $placeholder;
        next;
    }

    # Element 0 of every attribute record is the numeric value.
    my @values = map { $_->[0] } @{$charinfoArray[$plane]};
    for (my $start = 0; $start < @values; $start += 11) {
        my $stop = $start + 10;
        $stop = $#values if $stop > $#values;
        my $lead = $start ? "\n + " : " ";
        my $chars = join '', map { javaChar($_) } @values[$start .. $stop];
        print OUTPUT $lead, "\"", $chars, "\"";
    }
    print OUTPUT ",\n\n";
}
669 print OUTPUT <<EOF;
673 * This is the attribute table for computing the single-character uppercase
674 * representation of a character. The value is the signed difference
675 * between the character and its uppercase version. Note that this is
676 * stored as an unsigned char since this is a String literal. When
677 * capitalizing a String, you must first check if a multi-character uppercase
678 * sequence exists before using this character.
680 String[] UPPER = new String[]{
# Emit one UPPER string per plane, eleven characters per source line.
for my $plane (0 .. 0x10) {
    # Unassigned planes are handled specially so we don't waste space with
    # unused Strings.  As of Unicode version 4.0.0 only planes 0, 1, 2, and
    # 14 are used.  If you are updating this script to work with a later
    # version of Unicode you may have to alter this test.
    if ($plane > 2 && $plane != 14) {
        my $placeholder = ($plane == 0x10) ? " \"\"}" : " \"\",\n\n";
        print OUTPUT $placeholder;
        next;
    }

    # Element 1 of every attribute record is the uppercase delta.
    my @values = map { $_->[1] } @{$charinfoArray[$plane]};
    for (my $start = 0; $start < @values; $start += 11) {
        my $stop = $start + 10;
        $stop = $#values if $stop > $#values;
        my $lead = $start ? "\n + " : " ";
        my $chars = join '', map { javaChar($_) } @values[$start .. $stop];
        print OUTPUT $lead, "\"", $chars, "\"";
    }
    print OUTPUT ",\n\n";
}
707 print OUTPUT <<EOF;
711 * This is the attribute table for computing the lowercase representation
712 * of a character. The value is the signed difference between the
713 * character and its lowercase version. Note that this is stored as an
714 * unsigned char since this is a String literal.
716 String[] LOWER = new String[]{
# Emit one LOWER string per plane, eleven characters per source line.
for my $plane (0 .. 0x10) {
    # Unassigned planes are handled specially so we don't waste space with
    # unused Strings.  As of Unicode version 4.0.0 only planes 0, 1, 2, and
    # 14 are used.  If you are updating this script to work with a later
    # version of Unicode you may have to alter this test.
    if ($plane > 2 && $plane != 14) {
        my $placeholder = ($plane == 0x10) ? " \"\"}" : " \"\",\n\n";
        print OUTPUT $placeholder;
        next;
    }

    # Element 2 of every attribute record is the lowercase delta.
    my @values = map { $_->[2] } @{$charinfoArray[$plane]};
    for (my $start = 0; $start < @values; $start += 11) {
        my $stop = $start + 10;
        $stop = $#values if $stop > $#values;
        my $lead = $start ? "\n + " : " ";
        my $chars = join '', map { javaChar($_) } @values[$start .. $stop];
        print OUTPUT $lead, "\"", $chars, "\"";
    }
    print OUTPUT ",\n\n";
}
743 print OUTPUT <<EOF;
747 * This is the attribute table for computing the directionality class
748 * of a character, as well as a marker of characters with a multi-character
749 * capitalization. The direction is taken by performing a signed shift
750 * right by 2 (where a result of -1 means an unknown direction, such as
751 * for undefined characters). The lower 2 bits form a count of the
752 * additional characters that will be added to a String when performing
753 * multi-character uppercase expansion. This count is also used, along with
754 * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
755 * when performing the case conversion. Note that this information is stored
756 * as an unsigned char since this is a String literal.
758 String[] DIRECTION = new String[]{
# Emit one DIRECTION string per plane, eleven characters per source line.
for my $plane (0 .. 0x10) {
    # Unassigned planes are handled specially so we don't waste space with
    # unused Strings.  As of Unicode version 4.0.0 only planes 0, 1, 2, and
    # 14 are used.  If you are updating this script to work with a later
    # version of Unicode you may have to alter this test.
    if ($plane > 2 && $plane != 14) {
        my $placeholder = ($plane == 0x10) ? " \"\"}" : " \"\",\n\n";
        print OUTPUT $placeholder;
        next;
    }

    # Element 3 of every attribute record is the direction / expansion count.
    my @values = map { $_->[3] } @{$charinfoArray[$plane]};
    for (my $start = 0; $start < @values; $start += 11) {
        my $stop = $start + 10;
        $stop = $#values if $stop > $#values;
        my $lead = $start ? "\n + " : " ";
        my $chars = join '', map { javaChar($_) } @values[$start .. $stop];
        print OUTPUT $lead, "\"", $chars, "\"";
    }
    print OUTPUT ",\n\n";
}
785 print OUTPUT <<EOF;
789 * This is the listing of titlecase special cases (all other characters
790 * can use <code>UPPER</code> to determine their titlecase). The listing
791 * is a sorted sequence of character pairs; converting the first character
792 * of the pair to titlecase produces the second character.
794 String TITLE
# Emit the TITLE string: the packed (character, titlecase) pairs, eleven
# characters per source line.
my @title_values = unpack("n*", $titlecase);
for (my $start = 0; $start < @title_values; $start += 11) {
    my $stop = $start + 10;
    $stop = $#title_values if $stop > $#title_values;
    my $lead = $start ? "\n + \"" : " = \"";
    my $chars = join '', map { javaChar($_) } @title_values[$start .. $stop];
    print OUTPUT $lead, $chars, "\"";
}
808 print OUTPUT <<EOF;
812 * This is a listing of characters with multi-character uppercase sequences.
813 * A character appears in this list exactly when it has a non-zero entry
814 * in the low-order 2-bit field of DIRECTION. The listing is a sorted
815 * sequence of pairs (hence a binary search on the even elements is an
816 * efficient way to lookup a character). The first element of a pair is the
817 * character with the expansion, and the second is the index into
818 * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
819 * DIRECTION to determine where the expansion ends.
821 String UPPER_SPECIAL
# Emit UPPER_SPECIAL: for every character with a multi-character uppercase
# expansion, in ascending order, the character followed by the offset of
# its expansion.  Five pairs go on each source line.  The expansions
# themselves are collected in $expansion for the UPPER_EXPAND table.
my @list = sort { $a <=> $b } keys %special;
my $expansion = "";
my $offset = 0;
for (my $start = 0; $start < @list; $start += 5) {
    my $stop = $start + 4;
    $stop = $#list if $stop > $#list;
    my $row = $start ? "\n + \"" : " = \"";
    for my $ch (@list[$start .. $stop]) {
        $row .= javaChar($ch) . javaChar($offset);
        my @expanded = @{$special{$ch}};
        $offset += @expanded;
        $expansion .= pack "n*", @expanded;
    }
    print OUTPUT $row, "\"";
}
841 print OUTPUT <<EOF;
845 * This is the listing of special case multi-character uppercase sequences.
846 * Characters listed in UPPER_SPECIAL index into this table to find their
847 * uppercase expansion. Remember that you must also perform special-casing
848 * on two single-character sequences in the Turkish locale, which are not
849 * covered here in CharData.
851 String UPPER_EXPAND
# Emit the UPPER_EXPAND string: the concatenated uppercase expansions,
# eleven characters per source line.
my @expand_values = unpack("n*", $expansion);
for (my $start = 0; $start < @expand_values; $start += 11) {
    my $stop = $start + 10;
    $stop = $#expand_values if $stop > $#expand_values;
    my $lead = $start ? "\n + \"" : " = \"";
    my $chars = join '', map { javaChar($_) } @expand_values[$start .. $stop];
    print OUTPUT $lead, $chars, "\"";
}
# Terminate the last constant and the interface body.
print OUTPUT ";\n}\n";
# Buffered write errors only surface when the handle is closed.
close(OUTPUT) or die "Failed writing output file: $!\n";

print "\nDone.\n";