2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
3 # Copyright (C) 1998, 2002, 2004 Free Software Foundation, Inc.
5 # This file is part of GNU Classpath.
7 # GNU Classpath is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
12 # GNU Classpath is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GNU Classpath; see the file COPYING. If not, write to the
19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 # Linking this library statically or dynamically with other modules is
23 # making a combined work based on this library. Thus, the terms and
24 # conditions of the GNU General Public License cover the whole
27 # As a special exception, the copyright holders of this library give you
28 # permission to link this library with independent modules to produce an
29 # executable, regardless of the license terms of these independent
30 # modules, and to copy and distribute the resulting executable under
31 # terms of your choice, provided that you also meet, for each linked
32 # independent module, the terms and conditions of the license of that
33 # module. An independent module is a module which is not derived from
34 # or based on this library. If you modify this library, you may extend
35 # this exception to your version of the library, but you are not
36 # obligated to do so. If you do not wish to do so, delete this
37 # exception statement from your version.
39 # Code for reading UnicodeData.txt and generating the code for
40 # gnu.java.lang.CharData. For now, the relevant Unicode definition files
41 # are found in doc/unicode/.
43 # Inspired by code from Jochen Hoenicke.
44 # author Eric Blake <ebb9@email.byu.edu>
45 # updated to Unicode 4.0.0 by Anthony Balkissoon <abalkiss@redhat.com>
47 # Usage: ./unicode-muncher <UnicodeData> <SpecialCasing> <CharData.java>
48 # where <UnicodeData> and <SpecialCasing> are .txt files obtained from
49 # www.unicode.org (named UnicodeData-4.0.0.txt and SpecialCasing-4.0.0.txt for
50 # Unicode version 4.0.0), and <CharData.java> is the final location for the
51 # Java interface gnu.java.lang.CharData.
52 # As of JDK 1.5, use Unicode version 4.0.0 for best results.
55 ## Convert a 16-bit integer to a Java source code String literal character
# Accepts values from -0x8000 through 0x10ffff and dies on anything else.
# Negative values are wrapped into 0x8000..0xffff by adding 0x10000.
# Returns the text to place inside a Java string literal: an octal escape,
# an escaped quote or backslash, the raw ASCII character, or a \u hex escape.
59 die "Out of range: $char\n" if $char < -0x8000 or $char > 0x10ffff;
60 $char += 0x10000 if $char < 0;
61 # Special case characters that must be escaped, or are shorter as ASCII
# Values below 0x20 are written as three-digit octal escapes.
62 return sprintf("\\%03o", $char) if $char < 0x20;
# A bare double quote or backslash would terminate or corrupt the literal.
63 return "\\\"" if $char == 0x22;
64 return "\\\\" if $char == 0x5c;
# Any other value below 0x7f is emitted as the single raw character.
65 return pack("C", $char) if $char < 0x7f;
# Everything else (0x7f and above) becomes a \u escape with at least four
# hex digits.
# NOTE(review): values above 0xffff pass the range check above but would
# print five or more hex digits here, which presumably is not a valid single
# Java \u escape -- confirm callers never pass such values.
66 return sprintf("\\u%04x", $char);
70 ## Convert the text UnicodeData file from www.unicode.org into a Java
71 ## interface with string constants holding the compressed information.
73 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
74 SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
75 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
77 my $NOBREAK_FLAG = 32;
78 my $MIRRORED_FLAG = 64;
82 # infoArray is an array where each element is a list of character information
83 # for characters in a plane. The index of each list is equal to the plane
84 # that it corresponds to even though most of these lists will currently be
85 # empty. This is done so that this script can be easily modified to
86 # accommodate future versions of Unicode.
87 my @infoArray = \
((), (), (), (), (), (), (), (),
88 (), (), (), (), (), (), (), (), ());
90 # info is a reference to one of the lists in infoArray, depending on which
91 # plane we're currently parsing.
94 # titlecase is a string of ordered pairs of characters to store the titlecase
95 # conversions of characters that have them
98 # count is simply used to print "." to the screen every so often
101 # range is used when the UnicodeData file blocks out ranges of code points
104 # largeNums is an array of numerical values that are too large to fit
105 # into the 16 bit char where most numerical values are stored.
106 # What is stored in the char then is a number N such that (-N - 3) is
107 # the index into largeNums where the numerical value can be found.
110 die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
113 print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
114 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
116 ################################################################################
117 ################################################################################
118 ## Stage 0: Parse the special casing file
# Reads the file named by the second command-line argument and records in
# %special every character whose uppercase mapping is more than one
# character long.
119 print "Parsing special casing file\n";
# NOTE(review): two-argument open on a bareword handle; the file name comes
# straight from @ARGV.
120 open (SPECIAL
, "< $ARGV[1]") || die "Can't open special casing file: $!\n";
# Fields are separated by ';' with optional surrounding spaces. Only the
# first field (the code point) and the fourth (the uppercase mapping) are
# kept.
123 my ($ch, undef, undef, $upper) = split / *; */;
125 # This grabs only the special casing for multi-char uppercase. Note that
126 # there are no multi-char lowercase, and that Sun ignores multi-char
127 # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
128 # which must be hardcoded in java.lang.String:
129 # \u03a3 (Sun ignores this special case)
130 # \u0049 - lowercases to \u0131, but only in Turkish locale
131 # \u0069 - uppercases to \u0130, but only in Turkish locale
# A space inside the uppercase field means it lists several code points;
# records without one are skipped.
132 next unless defined $upper and $upper =~ / /;
# Key: the numeric code point. Value: a reference to the list of numeric
# code points making up the expansion.
133 $special{hex $ch} = [map {hex} split ' ', $upper];
137 ################################################################################
138 ################################################################################
139 ## Stage 1: Parse the attribute file
140 print "Parsing attributes file";
141 open (UNICODE
, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
143 print "." unless $count++ % 1000;
146 my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
147 $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
150 # plane tells us which Unicode code plane we're currently in and is an
151 # index into infoArray.
152 my $plane = int($ch / 0x10000);
153 my $planeBase = $plane * 0x10000;
154 $info = \@
{$infoArray[$plane]};
156 my ($type, $numValue, $upperchar, $lowerchar, $direction);
158 # Set the value of the $type variable, checking to make sure that it's valid
159 # and setting the mirrored and nobreak bits if necessary.
161 while ($category !~ /^$TYPECODES[$type]$/) {
162 if (++$type == @TYPECODES) {
163 die "$ch: Unknown type: $category";
166 $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
167 $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
169 # Set the value of the $numValue variable, checking the special cases of
170 # large numbers or 'a' - 'z' values.
171 if ($numeric =~ /^[0-9]+$/) {
172 $numValue = $numeric;
173 # If numeric is 0x7fff or larger (the limit tested below) we want to store
174 # that number in a separate array and store a number N in numValue such
175 # that (-N - 3) is the offset into the separate array containing the
176 # large numerical value.
177 if ($numValue >= 0x7fff) {
178 $numValue = -3 - @largeNums;
179 push @largeNums, $numeric;
181 } elsif ($numeric eq "") {
182 # Special case: letters 'A'-'Z' and 'a'-'z' with no numeric field map to 10..35
183 if ($ch >= 0x0041 && $ch <= 0x005a) {
184 $numValue = $ch - 0x0037;
185 } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
186 $numValue = $ch - 0x0057;
187 } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
188 $numValue = $ch - 0xff17;
189 } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
190 $numValue = $ch - 0xff37;
198 # Set the uppercase and lowercase expansions for the character.
199 $upperchar = $upcase ?
hex($upcase) - $ch : 0;
200 $lowerchar = $lowcase ?
hex($lowcase) - $ch : 0;
202 # If this character has a special titlecase expansion then append it to
203 # the titlecase String.
204 if ($title ne $upcase) {
205 my $titlechar = $title ?
hex($title) : $ch;
206 $titlecase .= pack("n2", $ch, $titlechar);
209 # Set the direction variable, use the lower 2 bits as a count of how many
210 # characters will be added to the String if this character undergoes an
211 # uppercase expansion.
213 while ($bidir !~ /^$DIRCODES[$direction]$/) {
214 if (++$direction == @DIRCODES) {
220 $direction += $#{$special{$ch}} if defined $special{$ch};
222 # If the UnicodeData file blocks off ranges of code points give them all
223 # the same character information.
225 die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
226 for ($range + 1 .. $ch - 1) {
227 $info->[$_ - $planeBase] = pack("n5", $type, $numValue, $upperchar,
228 $lowerchar, $direction);
231 } elsif ($name =~ /First>$/) {
235 # Store all this parsed information into the element in infoArray that info
237 $info->[$ch - $planeBase] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
242 ################################################################################
243 ################################################################################
244 ## Stage 2: Compress the data structures
245 printf "\nCompressing data structures";
248 # data is a String that will be used to create the DATA String containing
249 # character information and offsets into the attribute tables.
252 # charhashArray is an array of hashtables used so that we can reuse character
253 # attributes when characters share the same attributes ... this makes our
254 # attribute tables smaller. charhash is a pointer into this array.
255 my @charhashArray = ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
258 # charinfoArray is an array of arrays, one per plane, for storing character
259 # information. charinfo is a pointer into this array.
260 my @charinfoArray = \
((), (), (), (), (), (), (), (),
261 (), (), (), (), (), (), (), (), ());
264 # charlen is an array, one element per plane, that tells us how many unique
265 # character attributes there are for that plane.
268 for my $plane (0 .. 0x10) {
269 $info = \@
{$infoArray[$plane]};
270 my $planeBase = $plane * 0x10000;
271 $charhash = \
%{$charhashArray[$plane]};
272 $charinfo = \@
{$charinfoArray[$plane]};
274 for my $ch ($planeBase .. $planeBase + 0xffff) {
275 my $index = $ch - $planeBase;
276 print "." unless $count++ % 0x1000;
277 $info->[$index] = pack("n5", 0, -1, 0, 0, -4) unless defined $info->[$index];
279 my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info->[$index]);
280 if (! exists $charhash->{$info->[$index]}) {
281 # If we entered this branch, the character we're looking at now has
282 # attributes that differ from all those we've looked at so far
283 # for this plane. So we push its attributes into charinfo
284 # and store in charhash the offset into charinfo where these
285 # attributes can later be found.
286 push @
{$charinfo}, [ $numVal, $upper, $lower, $direction ];
287 $charhash->{$info->[$index]} = @
{$charinfo} - 1;
288 # When the file is generated, the number we just stored in charhash
289 # will be the upper 9 bits in the DATA String that are an offset
290 # into the attribute tables.
292 $data[$plane] .= pack("n", ($charhash->{$info->[$index]} << 7) | $type);
294 $charlen[$plane] = scalar(@
{$charinfoArray[$plane]});
297 # the shift that results in the best compression of the table. This is an array
298 # because different shifts are better for the different tables for each plane.
302 my $bestest = 1000000;
306 for my $plane (0 .. 0x10) {
307 print "\n\nplane: $plane\n";
308 print "Unique character entries: $charlen[$plane]\n";
311 my $blksize = 1 << $i;
317 for ($j = 0; $j < 0x10000; $j += $blksize) {
318 my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize;
319 if (! exists $blocks{$blkkey}) {
320 push @blkarray, $blkkey;
321 $blocks{$blkkey} = $#blkarray;
325 my $blknum = @blkarray;
326 my $blocklen = $blknum * $blksize;
327 printf " before %5d", $blocklen;
329 # Now we try to pack the blkarray as tight as possible by finding matching
331 for ($j = $blksize - 1; $j > 0; $j--) {
333 for $k (0 .. $#blkarray) {
334 next unless defined $blkarray[$k];
335 my $len = length $blkarray[$k];
336 my $tail = substr $blkarray[$k], $len - $j * 2;
337 if (exists $tails{$tail}) {
338 push @
{$tails{$tail}}, $k;
340 $tails{$tail} = [ $k ];
344 # tails are calculated, now calculate the heads and merge.
346 for $k (0 .. $#blkarray) {
347 next unless defined $blkarray[$k];
350 my $head = substr($blkarray[$tomerge], 0, $j * 2);
351 my $entry = $tails{$head};
352 next BLOCK
unless defined $entry;
354 my $other = shift @
{$entry};
355 if ($other == $tomerge) {
357 push @
{$entry}, $other;
358 $other = shift @
{$entry};
360 push @
{$entry}, $other;
364 if (@
{$entry} == 0) {
365 delete $tails{$head};
369 my $merge = $blkarray[$other]
370 . substr($blkarray[$tomerge], $j * 2);
374 if ($other < $tomerge) {
375 $blkarray[$tomerge] = undef;
376 $blkarray[$other] = $merge;
377 my $len = length $merge;
378 my $tail = substr $merge, $len - $j * 2;
379 $tails{$tail} = [ map { $_ == $tomerge ?
$other : $_ }
383 $blkarray[$tomerge] = $merge;
384 $blkarray[$other] = undef;
389 for $k (0 .. $#blkarray) {
390 $blockstr .= $blkarray[$k] if defined $blkarray[$k];
393 die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
394 my $estimate = 2 * $blocklen + (0x20000 >> $i);
396 printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
397 if ($estimate < $bestest) {
398 $bestest = $estimate;
399 $bestshift[$plane] = $i;
400 $bestblkstr[$plane] = $blockstr;
403 $blksize[$plane] = 1 << $bestshift[$plane];
404 print "best shift: ", $bestshift[$plane];
405 print " blksize: ", $blksize[$plane];
407 my @blocksArray = \
((), (), (), (), (), (), (), (),
408 (), (), (), (), (), (), (), (), ());
410 for my $plane (0 .. 0x10) {
411 for (my $j = 0; $j < 0x10000; $j += $blksize[$plane]) {
412 my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize[$plane];
413 my $index = index $bestblkstr[$plane], $blkkey;
415 die "not found: $j" if $index == -1;
416 $index = index $bestblkstr[$plane], $blkkey, $index + 1;
418 push @
{$blocksArray[$plane]}, ($index / 2 - $j) & 0xffff;
422 ################################################################################
423 ################################################################################
424 ## Stage 3: Generate the file
425 for my $plane (0 .. 0x10) {
426 die "UTF-8 limit of blocks may be exceeded for plane $plane: " . scalar(@
{$blocksArray[$plane]}) . "\n"
427 if @
{$blocksArray[$plane]} > 0xffff / 3;
428 die "UTF-8 limit of data may be exceeded for plane $plane: " . length($bestblkstr[$plane]) . "\n"
429 if length($bestblkstr[$plane]) > 0xffff / 3;
433 print "\nGenerating $ARGV[2].";
436 open OUTPUT
, "> $ARGV[2]" or die "Failed creating output file: $!\n";
438 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
439 Copyright (C) 2002 Free Software Foundation, Inc.
440 *** This file is generated by scripts/unicode-muncher.pl ***
442 This file is part of GNU Classpath.
444 GNU Classpath is free software; you can redistribute it and/or modify
445 it under the terms of the GNU General Public License as published by
446 the Free Software Foundation; either version 2, or (at your option)
449 GNU Classpath is distributed in the hope that it will be useful, but
450 WITHOUT ANY WARRANTY; without even the implied warranty of
451 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
452 General Public License for more details.
454 You should have received a copy of the GNU General Public License
455 along with GNU Classpath; see the file COPYING. If not, write to the
456 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
459 Linking this library statically or dynamically with other modules is
460 making a combined work based on this library. Thus, the terms and
461 conditions of the GNU General Public License cover the whole
464 As a special exception, the copyright holders of this library give you
465 permission to link this library with independent modules to produce an
466 executable, regardless of the license terms of these independent
467 modules, and to copy and distribute the resulting executable under
468 terms of your choice, provided that you also meet, for each linked
469 independent module, the terms and conditions of the license of that
470 module. An independent module is a module which is not derived from
471 or based on this library. If you modify this library, you may extend
472 this exception to your version of the library, but you are not
473 obligated to do so. If you do not wish to do so, delete this
474 exception statement from your version. */
476 package gnu.java.lang;
479 * This contains the info about the unicode characters, that
480 * java.lang.Character needs. It is generated automatically from
481 * <code>$ARGV[0]</code> and
482 * <code>$ARGV[1]</code>, by some
483 * perl scripts. These Unicode definition files can be found on the
484 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
485 * JDK 1.5 uses Unicode version 4.0.0.
487 * The data is stored as string constants, but Character will convert these
488 * Strings to their respective <code>char[]</code> components. The fields
489 * are stored in arrays of 17 elements each, one element per Unicode plane.
490 * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
491 * characters within <code>DATA</code>. The DATA field, in turn, stores
492 * information about each character in the low order bits, and an offset
493 * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
494 * <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
495 * attribute tables are much smaller than 0xffff entries; as many characters
496 * in Unicode share common attributes. Numbers that are too large to fit
497 * into NUM_VALUE as 16 bit chars are stored in LARGENUMS and a number N is
498 * stored in NUM_VALUE such that (-N - 3) is the offset into LARGENUMS for
499 * the particular character. The DIRECTION table also contains a field for
500 * detecting characters with multi-character uppercase expansions.
501 * Next, there is a listing for <code>TITLE</code> exceptions (most characters
502 * just have the same title case as upper case). Finally, there are two
503 * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
504 * which lists the characters which are special cased, and
505 * <code>UPPER_EXPAND</code>, which lists their expansion.
507 * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
512 public interface CharData
515 * The Unicode definition file that was parsed to build this database.
517 String SOURCE = \"$ARGV[0]\";
520 * The character shift amount to look up the block offset. In other words,
521 * <code>(char) (BLOCKS.value[ch >> SHIFT[p]] + ch)</code> is the index
522 * where <code>ch</code> is described in <code>DATA</code> if <code>ch</code>
523 * is in Unicode plane <code>p</code>. Note that <code>p</code> is simply
524 * the integer division of ch and 0x10000.
528 for ($i = 0; $i < @bestshift - 1; $i++) {
530 print OUTPUT
" = new int[] {";
532 print OUTPUT
$bestshift[$i], ", ";
534 if (scalar(@bestshift) > 0){
535 print OUTPUT
$bestshift[-1], "}";
538 print OUTPUT
" = null";
544 * The mapping of character blocks to their location in <code>DATA</code>.
545 * Each entry has been adjusted so that the 16-bit sum with the desired
546 * character gives the actual index into <code>DATA</code>.
548 String[] BLOCKS = new String[]{
550 for ($plane = 0; $plane <= 0x10; $plane++) {
551 # The following if statement handles the cases of unassigned planes
552 # specially so we don't waste space with unused Strings. As of
553 # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used. If
554 # you are updating this script to work with a later version of
555 # Unicode you may have to alter this if statement.
556 if ($plane > 2 && $plane != 14) {
557 print OUTPUT
($plane == 0x10) ?
" \"\"}" : " \"\",\n\n";
560 for ($i = 0; $i < @
{$blocksArray[$plane]} / 11; $i++) {
561 print OUTPUT
$i ?
"\n + " : " ";
564 last if @
{$blocksArray[$plane]} <= $i * 11 + $j;
565 my $val = $blocksArray[$plane]->[$i * 11 + $j];
566 print OUTPUT javaChar
($val);
570 print OUTPUT
",\n\n";
577 * The array containing the numeric values that are too large to be stored as
578 * chars in NUM_VALUE. NUM_VALUE in this case will contain a negative integer
579 * N such that LARGENUMS[-N - 3] contains the correct numeric value.
583 for ($i = 0; $i < @largeNums - 1; $i++) {
585 print OUTPUT
" = new int[] {";
587 print OUTPUT
$largeNums[$i], ", ";
589 if (scalar(@largeNums) > 0){
590 print OUTPUT
$largeNums[-1], "}";
593 print OUTPUT
" = null";
599 * Information about each character. The low order 5 bits form the
600 * character type, the next bit is a flag for non-breaking spaces, and the
601 * next bit is a flag for mirrored directionality. The high order 9 bits
602 * form the offset into the attribute tables. Note that this limits the
603 * number of unique character attributes to 512, which is not a problem
604 * as of Unicode version 4.0.0, but may soon become one.
606 String[] DATA = new String[]{
608 for ($plane = 0; $plane <= 0x10; $plane++) {
609 # The following if statement handles the cases of unassigned planes
610 # specially so we don't waste space with unused Strings. As of
611 # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used. If
612 # you are updating this script to work with a later version of
613 # Unicode you may have to alter this if statement.
614 if ($plane > 2 && $plane != 14) {
615 print OUTPUT
($plane == 0x10) ?
" \"\"}" : " \"\",\n\n";
618 my $len = length($bestblkstr[$plane]) / 2;
619 for ($i = 0; $i < $len / 11; $i++) {
620 print OUTPUT
$i ?
"\n + " : " ";
623 last if $len <= $i * 11 + $j;
624 my $val = unpack "n", substr($bestblkstr[$plane], 2 * ($i * 11 + $j), 2);
625 print OUTPUT javaChar
($val);
629 print OUTPUT
",\n\n";
636 * This is the attribute table for computing the numeric value of a
637 * character. The value is -1 if Unicode does not define a value, -2
638 * if the value is not a positive integer, otherwise it is the value.
639 * Note that this is a signed value, but stored as an unsigned char
640 * since this is a String literal.
642 String[] NUM_VALUE = new String[]{
645 for ($plane = 0; $plane <= 0x10; $plane++) {
646 # The following if statement handles the cases of unassigned planes
647 # specially so we don't waste space with unused Strings. As of
648 # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used. If
649 # you are updating this script to work with a later version of
650 # Unicode you may have to alter this if statement.
651 if ($plane > 2 && $plane != 14) {
652 print OUTPUT
($plane == 0x10) ?
" \"\"}" : " \"\",\n\n";
655 $len = @
{$charinfoArray[$plane]};
656 for ($i = 0; $i < $len / 11; $i++) {
657 print OUTPUT
$i ?
"\n + " : " ";
660 last if $len <= $i * 11 + $j;
661 my $val = $charinfoArray[$plane]->[$i * 11 + $j][0];
662 print OUTPUT javaChar
($val);
666 print OUTPUT
",\n\n";
673 * This is the attribute table for computing the single-character uppercase
674 * representation of a character. The value is the signed difference
675 * between the character and its uppercase version. Note that this is
676 * stored as an unsigned char since this is a String literal. When
677 * capitalizing a String, you must first check if a multi-character uppercase
678 * sequence exists before using this character.
680 String[] UPPER = new String[]{
683 for ($plane = 0; $plane <= 0x10; $plane++) {
684 # The following if statement handles the cases of unassigned planes
685 # specially so we don't waste space with unused Strings. As of
686 # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used. If
687 # you are updating this script to work with a later version of
688 # Unicode you may have to alter this if statement.
689 if ($plane > 2 && $plane != 14) {
690 print OUTPUT
($plane == 0x10) ?
" \"\"}" : " \"\",\n\n";
693 $len = @
{$charinfoArray[$plane]};
694 for ($i = 0; $i < $len / 11; $i++) {
695 print OUTPUT
$i ?
"\n + " : " ";
698 last if $len <= $i * 11 + $j;
699 my $val = $charinfoArray[$plane]->[$i * 11 + $j][1];
700 print OUTPUT javaChar
($val);
704 print OUTPUT
",\n\n";
711 * This is the attribute table for computing the lowercase representation
712 * of a character. The value is the signed difference between the
713 * character and its lowercase version. Note that this is stored as an
714 * unsigned char since this is a String literal.
716 String[] LOWER = new String[]{
719 for ($plane = 0; $plane <= 0x10; $plane++) {
720 # The following if statement handles the cases of unassigned planes
721 # specially so we don't waste space with unused Strings. As of
722 # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used. If
723 # you are updating this script to work with a later version of
724 # Unicode you may have to alter this if statement.
725 if ($plane > 2 && $plane != 14) {
726 print OUTPUT
($plane == 0x10) ?
" \"\"}" : " \"\",\n\n";
729 $len = @
{$charinfoArray[$plane]};
730 for ($i = 0; $i < $len / 11; $i++) {
731 print OUTPUT
$i ?
"\n + " : " ";
734 last if $len <= $i * 11 + $j;
735 my $val = $charinfoArray[$plane]->[$i * 11 + $j][2];
736 print OUTPUT javaChar
($val);
740 print OUTPUT
",\n\n";
747 * This is the attribute table for computing the directionality class
748 * of a character, as well as a marker of characters with a multi-character
749 * capitalization. The direction is taken by performing a signed shift
750 * right by 2 (where a result of -1 means an unknown direction, such as
751 * for undefined characters). The lower 2 bits form a count of the
752 * additional characters that will be added to a String when performing
753 * multi-character uppercase expansion. This count is also used, along with
754 * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
755 * when performing the case conversion. Note that this information is stored
756 * as an unsigned char since this is a String literal.
758 String[] DIRECTION = new String[]{
761 for ($plane = 0; $plane <= 0x10; $plane++) {
762 # The following if statement handles the cases of unassigned planes
763 # specially so we don't waste space with unused Strings. As of
764 # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used. If
765 # you are updating this script to work with a later version of
766 # Unicode you may have to alter this if statement.
767 if ($plane > 2 && $plane != 14) {
768 print OUTPUT
($plane == 0x10) ?
" \"\"}" : " \"\",\n\n";
771 $len = @
{$charinfoArray[$plane]};
772 for ($i = 0; $i < $len / 11; $i++) {
773 print OUTPUT
$i ?
"\n + " : " ";
776 last if $len <= $i * 11 + $j;
777 my $val = $charinfoArray[$plane]->[$i * 11 + $j][3];
778 print OUTPUT javaChar
($val);
782 print OUTPUT
",\n\n";
789 * This is the listing of titlecase special cases (all other characters
790 * can use <code>UPPER</code> to determine their titlecase). The listing
791 * is a sorted sequence of character pairs; converting the first character
792 * of the pair to titlecase produces the second character.
797 $len = length($titlecase) / 2;
798 for ($i = 0; $i < $len / 11; $i++) {
799 print OUTPUT
$i ?
"\n + \"" : " = \"";
801 last if $len <= $i * 11 + $j;
802 my $val = unpack "n", substr($titlecase, 2 * ($i * 11 + $j), 2);
803 print OUTPUT javaChar
($val);
812 * This is a listing of characters with multi-character uppercase sequences.
813 * A character appears in this list exactly when it has a non-zero entry
814 * in the low-order 2-bit field of DIRECTION. The listing is a sorted
815 * sequence of pairs (hence a binary search on the even elements is an
816 * efficient way to lookup a character). The first element of a pair is the
817 * character with the expansion, and the second is the index into
818 * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
819 * DIRECTION to determine where the expansion ends.
824 my @list = sort {$a <=> $b} keys %special;
828 for ($i = 0; $i < $len / 5; $i++) {
829 print OUTPUT
$i ?
"\n + \"" : " = \"";
831 last if $len <= $i * 5 + $j;
832 my $ch = $list[$i * 5 + $j];
833 print OUTPUT javaChar
($ch);
834 print OUTPUT javaChar
($offset);
835 $offset += @
{$special{$ch}};
836 $expansion .= pack "n*", @
{$special{$ch}};
845 * This is the listing of special case multi-character uppercase sequences.
846 * Characters listed in UPPER_SPECIAL index into this table to find their
847 * uppercase expansion. Remember that you must also perform special-casing
848 * on two single-character sequences in the Turkish locale, which are not
849 * covered here in CharData.
854 $len = length($expansion) / 2;
855 for ($i = 0; $i < $len / 11; $i++) {
856 print OUTPUT
$i ?
"\n + \"" : " = \"";
858 last if $len <= $i * 11 + $j;
859 my $val = unpack "n", substr($expansion, 2 * ($i * 11 + $j), 2);
860 print OUTPUT javaChar
($val);
865 print OUTPUT
";\n}\n";