2014-07-29 Ed Smith-Rowland <3dw4rd@verizon.net>
[official-gcc.git] / libjava / scripts / unicode-to-chartables.pl
blob49095352b3ce2426a7d5c62c56e6be8ab3ba8b35
1 #!/usr/bin/perl -w
2 # unicode-to-chartables.pl -- generate Unicode database for java.lang.Character
3 # Copyright (C) 1998, 2002, 2004, 2006 Free Software Foundation, Inc.
5 # This file is part of GNU Classpath.
7 # GNU Classpath is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
12 # GNU Classpath is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GNU Classpath; see the file COPYING. If not, write to the
19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 # 02110-1301 USA.
22 # Linking this library statically or dynamically with other modules is
23 # making a combined work based on this library. Thus, the terms and
24 # conditions of the GNU General Public License cover the whole
25 # combination.
27 # As a special exception, the copyright holders of this library give you
28 # permission to link this library with independent modules to produce an
29 # executable, regardless of the license terms of these independent
30 # modules, and to copy and distribute the resulting executable under
31 # terms of your choice, provided that you also meet, for each linked
32 # independent module, the terms and conditions of the license of that
33 # module. An independent module is a module which is not derived from
34 # or based on this library. If you modify this library, you may extend
35 # this exception to your version of the library, but you are not
36 # obligated to do so. If you do not wish to do so, delete this
37 # exception statement from your version.
39 # Code for reading UnicodeData-4.0.0.txt and SpecialCasing-4.0.0.txt to generate
40 # the code for java-chartables.h. The relevant files can be found here:
42 # http://www.unicode.org/Public/4.0-Update/UnicodeData-4.0.0.txt
43 # http://www.unicode.org/Public/4.0-Update/SpecialCasing-4.0.0.txt
45 # Inspired by code from Jochen Hoenicke.
46 # author Eric Blake <ebb9@email.byu.edu>
47 # Unicode 4.0.0 support by Anthony Balkissoon <abalkiss@redhat.com>
# Usage: ./unicode-to-chartables.pl <UnicodeData> <SpecialCasing> <tables>
# where <UnicodeData> is obtained from www.unicode.org (named
# UnicodeData-4.0.0.txt for Unicode version 4.0.0), <SpecialCasing>
# is obtained from www.unicode.org too (named SpecialCasing-4.0.0.txt for
# Unicode version 4.0.0), and <tables> is the final location for the header
# file java-chartables.h. As of JDK 1.5, use Unicode version 4.0.0
# for best results.
## Return the given value reinterpreted as a 16 bit signed (two's complement)
## number.  Only the low 16 bits of the argument are significant, so both
## 0xffff and -1 yield -1.
sub cShort($) {
    my ($char) = @_;
    # Pack and unpack with the same native 16 bit width.  Packing as a wider
    # "I" and reading the first two bytes back with "s" only gives the low
    # half of the value on little-endian hosts.
    return unpack "s", pack "S", $char & 0xffff;
}
67 ## Convert the text UnicodeData file from www.unicode.org into a header file
68 ## interface with arrays holding the compressed information.
# General category codes; a character's type number is the index of its
# category in this list.
my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
                   SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
# Bidirectional class codes; a character's direction number is the index of
# its class in this list.
my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);

# Flag bits OR'ed into the type number.
my $NOBREAK_FLAG = 32;
my $MIRRORED_FLAG = 64;

# Maps a code point to the list of code points of its multi-character
# uppercase expansion (filled in from the special casing file).
my %special = ();

# infoArray is an array where each element is a reference to a list of
# character information for the characters in one plane.  The index of each
# list is equal to the plane that it corresponds to even though most of these
# lists will currently be empty.  This is done so that this script can be
# easily modified to accommodate future versions of Unicode.
#
# Note: the former initializer \((), (), ...) flattened to an empty list, so
# the per-plane lists only came into existence through autovivification.
# Create the 17 lists explicitly instead.
my @infoArray = map { [] } 0 .. 0x10;

# info is a reference to one of the lists in infoArray, depending on which
# plane we're currently parsing.
my $info;

# largeNums is an array of numerical values that are too large to fit
# into the 16 bit char where most numerical values are stored.
# What is stored in the char then is a number N such that (-N - 3) is
# the index into largeNums where the numerical value can be found.
my @largeNums = ();

# Packed (character, titlecase character) pairs for characters whose
# titlecase mapping differs from their uppercase mapping.
my $titlecase = "";
# Progress counter used for printing dots.
my $count = 0;
# Code point of a pending "<..., First>" range entry, or 0 when none.
my $range = 0;

die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <java-chartables.h>"
    unless @ARGV == 3;
$| = 1;
print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
108 ################################################################################
109 ################################################################################
110 # Stage 0: Parse the special casing file
# Read the special casing file and remember, for every character whose
# uppercase mapping consists of several characters, the list of code points
# it expands to.
print "Parsing special casing file\n";
# Three-argument open with a lexical handle: the file name is taken
# literally instead of being parsed for mode characters.
open(my $special_fh, '<', $ARGV[1])
    or die "Can't open special casing file: $!\n";
while (my $line = <$special_fh>) {
    next if $line =~ /^\#/;
    my ($ch, undef, undef, $upper) = split / *; */, $line;

    # This grabs only the special casing for multi-char uppercase. Note that
    # there are no multi-char lowercase, and that Sun ignores multi-char
    # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
    # which must be hardcoded in java.lang.String:
    #  \u03a3 (Sun ignores this special case)
    #  \u0049 - lowercases to \u0131, but only in Turkish locale
    #  \u0069 - uppercases to \u0130, but only in Turkish locale
    next unless defined $upper and $upper =~ / /;
    $special{hex $ch} = [ map { hex($_) } split(' ', $upper) ];
}
close $special_fh;
131 ################################################################################
132 ################################################################################
133 ## Stage 1: Parse the attribute file
# Read the attribute file line by line and store, per plane, one packed
# record (type, numeric value, upper delta, lower delta, direction) for
# every character listed, expanding "<..., First>" / "<..., Last>" ranges.
print "Parsing attributes file";
# Three-argument open with a lexical handle: the file name is taken
# literally instead of being parsed for mode characters.
open(my $unicode_fh, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";
while (my $line = <$unicode_fh>) {
    print "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;
    # A limit of -1 keeps trailing empty fields, so the case mapping fields
    # are empty strings rather than undef when a line ends in ";".
    my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
        $mirrored, undef, undef, $upcase, $lowcase, $title)
        = split /;/, $line, -1;
    $ch = hex($ch);

    # plane tells us which Unicode code plane we're currently in and is an
    # index into infoArray.
    my $plane = int($ch / 0x10000);
    my $planeBase = $plane * 0x10000;
    $info = \@{$infoArray[$plane]};

    my ($type, $numValue, $upperchar, $lowerchar, $direction);

    # The type is the index of the category in TYPECODES, plus flag bits.
    $type = 0;
    while ($category !~ /^$TYPECODES[$type]$/) {
        if (++$type == @TYPECODES) {
            die "$ch: Unknown type: $category";
        }
    }
    $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
    $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

    if ($numeric =~ /^[0-9]+$/) {
        $numValue = $numeric;
        # If numeric takes more than 16 bits to store we want to store that
        # number in a separate array and store a number N in numValue such
        # that (-N - 3) is the offset into the separate array containing the
        # large numerical value.
        if ($numValue >= 0x7fff) {
            $numValue = -3 - @largeNums;
            push @largeNums, $numeric;
        }
    } elsif ($numeric eq "") {
        # Special case the letter sequences 'A'-'Z' and 'a'-'z' (and their
        # fullwidth forms), which get the values 10 .. 35.
        if ($ch >= 0x0041 && $ch <= 0x005a) {
            $numValue = $ch - 0x0037;
        } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
            $numValue = $ch - 0x0057;
        } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
            $numValue = $ch - 0xff17;
        } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
            $numValue = $ch - 0xff37;
        } else {
            $numValue = -1;
        }
    } else {
        # Numeric field present but not a plain non-negative integer.
        $numValue = -2;
    }

    # Case mappings are stored as the difference to the character itself.
    $upperchar = $upcase ? hex($upcase) - $ch : 0;
    $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
    if ($title ne $upcase) {
        my $titlechar = $title ? hex($title) : $ch;
        $titlecase .= pack("n2", $ch, $titlechar);
    }

    # The direction is the index of the bidi class in DIRCODES, or -1 when
    # the class is not listed there.
    $direction = 0;
    while ($bidir !~ /^$DIRCODES[$direction]$/) {
        if (++$direction == @DIRCODES) {
            $direction = -1;
            last;
        }
    }
    # The low two bits hold the last index of the multi-character uppercase
    # expansion, if the character has one.
    $direction <<= 2;
    $direction += $#{$special{$ch}} if defined $special{$ch};

    if ($range) {
        die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
        # Every character strictly between First and Last gets the
        # attributes of the Last entry.
        for my $member ($range + 1 .. $ch - 1) {
            $info->[$member - $planeBase] = pack("n5", $type, $numValue,
                                                 $upperchar, $lowerchar,
                                                 $direction);
        }
        $range = 0;
    } elsif ($name =~ /First>$/) {
        $range = $ch;
    }
    # Store all this parsed information into the element in infoArray that
    # info points to.
    $info->[$ch - $planeBase] = pack("n5", $type, $numValue, $upperchar,
                                     $lowerchar, $direction);
}
close $unicode_fh;
223 ################################################################################
224 ################################################################################
225 ## Stage 2: Compress the data structures
printf "\nCompressing data structures";
$count = 0;

# data holds, per plane, a string of packed 16 bit words (one per character)
# that will be used to create the data tables containing character types and
# offsets into the attribute tables.
my @data = ();

# charhashArray is an array of hashtables used so that we can reuse character
# attributes when characters share the same attributes ... this makes our
# attribute tables smaller.  charhash is a pointer into this array.
my @charhashArray = map { +{} } 0 .. 0x10;
my $charhash;

# charinfoArray is an array of arrays, one per plane, for storing character
# information.  charinfo is a pointer into this array.
# (The former initializer \((), (), ...) flattened to an empty list; create
# the per-plane arrays explicitly.)
my @charinfoArray = map { [] } 0 .. 0x10;
my $charinfo;

# charlen is an array, one element per plane, that tells us how many unique
# character attributes there are for that plane.
my @charlen = ();

for my $plane (0 .. 0x10) {
    $info = \@{$infoArray[$plane]};
    my $planeBase = $plane * 0x10000;
    $charhash = \%{$charhashArray[$plane]};
    $charinfo = \@{$charinfoArray[$plane]};

    for my $ch ($planeBase .. $planeBase + 0xffff) {
        my $index = $ch - $planeBase;
        print "." unless $count++ % 0x1000;
        # Characters absent from the attribute file get a default record.
        $info->[$index] = pack("n5", 0, -1, 0, 0, -4)
            unless defined $info->[$index];

        my ($type, $numVal, $upper, $lower, $direction)
            = unpack("n5", $info->[$index]);
        if (! exists $charhash->{$info->[$index]}) {
            # If we entered this branch that means the character we're
            # looking at now has attributes that are unique from those that
            # we've looked at so far for this plane.  So we push its
            # attributes into charinfo and store in charhash the offset into
            # charinfo where these attributes can later be found.
            push @{$charinfo}, [ $numVal, $upper, $lower, $direction ];
            $charhash->{$info->[$index]} = @{$charinfo} - 1;
            # When the file is generated, the number we just stored in
            # charhash will be the upper 9 bits of the data word that are an
            # offset into the attribute tables.
        }
        my $offset = $charhash->{$info->[$index]};
        # Only 9 bits are available for the offset; a larger value would
        # silently wrap when packed and corrupt the generated tables.
        die "Too many unique character attributes in plane $plane: "
            . ($offset + 1) . " (limit is 512)\n"
            if $offset > 0x1ff;
        $data[$plane] .= pack("n", ($offset << 7) | $type);
    }
    $charlen[$plane] = scalar(@{$charinfoArray[$plane]});
}
# For every plane, try each block shift from 3 to 8, cut the plane's data
# into blocks of that size, drop duplicate blocks, overlap blocks whose head
# matches another block's tail, and remember the shift with the smallest
# estimated table size.

# the shift that results in the best compression of the table.  This is an
# array because different shifts are better for the different tables for
# each plane.
my @bestshift;

# Smallest size estimate seen so far for the current plane; starts with an
# initial guess.
my $bestest = 1000000;
# Merged block string belonging to the best shift, per plane.
my @bestblkstr;
# Block size (1 << best shift), per plane.
my @blksize = ();

for my $plane (0 .. 0x10) {
    print "\n\nplane: $plane\n";
    print "Unique character entries: $charlen[$plane]\n";
    $bestest = 1000000;
    for my $i (3 .. 8) {
        my $size = 1 << $i;
        my %seen = ();
        my @chunks = ();
        print "shift: $i";

        # Collect the distinct blocks of this size in order of appearance.
        for (my $start = 0; $start < 0x10000; $start += $size) {
            my $chunk = substr($data[$plane], 2 * $start, 2 * $size);
            next if exists $seen{$chunk};
            push @chunks, $chunk;
            $seen{$chunk} = $#chunks;
        }

        my $blknum = @chunks;
        my $blocklen = $blknum * $size;
        printf " before %5d", $blocklen;

        # Now we try to pack the chunks as tight as possible by finding
        # matching heads and tails, trying the longest overlap first.
        for (my $overlap = $size - 1; $overlap > 0; $overlap--) {
            # Map every tail of $overlap words to the chunks ending in it.
            my %tails = ();
            for my $idx (0 .. $#chunks) {
                next unless defined $chunks[$idx];
                my $tail = substr($chunks[$idx],
                                  length($chunks[$idx]) - $overlap * 2);
                push @{ $tails{$tail} }, $idx;
            }

            # tails are calculated, now calculate the heads and merge.
          BLOCK:
            for my $idx (0 .. $#chunks) {
                next BLOCK unless defined $chunks[$idx];
                my $tomerge = $idx;
                while (1) {
                    my $head = substr($chunks[$tomerge], 0, $overlap * 2);
                    my $candidates = $tails{$head};
                    next BLOCK unless defined $candidates;

                    my $other = shift @{$candidates};
                    if ($other == $tomerge) {
                        # A chunk cannot be merged with itself: put it back
                        # at the end and take the next candidate, if any.
                        push @{$candidates}, $other;
                        next BLOCK if @{$candidates} == 1;
                        $other = shift @{$candidates};
                    }
                    delete $tails{$head} if @{$candidates} == 0;

                    # a match was found
                    my $merged = $chunks[$other]
                        . substr($chunks[$tomerge], $overlap * 2);
                    $blocklen -= $overlap;
                    $blknum--;

                    if ($other < $tomerge) {
                        # Keep the result in the earlier slot and let the
                        # tail list refer to that slot from now on.
                        $chunks[$tomerge] = undef;
                        $chunks[$other] = $merged;
                        my $tail = substr($merged,
                                          length($merged) - $overlap * 2);
                        $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
                                          @{$tails{$tail}} ];
                        next BLOCK;
                    }
                    $chunks[$tomerge] = $merged;
                    $chunks[$other] = undef;
                }
            }
        }

        # Concatenate whatever chunks survived the merging.
        my $blockstr = join '', grep { defined } @chunks;

        die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
        my $estimate = 2 * $blocklen + (0x20000 >> $i);

        printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
        if ($estimate < $bestest) {
            $bestest = $estimate;
            $bestshift[$plane] = $i;
            $bestblkstr[$plane] = $blockstr;
        }
    }
    $blksize[$plane] = 1 << $bestshift[$plane];
    print "best shift: ", $bestshift[$plane];
    print " blksize: ", $blksize[$plane];
}
# blocksArray holds, per plane, one entry for every block of the plane: the
# position of the block inside the merged block string, stored relative to
# the block's first character and reduced to 16 bits.  The per-plane lists
# are created on first push.
my @blocksArray;

for my $plane (0 .. 0x10) {
    my $step = $blksize[$plane];
    for (my $offset = 0; $offset < 0x10000; $offset += $step) {
        my $chunk = substr($data[$plane], 2 * $offset, 2 * $step);
        my $pos = index($bestblkstr[$plane], $chunk);
        # Only matches on an even byte position are word aligned; keep
        # searching past odd ones.  A result of -1 (not found) is odd too.
        while ($pos & 1) {
            die "not found: $offset" if $pos == -1;
            $pos = index($bestblkstr[$plane], $chunk, $pos + 1);
        }
        push @{$blocksArray[$plane]}, ($pos / 2 - $offset) & 0xffff;
    }
}
404 ################################################################################
405 ################################################################################
406 ## Stage 3: Generate the file
# Refuse to generate tables that are too large for any plane.
for my $plane (0 .. 0x10) {
    die "UTF-8 limit of blocks may be exceeded for plane $plane: " . scalar(@{$blocksArray[$plane]}) . "\n"
        if @{$blocksArray[$plane]} > 0xffff / 3;
    die "UTF-8 limit of data may be exceeded for plane $plane: " . length($bestblkstr[$plane]) . "\n"
        if length($bestblkstr[$plane]) > 0xffff / 3;
}

print "\nGenerating $ARGV[2].";
# Loop counters shared by the table printing code below.
my ($i, $j);

# Three-argument open: the file name is taken literally instead of being
# parsed for mode characters.  The handle stays a bareword because the rest
# of the script prints to OUTPUT.
open(OUTPUT, '>', $ARGV[2]) or die "Failed creating output file: $!\n";
419 print OUTPUT <<EOF;
420 /* java-chartables.h -- Character tables for java.lang.Character -*- c++ -*-
421 Copyright (C) 2002, 2006 Free Software Foundation, Inc.
422 *** This file is generated by scripts/unicode-to-chartables.pl ***
424 This file is part of GNU Classpath.
426 GNU Classpath is free software; you can redistribute it and/or modify
427 it under the terms of the GNU General Public License as published by
428 the Free Software Foundation; either version 2, or (at your option)
429 any later version.
431 GNU Classpath is distributed in the hope that it will be useful, but
432 WITHOUT ANY WARRANTY; without even the implied warranty of
433 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
434 General Public License for more details.
436 You should have received a copy of the GNU General Public License
437 along with GNU Classpath; see the file COPYING. If not, write to the
438 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
439 02110-1301 USA.
441 Linking this library statically or dynamically with other modules is
442 making a combined work based on this library. Thus, the terms and
443 conditions of the GNU General Public License cover the whole
444 combination.
446 As a special exception, the copyright holders of this library give you
447 permission to link this library with independent modules to produce an
448 executable, regardless of the license terms of these independent
449 modules, and to copy and distribute the resulting executable under
450 terms of your choice, provided that you also meet, for each linked
451 independent module, the terms and conditions of the license of that
452 module. An independent module is a module which is not derived from
453 or based on this library. If you modify this library, you may extend
454 this exception to your version of the library, but you are not
455 obligated to do so. If you do not wish to do so, delete this
456 exception statement from your version. */
458 #ifndef __JAVA_CHARTABLES_H__
459 #define __JAVA_CHARTABLES_H__
461 // These tables are automatically generated by scripts/unicode_to_chartables.pl.
462 // The Unicode data comes from www.unicode.org; this header is based on
463 // UnicodeData-4.0.0.txt. JDK 1.5 uses Unicode version 4.0.0.
464 // DO NOT EDIT the tables. Instead, fix the upstream scripts and run
465 // them again.
467 // The data is stored in C style arrays of the appropriate CNI types, to
468 // guarantee that the data is constant and non-relocatable. The field
469 // <code>blocks</code> stores the offset of a block of 2<sup>SHIFT</sup>
470 // characters within <code>data</code>. The data field, in turn, stores
471 // information about each character in the low order bits, and an offset
472 // into the attribute tables <code>upper</code>, <code>lower</code>,
473 // <code>numValue</code>, and <code>direction</code>. Notice that the
474 // attribute tables are much smaller than 0xffff entries; as many characters
475 // in Unicode share common attributes. Finally, there is a listing for
476 // <code>title</code> exceptions (most characters just have the same title
477 // case as upper case).
479 // This file should only be included by natCharacter.cc
482 * The array containing the numeric values that are too large to be stored as
483 * chars in NUM_VALUE. NUM_VALUE in this case will contain a negative integer
484 * N such that LARGENUMS[-N - 3] contains the correct numeric value.
# Emit the largenums array: every collected large value followed by ", ".
print OUTPUT "static const jint largenums[] = {\n ";
print OUTPUT $_, ", " for @largeNums;
print OUTPUT "}";
492 print OUTPUT <<EOF;
496 * The character shift amount to look up the block offset. In other words,
497 * <code>(char) (blocks[p][off >> SHIFT[p]] + off)</code> is the index where
498 * <code>ch</code> is described in <code>data</code>, where <code>off</code>
499 * is ch & 0xffff and <code>p</code> is the plane the character belongs to.
# Emit the shift array: the best shift of every plane followed by ", ".
print OUTPUT "static const int shift[] = {\n ";
print OUTPUT $_, ", " for @bestshift;
print OUTPUT "}";
507 print OUTPUT <<EOF;
511 * The mapping of character blocks to their location in <code>data</code>.
512 * Each entry has been adjusted so that a modulo 16 sum with the desired
513 * character gives the actual index into <code>data</code>.
# Emit one blocksN array per used plane, ten entries per row, followed by
# the blocks_length array (-1 for planes without a table).
for my $plane (0 .. 0x10) {
    # The following test handles the cases of unassigned planes
    # specially so we don't waste space with unused Strings.  As of
    # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
    # you are updating this script to work with a later version of
    # Unicode you may have to alter this test.
    next unless $plane <= 2 || $plane == 14;

    print OUTPUT "static const jchar blocks", $plane, "[] = {\n";
    my @pending = @{$blocksArray[$plane]};
    while (my @row = splice(@pending, 0, 10)) {
        print OUTPUT " ";
        print OUTPUT $_, ", " for @row;
        print OUTPUT "\n";
    }
    print OUTPUT "};\n\n";
}

print OUTPUT "static const int blocks_length[] = {\n ";
for my $plane (0 .. 0x10) {
    my $used = $plane <= 2 || $plane == 14;
    my $length = $used ? scalar(@{$blocksArray[$plane]}) : "-1";
    print OUTPUT $length, ", ";
}
print OUTPUT "};\n";
546 print OUTPUT <<EOF;
547 static const jchar* blocks[] = {
548 blocks0, blocks1, blocks2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
549 NULL, NULL, NULL, NULL, blocks14, NULL, NULL};
552 * Information about each character. The low order 5 bits form the
553 * character type, the next bit is a flag for non-breaking spaces, and the
554 * next bit is a flag for mirrored directionality. The high order 9 bits
555 * form the offset into the attribute tables. Note that this limits the
556 * number of unique character attributes per plane to 512, which is not a
557 * problem as of Unicode version 4.0.0, but may soon become one.
# Emit one dataN array per used plane, ten 16 bit words per row, followed by
# the data_length array (-1 for planes without a table).
for my $plane (0 .. 0x10) {
    # The following test handles the cases of unassigned planes
    # specially so we don't waste space with unused Strings.  As of
    # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
    # you are updating this script to work with a later version of
    # Unicode you may have to alter this test.
    next unless $plane <= 2 || $plane == 14;

    print OUTPUT "static const jchar data", $plane, "[] = {\n";
    # The merged block string is a sequence of big-endian 16 bit words.
    my @pending = unpack "n*", $bestblkstr[$plane];
    while (my @row = splice(@pending, 0, 10)) {
        print OUTPUT " ";
        print OUTPUT $_, ", " for @row;
        print OUTPUT "\n";
    }
    print OUTPUT "};\n\n";
}

print OUTPUT "static const int data_length[] = {\n ";
for my $plane (0 .. 0x10) {
    my $used = $plane <= 2 || $plane == 14;
    my $length = $used ? length($bestblkstr[$plane]) / 2 : "-1";
    print OUTPUT $length, ", ";
}
print OUTPUT "};\n";
591 print OUTPUT <<EOF;
592 static const jchar* data[] = {
593 data0, data1, data2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
594 NULL, NULL, NULL, NULL, data14, NULL, NULL};
598 * This is the attribute table for computing the numeric value of a
599 * character. The value is -1 if Unicode does not define a value, -2
600 * if the value is not a positive integer, otherwise it is the value.
# Emit one numValueN array per used plane, thirteen entries per row,
# followed by the numValue_length array (-1 for planes without a table).
for my $plane (0 .. 0x10) {
    # The following test handles the cases of unassigned planes
    # specially so we don't waste space with unused Strings.  As of
    # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
    # you are updating this script to work with a later version of
    # Unicode you may have to alter this test.
    next unless $plane <= 2 || $plane == 14;

    print OUTPUT "static const jshort numValue", $plane, "[] = {\n";
    # Field 0 of every attribute record is the numeric value.
    my @pending = map { $_->[0] } @{$charinfoArray[$plane]};
    while (my @row = splice(@pending, 0, 13)) {
        print OUTPUT " ";
        print OUTPUT cShort($_), ", " for @row;
        print OUTPUT "\n";
    }
    print OUTPUT "};\n\n";
}

print OUTPUT "static const int numValue_length[] = {\n ";
for my $plane (0 .. 0x10) {
    my $used = $plane <= 2 || $plane == 14;
    my $length = $used ? scalar(@{$charinfoArray[$plane]}) : "-1";
    print OUTPUT $length, ", ";
}
print OUTPUT "};\n";
634 print OUTPUT <<EOF;
635 static const jshort* numValue[] = {
636 numValue0, numValue1, numValue2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
637 NULL, NULL, NULL, NULL, numValue14, NULL, NULL};
642 * This is the attribute table for computing the uppercase representation
643 * of a character. The value is the difference between the character and
644 * its uppercase version.
# Emit one upperN array per used plane, thirteen entries per row, followed
# by the upper_length array (-1 for planes without a table).
for my $plane (0 .. 0x10) {
    # The following test handles the cases of unassigned planes
    # specially so we don't waste space with unused Strings.  As of
    # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
    # you are updating this script to work with a later version of
    # Unicode you may have to alter this test.
    next unless $plane <= 2 || $plane == 14;

    print OUTPUT "static const jshort upper", $plane, "[] = {\n";
    # Field 1 of every attribute record is the uppercase difference.
    my @pending = map { $_->[1] } @{$charinfoArray[$plane]};
    while (my @row = splice(@pending, 0, 13)) {
        print OUTPUT " ";
        print OUTPUT cShort($_), ", " for @row;
        print OUTPUT "\n";
    }
    print OUTPUT "};\n\n";
}

print OUTPUT "static const int upper_length[] = {\n ";
for my $plane (0 .. 0x10) {
    my $used = $plane <= 2 || $plane == 14;
    my $length = $used ? scalar(@{$charinfoArray[$plane]}) : "-1";
    print OUTPUT $length, ", ";
}
print OUTPUT "};\n";
678 print OUTPUT <<EOF;
679 static const jshort* upper[] = {
680 upper0, upper1, upper2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
681 NULL, NULL, NULL, NULL, upper14, NULL, NULL};
685 * This is the attribute table for computing the lowercase representation
686 * of a character. The value is the difference between the character and
687 * its lowercase version.
# Emit one lowerN array per used plane, thirteen entries per row, followed
# by the lower_length array (-1 for planes without a table).
for my $plane (0 .. 0x10) {
    # The following test handles the cases of unassigned planes
    # specially so we don't waste space with unused Strings.  As of
    # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
    # you are updating this script to work with a later version of
    # Unicode you may have to alter this test.
    next unless $plane <= 2 || $plane == 14;

    print OUTPUT "static const jshort lower", $plane, "[] = {\n";
    # Field 2 of every attribute record is the lowercase difference.
    my @pending = map { $_->[2] } @{$charinfoArray[$plane]};
    while (my @row = splice(@pending, 0, 13)) {
        print OUTPUT " ";
        print OUTPUT cShort($_), ", " for @row;
        print OUTPUT "\n";
    }
    print OUTPUT "};\n\n";
}

print OUTPUT "static const int lower_length[] = {\n ";
for my $plane (0 .. 0x10) {
    my $used = $plane <= 2 || $plane == 14;
    my $length = $used ? scalar(@{$charinfoArray[$plane]}) : "-1";
    print OUTPUT $length, ", ";
}
print OUTPUT "};\n";
721 print OUTPUT <<EOF;
722 static const jshort* lower[] = {
723 lower0, lower1, lower2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
724 NULL, NULL, NULL, NULL, lower14, NULL, NULL};
728 * This is the attribute table for computing the directionality class
729 * of a character. At present, the value is in the range 0 - 18 if the
730 * character has a direction, otherwise it is -1.
# Emit one directionN array per used plane, nineteen entries per row,
# followed by the direction_length array (-1 for planes without a table).
for my $plane (0 .. 0x10) {
    # The following test handles the cases of unassigned planes
    # specially so we don't waste space with unused Strings.  As of
    # Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If
    # you are updating this script to work with a later version of
    # Unicode you may have to alter this test.
    next unless $plane <= 2 || $plane == 14;

    print OUTPUT "static const jbyte direction", $plane, "[] = {\n";
    # Field 3 of every attribute record is the direction shifted left by
    # two bits.
    my @pending = map { $_->[3] } @{$charinfoArray[$plane]};
    while (my @row = splice(@pending, 0, 19)) {
        print OUTPUT " ";
        for my $packed (@row) {
            my $dir = $packed >> 2;
            # Anything outside 0 .. 18 means "no direction".
            $dir = -1 if $dir < 0 || $dir > 18;
            print OUTPUT cShort($dir), ", ";
        }
        print OUTPUT "\n";
    }
    print OUTPUT "};\n\n";
}

print OUTPUT "static const int direction_length[] = {\n ";
for my $plane (0 .. 0x10) {
    my $used = $plane <= 2 || $plane == 14;
    my $length = $used ? scalar(@{$charinfoArray[$plane]}) : "-1";
    print OUTPUT $length, ", ";
}
print OUTPUT "};\n";
768 print OUTPUT <<EOF;
769 static const jbyte* direction[] = {
770 direction0, direction1, direction2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
771 NULL, NULL, NULL, NULL, direction14, NULL, NULL};
775 * This is the listing of titlecase special cases (all other character
776 * can use <code>upper</code> to determine their titlecase). The listing
777 * is a sequence of character pairs; converting the first character of the
778 * pair to titlecase produces the second character.
780 static const jchar title[] = {
# Emit the contents of the title array, ten 16 bit words per row, then close
# the array and start the title_length definition.
my @titleWords = unpack "n*", $titlecase;
my $titleLen = length($titlecase) / 2;
my $firstRow = 1;
while (my @row = splice(@titleWords, 0, 10)) {
    # Every row but the first starts on a new line.
    print OUTPUT $firstRow ? " " : "\n ";
    $firstRow = 0;
    print OUTPUT $_, ", " for @row;
}

print OUTPUT "\n };";
print OUTPUT "\n/** Length of title. */\nstatic const int title_length = ", $titleLen;
795 print OUTPUT <<EOF;
798 #endif /* __JAVA_CHARTABLES_H__ */
# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so the result of close must be checked.
close OUTPUT or die "Failed writing output file: $!\n";

print "\nDone.\n";