Dead
[official-gcc.git] / gomp-20050608-branch / libjava / classpath / scripts / unicode-muncher.pl
blobb275f36f07b3a08588994e581b00f583a7b18e62
1 #!/usr/bin/perl -w
2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
3 # Copyright (C) 1998, 2002, 2004 Free Software Foundation, Inc.
5 # This file is part of GNU Classpath.
7 # GNU Classpath is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
12 # GNU Classpath is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GNU Classpath; see the file COPYING. If not, write to the
19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 # 02110-1301 USA.
22 # Linking this library statically or dynamically with other modules is
23 # making a combined work based on this library. Thus, the terms and
24 # conditions of the GNU General Public License cover the whole
25 # combination.
27 # As a special exception, the copyright holders of this library give you
28 # permission to link this library with independent modules to produce an
29 # executable, regardless of the license terms of these independent
30 # modules, and to copy and distribute the resulting executable under
31 # terms of your choice, provided that you also meet, for each linked
32 # independent module, the terms and conditions of the license of that
33 # module. An independent module is a module which is not derived from
34 # or based on this library. If you modify this library, you may extend
35 # this exception to your version of the library, but you are not
36 # obligated to do so. If you do not wish to do so, delete this
37 # exception statement from your version.
39 # Code for reading UnicodeData.txt and generating the code for
40 # gnu.java.lang.CharData. For now, the relevant Unicode definition files
41 # are found in doc/unicode/.
43 # Inspired by code from Jochen Hoenicke.
44 # author Eric Blake <ebb9@email.byu.edu>
46 # Usage: ./unicode-muncher <UnicodeData> <SpecialCasing> <CharData.java>
47 # where <UnicodeData> and <SpecialCasing> are .txt files obtained from
48 # www.unicode.org (named UnicodeData-3.0.0.txt and SpecialCasing-2.txt for
49 # Unicode version 3.0.0), and <CharData.java> is the final location for the
50 # Java interface gnu.java.lang.CharData.
51 # As of JDK 1.4, use Unicode version 3.0.0 for best results.
54 ## Convert a 16-bit integer to a Java source code String literal character
56 sub javaChar($) {
57 my ($char) = @_;
58 die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
59 $char += 0x10000 if $char < 0;
60 # Special case characters that must be escaped, or are shorter as ASCII
61 return sprintf("\\%03o", $char) if $char < 0x20;
62 return "\\\"" if $char == 0x22;
63 return "\\\\" if $char == 0x5c;
64 return pack("C", $char) if $char < 0x7f;
65 return sprintf("\\u%04x", $char);
69 ## Convert the text UnicodeData file from www.unicode.org into a Java
70 ## interface with string constants holding the compressed information.
72 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
73 SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
74 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
76 my $NOBREAK_FLAG = 32;
77 my $MIRRORED_FLAG = 64;
79 my %special = ();
80 my @info = ();
81 my $titlecase = "";
82 my $count = 0;
83 my $range = 0;
85 die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
86 unless @ARGV == 3;
87 $| = 1;
88 print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
89 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
91 # Stage 0: Parse the special casing file
92 print "Parsing special casing file\n";
93 open (SPECIAL, "< $ARGV[1]") || die "Can't open special casing file: $!\n";
94 while (<SPECIAL>) {
95 next if /^\#/;
96 my ($ch, undef, undef, $upper) = split / *; */;
98 # This grabs only the special casing for multi-char uppercase. Note that
99 # there are no multi-char lowercase, and that Sun ignores multi-char
100 # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
101 # which must be hardcoded in java.lang.String:
102 # \u03a3 (Sun ignores this special case)
103 # \u0049 - lowercases to \u0131, but only in Turkish locale
104 # \u0069 - uppercases to \u0130, but only in Turkish locale
105 next unless defined $upper and $upper =~ / /;
106 $special{hex $ch} = [map {hex} split ' ', $upper];
109 close SPECIAL;
111 # Stage 1: Parse the attribute file
112 print "Parsing attributes file";
113 open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
114 while (<UNICODE>) {
115 print "." unless $count++ % 1000;
116 chomp;
117 s/\r//g;
118 my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
119 $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
120 $ch = hex($ch);
121 next if $ch > 0xffff; # Ignore surrogate pairs, since Java does
123 my ($type, $numValue, $upperchar, $lowerchar, $direction);
125 $type = 0;
126 while ($category !~ /^$TYPECODES[$type]$/) {
127 if (++$type == @TYPECODES) {
128 die "$ch: Unknown type: $category";
131 $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
132 $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
134 if ($numeric =~ /^[0-9]+$/) {
135 $numValue = $numeric;
136 die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
137 } elsif ($numeric eq "") {
138 # Special case sequences of 'a'-'z'
139 if ($ch >= 0x0041 && $ch <= 0x005a) {
140 $numValue = $ch - 0x0037;
141 } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
142 $numValue = $ch - 0x0057;
143 } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
144 $numValue = $ch - 0xff17;
145 } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
146 $numValue = $ch - 0xff37;
147 } else {
148 $numValue = -1;
150 } else {
151 $numValue = -2;
154 $upperchar = $upcase ? hex($upcase) - $ch : 0;
155 $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
156 if ($title ne $upcase) {
157 my $titlechar = $title ? hex($title) : $ch;
158 $titlecase .= pack("n2", $ch, $titlechar);
161 $direction = 0;
162 while ($bidir !~ /^$DIRCODES[$direction]$/) {
163 if (++$direction == @DIRCODES) {
164 $direction = -1;
165 last;
168 $direction <<= 2;
169 $direction += $#{$special{$ch}} if defined $special{$ch};
171 if ($range) {
172 die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
173 for ($range + 1 .. $ch - 1) {
174 $info[$_] = pack("n5", $type, $numValue, $upperchar,
175 $lowerchar, $direction);
177 $range = 0;
178 } elsif ($name =~ /First>$/) {
179 $range = $ch;
181 $info[$ch] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
182 $direction);
184 close UNICODE;
186 # Stage 2: Compress the data structures
187 printf "\nCompressing data structures";
188 $count = 0;
189 my $info = ();
190 my %charhash = ();
191 my @charinfo = ();
193 for my $ch (0 .. 0xffff) {
194 print "." unless $count++ % 0x1000;
195 $info[$ch] = pack("n5", 0, -1, 0, 0, -4) unless defined $info[$ch];
197 my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
198 if (! exists $charhash{$info[$ch]}) {
199 push @charinfo, [ $numVal, $upper, $lower, $direction ];
200 $charhash{$info[$ch]} = $#charinfo;
202 $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
205 my $charlen = @charinfo;
206 my $bestshift;
207 my $bestest = 1000000;
208 my $bestblkstr;
209 die "Too many unique character entries: $charlen\n" if $charlen > 512;
210 print "\nUnique character entries: $charlen\n";
212 for my $i (3 .. 8) {
213 my $blksize = 1 << $i;
214 my %blocks = ();
215 my @blkarray = ();
216 my ($j, $k);
217 print "shift: $i";
219 for ($j = 0; $j < 0x10000; $j += $blksize) {
220 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
221 if (! exists $blocks{$blkkey}) {
222 push @blkarray, $blkkey;
223 $blocks{$blkkey} = $#blkarray;
226 my $blknum = @blkarray;
227 my $blocklen = $blknum * $blksize;
228 printf " before %5d", $blocklen;
230 # Now we try to pack the blkarray as tight as possible by finding matching
231 # heads and tails.
232 for ($j = $blksize - 1; $j > 0; $j--) {
233 my %tails = ();
234 for $k (0 .. $#blkarray) {
235 next unless defined $blkarray[$k];
236 my $len = length $blkarray[$k];
237 my $tail = substr $blkarray[$k], $len - $j * 2;
238 if (exists $tails{$tail}) {
239 push @{$tails{$tail}}, $k;
240 } else {
241 $tails{$tail} = [ $k ];
245 # tails are calculated, now calculate the heads and merge.
246 BLOCK:
247 for $k (0 .. $#blkarray) {
248 next unless defined $blkarray[$k];
249 my $tomerge = $k;
250 while (1) {
251 my $head = substr($blkarray[$tomerge], 0, $j * 2);
252 my $entry = $tails{$head};
253 next BLOCK unless defined $entry;
255 my $other = shift @{$entry};
256 if ($other == $tomerge) {
257 if (@{$entry}) {
258 push @{$entry}, $other;
259 $other = shift @{$entry};
260 } else {
261 push @{$entry}, $other;
262 next BLOCK;
265 if (@{$entry} == 0) {
266 delete $tails{$head};
269 # a match was found
270 my $merge = $blkarray[$other]
271 . substr($blkarray[$tomerge], $j * 2);
272 $blocklen -= $j;
273 $blknum--;
275 if ($other < $tomerge) {
276 $blkarray[$tomerge] = undef;
277 $blkarray[$other] = $merge;
278 my $len = length $merge;
279 my $tail = substr $merge, $len - $j * 2;
280 $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
281 @{$tails{$tail}} ];
282 next BLOCK;
284 $blkarray[$tomerge] = $merge;
285 $blkarray[$other] = undef;
289 my $blockstr;
290 for $k (0 .. $#blkarray) {
291 $blockstr .= $blkarray[$k] if defined $blkarray[$k];
294 die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
295 my $estimate = 2 * $blocklen + (0x20000 >> $i);
297 printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
298 if ($estimate < $bestest) {
299 $bestest = $estimate;
300 $bestshift = $i;
301 $bestblkstr = $blockstr;
305 my @blocks;
306 my $blksize = 1 << $bestshift;
307 for (my $j = 0; $j < 0x10000; $j += $blksize) {
308 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
309 my $index = index $bestblkstr, $blkkey;
310 while ($index & 1) {
311 die "not found: $j" if $index == -1;
312 $index = index $bestblkstr, $blkkey, $index + 1;
314 push @blocks, ($index / 2 - $j) & 0xffff;
317 # Phase 3: Generate the file
318 die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
319 if @blocks > 0xffff / 3;
320 die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
321 if length($bestblkstr) > 0xffff / 3;
323 print "Generating $ARGV[2] with shift of $bestshift";
324 my ($i, $j);
326 open OUTPUT, "> $ARGV[2]" or die "Failed creating output file: $!\n";
327 print OUTPUT <<EOF;
328 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
329 Copyright (C) 2002 Free Software Foundation, Inc.
330 *** This file is generated by scripts/unicode-muncher.pl ***
332 This file is part of GNU Classpath.
334 GNU Classpath is free software; you can redistribute it and/or modify
335 it under the terms of the GNU General Public License as published by
336 the Free Software Foundation; either version 2, or (at your option)
337 any later version.
339 GNU Classpath is distributed in the hope that it will be useful, but
340 WITHOUT ANY WARRANTY; without even the implied warranty of
341 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
342 General Public License for more details.
344 You should have received a copy of the GNU General Public License
345 along with GNU Classpath; see the file COPYING. If not, write to the
346 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
347 02110-1301 USA.
349 Linking this library statically or dynamically with other modules is
350 making a combined work based on this library. Thus, the terms and
351 conditions of the GNU General Public License cover the whole
352 combination.
354 As a special exception, the copyright holders of this library give you
355 permission to link this library with independent modules to produce an
356 executable, regardless of the license terms of these independent
357 modules, and to copy and distribute the resulting executable under
358 terms of your choice, provided that you also meet, for each linked
359 independent module, the terms and conditions of the license of that
360 module. An independent module is a module which is not derived from
361 or based on this library. If you modify this library, you may extend
362 this exception to your version of the library, but you are not
363 obligated to do so. If you do not wish to do so, delete this
364 exception statement from your version. */
366 package gnu.java.lang;
369 * This contains the info about the unicode characters, that
370 * java.lang.Character needs. It is generated automatically from
371 * <code>$ARGV[0]</code> and
372 * <code>$ARGV[1]</code>, by some
373 * perl scripts. These Unicode definition files can be found on the
374 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
375 * JDK 1.4 uses Unicode version 3.0.0.
377 * The data is stored as string constants, but Character will convert these
378 * Strings to their respective <code>char[]</code> components. The field
379 * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
380 * characters within <code>DATA</code>. The DATA field, in turn, stores
381 * information about each character in the low order bits, and an offset
382 * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
383 * <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
384 * attribute tables are much smaller than 0xffff entries; as many characters
385 * in Unicode share common attributes. The DIRECTION table also contains
386 * a field for detecting characters with multi-character uppercase expansions.
387 * Next, there is a listing for <code>TITLE</code> exceptions (most characters
388 * just have the same title case as upper case). Finally, there are two
389 * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
390 * which lists the characters which are special cased, and
391 * <code>UPPER_EXPAND</code>, which lists their expansion.
393 * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
394 * Eric Blake)
395 * \@see Character
396 * \@see String
398 public interface CharData
401 * The Unicode definition file that was parsed to build this database.
403 String SOURCE = \"$ARGV[0]\";
406 * The character shift amount to look up the block offset. In other words,
407 * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
408 * <code>ch</code> is described in <code>DATA</code>.
410 int SHIFT = $bestshift;
413 * The mapping of character blocks to their location in <code>DATA</code>.
414 * Each entry has been adjusted so that the 16-bit sum with the desired
415 * character gives the actual index into <code>DATA</code>.
417 String BLOCKS
420 for ($i = 0; $i < @blocks / 11; $i++) {
421 print OUTPUT $i ? "\n + \"" : " = \"";
422 for $j (0 .. 10) {
423 last if @blocks <= $i * 11 + $j;
424 my $val = $blocks[$i * 11 + $j];
425 print OUTPUT javaChar($val);
427 print OUTPUT "\"";
430 print OUTPUT <<EOF;
434 * Information about each character. The low order 5 bits form the
435 * character type, the next bit is a flag for non-breaking spaces, and the
436 * next bit is a flag for mirrored directionality. The high order 9 bits
437 * form the offset into the attribute tables. Note that this limits the
438 * number of unique character attributes to 512, which is not a problem
439 * as of Unicode version 3.2.0, but may soon become one.
441 String DATA
444 my $len = length($bestblkstr) / 2;
445 for ($i = 0; $i < $len / 11; $i++) {
446 print OUTPUT $i ? "\n + \"" : " = \"";
447 for $j (0 .. 10) {
448 last if $len <= $i * 11 + $j;
449 my $val = unpack "n", substr($bestblkstr, 2 * ($i * 11 + $j), 2);
450 print OUTPUT javaChar($val);
452 print OUTPUT "\"";
455 print OUTPUT <<EOF;
459 * This is the attribute table for computing the numeric value of a
460 * character. The value is -1 if Unicode does not define a value, -2
461 * if the value is not a positive integer, otherwise it is the value.
462 * Note that this is a signed value, but stored as an unsigned char
463 * since this is a String literal.
465 String NUM_VALUE
468 $len = @charinfo;
469 for ($i = 0; $i < $len / 11; $i++) {
470 print OUTPUT $i ? "\n + \"" : " = \"";
471 for $j (0 .. 10) {
472 last if $len <= $i * 11 + $j;
473 my $val = $charinfo[$i * 11 + $j][0];
474 print OUTPUT javaChar($val);
476 print OUTPUT "\"";
479 print OUTPUT <<EOF;
483 * This is the attribute table for computing the single-character uppercase
484 * representation of a character. The value is the signed difference
485 * between the character and its uppercase version. Note that this is
486 * stored as an unsigned char since this is a String literal. When
487 * capitalizing a String, you must first check if a multi-character uppercase
488 * sequence exists before using this character.
490 String UPPER
493 $len = @charinfo;
494 for ($i = 0; $i < $len / 11; $i++) {
495 print OUTPUT $i ? "\n + \"" : " = \"";
496 for $j (0 .. 10) {
497 last if $len <= $i * 11 + $j;
498 my $val = $charinfo[$i * 11 + $j][1];
499 print OUTPUT javaChar($val);
501 print OUTPUT "\"";
504 print OUTPUT <<EOF;
508 * This is the attribute table for computing the lowercase representation
509 * of a character. The value is the signed difference between the
510 * character and its lowercase version. Note that this is stored as an
511 * unsigned char since this is a String literal.
513 String LOWER
516 $len = @charinfo;
517 for ($i = 0; $i < $len / 13; $i++) {
518 print OUTPUT $i ? "\n + \"" : " = \"";
519 for $j (0 .. 12) {
520 last if $len <= $i * 13 + $j;
521 my $val = $charinfo[$i * 13 + $j][2];
522 print OUTPUT javaChar($val);
524 print OUTPUT "\"";
527 print OUTPUT <<EOF;
531 * This is the attribute table for computing the directionality class
532 * of a character, as well as a marker of characters with a multi-character
533 * capitalization. The direction is taken by performing a signed shift
534 * right by 2 (where a result of -1 means an unknown direction, such as
535 * for undefined characters). The lower 2 bits form a count of the
536 * additional characters that will be added to a String when performing
537 * multi-character uppercase expansion. This count is also used, along with
538 * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
539 * when performing the case conversion. Note that this information is stored
540 * as an unsigned char since this is a String literal.
542 String DIRECTION
545 $len = @charinfo;
546 for ($i = 0; $i < $len / 17; $i++) {
547 print OUTPUT $i ? "\n + \"" : " = \"";
548 for $j (0 .. 16) {
549 last if $len <= $i * 17 + $j;
550 my $val = $charinfo[$i * 17 + $j][3];
551 print OUTPUT javaChar($val);
553 print OUTPUT "\"";
556 print OUTPUT <<EOF;
560 * This is the listing of titlecase special cases (all other characters
561 * can use <code>UPPER</code> to determine their titlecase). The listing
562 * is a sorted sequence of character pairs; converting the first character
563 * of the pair to titlecase produces the second character.
565 String TITLE
568 $len = length($titlecase) / 2;
569 for ($i = 0; $i < $len / 11; $i++) {
570 print OUTPUT $i ? "\n + \"" : " = \"";
571 for $j (0 .. 10) {
572 last if $len <= $i * 11 + $j;
573 my $val = unpack "n", substr($titlecase, 2 * ($i * 11 + $j), 2);
574 print OUTPUT javaChar($val);
576 print OUTPUT "\"";
579 print OUTPUT <<EOF;
583 * This is a listing of characters with multi-character uppercase sequences.
584 * A character appears in this list exactly when it has a non-zero entry
585 * in the low-order 2-bit field of DIRECTION. The listing is a sorted
586 * sequence of pairs (hence a binary search on the even elements is an
587 * efficient way to lookup a character). The first element of a pair is the
588 * character with the expansion, and the second is the index into
589 * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
590 * DIRECTION to determine where the expansion ends.
592 String UPPER_SPECIAL
595 my @list = sort {$a <=> $b} keys %special;
596 my $expansion = "";
597 my $offset = 0;
598 $len = @list;
599 for ($i = 0; $i < $len / 5; $i++) {
600 print OUTPUT $i ? "\n + \"" : " = \"";
601 for $j (0 .. 4) {
602 last if $len <= $i * 5 + $j;
603 my $ch = $list[$i * 5 + $j];
604 print OUTPUT javaChar($ch);
605 print OUTPUT javaChar($offset);
606 $offset += @{$special{$ch}};
607 $expansion .= pack "n*", @{$special{$ch}};
609 print OUTPUT "\"";
612 print OUTPUT <<EOF;
616 * This is the listing of special case multi-character uppercase sequences.
617 * Characters listed in UPPER_SPECIAL index into this table to find their
618 * uppercase expansion. Remember that you must also perform special-casing
619 * on two single-character sequences in the Turkish locale, which are not
620 * covered here in CharData.
622 String UPPER_EXPAND
625 $len = length($expansion) / 2;
626 for ($i = 0; $i < $len / 11; $i++) {
627 print OUTPUT $i ? "\n + \"" : " = \"";
628 for $j (0 .. 10) {
629 last if $len <= $i * 11 + $j;
630 my $val = unpack "n", substr($expansion, 2 * ($i * 11 + $j), 2);
631 print OUTPUT javaChar($val);
633 print OUTPUT "\"";
636 print OUTPUT ";\n}\n";
637 close OUTPUT;
639 print "\nDone.\n";