Forgot ChangeLog in previous commit.
[official-gcc.git] / libjava / scripts / unicode-muncher.pl
blob4c4edc9c146ee7390ac8187a7e0e6d043be8a69d
1 #!/usr/bin/perl -w
2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
3 # Copyright (C) 1998, 2002, 2004 Free Software Foundation, Inc.
5 # This file is part of GNU Classpath.
7 # GNU Classpath is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
12 # GNU Classpath is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GNU Classpath; see the file COPYING. If not, write to the
19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 # 02110-1301 USA.
22 # Linking this library statically or dynamically with other modules is
23 # making a combined work based on this library. Thus, the terms and
24 # conditions of the GNU General Public License cover the whole
25 # combination.
27 # As a special exception, the copyright holders of this library give you
28 # permission to link this library with independent modules to produce an
29 # executable, regardless of the license terms of these independent
30 # modules, and to copy and distribute the resulting executable under
31 # terms of your choice, provided that you also meet, for each linked
32 # independent module, the terms and conditions of the license of that
33 # module. An independent module is a module which is not derived from
34 # or based on this library. If you modify this library, you may extend
35 # this exception to your version of the library, but you are not
36 # obligated to do so. If you do not wish to do so, delete this
37 # exception statement from your version.
39 # Code for reading UnicodeData-3.0.0.txt and SpecialCasing-2.txt to generate
40 # the code for gnu.java.lang.CharData. The relevant files can be found here:
42 # http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
43 # http://www.unicode.org/Public/3.0-Update/SpecialCasing-2.txt
45 # Inspired by code from Jochen Hoenicke.
46 # author Eric Blake <ebb9@email.byu.edu>
48 # Usage: ./unicode-muncher.pl <UnicodeData.txt> <SpecialCasing> <CharData.java>
49 # where <UnicodeData.txt> is obtained from www.unicode.org (named
50 # UnicodeData-3.0.0.txt for Unicode version 3.0.0), <SpecialCasing>
51 # is obtained from www.unicode.org too (named SpecialCasing-2.txt for Unicode
52 # version 3.0.0), and <CharData.java> is the final location for the Java
53 # interface gnu.java.lang.CharData. As of JDK 1.4, use Unicode version 3.0.0
54 # for best results.
## Convert a 16-bit integer to the text of one character inside a Java
## source code String literal.
##
## Argument: an integer in -0x8000 .. 0xffff.  Negative values are treated
##           as signed 16-bit quantities and wrapped into 0x8000 .. 0xffff.
## Returns:  a three-digit octal escape for values below 0x20, a backslash
##           escape for '"' (0x22) and '\' (0x5c), the character itself for
##           the remaining values below 0x7f, and a \uXXXX escape otherwise.
## Dies:     with "Out of range" when the argument is outside the range above.
sub javaChar($) {
    my ($char) = @_;
    die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
    $char += 0x10000 if $char < 0;
    # Special case characters that must be escaped, or are shorter as ASCII.
    # The octal form is always padded to three digits, so a digit that
    # follows it in the literal cannot be absorbed into the escape.
    return sprintf("\\%03o", $char) if $char < 0x20;
    return "\\\"" if $char == 0x22;
    return "\\\\" if $char == 0x5c;
    return pack("C", $char) if $char < 0x7f;
    return sprintf("\\u%04x", $char);
}
## Convert the text UnicodeData file from www.unicode.org into a Java
## interface with string constants holding the compressed information.

# Bidirectional class abbreviations.  A character's direction code is the
# index of its class in this list.
my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);

# General category abbreviations.  A character's type code is the index of
# its category in this list; SKIPPED is a placeholder that keeps index 17
# unassigned.
my @TYPECODES = qw(
    Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
    SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf
);

# Flag bits OR-ed into the type code.
my $MIRRORED_FLAG = 64;
my $NOBREAK_FLAG = 32;

# State shared by the stages below.
my $range = 0;          # first code point of an open "First>" range, else 0
my $count = 0;          # progress counter for the dots printed while working
my $titlecase = "";     # packed (character, titlecase character) pairs
my @info = ();          # packed attribute record, indexed by code point
my %special = ();       # code point => [ multi-character uppercase expansion ]

if (@ARGV != 3) {
    die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>";
}

$| = 1;                 # unbuffered STDOUT so progress dots appear at once
print "GNU Classpath Unicode Attribute Database Generator 2.1\n",
      "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
# Stage 0: Parse the special casing file
#
# Fills %special with code point => [ code points of its uppercase
# expansion ] for every entry whose uppercase mapping has more than one
# character.
print "Parsing special casing file\n";
{
    # Three-argument open with a lexical handle: the file name is never
    # parsed for mode characters, and the handle is closed with this scope.
    open(my $specialFh, '<', $ARGV[1])
        or die "Can't open special casing file: $!\n";
    while (my $line = <$specialFh>) {
        next if $line =~ /^\#/;
        my ($ch, undef, undef, $upper) = split / *; */, $line;

        # This grabs only the special casing for multi-char uppercase. Note that
        # there are no multi-char lowercase, and that Sun ignores multi-char
        # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
        # which must be hardcoded in java.lang.String:
        #  \u03a3 (Sun ignores this special case)
        #  \u0049 - lowercases to \u0131, but only in Turkish locale
        #  \u0069 - uppercases to \u0130, but only in Turkish locale
        next unless defined $upper and $upper =~ / /;
        $special{hex $ch} = [map {hex} split ' ', $upper];
    }
    close $specialFh;
}
# Stage 1: Parse the attribute file
#
# For every code point up to 0xffff this stores in $info[code point] a
# record packed as five 16-bit words: type (with flag bits), numeric value,
# uppercase delta, lowercase delta, and direction << 2 plus the number of
# extra characters in a multi-character uppercase expansion.  Titlecase
# mappings that differ from the uppercase mapping are appended to $titlecase.
print "Parsing attributes file";
{
    # Index the code tables once instead of scanning them for each character.
    my (%typeOf, %dirOf);
    @typeOf{@TYPECODES} = (0 .. $#TYPECODES);
    @dirOf{@DIRCODES} = (0 .. $#DIRCODES);

    open(my $unicodeFh, '<', $ARGV[0])
        or die "Can't open Unicode attribute file: $!\n";
    while (my $line = <$unicodeFh>) {
        print "." unless $count++ % 1000;
        chomp $line;
        $line =~ s/\r//g;

        # The limit of -1 keeps trailing empty fields, so the case mapping
        # columns come back as "" rather than undef when a line ends in
        # ";;;;" (comparing undef values would warn under -w).
        my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef,
            $numeric, $mirrored, undef, undef, $upcase, $lowcase, $title)
            = split /;/, $line, -1;
        $ch = hex($ch);
        next if $ch > 0xffff;   # Only 16-bit code points are stored

        die "$ch: Unknown type: $category" unless exists $typeOf{$category};
        my $type = $typeOf{$category};
        $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
        $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);

        my $numValue;
        if ($numeric =~ /^[0-9]+$/) {
            $numValue = $numeric;
            die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
        } elsif ($numeric eq "") {
            # Special case sequences of 'a'-'z': ASCII and fullwidth Latin
            # letters get the values 10 .. 35.
            if ($ch >= 0x0041 && $ch <= 0x005a) {
                $numValue = $ch - 0x0037;
            } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
                $numValue = $ch - 0x0057;
            } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
                $numValue = $ch - 0xff17;
            } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
                $numValue = $ch - 0xff37;
            } else {
                $numValue = -1;     # no numeric value
            }
        } else {
            $numValue = -2;         # numeric field is not a plain integer
        }

        # Case mappings are stored as differences from the character itself.
        my $upperchar = $upcase ? hex($upcase) - $ch : 0;
        my $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
        if ($title ne $upcase) {
            my $titlechar = $title ? hex($title) : $ch;
            $titlecase .= pack("n2", $ch, $titlechar);
        }

        # Unknown bidirectional classes map to -1.  The low two bits hold the
        # count of additional characters in a multi-char uppercase expansion.
        my $direction = exists $dirOf{$bidir} ? $dirOf{$bidir} : -1;
        $direction <<= 2;
        $direction += $#{$special{$ch}} if defined $special{$ch};

        my $record = pack("n5", $type, $numValue, $upperchar, $lowerchar,
                          $direction);
        if ($range) {
            # A "First>" line must be followed by its "Last>" line; every
            # code point in between shares the attributes of the last one.
            die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
            for ($range + 1 .. $ch - 1) {
                $info[$_] = $record;
            }
            $range = 0;
        } elsif ($name =~ /First>$/) {
            $range = $ch;
        }
        $info[$ch] = $record;
    }
    close $unicodeFh;
}
# Stage 2: Compress the data structures
#
# First pass: collapse the per-character records into a table of unique
# attribute tuples (@charinfo) and build $info, a string holding one 16-bit
# word per code point: (index into @charinfo) << 7 | type-with-flags.
printf "\nCompressing data structures";
$count = 0;
my $info = "";          # string accumulator, two bytes per code point
my %charhash = ();      # packed record => its index in @charinfo
my @charinfo = ();      # unique [ numValue, upper, lower, direction ] tuples

for my $ch (0 .. 0xffff) {
    print "." unless $count++ % 0x1000;
    # Code points absent from the data file: type 0, numeric value -1, no
    # case mappings, direction -1 (stored shifted left by two, hence -4).
    $info[$ch] = pack("n5", 0, -1, 0, 0, -4) unless defined $info[$ch];

    my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
    if (! exists $charhash{$info[$ch]}) {
        push @charinfo, [ $numVal, $upper, $lower, $direction ];
        $charhash{$info[$ch]} = $#charinfo;
    }
    $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
}
my $charlen = @charinfo;
my $bestshift;              # block-size exponent with the smallest estimate
my $bestest = 1000000;      # smallest size estimate seen so far, in bytes
my $bestblkstr;             # merged block data belonging to $bestshift
# The attribute index is stored shifted left by 7 in a 16-bit word, which
# leaves nine bits for it.
die "Too many unique character entries: $charlen\n" if $charlen > 512;
print "\nUnique character entries: $charlen\n";

# Try block sizes of 2**3 .. 2**8 characters and keep the one whose merged
# data plus block index table is smallest.
for my $i (3 .. 8) {
    my $blksize = 1 << $i;
    my %blocks = ();
    my @blkarray = ();
    my ($j, $k);
    print "shift: $i";

    # Collect the distinct blocks; each character occupies two bytes of $info.
    for ($j = 0; $j < 0x10000; $j += $blksize) {
        my $blkkey = substr $info, 2 * $j, 2 * $blksize;
        if (! exists $blocks{$blkkey}) {
            push @blkarray, $blkkey;
            $blocks{$blkkey} = $#blkarray;
        }
    }
    my $blocklen = @blkarray * $blksize;    # total length in characters
    printf " before %5d", $blocklen;

    # Now we try to pack the blkarray as tight as possible by finding matching
    # heads and tails.  Overlaps are tried from the longest ($blksize - 1
    # characters) down to a single character.
    for ($j = $blksize - 1; $j > 0; $j--) {
        # Map each $j-character tail to the list of blocks ending with it.
        my %tails = ();
        for $k (0 .. $#blkarray) {
            next unless defined $blkarray[$k];
            my $len = length $blkarray[$k];
            my $tail = substr $blkarray[$k], $len - $j * 2;
            if (exists $tails{$tail}) {
                push @{$tails{$tail}}, $k;
            } else {
                $tails{$tail} = [ $k ];
            }
        }

        # tails are calculated, now calculate the heads and merge.
      BLOCK:
        for $k (0 .. $#blkarray) {
            next unless defined $blkarray[$k];
            my $tomerge = $k;
            while (1) {
                my $head = substr($blkarray[$tomerge], 0, $j * 2);
                my $entry = $tails{$head};
                next BLOCK unless defined $entry;

                # Take a block whose tail equals our head, but never the
                # block itself: rotate it to the back of the list, and give
                # up when it is the only candidate.
                my $other = shift @{$entry};
                if ($other == $tomerge) {
                    if (@{$entry}) {
                        push @{$entry}, $other;
                        $other = shift @{$entry};
                    } else {
                        push @{$entry}, $other;
                        next BLOCK;
                    }
                }
                if (@{$entry} == 0) {
                    delete $tails{$head};
                }

                # a match was found: append our block to the other one,
                # sharing the $j overlapping characters.
                my $merge = $blkarray[$other]
                    . substr($blkarray[$tomerge], $j * 2);
                $blocklen -= $j;

                if ($other < $tomerge) {
                    # Keep the result in the lower slot and re-point the tail
                    # list entry of the absorbed block at that slot.
                    $blkarray[$tomerge] = undef;
                    $blkarray[$other] = $merge;
                    my $len = length $merge;
                    my $tail = substr $merge, $len - $j * 2;
                    $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
                                      @{$tails{$tail}} ];
                    next BLOCK;
                }
                # Otherwise keep the result in our slot and try to extend it
                # further, since its head is now the other block's head.
                $blkarray[$tomerge] = $merge;
                $blkarray[$other] = undef;
            }
        }
    }

    # Concatenate whatever blocks survived the merging.
    my $blockstr = "";
    for $k (0 .. $#blkarray) {
        $blockstr .= $blkarray[$k] if defined $blkarray[$k];
    }

    die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
    # Two bytes per data character plus two bytes per block index entry
    # (there are 0x10000 >> $i blocks).
    my $estimate = 2 * $blocklen + (0x20000 >> $i);

    printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
    if ($estimate < $bestest) {
        $bestest = $estimate;
        $bestshift = $i;
        $bestblkstr = $blockstr;
    }
}
# Locate every block of the chosen size inside the merged data and record,
# per block, the 16-bit value that added to a character gives its index.
my @blocks;
my $blksize = 1 << $bestshift;
for (my $j = 0; $j < 0x10000; $j += $blksize) {
    my $blkkey = substr $info, 2 * $j, 2 * $blksize;
    my $index = index $bestblkstr, $blkkey;
    # Entries are two bytes wide, so a hit at an odd byte offset is not a
    # real match; keep searching from the next byte.
    while ($index != -1 && $index % 2) {
        $index = index $bestblkstr, $blkkey, $index + 1;
    }
    die "not found: $j" if $index == -1;
    push @blocks, ($index / 2 - $j) & 0xffff;
}
## Write a list of 16-bit values to a filehandle as the initializer of a
## Java String constant: " = " followed by one string literal per $perLine
## values, the continuation literals joined with "+" on new lines.
##
## Arguments: $fh      - output filehandle
##            $values  - reference to the array of values (each is passed
##                       to javaChar)
##            $perLine - number of values per string literal
## An empty list is written as an empty literal so that the constant still
## has an initializer.
sub emitJavaString {
    my ($fh, $values, $perLine) = @_;
    if (!@$values) {
        print {$fh} " = \"\"";
        return;
    }
    for (my $start = 0; $start < @$values; $start += $perLine) {
        my $end = $start + $perLine - 1;
        $end = $#$values if $end > $#$values;
        print {$fh} $start ? "\n + \"" : " = \"";
        print {$fh} javaChar($_) for @{$values}[$start .. $end];
        print {$fh} "\"";
    }
}

# Stage 3: Generate the file
# NOTE(review): these limits presumably guard the 65535-byte cap on a class
# file string constant at up to three UTF-8 bytes per char -- confirm.
die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
    if @blocks > 0xffff / 3;
die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
    if length($bestblkstr) > 0xffff / 3;

print "Generating $ARGV[2] with shift of $bestshift";

open(my $out, '>', $ARGV[2]) or die "Failed creating output file: $!\n";
print {$out} <<EOF;
/* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
Copyright (C) 2002 Free Software Foundation, Inc.
*** This file is generated by scripts/unicode-muncher.pl ***

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version. */

package gnu.java.lang;

/**
* This contains the info about the unicode characters, that
* java.lang.Character needs. It is generated automatically from
* <code>$ARGV[0]</code> and
* <code>$ARGV[1]</code>, by some
* perl scripts. These Unicode definition files can be found on the
* <a href="http://www.unicode.org">http://www.unicode.org</a> website.
* JDK 1.4 uses Unicode version 3.0.0.
*
* The data is stored as string constants, but Character will convert these
* Strings to their respective <code>char[]</code> components. The field
* <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
* characters within <code>DATA</code>. The DATA field, in turn, stores
* information about each character in the low order bits, and an offset
* into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
* <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
* attribute tables are much smaller than 0xffff entries; as many characters
* in Unicode share common attributes. The DIRECTION table also contains
* a field for detecting characters with multi-character uppercase expansions.
* Next, there is a listing for <code>TITLE</code> exceptions (most characters
* just have the same title case as upper case). Finally, there are two
* tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
* which lists the characters which are special cased, and
* <code>UPPER_EXPAND</code>, which lists their expansion.
*
* \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
* Eric Blake)
* \@see Character
* \@see String
*/
public interface CharData
{
/**
* The Unicode definition file that was parsed to build this database.
*/
String SOURCE = \"$ARGV[0]\";

/**
* The character shift amount to look up the block offset. In other words,
* <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
* <code>ch</code> is described in <code>DATA</code>.
*/
int SHIFT = $bestshift;

/**
* The mapping of character blocks to their location in <code>DATA</code>.
* Each entry has been adjusted so that the 16-bit sum with the desired
* character gives the actual index into <code>DATA</code>.
*/
String BLOCKS
EOF

emitJavaString($out, \@blocks, 11);

print {$out} <<EOF;
;

/**
* Information about each character. The low order 5 bits form the
* character type, the next bit is a flag for non-breaking spaces, and the
* next bit is a flag for mirrored directionality. The high order 9 bits
* form the offset into the attribute tables. Note that this limits the
* number of unique character attributes to 512, which is not a problem
* as of Unicode version 3.2.0, but may soon become one.
*/
String DATA
EOF

# The merged block data is a string of big-endian 16-bit words.
emitJavaString($out, [ unpack "n*", $bestblkstr ], 11);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the numeric value of a
* character. The value is -1 if Unicode does not define a value, -2
* if the value is not a positive integer, otherwise it is the value.
* Note that this is a signed value, but stored as an unsigned char
* since this is a String literal.
*/
String NUM_VALUE
EOF

emitJavaString($out, [ map { $_->[0] } @charinfo ], 11);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the single-character uppercase
* representation of a character. The value is the signed difference
* between the character and its uppercase version. Note that this is
* stored as an unsigned char since this is a String literal. When
* capitalizing a String, you must first check if a multi-character uppercase
* sequence exists before using this character.
*/
String UPPER
EOF

emitJavaString($out, [ map { $_->[1] } @charinfo ], 11);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the lowercase representation
* of a character. The value is the signed difference between the
* character and its lowercase version. Note that this is stored as an
* unsigned char since this is a String literal.
*/
String LOWER
EOF

emitJavaString($out, [ map { $_->[2] } @charinfo ], 13);

print {$out} <<EOF;
;

/**
* This is the attribute table for computing the directionality class
* of a character, as well as a marker of characters with a multi-character
* capitalization. The direction is taken by performing a signed shift
* right by 2 (where a result of -1 means an unknown direction, such as
* for undefined characters). The lower 2 bits form a count of the
* additional characters that will be added to a String when performing
* multi-character uppercase expansion. This count is also used, along with
* the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
* when performing the case conversion. Note that this information is stored
* as an unsigned char since this is a String literal.
*/
String DIRECTION
EOF

emitJavaString($out, [ map { $_->[3] } @charinfo ], 17);

print {$out} <<EOF;
;

/**
* This is the listing of titlecase special cases (all other characters
* can use <code>UPPER</code> to determine their titlecase). The listing
* is a sorted sequence of character pairs; converting the first character
* of the pair to titlecase produces the second character.
*/
String TITLE
EOF

emitJavaString($out, [ unpack "n*", $titlecase ], 11);

print {$out} <<EOF;
;

/**
* This is a listing of characters with multi-character uppercase sequences.
* A character appears in this list exactly when it has a non-zero entry
* in the low-order 2-bit field of DIRECTION. The listing is a sorted
* sequence of pairs (hence a binary search on the even elements is an
* efficient way to lookup a character). The first element of a pair is the
* character with the expansion, and the second is the index into
* UPPER_EXPAND where the expansion begins. Use the 2-bit field of
* DIRECTION to determine where the expansion ends.
*/
String UPPER_SPECIAL
EOF

# Build both special-casing tables in one pass: each character is paired
# with the offset at which its expansion starts in the expansion table.
my (@upperSpecial, @upperExpand);
for my $ch (sort {$a <=> $b} keys %special) {
    push @upperSpecial, $ch, scalar @upperExpand;
    push @upperExpand, @{$special{$ch}};
}
# Five (character, offset) pairs per literal.
emitJavaString($out, \@upperSpecial, 10);

print {$out} <<EOF;
;

/**
* This is the listing of special case multi-character uppercase sequences.
* Characters listed in UPPER_SPECIAL index into this table to find their
* uppercase expansion. Remember that you must also perform special-casing
* on two single-character sequences in the Turkish locale, which are not
* covered here in CharData.
*/
String UPPER_EXPAND
EOF

emitJavaString($out, \@upperExpand, 11);

print {$out} ";\n}\n";
# Buffered write errors only surface when the handle is closed.
close $out or die "Failed writing output file: $!\n";

print "\nDone.\n";