2003-12-26 Guilhem Lavaux <guilhem@kaffe.org>
[official-gcc.git] / libjava / scripts / unicode-muncher.pl
blob073bc69941466819feed892f1733c7746da4c65a
1 #!/usr/bin/perl -w
2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
3 # Copyright (C) 1998, 2002 Free Software Foundation, Inc.
5 # This file is part of GNU Classpath.
7 # GNU Classpath is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
12 # GNU Classpath is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GNU Classpath; see the file COPYING. If not, write to the
19 # Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20 # 02111-1307 USA.
22 # Linking this library statically or dynamically with other modules is
23 # making a combined work based on this library. Thus, the terms and
24 # conditions of the GNU General Public License cover the whole
25 # combination.
27 # As a special exception, the copyright holders of this library give you
28 # permission to link this library with independent modules to produce an
29 # executable, regardless of the license terms of these independent
30 # modules, and to copy and distribute the resulting executable under
31 # terms of your choice, provided that you also meet, for each linked
32 # independent module, the terms and conditions of the license of that
33 # module. An independent module is a module which is not derived from
34 # or based on this library. If you modify this library, you may extend
35 # this exception to your version of the library, but you are not
36 # obligated to do so. If you do not wish to do so, delete this
37 # exception statement from your version.
39 # Code for reading UnicodeData.txt and generating the code for
40 # gnu.java.lang.CharData. For now, the relevant Unicode definition files
41 # are found in libjava/gnu/gcj/convert/.
43 # Inspired by code from Jochen Hoenicke.
44 # author Eric Blake <ebb9@email.byu.edu>
46 # Usage: ./unicode-muncher <UnicodeData.txt> <CharData.java>
47 # where <UnicodeData.txt> is obtained from www.unicode.org (named
48 # UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>
49 # is the final location for the Java interface gnu.java.lang.CharData.
50 # As of JDK 1.4, use Unicode version 3.0.0 for best results.
53 ## Convert a 16-bit integer to a Java source code String literal character
55 sub javaChar($) {
56 my ($char) = @_;
57 die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
58 $char += 0x10000 if $char < 0;
59 # Special case characters that must be escaped, or are shorter as ASCII
60 return sprintf("\\%03o", $char) if $char < 0x20;
61 return "\\\"" if $char == 0x22;
62 return "\\\\" if $char == 0x5c;
63 return pack("C", $char) if $char < 0x7f;
64 return sprintf("\\u%04x", $char);
68 ## Convert the text UnicodeData file from www.unicode.org into a Java
69 ## interface with string constants holding the compressed information.
71 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
72 SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
73 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
75 my $NOBREAK_FLAG = 32;
76 my $MIRRORED_FLAG = 64;
78 my @info = ();
79 my $titlecase = "";
80 my $count = 0;
81 my $range = 0;
83 die "Usage: $0 <UnicodeData.txt> <CharData.java>" unless @ARGV == 2;
84 open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
86 # Stage 1: Parse the attribute file
87 $| = 1;
88 print "GNU Classpath Unicode Attribute Database Generator 2.0\n";
89 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
90 print "Parsing attributes file";
91 while(<UNICODE>) {
92 print "." unless $count++ % 1000;
93 chomp;
94 s/\r//g;
95 my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
96 $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
97 $ch = hex($ch);
98 next if $ch > 0xffff; # Ignore surrogate pairs, since Java does
100 my ($type, $numValue, $upperchar, $lowerchar, $direction);
102 $type = 0;
103 while ($category !~ /^$TYPECODES[$type]$/) {
104 if (++$type == @TYPECODES) {
105 die "$ch: Unknown type: $category";
108 $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
109 $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
111 if ($numeric =~ /^[0-9]+$/) {
112 $numValue = $numeric;
113 die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
114 } elsif ($numeric eq "") {
115 # Special case sequences of 'a'-'z'
116 if ($ch >= 0x0041 && $ch <= 0x005a) {
117 $numValue = $ch - 0x0037;
118 } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
119 $numValue = $ch - 0x0057;
120 } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
121 $numValue = $ch - 0xff17;
122 } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
123 $numValue = $ch - 0xff37;
124 } else {
125 $numValue = -1;
127 } else {
128 $numValue = -2;
131 $upperchar = $upcase ? hex($upcase) - $ch : 0;
132 $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
133 if ($title ne $upcase) {
134 my $titlechar = $title ? hex($title) : $ch;
135 $titlecase .= pack("n2", $ch, $titlechar);
138 $direction = 0;
139 while ($bidir !~ /^$DIRCODES[$direction]$/) {
140 if (++$direction == @DIRCODES) {
141 $direction = -1;
142 last;
146 if ($range) {
147 die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
148 for ($range + 1 .. $ch - 1) {
149 $info[$_] = pack("n5", $type, $numValue, $upperchar,
150 $lowerchar, $direction);
152 $range = 0;
153 } elsif ($name =~ /First>$/) {
154 $range = $ch;
156 $info[$ch] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
157 $direction);
159 close UNICODE;
161 # Stage 2: Compress the data structures
162 printf "\nCompressing data structures";
163 $count = 0;
164 my $info = ();
165 my %charhash = ();
166 my @charinfo = ();
168 for my $ch (0 .. 0xffff) {
169 print "." unless $count++ % 0x1000;
170 if (! defined $info[$ch]) {
171 $info[$ch] = pack("n5", 0, -1, 0, 0, -1);
174 my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
175 if (! exists $charhash{$info[$ch]}) {
176 push @charinfo, [ $numVal, $upper, $lower, $direction ];
177 $charhash{$info[$ch]} = $#charinfo;
179 $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
182 my $charlen = @charinfo;
183 my $bestshift;
184 my $bestest = 1000000;
185 my $bestblkstr;
186 die "Too many unique character entries: $charlen\n" if $charlen > 512;
187 print "\nUnique character entries: $charlen\n";
189 for my $i (3 .. 8) {
190 my $blksize = 1 << $i;
191 my %blocks = ();
192 my @blkarray = ();
193 my ($j, $k);
194 print "shift: $i";
196 for ($j = 0; $j < 0x10000; $j += $blksize) {
197 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
198 if (! exists $blocks{$blkkey}) {
199 push @blkarray, $blkkey;
200 $blocks{$blkkey} = $#blkarray;
203 my $blknum = @blkarray;
204 my $blocklen = $blknum * $blksize;
205 printf " before %5d", $blocklen;
207 # Now we try to pack the blkarray as tight as possible by finding matching
208 # heads and tails.
209 for ($j = $blksize - 1; $j > 0; $j--) {
210 my %tails = ();
211 for $k (0 .. $#blkarray) {
212 next if ! defined $blkarray[$k];
213 my $len = length $blkarray[$k];
214 my $tail = substr $blkarray[$k], $len - $j * 2;
215 if (exists $tails{$tail}) {
216 push @{$tails{$tail}}, $k;
217 } else {
218 $tails{$tail} = [ $k ];
222 # tails are calculated, now calculate the heads and merge.
223 BLOCK:
224 for $k (0 .. $#blkarray) {
225 next if ! defined $blkarray[$k];
226 my $tomerge = $k;
227 while (1) {
228 my $head = substr($blkarray[$tomerge], 0, $j * 2);
229 my $entry = $tails{$head};
230 next BLOCK if ! defined $entry;
232 my $other = shift @{$entry};
233 if ($other == $tomerge) {
234 if (@{$entry}) {
235 push @{$entry}, $other;
236 $other = shift @{$entry};
237 } else {
238 push @{$entry}, $other;
239 next BLOCK;
242 if (@{$entry} == 0) {
243 delete $tails{$head};
246 # a match was found
247 my $merge = $blkarray[$other]
248 . substr($blkarray[$tomerge], $j * 2);
249 $blocklen -= $j;
250 $blknum--;
252 if ($other < $tomerge) {
253 $blkarray[$tomerge] = undef;
254 $blkarray[$other] = $merge;
255 my $len = length $merge;
256 my $tail = substr $merge, $len - $j * 2;
257 $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
258 @{$tails{$tail}} ];
259 next BLOCK;
261 $blkarray[$tomerge] = $merge;
262 $blkarray[$other] = undef;
266 my $blockstr;
267 for $k (0 .. $#blkarray) {
268 $blockstr .= $blkarray[$k] if defined $blkarray[$k];
271 die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
272 my $estimate = 2 * $blocklen + (0x20000 >> $i);
274 printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
275 if ($estimate < $bestest) {
276 $bestest = $estimate;
277 $bestshift = $i;
278 $bestblkstr = $blockstr;
282 my @blocks;
283 my $blksize = 1 << $bestshift;
284 for (my $j = 0; $j < 0x10000; $j += $blksize) {
285 my $blkkey = substr $info, 2 * $j, 2 * $blksize;
286 my $index = index $bestblkstr, $blkkey;
287 while ($index & 1) {
288 die "not found: $j" if $index == -1;
289 $index = index $bestblkstr, $blkkey, $index + 1;
291 push @blocks, ($index / 2 - $j) & 0xffff;
294 # Phase 3: Generate the file
295 die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
296 if @blocks > 0xffff / 3;
297 die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
298 if length($bestblkstr) > 0xffff / 3;
300 print "Generating $ARGV[1] with shift of $bestshift";
301 my ($i, $j);
303 open OUTPUT, "> $ARGV[1]" or die "Failed creating output file: $!\n";
304 print OUTPUT <<EOF;
305 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
306 Copyright (C) 2002 Free Software Foundation, Inc.
307 *** This file is generated by scripts/unicode-muncher.pl ***
309 This file is part of GNU Classpath.
311 GNU Classpath is free software; you can redistribute it and/or modify
312 it under the terms of the GNU General Public License as published by
313 the Free Software Foundation; either version 2, or (at your option)
314 any later version.
316 GNU Classpath is distributed in the hope that it will be useful, but
317 WITHOUT ANY WARRANTY; without even the implied warranty of
318 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
319 General Public License for more details.
321 You should have received a copy of the GNU General Public License
322 along with GNU Classpath; see the file COPYING. If not, write to the
323 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
324 02111-1307 USA.
326 Linking this library statically or dynamically with other modules is
327 making a combined work based on this library. Thus, the terms and
328 conditions of the GNU General Public License cover the whole
329 combination.
331 As a special exception, the copyright holders of this library give you
332 permission to link this library with independent modules to produce an
333 executable, regardless of the license terms of these independent
334 modules, and to copy and distribute the resulting executable under
335 terms of your choice, provided that you also meet, for each linked
336 independent module, the terms and conditions of the license of that
337 module. An independent module is a module which is not derived from
338 or based on this library. If you modify this library, you may extend
339 this exception to your version of the library, but you are not
340 obligated to do so. If you do not wish to do so, delete this
341 exception statement from your version. */
343 package gnu.java.lang;
346 * This contains the info about the unicode characters, that
347 * java.lang.Character needs. It is generated automatically from
348 * <code>$ARGV[0]</code>, by some
349 * perl scripts. This Unicode definition file can be found on the
350 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
351 * JDK 1.4 uses Unicode version 3.0.0.
353 * The data is stored as string constants, but Character will convert these
354 * Strings to their respective <code>char[]</code> components. The field
355 * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
356 * characters within <code>DATA</code>. The DATA field, in turn, stores
357 * information about each character in the low order bits, and an offset
358 * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
359 * <code>NUM_VALUE</code>, and <code>DIRECTION</code>. Notice that the
360 * attribute tables are much smaller than 0xffff entries; as many characters
361 * in Unicode share common attributes. Finally, there is a listing for
362 * <code>TITLE</code> exceptions (most characters just have the same
363 * title case as upper case).
365 * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
366 * Eric Blake)
367 * \@see Character
369 public interface CharData
372 * The Unicode definition file that was parsed to build this database.
374 String SOURCE = \"$ARGV[0]\";
377 * The character shift amount to look up the block offset. In other words,
378 * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
379 * <code>ch</code> is described in <code>DATA</code>.
381 int SHIFT = $bestshift;
384 * The mapping of character blocks to their location in <code>DATA</code>.
385 * Each entry has been adjusted so that the 16-bit sum with the desired
386 * character gives the actual index into <code>DATA</code>.
388 String BLOCKS
391 for ($i = 0; $i < @blocks / 11; $i++) {
392 print OUTPUT $i ? "\n + \"" : " = \"";
393 for $j (0 .. 10) {
394 last if @blocks <= $i * 11 + $j;
395 my $val = $blocks[$i * 11 + $j];
396 print OUTPUT javaChar($val);
398 print OUTPUT "\"";
401 print OUTPUT <<EOF;
405 * Information about each character. The low order 5 bits form the
406 * character type, the next bit is a flag for non-breaking spaces, and the
407 * next bit is a flag for mirrored directionality. The high order 9 bits
408 * form the offset into the attribute tables. Note that this limits the
409 * number of unique character attributes to 512, which is not a problem
410 * as of Unicode version 3.2.0, but may soon become one.
412 String DATA
415 my $len = length($bestblkstr) / 2;
416 for ($i = 0; $i < $len / 11; $i++) {
417 print OUTPUT $i ? "\n + \"" : " = \"";
418 for $j (0 .. 10) {
419 last if $len <= $i * 11 + $j;
420 my $val = unpack "n", substr($bestblkstr, 2 * ($i*11 + $j), 2);
421 print OUTPUT javaChar($val);
423 print OUTPUT "\"";
426 print OUTPUT <<EOF;
430 * This is the attribute table for computing the numeric value of a
431 * character. The value is -1 if Unicode does not define a value, -2
432 * if the value is not a positive integer, otherwise it is the value.
433 * Note that this is a signed value, but stored as an unsigned char
434 * since this is a String literal.
436 String NUM_VALUE
439 $len = @charinfo;
440 for ($i = 0; $i < $len / 11; $i++) {
441 print OUTPUT $i ? "\n + \"" : " = \"";
442 for $j (0 .. 10) {
443 last if $len <= $i * 11 + $j;
444 my $val = $charinfo[$i * 11 + $j][0];
445 print OUTPUT javaChar($val);
447 print OUTPUT "\"";
450 print OUTPUT <<EOF;
454 * This is the attribute table for computing the uppercase representation
455 * of a character. The value is the signed difference between the
456 * character and its uppercase version. Note that this is stored as an
457 * unsigned char since this is a String literal.
459 String UPPER
462 $len = @charinfo;
463 for ($i = 0; $i < $len / 11; $i++) {
464 print OUTPUT $i ? "\n + \"" : " = \"";
465 for $j (0 .. 10) {
466 last if $len <= $i * 11 + $j;
467 my $val = $charinfo[$i * 11 + $j][1];
468 print OUTPUT javaChar($val);
470 print OUTPUT "\"";
473 print OUTPUT <<EOF;
477 * This is the attribute table for computing the lowercase representation
478 * of a character. The value is the signed difference between the
479 * character and its lowercase version. Note that this is stored as an
480 * unsigned char since this is a String literal.
482 String LOWER
485 $len = @charinfo;
486 for ($i = 0; $i < $len / 11; $i++) {
487 print OUTPUT $i ? "\n + \"" : " = \"";
488 for $j (0 .. 10) {
489 last if $len <= $i * 11 + $j;
490 my $val = $charinfo[$i * 11 + $j][2];
491 print OUTPUT javaChar($val);
493 print OUTPUT "\"";
496 print OUTPUT <<EOF;
500 * This is the attribute table for computing the directionality class
501 * of a character. At present, the value is in the range 0 - 18 if the
502 * character has a direction, otherwise it is -1. Note that this is
503 * stored as an unsigned char since this is a String literal.
505 String DIRECTION
508 $len = @charinfo;
509 for ($i = 0; $i < $len / 11; $i++) {
510 print OUTPUT $i ? "\n + \"" : " = \"";
511 for $j (0 .. 10) {
512 last if $len <= $i * 11 + $j;
513 my $val = $charinfo[$i * 11 + $j][3];
514 print OUTPUT javaChar($val);
516 print OUTPUT "\"";
519 print OUTPUT <<EOF;
523 * This is the listing of titlecase special cases (all other character
524 * can use <code>UPPER</code> to determine their titlecase). The listing
525 * is a sequence of character pairs; converting the first character of the
526 * pair to titlecase produces the second character.
528 String TITLE
531 $len = length($titlecase) / 2;
532 for ($i = 0; $i < $len / 11; $i++) {
533 print OUTPUT $i ? "\n + \"" : " = \"";
534 for $j (0 .. 10) {
535 last if $len <= $i * 11 + $j;
536 my $val = unpack "n", substr($titlecase, 2 * ($i*11 + $j), 2);
537 print OUTPUT javaChar($val);
539 print OUTPUT "\"";
542 print OUTPUT ";\n}\n";
543 close OUTPUT;
545 print "\nDone.\n";