* Mainline merge as of 2006-02-16 (@111136).
[official-gcc.git] / gcc / java / gen-table.pl
bloba2f22135b5ba5b7a85a767e977e75793257b06ca
1 #! /usr/bin/perl
3 # Copyright (C) 2000, 2001, 2003 Free Software Foundation
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2, or (at your option)
8 # any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18 # 02110-1301, USA.
20 # gen-table.pl - Generate tables for gcj from Unicode data.
21 # Usage: perl gen-table.pl DATA-FILE
23 # You can find the Unicode data file here:
24 # ftp://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.txt
25 # Please update this URL when this program is used with a more
26 # recent version of the table. Note that this table cannot be
27 # distributed with gcc.
28 # This program should not be re-run indiscriminately. Care must be
29 # taken that what it generates is in sync with the Java specification.
31 # Names of fields in Unicode data table.
32 $CODE = 0;
33 $NAME = 1;
34 $CATEGORY = 2;
35 $COMBINING_CLASSES = 3;
36 $BIDI_CATEGORY = 4;
37 $DECOMPOSITION = 5;
38 $DECIMAL_VALUE = 6;
39 $DIGIT_VALUE = 7;
40 $NUMERIC_VALUE = 8;
41 $MIRRORED = 9;
42 $OLD_NAME = 10;
43 $COMMENT = 11;
44 $UPPER = 12;
45 $LOWER = 13;
46 $TITLE = 14;
48 # Start of special-cased gaps in Unicode data table.
49 %gaps = (
50 0x4e00 => "CJK",
51 0xac00 => "Hangul",
52 0xd800 => "Unassigned High Surrogate",
53 0xdb80 => "Private Use High Surrogate",
54 0xdc00 => "Low Surrogate",
55 0xe000 => "Private Use"
58 # This lists control characters which are also considered whitespace.
59 # This is a somewhat odd list, taken from the JCL definition of
60 # Character.isIdentifierIgnorable.
61 %whitespace_controls =
63 0x0009 => 1,
64 0x000a => 1,
65 0x000b => 1,
66 0x000c => 1,
67 0x000d => 1,
68 0x001c => 1,
69 0x001d => 1,
70 0x001e => 1,
71 0x001f => 1
74 open (INPUT, "< $ARGV[0]") || exit 1;
76 $last_code = -1;
77 while (<INPUT>)
79 chop;
80 @fields = split (';', $_, 30);
81 if ($#fields != 14)
83 print STDERR "Entry for $fields[$CODE] has wrong number of fields\n";
86 $code = hex ($fields[$CODE]);
87 last if $code > 0xffff;
88 if ($code > $last_code + 1)
90 # Found a gap.
91 if (defined $gaps{$code})
93 # Fill the gap with the last character read.
94 @gfields = @fields;
96 else
98 # The gap represents undefined characters. Only the type
99 # matters.
100 @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
101 '', '', '', '');
103 for (++$last_code; $last_code < $code; ++$last_code)
105 $gfields{$CODE} = sprintf ("%04x", $last_code);
106 &process_one ($last_code, @gfields);
109 &process_one ($code, @fields);
110 $last_code = $code;
113 close (INPUT);
115 @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
116 '', '', '', '');
117 for (++$last_code; $last_code < 0x10000; ++$last_code)
119 $gfields{$CODE} = sprintf ("%04x", $last_code);
120 &process_one ($last_code, @gfields);
122 --$last_code; # Want last to be 0xFFFF.
124 &print_tables ($last_code);
126 exit 0;
128 # Process a single character.
129 sub process_one
131 my ($code, @fields) = @_;
133 my @value = ();
134 my $type = $fields[$CATEGORY];
136 # See if the character is a valid identifier start.
137 if ($type =~ /L./ # Letter
138 || $type eq 'Pc' # Connecting punctuation
139 || $type eq 'Sc') # Currency symbol
141 push (@value, 'LETTER_START');
144 # See if the character is a valid identifier member.
145 if ($type =~ /L./ # Letter
146 || $type eq 'Pc' # Connecting punctuation
147 || $type eq 'Sc' # Currency symbol
148 || $type =~ /N[dl]/ # Number: decimal or letter
149 || $type =~ /M[nc]/ # Mark: non-spacing or combining
150 || ($type eq 'Cc' # Certain controls
151 && ! defined $whitespace_controls{$code})
152 || ($code >= 0x200c # Join controls
153 && $code <= 0x200f)
154 || ($code >= 0x202a # Bidi controls -- note that there
155 # is a typo in the JCL where these are
156 # concerned.
157 && $code <= 0x202e)
158 || ($code >= 0x206a # Format controls
159 && $code <= 0x206f)
160 || $code == 0xfeff) # ZWNBSP
162 push (@value, 'LETTER_PART');
165 if (($type =~ /Z./
166 # Java treats some values specially as non-spaces.
167 && $code != 0x00a0
168 && $code != 0x2007
169 && $code != 0x202f)
170 # And for our purposes there are some that should be specially
171 # treated as spaces.
172 || $code == 0x000b
173 || ($code >= 0x001c && $code <= 0x001f))
175 push (@value, 'LETTER_SPACE');
178 if (! @value)
180 $value = '0';
182 else
184 $value = '(' . join (' | ', @value) . ')';
187 $map[$code] = $value;
190 sub print_tables
192 my ($last) = @_;
194 local ($bytes_out) = 0;
196 open (OUT, "> chartables.h");
198 print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
199 print OUT " Instead, edit gen-table.pl and re-run. */\n\n";
201 print OUT "#ifndef GCC_CHARTABLES_H\n";
202 print OUT "#define GCC_CHARTABLES_H\n\n";
204 print OUT "#define LETTER_START 1\n";
205 print OUT "#define LETTER_PART 2\n";
206 print OUT "#define LETTER_SPACE 4\n\n";
207 print OUT "#define LETTER_MASK 7\n\n";
209 for ($count = 0; $count <= $last; $count += 256)
211 $row[$count / 256] = &print_row ($count, '(char *) ', 'const char', 1,
212 'page');
215 print OUT "static const char *const type_table[256] = {\n";
216 for ($count = 0; $count <= $last; $count += 256)
218 print OUT ",\n" if $count > 0;
219 print OUT " ", $row[$count / 256];
220 $bytes_out += 4;
222 print OUT "\n};\n\n";
224 print OUT "#endif /* ! GCC_CHARTABLES_H */\n";
226 close (OUT);
228 printf "Generated %d bytes\n", $bytes_out;
231 # Print a single "row" of a two-level table.
232 sub print_row
234 my ($start, $def_pfx, $typname, $typsize, $name) = @_;
236 my ($i);
237 my (@values);
238 my ($flag) = 1;
239 my ($off);
240 for ($off = 0; $off < 256; ++$off)
242 $values[$off] = $map[$off + $start];
243 if ($values[$off] ne $values[0])
245 $flag = 0;
248 if ($flag)
250 return $def_pfx . $values[0];
253 printf OUT "static %s %s%d[256] = {\n ", $typname, $name, $start / 256;
254 my ($column) = 2;
255 for ($i = $start; $i < $start + 256; ++$i)
257 print OUT ", "
258 if $i > $start;
259 my ($text) = $values[$i - $start];
260 if (length ($text) + $column + 2 > 78)
262 print OUT "\n ";
263 $column = 2;
265 print OUT $text;
266 $column += length ($text) + 2;
268 print OUT "\n};\n\n";
270 $bytes_out += 256 * $typsize;
272 return sprintf "%s%d", $name, $start / 256;