CLOSED TREE: TraceMonkey merge head. (a=blockers)
[mozilla-central.git] / intl / unicharutil / tools / ccmapbin.pl
blob a2c9fec75bd7abf88d450cea1f15f898cb5d5408
#!/usr/bin/perl -w
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Mozilla Communicator.
#
# The Initial Developer of the Original Code is
# Jungshik Shin <jshin@mailaps.org>.
# Portions created by the Initial Developer are Copyright (C) 2002, 2003
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

# This script is used to generate precompiled CCMap files.
# See bug 180266 for details.
#
# Revised to support extended CCMaps for non-BMP characters : 2003-09-19 (bug 205387)
# Revised to support the automatic generation of a macro defining the size
# of a CCMap in terms of PRUint16 : 2003-12-11 (bug 224337)
use strict;

# Package globals shared by the top-level script and the subs in this file.
#   $fill_fmt, $fu_sz           : pack() template for one fill unit and the
#                                 byte length of one packed unit.
#   $e_mid_offset, $e_pg_offset : offsets used for the empty mid-pointer
#                                 array and the empty page of a plane ccmap.
# 'our' replaces the obsolete 'use vars' pragma; the variables stay package
# globals and remain visible to every sub in the file.
our ($fill_fmt, $fu_sz);
our ($e_mid_offset, $e_pg_offset);
# Command line: input_file [variable [class]]
# At least the input file is required; usage() prints help and exits.
@ARGV >= 1 or usage();

my $ifn = $ARGV[0];

my ($ifh, $variable, $class);
# Three-argument open so the file name is never interpreted as a mode,
# and report the OS error when the open fails.
open $ifh, '<', $ifn or die "Cannot open $ifn: $!";

# A variable name given on the command line takes precedence over a
# VARIABLE:: line in the input file.
if (@ARGV >= 2) {
  $variable = $ARGV[1];
  # print, not printf: the message interpolates user input and must not be
  # treated as a format string.
  print STDERR
    "$0:\n\t VARIABLE $variable is specified in the command line.\n" .
    "\t The variable name spec. in the input file will be ignored.\n";
}

# Likewise, a class name given on the command line overrides CLASS::.
if (@ARGV >= 3) {
  $class = $ARGV[2];
  print STDERR
    "$0:\n\t CLASS $class is specified in the command line.\n" .
    "\t The class spec. in the input file will be ignored.\n";
}
use constant N_PLANES => 17; # BMP + 16 non-BMP planes
use constant PLANE_SZ => 0x10000;
use constant MID_SZ   => PLANE_SZ / 16;
use constant PG_SZ    => MID_SZ / 16;

# Unlike FillInfo() method in Mozilla, let's use 16bit integer
# to pack the character coverage/representability. This way,
# we can just copy fillinfo to fill up page maps later.
# Each *_FILL_SZ is therefore the number of 16-bit fill units needed to
# cover a plane, a mid block and a page respectively.
use constant {
  FILL_SZ     => PLANE_SZ / 16,
  MID_FILL_SZ => MID_SZ / 16,
  PG_FILL_SZ  => PG_SZ / 16
};

# network byte order short. actually, byte order doesn't matter.
$fill_fmt = "n1";
$fu_sz = length(pack $fill_fmt, 0); # fillinfo unit size in byte (size of short)

# Offsets inside a single plane's ccmap: add_plane() lays out 16 upper
# pointers first, then the empty mid-pointer array, then the empty page.
$e_mid_offset = 16;
$e_pg_offset  = 32;
my @ccmap = ();     # the complete CCMap, as a list of 16-bit values
my %pg_flags = ();  # offset => flag; marks rows needing endian-specific order
my @fillinfo = ();  # coverage bitmap filled in by read_input()
my %comments = ();  # VARIABLE/CLASS/DESC/FILE plus per-character comments

# read_input() returns a bitfield with bit N set when plane N is populated.
my $planes = read_input(\@fillinfo, $ifh, \%comments);

if (!defined($variable) && !defined($comments{'VARIABLE'}))
{
  print STDERR "Variable name is not specified in the cmd line. " .
    "Neither is it found in the input file.\n\n" ;
  usage();
}

$variable = $comments{'VARIABLE'} unless defined($variable);

if (!defined($class) && !defined($comments{'CLASS'}))
{
  print STDERR "Class name is not specified in the cmd line. " .
    "Neither is it found in the input file.\n\n" ;
  usage();
}

$class = $comments{'CLASS'} unless defined($class);

my $have_non_bmp = 0;

# add the non_bmp flag and the bmp ccmap size (default to 0)
# at the very beginning if there are non-bmp characters.
# Bits 1..16 of $planes stand for planes 1..16, so the mask has to be
# 0x1fffe; the former 0x1fe only looked at planes 1..8 and silently dropped
# characters whose only non-BMP presence was in planes 9..16.
if ($planes & 0x1fffe) {
  push @ccmap, (1, 0);
  $have_non_bmp = 1;
}

my $plane_idx_offset;
foreach my $plane (0 .. ($have_non_bmp ? 16 : 0))
{
  my @plane_ccmap = add_plane(\@ccmap, \@fillinfo, $plane);
  my $size = @plane_ccmap;
  push @ccmap, @plane_ccmap;
  if ($plane == 0 && $have_non_bmp) {
    $ccmap[1] = $size;
    # add 2 for non-BMP flag and BMP plane size
    # that have negative indices in C++.
    $plane_idx_offset = $size + 2;

    # 'Flag' the offset as holding the plane indices (any negative
    # number would do)
    $pg_flags{$plane_idx_offset} = -1;
    $pg_flags{$plane_idx_offset + 16} = -1;

    # plane indices are 16 PRUint32's(not 16 PRUint16's).
    # In Perl, we assign each PRUint32 two slots in @ccmap (in BE order)
    my $e_plane_offset = $size + 16 * 2;

    # set plane indices to the empty plane by default
    foreach my $i (1 .. 16) {
      # split PRUint32 into two PRUint16's in BE
      push @ccmap, $e_plane_offset >> 16;
      push @ccmap, $e_plane_offset & 0xffff;
    }
    # add 'the' empty plane;
    push @ccmap, (0) x 16;
  }
  if ($plane > 0 && $size > 0) {
    # split PRUint32 into two PRUint16's in BE.
    # subtract 2 for non-BMP flag and BMP plane size
    # that have negative indices in C++.
    my $plane_start = @ccmap - $size - 2;
    my $slot = $plane_idx_offset + ($plane - 1) * 2;
    $ccmap[$slot]     = $plane_start >> 16;
    $ccmap[$slot + 1] = $plane_start & 0xffff;
  }
}

print_ccmap(\@ccmap, \%pg_flags, $variable, $class, \%comments, $have_non_bmp);

exit 0;

# END of Main
# Print the command-line help to STDERR and terminate with exit status 1.
# Takes no arguments and never returns.
sub usage
{
  print STDERR <<USAGE;
Usage: $0 input_file [variable [class]]

The output file "class.ccmap" will be generated with
all three cases LE(16/32/64bit)/BE(16bit), BE(32bit), and BE(64bit)
put together. 'variable' will be used to name two macros, one for
dimensioning the size of a PRUint16[] and the other for the array
initializer.

When 'variable' is omitted, it has to be specified in the input file with
the following syntax.

   VARIABLE:: variable

When 'class' is omitted, it has to be specified in the input file with
the following syntax.

   CLASS:: class_name

USAGE

  exit 1;
}
# Parse the character list and fill in the coverage bitmap.
#
#   $fillinfo_p : array ref; reset to FILL_SZ * N_PLANES zeros, then one bit
#                 is set per accepted code point (16 code points per element).
#   $input      : open file handle to read from.
#   $comments_p : hash ref; receives 'VARIABLE', 'CLASS', 'DESC' and 'FILE'
#                 from the corresponding "KEY::" lines, plus one entry per
#                 accepted code point keyed as "0XHHHHHH" holding its comment
#                 text (empty string when there is none).
#
# Returns a bitfield in which bit N is set when plane N has a character.
# Lines that are neither a "KEY::" line nor start with 0x + 4 hex digits are
# skipped; surrogates and values above 0x10ffff are reported and skipped.
sub read_input
{
  my ($fillinfo_p, $input, $comments_p) = @_;
  @$fillinfo_p = (0) x (FILL_SZ * N_PLANES);

  # init bitfield for plane flags (17bits : BMP + 16 non-BMP planes)
  my $planes = 0;
  my $lc = 0;
  # A lexical line variable keeps the caller's $_ untouched.
  while (my $line = <$input>)
  {
    $lc++;
    chomp $line;

    # Trailing whitespace (including a CR from CRLF input) is tolerated so
    # that the line is not silently ignored; CLASS:: already allowed it.
    if ($line =~ /^\s*VARIABLE::\s*([a-zA-Z][a-zA-Z0-9_]*)\s*$/) {
      $comments_p->{'VARIABLE'} = $1;
      next;
    }
    if ($line =~ /^\s*CLASS::/) {
      ($comments_p->{'CLASS'} = $line)
        =~ s/^\s*CLASS::\s*([a-zA-Z0-9_]+).*$/$1/;
      next;
    }
    if ($line =~ /^\s*DESCRIPTION::/) {
      ($comments_p->{'DESC'} = $line) =~ s/^\s*DESCRIPTION::\s*//;
      next;
    }
    if ($line =~ /^\s*FILE::/) {
      ($comments_p->{'FILE'} = $line) =~ s/^\s*FILE::\s*//;
      next;
    }

    next unless $line =~ /^\s*0[Xx][0-9A-Fa-f]{4}/;

    # First field is the code point, the remainder (if any) is the comment.
    (my $entry = $line) =~ s/^\s+//;
    my ($u, $comment) = split /\s+/, $entry, 2;
    $u =~ s/,//g;
    $u = lc $u;
    # Reject anything that is not purely hexadecimal after the 0x prefix.
    next if substr($u, 2) =~ /[^0-9a-f]/;

    my $usv = oct $u;
    if ( 0xd800 <= $usv && $usv <= 0xdfff || # surrogate code points
         $usv > 0x10ffff ) {
      printf STDERR "Invalid input %s at %4d\n", $u, $lc;
      next;
    }
    # One bit per code point, 16 code points per fillinfo element.
    $fillinfo_p->[($usv >> 4)] |= (1 << ($usv & 0x0f));

    # turn on plane flags
    $planes |= (1 << ($usv >> 16));

    my $key = sprintf("0X%06X", $usv);
    $comments_p->{$key} = "";

    # Remove '/*' and '*/' (C style comment) or '//' (C++ style comment)
    # or ':' and store only the textual content of the comment.
    if (defined($comment)) {
      ($comments_p->{$key} = $comment)
        =~ s{
          (?:/\*|//|:)?   # '/*', '//' or ':' or NULL. Do not store.
          \s*             # zero or more of white space(s)
          ([^*]+)         # one or more characters other than '*'. Store it
                          # in $1 for the reference in replace part.
          \s*             # zero or more of white space(s)
          (?:\*/)?        # '*/' or NONE. Do not store
        }{$1}sx;          # replace the whole match with $1 stored above.
    }
  }

  return $planes;
}
# Append a "full" mid-pointer array (all 16 pointers referring to the full
# page) to the ccmap, creating the full page first when it does not exist.
#
#   $ccmap_p     : array ref of the plane ccmap being built (appended to).
#   $f_pg_offset : offset of an existing full page, or a false value when
#                  none has been added yet.
#
# Returns the list ($f_mid_offset, $f_pg_offset).
sub add_full_mid
{
  my ($ccmap_p, $f_pg_offset) = @_;
  # add a full page if not yet added.
  if (! $f_pg_offset) {
    $f_pg_offset = @$ccmap_p;
    push @$ccmap_p, (0xffff) x 16;
  }
  # add the full mid-pointer array with all the pointers pointing to the full page.
  my $f_mid_offset = @$ccmap_p;
  push @$ccmap_p, ($f_pg_offset) x 16;
  return ($f_mid_offset, $f_pg_offset);
}
# Append a fresh mid-pointer array to the ccmap and make upper pointer $mid
# refer to it.
#
#   $ccmap_p : array ref of the plane ccmap being built (modified in place).
#   $mid     : index of the upper pointer to redirect.
#
# Returns the offset of the new mid-pointer array. Reads the package global
# $e_pg_offset for the initial value of the 16 new pointers.
sub add_new_mid
{
  my ($ccmap_p, $mid) = @_;
  my $mid_offset = @$ccmap_p;
  $ccmap_p->[$mid] = $mid_offset;
  #by default, all mid-pointers point to the empty page.
  push @$ccmap_p, ($e_pg_offset) x 16;
  return $mid_offset;
}
# Build the ccmap of one plane from the coverage bitmap.
#
#   $full_ccmap_p : array ref of the overall ccmap built so far; only its
#                   length is read, to compute absolute page offsets.
#   $fillinfo_p   : array ref of the remaining fill units; the first FILL_SZ
#                   elements (this plane) are removed from it.
#   $plane        : plane number; 0 is the BMP.
#
# Returns the plane ccmap as a list. An empty non-BMP plane yields an empty
# list, an empty BMP yields 16 zeros. Offsets of pages that hold real data
# are recorded in the file-level %pg_flags hash.
sub add_plane
{
  my ($full_ccmap_p, $fillinfo_p, $plane) = @_;
  my @ccmap = ();  # plane ccmap
  my @fillinfo = splice @$fillinfo_p, 0, FILL_SZ;
  # convert 4096(FILL_SZ) 16bit integers to a string of 4096 * $fu_sz
  # characters.
  my $plane_str = pack($fill_fmt x FILL_SZ, @fillinfo);

  # empty plane
  if ($plane_str eq "\0" x ($fu_sz * FILL_SZ)) {
    # for non-BMP plane, the default empty plane ccmap would work.
    # for BMP, we need 'self-referring' folded CCMap (the smallest CCMap)
    push @ccmap, (0) x 16 if (!$plane);
    return @ccmap;
  }

  #get all upper pointers to point at empty mid pointers
  push @ccmap, ($e_mid_offset) x 16;
  #get all mid-pointers to point at empty page.
  push @ccmap, ($e_pg_offset) x 16;
  push @ccmap, (0) x 16;  # empty pg

  my $f_mid_offset = 0;
  my $f_pg_offset;

  foreach my $mid (0 .. 15)
  {
    my @mid_fill = splice @fillinfo, 0, MID_FILL_SZ;
    # convert 256(MID_FILL_SZ) 16bit integers to a string of 256 * $fu_sz
    # characters.
    my $mid_str = pack($fill_fmt x MID_FILL_SZ, @mid_fill);

    # for an empty mid, upper-pointer is already pointing to the empty mid.
    next if ($mid_str eq "\0" x ($fu_sz * MID_FILL_SZ));

    # for a full mid, add full mid if necessary.
    if ($mid_str eq "\xff" x ($fu_sz * MID_FILL_SZ)) {
      ($f_mid_offset, $f_pg_offset) =
        add_full_mid(\@ccmap, $f_pg_offset) unless ($f_mid_offset);
      $ccmap[$mid] = $f_mid_offset;
      next;
    }

    my $mid_offset = add_new_mid(\@ccmap, $mid);

    foreach my $pg (0 .. 15) {
      my @pg_fill = splice @mid_fill, 0, PG_FILL_SZ;
      my $pg_str = pack($fill_fmt x PG_FILL_SZ, @pg_fill);

      # for an empty pg, mid-pointer is already pointing to the empty page.
      next if ($pg_str eq "\x0" x ($fu_sz * PG_FILL_SZ));

      # for a full pg, add the full pg if necessary.
      # and set the mid-pointer to the full pg offset.
      if ($pg_str eq "\xff" x ($fu_sz * PG_FILL_SZ)) {
        if (! $f_pg_offset) {
          $f_pg_offset = @ccmap;
          #for the full pg, endianess and ALU size are immaterial.
          push @ccmap, (0xffff) x 16;
        }
        $ccmap[$mid_offset + $pg] = $f_pg_offset;
        next;
      }

      $ccmap[$mid_offset + $pg] = @ccmap;

      # 'Flag' the offset as the beginning of a page with actual data as
      # opposed to pointer sections.
      $pg_flags{(scalar @$full_ccmap_p) + (scalar @ccmap)} = @ccmap;

      push @ccmap, @pg_fill;
    }
  }
  return @ccmap;
}
# Write the precompiled CCMap to "<class>.ccmap" (or "<class>.x-ccmap" for
# an extended map) as a set of preprocessor-guarded C macros.
#
#   $ccmap_p    : array ref with the 16-bit values of the whole ccmap.
#   $pg_flags_p : hash ref, offset => flag. A positive flag marks a data
#                 page, any other defined value marks the plane indices;
#                 rows without an entry are printed in natural order.
#   $variable   : prefix of the generated _SIZE / _INITIALIZER macros.
#   $class      : class name; also the base name of the output file.
#   $comments_p : hash ref as filled in by read_input().
#   $is_ext     : true when the map has the two-slot extension header.
#
# Dies when the output file cannot be opened or closed.
sub print_ccmap
{
  my ($ccmap_p, $pg_flags_p, $variable, $class, $comments_p, $is_ext) = @_;

  my $ofn = $class . ($is_ext ? ".x-ccmap" : ".ccmap");

  # Lexical handle and three-argument open instead of a bareword handle.
  open my $out, '>', $ofn or
    die "cannot open $ofn for output: $!\n";

  print {$out} print_preamble($variable, $class);

  print {$out} "\n/*\n";
  print {$out} " VARIABLE:: $variable\n";
  print {$out} " CLASS:: $class\n";
  defined ($comments_p->{'DESC'}) and
    print {$out} " DESCRIPTION:: $comments_p->{'DESC'}\n";
  defined ($comments_p->{'FILE'}) and
    print {$out} " FILE:: $comments_p->{'FILE'}\n";

  print {$out} "\n";

  # Only the per-character entries ("0X......") are listed.
  for my $key (sort keys %$comments_p) {
    next if ($key !~ /^0X/);
    printf {$out} " %s : %s\n", $key, $comments_p->{$key};
  }

  print {$out} "*/\n\n";

  # When CCMap is accessed, (PRUint16 *) is cast to
  # the pointer type of the ALU of a machine.
  # For little endian machines, the size of the ALU
  # doesn't matter (16, 32, 64). For Big endian
  # machines with 32/64 bit ALU, two/four 16bit words
  # have to be rearranged to be interpreted correctly
  # as 32bit or 64bit integers with the 16bit word
  # at the lowest address taking the highest place value.
  # This shuffling is NOT necessary for the upper pointer section
  # and mid-pointer sections.
  #
  # If non-BMP characters are present, 16 plane indices
  # (32bit integers stored in two 16bit shorts in
  # BE order) have to be treated differently based on the
  # the endianness as well.
  #
  # For BMP-only CCMap, 16BE CCMap is identical to LE CCMaps.
  # With non-BMP characters present, to avoid the misalignment on 64bit
  # machines, we add two 16-bit units of 0-padding before the ccmap flag
  # (indicating whether the map is extended or not) and the BMP map size
  # (bug 225340, bug 445626).
  #
  # Per layout:
  #   idx   : order of the four shorts of a group inside data pages
  #   i32   : order of the four shorts of a group inside the plane indices
  #   guard : preprocessor guard and title comment
  #   pad   : true when an extended map is preceded by two padding shorts
  my %layout = (
    '64LE' => {
      idx => [0, 1, 2, 3], i32 => [1, 0, 3, 2], pad => 1,
      guard => "#if (defined(IS_LITTLE_ENDIAN) && ALU_SIZE == 64)\n" .
               "// Precompiled CCMap for Little Endian(64bit)\n",
    },
    'LE' => {
      idx => [0, 1, 2, 3], i32 => [1, 0, 3, 2], pad => 0,
      guard => $is_ext ?
               "#elif defined(IS_LITTLE_ENDIAN)\n" .
               "// Precompiled CCMap for Little Endian(16/32bit) \n" :
               "#if (defined(IS_LITTLE_ENDIAN) || ALU_SIZE == 16)\n" .
               "// Precompiled CCMap for Little Endian(16/32/64bit)\n" .
               "// and Big Endian(16bit)\n",
    },
    '16BE' => {
      idx => [0, 1, 2, 3], i32 => [0, 1, 2, 3], pad => 0,
      guard => "#elif (ALU_SIZE == 16)\n" .
               "// Precompiled CCMap for Big Endian(16bit)\n",
    },
    '32BE' => {
      idx => [1, 0, 3, 2], i32 => [0, 1, 2, 3], pad => 0,
      guard => "#elif (ALU_SIZE == 32)\n" .
               "// Precompiled CCMap for Big Endian(32bit)\n",
    },
    '64BE' => {
      idx => [3, 2, 1, 0], i32 => [0, 1, 2, 3], pad => 1,
      guard => "#elif (ALU_SIZE == 64)\n" .
               "// Precompiled CCMap for Big Endian(64bit)\n",
    },
  );

  # 64LE and 16BE are only emitted for extended maps.
  my @fmts = $is_ext ? ("64LE", "LE", "16BE", "32BE", "64BE")
                     : ("LE", "32BE", "64BE");
  # The first two slots of an extended map (flag and BMP size) go on the
  # EXTFLG line, so the regular rows start after them.
  my $base = $is_ext ? 2 : 0;

  foreach my $fmt (@fmts)
  {
    my $spec = $layout{$fmt};
    my $padded = $is_ext && $spec->{pad};

    print {$out} $spec->{guard};
    # $variable is passed as an argument, never as part of the format.
    printf {$out} "#define %s_SIZE %d\n",
      $variable, scalar(@$ccmap_p) + ($padded ? 2 : 0);
    printf {$out} "#define %s_INITIALIZER \\\n", $variable;
    if ($is_ext) {
      printf {$out} ($padded ?
                     "/* EXTFLG */ 0x0000,0x0000,0x%04X,0x%04X, \\\n" :
                     "/* EXTFLG */ 0x%04X,0x%04X, \\\n"),
        $ccmap_p->[0], $ccmap_p->[1];
    }

    # One row of 16 shorts per iteration, printed as four groups of four.
    for (my $offset = $base; $offset < @$ccmap_p; $offset += 16) {
      printf {$out} "/* %06x */ ", $offset - $base;
      my $flag = $pg_flags_p->{$offset};
      my @order = !defined($flag) ? (0, 1, 2, 3)
                : $flag > 0       ? @{ $spec->{idx} }
                :                   @{ $spec->{i32} };
      for my $i (0 .. 3) {
        for my $j (@order) {
          printf {$out} "0x%04X,", $ccmap_p->[$offset + $i * 4 + $j];
        }
        print {$out} " \\\n " if $i == 1;
      }
      print {$out} " \\\n" if $offset + 16 < @$ccmap_p;
    }
    print {$out} "\n";
  }

  print {$out} <<END;
#else
#error "We don't support this architecture."
#endif

END

  # Buffered write errors only surface at close time.
  close $out or die "cannot close $ofn: $!\n";
}
# Return the text placed at the top of a generated ccmap file: the license
# block followed by a comment explaining how the file was produced.
#
#   $variable : macro prefix, shown in the sample command line.
#   $class    : character class name, shown in the explanation.
#
# The here-document is returned directly instead of being used as a sprintf
# format, so a '%' in either argument is reproduced literally.
sub print_preamble
{
  my ($variable, $class) = @_;
  return <<PREAMBLE;
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Jungshik Shin <jshin\@mailaps.org>
 * Portions created by the Initial Developer are Copyright (C) 2003
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

 /*========================================================
   This file contains a precompiled CCMap for a class of Unicode
   characters (${class}) to be identified quickly by Mozilla.
   It was generated by ccmapbin.pl which you can find under
   mozilla/intl/unicharutil/tools.

   Enumerated below are characters included in the precompiled CCMap
   which is human-readable but not so human-friendly. If you
   needs to modify the list of characters belonging to "${class}",
   you have to make a new file (with the name of your choice)
   listing characters (one character per line) you want to put
   into "${class}" in the format

         0xuuuu // comment

   In addition, the input file can have the following optional lines that
   read

      VARIABLE::${variable}
      CLASS::${class}
      DESCRIPTION:: description of a character class
      FILE:: mozilla source file to include the output file


   Then, run the following in the current directory.

      perl ccmapbin.pl input_file [${variable} [${class}]]

   which will generate ${class}.ccmap (or ${class}.x-ccmap if the ccmap
   includes non-BMP characters.). ${variable} is used as the prefix
   in macros for the array initializer and the array size.

   (see bug 180266, bug 167136, and bug 224337)

 */

PREAMBLE
}