libjava/chartables.pl

   1 # chartables.pl - A perl program to generate tables for use by the
   2 # Character class.
   3
   4 # Copyright (C) 1998, 1999  Cygnus Solutions
   5 #
   6 # This file is part of libjava.
   7 #
   8 # This software is copyrighted work licensed under the terms of the
   9 # Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
  10 # details.
  11
  12 # This program requires a `unidata.txt' file of the form distributed
  13 # on the Unicode 2.0 CD ROM.  Or, get it more conveniently here:
  14 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
  15 # Version `2.1.8' of this file was last used to update the Character class.
  16
  17 # Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
  18 # "The Java Language Specification", ISBN 0-201-63451-1
  19 # plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
  20
  21 # Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt
  22 # If this exits with nonzero status, then you must investigate the
  23 # cause of the problem.
  24 # Diagnostics and other information to stderr.
  25 # This creates the new include/java-chartables.h and
  26 # include/java-chardecomp.h files directly.
  27 # With -n, the files are not created, but all processing
  28 # still occurs.
  29
  30 # Fields in the table.
  31 $CODE = 0;
  32 $NAME = 1;
  33 $CATEGORY = 2;
  34 $DECOMPOSITION = 5;
  35 $DECIMAL = 6;
  36 $DIGIT = 7;
  37 $NUMERIC = 8;
  38 $UPPERCASE = 12;
  39 $LOWERCASE = 13;
  40 $TITLECASE = 14;
  41
  42 # A special case.
  43 $TAMIL_DIGIT_ONE  = 0x0be7;
  44 $TAMIL_DIGIT_NINE = 0x0bef;
  45
  46 # These are endpoints of legitimate gaps in the tables.
  47 $CJK_IDEOGRAPH_END = 0x9fa5;
  48 $HANGUL_END = 0xd7a3;
  49 $HIGH_SURROGATE_END = 0xdb7f;
  50 $PRIVATE_HIGH_SURROGATE_END = 0xdbff;
  51 $LOW_SURROGATE_END = 0xdfff;
  52 $PRIVATE_END = 0xf8ff;
  53
  54 %title_to_upper = ();
  55 %title_to_lower = ();
  56 %numerics  = ();
  57 %name = ();
  58
  59 @digit_start = ();
  60 @digit_end   = ();
  61
  62 @space_start = ();
  63 @space_end   = ();
  64
  65 # @letter_start = ();
  66 # @letter_end   = ();
  67
  68 @all_start = ();
  69 @all_end   = ();
  70 @all_cats  = ();
  71
  72 @upper_start = ();
  73 @upper_end   = ();
  74 @upper_map   = ();
  75 %upper_anom  = ();
  76
  77 @lower_start = ();
  78 @lower_end   = ();
  79 @lower_map   = ();
  80 %lower_anom  = ();
  81
  82 @attributes = ();
  83
  84 # There are a few characters which actually need two attributes.
  85 # These are special-cased.
  86 $ROMAN_START = 0x2160;
  87 $ROMAN_END   = 0x217f;
  88 %second_attributes = ();
  89
  90 $prevcode = -1;
  91 $status = 0;
  92
  93 %category_map =
  94 (
  95  'Mn' => 'NON_SPACING_MARK',
  96  'Mc' => 'COMBINING_SPACING_MARK',
  97  'Me' => 'ENCLOSING_MARK',
  98  'Nd' => 'DECIMAL_DIGIT_NUMBER',
  99  'Nl' => 'LETTER_NUMBER',
 100  'No' => 'OTHER_NUMBER',
 101  'Zs' => 'SPACE_SEPARATOR',
 102  'Zl' => 'LINE_SEPARATOR',
 103  'Zp' => 'PARAGRAPH_SEPARATOR',
 104  'Cc' => 'CONTROL',
 105  'Cf' => 'FORMAT',
 106  'Cs' => 'SURROGATE',
 107  'Co' => 'PRIVATE_USE',
 108  'Cn' => 'UNASSIGNED',
 109  'Lu' => 'UPPERCASE_LETTER',
 110  'Ll' => 'LOWERCASE_LETTER',
 111  'Lt' => 'TITLECASE_LETTER',
 112  'Lm' => 'MODIFIER_LETTER',
 113  'Lo' => 'OTHER_LETTER',
 114  'Pc' => 'CONNECTOR_PUNCTUATION',
 115  'Pd' => 'DASH_PUNCTUATION',
 116  'Ps' => 'START_PUNCTUATION',
 117  'Pe' => 'END_PUNCTUATION',
 118  'Pi' => 'START_PUNCTUATION',
 119  'Pf' => 'END_PUNCTUATION',
 120  'Po' => 'OTHER_PUNCTUATION',
 121  'Sm' => 'MATH_SYMBOL',
 122  'Sc' => 'CURRENCY_SYMBOL',
 123  'Sk' => 'MODIFIER_SYMBOL',
 124  'So' => 'OTHER_SYMBOL'
 125  );
 126
 127 # These maps characters to their decompositions.
 128 %canonical_decomposition = ();
 129 %full_decomposition = ();
 130
 131
 132 # Handle `-n' and open output files.
 133 local ($f1, $f2) = ('include/java-chartables.h',
 134                     'include/java-chardecomp.h');
 135 if ($ARGV[0] eq '-n')
 136 {
 137     shift @ARGV;
 138     $f1 = '/dev/null';
 139     $f2 = '/dev/null';
 140 }
 141
 142 open (CHARTABLE, "> $f1");
 143 open (DECOMP, "> $f2");
 144
 145 # Process the Unicode file.
 146 while (<>)
 147 {
 148     chop;
 149     # Specify a limit for split so that we pick up trailing fields.
 150     # We make the limit larger than we need, to catch the case where
 151     # there are extra fields.
 152     @fields = split (';', $_, 30);
 153     # Convert code to number.
 154     $ncode = hex ($fields[$CODE]);
 155
 156     if ($#fields != 14)
 157     {
 158         print STDERR ("Entry for \\u", $fields[$CODE],
 159                       " has wrong number of fields: ", $#fields, "\n");
 160     }
 161
 162     $name{$fields[$CODE]} = $fields[$NAME];
 163
 164     # If we've found a gap in the table, fill it in.
 165     if ($ncode != $prevcode + 1)
 166     {
 167         &process_gap (*fields, $prevcode, $ncode);
 168     }
 169
 170     &process_char (*fields, $ncode);
 171
 172     $prevcode = $ncode;
 173 }
 174
 175 if ($prevcode != 0xffff)
 176 {
 177     # Setting of `fields' parameter doesn't matter here.
 178     &process_gap (*fields, $prevcode, 0x10000);
 179 }
 180
 181 print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n";
 182 print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n";
 183 print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n";
 184 print CHARTABLE "// These tables are automatically generated by the chartables.pl\n";
 185 print CHARTABLE "// script.  DO NOT EDIT the tables.  Instead, fix the script\n";
 186 print CHARTABLE "// and run it again.\n\n";
 187 print CHARTABLE "// This file should only be included by natCharacter.cc\n\n";
 188
 189
 190 $bytes = 0;
 191
 192 # Titlecase mapping tables.
 193 if ($#title_to_lower != $#title_to_upper)
 194 {
 195     # If this fails we need to reimplement toTitleCase.
 196     print STDERR "titlecase mappings have different sizes\n";
 197     $status = 1;
 198 }
 199 # Also ensure that the tables are entirely parallel.
 200 foreach $key (sort keys %title_to_lower)
 201 {
 202     if (! defined $title_to_upper{$key})
 203     {
 204         print STDERR "titlecase mappings have different entries\n";
 205         $status = 1;
 206     }
 207 }
 208 &print_single_map ("title_to_lower_table", %title_to_lower);
 209 &print_single_map ("title_to_upper_table", %title_to_upper);
 210
 211 print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n";
 212
 213 printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE;
 214
 215 # All numeric values.
 216 &print_numerics;
 217
 218 # Digits only.
 219 &print_block ("digit_table", *digit_start, *digit_end);
 220
 221 # Space characters.
 222 &print_block ("space_table", *space_start, *space_end);
 223
 224 # Letters.  We used to generate a separate letter table.  But this
 225 # doesn't really seem worthwhile.  Simply using `all_table' saves us
 226 # about 800 bytes, and only adds 3 table probes to isLetter.
 227 # &print_block ("letter_table", *letter_start, *letter_end);
 228
 229 # Case tables.
 230 &print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom);
 231 &print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom);
 232
 233 # Everything else.
 234 &print_all_block (*all_start, *all_end, *all_cats);
 235
 236 print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n";
 237
 238 printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START;
 239 printf CHARTABLE "#define ROMAN_END   0x%04x\n\n", $ROMAN_END;
 240
 241 &print_fast_tables (*all_start, *all_end, *all_cats,
 242                     *attributes, *second_attributes);
 243
 244 print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n";
 245
 246 print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n";
 247
 248 printf STDERR "Approximately %d bytes of data generated (compact case)\n",
 249     $bytes;
 250
 251
 252 # Now generate decomposition tables.
 253 printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n";
 254 printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n";
 255 printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n";
 256 print DECOMP "// These tables are automatically generated by the chartables.pl\n";
 257 print DECOMP "// script.  DO NOT EDIT the tables.  Instead, fix the script\n";
 258 print DECOMP "// and run it again.\n\n";
 259 print DECOMP "// This file should only be included by natCollator.cc\n\n";
 260
 261 print DECOMP "struct decomp_entry\n{\n";
 262 print DECOMP "  jchar key;\n";
 263 print DECOMP "  const char *value;\n";
 264 print DECOMP "};\n\n";
 265
 266 &write_decompositions;
 267
 268 printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
 269
 270
 271 close (CHARTABLE);
 272 close (DECOMP);
 273
 274 exit $status;
 275
 276
 277 # Process a gap in the space.
 278 sub process_gap
 279 {
 280     local (*fields, $prevcode, $ncode) = @_;
 281     local (@gap_fields, $i);
 282
 283     if ($ncode == $CJK_IDEOGRAPH_END
 284         || $ncode == $HANGUL_END
 285         || $ncode == $HIGH_SURROGATE_END
 286         || $ncode == $PRIVATE_HIGH_SURROGATE_END
 287         || $ncode == $LOW_SURROGATE_END
 288         || $ncode == $PRIVATE_END)
 289     {
 290         # The characters in the gap we just found are known to
 291         # have the same properties as the character at the end of
 292         # the gap.
 293         @gap_fields = @fields;
 294     }
 295     else
 296     {
 297         # This prints too much to be enabled.
 298         # print STDERR "Gap found at \\u", $fields[$CODE], "\n";
 299         @gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '',
 300                        '', '', '', '');
 301     }
 302
 303     for ($i = $prevcode + 1; $i < $ncode; ++$i)
 304     {
 305         $gap_fields[$CODE] = sprintf ("%04x", $i);
 306         $gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE];
 307         &process_char (*gap_fields, $i);
 308     }
 309 }
 310
 311 # Process a single character.
 312 sub process_char
 313 {
 314     local (*fields, $ncode) = @_;
 315
 316     if ($fields[$DECOMPOSITION] ne '')
 317     {
 318         &add_decomposition ($ncode, $fields[$DECOMPOSITION]);
 319     }
 320
 321     # If this is a titlecase character, mark it.
 322     if ($fields[$CATEGORY] eq 'Lt')
 323     {
 324         $title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE];
 325         $title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE];
 326     }
 327     else
 328     {
 329         # For upper and lower case mappings, we try to build compact
 330         # tables that map range onto range.  We specifically want to
 331         # avoid titlecase characters.  Java specifies a range check to
 332         # make sure the character is not between 0x2000 and 0x2fff.
 333         # We avoid that here because we need to generate table entries
 334         # -- toLower and toUpper still work in that range.
 335         if ($fields[$UPPERCASE] eq ''
 336             && ($fields[$LOWERCASE] ne ''
 337                 || $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/))
 338         {
 339             if ($fields[$LOWERCASE] ne '')
 340             {
 341                 &update_case_block (*upper_start, *upper_end, *upper_map,
 342                                     $fields[$CODE], $fields[$LOWERCASE]);
 343                 &set_attribute ($ncode, hex ($fields[$LOWERCASE]));
 344             }
 345             else
 346             {
 347                 $upper_anom{$fields[$CODE]} = 1;
 348             }
 349         }
 350         elsif ($fields[$LOWERCASE] ne '')
 351         {
 352             print STDERR ("Java missed upper case char \\u",
 353                           $fields[$CODE], "\n");
 354         }
 355         elsif ($fields[$CATEGORY] eq 'Lu')
 356         {
 357             # This case is for letters which are marked as upper case
 358             # but for which there is no lower case equivalent.  For
 359             # instance, LATIN LETTER YR.
 360         }
 361
 362         if ($fields[$LOWERCASE] eq ''
 363             && ($fields[$UPPERCASE] ne ''
 364                 || $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/))
 365         {
 366             if ($fields[$UPPERCASE] ne '')
 367             {
 368                 &update_case_block (*lower_start, *lower_end, *lower_map,
 369                                     $fields[$CODE], $fields[$UPPERCASE]);
 370                 &set_attribute ($ncode, hex ($fields[$UPPERCASE]));
 371             }
 372             else
 373             {
 374                 $lower_anom{$fields[$CODE]} = 1;
 375             }
 376         }
 377         elsif ($fields[$UPPERCASE] ne '')
 378         {
 379             print STDERR ("Java missed lower case char \\u",
 380                           $fields[$CODE], "\n");
 381         }
 382         elsif ($fields[$CATEGORY] eq 'Ll')
 383         {
 384             # This case is for letters which are marked as lower case
 385             # but for which there is no upper case equivalent.  For
 386             # instance, FEMININE ORDINAL INDICATOR.
 387         }
 388     }
 389
 390
 391     # If we have a non-decimal numeric value, add it to the list.
 392     if ($fields[$CATEGORY] eq 'Nd'
 393         && ($ncode < 0x2000 || $ncode > 0x2fff)
 394         && $fields[$NAME] =~ /DIGIT/)
 395     {
 396         # This is a digit character that is handled elsewhere.
 397     }
 398     elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '')
 399     {
 400         # Do a simple check.
 401         if ($fields[$DECIMAL] ne '')
 402         {
 403             # This catches bugs in an earlier implementation of
 404             # chartables.pl.  Now it is here for historical interest
 405             # only.
 406             # print STDERR ("Character \u", $fields[$CODE],
 407             # " would have been missed as digit\n");
 408         }
 409
 410         local ($val) = $fields[$DIGIT];
 411         $val = $fields[$NUMERIC] if $val eq '';
 412         local ($ok) = 1;
 413
 414         # If we have a value which is not a positive integer, then we
 415         # set the value to -2 to make life easier for
 416         # Character.getNumericValue.
 417         if ($val !~ m/^[0-9]+$/)
 418         {
 419             if ($fields[$CATEGORY] ne 'Nl'
 420                 && $fields[$CATEGORY] ne 'No')
 421             {
 422                 # This shows a few errors in the Unicode table.  These
 423                 # characters have a missing Numeric field, and the `N'
 424                 # for the mirrored field shows up there instead.  I
 425                 # reported these characters to errata@unicode.org on
 426                 # Thu Sep 10 1998.  They said it will be fixed in the
 427                 # 2.1.6 release of the tables.
 428                 print STDERR ("Character \u", $fields[$CODE],
 429                               " has value but is not numeric; val = '",
 430                               $val, "'\n");
 431                 # We skip these.
 432                 $ok = 0;
 433             }
 434             $val = "-2";
 435         }
 436
 437         if ($ok)
 438         {
 439             $numerics{$fields[$CODE]} = $val;
 440             &set_attribute ($ncode, $val);
 441         }
 442     }
 443
 444     # We build a table that lists ranges of ordinary decimal values.
 445     # At each step we make sure that the digits are in the correct
 446     # order, with no holes, as this is assumed by Character.  If this
 447     # fails, reimplementation is required.  This implementation
 448     # dovetails nicely with the Java Spec, which has strange rules for
 449     # what constitutes a decimal value.  In particular the Unicode
 450     # name must contain the word `DIGIT'.  The spec doesn't directly
 451     # say that digits must have type `Nd' (or that their value must an
 452     # integer), but that can be inferred from the list of digits in
 453     # the book(s).  Currently the only Unicode characters whose name
 454     # includes `DIGIT' which would not fit are the Tibetan "half"
 455     # digits.
 456     if ($fields[$CATEGORY] eq 'Nd')
 457     {
 458         if (($ncode < 0x2000 || $ncode > 0x2fff)
 459             && $fields[$NAME] =~ /DIGIT/)
 460         {
 461             &update_digit_block (*digit_start, *digit_end, $fields[$CODE],
 462                                  $fields[$DECIMAL]);
 463             &set_attribute ($ncode, $fields[$DECIMAL]);
 464         }
 465         else
 466         {
 467             # If this fails then Character.getType will fail.  We
 468             # assume that things in `digit_table' are the only
 469             # category `Nd' characters.
 470             print STDERR ("Character \u", $fields[$CODE],
 471                           " is class Nd but not in digit table\n");
 472             $status = 1;
 473         }
 474     }
 475
 476     # Keep track of space characters.
 477     if ($fields[$CATEGORY] =~ /Z[slp]/)
 478     {
 479         &update_block (*space_start, *space_end, $fields[$CODE]);
 480     }
 481
 482     # Keep track of letters.
 483     # if ($fields[$CATEGORY] =~ /L[ultmo]/)
 484     # {
 485     #   &update_letter_block (*letter_start, *letter_end, $fields[$CODE],
 486     #                         $fields[$CATEGORY]);
 487     # }
 488
 489     # Keep track of all characters.  You might think we wouldn't have
 490     # to do this for uppercase letters, or other characters we already
 491     # "classify".  The problem is that this classification is
 492     # different.  E.g., \u216f is uppercase by Java rules, but is a
 493     # LETTER_NUMBER here.
 494     &update_all_block (*all_start, *all_end, *all_cats,
 495                        $fields[$CODE], $fields[$CATEGORY]);
 496 }
 497
 498
 499 # Called to add a new decomposition.
 500 sub add_decomposition
 501 {
 502     local ($ncode, $value) = @_;
 503     local ($is_full) = 0;
 504     local ($first) = 1;
 505     local (@decomp) = ();
 506
 507     foreach (split (' ', $value))
 508     {
 509         if ($first && /^\<.*\>$/)
 510         {
 511             $is_full = 1;
 512         }
 513         else
 514         {
 515             push (@decomp, hex ($_));
 516         }
 517         $first = 0;
 518     }
 519
 520     # We pack the value into a string because this means we can stick
 521     # with Perl 4 features.
 522     local ($s) = pack "I*", @decomp;
 523     if ($is_full)
 524     {
 525         $full_decomposition{$ncode} = $s;
 526     }
 527     else
 528     {
 529         $canonical_decomposition{$ncode} = $s;
 530     }
 531 }
 532
 533 # Write a single decomposition table.
 534 sub write_single_decomposition
 535 {
 536     local ($name, $is_canon, %table) = @_;
 537
 538     printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
 539
 540     local ($key, @expansion, $char);
 541     local ($first_line) = 1;
 542
 543     for ($key = 0; $key <= 65535; ++$key)
 544     {
 545         next if ! defined $table{$key};
 546
 547         printf DECOMP ",\n"
 548             unless $first_line;
 549         $first_line = 0;
 550
 551         printf DECOMP "  { 0x%04x, \"", $key;
 552
 553         # We represent the expansion as a series of bytes, terminated
 554         # with a double nul.  This is ugly, but relatively
 555         # space-efficient.  Most expansions are short, but there are a
 556         # few that are very long (e.g. \uFDFA).  This means that if we
 557         # chose a fixed-space representation we would waste a lot of
 558         # space.
 559         @expansion = unpack "I*", $table{$key};
 560         foreach $char (@expansion)
 561         {
 562             printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);
 563         }
 564
 565         printf DECOMP "\" }";
 566     }
 567
 568     printf DECOMP "\n};\n\n";
 569 }
 570
 571 sub write_decompositions
 572 {
 573     &write_single_decomposition ('canonical', 1, %canonical_decomposition);
 574     &write_single_decomposition ('full', 0, %full_decomposition);
 575 }
 576
 577 # We represent a block of characters with a pair of lists.  This
 578 # function updates the pair to account for the new character.  Returns
 579 # 1 if we added to the old block, 0 otherwise.
 580 sub update_block
 581 {
 582     local (*start, *end, $char) = @_;
 583
 584     local ($nchar) = hex ($char);
 585     local ($count) = $#end;
 586     if ($count >= 0 && $end[$count] == $nchar - 1)
 587     {
 588         ++$end[$count];
 589         return 1;
 590     }
 591     else
 592     {
 593         ++$count;
 594         $start[$count] = $nchar;
 595         $end[$count] = $nchar;
 596     }
 597     return 0;
 598 }
 599
 600 # Return true if we will be appending this character to the end of the
 601 # existing block.
 602 sub block_append_p
 603 {
 604     local (*end, $char) = @_;
 605     return $#end >= 0 && $end[$#end] == $char - 1;
 606 }
 607
 608 # This updates the digit block.  This table is much like an ordinary
 609 # block, but it has an extra constraint.
 610 sub update_digit_block
 611 {
 612     local (*start, *end, $char, $value) = @_;
 613
 614     &update_block ($start, $end, $char);
 615     local ($nchar) = hex ($char);
 616
 617     # We want to make sure that the new digit's value is correct for
 618     # its place in the block.  However, we special-case Tamil digits,
 619     # since Tamil does not have a digit `0'.
 620     local ($count) = $#start;
 621     if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE)
 622         && $nchar - $start[$count] != $value)
 623     {
 624         # If this fails then Character.digit_value will be wrong.
 625         print STDERR "Character \\u", $char, " violates digit constraint\n";
 626         $status = 1;
 627     }
 628 }
 629
 630 # Update letter table.  We could be smart about avoiding upper or
 631 # lower case letters, but it is much simpler to just track them all.
 632 sub update_letter_block
 633 {
 634     local (*start, *end, $char, $category) = @_;
 635
 636     &update_block (*start, *end, $char);
 637 }
 638
 639 # Update `all' table.  This table holds all the characters we don't
 640 # already categorize for other reasons.  FIXME: if a given type has
 641 # very few characters, we should just inline the code.  E.g., there is
 642 # only one paragraph separator.
 643 sub update_all_block
 644 {
 645     local (*start, *end, *cats, $char, $category) = @_;
 646
 647     local ($nchar) = hex ($char);
 648     local ($count) = $#end;
 649     if ($count >= 0
 650         && $end[$count] == $nchar - 1
 651         && $cats[$count] eq $category)
 652     {
 653         ++$end[$count];
 654     }
 655     else
 656     {
 657         ++$count;
 658         $start[$count] = $nchar;
 659         $end[$count] = $nchar;
 660         $cats[$count] = $category;
 661     }
 662 }
 663
 664 # Update a case table.  We handle case tables specially because we
 665 # want to map (e.g.) a block of uppercase characters directly onto the
 666 # corresponding block of lowercase characters.  Therefore we generate
 667 # a new entry when the block would no longer map directly.
 668 sub update_case_block
 669 {
 670     local (*start, *end, *map, $char, $mapchar) = @_;
 671
 672     local ($nchar) = hex ($char);
 673     local ($nmap) = hex ($mapchar);
 674
 675     local ($count) = $#end;
 676     if ($count >= 0
 677         && $end[$count] == $nchar - 1
 678         && $nchar - $start[$count] == $nmap - $map[$count])
 679     {
 680         ++$end[$count];
 681     }
 682     else
 683     {
 684         ++$count;
 685         $start[$count] = $nchar;
 686         $end[$count] = $nchar;
 687         $map[$count] = $nmap;
 688     }
 689 }
 690
 691 # Set the attribute value for the character.  Each character can have
 692 # only one attribute.
 693 sub set_attribute
 694 {
 695     local ($ncode, $attr) = @_;
 696
 697     if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr)
 698     {
 699         if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END)
 700         {
 701             $second_attributes{$ncode} = $attr;
 702         }
 703         else
 704         {
 705             printf STDERR "character \\u%04x already has attribute\n", $ncode;
 706         }
 707     }
 708     # Attributes can be interpreted as unsigned in some situations,
 709     # so we check against 65535.  This could cause errors -- we need
 710     # to check the interpretation here.
 711     elsif ($attr < -32768 || $attr > 65535)
 712     {
 713         printf STDERR "attribute out of range for character \\u%04x\n", $ncode;
 714     }
 715     else
 716     {
 717         $attributes{$ncode} = $attr;
 718     }
 719 }
 720
 721
 722 # Print a block table.
 723 sub print_block
 724 {
 725     local ($title, *start, *end) = @_;
 726
 727     print CHARTABLE "static const jchar ", $title, "[][2] =\n";
 728     print CHARTABLE "  {\n";
 729
 730     local ($i) = 0;
 731     while ($i <= $#start)
 732     {
 733         print CHARTABLE "    { ";
 734         &print_char ($start[$i]);
 735         print CHARTABLE ", ";
 736         &print_char ($end[$i]);
 737         print CHARTABLE " }";
 738         print CHARTABLE "," if ($i != $#start);
 739         print CHARTABLE "\n";
 740         ++$i;
 741         $bytes += 4;            # Two bytes per char.
 742     }
 743
 744     print CHARTABLE "  };\n\n";
 745 }
 746
 747 # Print the numerics table.
 748 sub print_numerics
 749 {
 750     local ($i, $key, $count, @keys);
 751
 752     $i = 0;
 753     @keys = sort keys %numerics;
 754     $count = @keys;
 755
 756     print CHARTABLE "static const jchar numeric_table[] =\n";
 757     print CHARTABLE "  { ";
 758     foreach $key (@keys)
 759     {
 760         &print_char (hex ($key));
 761         ++$i;
 762         print CHARTABLE ", " if $i < $count;
 763         # Print 5 per line.
 764         print CHARTABLE "\n    " if ($i % 5 == 0);
 765         $bytes += 2;            # One character.
 766     }
 767     print CHARTABLE " };\n\n";
 768
 769     print CHARTABLE "static const jshort numeric_value[] =\n";
 770     print CHARTABLE "  { ";
 771     $i = 0;
 772     foreach $key (@keys)
 773     {
 774         print CHARTABLE $numerics{$key};
 775         if ($numerics{$key} > 32767 || $numerics{$key} < -32768)
 776         {
 777             # This means our generated type info is incorrect.  We
 778             # could just detect and work around this here, but I'm
 779             # lazy.
 780             print STDERR "numeric value won't fit in a short\n";
 781             $status = 1;
 782         }
 783         ++$i;
 784         print CHARTABLE ", " if $i < $count;
 785         # Print 10 per line.
 786         print CHARTABLE "\n    " if ($i % 10 == 0);
 787         $bytes += 2;            # One short.
 788     }
 789     print CHARTABLE " };\n\n";
 790 }
 791
 792 # Print a table that maps one single letter onto another.  It assumes
 793 # the map is index by char code.
 794 sub print_single_map
 795 {
 796     local ($title, %map) = @_;
 797
 798     local (@keys) = sort keys %map;
 799     $num = @keys;
 800     print CHARTABLE "static const jchar ", $title, "[][2] =\n";
 801     print CHARTABLE "  {\n";
 802     $i = 0;
 803     for $key (@keys)
 804     {
 805         print CHARTABLE "    { ";
 806         &print_char (hex ($key));
 807         print CHARTABLE ", ";
 808         &print_char (hex ($map{$key}));
 809         print CHARTABLE " }";
 810         ++$i;
 811         if ($i < $num)
 812         {
 813             print CHARTABLE ",";
 814         }
 815         else
 816         {
 817             print CHARTABLE " ";
 818         }
 819         print CHARTABLE "   // ", $name{$key}, "\n";
 820         $bytes += 4;            # Two bytes per char.
 821     }
 822     print CHARTABLE "  };\n\n";
 823 }
 824
 825 # Print the `all' block.
 826 sub print_all_block
 827 {
 828     local (*start, *end, *cats) = @_;
 829
 830     &print_block ("all_table", *start, *end);
 831
 832     local ($i) = 0;
 833     local ($sum) = 0;
 834     while ($i <= $#start)
 835     {
 836         $sum += $end[$i] - $start[$i] + 1;
 837         ++$i;
 838     }
 839     # We do this computation just to make sure it isn't cheaper to
 840     # simply list all the characters individually.
 841     printf STDERR ("all_table encodes %d characters in %d entries\n",
 842                    $sum, $#start + 1);
 843
 844     print CHARTABLE "static const jbyte category_table[] =\n";
 845     print CHARTABLE "  { ";
 846
 847     $i = 0;
 848     while ($i <= $#cats)
 849     {
 850         if ($i > 0 && $cats[$i] eq $cats[$i - 1])
 851         {
 852             # This isn't an error.  We can have a duplicate because
 853             # two ranges are not adjacent while the intervening
 854             # characters are left out of the table for other reasons.
 855             # We could exploit this to make the table a little smaller.
 856             # printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i];
 857         }
 858         print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
 859         print CHARTABLE ", " if ($i < $#cats);
 860         ++$i;
 861         print CHARTABLE "\n    ";
 862         ++$bytes;
 863     }
 864     print CHARTABLE "  };\n\n";
 865 }
 866
 867 # Print case table.
 868 sub print_case_table
 869 {
 870     local ($title, *start, *end, *map, *anomalous) = @_;
 871
 872     &print_block ($title . '_case_table', *start, *end);
 873
 874     print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n";
 875     print CHARTABLE "  { ";
 876
 877     local ($i) = 0;
 878     while ($i <= $#map)
 879     {
 880         &print_char ($map[$i]);
 881         print CHARTABLE ", " if $i < $#map;
 882         ++$i;
 883         print CHARTABLE "\n    " if $i % 5 == 0;
 884         $bytes += 2;
 885     }
 886     print CHARTABLE "  };\n";
 887
 888
 889     local ($key, @keys);
 890     @keys = sort keys %anomalous;
 891
 892     if ($title eq 'upper')
 893     {
 894         if ($#keys >= 0)
 895         {
 896             # If these are found we need to change Character.isUpperCase.
 897             print STDERR "Found anomalous upper case characters\n";
 898             $status = 1;
 899         }
 900     }
 901     else
 902     {
 903         print CHARTABLE "\n";
 904         print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n";
 905         print CHARTABLE "  { ";
 906         $i = 0;
 907         foreach $key (@keys)
 908         {
 909             &print_char (hex ($key));
 910             print CHARTABLE ", " if $i < $#keys;
 911             ++$i;
 912             print CHARTABLE "\n    " if $i % 5 == 0;
 913             $bytes += 2;
 914         }
 915         print CHARTABLE "  };\n";
 916     }
 917
 918     print CHARTABLE "\n";
 919 }
 920
 921 # Print the type table and attributes table for the fast version.
 922 sub print_fast_tables
 923 {
 924     local (*start, *end, *cats, *atts, *second_atts) = @_;
 925
 926     print CHARTABLE "static const jbyte type_table[] =\n{ ";
 927
 928     local ($i, $j);
 929     for ($i = 0; $i <= $#cats; ++$i)
 930     {
 931         for ($j = $start[$i]; $j <= $end[$i]; ++$j)
 932         {
 933             print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
 934             print CHARTABLE "," if ($i < $#cats || $j < $end[$i]);
 935             print CHARTABLE "\n    ";
 936         }
 937     }
 938     print CHARTABLE "\n };\n\n";
 939
 940     print CHARTABLE "static const jshort attribute_table[] =\n{ ";
 941     for ($i = 0; $i <= 0xffff; ++$i)
 942     {
 943         $atts{$i} = 0 if ! defined $atts{$i};
 944         print CHARTABLE $atts{$i};
 945         print CHARTABLE ", " if $i < 0xffff;
 946         print CHARTABLE "\n    " if $i % 5 == 1;
 947     }
 948     print CHARTABLE "\n };\n\n";
 949
 950     print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ ";
 951     for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i)
 952     {
 953         print CHARTABLE $second_atts{$i};
 954         print CHARTABLE ", " if $i < $ROMAN_END;
 955         print CHARTABLE "\n    " if $i % 5 == 1;
 956     }
 957     print CHARTABLE "\n };\n\n";
 958 }
 959
 960 # Print a character constant.
 961 sub print_char
 962 {
 963     local ($ncode) = @_;
 964     printf CHARTABLE "0x%04x", $ncode;
 965 }