libjava/classpath/scripts/unicode-blocks.pl

   1 #!/usr/bin/perl -w
   2 # unicode-blocks.pl -- Script to generate java.lang.Character.UnicodeBlock
   3 # Copyright (C) 2002, 2004 Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20 # 02110-1301 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39
  40 # Code for reading Blocks.txt and generating (to standard out) the code for
  41 # java.lang.Character.UnicodeBlock, for pasting into java/lang/Character.java.
  42 # You should probably check that the results are accurate to the
  43 # specification, but I made sure it works OOB for Unicode 3.0.0 and JDK 1.4.
  44 # As the grammar for the Blocks.txt file is changing in Unicode 3.2.0, you
  45 # will have to tweak this some for future use.  For now, the relevant
  46 # Unicode definition files are found in doc/unicode/.
  47 #
  48 # author Eric Blake <ebb9@email.byu.edu>
  49 #
  50 # usage: unicode-blocks.pl <blocks.txt>
  51 #    where <blocks.txt> is obtained from www.unicode.org (named Blocks-3.txt
  52 #    for Unicode version 3.0.0).
  53
  54
  55 die "Usage: $0 <blocks.txt>" unless @ARGV == 1;
  56 open (BLOCKS, $ARGV[0]) || die "Can't open Unicode block file: $!\n";
  57
  58 # A hash of added fields and the JDK they were added in, to automatically
  59 # print @since tags.  Maintaining this is optional (and tedious), but nice.
  60 my %additions = ("SYRIAC" => "1.4",
  61                  "THAANA" => "1.4",
  62                  "SINHALA" => "1.4",
  63                  "MYANMAR" => "1.4",
  64                  "ETHIOPIC" => "1.4",
  65                  "CHEROKEE" => "1.4",
  66                  "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS" => "1.4",
  67                  "OGHAM" => "1.4",
  68                  "RUNIC" => "1.4",
  69                  "KHMER" => "1.4",
  70                  "MONGOLIAN" => "1.4",
  71                  "BRAILLE_PATTERNS" => "1.4",
  72                  "CJK_RADICALS_SUPPLEMENT" => "1.4",
  73                  "KANGXI_RADICALS" => "1.4",
  74                  "IDEOGRAPHIC_DESCRIPTION_CHARACTERS" => "1.4",
  75                  "BOPOMOFO_EXTENDED" => "1.4",
  76                  "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A" => "1.4",
  77                  "YI_SYLLABLES" => "1.4",
  78                  "YI_RADICALS" => "1.4",
  79                  "CYRILLIC_SUPPLEMENTARY" => "1.5",
  80                  "TAGALOG" => "1.5",
  81                  "HANUNOO" => "1.5",
  82                  "BUHID" => "1.5",
  83                  "TAGBANWA" => "1.5",
  84                  "LIMBU" => "1.5",
  85                  "TAI_LE" => "1.5",
  86                  "KHMER_SYMBOLS" => "1.5",
  87                  "PHONETIC_EXTENSIONS" => "1.5",
  88                  "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A" => "1.5",
  89                  "SUPPLEMENTAL_ARROWS_A" => "1.5",
  90                  "SUPPLEMENTAL_ARROWS_B" => "1.5",
  91                  "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B" => "1.5",
  92                  "SUPPLEMENTAL_MATHEMATICAL_OPERATORS" => "1.5",
  93                  "MISCELLANEOUS_SYMBOLS_AND_ARROWS" => "1.5",
  94                  "KATAKANA_PHONETIC_EXTENSIONS" => "1.5",
  95                  "YIJING_HEXAGRAM_SYMBOLS" => "1.5",
  96                  "VARIATION_SELECTORS" => "1.5",
  97                  "LINEAR_B_SYLLABARY" => "1.5",
  98                  "LINEAR_B_IDEOGRAMS" => "1.5",
  99                  "AEGEAN_NUMBERS" => "1.5",
 100                  "OLD_ITALIC" => "1.5",
 101                  "GOTHIC" => "1.5",
 102                  "UGARITIC" => "1.5",
 103                  "DESERET" => "1.5",
 104                  "SHAVIAN" => "1.5",
 105                  "OSMANYA" => "1.5",
 106                  "CYPRIOT_SYLLABARY" => "1.5",
 107                  "BYZANTINE_MUSICAL_SYMBOLS" => "1.5",
 108                  "MUSICAL_SYMBOLS" => "1.5",
 109                  "TAI_XUAN_JING_SYMBOLS" => "1.5",
 110                  "MATHEMATICAL_ALPHANUMERIC_SYMBOLS" => "1.5",
 111                  "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B" => "1.5",
 112                  "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT" => "1.5",
 113                  "TAGS" => "1.5",
 114                  "VARIATION_SELECTORS_SUPPLEMENT" => "1.5",
 115                  "SUPPLEMENTARY_PRIVATE_USE_AREA_A" => "1.5",
 116                  "SUPPLEMENTARY_PRIVATE_USE_AREA_B" => "1.5",
 117                  "HIGH_SURROGATES" => "1.5",
 118                  "HIGH_PRIVATE_USE_SURROGATES" => "1.5",
 119                  "LOW_SURROGATES" => "1.5"
 120                  );
 121
 122 print <<'EOF';
 123   /**
 124    * A family of character subsets in the Unicode specification. A character
 125    * is in at most one of these blocks.
 126    *
 127    * This inner class was generated automatically from
 128    * <code>$ARGV[0]</code>, by some perl scripts.
 129    * This Unicode definition file can be found on the
 130    * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 131    * JDK 1.5 uses Unicode version 4.0.0.
 132    *
 133    * @author scripts/unicode-blocks.pl (written by Eric Blake)
 134    * @since 1.2
 135    */
 136   public static final class UnicodeBlock extends Subset
 137   {
 138     /** The start of the subset. */
 139     private final int start;
 140
 141     /** The end of the subset. */
 142     private final int end;
 143
 144     /** The canonical name of the block according to the Unicode standard. */
 145     private final String canonicalName;
 146
 147     /** Constants for the <code>forName()</code> method */
 148     private static final int CANONICAL_NAME = 0;
 149     private static final int NO_SPACES_NAME = 1;
 150     private static final int CONSTANT_NAME = 2;
 151
 152     /**
 153      * Constructor for strictly defined blocks.
 154      *
 155      * @param start the start character of the range
 156      * @param end the end character of the range
 157      * @param name the block name
 158      * @param canonicalName the name of the block as defined in the Unicode
 159      *        standard.
 160      */
 161     private UnicodeBlock(int start, int end, String name,
 162                          String canonicalName)
 163     {
 164       super(name);
 165       this.start = start;
 166       this.end = end;
 167       this.canonicalName = canonicalName;
 168     }
 169
 170     /**
 171      * Returns the Unicode character block which a character belongs to.
 172      * <strong>Note</strong>: This method does not support the use of
 173      * supplementary characters.  For such support, <code>of(int)</code>
 174      * should be used instead.
 175      *
 176      * @param ch the character to look up
 177      * @return the set it belongs to, or null if it is not in one
 178      */
 179     public static UnicodeBlock of(char ch)
 180     {
 181       return of((int) ch);
 182     }
 183
 184     /**
 185      * Returns the Unicode character block which a code point belongs to.
 186      *
 187      * @param codePoint the character to look up
 188      * @return the set it belongs to, or null if it is not in one.
 189      * @throws IllegalArgumentException if the specified code point is
 190      *         invalid.
 191      * @since 1.5
 192      */
 193     public static UnicodeBlock of(int codePoint)
 194     {
 195       if (codePoint > MAX_CODE_POINT)
 196         throw new IllegalArgumentException("The supplied integer value is " +
 197                                            "too large to be a codepoint.");
 198       // Simple binary search for the correct block.
 199       int low = 0;
 200       int hi = sets.length - 1;
 201       while (low <= hi)
 202         {
 203           int mid = (low + hi) >> 1;
 204           UnicodeBlock b = sets[mid];
 205           if (codePoint < b.start)
 206             hi = mid - 1;
 207           else if (codePoint > b.end)
 208             low = mid + 1;
 209           else
 210             return b;
 211         }
 212       return null;
 213     }
 214
 215     /**
 216      * <p>
 217      * Returns the <code>UnicodeBlock</code> with the given name, as defined
 218      * by the Unicode standard.  The version of Unicode in use is defined by
 219      * the <code>Character</code> class, and the names are given in the
 220      * <code>Blocks-<version>.txt</code> file corresponding to that version.
 221      * The name may be specified in one of three ways:
 222      * </p>
 223      * <ol>
 224      * <li>The canonical, human-readable name used by the Unicode standard.
 225      * This is the name with all spaces and hyphens retained.  For example,
 226      * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
 227      * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
 228      * <li>The name used for the constants specified by this class, which
 229      * is the canonical name with all spaces and hyphens replaced with
 230      * underscores e.g. `BASIC_LATIN'</li>
 231      * </ol>
 232      * <p>
 233      * The names are compared case-insensitively using the case comparison
 234      * associated with the U.S. English locale.  The method recognises the
 235      * previous names used for blocks as well as the current ones.  At
 236      * present, this simply means that the deprecated `SURROGATES_AREA'
 237      * will be recognised by this method (the <code>of()</code> methods
 238      * only return one of the three new surrogate blocks).
 239      * </p>
 240      *
 241      * @param blockName the name of the block to look up.
 242      * @return the specified block.
 243      * @throws NullPointerException if the <code>blockName</code> is
 244      *         <code>null</code>.
 245      * @throws IllegalArgumentException if the name does not match any Unicode
 246      *         block.
 247      * @since 1.5
 248      */
 249     public static final UnicodeBlock forName(String blockName)
 250     {
 251       int type;
 252       if (blockName.indexOf(' ') != -1)
 253         type = CANONICAL_NAME;
 254       else if (blockName.indexOf('_') != -1)
 255         type = CONSTANT_NAME;
 256       else
 257         type = NO_SPACES_NAME;
 258       Collator usCollator = Collator.getInstance(Locale.US);
 259       usCollator.setStrength(Collator.PRIMARY);
 260       /* Special case for deprecated blocks not in sets */
 261       switch (type)
 262       {
 263         case CANONICAL_NAME:
 264           if (usCollator.compare(blockName, "Surrogates Area") == 0)
 265             return SURROGATES_AREA;
 266           break;
 267         case NO_SPACES_NAME:
 268           if (usCollator.compare(blockName, "SurrogatesArea") == 0)
 269             return SURROGATES_AREA;
 270           break;
 271         case CONSTANT_NAME:
 272           if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
 273             return SURROGATES_AREA;
 274           break;
 275       }
 276       /* Other cases */
 277       int setLength = sets.length;
 278       switch (type)
 279       {
 280         case CANONICAL_NAME:
 281           for (int i = 0; i < setLength; i++)
 282             {
 283               UnicodeBlock block = sets[i];
 284               if (usCollator.compare(blockName, block.canonicalName) == 0)
 285                 return block;
 286             }
 287           break;
 288         case NO_SPACES_NAME:
 289           for (int i = 0; i < setLength; i++)
 290             {
 291               UnicodeBlock block = sets[i];
 292               String nsName = block.canonicalName.replaceAll(" ","");
 293               if (usCollator.compare(blockName, nsName) == 0)
 294                 return block;
 295             }
 296           break;
 297         case CONSTANT_NAME:
 298           for (int i = 0; i < setLength; i++)
 299             {
 300               UnicodeBlock block = sets[i];
 301               if (usCollator.compare(blockName, block.toString()) == 0)
 302                 return block;
 303             }
 304           break;
 305       }
 306       throw new IllegalArgumentException("No Unicode block found for " +
 307                                          blockName + ".");
 308     }
 309 EOF
 310
 311 my @names = ();
 312 while (<BLOCKS>) {
 313     next if /^\#/;
 314     my ($range, $block) = split(/; /);
 315     my ($start, $end) = split /\.\./, $range;
 316     next unless defined $block;
 317     chomp $block;
 318     $block =~ s/ *$//;
 319
 320     # Translate new Unicode names which have the old name in Java
 321     $block = "Greek" if $block =~ /Greek and Coptic/;
 322     $block = "Combining Marks for Symbols"
 323       if $block =~ /Combining Diacritical Marks for Symbols/;
 324
 325     (my $name = $block) =~ tr/a-z -/A-Z__/;
 326     push @names, $name;
 327     my $since = (defined $additions{$name}
 328                  ? "\n     * \@since $additions{$name}" : "");
 329     print <<EOF;
 330
 331     /**
 332      * $block.
 333      * 0x$start - 0x$end.$since
 334      */
 335     public static final UnicodeBlock $name
 336       = new UnicodeBlock(0x$start, 0x$end,
 337                          "$name",
 338                          "$block");
 339 EOF
 340 }
 341
 342 print <<EOF;
 343
 344     /**
 345      * Surrogates Area.
 346      * '\uD800' - '\uDFFF'.
 347      * \@deprecated As of 1.5, the three areas,
 348      * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>,
 349      * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a>
 350      * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined
 351      * by the Unicode standard, should be used in preference to
 352      * this.  These are also returned from calls to <code>of(int)</code>
 353      * and <code>of(char)</code>.
 354      */
 355     \@Deprecated
 356     public static final UnicodeBlock SURROGATES_AREA
 357       = new UnicodeBlock(0xD800, 0xDFFF,
 358                          "SURROGATES_AREA",
 359                          "Surrogates Area");
 360
 361     /**
 362      * The defined subsets.
 363      */
 364     private static final UnicodeBlock sets[] = {
 365 EOF
 366
 367 foreach (@names) {
 368     print "      $_,\n";
 369 }
 370
 371 print <<EOF;
 372     };
 373   } // class UnicodeBlock
 374 EOF