libjava/classpath/scripts/unicode-blocks.pl

   1 #!/usr/bin/perl -w
   2 # unicode-blocks.pl -- Script to generate java.lang.Character.UnicodeBlock
   3 # Copyright (C) 2002, 2004 Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20 # 02110-1301 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39
  40 # Code for reading Blocks.txt and generating (to standard out) the code for
  41 # java.lang.Character.UnicodeBlock, for pasting into java/lang/Character.java.
  42 # You should probably check that the results are accurate to the
  43 # specification, but I made sure it works OOB for Unicode 3.0.0 and JDK 1.4.
  44 # As the grammar for the Blocks.txt file is changing in Unicode 3.2.0, you
  45 # will have to tweak this some for future use.  For now, the relevant
  46 # Unicode definition files are found in doc/unicode/.
  47 #
  48 # author Eric Blake <ebb9@email.byu.edu>
  49 #
  50 # usage: unicode-blocks.pl <blocks.txt>
  51 #    where <blocks.txt> is obtained from www.unicode.org (named Blocks-3.txt
  52 #    for Unicode version 3.0.0).
  53
  54
  use strict;
  use warnings;

  # Exactly one argument is expected: the path of the Unicode Blocks.txt file.
  # The trailing newline keeps Perl from appending "at ... line N" to what is
  # a plain usage message.
  die "Usage: $0 <blocks.txt>\n" unless @ARGV == 1;

  # Three-argument open: the file name comes straight from the command line,
  # and the two-argument form would interpret leading/trailing mode or pipe
  # characters in it ("> file", "cmd |") instead of treating it as a path.
  # The BLOCKS handle is read by the block-generating loop further down.
  open(BLOCKS, '<', $ARGV[0]) or die "Can't open Unicode block file: $!\n";
  57
  # Maps the Java constant name of a block to the JDK release that first
  # shipped it.  The generator looks names up here to decide whether to add an
  # @since tag to a constant's Javadoc.  Keeping the table current is optional
  # (and tedious), but nice.
  my %additions = (
      # Blocks that first appeared in JDK 1.4.
      (map { $_ => "1.4" } qw(
          SYRIAC
          THAANA
          SINHALA
          MYANMAR
          ETHIOPIC
          CHEROKEE
          UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
          OGHAM
          RUNIC
          KHMER
          MONGOLIAN
          BRAILLE_PATTERNS
          CJK_RADICALS_SUPPLEMENT
          KANGXI_RADICALS
          IDEOGRAPHIC_DESCRIPTION_CHARACTERS
          BOPOMOFO_EXTENDED
          CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
          YI_SYLLABLES
          YI_RADICALS
      )),
      # Blocks that first appeared in JDK 1.5.
      (map { $_ => "1.5" } qw(
          CYRILLIC_SUPPLEMENTARY
          TAGALOG
          HANUNOO
          BUHID
          TAGBANWA
          LIMBU
          TAI_LE
          KHMER_SYMBOLS
          PHONETIC_EXTENSIONS
          MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
          SUPPLEMENTAL_ARROWS_A
          SUPPLEMENTAL_ARROWS_B
          MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
          SUPPLEMENTAL_MATHEMATICAL_OPERATORS
          MISCELLANEOUS_SYMBOLS_AND_ARROWS
          KATAKANA_PHONETIC_EXTENSIONS
          YIJING_HEXAGRAM_SYMBOLS
          VARIATION_SELECTORS
          LINEAR_B_SYLLABARY
          LINEAR_B_IDEOGRAMS
          AEGEAN_NUMBERS
          OLD_ITALIC
          GOTHIC
          UGARITIC
          DESERET
          SHAVIAN
          OSMANYA
          CYPRIOT_SYLLABARY
          BYZANTINE_MUSICAL_SYMBOLS
          MUSICAL_SYMBOLS
          TAI_XUAN_JING_SYMBOLS
          MATHEMATICAL_ALPHANUMERIC_SYMBOLS
          CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
          CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
          TAGS
          VARIATION_SELECTORS_SUPPLEMENT
          SUPPLEMENTARY_PRIVATE_USE_AREA_A
          SUPPLEMENTARY_PRIVATE_USE_AREA_B
          HIGH_SURROGATES
          HIGH_PRIVATE_USE_SURROGATES
          LOW_SURROGATES
      )),
  );
 121
 # Emit the fixed part of the generated class: the class Javadoc, the fields,
 # the constructor and the of()/forName() lookup methods.
 #
 # The Java text is kept in non-interpolating heredocs so that Javadoc tags
 # (@param, @since, ...) need no escaping.  The one line that must name the
 # input file is printed separately from an interpolating string; inside a
 # <<'EOF' heredoc "$ARGV[0]" would be copied into the Javadoc literally.
 print <<'EOF';
  /**
   * A family of character subsets in the Unicode specification. A character
   * is in at most one of these blocks.
   *
   * This inner class was generated automatically from
EOF
 print "   * <code>$ARGV[0]</code>, by some perl scripts.\n";
 print <<'EOF';
   * This Unicode definition file can be found on the
   * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
   * JDK 1.5 uses Unicode version 4.0.0.
   *
   * @author scripts/unicode-blocks.pl (written by Eric Blake)
   * @since 1.2
   */
  public static final class UnicodeBlock extends Subset
  {
    /** The start of the subset. */
    private final int start;

    /** The end of the subset. */
    private final int end;

    /** The canonical name of the block according to the Unicode standard. */
    private final String canonicalName;

    /** Enumeration for the <code>forName()</code> method */
    private enum NameType { CANONICAL, NO_SPACES, CONSTANT; };

    /**
     * Constructor for strictly defined blocks.
     *
     * @param start the start character of the range
     * @param end the end character of the range
     * @param name the block name
     * @param canonicalName the name of the block as defined in the Unicode
     *        standard.
     */
    private UnicodeBlock(int start, int end, String name,
                         String canonicalName)
    {
      super(name);
      this.start = start;
      this.end = end;
      this.canonicalName = canonicalName;
    }

    /**
     * Returns the Unicode character block which a character belongs to.
     * <strong>Note</strong>: This method does not support the use of
     * supplementary characters.  For such support, <code>of(int)</code>
     * should be used instead.
     *
     * @param ch the character to look up
     * @return the set it belongs to, or null if it is not in one
     */
    public static UnicodeBlock of(char ch)
    {
      return of((int) ch);
    }

    /**
     * Returns the Unicode character block which a code point belongs to.
     *
     * @param codePoint the character to look up
     * @return the set it belongs to, or null if it is not in one.
     * @throws IllegalArgumentException if the specified code point is
     *         invalid.
     * @since 1.5
     */
    public static UnicodeBlock of(int codePoint)
    {
      if (codePoint > MAX_CODE_POINT)
        throw new IllegalArgumentException("The supplied integer value is " +
                                           "too large to be a codepoint.");
      // Simple binary search for the correct block.
      int low = 0;
      int hi = sets.length - 1;
      while (low <= hi)
        {
          int mid = (low + hi) >> 1;
          UnicodeBlock b = sets[mid];
          if (codePoint < b.start)
            hi = mid - 1;
          else if (codePoint > b.end)
            low = mid + 1;
          else
            return b;
        }
      return null;
    }

    /**
     * <p>
     * Returns the <code>UnicodeBlock</code> with the given name, as defined
     * by the Unicode standard.  The version of Unicode in use is defined by
     * the <code>Character</code> class, and the names are given in the
     * <code>Blocks-<version>.txt</code> file corresponding to that version.
     * The name may be specified in one of three ways:
     * </p>
     * <ol>
     * <li>The canonical, human-readable name used by the Unicode standard.
     * This is the name with all spaces and hyphens retained.  For example,
     * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
     * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
     * <li>The name used for the constants specified by this class, which
     * is the canonical name with all spaces and hyphens replaced with
     * underscores e.g. `BASIC_LATIN'</li>
     * </ol>
     * <p>
     * The names are compared case-insensitively using the case comparison
     * associated with the U.S. English locale.  The method recognises the
     * previous names used for blocks as well as the current ones.  At
     * present, this simply means that the deprecated `SURROGATES_AREA'
     * will be recognised by this method (the <code>of()</code> methods
     * only return one of the three new surrogate blocks).
     * </p>
     *
     * @param blockName the name of the block to look up.
     * @return the specified block.
     * @throws NullPointerException if the <code>blockName</code> is
     *         <code>null</code>.
     * @throws IllegalArgumentException if the name does not match any Unicode
     *         block.
     * @since 1.5
     */
    public static final UnicodeBlock forName(String blockName)
    {
      NameType type;
      if (blockName.indexOf(' ') != -1)
        type = NameType.CANONICAL;
      else if (blockName.indexOf('_') != -1)
        type = NameType.CONSTANT;
      else
        type = NameType.NO_SPACES;
      Collator usCollator = Collator.getInstance(Locale.US);
      usCollator.setStrength(Collator.PRIMARY);
      /* Special case for deprecated blocks not in sets */
      switch (type)
      {
        case CANONICAL:
          if (usCollator.compare(blockName, "Surrogates Area") == 0)
            return SURROGATES_AREA;
          break;
        case NO_SPACES:
          if (usCollator.compare(blockName, "SurrogatesArea") == 0)
            return SURROGATES_AREA;
          break;
        case CONSTANT:
          if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
            return SURROGATES_AREA;
          break;
      }
      /* Other cases */
      switch (type)
      {
        case CANONICAL:
          for (UnicodeBlock block : sets)
            if (usCollator.compare(blockName, block.canonicalName) == 0)
              return block;
          break;
        case NO_SPACES:
          for (UnicodeBlock block : sets)
            {
              String nsName = block.canonicalName.replaceAll(" ","");
              if (usCollator.compare(blockName, nsName) == 0)
                return block;
            }
          break;
        case CONSTANT:
          for (UnicodeBlock block : sets)
            if (usCollator.compare(blockName, block.toString()) == 0)
              return block;
          break;
      }
      throw new IllegalArgumentException("No Unicode block found for " +
                                         blockName + ".");
    }
EOF
 300
 # Constant names in the order they appear in the input file.  They are
 # printed again later as the contents of the generated sets[] array, which
 # the generated of(int) method binary-searches, so file order is preserved.
 my @names = ();

 # Read the block file from the BLOCKS handle; each data line is expected to
 # look like "<start>..<end>; <Block Name>".  One public static final
 # UnicodeBlock constant (with Javadoc) is printed per data line.
 while (my $line = <BLOCKS>) {
     # Comment lines start with '#'.
     next if $line =~ /^\#/;

     my ($range, $block) = split /; /, $line;
     # Blank lines and anything without a "; name" part carry no block.
     next unless defined $block;

     my ($start, $end) = split /\.\./, $range;
     unless (defined $end) {
         # Without a "start..end" range the constant could not be emitted
         # correctly; report the line instead of printing broken Java.
         warn "$0: skipping line $. (no start..end range): $line";
         next;
     }

     # Strip the line terminator and any other trailing whitespace.  Using
     # \s (rather than chomp plus a space-only trim) also removes the "\r"
     # of CRLF-terminated input, which would otherwise end up inside the
     # generated identifier and string literal.
     $block =~ s/\s+$//;

     # Translate new Unicode names which have the old name in Java
     $block = "Greek" if $block =~ /Greek and Coptic/;
     $block = "Combining Marks for Symbols"
       if $block =~ /Combining Diacritical Marks for Symbols/;

     # Java constant name: upper-case, with spaces and hyphens turned into
     # underscores.
     (my $name = $block) =~ tr/a-z -/A-Z__/;
     push @names, $name;

     # Extra Javadoc line for blocks listed in %additions, empty otherwise.
     my $since = (defined $additions{$name}
                  ? "\n     * \@since $additions{$name}" : "");
     print <<EOF;

    /**
     * $block.
     * 0x$start - 0x$end.$since
     */
    public static final UnicodeBlock $name
      = new UnicodeBlock(0x$start, 0x$end,
                         "$name",
                         "$block");
EOF
 }
 331
 # Emit the deprecated SURROGATES_AREA constant, which does not come from the
 # block file, followed by the opening of the sets[] array declaration.
 #
 # A non-interpolating heredoc is used on purpose: in an interpolating one,
 # "\u" is Perl's "upper-case the next character" escape, so a Javadoc line
 # written as '\uD800' - '\uDFFF' is silently printed as 'D800' - 'DFFF'.
 # The range is therefore written in the same 0x.... form that the per-block
 # Javadoc uses, and the Javadoc/annotation '@' signs need no escaping.
 print <<'EOF';

    /**
     * Surrogates Area.
     * 0xD800 - 0xDFFF.
     * @deprecated As of 1.5, the three areas,
     * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>,
     * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a>
     * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined
     * by the Unicode standard, should be used in preference to
     * this.  These are also returned from calls to <code>of(int)</code>
     * and <code>of(char)</code>.
     */
    @Deprecated
    public static final UnicodeBlock SURROGATES_AREA
      = new UnicodeBlock(0xD800, 0xDFFF,
                         "SURROGATES_AREA",
                         "Surrogates Area");

    /**
     * The defined subsets.
     */
    private static final UnicodeBlock sets[] = {
EOF
 356
 # Fill the sets[] initializer with one constant name per line, in the order
 # the names were collected in @names.
 print map { "      $_,\n" } @names;

 # Close the array initializer and the generated class.
 print "    };\n",
       "  } // class UnicodeBlock\n";