libjava/java/lang/Character.java

   1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
   2    Copyright (C) 1998, 1999, 2001, 2002, 2005, 2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 /*
  39  * Note: This class must not be merged with Classpath.  Gcj uses C-style
  40  * arrays (see include/java-chartables.h) to store the Unicode character
  41  * database, whereas Classpath uses Java objects (char[] extracted from
  42  * String constants) in gnu.java.lang.CharData.  Gcj's approach is more
  43  * efficient, because there is no vtable or data relocation to worry about.
  44  * However, despite the difference in the database interface, the two
  45  * versions share identical algorithms.
  46  */
  47
  48 package java.lang;
  49
  50 import java.io.Serializable;
  51 import java.text.Collator;
  52 import java.util.Locale;
  53
  54 /**
  55  * Wrapper class for the primitive char data type.  In addition, this class
  56  * allows one to retrieve property information and perform transformations
  57  * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0.
  58  * java.lang.Character is designed to be very dynamic, and as such, it
  59  * retrieves information on the Unicode character set from a separate
  60  * database, gnu.java.lang.CharData, which can be easily upgraded.
  61  *
  62  * <p>For predicates, boundaries are used to describe
  63  * the set of characters for which the method will return true.
  64  * This syntax uses fairly normal regular expression notation.
  65  * See 5.13 of the Unicode Standard, Version 3.0, for the
  66  * boundary specification.
  67  *
  68  * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
  69  * for more information on the Unicode Standard.
  70  *
  71  * @author Tom Tromey (tromey@cygnus.com)
  72  * @author Paul N. Fisher
  73  * @author Jochen Hoenicke
  74  * @author Eric Blake (ebb9@email.byu.edu)
  75  * @since 1.0
  76  * @status updated to 1.4
  77  */
  78 public final class Character implements Serializable, Comparable
  79 {
  80   /**
  81    * A subset of Unicode blocks.
  82    *
  83    * @author Paul N. Fisher
  84    * @author Eric Blake (ebb9@email.byu.edu)
  85    * @since 1.2
  86    */
  87   public static class Subset
  88   {
  89     /** The name of the subset. */
  90     private final String name;
  91
  92     /**
  93      * Construct a new subset of characters.
  94      *
  95      * @param name the name of the subset
  96      * @throws NullPointerException if name is null
  97      */
  98     protected Subset(String name)
  99     {
 100       // Note that name.toString() is name, unless name was null.
 101       this.name = name.toString();
 102     }
 103
 104     /**
 105      * Compares two Subsets for equality. This is <code>final</code>, and
 106      * restricts the comparison on the <code>==</code> operator, so it returns
 107      * true only for the same object.
 108      *
 109      * @param o the object to compare
 110      * @return true if o is this
 111      */
 112     public final boolean equals(Object o)
 113     {
 114       return o == this;
 115     }
 116
 117     /**
 118      * Makes the original hashCode of Object final, to be consistent with
 119      * equals.
 120      *
 121      * @return the hash code for this object
 122      */
 123     public final int hashCode()
 124     {
 125       return super.hashCode();
 126     }
 127
 128     /**
 129      * Returns the name of the subset.
 130      *
 131      * @return the name
 132      */
 133     public final String toString()
 134     {
 135       return name;
 136     }
 137   } // class Subset
 138
 139   /**
 140    * A family of character subsets in the Unicode specification. A character
 141    * is in at most one of these blocks.
 142    *
 143    * This inner class was generated automatically from
 144    * <code>libjava/gnu/gcj/convert/Blocks-3.txt</code>, by some perl scripts.
 145    * This Unicode definition file can be found on the
 146    * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 147    * JDK 1.4 uses Unicode version 3.0.0.
 148    *
 149    * @author scripts/unicode-blocks.pl (written by Eric Blake)
 150    * @since 1.2
 151    */
 152   public static final class UnicodeBlock extends Subset
 153   {
 154     /** The start of the subset. */
 155     private final int start;
 156
 157     /** The end of the subset. */
 158     private final int end;
 159
 160     /** The canonical name of the block according to the Unicode standard. */
 161     private final String canonicalName;
 162
 163     /** Constants for the <code>forName()</code> method */
 164     private static final int CANONICAL_NAME = 0;
 165     private static final int NO_SPACES_NAME = 1;
 166     private static final int CONSTANT_NAME = 2;
 167
 168     /**
 169      * Constructor for strictly defined blocks.
 170      *
 171      * @param start the start character of the range
 172      * @param end the end character of the range
 173      * @param name the block name
 174      */
 175     private UnicodeBlock(int start, int end, String name,
 176              String canonicalName)
 177     {
 178       super(name);
 179       this.start = start;
 180       this.end = end;
 181       this.canonicalName = canonicalName;
 182     }
 183
 184     /**
 185      * Returns the Unicode character block which a character belongs to.
 186      * <strong>Note</strong>: This method does not support the use of
 187      * supplementary characters.  For such support, <code>of(int)</code>
 188      * should be used instead.
 189      *
 190      * @param ch the character to look up
 191      * @return the set it belongs to, or null if it is not in one
 192      */
 193     public static UnicodeBlock of(char ch)
 194     {
 195       return of((int) ch);
 196     }
 197
 198     /**
 199      * Returns the Unicode character block which a code point belongs to.
 200      *
 201      * @param codePoint the character to look up
 202      * @return the set it belongs to, or null if it is not in one.
 203      * @throws IllegalArgumentException if the specified code point is
 204      *         invalid.
 205      * @since 1.5
 206      */
 207     public static UnicodeBlock of(int codePoint)
 208     {
 209       if (codePoint > MAX_CODE_POINT)
 210     throw new IllegalArgumentException("The supplied integer value is " +
 211                        "too large to be a codepoint.");
 212       // Simple binary search for the correct block.
 213       int low = 0;
 214       int hi = sets.length - 1;
 215       while (low <= hi)
 216         {
 217           int mid = (low + hi) >> 1;
 218           UnicodeBlock b = sets[mid];
 219           if (codePoint < b.start)
 220             hi = mid - 1;
 221           else if (codePoint > b.end)
 222             low = mid + 1;
 223           else
 224             return b;
 225         }
 226       return null;
 227     }
 228
 229     /**
 230      * <p>
 231      * Returns the <code>UnicodeBlock</code> with the given name, as defined
 232      * by the Unicode standard.  The version of Unicode in use is defined by
 233      * the <code>Character</code> class, and the names are given in the
 234      * <code>Blocks-<version>.txt</code> file corresponding to that version.
 235      * The name may be specified in one of three ways:
 236      * </p>
 237      * <ol>
 238      * <li>The canonical, human-readable name used by the Unicode standard.
 239      * This is the name with all spaces and hyphens retained.  For example,
 240      * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
 241      * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
 242      * <li>The name used for the constants specified by this class, which
 243      * is the canonical name with all spaces and hyphens replaced with
 244      * underscores e.g. `BASIC_LATIN'</li>
 245      * </ol>
 246      * <p>
 247      * The names are compared case-insensitively using the case comparison
 248      * associated with the U.S. English locale.  The method recognises the
 249      * previous names used for blocks as well as the current ones.  At
 250      * present, this simply means that the deprecated `SURROGATES_AREA'
 251      * will be recognised by this method (the <code>of()</code> methods
 252      * only return one of the three new surrogate blocks).
 253      * </p>
 254      *
 255      * @param blockName the name of the block to look up.
 256      * @return the specified block.
 257      * @throws NullPointerException if the <code>blockName</code> is
 258      *         <code>null</code>.
 259      * @throws IllegalArgumentException if the name does not match any Unicode
 260      *         block.
 261      * @since 1.5
 262      */
 263     public static final UnicodeBlock forName(String blockName)
 264     {
 265       int type;
 266       if (blockName.indexOf(' ') != -1)
 267         type = CANONICAL_NAME;
 268       else if (blockName.indexOf('_') != -1)
 269         type = CONSTANT_NAME;
 270       else
 271         type = NO_SPACES_NAME;
 272       Collator usCollator = Collator.getInstance(Locale.US);
 273       usCollator.setStrength(Collator.PRIMARY);
 274       /* Special case for deprecated blocks not in sets */
 275       switch (type)
 276       {
 277         case CANONICAL_NAME:
 278           if (usCollator.compare(blockName, "Surrogates Area") == 0)
 279             return SURROGATES_AREA;
 280           break;
 281         case NO_SPACES_NAME:
 282           if (usCollator.compare(blockName, "SurrogatesArea") == 0)
 283             return SURROGATES_AREA;
 284           break;
 285         case CONSTANT_NAME:
 286           if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
 287             return SURROGATES_AREA;
 288           break;
 289       }
 290       /* Other cases */
 291       int setLength = sets.length;
 292       switch (type)
 293       {
 294         case CANONICAL_NAME:
 295           for (int i = 0; i < setLength; i++)
 296             {
 297               UnicodeBlock block = sets[i];
 298               if (usCollator.compare(blockName, block.canonicalName) == 0)
 299                 return block;
 300             }
 301           break;
 302         case NO_SPACES_NAME:
 303           for (int i = 0; i < setLength; i++)
 304             {
 305               UnicodeBlock block = sets[i];
 306               String nsName = block.canonicalName.replaceAll(" ","");
 307               if (usCollator.compare(blockName, nsName) == 0)
 308                 return block;
 309             }
 310           break;
 311         case CONSTANT_NAME:
 312           for (int i = 0; i < setLength; i++)
 313             {
 314               UnicodeBlock block = sets[i];
 315               if (usCollator.compare(blockName, block.toString()) == 0)
 316                 return block;
 317             }
 318           break;
 319       }
 320       throw new IllegalArgumentException("No Unicode block found for " +
 321                                          blockName + ".");
 322     }
 323
 324     /**
 325      * Basic Latin.
 326      * 0x0000 - 0x007F.
 327      */
 328     public static final UnicodeBlock BASIC_LATIN
 329       = new UnicodeBlock(0x0000, 0x007F,
 330                          "BASIC_LATIN",
 331                          "Basic Latin");
 332
 333     /**
 334      * Latin-1 Supplement.
 335      * 0x0080 - 0x00FF.
 336      */
 337     public static final UnicodeBlock LATIN_1_SUPPLEMENT
 338       = new UnicodeBlock(0x0080, 0x00FF,
 339                          "LATIN_1_SUPPLEMENT",
 340                          "Latin-1 Supplement");
 341
 342     /**
 343      * Latin Extended-A.
 344      * 0x0100 - 0x017F.
 345      */
 346     public static final UnicodeBlock LATIN_EXTENDED_A
 347       = new UnicodeBlock(0x0100, 0x017F,
 348                          "LATIN_EXTENDED_A",
 349                          "Latin Extended-A");
 350
 351     /**
 352      * Latin Extended-B.
 353      * 0x0180 - 0x024F.
 354      */
 355     public static final UnicodeBlock LATIN_EXTENDED_B
 356       = new UnicodeBlock(0x0180, 0x024F,
 357                          "LATIN_EXTENDED_B",
 358                          "Latin Extended-B");
 359
 360     /**
 361      * IPA Extensions.
 362      * 0x0250 - 0x02AF.
 363      */
 364     public static final UnicodeBlock IPA_EXTENSIONS
 365       = new UnicodeBlock(0x0250, 0x02AF,
 366                          "IPA_EXTENSIONS",
 367                          "IPA Extensions");
 368
 369     /**
 370      * Spacing Modifier Letters.
 371      * 0x02B0 - 0x02FF.
 372      */
 373     public static final UnicodeBlock SPACING_MODIFIER_LETTERS
 374       = new UnicodeBlock(0x02B0, 0x02FF,
 375                          "SPACING_MODIFIER_LETTERS",
 376                          "Spacing Modifier Letters");
 377
 378     /**
 379      * Combining Diacritical Marks.
 380      * 0x0300 - 0x036F.
 381      */
 382     public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
 383       = new UnicodeBlock(0x0300, 0x036F,
 384                          "COMBINING_DIACRITICAL_MARKS",
 385                          "Combining Diacritical Marks");
 386
 387     /**
 388      * Greek.
 389      * 0x0370 - 0x03FF.
 390      */
 391     public static final UnicodeBlock GREEK
 392       = new UnicodeBlock(0x0370, 0x03FF,
 393                          "GREEK",
 394                          "Greek");
 395
 396     /**
 397      * Cyrillic.
 398      * 0x0400 - 0x04FF.
 399      */
 400     public static final UnicodeBlock CYRILLIC
 401       = new UnicodeBlock(0x0400, 0x04FF,
 402                          "CYRILLIC",
 403                          "Cyrillic");
 404
 405     /**
 406      * Cyrillic Supplementary.
 407      * 0x0500 - 0x052F.
 408      * @since 1.5
 409      */
 410     public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
 411       = new UnicodeBlock(0x0500, 0x052F,
 412                          "CYRILLIC_SUPPLEMENTARY",
 413                          "Cyrillic Supplementary");
 414
 415     /**
 416      * Armenian.
 417      * 0x0530 - 0x058F.
 418      */
 419     public static final UnicodeBlock ARMENIAN
 420       = new UnicodeBlock(0x0530, 0x058F,
 421                          "ARMENIAN",
 422                          "Armenian");
 423
 424     /**
 425      * Hebrew.
 426      * 0x0590 - 0x05FF.
 427      */
 428     public static final UnicodeBlock HEBREW
 429       = new UnicodeBlock(0x0590, 0x05FF,
 430                          "HEBREW",
 431                          "Hebrew");
 432
 433     /**
 434      * Arabic.
 435      * 0x0600 - 0x06FF.
 436      */
 437     public static final UnicodeBlock ARABIC
 438       = new UnicodeBlock(0x0600, 0x06FF,
 439                          "ARABIC",
 440                          "Arabic");
 441
 442     /**
 443      * Syriac.
 444      * 0x0700 - 0x074F.
 445      * @since 1.4
 446      */
 447     public static final UnicodeBlock SYRIAC
 448       = new UnicodeBlock(0x0700, 0x074F,
 449                          "SYRIAC",
 450                          "Syriac");
 451
 452     /**
 453      * Thaana.
 454      * 0x0780 - 0x07BF.
 455      * @since 1.4
 456      */
 457     public static final UnicodeBlock THAANA
 458       = new UnicodeBlock(0x0780, 0x07BF,
 459                          "THAANA",
 460                          "Thaana");
 461
 462     /**
 463      * Devanagari.
 464      * 0x0900 - 0x097F.
 465      */
 466     public static final UnicodeBlock DEVANAGARI
 467       = new UnicodeBlock(0x0900, 0x097F,
 468                          "DEVANAGARI",
 469                          "Devanagari");
 470
 471     /**
 472      * Bengali.
 473      * 0x0980 - 0x09FF.
 474      */
 475     public static final UnicodeBlock BENGALI
 476       = new UnicodeBlock(0x0980, 0x09FF,
 477                          "BENGALI",
 478                          "Bengali");
 479
 480     /**
 481      * Gurmukhi.
 482      * 0x0A00 - 0x0A7F.
 483      */
 484     public static final UnicodeBlock GURMUKHI
 485       = new UnicodeBlock(0x0A00, 0x0A7F,
 486                          "GURMUKHI",
 487                          "Gurmukhi");
 488
 489     /**
 490      * Gujarati.
 491      * 0x0A80 - 0x0AFF.
 492      */
 493     public static final UnicodeBlock GUJARATI
 494       = new UnicodeBlock(0x0A80, 0x0AFF,
 495                          "GUJARATI",
 496                          "Gujarati");
 497
 498     /**
 499      * Oriya.
 500      * 0x0B00 - 0x0B7F.
 501      */
 502     public static final UnicodeBlock ORIYA
 503       = new UnicodeBlock(0x0B00, 0x0B7F,
 504                          "ORIYA",
 505                          "Oriya");
 506
 507     /**
 508      * Tamil.
 509      * 0x0B80 - 0x0BFF.
 510      */
 511     public static final UnicodeBlock TAMIL
 512       = new UnicodeBlock(0x0B80, 0x0BFF,
 513                          "TAMIL",
 514                          "Tamil");
 515
 516     /**
 517      * Telugu.
 518      * 0x0C00 - 0x0C7F.
 519      */
 520     public static final UnicodeBlock TELUGU
 521       = new UnicodeBlock(0x0C00, 0x0C7F,
 522                          "TELUGU",
 523                          "Telugu");
 524
 525     /**
 526      * Kannada.
 527      * 0x0C80 - 0x0CFF.
 528      */
 529     public static final UnicodeBlock KANNADA
 530       = new UnicodeBlock(0x0C80, 0x0CFF,
 531                          "KANNADA",
 532                          "Kannada");
 533
 534     /**
 535      * Malayalam.
 536      * 0x0D00 - 0x0D7F.
 537      */
 538     public static final UnicodeBlock MALAYALAM
 539       = new UnicodeBlock(0x0D00, 0x0D7F,
 540                          "MALAYALAM",
 541                          "Malayalam");
 542
 543     /**
 544      * Sinhala.
 545      * 0x0D80 - 0x0DFF.
 546      * @since 1.4
 547      */
 548     public static final UnicodeBlock SINHALA
 549       = new UnicodeBlock(0x0D80, 0x0DFF,
 550                          "SINHALA",
 551                          "Sinhala");
 552
 553     /**
 554      * Thai.
 555      * 0x0E00 - 0x0E7F.
 556      */
 557     public static final UnicodeBlock THAI
 558       = new UnicodeBlock(0x0E00, 0x0E7F,
 559                          "THAI",
 560                          "Thai");
 561
 562     /**
 563      * Lao.
 564      * 0x0E80 - 0x0EFF.
 565      */
 566     public static final UnicodeBlock LAO
 567       = new UnicodeBlock(0x0E80, 0x0EFF,
 568                          "LAO",
 569                          "Lao");
 570
 571     /**
 572      * Tibetan.
 573      * 0x0F00 - 0x0FFF.
 574      */
 575     public static final UnicodeBlock TIBETAN
 576       = new UnicodeBlock(0x0F00, 0x0FFF,
 577                          "TIBETAN",
 578                          "Tibetan");
 579
 580     /**
 581      * Myanmar.
 582      * 0x1000 - 0x109F.
 583      * @since 1.4
 584      */
 585     public static final UnicodeBlock MYANMAR
 586       = new UnicodeBlock(0x1000, 0x109F,
 587                          "MYANMAR",
 588                          "Myanmar");
 589
 590     /**
 591      * Georgian.
 592      * 0x10A0 - 0x10FF.
 593      */
 594     public static final UnicodeBlock GEORGIAN
 595       = new UnicodeBlock(0x10A0, 0x10FF,
 596                          "GEORGIAN",
 597                          "Georgian");
 598
 599     /**
 600      * Hangul Jamo.
 601      * 0x1100 - 0x11FF.
 602      */
 603     public static final UnicodeBlock HANGUL_JAMO
 604       = new UnicodeBlock(0x1100, 0x11FF,
 605                          "HANGUL_JAMO",
 606                          "Hangul Jamo");
 607
 608     /**
 609      * Ethiopic.
 610      * 0x1200 - 0x137F.
 611      * @since 1.4
 612      */
 613     public static final UnicodeBlock ETHIOPIC
 614       = new UnicodeBlock(0x1200, 0x137F,
 615                          "ETHIOPIC",
 616                          "Ethiopic");
 617
 618     /**
 619      * Cherokee.
 620      * 0x13A0 - 0x13FF.
 621      * @since 1.4
 622      */
 623     public static final UnicodeBlock CHEROKEE
 624       = new UnicodeBlock(0x13A0, 0x13FF,
 625                          "CHEROKEE",
 626                          "Cherokee");
 627
 628     /**
 629      * Unified Canadian Aboriginal Syllabics.
 630      * 0x1400 - 0x167F.
 631      * @since 1.4
 632      */
 633     public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
 634       = new UnicodeBlock(0x1400, 0x167F,
 635                          "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
 636                          "Unified Canadian Aboriginal Syllabics");
 637
 638     /**
 639      * Ogham.
 640      * 0x1680 - 0x169F.
 641      * @since 1.4
 642      */
 643     public static final UnicodeBlock OGHAM
 644       = new UnicodeBlock(0x1680, 0x169F,
 645                          "OGHAM",
 646                          "Ogham");
 647
 648     /**
 649      * Runic.
 650      * 0x16A0 - 0x16FF.
 651      * @since 1.4
 652      */
 653     public static final UnicodeBlock RUNIC
 654       = new UnicodeBlock(0x16A0, 0x16FF,
 655                          "RUNIC",
 656                          "Runic");
 657
 658     /**
 659      * Tagalog.
 660      * 0x1700 - 0x171F.
 661      * @since 1.5
 662      */
 663     public static final UnicodeBlock TAGALOG
 664       = new UnicodeBlock(0x1700, 0x171F,
 665                          "TAGALOG",
 666                          "Tagalog");
 667
 668     /**
 669      * Hanunoo.
 670      * 0x1720 - 0x173F.
 671      * @since 1.5
 672      */
 673     public static final UnicodeBlock HANUNOO
 674       = new UnicodeBlock(0x1720, 0x173F,
 675                          "HANUNOO",
 676                          "Hanunoo");
 677
 678     /**
 679      * Buhid.
 680      * 0x1740 - 0x175F.
 681      * @since 1.5
 682      */
 683     public static final UnicodeBlock BUHID
 684       = new UnicodeBlock(0x1740, 0x175F,
 685                          "BUHID",
 686                          "Buhid");
 687
 688     /**
 689      * Tagbanwa.
 690      * 0x1760 - 0x177F.
 691      * @since 1.5
 692      */
 693     public static final UnicodeBlock TAGBANWA
 694       = new UnicodeBlock(0x1760, 0x177F,
 695                          "TAGBANWA",
 696                          "Tagbanwa");
 697
 698     /**
 699      * Khmer.
 700      * 0x1780 - 0x17FF.
 701      * @since 1.4
 702      */
 703     public static final UnicodeBlock KHMER
 704       = new UnicodeBlock(0x1780, 0x17FF,
 705                          "KHMER",
 706                          "Khmer");
 707
 708     /**
 709      * Mongolian.
 710      * 0x1800 - 0x18AF.
 711      * @since 1.4
 712      */
 713     public static final UnicodeBlock MONGOLIAN
 714       = new UnicodeBlock(0x1800, 0x18AF,
 715                          "MONGOLIAN",
 716                          "Mongolian");
 717
 718     /**
 719      * Limbu.
 720      * 0x1900 - 0x194F.
 721      * @since 1.5
 722      */
 723     public static final UnicodeBlock LIMBU
 724       = new UnicodeBlock(0x1900, 0x194F,
 725                          "LIMBU",
 726                          "Limbu");
 727
 728     /**
 729      * Tai Le.
 730      * 0x1950 - 0x197F.
 731      * @since 1.5
 732      */
 733     public static final UnicodeBlock TAI_LE
 734       = new UnicodeBlock(0x1950, 0x197F,
 735                          "TAI_LE",
 736                          "Tai Le");
 737
 738     /**
 739      * Khmer Symbols.
 740      * 0x19E0 - 0x19FF.
 741      * @since 1.5
 742      */
 743     public static final UnicodeBlock KHMER_SYMBOLS
 744       = new UnicodeBlock(0x19E0, 0x19FF,
 745                          "KHMER_SYMBOLS",
 746                          "Khmer Symbols");
 747
 748     /**
 749      * Phonetic Extensions.
 750      * 0x1D00 - 0x1D7F.
 751      * @since 1.5
 752      */
 753     public static final UnicodeBlock PHONETIC_EXTENSIONS
 754       = new UnicodeBlock(0x1D00, 0x1D7F,
 755                          "PHONETIC_EXTENSIONS",
 756                          "Phonetic Extensions");
 757
 758     /**
 759      * Latin Extended Additional.
 760      * 0x1E00 - 0x1EFF.
 761      */
 762     public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
 763       = new UnicodeBlock(0x1E00, 0x1EFF,
 764                          "LATIN_EXTENDED_ADDITIONAL",
 765                          "Latin Extended Additional");
 766
 767     /**
 768      * Greek Extended.
 769      * 0x1F00 - 0x1FFF.
 770      */
 771     public static final UnicodeBlock GREEK_EXTENDED
 772       = new UnicodeBlock(0x1F00, 0x1FFF,
 773                          "GREEK_EXTENDED",
 774                          "Greek Extended");
 775
 776     /**
 777      * General Punctuation.
 778      * 0x2000 - 0x206F.
 779      */
 780     public static final UnicodeBlock GENERAL_PUNCTUATION
 781       = new UnicodeBlock(0x2000, 0x206F,
 782                          "GENERAL_PUNCTUATION",
 783                          "General Punctuation");
 784
 785     /**
 786      * Superscripts and Subscripts.
 787      * 0x2070 - 0x209F.
 788      */
 789     public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
 790       = new UnicodeBlock(0x2070, 0x209F,
 791                          "SUPERSCRIPTS_AND_SUBSCRIPTS",
 792                          "Superscripts and Subscripts");
 793
 794     /**
 795      * Currency Symbols.
 796      * 0x20A0 - 0x20CF.
 797      */
 798     public static final UnicodeBlock CURRENCY_SYMBOLS
 799       = new UnicodeBlock(0x20A0, 0x20CF,
 800                          "CURRENCY_SYMBOLS",
 801                          "Currency Symbols");
 802
 803     /**
 804      * Combining Marks for Symbols.
 805      * 0x20D0 - 0x20FF.
 806      */
 807     public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
 808       = new UnicodeBlock(0x20D0, 0x20FF,
 809                          "COMBINING_MARKS_FOR_SYMBOLS",
 810                          "Combining Marks for Symbols");
 811
 812     /**
 813      * Letterlike Symbols.
 814      * 0x2100 - 0x214F.
 815      */
 816     public static final UnicodeBlock LETTERLIKE_SYMBOLS
 817       = new UnicodeBlock(0x2100, 0x214F,
 818                          "LETTERLIKE_SYMBOLS",
 819                          "Letterlike Symbols");
 820
 821     /**
 822      * Number Forms.
 823      * 0x2150 - 0x218F.
 824      */
 825     public static final UnicodeBlock NUMBER_FORMS
 826       = new UnicodeBlock(0x2150, 0x218F,
 827                          "NUMBER_FORMS",
 828                          "Number Forms");
 829
 830     /**
 831      * Arrows.
 832      * 0x2190 - 0x21FF.
 833      */
 834     public static final UnicodeBlock ARROWS
 835       = new UnicodeBlock(0x2190, 0x21FF,
 836                          "ARROWS",
 837                          "Arrows");
 838
 839     /**
 840      * Mathematical Operators.
 841      * 0x2200 - 0x22FF.
 842      */
 843     public static final UnicodeBlock MATHEMATICAL_OPERATORS
 844       = new UnicodeBlock(0x2200, 0x22FF,
 845                          "MATHEMATICAL_OPERATORS",
 846                          "Mathematical Operators");
 847
 848     /**
 849      * Miscellaneous Technical.
 850      * 0x2300 - 0x23FF.
 851      */
 852     public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
 853       = new UnicodeBlock(0x2300, 0x23FF,
 854                          "MISCELLANEOUS_TECHNICAL",
 855                          "Miscellaneous Technical");
 856
 857     /**
 858      * Control Pictures.
 859      * 0x2400 - 0x243F.
 860      */
 861     public static final UnicodeBlock CONTROL_PICTURES
 862       = new UnicodeBlock(0x2400, 0x243F,
 863                          "CONTROL_PICTURES",
 864                          "Control Pictures");
 865
 866     /**
 867      * Optical Character Recognition.
 868      * 0x2440 - 0x245F.
 869      */
 870     public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
 871       = new UnicodeBlock(0x2440, 0x245F,
 872                          "OPTICAL_CHARACTER_RECOGNITION",
 873                          "Optical Character Recognition");
 874
 875     /**
 876      * Enclosed Alphanumerics.
 877      * 0x2460 - 0x24FF.
 878      */
 879     public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
 880       = new UnicodeBlock(0x2460, 0x24FF,
 881                          "ENCLOSED_ALPHANUMERICS",
 882                          "Enclosed Alphanumerics");
 883
 884     /**
 885      * Box Drawing.
 886      * 0x2500 - 0x257F.
 887      */
 888     public static final UnicodeBlock BOX_DRAWING
 889       = new UnicodeBlock(0x2500, 0x257F,
 890                          "BOX_DRAWING",
 891                          "Box Drawing");
 892
 893     /**
 894      * Block Elements.
 895      * 0x2580 - 0x259F.
 896      */
 897     public static final UnicodeBlock BLOCK_ELEMENTS
 898       = new UnicodeBlock(0x2580, 0x259F,
 899                          "BLOCK_ELEMENTS",
 900                          "Block Elements");
 901
 902     /**
 903      * Geometric Shapes.
 904      * 0x25A0 - 0x25FF.
 905      */
 906     public static final UnicodeBlock GEOMETRIC_SHAPES
 907       = new UnicodeBlock(0x25A0, 0x25FF,
 908                          "GEOMETRIC_SHAPES",
 909                          "Geometric Shapes");
 910
 911     /**
 912      * Miscellaneous Symbols.
 913      * 0x2600 - 0x26FF.
 914      */
 915     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
 916       = new UnicodeBlock(0x2600, 0x26FF,
 917                          "MISCELLANEOUS_SYMBOLS",
 918                          "Miscellaneous Symbols");
 919
 920     /**
 921      * Dingbats.
 922      * 0x2700 - 0x27BF.
 923      */
 924     public static final UnicodeBlock DINGBATS
 925       = new UnicodeBlock(0x2700, 0x27BF,
 926                          "DINGBATS",
 927                          "Dingbats");
 928
 929     /**
 930      * Miscellaneous Mathematical Symbols-A.
 931      * 0x27C0 - 0x27EF.
 932      * @since 1.5
 933      */
 934     public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
 935       = new UnicodeBlock(0x27C0, 0x27EF,
 936                          "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
 937                          "Miscellaneous Mathematical Symbols-A");
 938
 939     /**
 940      * Supplemental Arrows-A.
 941      * 0x27F0 - 0x27FF.
 942      * @since 1.5
 943      */
 944     public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
 945       = new UnicodeBlock(0x27F0, 0x27FF,
 946                          "SUPPLEMENTAL_ARROWS_A",
 947                          "Supplemental Arrows-A");
 948
 949     /**
 950      * Braille Patterns.
 951      * 0x2800 - 0x28FF.
 952      * @since 1.4
 953      */
 954     public static final UnicodeBlock BRAILLE_PATTERNS
 955       = new UnicodeBlock(0x2800, 0x28FF,
 956                          "BRAILLE_PATTERNS",
 957                          "Braille Patterns");
 958
 959     /**
 960      * Supplemental Arrows-B.
 961      * 0x2900 - 0x297F.
 962      * @since 1.5
 963      */
 964     public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
 965       = new UnicodeBlock(0x2900, 0x297F,
 966                          "SUPPLEMENTAL_ARROWS_B",
 967                          "Supplemental Arrows-B");
 968
 969     /**
 970      * Miscellaneous Mathematical Symbols-B.
 971      * 0x2980 - 0x29FF.
 972      * @since 1.5
 973      */
 974     public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
 975       = new UnicodeBlock(0x2980, 0x29FF,
 976                          "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
 977                          "Miscellaneous Mathematical Symbols-B");
 978
 979     /**
 980      * Supplemental Mathematical Operators.
 981      * 0x2A00 - 0x2AFF.
 982      * @since 1.5
 983      */
 984     public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
 985       = new UnicodeBlock(0x2A00, 0x2AFF,
 986                          "SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
 987                          "Supplemental Mathematical Operators");
 988
 989     /**
 990      * Miscellaneous Symbols and Arrows.
 991      * 0x2B00 - 0x2BFF.
 992      * @since 1.5
 993      */
 994     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
 995       = new UnicodeBlock(0x2B00, 0x2BFF,
 996                          "MISCELLANEOUS_SYMBOLS_AND_ARROWS",
 997                          "Miscellaneous Symbols and Arrows");
 998
 999     /**
1000      * CJK Radicals Supplement.
1001      * 0x2E80 - 0x2EFF.
1002      * @since 1.4
1003      */
1004     public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
1005       = new UnicodeBlock(0x2E80, 0x2EFF,
1006                          "CJK_RADICALS_SUPPLEMENT",
1007                          "CJK Radicals Supplement");
1008
1009     /**
1010      * Kangxi Radicals.
1011      * 0x2F00 - 0x2FDF.
1012      * @since 1.4
1013      */
1014     public static final UnicodeBlock KANGXI_RADICALS
1015       = new UnicodeBlock(0x2F00, 0x2FDF,
1016                          "KANGXI_RADICALS",
1017                          "Kangxi Radicals");
1018
1019     /**
1020      * Ideographic Description Characters.
1021      * 0x2FF0 - 0x2FFF.
1022      * @since 1.4
1023      */
1024     public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
1025       = new UnicodeBlock(0x2FF0, 0x2FFF,
1026                          "IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
1027                          "Ideographic Description Characters");
1028
1029     /**
1030      * CJK Symbols and Punctuation.
1031      * 0x3000 - 0x303F.
1032      */
1033     public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
1034       = new UnicodeBlock(0x3000, 0x303F,
1035                          "CJK_SYMBOLS_AND_PUNCTUATION",
1036                          "CJK Symbols and Punctuation");
1037
1038     /**
1039      * Hiragana.
1040      * 0x3040 - 0x309F.
1041      */
1042     public static final UnicodeBlock HIRAGANA
1043       = new UnicodeBlock(0x3040, 0x309F,
1044                          "HIRAGANA",
1045                          "Hiragana");
1046
1047     /**
1048      * Katakana.
1049      * 0x30A0 - 0x30FF.
1050      */
1051     public static final UnicodeBlock KATAKANA
1052       = new UnicodeBlock(0x30A0, 0x30FF,
1053                          "KATAKANA",
1054                          "Katakana");
1055
1056     /**
1057      * Bopomofo.
1058      * 0x3100 - 0x312F.
1059      */
1060     public static final UnicodeBlock BOPOMOFO
1061       = new UnicodeBlock(0x3100, 0x312F,
1062                          "BOPOMOFO",
1063                          "Bopomofo");
1064
1065     /**
1066      * Hangul Compatibility Jamo.
1067      * 0x3130 - 0x318F.
1068      */
1069     public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
1070       = new UnicodeBlock(0x3130, 0x318F,
1071                          "HANGUL_COMPATIBILITY_JAMO",
1072                          "Hangul Compatibility Jamo");
1073
1074     /**
1075      * Kanbun.
1076      * 0x3190 - 0x319F.
1077      */
1078     public static final UnicodeBlock KANBUN
1079       = new UnicodeBlock(0x3190, 0x319F,
1080                          "KANBUN",
1081                          "Kanbun");
1082
1083     /**
1084      * Bopomofo Extended.
1085      * 0x31A0 - 0x31BF.
1086      * @since 1.4
1087      */
1088     public static final UnicodeBlock BOPOMOFO_EXTENDED
1089       = new UnicodeBlock(0x31A0, 0x31BF,
1090                          "BOPOMOFO_EXTENDED",
1091                          "Bopomofo Extended");
1092
1093     /**
1094      * Katakana Phonetic Extensions.
1095      * 0x31F0 - 0x31FF.
1096      * @since 1.5
1097      */
1098     public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
1099       = new UnicodeBlock(0x31F0, 0x31FF,
1100                          "KATAKANA_PHONETIC_EXTENSIONS",
1101                          "Katakana Phonetic Extensions");
1102
1103     /**
1104      * Enclosed CJK Letters and Months.
1105      * 0x3200 - 0x32FF.
1106      */
1107     public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
1108       = new UnicodeBlock(0x3200, 0x32FF,
1109                          "ENCLOSED_CJK_LETTERS_AND_MONTHS",
1110                          "Enclosed CJK Letters and Months");
1111
1112     /**
1113      * CJK Compatibility.
1114      * 0x3300 - 0x33FF.
1115      */
1116     public static final UnicodeBlock CJK_COMPATIBILITY
1117       = new UnicodeBlock(0x3300, 0x33FF,
1118                          "CJK_COMPATIBILITY",
1119                          "CJK Compatibility");
1120
1121     /**
1122      * CJK Unified Ideographs Extension A.
1123      * 0x3400 - 0x4DBF.
1124      * @since 1.4
1125      */
1126     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1127       = new UnicodeBlock(0x3400, 0x4DBF,
1128                          "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
1129                          "CJK Unified Ideographs Extension A");
1130
1131     /**
1132      * Yijing Hexagram Symbols.
1133      * 0x4DC0 - 0x4DFF.
1134      * @since 1.5
1135      */
1136     public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
1137       = new UnicodeBlock(0x4DC0, 0x4DFF,
1138                          "YIJING_HEXAGRAM_SYMBOLS",
1139                          "Yijing Hexagram Symbols");
1140
1141     /**
1142      * CJK Unified Ideographs.
1143      * 0x4E00 - 0x9FFF.
1144      */
1145     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
1146       = new UnicodeBlock(0x4E00, 0x9FFF,
1147                          "CJK_UNIFIED_IDEOGRAPHS",
1148                          "CJK Unified Ideographs");
1149
1150     /**
1151      * Yi Syllables.
1152      * 0xA000 - 0xA48F.
1153      * @since 1.4
1154      */
1155     public static final UnicodeBlock YI_SYLLABLES
1156       = new UnicodeBlock(0xA000, 0xA48F,
1157                          "YI_SYLLABLES",
1158                          "Yi Syllables");
1159
1160     /**
1161      * Yi Radicals.
1162      * 0xA490 - 0xA4CF.
1163      * @since 1.4
1164      */
1165     public static final UnicodeBlock YI_RADICALS
1166       = new UnicodeBlock(0xA490, 0xA4CF,
1167                          "YI_RADICALS",
1168                          "Yi Radicals");
1169
1170     /**
1171      * Hangul Syllables.
1172      * 0xAC00 - 0xD7AF.
1173      */
1174     public static final UnicodeBlock HANGUL_SYLLABLES
1175       = new UnicodeBlock(0xAC00, 0xD7AF,
1176                          "HANGUL_SYLLABLES",
1177                          "Hangul Syllables");
1178
1179     /**
1180      * High Surrogates.
1181      * 0xD800 - 0xDB7F.
1182      * @since 1.5
1183      */
1184     public static final UnicodeBlock HIGH_SURROGATES
1185       = new UnicodeBlock(0xD800, 0xDB7F,
1186                          "HIGH_SURROGATES",
1187                          "High Surrogates");
1188
1189     /**
1190      * High Private Use Surrogates.
1191      * 0xDB80 - 0xDBFF.
1192      * @since 1.5
1193      */
1194     public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
1195       = new UnicodeBlock(0xDB80, 0xDBFF,
1196                          "HIGH_PRIVATE_USE_SURROGATES",
1197                          "High Private Use Surrogates");
1198
1199     /**
1200      * Low Surrogates.
1201      * 0xDC00 - 0xDFFF.
1202      * @since 1.5
1203      */
1204     public static final UnicodeBlock LOW_SURROGATES
1205       = new UnicodeBlock(0xDC00, 0xDFFF,
1206                          "LOW_SURROGATES",
1207                          "Low Surrogates");
1208
1209     /**
1210      * Private Use Area.
1211      * 0xE000 - 0xF8FF.
1212      */
1213     public static final UnicodeBlock PRIVATE_USE_AREA
1214       = new UnicodeBlock(0xE000, 0xF8FF,
1215                          "PRIVATE_USE_AREA",
1216                          "Private Use Area");
1217
1218     /**
1219      * CJK Compatibility Ideographs.
1220      * 0xF900 - 0xFAFF.
1221      */
1222     public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
1223       = new UnicodeBlock(0xF900, 0xFAFF,
1224                          "CJK_COMPATIBILITY_IDEOGRAPHS",
1225                          "CJK Compatibility Ideographs");
1226
1227     /**
1228      * Alphabetic Presentation Forms.
1229      * 0xFB00 - 0xFB4F.
1230      */
1231     public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
1232       = new UnicodeBlock(0xFB00, 0xFB4F,
1233                          "ALPHABETIC_PRESENTATION_FORMS",
1234                          "Alphabetic Presentation Forms");
1235
1236     /**
1237      * Arabic Presentation Forms-A.
1238      * 0xFB50 - 0xFDFF.
1239      */
1240     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
1241       = new UnicodeBlock(0xFB50, 0xFDFF,
1242                          "ARABIC_PRESENTATION_FORMS_A",
1243                          "Arabic Presentation Forms-A");
1244
1245     /**
1246      * Variation Selectors.
1247      * 0xFE00 - 0xFE0F.
1248      * @since 1.5
1249      */
1250     public static final UnicodeBlock VARIATION_SELECTORS
1251       = new UnicodeBlock(0xFE00, 0xFE0F,
1252                          "VARIATION_SELECTORS",
1253                          "Variation Selectors");
1254
1255     /**
1256      * Combining Half Marks.
1257      * 0xFE20 - 0xFE2F.
1258      */
1259     public static final UnicodeBlock COMBINING_HALF_MARKS
1260       = new UnicodeBlock(0xFE20, 0xFE2F,
1261                          "COMBINING_HALF_MARKS",
1262                          "Combining Half Marks");
1263
1264     /**
1265      * CJK Compatibility Forms.
1266      * 0xFE30 - 0xFE4F.
1267      */
1268     public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
1269       = new UnicodeBlock(0xFE30, 0xFE4F,
1270                          "CJK_COMPATIBILITY_FORMS",
1271                          "CJK Compatibility Forms");
1272
1273     /**
1274      * Small Form Variants.
1275      * 0xFE50 - 0xFE6F.
1276      */
1277     public static final UnicodeBlock SMALL_FORM_VARIANTS
1278       = new UnicodeBlock(0xFE50, 0xFE6F,
1279                          "SMALL_FORM_VARIANTS",
1280                          "Small Form Variants");
1281
1282     /**
1283      * Arabic Presentation Forms-B.
1284      * 0xFE70 - 0xFEFF.
1285      */
1286     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
1287       = new UnicodeBlock(0xFE70, 0xFEFF,
1288                          "ARABIC_PRESENTATION_FORMS_B",
1289                          "Arabic Presentation Forms-B");
1290
1291     /**
1292      * Halfwidth and Fullwidth Forms.
1293      * 0xFF00 - 0xFFEF.
1294      */
1295     public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
1296       = new UnicodeBlock(0xFF00, 0xFFEF,
1297                          "HALFWIDTH_AND_FULLWIDTH_FORMS",
1298                          "Halfwidth and Fullwidth Forms");
1299
1300     /**
1301      * Specials.
1302      * 0xFFF0 - 0xFFFF.
1303      */
1304     public static final UnicodeBlock SPECIALS
1305       = new UnicodeBlock(0xFFF0, 0xFFFF,
1306                          "SPECIALS",
1307                          "Specials");
1308
1309     /**
1310      * Linear B Syllabary.
1311      * 0x10000 - 0x1007F.
1312      * @since 1.5
1313      */
1314     public static final UnicodeBlock LINEAR_B_SYLLABARY
1315       = new UnicodeBlock(0x10000, 0x1007F,
1316                          "LINEAR_B_SYLLABARY",
1317                          "Linear B Syllabary");
1318
1319     /**
1320      * Linear B Ideograms.
1321      * 0x10080 - 0x100FF.
1322      * @since 1.5
1323      */
1324     public static final UnicodeBlock LINEAR_B_IDEOGRAMS
1325       = new UnicodeBlock(0x10080, 0x100FF,
1326                          "LINEAR_B_IDEOGRAMS",
1327                          "Linear B Ideograms");
1328
1329     /**
1330      * Aegean Numbers.
1331      * 0x10100 - 0x1013F.
1332      * @since 1.5
1333      */
1334     public static final UnicodeBlock AEGEAN_NUMBERS
1335       = new UnicodeBlock(0x10100, 0x1013F,
1336                          "AEGEAN_NUMBERS",
1337                          "Aegean Numbers");
1338
1339     /**
1340      * Old Italic.
1341      * 0x10300 - 0x1032F.
1342      * @since 1.5
1343      */
1344     public static final UnicodeBlock OLD_ITALIC
1345       = new UnicodeBlock(0x10300, 0x1032F,
1346                          "OLD_ITALIC",
1347                          "Old Italic");
1348
1349     /**
1350      * Gothic.
1351      * 0x10330 - 0x1034F.
1352      * @since 1.5
1353      */
1354     public static final UnicodeBlock GOTHIC
1355       = new UnicodeBlock(0x10330, 0x1034F,
1356                          "GOTHIC",
1357                          "Gothic");
1358
1359     /**
1360      * Ugaritic.
1361      * 0x10380 - 0x1039F.
1362      * @since 1.5
1363      */
1364     public static final UnicodeBlock UGARITIC
1365       = new UnicodeBlock(0x10380, 0x1039F,
1366                          "UGARITIC",
1367                          "Ugaritic");
1368
1369     /**
1370      * Deseret.
1371      * 0x10400 - 0x1044F.
1372      * @since 1.5
1373      */
1374     public static final UnicodeBlock DESERET
1375       = new UnicodeBlock(0x10400, 0x1044F,
1376                          "DESERET",
1377                          "Deseret");
1378
1379     /**
1380      * Shavian.
1381      * 0x10450 - 0x1047F.
1382      * @since 1.5
1383      */
1384     public static final UnicodeBlock SHAVIAN
1385       = new UnicodeBlock(0x10450, 0x1047F,
1386                          "SHAVIAN",
1387                          "Shavian");
1388
1389     /**
1390      * Osmanya.
1391      * 0x10480 - 0x104AF.
1392      * @since 1.5
1393      */
1394     public static final UnicodeBlock OSMANYA
1395       = new UnicodeBlock(0x10480, 0x104AF,
1396                          "OSMANYA",
1397                          "Osmanya");
1398
1399     /**
1400      * Cypriot Syllabary.
1401      * 0x10800 - 0x1083F.
1402      * @since 1.5
1403      */
1404     public static final UnicodeBlock CYPRIOT_SYLLABARY
1405       = new UnicodeBlock(0x10800, 0x1083F,
1406                          "CYPRIOT_SYLLABARY",
1407                          "Cypriot Syllabary");
1408
1409     /**
1410      * Byzantine Musical Symbols.
1411      * 0x1D000 - 0x1D0FF.
1412      * @since 1.5
1413      */
1414     public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
1415       = new UnicodeBlock(0x1D000, 0x1D0FF,
1416                          "BYZANTINE_MUSICAL_SYMBOLS",
1417                          "Byzantine Musical Symbols");
1418
1419     /**
1420      * Musical Symbols.
1421      * 0x1D100 - 0x1D1FF.
1422      * @since 1.5
1423      */
1424     public static final UnicodeBlock MUSICAL_SYMBOLS
1425       = new UnicodeBlock(0x1D100, 0x1D1FF,
1426                          "MUSICAL_SYMBOLS",
1427                          "Musical Symbols");
1428
1429     /**
1430      * Tai Xuan Jing Symbols.
1431      * 0x1D300 - 0x1D35F.
1432      * @since 1.5
1433      */
1434     public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
1435       = new UnicodeBlock(0x1D300, 0x1D35F,
1436                          "TAI_XUAN_JING_SYMBOLS",
1437                          "Tai Xuan Jing Symbols");
1438
1439     /**
1440      * Mathematical Alphanumeric Symbols.
1441      * 0x1D400 - 0x1D7FF.
1442      * @since 1.5
1443      */
1444     public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
1445       = new UnicodeBlock(0x1D400, 0x1D7FF,
1446                          "MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
1447                          "Mathematical Alphanumeric Symbols");
1448
1449     /**
1450      * CJK Unified Ideographs Extension B.
1451      * 0x20000 - 0x2A6DF.
1452      * @since 1.5
1453      */
1454     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1455       = new UnicodeBlock(0x20000, 0x2A6DF,
1456                          "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
1457                          "CJK Unified Ideographs Extension B");
1458
1459     /**
1460      * CJK Compatibility Ideographs Supplement.
1461      * 0x2F800 - 0x2FA1F.
1462      * @since 1.5
1463      */
1464     public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
1465       = new UnicodeBlock(0x2F800, 0x2FA1F,
1466                          "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
1467                          "CJK Compatibility Ideographs Supplement");
1468
1469     /**
1470      * Tags.
1471      * 0xE0000 - 0xE007F.
1472      * @since 1.5
1473      */
1474     public static final UnicodeBlock TAGS
1475       = new UnicodeBlock(0xE0000, 0xE007F,
1476                          "TAGS",
1477                          "Tags");
1478
1479     /**
1480      * Variation Selectors Supplement.
1481      * 0xE0100 - 0xE01EF.
1482      * @since 1.5
1483      */
1484     public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
1485       = new UnicodeBlock(0xE0100, 0xE01EF,
1486                          "VARIATION_SELECTORS_SUPPLEMENT",
1487                          "Variation Selectors Supplement");
1488
1489     /**
1490      * Supplementary Private Use Area-A.
1491      * 0xF0000 - 0xFFFFF.
1492      * @since 1.5
1493      */
1494     public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
1495       = new UnicodeBlock(0xF0000, 0xFFFFF,
1496                          "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
1497                          "Supplementary Private Use Area-A");
1498
1499     /**
1500      * Supplementary Private Use Area-B.
1501      * 0x100000 - 0x10FFFF.
1502      * @since 1.5
1503      */
1504     public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
1505       = new UnicodeBlock(0x100000, 0x10FFFF,
1506                          "SUPPLEMENTARY_PRIVATE_USE_AREA_B",
1507                          "Supplementary Private Use Area-B");
1508
1509     /**
1510      * Surrogates Area.
1511      * 0xD800 - 0xDFFF.
1512      * @deprecated As of 1.5, the three areas,
1513      * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>,
1514      * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a>
1515      * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined
1516      * by the Unicode standard, should be used in preference to
1517      * this.  These are also returned from calls to <code>of(int)</code>
1518      * and <code>of(char)</code>.
1519      */
1520     public static final UnicodeBlock SURROGATES_AREA
1521       = new UnicodeBlock(0xD800, 0xDFFF,
1522                          "SURROGATES_AREA",
1523              "Surrogates Area");
1524
1525     /**
1526      * The defined subsets.  The deprecated SURROGATES_AREA is not listed here.
1527      */
1528     private static final UnicodeBlock sets[] = {
1529       BASIC_LATIN,
1530       LATIN_1_SUPPLEMENT,
1531       LATIN_EXTENDED_A,
1532       LATIN_EXTENDED_B,
1533       IPA_EXTENSIONS,
1534       SPACING_MODIFIER_LETTERS,
1535       COMBINING_DIACRITICAL_MARKS,
1536       GREEK,
1537       CYRILLIC,
1538       CYRILLIC_SUPPLEMENTARY,
1539       ARMENIAN,
1540       HEBREW,
1541       ARABIC,
1542       SYRIAC,
1543       THAANA,
1544       DEVANAGARI,
1545       BENGALI,
1546       GURMUKHI,
1547       GUJARATI,
1548       ORIYA,
1549       TAMIL,
1550       TELUGU,
1551       KANNADA,
1552       MALAYALAM,
1553       SINHALA,
1554       THAI,
1555       LAO,
1556       TIBETAN,
1557       MYANMAR,
1558       GEORGIAN,
1559       HANGUL_JAMO,
1560       ETHIOPIC,
1561       CHEROKEE,
1562       UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
1563       OGHAM,
1564       RUNIC,
1565       TAGALOG,
1566       HANUNOO,
1567       BUHID,
1568       TAGBANWA,
1569       KHMER,
1570       MONGOLIAN,
1571       LIMBU,
1572       TAI_LE,
1573       KHMER_SYMBOLS,
1574       PHONETIC_EXTENSIONS,
1575       LATIN_EXTENDED_ADDITIONAL,
1576       GREEK_EXTENDED,
1577       GENERAL_PUNCTUATION,
1578       SUPERSCRIPTS_AND_SUBSCRIPTS,
1579       CURRENCY_SYMBOLS,
1580       COMBINING_MARKS_FOR_SYMBOLS,
1581       LETTERLIKE_SYMBOLS,
1582       NUMBER_FORMS,
1583       ARROWS,
1584       MATHEMATICAL_OPERATORS,
1585       MISCELLANEOUS_TECHNICAL,
1586       CONTROL_PICTURES,
1587       OPTICAL_CHARACTER_RECOGNITION,
1588       ENCLOSED_ALPHANUMERICS,
1589       BOX_DRAWING,
1590       BLOCK_ELEMENTS,
1591       GEOMETRIC_SHAPES,
1592       MISCELLANEOUS_SYMBOLS,
1593       DINGBATS,
1594       MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
1595       SUPPLEMENTAL_ARROWS_A,
1596       BRAILLE_PATTERNS,
1597       SUPPLEMENTAL_ARROWS_B,
1598       MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
1599       SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
1600       MISCELLANEOUS_SYMBOLS_AND_ARROWS,
1601       CJK_RADICALS_SUPPLEMENT,
1602       KANGXI_RADICALS,
1603       IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
1604       CJK_SYMBOLS_AND_PUNCTUATION,
1605       HIRAGANA,
1606       KATAKANA,
1607       BOPOMOFO,
1608       HANGUL_COMPATIBILITY_JAMO,
1609       KANBUN,
1610       BOPOMOFO_EXTENDED,
1611       KATAKANA_PHONETIC_EXTENSIONS,
1612       ENCLOSED_CJK_LETTERS_AND_MONTHS,
1613       CJK_COMPATIBILITY,
1614       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
1615       YIJING_HEXAGRAM_SYMBOLS,
1616       CJK_UNIFIED_IDEOGRAPHS,
1617       YI_SYLLABLES,
1618       YI_RADICALS,
1619       HANGUL_SYLLABLES,
1620       HIGH_SURROGATES,
1621       HIGH_PRIVATE_USE_SURROGATES,
1622       LOW_SURROGATES,
1623       PRIVATE_USE_AREA,
1624       CJK_COMPATIBILITY_IDEOGRAPHS,
1625       ALPHABETIC_PRESENTATION_FORMS,
1626       ARABIC_PRESENTATION_FORMS_A,
1627       VARIATION_SELECTORS,
1628       COMBINING_HALF_MARKS,
1629       CJK_COMPATIBILITY_FORMS,
1630       SMALL_FORM_VARIANTS,
1631       ARABIC_PRESENTATION_FORMS_B,
1632       HALFWIDTH_AND_FULLWIDTH_FORMS,
1633       SPECIALS,
1634       LINEAR_B_SYLLABARY,
1635       LINEAR_B_IDEOGRAMS,
1636       AEGEAN_NUMBERS,
1637       OLD_ITALIC,
1638       GOTHIC,
1639       UGARITIC,
1640       DESERET,
1641       SHAVIAN,
1642       OSMANYA,
1643       CYPRIOT_SYLLABARY,
1644       BYZANTINE_MUSICAL_SYMBOLS,
1645       MUSICAL_SYMBOLS,
1646       TAI_XUAN_JING_SYMBOLS,
1647       MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
1648       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
1649       CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
1650       TAGS,
1651       VARIATION_SELECTORS_SUPPLEMENT,
1652       SUPPLEMENTARY_PRIVATE_USE_AREA_A,
1653       SUPPLEMENTARY_PRIVATE_USE_AREA_B,
1654     };
1655   } // class UnicodeBlock
1656
1657   /**
1658    * The immutable value of this Character.
1659    *
1660    * @serial the value of this Character
1661    */
1662   private final char value;
1663
1664   /**
1665    * Compatible with JDK 1.0+.
1666    */
1667   private static final long serialVersionUID = 3786198910865385080L;
1668
1669   /**
1670    * Smallest value allowed for radix arguments in Java. This value is 2.
1671    *
1672    * @see #digit(char, int)
1673    * @see #forDigit(int, int)
1674    * @see Integer#toString(int, int)
1675    * @see Integer#valueOf(String)
1676    */
1677   public static final int MIN_RADIX = 2;
1678
1679   /**
1680    * Largest value allowed for radix arguments in Java. This value is 36.
1681    *
1682    * @see #digit(char, int)
1683    * @see #forDigit(int, int)
1684    * @see Integer#toString(int, int)
1685    * @see Integer#valueOf(String)
1686    */
1687   public static final int MAX_RADIX = 36;
1688
1689   /**
1690    * The minimum value the char data type can hold.
1691    * This value is <code>'&#92;u0000'</code>.
1692    */
1693   public static final char MIN_VALUE = '\u0000';
1694
1695   /**
1696    * The maximum value the char data type can hold.
1697    * This value is <code>'&#92;uFFFF'</code>.
1698    */
1699   public static final char MAX_VALUE = '\uFFFF';
1700
1701   /**
1702    * Class object representing the primitive char data type.
1703    *
1704    * @since 1.1
1705    */
1706   public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');
1707
1708   /**
1709    * The number of bits needed to represent a <code>char</code>.
1710    * @since 1.5
1711    */
1712   public static final int SIZE = 16;
1713
1714   // This caches some Character values, and is used by boxing
1715   // conversions via valueOf().  JLS 5.1.7 requires caching at least
1716   // 0..127; this constant controls how much we actually cache.
1717   private static final int MAX_CACHE = 127;
1718   private static Character[] charCache = new Character[MAX_CACHE + 1];
1719
1720   /**
1721    * Lu = Letter, Uppercase (Informative).
1722    *
1723    * @since 1.1
1724    */
1725   public static final byte UPPERCASE_LETTER = 1;
1726
1727   /**
1728    * Ll = Letter, Lowercase (Informative).
1729    *
1730    * @since 1.1
1731    */
1732   public static final byte LOWERCASE_LETTER = 2;
1733
1734   /**
1735    * Lt = Letter, Titlecase (Informative).
1736    *
1737    * @since 1.1
1738    */
1739   public static final byte TITLECASE_LETTER = 3;
1740
1741   /**
1742    * Mn = Mark, Non-Spacing (Normative).
1743    *
1744    * @since 1.1
1745    */
1746   public static final byte NON_SPACING_MARK = 6;
1747
1748   /**
1749    * Mc = Mark, Spacing Combining (Normative).
1750    *
1751    * @since 1.1
1752    */
1753   public static final byte COMBINING_SPACING_MARK = 8;
1754
1755   /**
1756    * Me = Mark, Enclosing (Normative).
1757    *
1758    * @since 1.1
1759    */
1760   public static final byte ENCLOSING_MARK = 7;
1761
1762   /**
1763    * Nd = Number, Decimal Digit (Normative).
1764    *
1765    * @since 1.1
1766    */
1767   public static final byte DECIMAL_DIGIT_NUMBER = 9;
1768
1769   /**
1770    * Nl = Number, Letter (Normative).
1771    *
1772    * @since 1.1
1773    */
1774   public static final byte LETTER_NUMBER = 10;
1775
1776   /**
1777    * No = Number, Other (Normative).
1778    *
1779    * @since 1.1
1780    */
1781   public static final byte OTHER_NUMBER = 11;
1782
1783   /**
1784    * Zs = Separator, Space (Normative).
1785    *
1786    * @since 1.1
1787    */
1788   public static final byte SPACE_SEPARATOR = 12;
1789
1790   /**
1791    * Zl = Separator, Line (Normative).
1792    *
1793    * @since 1.1
1794    */
1795   public static final byte LINE_SEPARATOR = 13;
1796
1797   /**
1798    * Zp = Separator, Paragraph (Normative).
1799    *
1800    * @since 1.1
1801    */
1802   public static final byte PARAGRAPH_SEPARATOR = 14;
1803
1804   /**
1805    * Cc = Other, Control (Normative).
1806    *
1807    * @since 1.1
1808    */
1809   public static final byte CONTROL = 15;
1810
1811   /**
1812    * Cf = Other, Format (Normative).
1813    *
1814    * @since 1.1
1815    */
1816   public static final byte FORMAT = 16;
1817
1818   /**
1819    * Cs = Other, Surrogate (Normative).
1820    *
1821    * @since 1.1
1822    */
1823   public static final byte SURROGATE = 19;
1824
1825   /**
1826    * Co = Other, Private Use (Normative).
1827    *
1828    * @since 1.1
1829    */
1830   public static final byte PRIVATE_USE = 18;
1831
1832   /**
1833    * Cn = Other, Not Assigned (Normative).
1834    *
1835    * @since 1.1
1836    */
1837   public static final byte UNASSIGNED = 0;
1838
1839   /**
1840    * Lm = Letter, Modifier (Informative).
1841    *
1842    * @since 1.1
1843    */
1844   public static final byte MODIFIER_LETTER = 4;
1845
1846   /**
1847    * Lo = Letter, Other (Informative).
1848    *
1849    * @since 1.1
1850    */
1851   public static final byte OTHER_LETTER = 5;
1852
1853   /**
1854    * Pc = Punctuation, Connector (Informative).
1855    *
1856    * @since 1.1
1857    */
1858   public static final byte CONNECTOR_PUNCTUATION = 23;
1859
1860   /**
1861    * Pd = Punctuation, Dash (Informative).
1862    *
1863    * @since 1.1
1864    */
1865   public static final byte DASH_PUNCTUATION = 20;
1866
1867   /**
1868    * Ps = Punctuation, Open (Informative).
1869    *
1870    * @since 1.1
1871    */
1872   public static final byte START_PUNCTUATION = 21;
1873
1874   /**
1875    * Pe = Punctuation, Close (Informative).
1876    *
1877    * @since 1.1
1878    */
1879   public static final byte END_PUNCTUATION = 22;
1880
1881   /**
1882    * Pi = Punctuation, Initial Quote (Informative).
1883    *
1884    * @since 1.4
1885    */
1886   public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
1887
1888   /**
1889    * Pf = Punctuation, Final Quote (Informative).
1890    *
1891    * @since 1.4
1892    */
1893   public static final byte FINAL_QUOTE_PUNCTUATION = 30;
1894
1895   /**
1896    * Po = Punctuation, Other (Informative).
1897    *
1898    * @since 1.1
1899    */
1900   public static final byte OTHER_PUNCTUATION = 24;
1901
1902   /**
1903    * Sm = Symbol, Math (Informative).
1904    *
1905    * @since 1.1
1906    */
1907   public static final byte MATH_SYMBOL = 25;
1908
1909   /**
1910    * Sc = Symbol, Currency (Informative).
1911    *
1912    * @since 1.1
1913    */
1914   public static final byte CURRENCY_SYMBOL = 26;
1915
1916   /**
1917    * Sk = Symbol, Modifier (Informative).
1918    *
1919    * @since 1.1
1920    */
1921   public static final byte MODIFIER_SYMBOL = 27;
1922
1923   /**
1924    * So = Symbol, Other (Informative).
1925    *
1926    * @since 1.1
1927    */
1928   public static final byte OTHER_SYMBOL = 28;
1929
1930   /**
1931    * Undefined bidirectional character type. Undefined char values have
1932    * undefined directionality in the Unicode specification.
1933    *
1934    * @since 1.4
1935    */
1936   public static final byte DIRECTIONALITY_UNDEFINED = -1;
1937
1938   /**
1939    * Strong bidirectional character type "L" (left-to-right).
1940    *
1941    * @since 1.4
1942    */
1943   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
1944
1945   /**
1946    * Strong bidirectional character type "R" (right-to-left).
1947    *
1948    * @since 1.4
1949    */
1950   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
1951
1952   /**
1953    * Strong bidirectional character type "AL" (right-to-left Arabic).
1954    *
1955    * @since 1.4
1956    */
1957   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
1958
1959   /**
1960    * Weak bidirectional character type "EN" (European number).
1961    *
1962    * @since 1.4
1963    */
1964   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
1965
1966   /**
1967    * Weak bidirectional character type "ES" (European number separator).
1968    *
1969    * @since 1.4
1970    */
1971   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
1972
1973   /**
1974    * Weak bidirectional character type "ET" (European number terminator).
1975    *
1976    * @since 1.4
1977    */
1978   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
1979
1980   /**
1981    * Weak bidirectional character type "AN" (Arabic number).
1982    *
1983    * @since 1.4
1984    */
1985   public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
1986
1987   /**
1988    * Weak bidirectional character type "CS" (common number separator).
1989    *
1990    * @since 1.4
1991    */
1992   public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
1993
1994   /**
1995    * Weak bidirectional character type "NSM" (non-spacing mark).
1996    *
1997    * @since 1.4
1998    */
1999   public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
2000
2001   /**
2002    * Weak bidirectional character type "BN" (boundary neutral).
2003    *
2004    * @since 1.4
2005    */
2006   public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
2007
2008   /**
2009    * Neutral bidirectional character type "B" (paragraph separator).
2010    *
2011    * @since 1.4
2012    */
2013   public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
2014
2015   /**
2016    * Neutral bidirectional character type "S" (segment separator).
2017    *
2018    * @since 1.4
2019    */
2020   public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
2021
2022   /**
2023    * Neutral bidirectional character type "WS" (whitespace).
2024    *
2025    * @since 1.4
2026    */
2027   public static final byte DIRECTIONALITY_WHITESPACE = 12;
2028
2029   /**
2030    * Neutral bidirectional character type "ON" (other neutrals).
2031    *
2032    * @since 1.4
2033    */
2034   public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
2035
2036   /**
2037    * Strong bidirectional character type "LRE" (left-to-right embedding).
2038    *
2039    * @since 1.4
2040    */
2041   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
2042
2043   /**
2044    * Strong bidirectional character type "LRO" (left-to-right override).
2045    *
2046    * @since 1.4
2047    */
2048   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
2049
2050   /**
2051    * Strong bidirectional character type "RLE" (right-to-left embedding).
2052    *
2053    * @since 1.4
2054    */
2055   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
2056
2057   /**
2058    * Strong bidirectional character type "RLO" (right-to-left override).
2059    *
2060    * @since 1.4
2061    */
2062   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
2063
2064   /**
2065    * Weak bidirectional character type "PDF" (pop directional format).
2066    *
2067    * @since 1.4
2068    */
2069   public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
2070
2071   /**
2072    * Mask selecting the low 5 bits (the character type) of a readChar result.
2073    * @see #readChar(char)
2074    */
2075   private static final int TYPE_MASK = 0x1F;
2076
2077   /**
2078    * Mask selecting bit 5, the non-breaking space flag, of the result of
2079    * readChar.
2080    * @see #readChar(char)
2081    */
2082   private static final int NO_BREAK_MASK = 0x20;
2083
2084   /**
2085    * Mask selecting bit 6, the mirrored directionality flag, of the result
2086    * of readChar.
2087    * @see #readChar(char)
2088    */
2089   private static final int MIRROR_MASK = 0x40;
2090
2091   /**
2092    * Min value for supplementary code point (U+10000).
2093    *
2094    * @since 1.5
2095    */
2096   public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
2097
2098   /**
2099    * Min value for code point (U+0000).
2100    *
2101    * @since 1.5
2102    */
2103   public static final int MIN_CODE_POINT = 0;
2104
2105
2106   /**
2107    * Max value for code point (U+10FFFF).
2108    *
2109    * @since 1.5
2110    */
2111   public static final int MAX_CODE_POINT = 0x010ffff;
2112
2113
2114   /**
2115    * Minimum high surrogate code in UTF-16 encoding.
2116    *
2117    * @since 1.5
2118    */
2119   public static final char MIN_HIGH_SURROGATE = '\ud800';
2120
2121   /**
2122    * Maximum high surrogate code in UTF-16 encoding.
2123    *
2124    * @since 1.5
2125    */
2126   public static final char MAX_HIGH_SURROGATE = '\udbff';
2127
2128   /**
2129    * Minimum low surrogate code in UTF-16 encoding.
2130    *
2131    * @since 1.5
2132    */
2133   public static final char MIN_LOW_SURROGATE = '\udc00';
2134
2135   /**
2136    * Maximum low surrogate code in UTF-16 encoding.
2137    *
2138    * @since 1.5
2139    */
2140   public static final char MAX_LOW_SURROGATE = '\udfff';
2141
2142   /**
2143    * Minimum surrogate code in UTF-16 encoding; equal to MIN_HIGH_SURROGATE.
2144    *
2145    * @since 1.5
2146    */
2147   public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
2148
2149   /**
2150    * Maximum surrogate code in UTF-16 encoding; equal to MAX_LOW_SURROGATE.
2151    *
2152    * @since 1.5
2153    */
2154   public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
2155
2156   /**
2157    * Grabs an attribute offset from the Unicode attribute database. The lower
2158    * 5 bits are the character type (TYPE_MASK), the next 2 bits are the
2159    * NO_BREAK_MASK and MIRROR_MASK flags, and the top 9 bits are the offset
2160    * into the attribute tables. Note that the top 9 bits are meaningless in
2161    * this context; they are useful only in the native code.
2162    *
2163    * @param ch the character to look up
2164    * @return the packed attribute word: offset, flags and type of ch
2165    * @see #TYPE_MASK
2166    * @see #NO_BREAK_MASK
2167    * @see #MIRROR_MASK
2168    */
2169   private static native char readChar(char ch);
2170
2171   /**
2172    * Grabs an attribute offset from the Unicode attribute database. The lower
2173    * 5 bits are the character type (TYPE_MASK), the next 2 bits are the
2174    * NO_BREAK_MASK and MIRROR_MASK flags, and the top 9 bits are the offset
2175    * into the attribute tables. Note that the top 9 bits are meaningless in
2176    * this context; they are useful only in the native code.
2177    *
2178    * @param codePoint the code point to look up
2179    * @return the packed attribute word: offset, flags and type of codePoint
2180    * @see #TYPE_MASK
2181    * @see #NO_BREAK_MASK
2182    * @see #MIRROR_MASK
2183    */
2184   private static native char readCodePoint(int codePoint);
2185
2186   /**
2187    * Creates a Character object that wraps the given char.
2188    *
2189    * @param value the character to wrap; it is stored in the value field
2190    */
2191   public Character(char value)
2192   {
2193     this.value = value;
2194   }
2195
2196   /**
2197    * Unwraps this object, giving back the primitive char it holds.
2198    *
2199    * @return the character wrapped
2200    */
2201   public char charValue()
2202   {
2203     return this.value;
2204   }
2205
2206   /**
2207    * Computes the hash code, which is simply the wrapped character widened
2208    * to an unsigned int in the range 0x0000-0xFFFF.
2209    * @return the numeric value of the wrapped character
2210    */
2211   public int hashCode()
2212   {
2213     final int code = value;
2214     return code;
2215   }
2216
2217   /**
2218    * Tests equality: true only for another Character wrapping the same char.
2219    * @param o object to compare
2220    * @return true if o is a Character with the same value
2221    */
2222   public boolean equals(Object o)
2223   {
2224     if (! (o instanceof Character))
2225       return false;
2226     return ((Character) o).value == value;
2227   }
2228
2229   /**
2230    * Produces the String form of the wrapped character.
2231    *
2232    * @return a String of length one holding the wrapped character
2233    */
2234   public String toString()
2235   {
2236     // Delegate to String.valueOf(char) on the assumption that it can build
2237     // a one-character String more cheaply than the public API could.
2238     final char wrapped = value;
2239     return String.valueOf(wrapped);
2240   }
2241
2242   /**
2243    * Converts the given character into a one-character String.
2244    * @param ch the character to convert
2245    * @return a String containing the character
2246    * @since 1.4
2247    */
2248   public static String toString(char ch)
2249   {
2250     // String.valueOf(char) is assumed to build a single-character String
2251     // more efficiently than anything reachable through the public API.
2252     final String text = String.valueOf(ch);
2253     return text;
2254   }
2255
2256   /**
2257    * Tests whether a character belongs to the Unicode lowercase letters,
2258    * as <code>'a'</code> does.
2259    * <br>
2260    * lowercase = [Ll]
2261    * @param ch character to test
2262    * @return true if ch is a Unicode lowercase letter, else false
2263    * @see #isUpperCase(char)
2264    * @see #isTitleCase(char)
2265    * @see #toLowerCase(char)
2266    * @see #getType(char)
2267    */
2268   public static boolean isLowerCase(char ch)
2269   {
2270     final int category = getType(ch);
2271     return LOWERCASE_LETTER == category;
2272   }
2273
2274   /**
2275    * Tests whether a code point belongs to the Unicode lowercase letters,
2276    * as <code>'a'</code> does.  In contrast to isLowerCase(char), this
2277    * overload also handles supplementary Unicode code points.
2278    * <br>
2279    * lowercase = [Ll]
2280    * @param codePoint character to test
2281    * @return true if codePoint is a Unicode lowercase letter, else false
2282    * @see #isUpperCase(int)
2283    * @see #isTitleCase(int)
2284    * @see #toLowerCase(int)
2285    * @see #getType(int)
2286    * @since 1.5
2287    */
2288   public static boolean isLowerCase(int codePoint)
2289   {
2290     final int category = getType(codePoint);
2291     return LOWERCASE_LETTER == category;
2292   }
2293
2294   /**
2295    * Tests whether a character belongs to the Unicode uppercase letters,
2296    * as <code>'A'</code> does.
2297    * <br>
2298    * uppercase = [Lu]
2299    * @param ch character to test
2300    * @return true if ch is a Unicode uppercase letter, else false
2301    * @see #isLowerCase(char)
2302    * @see #isTitleCase(char)
2303    * @see #toUpperCase(char)
2304    * @see #getType(char)
2305    */
2306   public static boolean isUpperCase(char ch)
2307   {
2308     final int category = getType(ch);
2309     return UPPERCASE_LETTER == category;
2310   }
2311
2312   /**
2313    * Tests whether a code point belongs to the Unicode uppercase letters,
2314    * as <code>'A'</code> does.  In contrast to isUpperCase(char), this
2315    * overload also handles supplementary Unicode code points.
2316    * <br>
2317    * uppercase = [Lu]
2318    * @param codePoint character to test
2319    * @return true if codePoint is a Unicode uppercase letter, else false
2320    * @see #isLowerCase(int)
2321    * @see #isTitleCase(int)
2322    * @see #toUpperCase(int)
2323    * @see #getType(int)
2324    * @since 1.5
2325    */
2326   public static boolean isUpperCase(int codePoint)
2327   {
2328     final int category = getType(codePoint);
2329     return UPPERCASE_LETTER == category;
2330   }
2331
2332   /**
2333    * Tests whether a character is a Unicode titlecase letter, such as
2334    * "Lj" (Latin capital L with small letter j).
2335    * <br>
2336    * titlecase = [Lt]
2337    * @param ch character to test
2338    * @return true if ch is a Unicode titlecase letter, else false
2339    * @see #isLowerCase(char)
2340    * @see #isUpperCase(char)
2341    * @see #toTitleCase(char)
2342    * @see #getType(char)
2343    */
2344   public static boolean isTitleCase(char ch)
2345   {
2346     final int category = getType(ch);
2347     return TITLECASE_LETTER == category;
2348   }
2349
2350   /**
2351    * Tests whether a code point is a Unicode titlecase letter, such as
2352    * "Lj" (Latin capital L with small letter j).  In contrast to
2353    * isTitleCase(char), this overload also handles supplementary Unicode
2354    * code points.
2355    * <br>
2356    * titlecase = [Lt]
2357    * @param codePoint character to test
2358    * @return true if codePoint is a Unicode titlecase letter, else false
2359    * @see #isLowerCase(int)
2360    * @see #isUpperCase(int)
2361    * @see #toTitleCase(int)
2362    * @see #getType(int)
2363    * @since 1.5
2364    */
2365   public static boolean isTitleCase(int codePoint)
2366   {
2367     final int category = getType(codePoint);
2368     return TITLECASE_LETTER == category;
2369   }
2370
2371   /**
2372    * Tests whether a character is a Unicode decimal digit, as
2373    * <code>'0'</code> is.
2374    * <br>
2375    * Unicode decimal digit = [Nd]
2376    * @param ch character to test
2377    * @return true if ch is a Unicode decimal digit, else false
2378    * @see #digit(char, int)
2379    * @see #forDigit(int, int)
2380    * @see #getType(char)
2381    */
2382   public static boolean isDigit(char ch)
2383   {
2384     final int category = getType(ch);
2385     return DECIMAL_DIGIT_NUMBER == category;
2386   }
2387
2388   /**
2389    * Tests whether a code point is a Unicode decimal digit, as
2390    * <code>'0'</code> is.  In contrast to isDigit(char), this overload
2391    * also handles supplementary Unicode code points.
2392    * <br>
2393    * Unicode decimal digit = [Nd]
2394    * @param codePoint character to test
2395    * @return true if codePoint is a Unicode decimal digit, else false
2396    * @see #digit(int, int)
2397    * @see #forDigit(int, int)
2398    * @see #getType(int)
2399    * @since 1.5
2400    */
2401   public static boolean isDigit(int codePoint)
2402   {
2403     final int category = getType(codePoint);
2404     return DECIMAL_DIGIT_NUMBER == category;
2405   }
2406
2407   /**
2408    * Tests whether a character is assigned in the Unicode Standard.  The
2409    * standard keeps evolving, but every character in the data file counts.
2410    * <br>
2411    * defined = not [Cn]
2412    * @param ch character to test
2413    * @return true if ch is a Unicode character, else false
2414    * @see #isDigit(char)
2415    * @see #isLetter(char)
2416    * @see #isLetterOrDigit(char)
2417    * @see #isLowerCase(char)
2418    * @see #isTitleCase(char)
2419    * @see #isUpperCase(char)
2420    */
2421   public static boolean isDefined(char ch)
2422   {
2423     final boolean unassigned = getType(ch) == UNASSIGNED;
2424     return ! unassigned;
2425   }
2426
2427   /**
2428    * Tests whether a code point is assigned in the Unicode Standard.  The
2429    * standard keeps evolving, but every character in the data file counts.
2430    * In contrast to isDefined(char), supplementary code points are supported.
2431    * <br>
2432    * defined = not [Cn]
2433    * @param codePoint character to test
2434    * @return true if codePoint is a Unicode character, else false
2435    * @see #isDigit(int)
2436    * @see #isLetter(int)
2437    * @see #isLetterOrDigit(int)
2438    * @see #isLowerCase(int)
2439    * @see #isTitleCase(int)
2440    * @see #isUpperCase(int)
2441    * @since 1.5
2442    */
2443   public static boolean isDefined(int codePoint)
2444   {
2445     final boolean unassigned = getType(codePoint) == UNASSIGNED;
2446     return ! unassigned;
2447   }
2448
2449   /**
2450    * Tests whether a character is a Unicode letter.  Letters need not have
2451    * case, so the result may be true although isLowerCase and isUpperCase
2452    * both return false.
2453    * <br>
2454    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2455    * @param ch character to test
2456    * @return true if ch is a Unicode letter, else false
2457    * @see #isDigit(char)
2458    * @see #isJavaIdentifierStart(char)
2459    * @see #isJavaLetter(char)
2460    * @see #isJavaLetterOrDigit(char)
2461    * @see #isLetterOrDigit(char)
2462    * @see #isLowerCase(char)
2463    * @see #isTitleCase(char)
2464    * @see #isUnicodeIdentifierStart(char)
2465    * @see #isUpperCase(char)
2466    */
2467   public static boolean isLetter(char ch)
2468   {
2469     final int letterTypes = (1 << UPPERCASE_LETTER)
2470       | (1 << LOWERCASE_LETTER)
2471       | (1 << TITLECASE_LETTER)
2472       | (1 << MODIFIER_LETTER)
2473       | (1 << OTHER_LETTER);
2474     return (letterTypes & (1 << getType(ch))) != 0;
2475   }
2476
2477   /**
2478    * Tests whether a code point is a Unicode letter.  Letters need not have
2479    * case, so the result may be true although isLowerCase and isUpperCase
2480    * both return false.  In contrast to isLetter(char), this overload also
2481    * handles supplementary Unicode code points.
2482    * <br>
2483    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2484    *
2485    * @param codePoint character to test
2486    * @return true if codePoint is a Unicode letter, else false
2487    * @see #isDigit(int)
2488    * @see #isJavaIdentifierStart(int)
2489    * @see #isJavaLetter(char)
2490    * @see #isJavaLetterOrDigit(char)
2491    * @see #isLetterOrDigit(int)
2492    * @see #isLowerCase(int)
2493    * @see #isTitleCase(int)
2494    * @see #isUnicodeIdentifierStart(int)
2495    * @see #isUpperCase(int)
2496    * @since 1.5
2497    */
2498   public static boolean isLetter(int codePoint)
2499   {
2500     final int letterTypes = (1 << UPPERCASE_LETTER)
2501       | (1 << LOWERCASE_LETTER)
2502       | (1 << TITLECASE_LETTER)
2503       | (1 << MODIFIER_LETTER)
2504       | (1 << OTHER_LETTER);
2505     return (letterTypes & (1 << getType(codePoint))) != 0;
2506   }
2507
2508   /**
2509    * Tests whether a character is a Unicode letter or a Unicode digit, i.e.
2510    * whether either isLetter or isDigit holds for it.
2511    * <br>
2512    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
2513    *
2514    * @param ch character to test
2515    * @return true if ch is a Unicode letter or a Unicode digit, else false
2516    * @see #isDigit(char)
2517    * @see #isJavaIdentifierPart(char)
2518    * @see #isJavaLetter(char)
2519    * @see #isJavaLetterOrDigit(char)
2520    * @see #isLetter(char)
2521    * @see #isUnicodeIdentifierPart(char)
2522    */
2523   public static boolean isLetterOrDigit(char ch)
2524   {
2525     final int accepted = (1 << UPPERCASE_LETTER)
2526       | (1 << LOWERCASE_LETTER)
2527       | (1 << TITLECASE_LETTER)
2528       | (1 << MODIFIER_LETTER)
2529       | (1 << OTHER_LETTER)
2530       | (1 << DECIMAL_DIGIT_NUMBER);
2531     return (accepted & (1 << getType(ch))) != 0;
2532   }
2533
2534   /**
2535    * Tests whether a code point is a Unicode letter or a Unicode digit, i.e.
2536    * whether either isLetter or isDigit holds for it.  In contrast to
2537    * isLetterOrDigit(char), supplementary code points are supported.
2538    * <br>
2539    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
2540    *
2541    * @param codePoint character to test
2542    * @return true if codePoint is a Unicode letter or a Unicode digit, else false
2543    * @see #isDigit(int)
2544    * @see #isJavaIdentifierPart(int)
2545    * @see #isJavaLetter(char)
2546    * @see #isJavaLetterOrDigit(char)
2547    * @see #isLetter(int)
2548    * @see #isUnicodeIdentifierPart(int)
2549    * @since 1.5
2550    */
2551   public static boolean isLetterOrDigit(int codePoint)
2552   {
2553     final int accepted = (1 << UPPERCASE_LETTER)
2554       | (1 << LOWERCASE_LETTER)
2555       | (1 << TITLECASE_LETTER)
2556       | (1 << MODIFIER_LETTER)
2557       | (1 << OTHER_LETTER)
2558       | (1 << DECIMAL_DIGIT_NUMBER);
2559     return (accepted & (1 << getType(codePoint))) != 0;
2560   }
2561
2562   /**
2563    * Tests whether a character may begin a Java identifier: any letter
2564    * (isLetter), any character whose getType is LETTER_NUMBER, a currency
2565    * symbol (like '$'), or connecting punctuation (like '_').
2566    * @param ch character to test
2567    * @return true if ch can start a Java identifier, else false
2568    * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
2569    * @see #isJavaLetterOrDigit(char)
2570    * @see #isJavaIdentifierStart(char)
2571    * @see #isJavaIdentifierPart(char)
2572    * @see #isLetter(char)
2573    * @see #isLetterOrDigit(char)
2574    * @see #isUnicodeIdentifierStart(char)
2575    */
2576   public static boolean isJavaLetter(char ch)
2577   {
2578     // Deprecated alias: isJavaIdentifierStart carries the actual logic.
2579     final boolean canStart = isJavaIdentifierStart(ch);
2580     return canStart;
2581   }
2582
2583   /**
2584    * Tests whether a code point may begin a Java identifier: any letter
2585    * (isLetter), any character whose getType is LETTER_NUMBER, a currency
2586    * symbol (like '$'), or connecting punctuation (like '_').  In contrast
2587    * to isJavaIdentifierStart(char), this overload also handles
2588    * supplementary Unicode code points.
2589    * <br>
2590    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
2591    *
2592    * @param codePoint character to test
2593    * @return true if codePoint can start a Java identifier, else false
2594    * @see #isJavaIdentifierPart(int)
2595    * @see #isLetter(int)
2596    * @see #isUnicodeIdentifierStart(int)
2597    * @since 1.5
2598    */
2599   public static boolean isJavaIdentifierStart(int codePoint)
2600   {
2601     final int startTypes = (1 << UPPERCASE_LETTER)
2602       | (1 << LOWERCASE_LETTER)
2603       | (1 << TITLECASE_LETTER)
2604       | (1 << MODIFIER_LETTER)
2605       | (1 << OTHER_LETTER)
2606       | (1 << LETTER_NUMBER)
2607       | (1 << CURRENCY_SYMBOL)
2608       | (1 << CONNECTOR_PUNCTUATION);
2609     return (startTypes & (1 << getType(codePoint))) != 0;
2610   }
2611
2612   /**
2613    * Tests whether a character may follow the first character of a Java
2614    * identifier.  Accepted are the characters isJavaLetter accepts (isLetter,
2615    * type LETTER_NUMBER, currency, connecting punctuation) as well as digits,
2616    * numeric letters (like Roman numerals), combining marks, non-spacing
2617    * marks, and characters for which isIdentifierIgnorable holds.
2618    * @param ch character to test
2619    * @return true if ch can follow the first letter in a Java identifier
2620    * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
2621    * @see #isJavaLetter(char)
2622    * @see #isJavaIdentifierStart(char)
2623    * @see #isJavaIdentifierPart(char)
2624    * @see #isLetter(char)
2625    * @see #isLetterOrDigit(char)
2626    * @see #isUnicodeIdentifierPart(char)
2627    * @see #isIdentifierIgnorable(char)
2628    */
2629   public static boolean isJavaLetterOrDigit(char ch)
2630   {
2631     final boolean canFollow = isJavaIdentifierPart(ch);
2632     return canFollow;
2633   }
2634
2635   /**
2636    * Tests whether a character may begin a Java identifier.  Accepted are
2637    * any letter (isLetter), any character whose getType is LETTER_NUMBER,
2638    * currency symbols (like '$'), and connecting punctuation
2639    * (like '_').
2640    * <br>
2641    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
2642    *
2643    * @param ch character to test
2644    * @return true if ch can start a Java identifier, else false
2645    * @see #isJavaIdentifierPart(char)
2646    * @see #isLetter(char)
2647    * @see #isUnicodeIdentifierStart(char)
2648    * @since 1.1
2649    */
2650   public static boolean isJavaIdentifierStart(char ch)
2651   {
2652     final int startTypes = (1 << UPPERCASE_LETTER)
2653       | (1 << LOWERCASE_LETTER)
2654       | (1 << TITLECASE_LETTER)
2655       | (1 << MODIFIER_LETTER)
2656       | (1 << OTHER_LETTER)
2657       | (1 << LETTER_NUMBER)
2658       | (1 << CURRENCY_SYMBOL)
2659       | (1 << CONNECTOR_PUNCTUATION);
2660     return (startTypes & (1 << getType(ch))) != 0;
2661   }
2662
2663   /**
2664    * Tests whether a character may follow the first character of a Java
2665    * identifier.  Accepted are the characters isJavaLetter accepts (isLetter,
2666    * type LETTER_NUMBER, currency, connecting punctuation) as well as digits,
2667    * numeric letters (like Roman numerals), combining marks, non-spacing
2668    * marks, and characters for which isIdentifierIgnorable holds.
2669    * <br>
2670    * Java identifier extender =
2671    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
2672    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
2673    * @param ch character to test
2674    * @return true if ch can follow the first letter in a Java identifier
2675    * @see #isIdentifierIgnorable(char)
2676    * @see #isJavaIdentifierStart(char)
2677    * @see #isLetterOrDigit(char)
2678    * @see #isUnicodeIdentifierPart(char)
2679    * @since 1.1
2680    */
2681   public static boolean isJavaIdentifierPart(char ch)
2682   {
2683     final int partTypes = (1 << UPPERCASE_LETTER)
2684       | (1 << LOWERCASE_LETTER)
2685       | (1 << TITLECASE_LETTER)
2686       | (1 << MODIFIER_LETTER)
2687       | (1 << OTHER_LETTER)
2688       | (1 << NON_SPACING_MARK)
2689       | (1 << COMBINING_SPACING_MARK)
2690       | (1 << DECIMAL_DIGIT_NUMBER)
2691       | (1 << LETTER_NUMBER)
2692       | (1 << CURRENCY_SYMBOL)
2693       | (1 << CONNECTOR_PUNCTUATION)
2694       | (1 << FORMAT);
2695     final int type = getType(ch);
2696     if ((partTypes & (1 << type)) != 0)
2697       return true;
2698     return type == CONTROL && isIdentifierIgnorable(ch);
2699   }
2700
2701   /**
2702    * Tests whether a code point may follow the first character of a Java
2703    * identifier.  Accepted are the characters isJavaLetter accepts (isLetter,
2704    * type LETTER_NUMBER, currency, connecting punctuation) as well as digits,
2705    * numeric letters (like Roman numerals), combining marks, non-spacing
2706    * marks, and characters for which isIdentifierIgnorable holds.  In contrast
2707    * to isJavaIdentifierPart(char), supplementary code points are supported.
2708    * <br>
2709    * Java identifier extender =
2710    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
2711    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
2712    * @param codePoint character to test
2713    * @return true if codePoint can follow the first letter in a Java identifier
2714    * @see #isIdentifierIgnorable(int)
2715    * @see #isJavaIdentifierStart(int)
2716    * @see #isLetterOrDigit(int)
2717    * @see #isUnicodeIdentifierPart(int)
2718    * @since 1.5
2719    */
2720   public static boolean isJavaIdentifierPart(int codePoint)
2721   {
2722     final int partTypes = (1 << UPPERCASE_LETTER)
2723       | (1 << LOWERCASE_LETTER)
2724       | (1 << TITLECASE_LETTER)
2725       | (1 << MODIFIER_LETTER)
2726       | (1 << OTHER_LETTER)
2727       | (1 << NON_SPACING_MARK)
2728       | (1 << COMBINING_SPACING_MARK)
2729       | (1 << DECIMAL_DIGIT_NUMBER)
2730       | (1 << LETTER_NUMBER)
2731       | (1 << CURRENCY_SYMBOL)
2732       | (1 << CONNECTOR_PUNCTUATION)
2733       | (1 << FORMAT);
2734     final int type = getType(codePoint);
2735     if ((partTypes & (1 << type)) != 0)
2736       return true;
2737     return type == CONTROL && isIdentifierIgnorable(codePoint);
2738   }
2739
2740   /**
2741    * Tests whether a character may begin a Unicode identifier.  Nothing
2742    * but letters qualifies, although for this purpose the characters of
2743    * type LETTER_NUMBER count as letters too.
2744    * <br>
2745    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
2746    *
2747    * @param ch character to test
2748    * @return true if ch can start a Unicode identifier, else false
2749    * @see #isJavaIdentifierStart(char)
2750    * @see #isLetter(char)
2751    * @see #isUnicodeIdentifierPart(char)
2752    * @since 1.1
2753    */
2754   public static boolean isUnicodeIdentifierStart(char ch)
2755   {
2756     final int startTypes = (1 << UPPERCASE_LETTER)
2757       | (1 << LOWERCASE_LETTER)
2758       | (1 << TITLECASE_LETTER)
2759       | (1 << MODIFIER_LETTER)
2760       | (1 << OTHER_LETTER)
2761       | (1 << LETTER_NUMBER);
2762     return (startTypes & (1 << getType(ch))) != 0;
2763   }
2764
2765   /**
2766    * Tests whether a code point may begin a Unicode identifier.  Nothing
2767    * but letters qualifies, although for this purpose the characters of
2768    * type LETTER_NUMBER count as letters too.  In contrast to
2769    * isUnicodeIdentifierStart(char), supplementary code points are supported.
2770    * <br>
2771    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
2772    *
2773    * @param codePoint character to test
2774    * @return true if codePoint can start a Unicode identifier, else false
2775    * @see #isJavaIdentifierStart(int)
2776    * @see #isLetter(int)
2777    * @see #isUnicodeIdentifierPart(int)
2778    * @since 1.5
2779    */
2780   public static boolean isUnicodeIdentifierStart(int codePoint)
2781   {
2782     final int startTypes = (1 << UPPERCASE_LETTER)
2783       | (1 << LOWERCASE_LETTER)
2784       | (1 << TITLECASE_LETTER)
2785       | (1 << MODIFIER_LETTER)
2786       | (1 << OTHER_LETTER)
2787       | (1 << LETTER_NUMBER);
2788     return (startTypes & (1 << getType(codePoint))) != 0;
2789   }
2790
2791   /**
2792    * Tests whether a character may follow the first character of a Unicode
2793    * identifier.  Accepted are letters, connecting punctuation, digits,
2794    * numeric letters, combining marks, non-spacing marks, and characters
2795    * for which isIdentifierIgnorable holds.
2796    * <br>
2797    * Unicode identifier extender =
2798    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]
2799    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
2800    * @param ch character to test
2801    * @return true if ch can follow the first letter in a Unicode identifier
2802    * @see #isIdentifierIgnorable(char)
2803    * @see #isJavaIdentifierPart(char)
2804    * @see #isLetterOrDigit(char)
2805    * @see #isUnicodeIdentifierStart(char)
2806    * @since 1.1
2807    */
2808   public static boolean isUnicodeIdentifierPart(char ch)
2809   {
2810     final int partTypes = (1 << UPPERCASE_LETTER)
2811       | (1 << LOWERCASE_LETTER)
2812       | (1 << TITLECASE_LETTER)
2813       | (1 << MODIFIER_LETTER)
2814       | (1 << OTHER_LETTER)
2815       | (1 << NON_SPACING_MARK)
2816       | (1 << COMBINING_SPACING_MARK)
2817       | (1 << DECIMAL_DIGIT_NUMBER)
2818       | (1 << LETTER_NUMBER)
2819       | (1 << CONNECTOR_PUNCTUATION)
2820       | (1 << FORMAT);
2821     final int type = getType(ch);
2822     if ((partTypes & (1 << type)) != 0)
2823       return true;
2824     return type == CONTROL && isIdentifierIgnorable(ch);
2825   }
2826
2827   /**
2828    * Tests whether a code point may follow the first character of a Unicode
2829    * identifier.  Accepted are letters, connecting punctuation, digits,
2830    * numeric letters, combining marks, non-spacing marks, and characters
2831    * for which isIdentifierIgnorable holds.  In contrast to
2832    * isUnicodeIdentifierPart(char), supplementary code points are supported.
2833    * <br>
2834    * Unicode identifier extender =
2835    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]
2836    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
2837    * @param codePoint character to test
2838    * @return true if codePoint can follow the first letter in a Unicode
2839    *         identifier
2840    * @see #isIdentifierIgnorable(int)
2841    * @see #isJavaIdentifierPart(int)
2842    * @see #isLetterOrDigit(int)
2843    * @see #isUnicodeIdentifierStart(int)
2844    * @since 1.5
2845    */
2846   public static boolean isUnicodeIdentifierPart(int codePoint)
2847   {
2848     final int partTypes = (1 << UPPERCASE_LETTER)
2849       | (1 << LOWERCASE_LETTER)
2850       | (1 << TITLECASE_LETTER)
2851       | (1 << MODIFIER_LETTER)
2852       | (1 << OTHER_LETTER)
2853       | (1 << NON_SPACING_MARK)
2854       | (1 << COMBINING_SPACING_MARK)
2855       | (1 << DECIMAL_DIGIT_NUMBER)
2856       | (1 << LETTER_NUMBER)
2857       | (1 << CONNECTOR_PUNCTUATION)
2858       | (1 << FORMAT);
2859     final int type = getType(codePoint);
2860     if ((partTypes & (1 << type)) != 0)
2861       return true;
2862     return type == CONTROL && isIdentifierIgnorable(codePoint);
2863   }
2864
2865   /**
2866    * Tests whether a character is ignorable in a Unicode identifier.
2867    * Ignorable are the non-whitespace ISO control characters (U+0000
2868    * through U+0008, U+000E through U+001B, and U+007F through U+009F)
2869    * and all FORMAT characters; getType is consulted only when the
2870    * character is not one of those control characters.
2871    * <br>
2872    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
2873    *    |U+007F-U+009F
2874    * @param ch character to test
2875    * @return true if ch is ignorable in a Unicode or Java identifier
2876    * @see #isJavaIdentifierPart(char)
2877    * @see #isUnicodeIdentifierPart(char)
2878    * @since 1.1
2879    */
2880   public static boolean isIdentifierIgnorable(char ch)
2881   {
2882     if (ch <= 0x009F
2883         && (ch < 0x0009 || ch >= 0x007F || (ch >= 0x000E && ch <= 0x001B)))
2884       return true;
2885     return getType(ch) == FORMAT;
2886   }
2887
2888   /**
2889    * Determines if a character is ignorable in a Unicode identifier. This
2890    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
2891    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
2892    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
2893    * <code>'\u009F'</code>), and FORMAT characters.  Unlike
2894    * isIdentifierIgnorable(char), this method supports supplementary Unicode
2895    * code points.
2896    * <br>
2897    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
2898    *    |U+007F-U+009F
2899    *
2900    * @param codePoint character to test
2901    * @return true if codePoint is ignorable in a Unicode or Java identifier
2902    * @see #isJavaIdentifierPart(int)
2903    * @see #isUnicodeIdentifierPart(int)
2904    * @since 1.5
2905    */
2906   public static boolean isIdentifierIgnorable(int codePoint)
2907   {
2908     return ((codePoint >= 0 && codePoint <= 0x0008)
2909         || (codePoint >= 0x000E && codePoint <= 0x001B)
2910         || (codePoint >= 0x007F && codePoint <= 0x009F)
2911             || getType(codePoint) == FORMAT);
2912   }
2913
  /**
   * Converts a Unicode character into its lowercase equivalent mapping.
   * If no lowercase mapping exists, the character passed in is returned
   * unchanged.  Note that <code>isLowerCase(toLowerCase(ch))</code> does
   * not always return true.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch character to convert to lowercase
   * @return lowercase mapping of ch, or ch if lowercase mapping does
   *         not exist
   * @see #isLowerCase(char)
   * @see #isUpperCase(char)
   * @see #toTitleCase(char)
   * @see #toUpperCase(char)
   */
  public static native char toLowerCase(char ch);
2928
  /**
   * Converts a Unicode code point into its lowercase equivalent mapping.
   * If no lowercase mapping exists, the code point passed in is returned
   * unchanged.  Note that <code>isLowerCase(toLowerCase(codePoint))</code>
   * does not always return true.  Unlike toLowerCase(char), this method
   * supports supplementary Unicode code points.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint character to convert to lowercase
   * @return lowercase mapping of codePoint, or codePoint if lowercase
   *         mapping does not exist
   * @see #isLowerCase(int)
   * @see #isUpperCase(int)
   * @see #toTitleCase(int)
   * @see #toUpperCase(int)
   * @since 1.5
   */
  public static native int toLowerCase(int codePoint);
2946
  /**
   * Converts a Unicode character into its uppercase equivalent mapping.
   * If no uppercase mapping exists, the character passed in is returned
   * unchanged.  Note that <code>isUpperCase(toUpperCase(ch))</code> does
   * not always return true.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch character to convert to uppercase
   * @return uppercase mapping of ch, or ch if uppercase mapping does
   *         not exist
   * @see #isLowerCase(char)
   * @see #isUpperCase(char)
   * @see #toLowerCase(char)
   * @see #toTitleCase(char)
   */
  public static native char toUpperCase(char ch);
2961
  /**
   * Converts a Unicode code point into its uppercase equivalent mapping.
   * If no uppercase mapping exists, the code point passed in is returned
   * unchanged.  Note that <code>isUpperCase(toUpperCase(codePoint))</code>
   * does not always return true.  Unlike toUpperCase(char), this method
   * supports supplementary Unicode code points.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint character to convert to uppercase
   * @return uppercase mapping of codePoint, or codePoint if uppercase
   *         mapping does not exist
   * @see #isLowerCase(int)
   * @see #isUpperCase(int)
   * @see #toLowerCase(int)
   * @see #toTitleCase(int)
   * @since 1.5
   */
  public static native int toUpperCase(int codePoint);
2979
  /**
   * Converts a Unicode character into its titlecase equivalent mapping.
   * If no titlecase mapping exists, the character passed in is returned
   * unchanged.  Note that <code>isTitleCase(toTitleCase(ch))</code> does
   * not always return true.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch character to convert to titlecase
   * @return titlecase mapping of ch, or ch if titlecase mapping does
   *         not exist
   * @see #isTitleCase(char)
   * @see #toLowerCase(char)
   * @see #toUpperCase(char)
   */
  public static native char toTitleCase(char ch);
2993
  /**
   * Converts a Unicode code point into its titlecase equivalent mapping.
   * If no titlecase mapping exists, the code point passed in is returned
   * unchanged.  Note that <code>isTitleCase(toTitleCase(codePoint))</code>
   * does not always return true.  Unlike toTitleCase(char), this method
   * supports supplementary Unicode code points.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint character to convert to titlecase
   * @return titlecase mapping of codePoint, or codePoint if titlecase
   *         mapping does not exist
   * @see #isTitleCase(int)
   * @see #toLowerCase(int)
   * @see #toUpperCase(int)
   * @since 1.5
   */
  public static native int toTitleCase(int codePoint);
3010
  /**
   * Converts a character into a digit of the specified radix. The result
   * is -1 if the radix lies outside the range MIN_RADIX to MAX_RADIX, if
   * the result of getNumericValue(ch) exceeds the radix, or if ch is
   * neither a decimal digit nor in the case insensitive set of 'a'-'z'.
   * <br>
   * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
   *    |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch character to convert into a digit
   * @param radix radix in which ch is a digit
   * @return digit which ch represents in radix, or -1 if ch is not a
   *         valid digit
   * @see #MIN_RADIX
   * @see #MAX_RADIX
   * @see #forDigit(int, int)
   * @see #isDigit(char)
   * @see #getNumericValue(char)
   */
  public static native int digit(char ch, int radix);
3030
  /**
   * Converts a code point into a digit of the specified radix. The result
   * is -1 if the radix lies outside the range MIN_RADIX to MAX_RADIX, if
   * the result of getNumericValue(int) exceeds the radix, or if codePoint
   * is neither a decimal digit nor in the case insensitive set of 'a'-'z'.
   * Unlike digit(char, int), this method supports supplementary Unicode
   * code points.
   * <br>
   * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
   *    |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint character to convert into a digit
   * @param radix radix in which codePoint is a digit
   * @return digit which codePoint represents in radix, or -1 if codePoint
   *         is not a valid digit
   * @see #MIN_RADIX
   * @see #MAX_RADIX
   * @see #forDigit(int, int)
   * @see #isDigit(int)
   * @see #getNumericValue(int)
   * @since 1.5
   */
  public static native int digit(int codePoint, int radix);
3052
  /**
   * Returns the Unicode numeric value property of a character. For example,
   * U+216C (the Roman numeral fifty) returns 50.
   *
   * <p>This method also returns values for the letters A through Z (not
   * specified by Unicode), in these ranges: U+0041 through U+005A
   * (uppercase); U+0061 through U+007A (lowercase); and U+FF21 through
   * U+FF3A, U+FF41 through U+FF5A (full width variants).
   *
   * <p>If the character lacks a numeric value property, -1 is returned.
   * If the character has a numeric value property which is not representable
   * as a nonnegative integer, such as a fraction, -2 is returned.
   * <br>
   * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
   *    |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch character from which the numeric value property will
   *        be retrieved
   * @return the numeric value property of ch, or -1 if it does not exist, or
   *         -2 if it is not representable as a nonnegative integer
   * @see #forDigit(int, int)
   * @see #digit(char, int)
   * @see #isDigit(char)
   * @since 1.1
   */
  public static native int getNumericValue(char ch);
3081
  /**
   * Returns the Unicode numeric value property of a code point. For example,
   * U+216C (the Roman numeral fifty) returns 50.
   *
   * <p>This method also returns values for the letters A through Z (not
   * specified by Unicode), in these ranges: U+0041 through U+005A
   * (uppercase); U+0061 through U+007A (lowercase); and U+FF21 through
   * U+FF3A, U+FF41 through U+FF5A (full width variants).
   *
   * <p>If the code point lacks a numeric value property, -1 is returned.
   * If the code point has a numeric value property which is not
   * representable as a nonnegative integer, such as a fraction, -2 is
   * returned.
   *
   * <p>Unlike getNumericValue(char), this method supports supplementary
   * Unicode code points.
   * <br>
   * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
   *    |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint character from which the numeric value property will
   *        be retrieved
   * @return the numeric value property of codePoint, or -1 if it does not
   *         exist, or -2 if it is not representable as a nonnegative integer
   * @see #forDigit(int, int)
   * @see #digit(int, int)
   * @see #isDigit(int)
   * @since 1.5
   */
  public static native int getNumericValue(int codePoint);
3113
3114   /**
3115    * Determines if a character is a ISO-LATIN-1 space. This is only the five
3116    * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
3117    * <code>'\r'</code>, and <code>' '</code>.
3118    * <br>
3119    * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
3120    *
3121    * @param ch character to test
3122    * @return true if ch is a space, else false
3123    * @deprecated Replaced by {@link #isWhitespace(char)}
3124    * @see #isSpaceChar(char)
3125    * @see #isWhitespace(char)
3126    */
3127   public static boolean isSpace(char ch)
3128   {
3129     // Performing the subtraction up front alleviates need to compare longs.
3130     return ch-- <= ' ' && ((1 << ch)
3131                            & ((1 << (' ' - 1))
3132                               | (1 << ('\t' - 1))
3133                               | (1 << ('\n' - 1))
3134                               | (1 << ('\r' - 1))
3135                               | (1 << ('\f' - 1)))) != 0;
3136   }
3137
3138   /**
3139    * Determines if a character is a Unicode space character. This includes
3140    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3141    * <br>
3142    * Unicode space = [Zs]|[Zp]|[Zl]
3143    *
3144    * @param ch character to test
3145    * @return true if ch is a Unicode space, else false
3146    * @see #isWhitespace(char)
3147    * @since 1.1
3148    */
3149   public static boolean isSpaceChar(char ch)
3150   {
3151     return ((1 << getType(ch))
3152             & ((1 << SPACE_SEPARATOR)
3153                | (1 << LINE_SEPARATOR)
3154                | (1 << PARAGRAPH_SEPARATOR))) != 0;
3155   }
3156
3157   /**
3158    * Determines if a character is a Unicode space character. This includes
3159    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.  Unlike
3160    * isSpaceChar(char), this method supports supplementary Unicode code points.
3161    * <br>
3162    * Unicode space = [Zs]|[Zp]|[Zl]
3163    *
3164    * @param codePoint character to test
3165    * @return true if codePoint is a Unicode space, else false
3166    * @see #isWhitespace(int)
3167    * @since 1.5
3168    */
3169   public static boolean isSpaceChar(int codePoint)
3170   {
3171     return ((1 << getType(codePoint))
3172             & ((1 << SPACE_SEPARATOR)
3173                | (1 << LINE_SEPARATOR)
3174                | (1 << PARAGRAPH_SEPARATOR))) != 0;
3175   }
3176
3177   /**
3178    * Determines if a character is Java whitespace. This includes Unicode
3179    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3180    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3181    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3182    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3183    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3184    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3185    * and <code>'\u001F'</code>.
3186    * <br>
3187    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3188    *
3189    * @param ch character to test
3190    * @return true if ch is Java whitespace, else false
3191    * @see #isSpaceChar(char)
3192    * @since 1.1
3193    */
3194   public static boolean isWhitespace(char ch)
3195   {
3196     int attr = readChar(ch);
3197     return ((((1 << (attr & TYPE_MASK))
3198               & ((1 << SPACE_SEPARATOR)
3199                  | (1 << LINE_SEPARATOR)
3200                  | (1 << PARAGRAPH_SEPARATOR))) != 0)
3201             && (attr & NO_BREAK_MASK) == 0)
3202       || (ch <= '\u001F' && ((1 << ch)
3203                              & ((1 << '\t')
3204                                 | (1 << '\n')
3205                                 | (1 << '\u000B')
3206                                 | (1 << '\u000C')
3207                                 | (1 << '\r')
3208                                 | (1 << '\u001C')
3209                                 | (1 << '\u001D')
3210                                 | (1 << '\u001E')
3211                                 | (1 << '\u001F'))) != 0);
3212   }
3213
3214   /**
3215    * Determines if a character is Java whitespace. This includes Unicode
3216    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3217    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3218    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3219    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3220    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3221    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3222    * and <code>'\u001F'</code>.  Unlike isWhitespace(char), this method
3223    * supports supplementary Unicode code points.
3224    * <br>
3225    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3226    *
3227    * @param codePoint character to test
3228    * @return true if codePoint is Java whitespace, else false
3229    * @see #isSpaceChar(int)
3230    * @since 1.5
3231    */
3232   public static boolean isWhitespace(int codePoint)
3233   {
3234     int plane = codePoint >>> 16;
3235     if (plane > 2 && plane != 14)
3236       return false;
3237     int attr = readCodePoint(codePoint);
3238     return ((((1 << (attr & TYPE_MASK))
3239               & ((1 << SPACE_SEPARATOR)
3240                  | (1 << LINE_SEPARATOR)
3241                  | (1 << PARAGRAPH_SEPARATOR))) != 0)
3242             && (attr & NO_BREAK_MASK) == 0)
3243       || (codePoint <= '\u001F' && ((1 << codePoint)
3244                              & ((1 << '\t')
3245                                 | (1 << '\n')
3246                                 | (1 << '\u000B')
3247                                 | (1 << '\u000C')
3248                                 | (1 << '\r')
3249                                 | (1 << '\u001C')
3250                                 | (1 << '\u001D')
3251                                 | (1 << '\u001E')
3252                                 | (1 << '\u001F'))) != 0);
3253   }
3254
3255   /**
3256    * Determines if a character has the ISO Control property.
3257    * <br>
3258    * ISO Control = [Cc]
3259    *
3260    * @param ch character to test
3261    * @return true if ch is an ISO Control character, else false
3262    * @see #isSpaceChar(char)
3263    * @see #isWhitespace(char)
3264    * @since 1.1
3265    */
3266   public static boolean isISOControl(char ch)
3267   {
3268     return getType(ch) == CONTROL;
3269   }
3270
3271   /**
3272    * Determines if a character has the ISO Control property.  Unlike
3273    * isISOControl(char), this method supports supplementary unicode
3274    * code points.
3275    * <br>
3276    * ISO Control = [Cc]
3277    *
3278    * @param codePoint character to test
3279    * @return true if codePoint is an ISO Control character, else false
3280    * @see #isSpaceChar(int)
3281    * @see #isWhitespace(int)
3282    * @since 1.5
3283    */
3284   public static boolean isISOControl(int codePoint)
3285   {
3286     return getType(codePoint) == CONTROL;
3287   }
3288
  /**
   * Returns the Unicode general category property of a character.  The
   * result is one of the category constants listed below.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch character from which the general category property will
   *        be retrieved
   * @return the character category property of ch as an integer
   * @see #UNASSIGNED
   * @see #UPPERCASE_LETTER
   * @see #LOWERCASE_LETTER
   * @see #TITLECASE_LETTER
   * @see #MODIFIER_LETTER
   * @see #OTHER_LETTER
   * @see #NON_SPACING_MARK
   * @see #ENCLOSING_MARK
   * @see #COMBINING_SPACING_MARK
   * @see #DECIMAL_DIGIT_NUMBER
   * @see #LETTER_NUMBER
   * @see #OTHER_NUMBER
   * @see #SPACE_SEPARATOR
   * @see #LINE_SEPARATOR
   * @see #PARAGRAPH_SEPARATOR
   * @see #CONTROL
   * @see #FORMAT
   * @see #PRIVATE_USE
   * @see #SURROGATE
   * @see #DASH_PUNCTUATION
   * @see #START_PUNCTUATION
   * @see #END_PUNCTUATION
   * @see #CONNECTOR_PUNCTUATION
   * @see #OTHER_PUNCTUATION
   * @see #MATH_SYMBOL
   * @see #CURRENCY_SYMBOL
   * @see #MODIFIER_SYMBOL
   * @see #INITIAL_QUOTE_PUNCTUATION
   * @see #FINAL_QUOTE_PUNCTUATION
   * @since 1.1
   */
  public static native int getType(char ch);
3327
  /**
   * Returns the Unicode general category property of a code point.  The
   * result is one of the category constants listed below.  Unlike
   * getType(char), this method supports supplementary Unicode code points.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint character from which the general category property will
   *        be retrieved
   * @return the character category property of codePoint as an integer
   * @see #UNASSIGNED
   * @see #UPPERCASE_LETTER
   * @see #LOWERCASE_LETTER
   * @see #TITLECASE_LETTER
   * @see #MODIFIER_LETTER
   * @see #OTHER_LETTER
   * @see #NON_SPACING_MARK
   * @see #ENCLOSING_MARK
   * @see #COMBINING_SPACING_MARK
   * @see #DECIMAL_DIGIT_NUMBER
   * @see #LETTER_NUMBER
   * @see #OTHER_NUMBER
   * @see #SPACE_SEPARATOR
   * @see #LINE_SEPARATOR
   * @see #PARAGRAPH_SEPARATOR
   * @see #CONTROL
   * @see #FORMAT
   * @see #PRIVATE_USE
   * @see #SURROGATE
   * @see #DASH_PUNCTUATION
   * @see #START_PUNCTUATION
   * @see #END_PUNCTUATION
   * @see #CONNECTOR_PUNCTUATION
   * @see #OTHER_PUNCTUATION
   * @see #MATH_SYMBOL
   * @see #CURRENCY_SYMBOL
   * @see #MODIFIER_SYMBOL
   * @see #INITIAL_QUOTE_PUNCTUATION
   * @see #FINAL_QUOTE_PUNCTUATION
   * @since 1.5
   */
  public static native int getType(int codePoint);
3367
3368   /**
3369    * Converts a digit into a character which represents that digit
3370    * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
3371    * or the digit exceeds the radix, then the null character <code>'\0'</code>
3372    * is returned.  Otherwise the return value is in '0'-'9' and 'a'-'z'.
3373    * <br>
3374    * return value boundary = U+0030-U+0039|U+0061-U+007A
3375    *
3376    * @param digit digit to be converted into a character
3377    * @param radix radix of digit
3378    * @return character representing digit in radix, or '\0'
3379    * @see #MIN_RADIX
3380    * @see #MAX_RADIX
3381    * @see #digit(char, int)
3382    */
3383   public static char forDigit(int digit, int radix)
3384   {
3385     if (radix < MIN_RADIX || radix > MAX_RADIX
3386         || digit < 0 || digit >= radix)
3387       return '\0';
3388     return (char) (digit < 10 ? ('0' + digit) : ('a' - 10 + digit));
3389   }
3390
  /**
   * Returns the Unicode directionality property of the character. This
   * is used in the visual ordering of text.  The result is one of the
   * directionality constants listed below.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch the character to look up
   * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
   * @see #DIRECTIONALITY_UNDEFINED
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
   * @see #DIRECTIONALITY_ARABIC_NUMBER
   * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
   * @see #DIRECTIONALITY_NONSPACING_MARK
   * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
   * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
   * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
   * @see #DIRECTIONALITY_WHITESPACE
   * @see #DIRECTIONALITY_OTHER_NEUTRALS
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
   * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
   * @since 1.4
   */
  public static native byte getDirectionality(char ch);
3420
  /**
   * Returns the Unicode directionality property of the code point. This
   * is used in the visual ordering of text.  The result is one of the
   * directionality constants listed below.  Unlike
   * getDirectionality(char), this method supports supplementary Unicode
   * code points.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param codePoint the character to look up
   * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
   * @see #DIRECTIONALITY_UNDEFINED
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
   * @see #DIRECTIONALITY_ARABIC_NUMBER
   * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
   * @see #DIRECTIONALITY_NONSPACING_MARK
   * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
   * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
   * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
   * @see #DIRECTIONALITY_WHITESPACE
   * @see #DIRECTIONALITY_OTHER_NEUTRALS
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
   * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
   * @since 1.5
   */
  public static native byte getDirectionality(int codePoint);
3451
3452   /**
3453    * Determines whether the character is mirrored according to Unicode. For
3454    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
3455    * left-to-right text, but ')' in right-to-left text.
3456    *
3457    * @param ch the character to look up
3458    * @return true if the character is mirrored
3459    * @since 1.4
3460    */
3461   public static boolean isMirrored(char ch)
3462   {
3463     return (readChar(ch) & MIRROR_MASK) != 0;
3464   }
3465
3466   /**
3467    * Determines whether the character is mirrored according to Unicode. For
3468    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
3469    * left-to-right text, but ')' in right-to-left text.  Unlike
3470    * isMirrored(char), this method supports supplementary Unicode code points.
3471    *
3472    * @param codePoint the character to look up
3473    * @return true if the character is mirrored
3474    * @since 1.5
3475    */
3476   public static boolean isMirrored(int codePoint)
3477   {
3478     int plane = codePoint >>> 16;
3479     if (plane > 2 && plane != 14)
3480       return false;
3481     return (readCodePoint(codePoint) & MIRROR_MASK) != 0;
3482   }
3483
3484   /**
3485    * Compares another Character to this Character, numerically.
3486    *
3487    * @param anotherCharacter Character to compare with this Character
3488    * @return a negative integer if this Character is less than
3489    *         anotherCharacter, zero if this Character is equal, and
3490    *         a positive integer if this Character is greater
3491    * @throws NullPointerException if anotherCharacter is null
3492    * @since 1.2
3493    */
3494   public int compareTo(Character anotherCharacter)
3495   {
3496     return value - anotherCharacter.value;
3497   }
3498
3499   /**
3500    * Compares an object to this Character.  Assuming the object is a
3501    * Character object, this method performs the same comparison as
3502    * compareTo(Character).
3503    *
3504    * @param o object to compare
3505    * @return the comparison value
3506    * @throws ClassCastException if o is not a Character object
3507    * @throws NullPointerException if o is null
3508    * @see #compareTo(Character)
3509    * @since 1.2
3510    */
3511   public int compareTo(Object o)
3512   {
3513     return compareTo((Character) o);
3514   }
3515
3516   /**
3517    * Returns an <code>Character</code> object wrapping the value.
3518    * In contrast to the <code>Character</code> constructor, this method
3519    * will cache some values.  It is used by boxing conversion.
3520    *
3521    * @param val the value to wrap
3522    * @return the <code>Character</code>
3523    *
3524    * @since 1.5
3525    */
3526   public static Character valueOf(char val)
3527   {
3528     if (val > MAX_CACHE)
3529       return new Character(val);
3530     synchronized (charCache)
3531       {
3532     if (charCache[val - MIN_VALUE] == null)
3533       charCache[val - MIN_VALUE] = new Character(val);
3534     return charCache[val - MIN_VALUE];
3535       }
3536   }
3537
3538   /**
3539    * Reverse the bytes in val.
3540    * @since 1.5
3541    */
3542   public static char reverseBytes(char val)
3543   {
3544     return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
3545   }
3546
3547   /**
3548    * Converts a unicode code point to a UTF-16 representation of that
3549    * code point.
3550    *
3551    * @param codePoint the unicode code point
3552    *
3553    * @return the UTF-16 representation of that code point
3554    *
3555    * @throws IllegalArgumentException if the code point is not a valid
3556    *         unicode code point
3557    *
3558    * @since 1.5
3559    */
3560   public static char[] toChars(int codePoint)
3561   {
3562     char[] result = new char[charCount(codePoint)];
3563     int ignore = toChars(codePoint, result, 0);
3564     return result;
3565   }
3566
3567   /**
3568    * Converts a unicode code point to its UTF-16 representation.
3569    *
3570    * @param codePoint the unicode code point
3571    * @param dst the target char array
3572    * @param dstIndex the start index for the target
3573    *
3574    * @return number of characters written to <code>dst</code>
3575    *
3576    * @throws IllegalArgumentException if <code>codePoint</code> is not a
3577    *         valid unicode code point
3578    * @throws NullPointerException if <code>dst</code> is <code>null</code>
3579    * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
3580    *         in <code>dst</code> or if the UTF-16 representation does not
3581    *         fit into <code>dst</code>
3582    *
3583    * @since 1.5
3584    */
3585   public static int toChars(int codePoint, char[] dst, int dstIndex)
3586   {
3587     if (!isValidCodePoint(codePoint))
3588       {
3589         throw new IllegalArgumentException("not a valid code point: "
3590                                            + codePoint);
3591       }
3592
3593     int result;
3594     if (isSupplementaryCodePoint(codePoint))
3595       {
3596         // Write second char first to cause IndexOutOfBoundsException
3597         // immediately.
3598         final int cp2 = codePoint - 0x10000;
3599         dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
3600         dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
3601         result = 2;
3602       }
3603     else
3604       {
3605         dst[dstIndex] = (char) codePoint;
3606         result = 1;
3607       }
3608     return result;
3609   }
3610
3611   /**
3612    * Return number of 16-bit characters required to represent the given
3613    * code point.
3614    *
3615    * @param codePoint a unicode code point
3616    *
3617    * @return 2 if codePoint >= 0x10000, 1 otherwise.
3618    *
3619    * @since 1.5
3620    */
3621   public static int charCount(int codePoint)
3622   {
3623     return
3624       (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
3625       ? 2
3626       : 1;
3627   }
3628
3629   /**
3630    * Determines whether the specified code point is
3631    * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
3632    * supplementary character range.
3633    *
3634    * @param codePoint a Unicode code point
3635    *
3636    * @return <code>true</code> if code point is in supplementary range
3637    *
3638    * @since 1.5
3639    */
3640   public static boolean isSupplementaryCodePoint(int codePoint)
3641   {
3642     return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
3643       && codePoint <= MAX_CODE_POINT;
3644   }
3645
3646   /**
3647    * Determines whether the specified code point is
3648    * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
3649    *
3650    * @param codePoint a Unicode code point
3651    *
3652    * @return <code>true</code> if code point is valid
3653    *
3654    * @since 1.5
3655    */
3656   public static boolean isValidCodePoint(int codePoint)
3657   {
3658     return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
3659   }
3660
3661   /**
3662    * Return true if the given character is a high surrogate.
3663    * @param ch the character
3664    * @return true if the character is a high surrogate character
3665    *
3666    * @since 1.5
3667    */
3668   public static boolean isHighSurrogate(char ch)
3669   {
3670     return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
3671   }
3672
3673   /**
3674    * Return true if the given character is a low surrogate.
3675    * @param ch the character
3676    * @return true if the character is a low surrogate character
3677    *
3678    * @since 1.5
3679    */
3680   public static boolean isLowSurrogate(char ch)
3681   {
3682     return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
3683   }
3684
3685   /**
3686    * Return true if the given characters compose a surrogate pair.
3687    * This is true if the first character is a high surrogate and the
3688    * second character is a low surrogate.
3689    * @param ch1 the first character
3690    * @param ch2 the first character
3691    * @return true if the characters compose a surrogate pair
3692    *
3693    * @since 1.5
3694    */
3695   public static boolean isSurrogatePair(char ch1, char ch2)
3696   {
3697     return isHighSurrogate(ch1) && isLowSurrogate(ch2);
3698   }
3699
3700   /**
3701    * Given a valid surrogate pair, this returns the corresponding
3702    * code point.
3703    * @param high the high character of the pair
3704    * @param low the low character of the pair
3705    * @return the corresponding code point
3706    *
3707    * @since 1.5
3708    */
3709   public static int toCodePoint(char high, char low)
3710   {
3711     return ((high - MIN_HIGH_SURROGATE) * 0x400) +
3712       (low - MIN_LOW_SURROGATE) + 0x10000;
3713   }
3714
3715   /**
3716    * Get the code point at the specified index in the CharSequence.
3717    * This is like CharSequence#charAt(int), but if the character is
3718    * the start of a surrogate pair, and there is a following
3719    * character, and this character completes the pair, then the
3720    * corresponding supplementary code point is returned.  Otherwise,
3721    * the character at the index is returned.
3722    *
3723    * @param sequence the CharSequence
3724    * @param index the index of the codepoint to get, starting at 0
3725    * @return the codepoint at the specified index
3726    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
3727    * @since 1.5
3728    */
3729   public static int codePointAt(CharSequence sequence, int index)
3730   {
3731     int len = sequence.length();
3732     if (index < 0 || index >= len)
3733       throw new IndexOutOfBoundsException();
3734     char high = sequence.charAt(index);
3735     if (! isHighSurrogate(high) || ++index >= len)
3736       return high;
3737     char low = sequence.charAt(index);
3738     if (! isLowSurrogate(low))
3739       return high;
3740     return toCodePoint(high, low);
3741   }
3742
3743   /**
3744    * Get the code point at the specified index in the CharSequence.
3745    * If the character is the start of a surrogate pair, and there is a
3746    * following character, and this character completes the pair, then
3747    * the corresponding supplementary code point is returned.
3748    * Otherwise, the character at the index is returned.
3749    *
3750    * @param chars the character array in which to look
3751    * @param index the index of the codepoint to get, starting at 0
3752    * @return the codepoint at the specified index
3753    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
3754    * @since 1.5
3755    */
3756   public static int codePointAt(char[] chars, int index)
3757   {
3758     return codePointAt(chars, index, chars.length);
3759   }
3760
3761   /**
3762    * Get the code point at the specified index in the CharSequence.
3763    * If the character is the start of a surrogate pair, and there is a
3764    * following character within the specified range, and this
3765    * character completes the pair, then the corresponding
3766    * supplementary code point is returned.  Otherwise, the character
3767    * at the index is returned.
3768    *
3769    * @param chars the character array in which to look
3770    * @param index the index of the codepoint to get, starting at 0
3771    * @param limit the limit past which characters should not be examined
3772    * @return the codepoint at the specified index
3773    * @throws IndexOutOfBoundsException if index is negative or &gt;=
3774    * limit, or if limit is negative or &gt;= the length of the array
3775    * @since 1.5
3776    */
3777   public static int codePointAt(char[] chars, int index, int limit)
3778   {
3779     if (index < 0 || index >= limit || limit < 0 || limit >= chars.length)
3780       throw new IndexOutOfBoundsException();
3781     char high = chars[index];
3782     if (! isHighSurrogate(high) || ++index >= limit)
3783       return high;
3784     char low = chars[index];
3785     if (! isLowSurrogate(low))
3786       return high;
3787     return toCodePoint(high, low);
3788   }
3789
3790   /**
3791    * Get the code point before the specified index.  This is like
3792    * #codePointAt(char[], int), but checks the characters at
3793    * <code>index-1</code> and <code>index-2</code> to see if they form
3794    * a supplementary code point.  If they do not, the character at
3795    * <code>index-1</code> is returned.
3796    *
3797    * @param chars the character array
3798    * @param index the index just past the codepoint to get, starting at 0
3799    * @return the codepoint at the specified index
3800    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
3801    * @since 1.5
3802    */
3803   public static int codePointBefore(char[] chars, int index)
3804   {
3805     return codePointBefore(chars, index, 1);
3806   }
3807
3808   /**
3809    * Get the code point before the specified index.  This is like
3810    * #codePointAt(char[], int), but checks the characters at
3811    * <code>index-1</code> and <code>index-2</code> to see if they form
3812    * a supplementary code point.  If they do not, the character at
3813    * <code>index-1</code> is returned.  The start parameter is used to
3814    * limit the range of the array which may be examined.
3815    *
3816    * @param chars the character array
3817    * @param index the index just past the codepoint to get, starting at 0
3818    * @param start the index before which characters should not be examined
3819    * @return the codepoint at the specified index
3820    * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
3821    * the length of the array, or if limit is negative or &gt;= the
3822    * length of the array
3823    * @since 1.5
3824    */
3825   public static int codePointBefore(char[] chars, int index, int start)
3826   {
3827     if (index < start || index > chars.length
3828         || start < 0 || start >= chars.length)
3829       throw new IndexOutOfBoundsException();
3830     --index;
3831     char low = chars[index];
3832     if (! isLowSurrogate(low) || --index < start)
3833       return low;
3834     char high = chars[index];
3835     if (! isHighSurrogate(high))
3836       return low;
3837     return toCodePoint(high, low);
3838   }
3839
3840   /**
3841    * Get the code point before the specified index.  This is like
3842    * #codePointAt(CharSequence, int), but checks the characters at
3843    * <code>index-1</code> and <code>index-2</code> to see if they form
3844    * a supplementary code point.  If they do not, the character at
3845    * <code>index-1</code> is returned.
3846    *
3847    * @param sequence the CharSequence
3848    * @param index the index just past the codepoint to get, starting at 0
3849    * @return the codepoint at the specified index
3850    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
3851    * @since 1.5
3852    */
3853   public static int codePointBefore(CharSequence sequence, int index)
3854   {
3855     int len = sequence.length();
3856     if (index < 1 || index > len)
3857       throw new IndexOutOfBoundsException();
3858     --index;
3859     char low = sequence.charAt(index);
3860     if (! isLowSurrogate(low) || --index < 0)
3861       return low;
3862     char high = sequence.charAt(index);
3863     if (! isHighSurrogate(high))
3864       return low;
3865     return toCodePoint(high, low);
3866   }
3867 } // class Character