libjava/java/lang/Character.java

   1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
   2    Copyright (C) 1998, 1999, 2001, 2002, 2005, 2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 /*
  39  * Note: This class must not be merged with Classpath.  Gcj uses C-style
  40  * arrays (see include/java-chartables.h) to store the Unicode character
  41  * database, whereas Classpath uses Java objects (char[] extracted from
  42  * String constants) in gnu.java.lang.CharData.  Gcj's approach is more
  43  * efficient, because there is no vtable or data relocation to worry about.
  44  * However, despite the difference in the database interface, the two
  45  * versions share identical algorithms.
  46  */
  47
  48 package java.lang;
  49
  50 import java.io.Serializable;
  51
  52 /**
  53  * Wrapper class for the primitive char data type.  In addition, this class
  54  * allows one to retrieve property information and perform transformations
  55  * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0.
  56  * java.lang.Character is designed to be very dynamic, and as such, it
  57  * retrieves information on the Unicode character set from a separate
  58  * database (for gcj, include/java-chartables.h), which can be easily upgraded.
  59  *
  60  * <p>For predicates, boundaries are used to describe
  61  * the set of characters for which the method will return true.
  62  * This syntax uses fairly normal regular expression notation.
  63  * See 5.13 of the Unicode Standard, Version 3.0, for the
  64  * boundary specification.
  65  *
  66  * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
  67  * for more information on the Unicode Standard.
  68  *
  69  * @author Tom Tromey (tromey@cygnus.com)
  70  * @author Paul N. Fisher
  71  * @author Jochen Hoenicke
  72  * @author Eric Blake (ebb9@email.byu.edu)
  73  * @since 1.0
  74  * @status updated to 1.4
  75  */
  76 public final class Character implements Serializable, Comparable
  77 {
  78   /**
  79    * A subset of Unicode blocks.
  80    *
  81    * @author Paul N. Fisher
  82    * @author Eric Blake (ebb9@email.byu.edu)
  83    * @since 1.2
  84    */
  85   public static class Subset
  86   {
  87     /** The name of the subset. */
  88     private final String name;
  89
  90     /**
  91      * Construct a new subset of characters.
  92      *
  93      * @param name the name of the subset
  94      * @throws NullPointerException if name is null
  95      */
  96     protected Subset(String name)
  97     {
  98       // Note that name.toString() is name, unless name was null.
  99       this.name = name.toString();
 100     }
 101
 102     /**
 103      * Compares two Subsets for equality. This is <code>final</code>, and
 104      * restricts the comparison on the <code>==</code> operator, so it returns
 105      * true only for the same object.
 106      *
 107      * @param o the object to compare
 108      * @return true if o is this
 109      */
 110     public final boolean equals(Object o)
 111     {
 112       return o == this;
 113     }
 114
 115     /**
 116      * Makes the original hashCode of Object final, to be consistent with
 117      * equals.
 118      *
 119      * @return the hash code for this object
 120      */
 121     public final int hashCode()
 122     {
 123       return super.hashCode();
 124     }
 125
 126     /**
 127      * Returns the name of the subset.
 128      *
 129      * @return the name
 130      */
 131     public final String toString()
 132     {
 133       return name;
 134     }
 135   } // class Subset
 136
 137   /**
 138    * A family of character subsets in the Unicode specification. A character
 139    * is in at most one of these blocks.
 140    *
 141    * This inner class was generated automatically from
 142    * <code>libjava/gnu/gcj/convert/Blocks-3.txt</code>, by some perl scripts.
 143    * This Unicode definition file can be found on the
 144    * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 145    * JDK 1.4 uses Unicode version 3.0.0.
 146    *
 147    * @author scripts/unicode-blocks.pl (written by Eric Blake)
 148    * @since 1.2
 149    */
 150   public static final class UnicodeBlock extends Subset
 151   {
 152     /** The start of the subset. */
 153     private final char start;
 154
 155     /** The end of the subset. */
 156     private final char end;
 157
 158     /**
 159      * Constructor for strictly defined blocks.
 160      *
 161      * @param start the start character of the range
 162      * @param end the end character of the range
 163      * @param name the block name
 164      */
 165     private UnicodeBlock(char start, char end, String name)
 166     {
 167       super(name);
 168       this.start = start;
 169       this.end = end;
 170     }
 171
 172     /**
 173      * Returns the Unicode character block which a character belongs to.
 174      *
 175      * @param ch the character to look up
 176      * @return the set it belongs to, or null if it is not in one
 177      */
 178     public static UnicodeBlock of(char ch)
 179     {
 180       // Special case, since SPECIALS contains two ranges.
 181       if (ch == '\uFEFF')
 182         return SPECIALS;
 183       // Simple binary search for the correct block.
 184       int low = 0;
 185       int hi = sets.length - 1;
 186       while (low <= hi)
 187         {
 188           int mid = (low + hi) >> 1;
 189           UnicodeBlock b = sets[mid];
 190           if (ch < b.start)
 191             hi = mid - 1;
 192           else if (ch > b.end)
 193             low = mid + 1;
 194           else
 195             return b;
 196         }
 197       return null;
 198     }
 199
 200     /**
 201      * Basic Latin.
 202      * '\u0000' - '\u007F'.
 203      */
 204     public static final UnicodeBlock BASIC_LATIN
 205       = new UnicodeBlock('\u0000', '\u007F',
 206                          "BASIC_LATIN");
 207
 208     /**
 209      * Latin-1 Supplement.
 210      * '\u0080' - '\u00FF'.
 211      */
 212     public static final UnicodeBlock LATIN_1_SUPPLEMENT
 213       = new UnicodeBlock('\u0080', '\u00FF',
 214                          "LATIN_1_SUPPLEMENT");
 215
 216     /**
 217      * Latin Extended-A.
 218      * '\u0100' - '\u017F'.
 219      */
 220     public static final UnicodeBlock LATIN_EXTENDED_A
 221       = new UnicodeBlock('\u0100', '\u017F',
 222                          "LATIN_EXTENDED_A");
 223
 224     /**
 225      * Latin Extended-B.
 226      * '\u0180' - '\u024F'.
 227      */
 228     public static final UnicodeBlock LATIN_EXTENDED_B
 229       = new UnicodeBlock('\u0180', '\u024F',
 230                          "LATIN_EXTENDED_B");
 231
 232     /**
 233      * IPA Extensions.
 234      * '\u0250' - '\u02AF'.
 235      */
 236     public static final UnicodeBlock IPA_EXTENSIONS
 237       = new UnicodeBlock('\u0250', '\u02AF',
 238                          "IPA_EXTENSIONS");
 239
 240     /**
 241      * Spacing Modifier Letters.
 242      * '\u02B0' - '\u02FF'.
 243      */
 244     public static final UnicodeBlock SPACING_MODIFIER_LETTERS
 245       = new UnicodeBlock('\u02B0', '\u02FF',
 246                          "SPACING_MODIFIER_LETTERS");
 247
 248     /**
 249      * Combining Diacritical Marks.
 250      * '\u0300' - '\u036F'.
 251      */
 252     public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
 253       = new UnicodeBlock('\u0300', '\u036F',
 254                          "COMBINING_DIACRITICAL_MARKS");
 255
 256     /**
 257      * Greek.
 258      * '\u0370' - '\u03FF'.
 259      */
 260     public static final UnicodeBlock GREEK
 261       = new UnicodeBlock('\u0370', '\u03FF',
 262                          "GREEK");
 263
 264     /**
 265      * Cyrillic.
 266      * '\u0400' - '\u04FF'.
 267      */
 268     public static final UnicodeBlock CYRILLIC
 269       = new UnicodeBlock('\u0400', '\u04FF',
 270                          "CYRILLIC");
 271
 272     /**
 273      * Armenian.
 274      * '\u0530' - '\u058F'.
 275      */
 276     public static final UnicodeBlock ARMENIAN
 277       = new UnicodeBlock('\u0530', '\u058F',
 278                          "ARMENIAN");
 279
 280     /**
 281      * Hebrew.
 282      * '\u0590' - '\u05FF'.
 283      */
 284     public static final UnicodeBlock HEBREW
 285       = new UnicodeBlock('\u0590', '\u05FF',
 286                          "HEBREW");
 287
 288     /**
 289      * Arabic.
 290      * '\u0600' - '\u06FF'.
 291      */
 292     public static final UnicodeBlock ARABIC
 293       = new UnicodeBlock('\u0600', '\u06FF',
 294                          "ARABIC");
 295
 296     /**
 297      * Syriac.
 298      * '\u0700' - '\u074F'.
 299      * @since 1.4
 300      */
 301     public static final UnicodeBlock SYRIAC
 302       = new UnicodeBlock('\u0700', '\u074F',
 303                          "SYRIAC");
 304
 305     /**
 306      * Thaana.
 307      * '\u0780' - '\u07BF'.
 308      * @since 1.4
 309      */
 310     public static final UnicodeBlock THAANA
 311       = new UnicodeBlock('\u0780', '\u07BF',
 312                          "THAANA");
 313
 314     /**
 315      * Devanagari.
 316      * '\u0900' - '\u097F'.
 317      */
 318     public static final UnicodeBlock DEVANAGARI
 319       = new UnicodeBlock('\u0900', '\u097F',
 320                          "DEVANAGARI");
 321
 322     /**
 323      * Bengali.
 324      * '\u0980' - '\u09FF'.
 325      */
 326     public static final UnicodeBlock BENGALI
 327       = new UnicodeBlock('\u0980', '\u09FF',
 328                          "BENGALI");
 329
 330     /**
 331      * Gurmukhi.
 332      * '\u0A00' - '\u0A7F'.
 333      */
 334     public static final UnicodeBlock GURMUKHI
 335       = new UnicodeBlock('\u0A00', '\u0A7F',
 336                          "GURMUKHI");
 337
 338     /**
 339      * Gujarati.
 340      * '\u0A80' - '\u0AFF'.
 341      */
 342     public static final UnicodeBlock GUJARATI
 343       = new UnicodeBlock('\u0A80', '\u0AFF',
 344                          "GUJARATI");
 345
 346     /**
 347      * Oriya.
 348      * '\u0B00' - '\u0B7F'.
 349      */
 350     public static final UnicodeBlock ORIYA
 351       = new UnicodeBlock('\u0B00', '\u0B7F',
 352                          "ORIYA");
 353
 354     /**
 355      * Tamil.
 356      * '\u0B80' - '\u0BFF'.
 357      */
 358     public static final UnicodeBlock TAMIL
 359       = new UnicodeBlock('\u0B80', '\u0BFF',
 360                          "TAMIL");
 361
 362     /**
 363      * Telugu.
 364      * '\u0C00' - '\u0C7F'.
 365      */
 366     public static final UnicodeBlock TELUGU
 367       = new UnicodeBlock('\u0C00', '\u0C7F',
 368                          "TELUGU");
 369
 370     /**
 371      * Kannada.
 372      * '\u0C80' - '\u0CFF'.
 373      */
 374     public static final UnicodeBlock KANNADA
 375       = new UnicodeBlock('\u0C80', '\u0CFF',
 376                          "KANNADA");
 377
 378     /**
 379      * Malayalam.
 380      * '\u0D00' - '\u0D7F'.
 381      */
 382     public static final UnicodeBlock MALAYALAM
 383       = new UnicodeBlock('\u0D00', '\u0D7F',
 384                          "MALAYALAM");
 385
 386     /**
 387      * Sinhala.
 388      * '\u0D80' - '\u0DFF'.
 389      * @since 1.4
 390      */
 391     public static final UnicodeBlock SINHALA
 392       = new UnicodeBlock('\u0D80', '\u0DFF',
 393                          "SINHALA");
 394
 395     /**
 396      * Thai.
 397      * '\u0E00' - '\u0E7F'.
 398      */
 399     public static final UnicodeBlock THAI
 400       = new UnicodeBlock('\u0E00', '\u0E7F',
 401                          "THAI");
 402
 403     /**
 404      * Lao.
 405      * '\u0E80' - '\u0EFF'.
 406      */
 407     public static final UnicodeBlock LAO
 408       = new UnicodeBlock('\u0E80', '\u0EFF',
 409                          "LAO");
 410
 411     /**
 412      * Tibetan.
 413      * '\u0F00' - '\u0FFF'.
 414      */
 415     public static final UnicodeBlock TIBETAN
 416       = new UnicodeBlock('\u0F00', '\u0FFF',
 417                          "TIBETAN");
 418
 419     /**
 420      * Myanmar.
 421      * '\u1000' - '\u109F'.
 422      * @since 1.4
 423      */
 424     public static final UnicodeBlock MYANMAR
 425       = new UnicodeBlock('\u1000', '\u109F',
 426                          "MYANMAR");
 427
 428     /**
 429      * Georgian.
 430      * '\u10A0' - '\u10FF'.
 431      */
 432     public static final UnicodeBlock GEORGIAN
 433       = new UnicodeBlock('\u10A0', '\u10FF',
 434                          "GEORGIAN");
 435
 436     /**
 437      * Hangul Jamo.
 438      * '\u1100' - '\u11FF'.
 439      */
 440     public static final UnicodeBlock HANGUL_JAMO
 441       = new UnicodeBlock('\u1100', '\u11FF',
 442                          "HANGUL_JAMO");
 443
 444     /**
 445      * Ethiopic.
 446      * '\u1200' - '\u137F'.
 447      * @since 1.4
 448      */
 449     public static final UnicodeBlock ETHIOPIC
 450       = new UnicodeBlock('\u1200', '\u137F',
 451                          "ETHIOPIC");
 452
 453     /**
 454      * Cherokee.
 455      * '\u13A0' - '\u13FF'.
 456      * @since 1.4
 457      */
 458     public static final UnicodeBlock CHEROKEE
 459       = new UnicodeBlock('\u13A0', '\u13FF',
 460                          "CHEROKEE");
 461
 462     /**
 463      * Unified Canadian Aboriginal Syllabics.
 464      * '\u1400' - '\u167F'.
 465      * @since 1.4
 466      */
 467     public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
 468       = new UnicodeBlock('\u1400', '\u167F',
 469                          "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS");
 470
 471     /**
 472      * Ogham.
 473      * '\u1680' - '\u169F'.
 474      * @since 1.4
 475      */
 476     public static final UnicodeBlock OGHAM
 477       = new UnicodeBlock('\u1680', '\u169F',
 478                          "OGHAM");
 479
 480     /**
 481      * Runic.
 482      * '\u16A0' - '\u16FF'.
 483      * @since 1.4
 484      */
 485     public static final UnicodeBlock RUNIC
 486       = new UnicodeBlock('\u16A0', '\u16FF',
 487                          "RUNIC");
 488
 489     /**
 490      * Khmer.
 491      * '\u1780' - '\u17FF'.
 492      * @since 1.4
 493      */
 494     public static final UnicodeBlock KHMER
 495       = new UnicodeBlock('\u1780', '\u17FF',
 496                          "KHMER");
 497
 498     /**
 499      * Mongolian.
 500      * '\u1800' - '\u18AF'.
 501      * @since 1.4
 502      */
 503     public static final UnicodeBlock MONGOLIAN
 504       = new UnicodeBlock('\u1800', '\u18AF',
 505                          "MONGOLIAN");
 506
 507     /**
 508      * Latin Extended Additional.
 509      * '\u1E00' - '\u1EFF'.
 510      */
 511     public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
 512       = new UnicodeBlock('\u1E00', '\u1EFF',
 513                          "LATIN_EXTENDED_ADDITIONAL");
 514
 515     /**
 516      * Greek Extended.
 517      * '\u1F00' - '\u1FFF'.
 518      */
 519     public static final UnicodeBlock GREEK_EXTENDED
 520       = new UnicodeBlock('\u1F00', '\u1FFF',
 521                          "GREEK_EXTENDED");
 522
 523     /**
 524      * General Punctuation.
 525      * '\u2000' - '\u206F'.
 526      */
 527     public static final UnicodeBlock GENERAL_PUNCTUATION
 528       = new UnicodeBlock('\u2000', '\u206F',
 529                          "GENERAL_PUNCTUATION");
 530
 531     /**
 532      * Superscripts and Subscripts.
 533      * '\u2070' - '\u209F'.
 534      */
 535     public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
 536       = new UnicodeBlock('\u2070', '\u209F',
 537                          "SUPERSCRIPTS_AND_SUBSCRIPTS");
 538
 539     /**
 540      * Currency Symbols.
 541      * '\u20A0' - '\u20CF'.
 542      */
 543     public static final UnicodeBlock CURRENCY_SYMBOLS
 544       = new UnicodeBlock('\u20A0', '\u20CF',
 545                          "CURRENCY_SYMBOLS");
 546
 547     /**
 548      * Combining Marks for Symbols.
 549      * '\u20D0' - '\u20FF'.
 550      */
 551     public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
 552       = new UnicodeBlock('\u20D0', '\u20FF',
 553                          "COMBINING_MARKS_FOR_SYMBOLS");
 554
 555     /**
 556      * Letterlike Symbols.
 557      * '\u2100' - '\u214F'.
 558      */
 559     public static final UnicodeBlock LETTERLIKE_SYMBOLS
 560       = new UnicodeBlock('\u2100', '\u214F',
 561                          "LETTERLIKE_SYMBOLS");
 562
 563     /**
 564      * Number Forms.
 565      * '\u2150' - '\u218F'.
 566      */
 567     public static final UnicodeBlock NUMBER_FORMS
 568       = new UnicodeBlock('\u2150', '\u218F',
 569                          "NUMBER_FORMS");
 570
 571     /**
 572      * Arrows.
 573      * '\u2190' - '\u21FF'.
 574      */
 575     public static final UnicodeBlock ARROWS
 576       = new UnicodeBlock('\u2190', '\u21FF',
 577                          "ARROWS");
 578
 579     /**
 580      * Mathematical Operators.
 581      * '\u2200' - '\u22FF'.
 582      */
 583     public static final UnicodeBlock MATHEMATICAL_OPERATORS
 584       = new UnicodeBlock('\u2200', '\u22FF',
 585                          "MATHEMATICAL_OPERATORS");
 586
 587     /**
 588      * Miscellaneous Technical.
 589      * '\u2300' - '\u23FF'.
 590      */
 591     public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
 592       = new UnicodeBlock('\u2300', '\u23FF',
 593                          "MISCELLANEOUS_TECHNICAL");
 594
 595     /**
 596      * Control Pictures.
 597      * '\u2400' - '\u243F'.
 598      */
 599     public static final UnicodeBlock CONTROL_PICTURES
 600       = new UnicodeBlock('\u2400', '\u243F',
 601                          "CONTROL_PICTURES");
 602
 603     /**
 604      * Optical Character Recognition.
 605      * '\u2440' - '\u245F'.
 606      */
 607     public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
 608       = new UnicodeBlock('\u2440', '\u245F',
 609                          "OPTICAL_CHARACTER_RECOGNITION");
 610
 611     /**
 612      * Enclosed Alphanumerics.
 613      * '\u2460' - '\u24FF'.
 614      */
 615     public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
 616       = new UnicodeBlock('\u2460', '\u24FF',
 617                          "ENCLOSED_ALPHANUMERICS");
 618
 619     /**
 620      * Box Drawing.
 621      * '\u2500' - '\u257F'.
 622      */
 623     public static final UnicodeBlock BOX_DRAWING
 624       = new UnicodeBlock('\u2500', '\u257F',
 625                          "BOX_DRAWING");
 626
 627     /**
 628      * Block Elements.
 629      * '\u2580' - '\u259F'.
 630      */
 631     public static final UnicodeBlock BLOCK_ELEMENTS
 632       = new UnicodeBlock('\u2580', '\u259F',
 633                          "BLOCK_ELEMENTS");
 634
 635     /**
 636      * Geometric Shapes.
 637      * '\u25A0' - '\u25FF'.
 638      */
 639     public static final UnicodeBlock GEOMETRIC_SHAPES
 640       = new UnicodeBlock('\u25A0', '\u25FF',
 641                          "GEOMETRIC_SHAPES");
 642
 643     /**
 644      * Miscellaneous Symbols.
 645      * '\u2600' - '\u26FF'.
 646      */
 647     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
 648       = new UnicodeBlock('\u2600', '\u26FF',
 649                          "MISCELLANEOUS_SYMBOLS");
 650
 651     /**
 652      * Dingbats.
 653      * '\u2700' - '\u27BF'.
 654      */
 655     public static final UnicodeBlock DINGBATS
 656       = new UnicodeBlock('\u2700', '\u27BF',
 657                          "DINGBATS");
 658
 659     /**
 660      * Braille Patterns.
 661      * '\u2800' - '\u28FF'.
 662      * @since 1.4
 663      */
 664     public static final UnicodeBlock BRAILLE_PATTERNS
 665       = new UnicodeBlock('\u2800', '\u28FF',
 666                          "BRAILLE_PATTERNS");
 667
 668     /**
 669      * CJK Radicals Supplement.
 670      * '\u2E80' - '\u2EFF'.
 671      * @since 1.4
 672      */
 673     public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
 674       = new UnicodeBlock('\u2E80', '\u2EFF',
 675                          "CJK_RADICALS_SUPPLEMENT");
 676
 677     /**
 678      * Kangxi Radicals.
 679      * '\u2F00' - '\u2FDF'.
 680      * @since 1.4
 681      */
 682     public static final UnicodeBlock KANGXI_RADICALS
 683       = new UnicodeBlock('\u2F00', '\u2FDF',
 684                          "KANGXI_RADICALS");
 685
 686     /**
 687      * Ideographic Description Characters.
 688      * '\u2FF0' - '\u2FFF'.
 689      * @since 1.4
 690      */
 691     public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
 692       = new UnicodeBlock('\u2FF0', '\u2FFF',
 693                          "IDEOGRAPHIC_DESCRIPTION_CHARACTERS");
 694
 695     /**
 696      * CJK Symbols and Punctuation.
 697      * '\u3000' - '\u303F'.
 698      */
 699     public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
 700       = new UnicodeBlock('\u3000', '\u303F',
 701                          "CJK_SYMBOLS_AND_PUNCTUATION");
 702
 703     /**
 704      * Hiragana.
 705      * '\u3040' - '\u309F'.
 706      */
 707     public static final UnicodeBlock HIRAGANA
 708       = new UnicodeBlock('\u3040', '\u309F',
 709                          "HIRAGANA");
 710
 711     /**
 712      * Katakana.
 713      * '\u30A0' - '\u30FF'.
 714      */
 715     public static final UnicodeBlock KATAKANA
 716       = new UnicodeBlock('\u30A0', '\u30FF',
 717                          "KATAKANA");
 718
 719     /**
 720      * Bopomofo.
 721      * '\u3100' - '\u312F'.
 722      */
 723     public static final UnicodeBlock BOPOMOFO
 724       = new UnicodeBlock('\u3100', '\u312F',
 725                          "BOPOMOFO");
 726
 727     /**
 728      * Hangul Compatibility Jamo.
 729      * '\u3130' - '\u318F'.
 730      */
 731     public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
 732       = new UnicodeBlock('\u3130', '\u318F',
 733                          "HANGUL_COMPATIBILITY_JAMO");
 734
 735     /**
 736      * Kanbun.
 737      * '\u3190' - '\u319F'.
 738      */
 739     public static final UnicodeBlock KANBUN
 740       = new UnicodeBlock('\u3190', '\u319F',
 741                          "KANBUN");
 742
 743     /**
 744      * Bopomofo Extended.
 745      * '\u31A0' - '\u31BF'.
 746      * @since 1.4
 747      */
 748     public static final UnicodeBlock BOPOMOFO_EXTENDED
 749       = new UnicodeBlock('\u31A0', '\u31BF',
 750                          "BOPOMOFO_EXTENDED");
 751
 752     /**
 753      * Enclosed CJK Letters and Months.
 754      * '\u3200' - '\u32FF'.
 755      */
 756     public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
 757       = new UnicodeBlock('\u3200', '\u32FF',
 758                          "ENCLOSED_CJK_LETTERS_AND_MONTHS");
 759
 760     /**
 761      * CJK Compatibility.
 762      * '\u3300' - '\u33FF'.
 763      */
 764     public static final UnicodeBlock CJK_COMPATIBILITY
 765       = new UnicodeBlock('\u3300', '\u33FF',
 766                          "CJK_COMPATIBILITY");
 767
 768     /**
 769      * CJK Unified Ideographs Extension A.
 770      * '\u3400' - '\u4DB5'.
 771      * @since 1.4
 772      */
 773     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
 774       = new UnicodeBlock('\u3400', '\u4DB5',
 775                          "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A");
 776
 777     /**
 778      * CJK Unified Ideographs.
 779      * '\u4E00' - '\u9FFF'.
 780      */
 781     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
 782       = new UnicodeBlock('\u4E00', '\u9FFF',
 783                          "CJK_UNIFIED_IDEOGRAPHS");
 784
 785     /**
 786      * Yi Syllables.
 787      * '\uA000' - '\uA48F'.
 788      * @since 1.4
 789      */
 790     public static final UnicodeBlock YI_SYLLABLES
 791       = new UnicodeBlock('\uA000', '\uA48F',
 792                          "YI_SYLLABLES");
 793
 794     /**
 795      * Yi Radicals.
 796      * '\uA490' - '\uA4CF'.
 797      * @since 1.4
 798      */
 799     public static final UnicodeBlock YI_RADICALS
 800       = new UnicodeBlock('\uA490', '\uA4CF',
 801                          "YI_RADICALS");
 802
 803     /**
 804      * Hangul Syllables.
 805      * '\uAC00' - '\uD7A3'.
 806      */
 807     public static final UnicodeBlock HANGUL_SYLLABLES
 808       = new UnicodeBlock('\uAC00', '\uD7A3',
 809                          "HANGUL_SYLLABLES");
 810
 811     /**
 812      * Surrogates Area.
 813      * '\uD800' - '\uDFFF'.
 814      */
 815     public static final UnicodeBlock SURROGATES_AREA
 816       = new UnicodeBlock('\uD800', '\uDFFF',
 817                          "SURROGATES_AREA");
 818
 819     /**
 820      * Private Use Area.
 821      * '\uE000' - '\uF8FF'.
 822      */
 823     public static final UnicodeBlock PRIVATE_USE_AREA
 824       = new UnicodeBlock('\uE000', '\uF8FF',
 825                          "PRIVATE_USE_AREA");
 826
 827     /**
 828      * CJK Compatibility Ideographs.
 829      * '\uF900' - '\uFAFF'.
 830      */
 831     public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
 832       = new UnicodeBlock('\uF900', '\uFAFF',
 833                          "CJK_COMPATIBILITY_IDEOGRAPHS");
 834
 835     /**
 836      * Alphabetic Presentation Forms.
 837      * '\uFB00' - '\uFB4F'.
 838      */
 839     public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
 840       = new UnicodeBlock('\uFB00', '\uFB4F',
 841                          "ALPHABETIC_PRESENTATION_FORMS");
 842
 843     /**
 844      * Arabic Presentation Forms-A.
 845      * '\uFB50' - '\uFDFF'.
 846      */
 847     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
 848       = new UnicodeBlock('\uFB50', '\uFDFF',
 849                          "ARABIC_PRESENTATION_FORMS_A");
 850
 851     /**
 852      * Combining Half Marks.
 853      * '\uFE20' - '\uFE2F'.
 854      */
 855     public static final UnicodeBlock COMBINING_HALF_MARKS
 856       = new UnicodeBlock('\uFE20', '\uFE2F',
 857                          "COMBINING_HALF_MARKS");
 858
 859     /**
 860      * CJK Compatibility Forms.
 861      * '\uFE30' - '\uFE4F'.
 862      */
 863     public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
 864       = new UnicodeBlock('\uFE30', '\uFE4F',
 865                          "CJK_COMPATIBILITY_FORMS");
 866
 867     /**
 868      * Small Form Variants.
 869      * '\uFE50' - '\uFE6F'.
 870      */
 871     public static final UnicodeBlock SMALL_FORM_VARIANTS
 872       = new UnicodeBlock('\uFE50', '\uFE6F',
 873                          "SMALL_FORM_VARIANTS");
 874
 875     /**
 876      * Arabic Presentation Forms-B.
 877      * '\uFE70' - '\uFEFE'.
 878      */
 879     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
 880       = new UnicodeBlock('\uFE70', '\uFEFE',
 881                          "ARABIC_PRESENTATION_FORMS_B");
 882
 883     /**
 884      * Halfwidth and Fullwidth Forms.
 885      * '\uFF00' - '\uFFEF'.
 886      */
 887     public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
 888       = new UnicodeBlock('\uFF00', '\uFFEF',
 889                          "HALFWIDTH_AND_FULLWIDTH_FORMS");
 890
 891     /**
 892      * Specials.
 893      * '\uFEFF', '\uFFF0' - '\uFFFD'.
 894      */
 895     public static final UnicodeBlock SPECIALS
 896       = new UnicodeBlock('\uFFF0', '\uFFFD',
 897                          "SPECIALS");
 898
 899     /**
 900      * The defined subsets.
 901      */
 902     private static final UnicodeBlock sets[] = {
 903       BASIC_LATIN,
 904       LATIN_1_SUPPLEMENT,
 905       LATIN_EXTENDED_A,
 906       LATIN_EXTENDED_B,
 907       IPA_EXTENSIONS,
 908       SPACING_MODIFIER_LETTERS,
 909       COMBINING_DIACRITICAL_MARKS,
 910       GREEK,
 911       CYRILLIC,
 912       ARMENIAN,
 913       HEBREW,
 914       ARABIC,
 915       SYRIAC,
 916       THAANA,
 917       DEVANAGARI,
 918       BENGALI,
 919       GURMUKHI,
 920       GUJARATI,
 921       ORIYA,
 922       TAMIL,
 923       TELUGU,
 924       KANNADA,
 925       MALAYALAM,
 926       SINHALA,
 927       THAI,
 928       LAO,
 929       TIBETAN,
 930       MYANMAR,
 931       GEORGIAN,
 932       HANGUL_JAMO,
 933       ETHIOPIC,
 934       CHEROKEE,
 935       UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
 936       OGHAM,
 937       RUNIC,
 938       KHMER,
 939       MONGOLIAN,
 940       LATIN_EXTENDED_ADDITIONAL,
 941       GREEK_EXTENDED,
 942       GENERAL_PUNCTUATION,
 943       SUPERSCRIPTS_AND_SUBSCRIPTS,
 944       CURRENCY_SYMBOLS,
 945       COMBINING_MARKS_FOR_SYMBOLS,
 946       LETTERLIKE_SYMBOLS,
 947       NUMBER_FORMS,
 948       ARROWS,
 949       MATHEMATICAL_OPERATORS,
 950       MISCELLANEOUS_TECHNICAL,
 951       CONTROL_PICTURES,
 952       OPTICAL_CHARACTER_RECOGNITION,
 953       ENCLOSED_ALPHANUMERICS,
 954       BOX_DRAWING,
 955       BLOCK_ELEMENTS,
 956       GEOMETRIC_SHAPES,
 957       MISCELLANEOUS_SYMBOLS,
 958       DINGBATS,
 959       BRAILLE_PATTERNS,
 960       CJK_RADICALS_SUPPLEMENT,
 961       KANGXI_RADICALS,
 962       IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
 963       CJK_SYMBOLS_AND_PUNCTUATION,
 964       HIRAGANA,
 965       KATAKANA,
 966       BOPOMOFO,
 967       HANGUL_COMPATIBILITY_JAMO,
 968       KANBUN,
 969       BOPOMOFO_EXTENDED,
 970       ENCLOSED_CJK_LETTERS_AND_MONTHS,
 971       CJK_COMPATIBILITY,
 972       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
 973       CJK_UNIFIED_IDEOGRAPHS,
 974       YI_SYLLABLES,
 975       YI_RADICALS,
 976       HANGUL_SYLLABLES,
 977       SURROGATES_AREA,
 978       PRIVATE_USE_AREA,
 979       CJK_COMPATIBILITY_IDEOGRAPHS,
 980       ALPHABETIC_PRESENTATION_FORMS,
 981       ARABIC_PRESENTATION_FORMS_A,
 982       COMBINING_HALF_MARKS,
 983       CJK_COMPATIBILITY_FORMS,
 984       SMALL_FORM_VARIANTS,
 985       ARABIC_PRESENTATION_FORMS_B,
 986       HALFWIDTH_AND_FULLWIDTH_FORMS,
 987       SPECIALS,
 988     };
 989   } // class UnicodeBlock
 990
  /**
   * The character wrapped by this instance. The field is final, so it is
   * assigned exactly once and never changes afterwards.
   *
   * @serial the value of this Character
   */
  private final char value;

  /**
   * Serialization version identifier. Compatible with JDK 1.0+.
   */
  private static final long serialVersionUID = 3786198910865385080L;
1002
  /**
   * Smallest value allowed for a radix argument in Java. This value is 2.
   *
   * @see #digit(char, int)
   * @see #forDigit(int, int)
   * @see Integer#toString(int, int)
   * @see Integer#valueOf(String)
   */
  public static final int MIN_RADIX = 2;

  /**
   * Largest value allowed for a radix argument in Java. This value is 36.
   *
   * @see #digit(char, int)
   * @see #forDigit(int, int)
   * @see Integer#toString(int, int)
   * @see Integer#valueOf(String)
   */
  public static final int MAX_RADIX = 36;

  /**
   * The smallest value a <code>char</code> can hold.
   * This value is <code>'\\u0000'</code>.
   */
  public static final char MIN_VALUE = '\u0000';

  /**
   * The largest value a <code>char</code> can hold.
   * This value is <code>'\\uFFFF'</code>.
   */
  public static final char MAX_VALUE = '\uFFFF';

  /**
   * Class object representing the primitive char data type. It is obtained
   * from <code>VMClassLoader.getPrimitiveClass</code> using the type code
   * <code>'C'</code>.
   *
   * @since 1.1
   */
  public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');

  /**
   * The number of bits needed to represent a <code>char</code>.
   * This value is 16.
   *
   * @since 1.5
   */
  public static final int SIZE = 16;
1047
  // Cache of Character instances, used by boxing conversions via
  // valueOf().  At least the values 0..127 must be cached; MAX_CACHE is
  // the constant that controls how much is actually cached.
  private static final int MAX_CACHE = 127;
  // Holds MAX_CACHE + 1 slots, i.e. valid indices are 0..MAX_CACHE.
  private static Character[] charCache = new Character[MAX_CACHE + 1];
1053
  // Unicode general category codes.  The predicates of this class (for
  // example isLetter) use these values as shift counts when building int
  // bit masks of the form (1 << getType(ch)), so every value has to stay
  // within 0..31.  The declarations follow the order of the category
  // abbreviations, not numeric order; no constant here uses the value 17.

  /**
   * General category "Lu": Letter, Uppercase (Informative).
   *
   * @since 1.1
   */
  public static final byte UPPERCASE_LETTER = 1;

  /**
   * General category "Ll": Letter, Lowercase (Informative).
   *
   * @since 1.1
   */
  public static final byte LOWERCASE_LETTER = 2;

  /**
   * General category "Lt": Letter, Titlecase (Informative).
   *
   * @since 1.1
   */
  public static final byte TITLECASE_LETTER = 3;

  /**
   * General category "Mn": Mark, Non-Spacing (Normative).
   *
   * @since 1.1
   */
  public static final byte NON_SPACING_MARK = 6;

  /**
   * General category "Mc": Mark, Spacing Combining (Normative).
   *
   * @since 1.1
   */
  public static final byte COMBINING_SPACING_MARK = 8;

  /**
   * General category "Me": Mark, Enclosing (Normative).
   *
   * @since 1.1
   */
  public static final byte ENCLOSING_MARK = 7;

  /**
   * General category "Nd": Number, Decimal Digit (Normative).
   *
   * @since 1.1
   */
  public static final byte DECIMAL_DIGIT_NUMBER = 9;

  /**
   * General category "Nl": Number, Letter (Normative).
   *
   * @since 1.1
   */
  public static final byte LETTER_NUMBER = 10;

  /**
   * General category "No": Number, Other (Normative).
   *
   * @since 1.1
   */
  public static final byte OTHER_NUMBER = 11;

  /**
   * General category "Zs": Separator, Space (Normative).
   *
   * @since 1.1
   */
  public static final byte SPACE_SEPARATOR = 12;

  /**
   * General category "Zl": Separator, Line (Normative).
   *
   * @since 1.1
   */
  public static final byte LINE_SEPARATOR = 13;

  /**
   * General category "Zp": Separator, Paragraph (Normative).
   *
   * @since 1.1
   */
  public static final byte PARAGRAPH_SEPARATOR = 14;

  /**
   * General category "Cc": Other, Control (Normative).
   *
   * @since 1.1
   */
  public static final byte CONTROL = 15;

  /**
   * General category "Cf": Other, Format (Normative).
   *
   * @since 1.1
   */
  public static final byte FORMAT = 16;

  /**
   * General category "Cs": Other, Surrogate (Normative).
   *
   * @since 1.1
   */
  public static final byte SURROGATE = 19;

  /**
   * General category "Co": Other, Private Use (Normative).
   *
   * @since 1.1
   */
  public static final byte PRIVATE_USE = 18;

  /**
   * General category "Cn": Other, Not Assigned (Normative).
   *
   * @since 1.1
   */
  public static final byte UNASSIGNED = 0;

  /**
   * General category "Lm": Letter, Modifier (Informative).
   *
   * @since 1.1
   */
  public static final byte MODIFIER_LETTER = 4;

  /**
   * General category "Lo": Letter, Other (Informative).
   *
   * @since 1.1
   */
  public static final byte OTHER_LETTER = 5;

  /**
   * General category "Pc": Punctuation, Connector (Informative).
   *
   * @since 1.1
   */
  public static final byte CONNECTOR_PUNCTUATION = 23;

  /**
   * General category "Pd": Punctuation, Dash (Informative).
   *
   * @since 1.1
   */
  public static final byte DASH_PUNCTUATION = 20;

  /**
   * General category "Ps": Punctuation, Open (Informative).
   *
   * @since 1.1
   */
  public static final byte START_PUNCTUATION = 21;

  /**
   * General category "Pe": Punctuation, Close (Informative).
   *
   * @since 1.1
   */
  public static final byte END_PUNCTUATION = 22;

  /**
   * General category "Pi": Punctuation, Initial Quote (Informative).
   *
   * @since 1.4
   */
  public static final byte INITIAL_QUOTE_PUNCTUATION = 29;

  /**
   * General category "Pf": Punctuation, Final Quote (Informative).
   *
   * @since 1.4
   */
  public static final byte FINAL_QUOTE_PUNCTUATION = 30;

  /**
   * General category "Po": Punctuation, Other (Informative).
   *
   * @since 1.1
   */
  public static final byte OTHER_PUNCTUATION = 24;

  /**
   * General category "Sm": Symbol, Math (Informative).
   *
   * @since 1.1
   */
  public static final byte MATH_SYMBOL = 25;

  /**
   * General category "Sc": Symbol, Currency (Informative).
   *
   * @since 1.1
   */
  public static final byte CURRENCY_SYMBOL = 26;

  /**
   * General category "Sk": Symbol, Modifier (Informative).
   *
   * @since 1.1
   */
  public static final byte MODIFIER_SYMBOL = 27;

  /**
   * General category "So": Symbol, Other (Informative).
   *
   * @since 1.1
   */
  public static final byte OTHER_SYMBOL = 28;
1263
  /**
   * Undefined bidirectional character type. Undefined char values have
   * undefined directionality in the Unicode specification.
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_UNDEFINED = -1;

  /**
   * Strong bidirectional character type "L" (left-to-right).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;

  /**
   * Strong bidirectional character type "R" (right-to-left).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;

  /**
   * Strong bidirectional character type "AL" (right-to-left Arabic).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;

  /**
   * Weak bidirectional character type "EN" (European number).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;

  /**
   * Weak bidirectional character type "ES" (European number separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;

  /**
   * Weak bidirectional character type "ET" (European number terminator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;

  /**
   * Weak bidirectional character type "AN" (Arabic number).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;

  /**
   * Weak bidirectional character type "CS" (common number separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;

  /**
   * Weak bidirectional character type "NSM" (nonspacing mark).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;

  /**
   * Weak bidirectional character type "BN" (boundary neutral).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;

  /**
   * Neutral bidirectional character type "B" (paragraph separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;

  /**
   * Neutral bidirectional character type "S" (segment separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;

  /**
   * Neutral bidirectional character type "WS" (whitespace).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_WHITESPACE = 12;

  /**
   * Neutral bidirectional character type "ON" (other neutrals).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;

  /**
   * Strong bidirectional character type "LRE" (left-to-right embedding).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;

  /**
   * Strong bidirectional character type "LRO" (left-to-right override).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;

  /**
   * Strong bidirectional character type "RLE" (right-to-left embedding).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;

  /**
   * Strong bidirectional character type "RLO" (right-to-left override).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;

  /**
   * Weak bidirectional character type "PDF" (pop directional format).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
1404
  /**
   * Mask for grabbing the type out of the result of readChar. It selects
   * the five low-order bits.
   *
   * @see #readChar(char)
   */
  private static final int TYPE_MASK = 0x1F;

  /**
   * Mask for grabbing the non-breaking space flag out of the result of
   * readChar. It selects the single bit directly above the type bits.
   *
   * @see #readChar(char)
   */
  private static final int NO_BREAK_MASK = 0x20;

  /**
   * Mask for grabbing the mirrored directionality flag out of the result
   * of readChar. It selects the single bit directly above the
   * non-breaking space flag.
   *
   * @see #readChar(char)
   */
  private static final int MIRROR_MASK = 0x40;
1424
  /**
   * Minimum value of a supplementary code point. This value is 0x10000.
   *
   * @since 1.5
   */
  public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

  /**
   * Minimum value of a code point. This value is 0.
   *
   * @since 1.5
   */
  public static final int MIN_CODE_POINT = 0;

  /**
   * Maximum value of a code point. This value is 0x10FFFF.
   *
   * @since 1.5
   */
  public static final int MAX_CODE_POINT = 0x010ffff;

  /**
   * Minimum high surrogate code in UTF-16 encoding.
   *
   * @since 1.5
   */
  public static final char MIN_HIGH_SURROGATE = '\ud800';

  /**
   * Maximum high surrogate code in UTF-16 encoding.
   *
   * @since 1.5
   */
  public static final char MAX_HIGH_SURROGATE = '\udbff';

  /**
   * Minimum low surrogate code in UTF-16 encoding.
   *
   * @since 1.5
   */
  public static final char MIN_LOW_SURROGATE = '\udc00';

  /**
   * Maximum low surrogate code in UTF-16 encoding.
   *
   * @since 1.5
   */
  public static final char MAX_LOW_SURROGATE = '\udfff';

  /**
   * Minimum surrogate code in UTF-16 encoding. This is the same value as
   * {@link #MIN_HIGH_SURROGATE}.
   *
   * @since 1.5
   */
  public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;

  /**
   * Maximum surrogate code in UTF-16 encoding. This is the same value as
   * {@link #MAX_LOW_SURROGATE}.
   *
   * @since 1.5
   */
  public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
1489
  /**
   * Grabs an attribute offset from the Unicode attribute database. The
   * result packs three fields: the lower 5 bits are the character type
   * (see TYPE_MASK), the next 2 bits are flags (see NO_BREAK_MASK and
   * MIRROR_MASK), and the top 9 bits are the offset into the attribute
   * tables. Note that the top 9 bits are meaningless in this context; they
   * are useful only in the native code.
   *
   * @param ch the character to look up
   * @return the character's attribute offset and type
   * @see #TYPE_MASK
   * @see #NO_BREAK_MASK
   * @see #MIRROR_MASK
   */
  private static native char readChar(char ch);
1504
  /**
   * Creates a Character that wraps the given char.
   *
   * @param value the character to wrap
   */
  public Character(char value)
  {
    this.value = value;
  }
1514
1515   /**
1516    * Returns the character which has been wrapped by this class.
1517    *
1518    * @return the character wrapped
1519    */
1520   public char charValue()
1521   {
1522     return value;
1523   }
1524
1525   /**
1526    * Returns the numerical value (unsigned) of the wrapped character.
1527    * Range of returned values: 0x0000-0xFFFF.
1528    *
1529    * @return the value of the wrapped character
1530    */
1531   public int hashCode()
1532   {
1533     return value;
1534   }
1535
1536   /**
1537    * Determines if an object is equal to this object. This is only true for
1538    * another Character object wrapping the same value.
1539    *
1540    * @param o object to compare
1541    * @return true if o is a Character with the same value
1542    */
1543   public boolean equals(Object o)
1544   {
1545     return o instanceof Character && value == ((Character) o).value;
1546   }
1547
1548   /**
1549    * Converts the wrapped character into a String.
1550    *
1551    * @return a String containing one character -- the wrapped character
1552    *         of this instance
1553    */
1554   public String toString()
1555   {
1556     // This assumes that String.valueOf(char) can create a single-character
1557     // String more efficiently than through the public API.
1558     return String.valueOf(value);
1559   }
1560
1561   /**
1562    * Returns a String of length 1 representing the specified character.
1563    *
1564    * @param ch the character to convert
1565    * @return a String containing the character
1566    * @since 1.4
1567    */
1568   public static String toString(char ch)
1569   {
1570     // This assumes that String.valueOf(char) can create a single-character
1571     // String more efficiently than through the public API.
1572     return String.valueOf(ch);
1573   }
1574
1575   /**
1576    * Determines if a character is a Unicode lowercase letter. For example,
1577    * <code>'a'</code> is lowercase.
1578    * <br>
1579    * lowercase = [Ll]
1580    *
1581    * @param ch character to test
1582    * @return true if ch is a Unicode lowercase letter, else false
1583    * @see #isUpperCase(char)
1584    * @see #isTitleCase(char)
1585    * @see #toLowerCase(char)
1586    * @see #getType(char)
1587    */
1588   public static boolean isLowerCase(char ch)
1589   {
1590     return getType(ch) == LOWERCASE_LETTER;
1591   }
1592
1593   /**
1594    * Determines if a character is a Unicode uppercase letter. For example,
1595    * <code>'A'</code> is uppercase.
1596    * <br>
1597    * uppercase = [Lu]
1598    *
1599    * @param ch character to test
1600    * @return true if ch is a Unicode uppercase letter, else false
1601    * @see #isLowerCase(char)
1602    * @see #isTitleCase(char)
1603    * @see #toUpperCase(char)
1604    * @see #getType(char)
1605    */
1606   public static boolean isUpperCase(char ch)
1607   {
1608     return getType(ch) == UPPERCASE_LETTER;
1609   }
1610
1611   /**
1612    * Determines if a character is a Unicode titlecase letter. For example,
1613    * the character "Lj" (Latin capital L with small letter j) is titlecase.
1614    * <br>
1615    * titlecase = [Lt]
1616    *
1617    * @param ch character to test
1618    * @return true if ch is a Unicode titlecase letter, else false
1619    * @see #isLowerCase(char)
1620    * @see #isUpperCase(char)
1621    * @see #toTitleCase(char)
1622    * @see #getType(char)
1623    */
1624   public static boolean isTitleCase(char ch)
1625   {
1626     return getType(ch) == TITLECASE_LETTER;
1627   }
1628
1629   /**
1630    * Determines if a character is a Unicode decimal digit. For example,
1631    * <code>'0'</code> is a digit.
1632    * <br>
1633    * Unicode decimal digit = [Nd]
1634    *
1635    * @param ch character to test
1636    * @return true if ch is a Unicode decimal digit, else false
1637    * @see #digit(char, int)
1638    * @see #forDigit(int, int)
1639    * @see #getType(char)
1640    */
1641   public static boolean isDigit(char ch)
1642   {
1643     return getType(ch) == DECIMAL_DIGIT_NUMBER;
1644   }
1645
1646   /**
1647    * Determines if a character is part of the Unicode Standard. This is an
1648    * evolving standard, but covers every character in the data file.
1649    * <br>
1650    * defined = not [Cn]
1651    *
1652    * @param ch character to test
1653    * @return true if ch is a Unicode character, else false
1654    * @see #isDigit(char)
1655    * @see #isLetter(char)
1656    * @see #isLetterOrDigit(char)
1657    * @see #isLowerCase(char)
1658    * @see #isTitleCase(char)
1659    * @see #isUpperCase(char)
1660    */
1661   public static boolean isDefined(char ch)
1662   {
1663     return getType(ch) != UNASSIGNED;
1664   }
1665
1666   /**
1667    * Determines if a character is a Unicode letter. Not all letters have case,
1668    * so this may return true when isLowerCase and isUpperCase return false.
1669    * <br>
1670    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
1671    *
1672    * @param ch character to test
1673    * @return true if ch is a Unicode letter, else false
1674    * @see #isDigit(char)
1675    * @see #isJavaIdentifierStart(char)
1676    * @see #isJavaLetter(char)
1677    * @see #isJavaLetterOrDigit(char)
1678    * @see #isLetterOrDigit(char)
1679    * @see #isLowerCase(char)
1680    * @see #isTitleCase(char)
1681    * @see #isUnicodeIdentifierStart(char)
1682    * @see #isUpperCase(char)
1683    */
1684   public static boolean isLetter(char ch)
1685   {
1686     return ((1 << getType(ch))
1687             & ((1 << UPPERCASE_LETTER)
1688                | (1 << LOWERCASE_LETTER)
1689                | (1 << TITLECASE_LETTER)
1690                | (1 << MODIFIER_LETTER)
1691                | (1 << OTHER_LETTER))) != 0;
1692   }
1693
1694   /**
1695    * Determines if a character is a Unicode letter or a Unicode digit. This
1696    * is the combination of isLetter and isDigit.
1697    * <br>
1698    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
1699    *
1700    * @param ch character to test
1701    * @return true if ch is a Unicode letter or a Unicode digit, else false
1702    * @see #isDigit(char)
1703    * @see #isJavaIdentifierPart(char)
1704    * @see #isJavaLetter(char)
1705    * @see #isJavaLetterOrDigit(char)
1706    * @see #isLetter(char)
1707    * @see #isUnicodeIdentifierPart(char)
1708    */
1709   public static boolean isLetterOrDigit(char ch)
1710   {
1711     return ((1 << getType(ch))
1712             & ((1 << UPPERCASE_LETTER)
1713                | (1 << LOWERCASE_LETTER)
1714                | (1 << TITLECASE_LETTER)
1715                | (1 << MODIFIER_LETTER)
1716                | (1 << OTHER_LETTER)
1717                | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
1718   }
1719
1720   /**
1721    * Determines if a character can start a Java identifier. This is the
1722    * combination of isLetter, any character where getType returns
1723    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1724    * (like '_').
1725    *
1726    * @param ch character to test
1727    * @return true if ch can start a Java identifier, else false
1728    * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
1729    * @see #isJavaLetterOrDigit(char)
1730    * @see #isJavaIdentifierStart(char)
1731    * @see #isJavaIdentifierPart(char)
1732    * @see #isLetter(char)
1733    * @see #isLetterOrDigit(char)
1734    * @see #isUnicodeIdentifierStart(char)
1735    */
1736   public static boolean isJavaLetter(char ch)
1737   {
1738     return isJavaIdentifierStart(ch);
1739   }
1740
1741   /**
1742    * Determines if a character can follow the first letter in
1743    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
1744    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1745    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1746    * or isIdentifierIgnorable.
1747    *
1748    * @param ch character to test
1749    * @return true if ch can follow the first letter in a Java identifier
1750    * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
1751    * @see #isJavaLetter(char)
1752    * @see #isJavaIdentifierStart(char)
1753    * @see #isJavaIdentifierPart(char)
1754    * @see #isLetter(char)
1755    * @see #isLetterOrDigit(char)
1756    * @see #isUnicodeIdentifierPart(char)
1757    * @see #isIdentifierIgnorable(char)
1758    */
1759   public static boolean isJavaLetterOrDigit(char ch)
1760   {
1761     return isJavaIdentifierPart(ch);
1762   }
1763
1764   /**
1765    * Determines if a character can start a Java identifier. This is the
1766    * combination of isLetter, any character where getType returns
1767    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1768    * (like '_').
1769    * <br>
1770    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
1771    *
1772    * @param ch character to test
1773    * @return true if ch can start a Java identifier, else false
1774    * @see #isJavaIdentifierPart(char)
1775    * @see #isLetter(char)
1776    * @see #isUnicodeIdentifierStart(char)
1777    * @since 1.1
1778    */
1779   public static boolean isJavaIdentifierStart(char ch)
1780   {
1781     return ((1 << getType(ch))
1782             & ((1 << UPPERCASE_LETTER)
1783                | (1 << LOWERCASE_LETTER)
1784                | (1 << TITLECASE_LETTER)
1785                | (1 << MODIFIER_LETTER)
1786                | (1 << OTHER_LETTER)
1787                | (1 << LETTER_NUMBER)
1788                | (1 << CURRENCY_SYMBOL)
1789                | (1 << CONNECTOR_PUNCTUATION))) != 0;
1790   }
1791
1792   /**
1793    * Determines if a character can follow the first letter in
1794    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
1795    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1796    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1797    * or isIdentifierIgnorable.
1798    * <br>
1799    * Java identifier extender =
1800    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
1801    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1802    *
1803    * @param ch character to test
1804    * @return true if ch can follow the first letter in a Java identifier
1805    * @see #isIdentifierIgnorable(char)
1806    * @see #isJavaIdentifierStart(char)
1807    * @see #isLetterOrDigit(char)
1808    * @see #isUnicodeIdentifierPart(char)
1809    * @since 1.1
1810    */
1811   public static boolean isJavaIdentifierPart(char ch)
1812   {
1813     int category = getType(ch);
1814     return ((1 << category)
1815             & ((1 << UPPERCASE_LETTER)
1816                | (1 << LOWERCASE_LETTER)
1817                | (1 << TITLECASE_LETTER)
1818                | (1 << MODIFIER_LETTER)
1819                | (1 << OTHER_LETTER)
1820                | (1 << NON_SPACING_MARK)
1821                | (1 << COMBINING_SPACING_MARK)
1822                | (1 << DECIMAL_DIGIT_NUMBER)
1823                | (1 << LETTER_NUMBER)
1824                | (1 << CURRENCY_SYMBOL)
1825                | (1 << CONNECTOR_PUNCTUATION)
1826                | (1 << FORMAT))) != 0
1827       || (category == CONTROL && isIdentifierIgnorable(ch));
1828   }
1829
1830   /**
1831    * Determines if a character can start a Unicode identifier.  Only
1832    * letters can start a Unicode identifier, but this includes characters
1833    * in LETTER_NUMBER.
1834    * <br>
1835    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
1836    *
1837    * @param ch character to test
1838    * @return true if ch can start a Unicode identifier, else false
1839    * @see #isJavaIdentifierStart(char)
1840    * @see #isLetter(char)
1841    * @see #isUnicodeIdentifierPart(char)
1842    * @since 1.1
1843    */
1844   public static boolean isUnicodeIdentifierStart(char ch)
1845   {
1846     return ((1 << getType(ch))
1847             & ((1 << UPPERCASE_LETTER)
1848                | (1 << LOWERCASE_LETTER)
1849                | (1 << TITLECASE_LETTER)
1850                | (1 << MODIFIER_LETTER)
1851                | (1 << OTHER_LETTER)
1852                | (1 << LETTER_NUMBER))) != 0;
1853   }
1854
1855   /**
1856    * Determines if a character can follow the first letter in
1857    * a Unicode identifier. This includes letters, connecting punctuation,
1858    * digits, numeric letters, combining marks, non-spacing marks, and
1859    * isIdentifierIgnorable.
1860    * <br>
1861    * Unicode identifier extender =
1862    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
1863    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1864    *
1865    * @param ch character to test
1866    * @return true if ch can follow the first letter in a Unicode identifier
1867    * @see #isIdentifierIgnorable(char)
1868    * @see #isJavaIdentifierPart(char)
1869    * @see #isLetterOrDigit(char)
1870    * @see #isUnicodeIdentifierStart(char)
1871    * @since 1.1
1872    */
1873   public static boolean isUnicodeIdentifierPart(char ch)
1874   {
1875     int category = getType(ch);
1876     return ((1 << category)
1877             & ((1 << UPPERCASE_LETTER)
1878                | (1 << LOWERCASE_LETTER)
1879                | (1 << TITLECASE_LETTER)
1880                | (1 << MODIFIER_LETTER)
1881                | (1 << OTHER_LETTER)
1882                | (1 << NON_SPACING_MARK)
1883                | (1 << COMBINING_SPACING_MARK)
1884                | (1 << DECIMAL_DIGIT_NUMBER)
1885                | (1 << LETTER_NUMBER)
1886                | (1 << CONNECTOR_PUNCTUATION)
1887                | (1 << FORMAT))) != 0
1888       || (category == CONTROL && isIdentifierIgnorable(ch));
1889   }
1890
1891   /**
1892    * Determines if a character is ignorable in a Unicode identifier. This
1893    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
1894    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
1895    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
1896    * <code>'\u009F'</code>), and FORMAT characters.
1897    * <br>
1898    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
1899    *    |U+007F-U+009F
1900    *
1901    * @param ch character to test
1902    * @return true if ch is ignorable in a Unicode or Java identifier
1903    * @see #isJavaIdentifierPart(char)
1904    * @see #isUnicodeIdentifierPart(char)
1905    * @since 1.1
1906    */
1907   public static boolean isIdentifierIgnorable(char ch)
1908   {
1909     return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F'
1910                                || (ch <= '\u001B' && ch >= '\u000E')))
1911       || getType(ch) == FORMAT;
1912   }
1913
  /**
   * Converts a Unicode character into its lowercase equivalent mapping.
   * If no such mapping exists, the character passed in is returned
   * unchanged. Note that isLowerCase(toLowerCase(ch)) does not always
   * return true.
   *
   * @param ch character to convert to lowercase
   * @return lowercase mapping of ch, or ch if lowercase mapping does
   *         not exist
   * @see #isLowerCase(char)
   * @see #isUpperCase(char)
   * @see #toTitleCase(char)
   * @see #toUpperCase(char)
   */
  public static native char toLowerCase(char ch);
1928
  /**
   * Converts a Unicode character into its uppercase equivalent mapping.
   * If no such mapping exists, the character passed in is returned
   * unchanged. Note that isUpperCase(toUpperCase(ch)) does not always
   * return true.
   *
   * @param ch character to convert to uppercase
   * @return uppercase mapping of ch, or ch if uppercase mapping does
   *         not exist
   * @see #isLowerCase(char)
   * @see #isUpperCase(char)
   * @see #toLowerCase(char)
   * @see #toTitleCase(char)
   */
  public static native char toUpperCase(char ch);
1943
  /**
   * Converts a Unicode character into its titlecase equivalent mapping.
   * If no such mapping exists, the character passed in is returned
   * unchanged. Note that isTitleCase(toTitleCase(ch)) does not always
   * return true.
   *
   * @param ch character to convert to titlecase
   * @return titlecase mapping of ch, or ch if titlecase mapping does
   *         not exist
   * @see #isTitleCase(char)
   * @see #toLowerCase(char)
   * @see #toUpperCase(char)
   */
  public static native char toTitleCase(char ch);
1957
  /**
   * Converts a character into a digit of the specified radix. The result
   * is -1 if the radix is smaller than MIN_RADIX or larger than MAX_RADIX,
   * if the value of ch is not a valid digit in that radix, or if ch is
   * neither a decimal digit nor in the case insensitive set of 'a'-'z'.
   * <br>
   * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
   *    |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * @param ch character to convert into a digit
   * @param radix radix in which ch is a digit
   * @return digit which ch represents in radix, or -1 if ch is not a valid
   *         digit
   * @see #MIN_RADIX
   * @see #MAX_RADIX
   * @see #forDigit(int, int)
   * @see #isDigit(char)
   * @see #getNumericValue(char)
   */
  public static native int digit(char ch, int radix);
1977
  /**
   * Returns the Unicode numeric value property of a character. For example,
   * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
   *
   * <p>This method also returns values for the letters A through Z (not
   * specified by Unicode), in these ranges: U+0041 through U+005A
   * (uppercase); U+0061 through U+007A (lowercase); and U+FF21 through
   * U+FF3A, U+FF41 through U+FF5A (full width variants).
   *
   * <p>If the character lacks a numeric value property, -1 is returned.
   * If the character has a numeric value property which is not representable
   * as a nonnegative integer, such as a fraction, -2 is returned.
   *
   * <p>character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A
   *    |U+0061-U+007A|U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * @param ch character from which the numeric value property will
   *        be retrieved
   * @return the numeric value property of ch, or -1 if it does not exist, or
   *         -2 if it is not representable as a nonnegative integer
   * @see #forDigit(int, int)
   * @see #digit(char, int)
   * @see #isDigit(char)
   * @since 1.1
   */
  public static native int getNumericValue(char ch);
2006
2007   /**
2008    * Determines if a character is a ISO-LATIN-1 space. This is only the five
2009    * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
2010    * <code>'\r'</code>, and <code>' '</code>.
2011    * <br>
2012    * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
2013    *
2014    * @param ch character to test
2015    * @return true if ch is a space, else false
2016    * @deprecated Replaced by {@link #isWhitespace(char)}
2017    * @see #isSpaceChar(char)
2018    * @see #isWhitespace(char)
2019    */
2020   public static boolean isSpace(char ch)
2021   {
2022     // Performing the subtraction up front alleviates need to compare longs.
2023     return ch-- <= ' ' && ((1 << ch)
2024                            & ((1 << (' ' - 1))
2025                               | (1 << ('\t' - 1))
2026                               | (1 << ('\n' - 1))
2027                               | (1 << ('\r' - 1))
2028                               | (1 << ('\f' - 1)))) != 0;
2029   }
2030
2031   /**
2032    * Determines if a character is a Unicode space character. This includes
2033    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
2034    * <br>
2035    * Unicode space = [Zs]|[Zp]|[Zl]
2036    *
2037    * @param ch character to test
2038    * @return true if ch is a Unicode space, else false
2039    * @see #isWhitespace(char)
2040    * @since 1.1
2041    */
2042   public static boolean isSpaceChar(char ch)
2043   {
2044     return ((1 << getType(ch))
2045             & ((1 << SPACE_SEPARATOR)
2046                | (1 << LINE_SEPARATOR)
2047                | (1 << PARAGRAPH_SEPARATOR))) != 0;
2048   }
2049
2050   /**
2051    * Determines if a character is Java whitespace. This includes Unicode
2052    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
2053    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
2054    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
2055    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
2056    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
2057    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
2058    * and <code>'\u001F'</code>.
2059    * <br>
2060    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
2061    *
2062    * @param ch character to test
2063    * @return true if ch is Java whitespace, else false
2064    * @see #isSpaceChar(char)
2065    * @since 1.1
2066    */
2067   public static boolean isWhitespace(char ch)
2068   {
2069     int attr = readChar(ch);
2070     return ((((1 << (attr & TYPE_MASK))
2071               & ((1 << SPACE_SEPARATOR)
2072                  | (1 << LINE_SEPARATOR)
2073                  | (1 << PARAGRAPH_SEPARATOR))) != 0)
2074             && (attr & NO_BREAK_MASK) == 0)
2075       || (ch <= '\u001F' && ((1 << ch)
2076                              & ((1 << '\t')
2077                                 | (1 << '\n')
2078                                 | (1 << '\u000B')
2079                                 | (1 << '\u000C')
2080                                 | (1 << '\r')
2081                                 | (1 << '\u001C')
2082                                 | (1 << '\u001D')
2083                                 | (1 << '\u001E')
2084                                 | (1 << '\u001F'))) != 0);
2085   }
2086
2087   /**
2088    * Determines if a character has the ISO Control property.
2089    * <br>
2090    * ISO Control = [Cc]
2091    *
2092    * @param ch character to test
2093    * @return true if ch is an ISO Control character, else false
2094    * @see #isSpaceChar(char)
2095    * @see #isWhitespace(char)
2096    * @since 1.1
2097    */
2098   public static boolean isISOControl(char ch)
2099   {
2100     return getType(ch) == CONTROL;
2101   }
2102
  /**
   * Returns the Unicode general category property of a character.
   *
   * <p>The result is meant to be compared against the category constants
   * of this class listed below. This method is declared
   * <code>native</code>; its implementation is not part of this source
   * file.
   *
   * @param ch character from which the general category property will
   *        be retrieved
   * @return the character category property of ch as an integer
   * @see #UNASSIGNED
   * @see #UPPERCASE_LETTER
   * @see #LOWERCASE_LETTER
   * @see #TITLECASE_LETTER
   * @see #MODIFIER_LETTER
   * @see #OTHER_LETTER
   * @see #NON_SPACING_MARK
   * @see #ENCLOSING_MARK
   * @see #COMBINING_SPACING_MARK
   * @see #DECIMAL_DIGIT_NUMBER
   * @see #LETTER_NUMBER
   * @see #OTHER_NUMBER
   * @see #SPACE_SEPARATOR
   * @see #LINE_SEPARATOR
   * @see #PARAGRAPH_SEPARATOR
   * @see #CONTROL
   * @see #FORMAT
   * @see #PRIVATE_USE
   * @see #SURROGATE
   * @see #DASH_PUNCTUATION
   * @see #START_PUNCTUATION
   * @see #END_PUNCTUATION
   * @see #CONNECTOR_PUNCTUATION
   * @see #OTHER_PUNCTUATION
   * @see #MATH_SYMBOL
   * @see #CURRENCY_SYMBOL
   * @see #MODIFIER_SYMBOL
   * @see #INITIAL_QUOTE_PUNCTUATION
   * @see #FINAL_QUOTE_PUNCTUATION
   * @since 1.1
   */
  public static native int getType(char ch);
2141
2142   /**
2143    * Converts a digit into a character which represents that digit
2144    * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
2145    * or the digit exceeds the radix, then the null character <code>'\0'</code>
2146    * is returned.  Otherwise the return value is in '0'-'9' and 'a'-'z'.
2147    * <br>
2148    * return value boundary = U+0030-U+0039|U+0061-U+007A
2149    *
2150    * @param digit digit to be converted into a character
2151    * @param radix radix of digit
2152    * @return character representing digit in radix, or '\0'
2153    * @see #MIN_RADIX
2154    * @see #MAX_RADIX
2155    * @see #digit(char, int)
2156    */
2157   public static char forDigit(int digit, int radix)
2158   {
2159     if (radix < MIN_RADIX || radix > MAX_RADIX
2160         || digit < 0 || digit >= radix)
2161       return '\0';
2162     return (char) (digit < 10 ? ('0' + digit) : ('a' - 10 + digit));
2163   }
2164
  /**
   * Returns the Unicode directionality property of the character. This
   * is used in the visual ordering of text.
   *
   * <p>This method is declared <code>native</code>; its implementation is
   * not part of this source file.
   *
   * @param ch the character to look up
   * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
   * @see #DIRECTIONALITY_UNDEFINED
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
   * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
   * @see #DIRECTIONALITY_ARABIC_NUMBER
   * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
   * @see #DIRECTIONALITY_NONSPACING_MARK
   * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
   * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
   * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
   * @see #DIRECTIONALITY_WHITESPACE
   * @see #DIRECTIONALITY_OTHER_NEUTRALS
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
   * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
   * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
   * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
   * @since 1.4
   */
  public static native byte getDirectionality(char ch);
2194
2195   /**
2196    * Determines whether the character is mirrored according to Unicode. For
2197    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
2198    * left-to-right text, but ')' in right-to-left text.
2199    *
2200    * @param ch the character to look up
2201    * @return true if the character is mirrored
2202    * @since 1.4
2203    */
2204   public static boolean isMirrored(char ch)
2205   {
2206     return (readChar(ch) & MIRROR_MASK) != 0;
2207   }
2208
2209   /**
2210    * Compares another Character to this Character, numerically.
2211    *
2212    * @param anotherCharacter Character to compare with this Character
2213    * @return a negative integer if this Character is less than
2214    *         anotherCharacter, zero if this Character is equal, and
2215    *         a positive integer if this Character is greater
2216    * @throws NullPointerException if anotherCharacter is null
2217    * @since 1.2
2218    */
2219   public int compareTo(Character anotherCharacter)
2220   {
2221     return value - anotherCharacter.value;
2222   }
2223
2224   /**
2225    * Compares an object to this Character.  Assuming the object is a
2226    * Character object, this method performs the same comparison as
2227    * compareTo(Character).
2228    *
2229    * @param o object to compare
2230    * @return the comparison value
2231    * @throws ClassCastException if o is not a Character object
2232    * @throws NullPointerException if o is null
2233    * @see #compareTo(Character)
2234    * @since 1.2
2235    */
2236   public int compareTo(Object o)
2237   {
2238     return compareTo((Character) o);
2239   }
2240
2241   /**
2242    * Returns an <code>Character</code> object wrapping the value.
2243    * In contrast to the <code>Character</code> constructor, this method
2244    * will cache some values.  It is used by boxing conversion.
2245    *
2246    * @param val the value to wrap
2247    * @return the <code>Character</code>
2248    *
2249    * @since 1.5
2250    */
2251   public static Character valueOf(char val)
2252   {
2253     if (val > MAX_CACHE)
2254       return new Character(val);
2255     synchronized (charCache)
2256       {
2257     if (charCache[val - MIN_VALUE] == null)
2258       charCache[val - MIN_VALUE] = new Character(val);
2259     return charCache[val - MIN_VALUE];
2260       }
2261   }
2262
2263   /**
2264    * Reverse the bytes in val.
2265    * @since 1.5
2266    */
2267   public static char reverseBytes(char val)
2268   {
2269     return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
2270   }
2271
2272   /**
2273    * Converts a unicode code point to a UTF-16 representation of that
2274    * code point.
2275    *
2276    * @param codePoint the unicode code point
2277    *
2278    * @return the UTF-16 representation of that code point
2279    *
2280    * @throws IllegalArgumentException if the code point is not a valid
2281    *         unicode code point
2282    *
2283    * @since 1.5
2284    */
2285   public static char[] toChars(int codePoint)
2286   {
2287     char[] result = new char[charCount(codePoint)];
2288     int ignore = toChars(codePoint, result, 0);
2289     return result;
2290   }
2291
2292   /**
2293    * Converts a unicode code point to its UTF-16 representation.
2294    *
2295    * @param codePoint the unicode code point
2296    * @param dst the target char array
2297    * @param dstIndex the start index for the target
2298    *
2299    * @return number of characters written to <code>dst</code>
2300    *
2301    * @throws IllegalArgumentException if <code>codePoint</code> is not a
2302    *         valid unicode code point
2303    * @throws NullPointerException if <code>dst</code> is <code>null</code>
2304    * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
2305    *         in <code>dst</code> or if the UTF-16 representation does not
2306    *         fit into <code>dst</code>
2307    *
2308    * @since 1.5
2309    */
2310   public static int toChars(int codePoint, char[] dst, int dstIndex)
2311   {
2312     if (!isValidCodePoint(codePoint))
2313       {
2314         throw new IllegalArgumentException("not a valid code point: "
2315                                            + codePoint);
2316       }
2317
2318     int result;
2319     if (isSupplementaryCodePoint(codePoint))
2320       {
2321         // Write second char first to cause IndexOutOfBoundsException
2322         // immediately.
2323         final int cp2 = codePoint - 0x10000;
2324         dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
2325         dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
2326         result = 2;
2327       }
2328     else
2329       {
2330         dst[dstIndex] = (char) codePoint;
2331         result = 1;
2332       }
2333     return result;
2334   }
2335
2336   /**
2337    * Return number of 16-bit characters required to represent the given
2338    * code point.
2339    *
2340    * @param codePoint a unicode code point
2341    *
2342    * @return 2 if codePoint >= 0x10000, 1 otherwise.
2343    *
2344    * @since 1.5
2345    */
2346   public static int charCount(int codePoint)
2347   {
2348     return
2349       (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
2350       ? 2
2351       : 1;
2352   }
2353
2354   /**
2355    * Determines whether the specified code point is
2356    * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
2357    * supplementary character range.
2358    *
2359    * @param codePoint a Unicode code point
2360    *
2361    * @return <code>true</code> if code point is in supplementary range
2362    *
2363    * @since 1.5
2364    */
2365   public static boolean isSupplementaryCodePoint(int codePoint)
2366   {
2367     return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
2368       && codePoint <= MAX_CODE_POINT;
2369   }
2370
2371   /**
2372    * Determines whether the specified code point is
2373    * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
2374    *
2375    * @param codePoint a Unicode code point
2376    *
2377    * @return <code>true</code> if code point is valid
2378    *
2379    * @since 1.5
2380    */
2381   public static boolean isValidCodePoint(int codePoint)
2382   {
2383     return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
2384   }
2385
2386   /**
2387    * Return true if the given character is a high surrogate.
2388    * @param ch the character
2389    * @return true if the character is a high surrogate character
2390    *
2391    * @since 1.5
2392    */
2393   public static boolean isHighSurrogate(char ch)
2394   {
2395     return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
2396   }
2397
2398   /**
2399    * Return true if the given character is a low surrogate.
2400    * @param ch the character
2401    * @return true if the character is a low surrogate character
2402    *
2403    * @since 1.5
2404    */
2405   public static boolean isLowSurrogate(char ch)
2406   {
2407     return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
2408   }
2409
2410   /**
2411    * Return true if the given characters compose a surrogate pair.
2412    * This is true if the first character is a high surrogate and the
2413    * second character is a low surrogate.
2414    * @param ch1 the first character
2415    * @param ch2 the first character
2416    * @return true if the characters compose a surrogate pair
2417    *
2418    * @since 1.5
2419    */
2420   public static boolean isSurrogatePair(char ch1, char ch2)
2421   {
2422     return isHighSurrogate(ch1) && isLowSurrogate(ch2);
2423   }
2424
2425   /**
2426    * Given a valid surrogate pair, this returns the corresponding
2427    * code point.
2428    * @param high the high character of the pair
2429    * @param low the low character of the pair
2430    * @return the corresponding code point
2431    *
2432    * @since 1.5
2433    */
2434   public static int toCodePoint(char high, char low)
2435   {
2436     return ((high - MIN_HIGH_SURROGATE) * 0x400) +
2437       (low - MIN_LOW_SURROGATE) + 0x10000;
2438   }
2439
2440   /**
2441    * Get the code point at the specified index in the CharSequence.
2442    * This is like CharSequence#charAt(int), but if the character is
2443    * the start of a surrogate pair, and there is a following
2444    * character, and this character completes the pair, then the
2445    * corresponding supplementary code point is returned.  Otherwise,
2446    * the character at the index is returned.
2447    *
2448    * @param sequence the CharSequence
2449    * @param index the index of the codepoint to get, starting at 0
2450    * @return the codepoint at the specified index
2451    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2452    * @since 1.5
2453    */
2454   public static int codePointAt(CharSequence sequence, int index)
2455   {
2456     int len = sequence.length();
2457     if (index < 0 || index >= len)
2458       throw new IndexOutOfBoundsException();
2459     char high = sequence.charAt(index);
2460     if (! isHighSurrogate(high) || ++index >= len)
2461       return high;
2462     char low = sequence.charAt(index);
2463     if (! isLowSurrogate(low))
2464       return high;
2465     return toCodePoint(high, low);
2466   }
2467
2468   /**
2469    * Get the code point at the specified index in the CharSequence.
2470    * If the character is the start of a surrogate pair, and there is a
2471    * following character, and this character completes the pair, then
2472    * the corresponding supplementary code point is returned.
2473    * Otherwise, the character at the index is returned.
2474    *
2475    * @param chars the character array in which to look
2476    * @param index the index of the codepoint to get, starting at 0
2477    * @return the codepoint at the specified index
2478    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2479    * @since 1.5
2480    */
2481   public static int codePointAt(char[] chars, int index)
2482   {
2483     return codePointAt(chars, index, chars.length);
2484   }
2485
2486   /**
2487    * Get the code point at the specified index in the CharSequence.
2488    * If the character is the start of a surrogate pair, and there is a
2489    * following character within the specified range, and this
2490    * character completes the pair, then the corresponding
2491    * supplementary code point is returned.  Otherwise, the character
2492    * at the index is returned.
2493    *
2494    * @param chars the character array in which to look
2495    * @param index the index of the codepoint to get, starting at 0
2496    * @param limit the limit past which characters should not be examined
2497    * @return the codepoint at the specified index
2498    * @throws IndexOutOfBoundsException if index is negative or &gt;=
2499    * limit, or if limit is negative or &gt;= the length of the array
2500    * @since 1.5
2501    */
2502   public static int codePointAt(char[] chars, int index, int limit)
2503   {
2504     if (index < 0 || index >= limit || limit < 0 || limit >= chars.length)
2505       throw new IndexOutOfBoundsException();
2506     char high = chars[index];
2507     if (! isHighSurrogate(high) || ++index >= limit)
2508       return high;
2509     char low = chars[index];
2510     if (! isLowSurrogate(low))
2511       return high;
2512     return toCodePoint(high, low);
2513   }
2514
2515   /**
2516    * Get the code point before the specified index.  This is like
2517    * #codePointAt(char[], int), but checks the characters at
2518    * <code>index-1</code> and <code>index-2</code> to see if they form
2519    * a supplementary code point.  If they do not, the character at
2520    * <code>index-1</code> is returned.
2521    *
2522    * @param chars the character array
2523    * @param index the index just past the codepoint to get, starting at 0
2524    * @return the codepoint at the specified index
2525    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2526    * @since 1.5
2527    */
2528   public static int codePointBefore(char[] chars, int index)
2529   {
2530     return codePointBefore(chars, index, 1);
2531   }
2532
2533   /**
2534    * Get the code point before the specified index.  This is like
2535    * #codePointAt(char[], int), but checks the characters at
2536    * <code>index-1</code> and <code>index-2</code> to see if they form
2537    * a supplementary code point.  If they do not, the character at
2538    * <code>index-1</code> is returned.  The start parameter is used to
2539    * limit the range of the array which may be examined.
2540    *
2541    * @param chars the character array
2542    * @param index the index just past the codepoint to get, starting at 0
2543    * @param start the index before which characters should not be examined
2544    * @return the codepoint at the specified index
2545    * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
2546    * the length of the array, or if limit is negative or &gt;= the
2547    * length of the array
2548    * @since 1.5
2549    */
2550   public static int codePointBefore(char[] chars, int index, int start)
2551   {
2552     if (index < start || index > chars.length
2553         || start < 0 || start >= chars.length)
2554       throw new IndexOutOfBoundsException();
2555     --index;
2556     char low = chars[index];
2557     if (! isLowSurrogate(low) || --index < start)
2558       return low;
2559     char high = chars[index];
2560     if (! isHighSurrogate(high))
2561       return low;
2562     return toCodePoint(high, low);
2563   }
2564
2565   /**
2566    * Get the code point before the specified index.  This is like
2567    * #codePointAt(CharSequence, int), but checks the characters at
2568    * <code>index-1</code> and <code>index-2</code> to see if they form
2569    * a supplementary code point.  If they do not, the character at
2570    * <code>index-1</code> is returned.
2571    *
2572    * @param sequence the CharSequence
2573    * @param index the index just past the codepoint to get, starting at 0
2574    * @return the codepoint at the specified index
2575    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2576    * @since 1.5
2577    */
2578   public static int codePointBefore(CharSequence sequence, int index)
2579   {
2580     int len = sequence.length();
2581     if (index < 1 || index > len)
2582       throw new IndexOutOfBoundsException();
2583     --index;
2584     char low = sequence.charAt(index);
2585     if (! isLowSurrogate(low) || --index < 0)
2586       return low;
2587     char high = sequence.charAt(index);
2588     if (! isHighSurrogate(high))
2589       return low;
2590     return toCodePoint(high, low);
2591   }
2592 } // class Character