gomp-20050608-branch/libjava/classpath/java/lang/Character.java

   1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
   2    Copyright (C) 1998, 1999, 2001, 2002, 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package java.lang;
  40
  41 import gnu.java.lang.CharData;
  42
  43 import java.io.Serializable;
  44
  45 /**
  46  * Wrapper class for the primitive char data type.  In addition, this class
  47  * allows one to retrieve property information and perform transformations
  48  * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0.
  49  * java.lang.Character is designed to be very dynamic, and as such, it
  50  * retrieves information on the Unicode character set from a separate
  51  * database, gnu.java.lang.CharData, which can be easily upgraded.
  52  *
  53  * <p>For predicates, boundaries are used to describe
  54  * the set of characters for which the method will return true.
  55  * This syntax uses fairly normal regular expression notation.
  56  * See 5.13 of the Unicode Standard, Version 3.0, for the
  57  * boundary specification.
  58  *
  59  * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
  60  * for more information on the Unicode Standard.
  61  *
  62  * @author Tom Tromey (tromey@cygnus.com)
  63  * @author Paul N. Fisher
  64  * @author Jochen Hoenicke
  65  * @author Eric Blake (ebb9@email.byu.edu)
  66  * @see CharData
  67  * @since 1.0
  68  * @status updated to 1.4
  69  */
  70 public final class Character implements Serializable, Comparable
  71 {
  72   /**
  73    * A subset of Unicode blocks.
  74    *
  75    * @author Paul N. Fisher
  76    * @author Eric Blake (ebb9@email.byu.edu)
  77    * @since 1.2
  78    */
  79   public static class Subset
  80   {
  81     /** The name of the subset. */
  82     private final String name;
  83
  84     /**
  85      * Construct a new subset of characters.
  86      *
  87      * @param name the name of the subset
  88      * @throws NullPointerException if name is null
  89      */
  90     protected Subset(String name)
  91     {
  92       // Note that name.toString() is name, unless name was null.
  93       this.name = name.toString();
  94     }
  95
  96     /**
  97      * Compares two Subsets for equality. This is <code>final</code>, and
  98      * restricts the comparison on the <code>==</code> operator, so it returns
  99      * true only for the same object.
 100      *
 101      * @param o the object to compare
 102      * @return true if o is this
 103      */
 104     public final boolean equals(Object o)
 105     {
 106       return o == this;
 107     }
 108
 109     /**
 110      * Makes the original hashCode of Object final, to be consistent with
 111      * equals.
 112      *
 113      * @return the hash code for this object
 114      */
 115     public final int hashCode()
 116     {
 117       return super.hashCode();
 118     }
 119
 120     /**
 121      * Returns the name of the subset.
 122      *
 123      * @return the name
 124      */
 125     public final String toString()
 126     {
 127       return name;
 128     }
 129   } // class Subset
 130
 131   /**
 132    * A family of character subsets in the Unicode specification. A character
 133    * is in at most one of these blocks.
 134    *
 135    * This inner class was generated automatically from
 136    * <code>doc/unicode/Block-3.txt</code>, by some perl scripts.
 137    * This Unicode definition file can be found on the
 138    * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 139    * JDK 1.4 uses Unicode version 3.0.0.
 140    *
 141    * @author scripts/unicode-blocks.pl (written by Eric Blake)
 142    * @since 1.2
 143    */
 144   public static final class UnicodeBlock extends Subset
 145   {
 /** First character covered by this block (inclusive). */
 private final char start;

 /** Last character covered by this block (inclusive). */
 private final char end;

 /**
  * Creates a block that covers a single contiguous range of characters.
  * The constructor is private, so the only instances are the constants
  * declared in this class.
  *
  * @param start the first character of the range
  * @param end the last character of the range
  * @param name the block name, handed on to the Subset constructor
  */
 private UnicodeBlock(char start, char end, String name)
 {
   super(name);
   this.end = end;
   this.start = start;
 }
 165
 166     /**
 167      * Returns the Unicode character block which a character belongs to.
 168      *
 169      * @param ch the character to look up
 170      * @return the set it belongs to, or null if it is not in one
 171      */
 172     public static UnicodeBlock of(char ch)
 173     {
 174       // Special case, since SPECIALS contains two ranges.
 175       if (ch == '\uFEFF')
 176         return SPECIALS;
 177       // Simple binary search for the correct block.
 178       int low = 0;
 179       int hi = sets.length - 1;
 180       while (low <= hi)
 181         {
 182           int mid = (low + hi) >> 1;
 183           UnicodeBlock b = sets[mid];
 184           if (ch < b.start)
 185             hi = mid - 1;
 186           else if (ch > b.end)
 187             low = mid + 1;
 188           else
 189             return b;
 190         }
 191       return null;
 192     }
 193
 194     /**
 195      * Basic Latin.
 196      * '\u0000' - '\u007F'.
 197      */
 198     public static final UnicodeBlock BASIC_LATIN
 199       = new UnicodeBlock('\u0000', '\u007F',
 200                          "BASIC_LATIN");
 201
 202     /**
 203      * Latin-1 Supplement.
 204      * '\u0080' - '\u00FF'.
 205      */
 206     public static final UnicodeBlock LATIN_1_SUPPLEMENT
 207       = new UnicodeBlock('\u0080', '\u00FF',
 208                          "LATIN_1_SUPPLEMENT");
 209
 210     /**
 211      * Latin Extended-A.
 212      * '\u0100' - '\u017F'.
 213      */
 214     public static final UnicodeBlock LATIN_EXTENDED_A
 215       = new UnicodeBlock('\u0100', '\u017F',
 216                          "LATIN_EXTENDED_A");
 217
 218     /**
 219      * Latin Extended-B.
 220      * '\u0180' - '\u024F'.
 221      */
 222     public static final UnicodeBlock LATIN_EXTENDED_B
 223       = new UnicodeBlock('\u0180', '\u024F',
 224                          "LATIN_EXTENDED_B");
 225
 226     /**
 227      * IPA Extensions.
 228      * '\u0250' - '\u02AF'.
 229      */
 230     public static final UnicodeBlock IPA_EXTENSIONS
 231       = new UnicodeBlock('\u0250', '\u02AF',
 232                          "IPA_EXTENSIONS");
 233
 234     /**
 235      * Spacing Modifier Letters.
 236      * '\u02B0' - '\u02FF'.
 237      */
 238     public static final UnicodeBlock SPACING_MODIFIER_LETTERS
 239       = new UnicodeBlock('\u02B0', '\u02FF',
 240                          "SPACING_MODIFIER_LETTERS");
 241
 242     /**
 243      * Combining Diacritical Marks.
 244      * '\u0300' - '\u036F'.
 245      */
 246     public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
 247       = new UnicodeBlock('\u0300', '\u036F',
 248                          "COMBINING_DIACRITICAL_MARKS");
 249
 250     /**
 251      * Greek.
 252      * '\u0370' - '\u03FF'.
 253      */
 254     public static final UnicodeBlock GREEK
 255       = new UnicodeBlock('\u0370', '\u03FF',
 256                          "GREEK");
 257
 258     /**
 259      * Cyrillic.
 260      * '\u0400' - '\u04FF'.
 261      */
 262     public static final UnicodeBlock CYRILLIC
 263       = new UnicodeBlock('\u0400', '\u04FF',
 264                          "CYRILLIC");
 265
 266     /**
 267      * Armenian.
 268      * '\u0530' - '\u058F'.
 269      */
 270     public static final UnicodeBlock ARMENIAN
 271       = new UnicodeBlock('\u0530', '\u058F',
 272                          "ARMENIAN");
 273
 274     /**
 275      * Hebrew.
 276      * '\u0590' - '\u05FF'.
 277      */
 278     public static final UnicodeBlock HEBREW
 279       = new UnicodeBlock('\u0590', '\u05FF',
 280                          "HEBREW");
 281
 282     /**
 283      * Arabic.
 284      * '\u0600' - '\u06FF'.
 285      */
 286     public static final UnicodeBlock ARABIC
 287       = new UnicodeBlock('\u0600', '\u06FF',
 288                          "ARABIC");
 289
 290     /**
 291      * Syriac.
 292      * '\u0700' - '\u074F'.
 293      * @since 1.4
 294      */
 295     public static final UnicodeBlock SYRIAC
 296       = new UnicodeBlock('\u0700', '\u074F',
 297                          "SYRIAC");
 298
 299     /**
 300      * Thaana.
 301      * '\u0780' - '\u07BF'.
 302      * @since 1.4
 303      */
 304     public static final UnicodeBlock THAANA
 305       = new UnicodeBlock('\u0780', '\u07BF',
 306                          "THAANA");
 307
 308     /**
 309      * Devanagari.
 310      * '\u0900' - '\u097F'.
 311      */
 312     public static final UnicodeBlock DEVANAGARI
 313       = new UnicodeBlock('\u0900', '\u097F',
 314                          "DEVANAGARI");
 315
 316     /**
 317      * Bengali.
 318      * '\u0980' - '\u09FF'.
 319      */
 320     public static final UnicodeBlock BENGALI
 321       = new UnicodeBlock('\u0980', '\u09FF',
 322                          "BENGALI");
 323
 324     /**
 325      * Gurmukhi.
 326      * '\u0A00' - '\u0A7F'.
 327      */
 328     public static final UnicodeBlock GURMUKHI
 329       = new UnicodeBlock('\u0A00', '\u0A7F',
 330                          "GURMUKHI");
 331
 332     /**
 333      * Gujarati.
 334      * '\u0A80' - '\u0AFF'.
 335      */
 336     public static final UnicodeBlock GUJARATI
 337       = new UnicodeBlock('\u0A80', '\u0AFF',
 338                          "GUJARATI");
 339
 340     /**
 341      * Oriya.
 342      * '\u0B00' - '\u0B7F'.
 343      */
 344     public static final UnicodeBlock ORIYA
 345       = new UnicodeBlock('\u0B00', '\u0B7F',
 346                          "ORIYA");
 347
 348     /**
 349      * Tamil.
 350      * '\u0B80' - '\u0BFF'.
 351      */
 352     public static final UnicodeBlock TAMIL
 353       = new UnicodeBlock('\u0B80', '\u0BFF',
 354                          "TAMIL");
 355
 356     /**
 357      * Telugu.
 358      * '\u0C00' - '\u0C7F'.
 359      */
 360     public static final UnicodeBlock TELUGU
 361       = new UnicodeBlock('\u0C00', '\u0C7F',
 362                          "TELUGU");
 363
 364     /**
 365      * Kannada.
 366      * '\u0C80' - '\u0CFF'.
 367      */
 368     public static final UnicodeBlock KANNADA
 369       = new UnicodeBlock('\u0C80', '\u0CFF',
 370                          "KANNADA");
 371
 372     /**
 373      * Malayalam.
 374      * '\u0D00' - '\u0D7F'.
 375      */
 376     public static final UnicodeBlock MALAYALAM
 377       = new UnicodeBlock('\u0D00', '\u0D7F',
 378                          "MALAYALAM");
 379
 380     /**
 381      * Sinhala.
 382      * '\u0D80' - '\u0DFF'.
 383      * @since 1.4
 384      */
 385     public static final UnicodeBlock SINHALA
 386       = new UnicodeBlock('\u0D80', '\u0DFF',
 387                          "SINHALA");
 388
 389     /**
 390      * Thai.
 391      * '\u0E00' - '\u0E7F'.
 392      */
 393     public static final UnicodeBlock THAI
 394       = new UnicodeBlock('\u0E00', '\u0E7F',
 395                          "THAI");
 396
 397     /**
 398      * Lao.
 399      * '\u0E80' - '\u0EFF'.
 400      */
 401     public static final UnicodeBlock LAO
 402       = new UnicodeBlock('\u0E80', '\u0EFF',
 403                          "LAO");
 404
 405     /**
 406      * Tibetan.
 407      * '\u0F00' - '\u0FFF'.
 408      */
 409     public static final UnicodeBlock TIBETAN
 410       = new UnicodeBlock('\u0F00', '\u0FFF',
 411                          "TIBETAN");
 412
 413     /**
 414      * Myanmar.
 415      * '\u1000' - '\u109F'.
 416      * @since 1.4
 417      */
 418     public static final UnicodeBlock MYANMAR
 419       = new UnicodeBlock('\u1000', '\u109F',
 420                          "MYANMAR");
 421
 422     /**
 423      * Georgian.
 424      * '\u10A0' - '\u10FF'.
 425      */
 426     public static final UnicodeBlock GEORGIAN
 427       = new UnicodeBlock('\u10A0', '\u10FF',
 428                          "GEORGIAN");
 429
 430     /**
 431      * Hangul Jamo.
 432      * '\u1100' - '\u11FF'.
 433      */
 434     public static final UnicodeBlock HANGUL_JAMO
 435       = new UnicodeBlock('\u1100', '\u11FF',
 436                          "HANGUL_JAMO");
 437
 438     /**
 439      * Ethiopic.
 440      * '\u1200' - '\u137F'.
 441      * @since 1.4
 442      */
 443     public static final UnicodeBlock ETHIOPIC
 444       = new UnicodeBlock('\u1200', '\u137F',
 445                          "ETHIOPIC");
 446
 447     /**
 448      * Cherokee.
 449      * '\u13A0' - '\u13FF'.
 450      * @since 1.4
 451      */
 452     public static final UnicodeBlock CHEROKEE
 453       = new UnicodeBlock('\u13A0', '\u13FF',
 454                          "CHEROKEE");
 455
 456     /**
 457      * Unified Canadian Aboriginal Syllabics.
 458      * '\u1400' - '\u167F'.
 459      * @since 1.4
 460      */
 461     public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
 462       = new UnicodeBlock('\u1400', '\u167F',
 463                          "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS");
 464
 465     /**
 466      * Ogham.
 467      * '\u1680' - '\u169F'.
 468      * @since 1.4
 469      */
 470     public static final UnicodeBlock OGHAM
 471       = new UnicodeBlock('\u1680', '\u169F',
 472                          "OGHAM");
 473
 474     /**
 475      * Runic.
 476      * '\u16A0' - '\u16FF'.
 477      * @since 1.4
 478      */
 479     public static final UnicodeBlock RUNIC
 480       = new UnicodeBlock('\u16A0', '\u16FF',
 481                          "RUNIC");
 482
 483     /**
 484      * Khmer.
 485      * '\u1780' - '\u17FF'.
 486      * @since 1.4
 487      */
 488     public static final UnicodeBlock KHMER
 489       = new UnicodeBlock('\u1780', '\u17FF',
 490                          "KHMER");
 491
 492     /**
 493      * Mongolian.
 494      * '\u1800' - '\u18AF'.
 495      * @since 1.4
 496      */
 497     public static final UnicodeBlock MONGOLIAN
 498       = new UnicodeBlock('\u1800', '\u18AF',
 499                          "MONGOLIAN");
 500
 501     /**
 502      * Latin Extended Additional.
 503      * '\u1E00' - '\u1EFF'.
 504      */
 505     public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
 506       = new UnicodeBlock('\u1E00', '\u1EFF',
 507                          "LATIN_EXTENDED_ADDITIONAL");
 508
 509     /**
 510      * Greek Extended.
 511      * '\u1F00' - '\u1FFF'.
 512      */
 513     public static final UnicodeBlock GREEK_EXTENDED
 514       = new UnicodeBlock('\u1F00', '\u1FFF',
 515                          "GREEK_EXTENDED");
 516
 517     /**
 518      * General Punctuation.
 519      * '\u2000' - '\u206F'.
 520      */
 521     public static final UnicodeBlock GENERAL_PUNCTUATION
 522       = new UnicodeBlock('\u2000', '\u206F',
 523                          "GENERAL_PUNCTUATION");
 524
 525     /**
 526      * Superscripts and Subscripts.
 527      * '\u2070' - '\u209F'.
 528      */
 529     public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
 530       = new UnicodeBlock('\u2070', '\u209F',
 531                          "SUPERSCRIPTS_AND_SUBSCRIPTS");
 532
 533     /**
 534      * Currency Symbols.
 535      * '\u20A0' - '\u20CF'.
 536      */
 537     public static final UnicodeBlock CURRENCY_SYMBOLS
 538       = new UnicodeBlock('\u20A0', '\u20CF',
 539                          "CURRENCY_SYMBOLS");
 540
 541     /**
 542      * Combining Marks for Symbols.
 543      * '\u20D0' - '\u20FF'.
 544      */
 545     public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
 546       = new UnicodeBlock('\u20D0', '\u20FF',
 547                          "COMBINING_MARKS_FOR_SYMBOLS");
 548
 549     /**
 550      * Letterlike Symbols.
 551      * '\u2100' - '\u214F'.
 552      */
 553     public static final UnicodeBlock LETTERLIKE_SYMBOLS
 554       = new UnicodeBlock('\u2100', '\u214F',
 555                          "LETTERLIKE_SYMBOLS");
 556
 557     /**
 558      * Number Forms.
 559      * '\u2150' - '\u218F'.
 560      */
 561     public static final UnicodeBlock NUMBER_FORMS
 562       = new UnicodeBlock('\u2150', '\u218F',
 563                          "NUMBER_FORMS");
 564
 565     /**
 566      * Arrows.
 567      * '\u2190' - '\u21FF'.
 568      */
 569     public static final UnicodeBlock ARROWS
 570       = new UnicodeBlock('\u2190', '\u21FF',
 571                          "ARROWS");
 572
 573     /**
 574      * Mathematical Operators.
 575      * '\u2200' - '\u22FF'.
 576      */
 577     public static final UnicodeBlock MATHEMATICAL_OPERATORS
 578       = new UnicodeBlock('\u2200', '\u22FF',
 579                          "MATHEMATICAL_OPERATORS");
 580
 581     /**
 582      * Miscellaneous Technical.
 583      * '\u2300' - '\u23FF'.
 584      */
 585     public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
 586       = new UnicodeBlock('\u2300', '\u23FF',
 587                          "MISCELLANEOUS_TECHNICAL");
 588
 589     /**
 590      * Control Pictures.
 591      * '\u2400' - '\u243F'.
 592      */
 593     public static final UnicodeBlock CONTROL_PICTURES
 594       = new UnicodeBlock('\u2400', '\u243F',
 595                          "CONTROL_PICTURES");
 596
 597     /**
 598      * Optical Character Recognition.
 599      * '\u2440' - '\u245F'.
 600      */
 601     public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
 602       = new UnicodeBlock('\u2440', '\u245F',
 603                          "OPTICAL_CHARACTER_RECOGNITION");
 604
 605     /**
 606      * Enclosed Alphanumerics.
 607      * '\u2460' - '\u24FF'.
 608      */
 609     public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
 610       = new UnicodeBlock('\u2460', '\u24FF',
 611                          "ENCLOSED_ALPHANUMERICS");
 612
 613     /**
 614      * Box Drawing.
 615      * '\u2500' - '\u257F'.
 616      */
 617     public static final UnicodeBlock BOX_DRAWING
 618       = new UnicodeBlock('\u2500', '\u257F',
 619                          "BOX_DRAWING");
 620
 621     /**
 622      * Block Elements.
 623      * '\u2580' - '\u259F'.
 624      */
 625     public static final UnicodeBlock BLOCK_ELEMENTS
 626       = new UnicodeBlock('\u2580', '\u259F',
 627                          "BLOCK_ELEMENTS");
 628
 629     /**
 630      * Geometric Shapes.
 631      * '\u25A0' - '\u25FF'.
 632      */
 633     public static final UnicodeBlock GEOMETRIC_SHAPES
 634       = new UnicodeBlock('\u25A0', '\u25FF',
 635                          "GEOMETRIC_SHAPES");
 636
 637     /**
 638      * Miscellaneous Symbols.
 639      * '\u2600' - '\u26FF'.
 640      */
 641     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
 642       = new UnicodeBlock('\u2600', '\u26FF',
 643                          "MISCELLANEOUS_SYMBOLS");
 644
 645     /**
 646      * Dingbats.
 647      * '\u2700' - '\u27BF'.
 648      */
 649     public static final UnicodeBlock DINGBATS
 650       = new UnicodeBlock('\u2700', '\u27BF',
 651                          "DINGBATS");
 652
 653     /**
 654      * Braille Patterns.
 655      * '\u2800' - '\u28FF'.
 656      * @since 1.4
 657      */
 658     public static final UnicodeBlock BRAILLE_PATTERNS
 659       = new UnicodeBlock('\u2800', '\u28FF',
 660                          "BRAILLE_PATTERNS");
 661
 662     /**
 663      * CJK Radicals Supplement.
 664      * '\u2E80' - '\u2EFF'.
 665      * @since 1.4
 666      */
 667     public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
 668       = new UnicodeBlock('\u2E80', '\u2EFF',
 669                          "CJK_RADICALS_SUPPLEMENT");
 670
 671     /**
 672      * Kangxi Radicals.
 673      * '\u2F00' - '\u2FDF'.
 674      * @since 1.4
 675      */
 676     public static final UnicodeBlock KANGXI_RADICALS
 677       = new UnicodeBlock('\u2F00', '\u2FDF',
 678                          "KANGXI_RADICALS");
 679
 680     /**
 681      * Ideographic Description Characters.
 682      * '\u2FF0' - '\u2FFF'.
 683      * @since 1.4
 684      */
 685     public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
 686       = new UnicodeBlock('\u2FF0', '\u2FFF',
 687                          "IDEOGRAPHIC_DESCRIPTION_CHARACTERS");
 688
 689     /**
 690      * CJK Symbols and Punctuation.
 691      * '\u3000' - '\u303F'.
 692      */
 693     public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
 694       = new UnicodeBlock('\u3000', '\u303F',
 695                          "CJK_SYMBOLS_AND_PUNCTUATION");
 696
 697     /**
 698      * Hiragana.
 699      * '\u3040' - '\u309F'.
 700      */
 701     public static final UnicodeBlock HIRAGANA
 702       = new UnicodeBlock('\u3040', '\u309F',
 703                          "HIRAGANA");
 704
 705     /**
 706      * Katakana.
 707      * '\u30A0' - '\u30FF'.
 708      */
 709     public static final UnicodeBlock KATAKANA
 710       = new UnicodeBlock('\u30A0', '\u30FF',
 711                          "KATAKANA");
 712
 713     /**
 714      * Bopomofo.
 715      * '\u3100' - '\u312F'.
 716      */
 717     public static final UnicodeBlock BOPOMOFO
 718       = new UnicodeBlock('\u3100', '\u312F',
 719                          "BOPOMOFO");
 720
 721     /**
 722      * Hangul Compatibility Jamo.
 723      * '\u3130' - '\u318F'.
 724      */
 725     public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
 726       = new UnicodeBlock('\u3130', '\u318F',
 727                          "HANGUL_COMPATIBILITY_JAMO");
 728
 729     /**
 730      * Kanbun.
 731      * '\u3190' - '\u319F'.
 732      */
 733     public static final UnicodeBlock KANBUN
 734       = new UnicodeBlock('\u3190', '\u319F',
 735                          "KANBUN");
 736
 737     /**
 738      * Bopomofo Extended.
 739      * '\u31A0' - '\u31BF'.
 740      * @since 1.4
 741      */
 742     public static final UnicodeBlock BOPOMOFO_EXTENDED
 743       = new UnicodeBlock('\u31A0', '\u31BF',
 744                          "BOPOMOFO_EXTENDED");
 745
 746     /**
 747      * Enclosed CJK Letters and Months.
 748      * '\u3200' - '\u32FF'.
 749      */
 750     public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
 751       = new UnicodeBlock('\u3200', '\u32FF',
 752                          "ENCLOSED_CJK_LETTERS_AND_MONTHS");
 753
 754     /**
 755      * CJK Compatibility.
 756      * '\u3300' - '\u33FF'.
 757      */
 758     public static final UnicodeBlock CJK_COMPATIBILITY
 759       = new UnicodeBlock('\u3300', '\u33FF',
 760                          "CJK_COMPATIBILITY");
 761
 762     /**
 763      * CJK Unified Ideographs Extension A.
 764      * '\u3400' - '\u4DB5'.
 765      * @since 1.4
 766      */
 767     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
 768       = new UnicodeBlock('\u3400', '\u4DB5',
 769                          "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A");
 770
 771     /**
 772      * CJK Unified Ideographs.
 773      * '\u4E00' - '\u9FFF'.
 774      */
 775     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
 776       = new UnicodeBlock('\u4E00', '\u9FFF',
 777                          "CJK_UNIFIED_IDEOGRAPHS");
 778
 779     /**
 780      * Yi Syllables.
 781      * '\uA000' - '\uA48F'.
 782      * @since 1.4
 783      */
 784     public static final UnicodeBlock YI_SYLLABLES
 785       = new UnicodeBlock('\uA000', '\uA48F',
 786                          "YI_SYLLABLES");
 787
 788     /**
 789      * Yi Radicals.
 790      * '\uA490' - '\uA4CF'.
 791      * @since 1.4
 792      */
 793     public static final UnicodeBlock YI_RADICALS
 794       = new UnicodeBlock('\uA490', '\uA4CF',
 795                          "YI_RADICALS");
 796
 797     /**
 798      * Hangul Syllables.
 799      * '\uAC00' - '\uD7A3'.
 800      */
 801     public static final UnicodeBlock HANGUL_SYLLABLES
 802       = new UnicodeBlock('\uAC00', '\uD7A3',
 803                          "HANGUL_SYLLABLES");
 804
 805     /**
 806      * Surrogates Area.
 807      * '\uD800' - '\uDFFF'.
 808      */
 809     public static final UnicodeBlock SURROGATES_AREA
 810       = new UnicodeBlock('\uD800', '\uDFFF',
 811                          "SURROGATES_AREA");
 812
 813     /**
 814      * Private Use Area.
 815      * '\uE000' - '\uF8FF'.
 816      */
 817     public static final UnicodeBlock PRIVATE_USE_AREA
 818       = new UnicodeBlock('\uE000', '\uF8FF',
 819                          "PRIVATE_USE_AREA");
 820
 821     /**
 822      * CJK Compatibility Ideographs.
 823      * '\uF900' - '\uFAFF'.
 824      */
 825     public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
 826       = new UnicodeBlock('\uF900', '\uFAFF',
 827                          "CJK_COMPATIBILITY_IDEOGRAPHS");
 828
 829     /**
 830      * Alphabetic Presentation Forms.
 831      * '\uFB00' - '\uFB4F'.
 832      */
 833     public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
 834       = new UnicodeBlock('\uFB00', '\uFB4F',
 835                          "ALPHABETIC_PRESENTATION_FORMS");
 836
 837     /**
 838      * Arabic Presentation Forms-A.
 839      * '\uFB50' - '\uFDFF'.
 840      */
 841     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
 842       = new UnicodeBlock('\uFB50', '\uFDFF',
 843                          "ARABIC_PRESENTATION_FORMS_A");
 844
 845     /**
 846      * Combining Half Marks.
 847      * '\uFE20' - '\uFE2F'.
 848      */
 849     public static final UnicodeBlock COMBINING_HALF_MARKS
 850       = new UnicodeBlock('\uFE20', '\uFE2F',
 851                          "COMBINING_HALF_MARKS");
 852
 853     /**
 854      * CJK Compatibility Forms.
 855      * '\uFE30' - '\uFE4F'.
 856      */
 857     public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
 858       = new UnicodeBlock('\uFE30', '\uFE4F',
 859                          "CJK_COMPATIBILITY_FORMS");
 860
 861     /**
 862      * Small Form Variants.
 863      * '\uFE50' - '\uFE6F'.
 864      */
 865     public static final UnicodeBlock SMALL_FORM_VARIANTS
 866       = new UnicodeBlock('\uFE50', '\uFE6F',
 867                          "SMALL_FORM_VARIANTS");
 868
 869     /**
 870      * Arabic Presentation Forms-B.
 871      * '\uFE70' - '\uFEFE'.
 872      */
 873     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
 874       = new UnicodeBlock('\uFE70', '\uFEFE',
 875                          "ARABIC_PRESENTATION_FORMS_B");
 876
 877     /**
 878      * Halfwidth and Fullwidth Forms.
 879      * '\uFF00' - '\uFFEF'.
 880      */
 881     public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
 882       = new UnicodeBlock('\uFF00', '\uFFEF',
 883                          "HALFWIDTH_AND_FULLWIDTH_FORMS");
 884
 885     /**
 886      * Specials.
 887      * '\uFEFF', '\uFFF0' - '\uFFFD'.
 888      */
 889     public static final UnicodeBlock SPECIALS
 890       = new UnicodeBlock('\uFFF0', '\uFFFD',
 891                          "SPECIALS");
 892
 893     /**
 894      * The defined subsets.
 895      */
 896     private static final UnicodeBlock sets[] = {
 897       BASIC_LATIN,
 898       LATIN_1_SUPPLEMENT,
 899       LATIN_EXTENDED_A,
 900       LATIN_EXTENDED_B,
 901       IPA_EXTENSIONS,
 902       SPACING_MODIFIER_LETTERS,
 903       COMBINING_DIACRITICAL_MARKS,
 904       GREEK,
 905       CYRILLIC,
 906       ARMENIAN,
 907       HEBREW,
 908       ARABIC,
 909       SYRIAC,
 910       THAANA,
 911       DEVANAGARI,
 912       BENGALI,
 913       GURMUKHI,
 914       GUJARATI,
 915       ORIYA,
 916       TAMIL,
 917       TELUGU,
 918       KANNADA,
 919       MALAYALAM,
 920       SINHALA,
 921       THAI,
 922       LAO,
 923       TIBETAN,
 924       MYANMAR,
 925       GEORGIAN,
 926       HANGUL_JAMO,
 927       ETHIOPIC,
 928       CHEROKEE,
 929       UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
 930       OGHAM,
 931       RUNIC,
 932       KHMER,
 933       MONGOLIAN,
 934       LATIN_EXTENDED_ADDITIONAL,
 935       GREEK_EXTENDED,
 936       GENERAL_PUNCTUATION,
 937       SUPERSCRIPTS_AND_SUBSCRIPTS,
 938       CURRENCY_SYMBOLS,
 939       COMBINING_MARKS_FOR_SYMBOLS,
 940       LETTERLIKE_SYMBOLS,
 941       NUMBER_FORMS,
 942       ARROWS,
 943       MATHEMATICAL_OPERATORS,
 944       MISCELLANEOUS_TECHNICAL,
 945       CONTROL_PICTURES,
 946       OPTICAL_CHARACTER_RECOGNITION,
 947       ENCLOSED_ALPHANUMERICS,
 948       BOX_DRAWING,
 949       BLOCK_ELEMENTS,
 950       GEOMETRIC_SHAPES,
 951       MISCELLANEOUS_SYMBOLS,
 952       DINGBATS,
 953       BRAILLE_PATTERNS,
 954       CJK_RADICALS_SUPPLEMENT,
 955       KANGXI_RADICALS,
 956       IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
 957       CJK_SYMBOLS_AND_PUNCTUATION,
 958       HIRAGANA,
 959       KATAKANA,
 960       BOPOMOFO,
 961       HANGUL_COMPATIBILITY_JAMO,
 962       KANBUN,
 963       BOPOMOFO_EXTENDED,
 964       ENCLOSED_CJK_LETTERS_AND_MONTHS,
 965       CJK_COMPATIBILITY,
 966       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
 967       CJK_UNIFIED_IDEOGRAPHS,
 968       YI_SYLLABLES,
 969       YI_RADICALS,
 970       HANGUL_SYLLABLES,
 971       SURROGATES_AREA,
 972       PRIVATE_USE_AREA,
 973       CJK_COMPATIBILITY_IDEOGRAPHS,
 974       ALPHABETIC_PRESENTATION_FORMS,
 975       ARABIC_PRESENTATION_FORMS_A,
 976       COMBINING_HALF_MARKS,
 977       CJK_COMPATIBILITY_FORMS,
 978       SMALL_FORM_VARIANTS,
 979       ARABIC_PRESENTATION_FORMS_B,
 980       HALFWIDTH_AND_FULLWIDTH_FORMS,
 981       SPECIALS,
 982     };
 983   } // class UnicodeBlock
 984
 985   /**
 986    * The immutable value of this Character.
 987    *
 988    * @serial the value of this Character
 989    */
 990   private final char value;
 991
 992   /**
 993    * Compatible with JDK 1.0+.
 994    */
 995   private static final long serialVersionUID = 3786198910865385080L;
 996
 997   /**
 998    * Smallest value allowed for radix arguments in Java. This value is 2.
 999    *
1000    * @see #digit(char, int)
1001    * @see #forDigit(int, int)
1002    * @see Integer#toString(int, int)
1003    * @see Integer#valueOf(String)
1004    */
1005   public static final int MIN_RADIX = 2;
1006
1007   /**
1008    * Largest value allowed for radix arguments in Java. This value is 36.
1009    *
1010    * @see #digit(char, int)
1011    * @see #forDigit(int, int)
1012    * @see Integer#toString(int, int)
1013    * @see Integer#valueOf(String)
1014    */
1015   public static final int MAX_RADIX = 36;
1016
1017   /**
1018    * The minimum value the char data type can hold.
1019    * This value is <code>'\\u0000'</code>.
1020    */
1021   public static final char MIN_VALUE = '\u0000';
1022
1023   /**
1024    * The maximum value the char data type can hold.
1025    * This value is <code>'\\uFFFF'</code>.
1026    */
1027   public static final char MAX_VALUE = '\uFFFF';
1028
1029   /**
1030    * Class object representing the primitive char data type.
1031    *
1032    * @since 1.1
1033    */
1034   public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');
1035
1036   /**
1037    * The number of bits needed to represent a <code>char</code>.
1038    * @since 1.5
1039    */
1040   public static final int SIZE = 16;
1041
1042   // This caches some Character values, and is used by boxing
1043   // conversions via valueOf().  We must cache at least 0..127;
1044   // this constant controls how much we actually cache.
1045   private static final int MAX_CACHE = 127;
1046   private static Character[] charCache = new Character[MAX_CACHE + 1];
1047
1048   /**
1049    * Lu = Letter, Uppercase (Informative).
1050    *
1051    * @since 1.1
1052    */
1053   public static final byte UPPERCASE_LETTER = 1;
1054
1055   /**
1056    * Ll = Letter, Lowercase (Informative).
1057    *
1058    * @since 1.1
1059    */
1060   public static final byte LOWERCASE_LETTER = 2;
1061
1062   /**
1063    * Lt = Letter, Titlecase (Informative).
1064    *
1065    * @since 1.1
1066    */
1067   public static final byte TITLECASE_LETTER = 3;
1068
1069   /**
1070    * Mn = Mark, Non-Spacing (Normative).
1071    *
1072    * @since 1.1
1073    */
1074   public static final byte NON_SPACING_MARK = 6;
1075
1076   /**
1077    * Mc = Mark, Spacing Combining (Normative).
1078    *
1079    * @since 1.1
1080    */
1081   public static final byte COMBINING_SPACING_MARK = 8;
1082
1083   /**
1084    * Me = Mark, Enclosing (Normative).
1085    *
1086    * @since 1.1
1087    */
1088   public static final byte ENCLOSING_MARK = 7;
1089
1090   /**
1091    * Nd = Number, Decimal Digit (Normative).
1092    *
1093    * @since 1.1
1094    */
1095   public static final byte DECIMAL_DIGIT_NUMBER = 9;
1096
1097   /**
1098    * Nl = Number, Letter (Normative).
1099    *
1100    * @since 1.1
1101    */
1102   public static final byte LETTER_NUMBER = 10;
1103
1104   /**
1105    * No = Number, Other (Normative).
1106    *
1107    * @since 1.1
1108    */
1109   public static final byte OTHER_NUMBER = 11;
1110
1111   /**
1112    * Zs = Separator, Space (Normative).
1113    *
1114    * @since 1.1
1115    */
1116   public static final byte SPACE_SEPARATOR = 12;
1117
1118   /**
1119    * Zl = Separator, Line (Normative).
1120    *
1121    * @since 1.1
1122    */
1123   public static final byte LINE_SEPARATOR = 13;
1124
1125   /**
1126    * Zp = Separator, Paragraph (Normative).
1127    *
1128    * @since 1.1
1129    */
1130   public static final byte PARAGRAPH_SEPARATOR = 14;
1131
1132   /**
1133    * Cc = Other, Control (Normative).
1134    *
1135    * @since 1.1
1136    */
1137   public static final byte CONTROL = 15;
1138
1139   /**
1140    * Cf = Other, Format (Normative).
1141    *
1142    * @since 1.1
1143    */
1144   public static final byte FORMAT = 16;
1145
1146   /**
1147    * Cs = Other, Surrogate (Normative).
1148    *
1149    * @since 1.1
1150    */
1151   public static final byte SURROGATE = 19;
1152
1153   /**
1154    * Co = Other, Private Use (Normative).
1155    *
1156    * @since 1.1
1157    */
1158   public static final byte PRIVATE_USE = 18;
1159
1160   /**
1161    * Cn = Other, Not Assigned (Normative).
1162    *
1163    * @since 1.1
1164    */
1165   public static final byte UNASSIGNED = 0;
1166
1167   /**
1168    * Lm = Letter, Modifier (Informative).
1169    *
1170    * @since 1.1
1171    */
1172   public static final byte MODIFIER_LETTER = 4;
1173
1174   /**
1175    * Lo = Letter, Other (Informative).
1176    *
1177    * @since 1.1
1178    */
1179   public static final byte OTHER_LETTER = 5;
1180
1181   /**
1182    * Pc = Punctuation, Connector (Informative).
1183    *
1184    * @since 1.1
1185    */
1186   public static final byte CONNECTOR_PUNCTUATION = 23;
1187
1188   /**
1189    * Pd = Punctuation, Dash (Informative).
1190    *
1191    * @since 1.1
1192    */
1193   public static final byte DASH_PUNCTUATION = 20;
1194
1195   /**
1196    * Ps = Punctuation, Open (Informative).
1197    *
1198    * @since 1.1
1199    */
1200   public static final byte START_PUNCTUATION = 21;
1201
1202   /**
1203    * Pe = Punctuation, Close (Informative).
1204    *
1205    * @since 1.1
1206    */
1207   public static final byte END_PUNCTUATION = 22;
1208
1209   /**
1210    * Pi = Punctuation, Initial Quote (Informative).
1211    *
1212    * @since 1.4
1213    */
1214   public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
1215
1216   /**
1217    * Pf = Punctuation, Final Quote (Informative).
1218    *
1219    * @since 1.4
1220    */
1221   public static final byte FINAL_QUOTE_PUNCTUATION = 30;
1222
1223   /**
1224    * Po = Punctuation, Other (Informative).
1225    *
1226    * @since 1.1
1227    */
1228   public static final byte OTHER_PUNCTUATION = 24;
1229
1230   /**
1231    * Sm = Symbol, Math (Informative).
1232    *
1233    * @since 1.1
1234    */
1235   public static final byte MATH_SYMBOL = 25;
1236
1237   /**
1238    * Sc = Symbol, Currency (Informative).
1239    *
1240    * @since 1.1
1241    */
1242   public static final byte CURRENCY_SYMBOL = 26;
1243
1244   /**
1245    * Sk = Symbol, Modifier (Informative).
1246    *
1247    * @since 1.1
1248    */
1249   public static final byte MODIFIER_SYMBOL = 27;
1250
1251   /**
1252    * So = Symbol, Other (Informative).
1253    *
1254    * @since 1.1
1255    */
1256   public static final byte OTHER_SYMBOL = 28;
1257
1258   /**
1259    * Undefined bidirectional character type. Undefined char values have
1260    * undefined directionality in the Unicode specification.
1261    *
1262    * @since 1.4
1263    */
1264   public static final byte DIRECTIONALITY_UNDEFINED = -1;
1265
1266   /**
1267    * Strong bidirectional character type "L".
1268    *
1269    * @since 1.4
1270    */
1271   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
1272
1273   /**
1274    * Strong bidirectional character type "R".
1275    *
1276    * @since 1.4
1277    */
1278   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
1279
1280   /**
1281    * Strong bidirectional character type "AL".
1282    *
1283    * @since 1.4
1284    */
1285   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
1286
1287   /**
1288    * Weak bidirectional character type "EN".
1289    *
1290    * @since 1.4
1291    */
1292   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
1293
1294   /**
1295    * Weak bidirectional character type "ES".
1296    *
1297    * @since 1.4
1298    */
1299   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
1300
1301   /**
1302    * Weak bidirectional character type "ET".
1303    *
1304    * @since 1.4
1305    */
1306   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
1307
1308   /**
1309    * Weak bidirectional character type "AN".
1310    *
1311    * @since 1.4
1312    */
1313   public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
1314
1315   /**
1316    * Weak bidirectional character type "CS".
1317    *
1318    * @since 1.4
1319    */
1320   public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
1321
1322   /**
1323    * Weak bidirectional character type "NSM".
1324    *
1325    * @since 1.4
1326    */
1327   public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
1328
1329   /**
1330    * Weak bidirectional character type "BN".
1331    *
1332    * @since 1.4
1333    */
1334   public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
1335
1336   /**
1337    * Neutral bidirectional character type "B".
1338    *
1339    * @since 1.4
1340    */
1341   public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
1342
1343   /**
1344    * Neutral bidirectional character type "S".
1345    *
1346    * @since 1.4
1347    */
1348   public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
1349
1350   /**
1351    * Neutral bidirectional character type "WS".
1352    *
1353    * @since 1.4
1354    */
1355   public static final byte DIRECTIONALITY_WHITESPACE = 12;
1356
1357   /**
1358    * Neutral bidirectional character type "ON".
1359    *
1360    * @since 1.4
1361    */
1362   public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
1363
1364   /**
1365    * Strong bidirectional character type "LRE".
1366    *
1367    * @since 1.4
1368    */
1369   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
1370
1371   /**
1372    * Strong bidirectional character type "LRO".
1373    *
1374    * @since 1.4
1375    */
1376   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
1377
1378   /**
1379    * Strong bidirectional character type "RLE".
1380    *
1381    * @since 1.4
1382    */
1383   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
1384
1385   /**
1386    * Strong bidirectional character type "RLO".
1387    *
1388    * @since 1.4
1389    */
1390   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
1391
1392   /**
1393    * Weak bidirectional character type "PDF".
1394    *
1395    * @since 1.4
1396    */
1397   public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
1398
1399   /**
1400    * Stores unicode block offset lookup table. Exploit package visibility of
1401    * String.value to avoid copying the array.
1402    * @see #readChar(char)
1403    * @see CharData#BLOCKS
1404    */
1405   private static final char[] blocks = String.zeroBasedStringValue(CharData.BLOCKS);
1406
1407   /**
1408    * Stores unicode attribute offset lookup table. Exploit package visibility
1409    * of String.value to avoid copying the array.
1410    * @see CharData#DATA
1411    */
1412   private static final char[] data = String.zeroBasedStringValue(CharData.DATA);
1413
1414   /**
1415    * Stores unicode numeric value attribute table. Exploit package visibility
1416    * of String.value to avoid copying the array.
1417    * @see CharData#NUM_VALUE
1418    */
1419   private static final char[] numValue
1420           = String.zeroBasedStringValue(CharData.NUM_VALUE);
1421
1422   /**
1423    * Stores unicode uppercase attribute table. Exploit package visibility
1424    * of String.value to avoid copying the array.
1425    * @see CharData#UPPER
1426    */
1427   private static final char[] upper = String.zeroBasedStringValue(CharData.UPPER);
1428
1429   /**
1430    * Stores unicode lowercase attribute table. Exploit package visibility
1431    * of String.value to avoid copying the array.
1432    * @see CharData#LOWER
1433    */
1434   private static final char[] lower = String.zeroBasedStringValue(CharData.LOWER);
1435
1436   /**
1437    * Stores unicode direction attribute table. Exploit package visibility
1438    * of String.value to avoid copying the array.
1439    * @see CharData#DIRECTION
1440    */
1441   // Package visible for use by String.
1442   static final char[] direction = String.zeroBasedStringValue(CharData.DIRECTION);
1443
1444   /**
1445    * Stores unicode titlecase table. Exploit package visibility of
1446    * String.value to avoid copying the array.
1447    * @see CharData#TITLE
1448    */
1449   private static final char[] title = String.zeroBasedStringValue(CharData.TITLE);
1450
1451   /**
1452    * Mask for grabbing the type out of the contents of data.
1453    * @see CharData#DATA
1454    */
1455   private static final int TYPE_MASK = 0x1F;
1456
1457   /**
1458    * Mask for grabbing the non-breaking space flag out of the contents of
1459    * data.
1460    * @see CharData#DATA
1461    */
1462   private static final int NO_BREAK_MASK = 0x20;
1463
1464   /**
1465    * Mask for grabbing the mirrored directionality flag out of the contents
1466    * of data.
1467    * @see CharData#DATA
1468    */
1469   private static final int MIRROR_MASK = 0x40;
1470
1471   /**
1472    * Min value for supplementary code point.
1473    *
1474    * @since 1.5
1475    */
1476   public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
1477
1478   /**
1479    * Min value for code point.
1480    *
1481    * @since 1.5
1482    */
1483   public static final int MIN_CODE_POINT = 0;
1484
1485
1486   /**
1487    * Max value for code point.
1488    *
1489    * @since 1.5
1490    */
1491   public static final int MAX_CODE_POINT = 0x010ffff;
1492
1493
1494   /**
1495    * Minimum high surrogate code in UTF-16 encoding.
1496    *
1497    * @since 1.5
1498    */
1499   public static final char MIN_HIGH_SURROGATE = '\ud800';
1500
1501   /**
1502    * Maximum high surrogate code in UTF-16 encoding.
1503    *
1504    * @since 1.5
1505    */
1506   public static final char MAX_HIGH_SURROGATE = '\udbff';
1507
1508   /**
1509    * Minimum low surrogate code in UTF-16 encoding.
1510    *
1511    * @since 1.5
1512    */
1513   public static final char MIN_LOW_SURROGATE = '\udc00';
1514
1515   /**
1516    * Maximum low surrogate code in UTF-16 encoding.
1517    *
1518    * @since 1.5
1519    */
1520   public static final char MAX_LOW_SURROGATE = '\udfff';
1521
1522   /**
1523    * Minimum surrogate code in UTF-16 encoding.
1524    *
1525    * @since 1.5
1526    */
1527   public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
1528
1529   /**
1530    * Maximum surrogate code in UTF-16 encoding.
1531    *
1532    * @since 1.5
1533    */
1534   public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
1535
1536   /**
1537    * Grabs an attribute offset from the Unicode attribute database. The lower
1538    * 5 bits are the character type, the next 2 bits are flags, and the top
1539    * 9 bits are the offset into the attribute tables.
1540    *
1541    * @param ch the character to look up
1542    * @return the character's attribute offset and type
1543    * @see #TYPE_MASK
1544    * @see #NO_BREAK_MASK
1545    * @see #MIRROR_MASK
1546    * @see CharData#DATA
1547    * @see CharData#SHIFT
1548    */
1549   // Package visible for use in String.
1550   static char readChar(char ch)
1551   {
1552     // The sum may exceed 16 bits; only its low 16 bits index into data.
1553     return data[(blocks[ch >> CharData.SHIFT] + ch) & 0xFFFF];
1554   }
1555
1556   /**
1557    * Wraps up a character.
1558    *
1559    * @param value the character to wrap
1560    */
1561   public Character(char value)
1562   {
1563     this.value = value; // the field is final, so this is its only assignment
1564   }
1565
1566   /**
1567    * Returns the character which has been wrapped by this class.
1568    *
1569    * @return the character wrapped
1570    */
1571   public char charValue()
1572   {
1573     return this.value; // plain accessor; no conversion takes place
1574   }
1575
1576   /**
1577    * Returns the numerical value (unsigned) of the wrapped character.
1578    * Range of returned values: 0x0000-0xFFFF.
1579    *
1580    * @return the value of the wrapped character
1581    */
1582   public int hashCode()
1583   {
1584     return (int) value; // widening a char never yields a negative int
1585   }
1586
1587   /**
1588    * Determines if an object is equal to this object. This is only true for
1589    * another Character object wrapping the same value.
1590    *
1591    * @param o object to compare
1592    * @return true if o is a Character with the same value
1593    */
1594   public boolean equals(Object o)
1595   {
1596     return (o instanceof Character) && ((Character) o).value == this.value;
1597   }
1598
1599   /**
1600    * Converts the wrapped character into a String.
1601    *
1602    * @return a String containing one character -- the wrapped character
1603    *         of this instance
1604    */
1605   public String toString()
1606   {
1607     // The static overload builds the same one-character String.
1608     return toString(value);
1609   }
1610
1611   /**
1612    * Returns a String of length 1 representing the specified character.
1613    *
1614    * @param ch the character to convert
1615    * @return a String containing the character
1616    * @since 1.4
1617    */
1618   public static String toString(char ch)
1619   {
1620     char[] single = { ch };
1621     return new String(single, 0, 1, true); // package constructor, no array copy
1622   }
1623
1624   /**
1625    * Determines if a character is a Unicode lowercase letter. For example,
1626    * <code>'a'</code> is lowercase.
1627    * <br>
1628    * lowercase = [Ll]
1629    *
1630    * @param ch character to test
1631    * @return true if ch is a Unicode lowercase letter, else false
1632    * @see #isUpperCase(char)
1633    * @see #isTitleCase(char)
1634    * @see #toLowerCase(char)
1635    * @see #getType(char)
1636    */
1637   public static boolean isLowerCase(char ch)
1638   {
1639     return LOWERCASE_LETTER == getType(ch); // general category Ll only
1640   }
1641
1642   /**
1643    * Determines if a character is a Unicode uppercase letter. For example,
1644    * <code>'A'</code> is uppercase.
1645    * <br>
1646    * uppercase = [Lu]
1647    *
1648    * @param ch character to test
1649    * @return true if ch is a Unicode uppercase letter, else false
1650    * @see #isLowerCase(char)
1651    * @see #isTitleCase(char)
1652    * @see #toUpperCase(char)
1653    * @see #getType(char)
1654    */
1655   public static boolean isUpperCase(char ch)
1656   {
1657     return UPPERCASE_LETTER == getType(ch); // general category Lu only
1658   }
1659
1660   /**
1661    * Determines if a character is a Unicode titlecase letter. For example,
1662    * the character "Lj" (Latin capital L with small letter j) is titlecase.
1663    * <br>
1664    * titlecase = [Lt]
1665    *
1666    * @param ch character to test
1667    * @return true if ch is a Unicode titlecase letter, else false
1668    * @see #isLowerCase(char)
1669    * @see #isUpperCase(char)
1670    * @see #toTitleCase(char)
1671    * @see #getType(char)
1672    */
1673   public static boolean isTitleCase(char ch)
1674   {
1675     return TITLECASE_LETTER == getType(ch); // general category Lt only
1676   }
1677
1678   /**
1679    * Determines if a character is a Unicode decimal digit. For example,
1680    * <code>'0'</code> is a digit.
1681    * <br>
1682    * Unicode decimal digit = [Nd]
1683    *
1684    * @param ch character to test
1685    * @return true if ch is a Unicode decimal digit, else false
1686    * @see #digit(char, int)
1687    * @see #forDigit(int, int)
1688    * @see #getType(char)
1689    */
1690   public static boolean isDigit(char ch)
1691   {
1692     return DECIMAL_DIGIT_NUMBER == getType(ch); // general category Nd only
1693   }
1694
1695   /**
1696    * Determines if a character is part of the Unicode Standard. This is an
1697    * evolving standard, but covers every character in the data file.
1698    * <br>
1699    * defined = not [Cn]
1700    *
1701    * @param ch character to test
1702    * @return true if ch is a Unicode character, else false
1703    * @see #isDigit(char)
1704    * @see #isLetter(char)
1705    * @see #isLetterOrDigit(char)
1706    * @see #isLowerCase(char)
1707    * @see #isTitleCase(char)
1708    * @see #isUpperCase(char)
1709    */
1710   public static boolean isDefined(char ch)
1711   {
1712     return !(UNASSIGNED == getType(ch)); // anything but category Cn
1713   }
1714
1715   /**
1716    * Determines if a character is a Unicode letter. Not all letters have case,
1717    * so this may return true when isLowerCase and isUpperCase return false.
1718    * <br>
1719    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
1720    *
1721    * @param ch character to test
1722    * @return true if ch is a Unicode letter, else false
1723    * @see #isDigit(char)
1724    * @see #isJavaIdentifierStart(char)
1725    * @see #isJavaLetter(char)
1726    * @see #isJavaLetterOrDigit(char)
1727    * @see #isLetterOrDigit(char)
1728    * @see #isLowerCase(char)
1729    * @see #isTitleCase(char)
1730    * @see #isUnicodeIdentifierStart(char)
1731    * @see #isUpperCase(char)
1732    */
1733   public static boolean isLetter(char ch)
1734   {
1735     // Bit n of the mask is set when category n counts as a letter.
1736     final int letters = (1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER)
1737       | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER)
1738       | (1 << OTHER_LETTER);
1739     final int categoryBit = 1 << getType(ch);
1740     return (categoryBit & letters) != 0;
1741   }
1742
1743   /**
1744    * Determines if a character is a Unicode letter or a Unicode digit. This
1745    * is the combination of isLetter and isDigit.
1746    * <br>
1747    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
1748    *
1749    * @param ch character to test
1750    * @return true if ch is a Unicode letter or a Unicode digit, else false
1751    * @see #isDigit(char)
1752    * @see #isJavaIdentifierPart(char)
1753    * @see #isJavaLetter(char)
1754    * @see #isJavaLetterOrDigit(char)
1755    * @see #isLetter(char)
1756    * @see #isUnicodeIdentifierPart(char)
1757    */
1758   public static boolean isLetterOrDigit(char ch)
1759   {
1760     // Each general category is mapped to one bit; the mask collects the
1761     // bits of the five letter categories plus the decimal digits.
1762     final int accepted = (1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER)
1763       | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER)
1764       | (1 << OTHER_LETTER) | (1 << DECIMAL_DIGIT_NUMBER);
1765     final int categoryBit = 1 << getType(ch);
1766     return (categoryBit & accepted) != 0;
1767   }
1768
1769   /**
1770    * Determines if a character can start a Java identifier. This is the
1771    * combination of isLetter, any character where getType returns
1772    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1773    * (like '_').
1774    *
1775    * @param ch character to test
1776    * @return true if ch can start a Java identifier, else false
1777    * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
1778    * @see #isJavaLetterOrDigit(char)
1779    * @see #isJavaIdentifierStart(char)
1780    * @see #isJavaIdentifierPart(char)
1781    * @see #isLetter(char)
1782    * @see #isLetterOrDigit(char)
1783    * @see #isUnicodeIdentifierStart(char)
1784    */
1785   public static boolean isJavaLetter(char ch)
1786   {
1787     return isJavaIdentifierStart(ch); // pure forwarder to the replacement
1788   }
1789
1790   /**
1791    * Determines if a character can follow the first letter in
1792    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
1793    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1794    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1795    * or isIdentifierIgnorable.
1796    *
1797    * @param ch character to test
1798    * @return true if ch can follow the first letter in a Java identifier
1799    * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
1800    * @see #isJavaLetter(char)
1801    * @see #isJavaIdentifierStart(char)
1802    * @see #isJavaIdentifierPart(char)
1803    * @see #isLetter(char)
1804    * @see #isLetterOrDigit(char)
1805    * @see #isUnicodeIdentifierPart(char)
1806    * @see #isIdentifierIgnorable(char)
1807    */
1808   public static boolean isJavaLetterOrDigit(char ch)
1809   {
1810     return isJavaIdentifierPart(ch); // pure forwarder to the replacement
1811   }
1812
1813   /**
1814    * Determines if a character can start a Java identifier. This is the
1815    * combination of isLetter, any character where getType returns
1816    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1817    * (like '_').
1818    * <br>
1819    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
1820    *
1821    * @param ch character to test
1822    * @return true if ch can start a Java identifier, else false
1823    * @see #isJavaIdentifierPart(char)
1824    * @see #isLetter(char)
1825    * @see #isUnicodeIdentifierStart(char)
1826    * @since 1.1
1827    */
1828   public static boolean isJavaIdentifierStart(char ch)
1829   {
1830     // One bit per general category; the mask collects the letter
1831     // categories, letter numbers, currency symbols and connectors.
1832     final int letters = (1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER)
1833       | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER)
1834       | (1 << OTHER_LETTER);
1835     final int extras = (1 << LETTER_NUMBER) | (1 << CURRENCY_SYMBOL)
1836       | (1 << CONNECTOR_PUNCTUATION);
1837     final int categoryBit = 1 << getType(ch);
1838     return (categoryBit & (letters | extras)) != 0;
1839   }
1840
1841   /**
1842    * Determines if a character can follow the first letter in
1843    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
1844    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1845    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1846    * or isIdentifierIgnorable.
1847    * <br>
1848    * Java identifier extender =
1849    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
1850    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1851    *
1852    * @param ch character to test
1853    * @return true if ch can follow the first letter in a Java identifier
1854    * @see #isIdentifierIgnorable(char)
1855    * @see #isJavaIdentifierStart(char)
1856    * @see #isLetterOrDigit(char)
1857    * @see #isUnicodeIdentifierPart(char)
1858    * @since 1.1
1859    */
1860   public static boolean isJavaIdentifierPart(char ch)
1861   {
1862     final int type = getType(ch);
1863     // One bit per general category that is always acceptable.
1864     final int letters = (1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER)
1865       | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER)
1866       | (1 << OTHER_LETTER);
1867     final int marks = (1 << NON_SPACING_MARK) | (1 << COMBINING_SPACING_MARK);
1868     final int numbers = (1 << DECIMAL_DIGIT_NUMBER) | (1 << LETTER_NUMBER);
1869     final int others = (1 << CURRENCY_SYMBOL) | (1 << CONNECTOR_PUNCTUATION)
1870       | (1 << FORMAT);
1871     if (((1 << type) & (letters | marks | numbers | others)) != 0)
1872       return true;
1873     // Control characters are accepted only when they are ignorable.
1874     if (type != CONTROL)
1875       return false;
1876     return isIdentifierIgnorable(ch);
1877   }
1878
1879   /**
1880    * Determines if a character can start a Unicode identifier.  Only
1881    * letters can start a Unicode identifier, but this includes characters
1882    * in LETTER_NUMBER.
1883    * <br>
1884    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
1885    *
1886    * @param ch character to test
1887    * @return true if ch can start a Unicode identifier, else false
1888    * @see #isJavaIdentifierStart(char)
1889    * @see #isLetter(char)
1890    * @see #isUnicodeIdentifierPart(char)
1891    * @since 1.1
1892    */
1893   public static boolean isUnicodeIdentifierStart(char ch)
1894   {
1895     // One bit per general category; the mask collects the five letter
1896     // categories plus letter numbers.
1897     final int starters = (1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER)
1898       | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER)
1899       | (1 << OTHER_LETTER) | (1 << LETTER_NUMBER);
1900     final int categoryBit = 1 << getType(ch);
1901     return (categoryBit & starters) != 0;
1902   }
1903
1904   /**
1905    * Determines if a character can follow the first letter in
1906    * a Unicode identifier. This includes letters, connecting punctuation,
1907    * digits, numeric letters, combining marks, non-spacing marks, and
1908    * isIdentifierIgnorable.
1909    * <br>
1910    * Unicode identifier extender =
1911    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]
1912    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1913    *
1914    * @param ch character to test
1915    * @return true if ch can follow the first letter in a Unicode identifier
1916    * @see #isIdentifierIgnorable(char)
1917    * @see #isJavaIdentifierPart(char)
1918    * @see #isLetterOrDigit(char)
1919    * @see #isUnicodeIdentifierStart(char)
1920    * @since 1.1
1921    */
1922   public static boolean isUnicodeIdentifierPart(char ch)
1923   {
1924     final int type = getType(ch);
1925     // One bit per general category that is always acceptable.
1926     final int letters = (1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER)
1927       | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER)
1928       | (1 << OTHER_LETTER);
1929     final int marks = (1 << NON_SPACING_MARK) | (1 << COMBINING_SPACING_MARK);
1930     final int numbers = (1 << DECIMAL_DIGIT_NUMBER) | (1 << LETTER_NUMBER);
1931     final int others = (1 << CONNECTOR_PUNCTUATION) | (1 << FORMAT);
1932     if (((1 << type) & (letters | marks | numbers | others)) != 0)
1933       return true;
1934     // Control characters are accepted only when they are ignorable.
1935     if (type != CONTROL)
1936       return false;
1937     return isIdentifierIgnorable(ch);
1938   }
1939
1940   /**
1941    * Determines if a character is ignorable in a Unicode identifier. This
1942    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
1943    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
1944    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
1945    * <code>'\u009F'</code>), and FORMAT characters.
1946    * <br>
1947    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
1948    *    |U+007F-U+009F
1949    *
1950    * @param ch character to test
1951    * @return true if ch is ignorable in a Unicode or Java identifier
1952    * @see #isJavaIdentifierPart(char)
1953    * @see #isUnicodeIdentifierPart(char)
1954    * @since 1.1
1955    */
1956   public static boolean isIdentifierIgnorable(char ch)
1957   {
1958     if (ch <= 0x08 || (ch >= 0x0E && ch <= 0x1B) || (ch >= 0x7F && ch <= 0x9F))
1959       return true; // one of the three fixed control-character ranges
1960     return getType(ch) == FORMAT;
1961   }
1962
1963   /**
1964    * Converts a Unicode character into its lowercase equivalent mapping.
1965    * If a mapping does not exist, then the character passed is returned.
1966    * Note that isLowerCase(toLowerCase(ch)) does not always return true.
1967    *
1968    * @param ch character to convert to lowercase
1969    * @return lowercase mapping of ch, or ch if lowercase mapping does
1970    *         not exist
1971    * @see #isLowerCase(char)
1972    * @see #isUpperCase(char)
1973    * @see #toTitleCase(char)
1974    * @see #toUpperCase(char)
1975    */
1976   public static char toLowerCase(char ch)
1977   {
1978     int delta = lower[readChar(ch) >> 7]; // the char cast below wraps the sum
1979     return (char) (ch + delta);
1980   }
1981
1982   /**
1983    * Converts a Unicode character into its uppercase equivalent mapping.
1984    * If a mapping does not exist, then the character passed is returned.
1985    * Note that isUpperCase(toUpperCase(ch)) does not always return true.
1986    *
1987    * @param ch character to convert to uppercase
1988    * @return uppercase mapping of ch, or ch if uppercase mapping does
1989    *         not exist
1990    * @see #isLowerCase(char)
1991    * @see #isUpperCase(char)
1992    * @see #toLowerCase(char)
1993    * @see #toTitleCase(char)
1994    */
1995   public static char toUpperCase(char ch)
1996   {
1997     int delta = upper[readChar(ch) >> 7]; // the char cast below wraps the sum
1998     return (char) (ch + delta);
1999   }
2000
2001   /**
2002    * Converts a Unicode character into its titlecase equivalent mapping.
2003    * If a mapping does not exist, then the character passed is returned.
2004    * Note that isTitleCase(toTitleCase(ch)) does not always return true.
2005    *
2006    * @param ch character to convert to titlecase
2007    * @return titlecase mapping of ch, or ch if titlecase mapping does
2008    *         not exist
2009    * @see #isTitleCase(char)
2010    * @see #toLowerCase(char)
2011    * @see #toUpperCase(char)
2012    */
2013   public static char toTitleCase(char ch)
2014   {
2015     // title is read as (character, titlecase) pairs, scanned from the end.
2016     int key = title.length - 2;
2017     while (key >= 0 && title[key] != ch)
2018       key -= 2;
2019     return key >= 0 ? title[key + 1] : toUpperCase(ch);
2020   }
2021
2022   /**
2023    * Converts a character into a digit of the specified radix. If the radix
2024    * lies outside MIN_RADIX..MAX_RADIX, or if getNumericValue(ch) is not less
2025    * than the radix, or if ch is not a decimal digit or in the case
2026    * insensitive set of 'a'-'z', the result is -1.
2027    * <br>
2028    * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
2029    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
2030    *
2031    * @param ch character to convert into a digit
2032    * @param radix radix in which ch is a digit
2033    * @return digit which ch represents in radix, or -1 not a valid digit
2034    * @see #MIN_RADIX
2035    * @see #MAX_RADIX
2036    * @see #forDigit(int, int)
2037    * @see #isDigit(char)
2038    * @see #getNumericValue(char)
2039    */
2040   public static int digit(char ch, int radix)
2041   {
2042     if (radix < MIN_RADIX || radix > MAX_RADIX)
2043       return -1;
2044     char attr = readChar(ch);
2045     if (((1 << (attr & TYPE_MASK))
2046          & ((1 << UPPERCASE_LETTER)
2047             | (1 << LOWERCASE_LETTER)
2048             | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
2049       {
2050         // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
2051         int digit = numValue[attr >> 7];
2052         return (digit < radix) ? digit : -1;
2053       }
2054     return -1;
2055   }
2056
2057   /**
2058    * Returns the Unicode numeric value property of a character. For example,
2059    * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
2060    *
2061    * <p>This method also returns values for the letters A through Z, (not
2062    * specified by Unicode), in these ranges: <code>'\u0041'</code>
2063    * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
2064    * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
2065    * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
2066    * <code>'\uFF5A'</code> (full width variants).
2067    *
2068    * <p>If the character lacks a numeric value property, -1 is returned.
2069    * If the character has a numeric value property which is not representable
2070    * as a nonnegative integer, such as a fraction, -2 is returned.
2071    *
2072    * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
2073    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
2074    *
2075    * @param ch character from which the numeric value property will
2076    *        be retrieved
2077    * @return the numeric value property of ch, or -1 if it does not exist, or
2078    *         -2 if it is not representable as a nonnegative integer
2079    * @see #forDigit(int, int)
2080    * @see #digit(char, int)
2081    * @see #isDigit(char)
2082    * @since 1.1
2083    */
2084   public static int getNumericValue(char ch)
2085   {
2086     // Treat numValue as signed.
2087     return (short) numValue[readChar(ch) >> 7];
2088   }
2089
2090   /**
2091    * Determines if a character is an ISO-LATIN-1 space. This is only the five
2092    * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
2093    * <code>'\r'</code>, and <code>' '</code>.
2094    * <br>
2095    * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
2096    *
2097    * @param ch character to test
2098    * @return true if ch is a space, else false
2099    * @deprecated Replaced by {@link #isWhitespace(char)}
2100    * @see #isSpaceChar(char)
2101    * @see #isWhitespace(char)
2102    */
2103   public static boolean isSpace(char ch)
2104   {
2105     // Performing the subtraction up front alleviates need to compare longs.
2106     return ch-- <= ' ' && ((1 << ch)
2107                            & ((1 << (' ' - 1))
2108                               | (1 << ('\t' - 1))
2109                               | (1 << ('\n' - 1))
2110                               | (1 << ('\r' - 1))
2111                               | (1 << ('\f' - 1)))) != 0;
2112   }
2113
2114   /**
2115    * Determines if a character is a Unicode space character. This includes
2116    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
2117    * <br>
2118    * Unicode space = [Zs]|[Zp]|[Zl]
2119    *
2120    * @param ch character to test
2121    * @return true if ch is a Unicode space, else false
2122    * @see #isWhitespace(char)
2123    * @since 1.1
2124    */
2125   public static boolean isSpaceChar(char ch)
2126   {
2127     return ((1 << getType(ch))
2128             & ((1 << SPACE_SEPARATOR)
2129                | (1 << LINE_SEPARATOR)
2130                | (1 << PARAGRAPH_SEPARATOR))) != 0;
2131   }
2132
2133   /**
2134    * Determines if a character is Java whitespace. This includes Unicode
2135    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
2136    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
2137    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
2138    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
2139    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
2140    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
2141    * and <code>'\u001F'</code>.
2142    * <br>
2143    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
2144    *
2145    * @param ch character to test
2146    * @return true if ch is Java whitespace, else false
2147    * @see #isSpaceChar(char)
2148    * @since 1.1
2149    */
2150   public static boolean isWhitespace(char ch)
2151   {
2152     int attr = readChar(ch);
2153     return ((((1 << (attr & TYPE_MASK))
2154               & ((1 << SPACE_SEPARATOR)
2155                  | (1 << LINE_SEPARATOR)
2156                  | (1 << PARAGRAPH_SEPARATOR))) != 0)
2157             && (attr & NO_BREAK_MASK) == 0)
2158       || (ch <= '\u001F' && ((1 << ch)
2159                              & ((1 << '\t')
2160                                 | (1 << '\n')
2161                                 | (1 << '\u000B')
2162                                 | (1 << '\u000C')
2163                                 | (1 << '\r')
2164                                 | (1 << '\u001C')
2165                                 | (1 << '\u001D')
2166                                 | (1 << '\u001E')
2167                                 | (1 << '\u001F'))) != 0);
2168   }
2169
2170   /**
2171    * Determines if a character has the ISO Control property.
2172    * <br>
2173    * ISO Control = [Cc]
2174    *
2175    * @param ch character to test
2176    * @return true if ch is an ISO Control character, else false
2177    * @see #isSpaceChar(char)
2178    * @see #isWhitespace(char)
2179    * @since 1.1
2180    */
2181   public static boolean isISOControl(char ch)
2182   {
2183     return getType(ch) == CONTROL;
2184   }
2185
2186   /**
2187    * Returns the Unicode general category property of a character.
2188    *
2189    * @param ch character from which the general category property will
2190    *        be retrieved
2191    * @return the character category property of ch as an integer
2192    * @see #UNASSIGNED
2193    * @see #UPPERCASE_LETTER
2194    * @see #LOWERCASE_LETTER
2195    * @see #TITLECASE_LETTER
2196    * @see #MODIFIER_LETTER
2197    * @see #OTHER_LETTER
2198    * @see #NON_SPACING_MARK
2199    * @see #ENCLOSING_MARK
2200    * @see #COMBINING_SPACING_MARK
2201    * @see #DECIMAL_DIGIT_NUMBER
2202    * @see #LETTER_NUMBER
2203    * @see #OTHER_NUMBER
2204    * @see #SPACE_SEPARATOR
2205    * @see #LINE_SEPARATOR
2206    * @see #PARAGRAPH_SEPARATOR
2207    * @see #CONTROL
2208    * @see #FORMAT
2209    * @see #PRIVATE_USE
2210    * @see #SURROGATE
2211    * @see #DASH_PUNCTUATION
2212    * @see #START_PUNCTUATION
2213    * @see #END_PUNCTUATION
2214    * @see #CONNECTOR_PUNCTUATION
2215    * @see #OTHER_PUNCTUATION
2216    * @see #MATH_SYMBOL
2217    * @see #CURRENCY_SYMBOL
2218    * @see #MODIFIER_SYMBOL
2219    * @see #INITIAL_QUOTE_PUNCTUATION
2220    * @see #FINAL_QUOTE_PUNCTUATION
2221    * @since 1.1
2222    */
2223   public static int getType(char ch)
2224   {
2225     return readChar(ch) & TYPE_MASK;
2226   }
2227
2228   /**
2229    * Converts a digit into a character which represents that digit
2230    * in a specified radix. If the radix lies outside MIN_RADIX..MAX_RADIX, or
2231    * the digit lies outside 0..radix-1, the null character <code>'\0'</code>
2232    * is returned.  Otherwise the return value is in '0'-'9' and 'a'-'z'.
2233    * <br>
2234    * return value boundary = U+0030-U+0039|U+0061-U+007A
2235    *
2236    * @param digit digit to be converted into a character
2237    * @param radix radix of digit
2238    * @return character representing digit in radix, or '\0'
2239    * @see #MIN_RADIX
2240    * @see #MAX_RADIX
2241    * @see #digit(char, int)
2242    */
2243   public static char forDigit(int digit, int radix)
2244   {
2245     if (radix < MIN_RADIX || radix > MAX_RADIX
2246         || digit < 0 || digit >= radix)
2247       return '\0';
2248     return Number.digits[digit];
2249   }
2250
2251   /**
2252    * Returns the Unicode directionality property of the character. This
2253    * is used in the visual ordering of text.
2254    *
2255    * @param ch the character to look up
2256    * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
2257    * @see #DIRECTIONALITY_UNDEFINED
2258    * @see #DIRECTIONALITY_LEFT_TO_RIGHT
2259    * @see #DIRECTIONALITY_RIGHT_TO_LEFT
2260    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
2261    * @see #DIRECTIONALITY_EUROPEAN_NUMBER
2262    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
2263    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
2264    * @see #DIRECTIONALITY_ARABIC_NUMBER
2265    * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
2266    * @see #DIRECTIONALITY_NONSPACING_MARK
2267    * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
2268    * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
2269    * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
2270    * @see #DIRECTIONALITY_WHITESPACE
2271    * @see #DIRECTIONALITY_OTHER_NEUTRALS
2272    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
2273    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
2274    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
2275    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
2276    * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
2277    * @since 1.4
2278    */
2279   public static byte getDirectionality(char ch)
2280   {
2281     // The result will correctly be signed.
2282     return (byte) (direction[readChar(ch) >> 7] >> 2);
2283   }
2284
2285   /**
2286    * Determines whether the character is mirrored according to Unicode. For
2287    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
2288    * left-to-right text, but ')' in right-to-left text.
2289    *
2290    * @param ch the character to look up
2291    * @return true if the character is mirrored
2292    * @since 1.4
2293    */
2294   public static boolean isMirrored(char ch)
2295   {
2296     return (readChar(ch) & MIRROR_MASK) != 0;
2297   }
2298
2299   /**
2300    * Compares another Character to this Character, numerically.
2301    *
2302    * @param anotherCharacter Character to compare with this Character
2303    * @return a negative integer if this Character is less than
2304    *         anotherCharacter, zero if this Character is equal, and
2305    *         a positive integer if this Character is greater
2306    * @throws NullPointerException if anotherCharacter is null
2307    * @since 1.2
2308    */
2309   public int compareTo(Character anotherCharacter)
2310   {
2311     return value - anotherCharacter.value;
2312   }
2313
2314   /**
2315    * Compares an object to this Character.  Assuming the object is a
2316    * Character object, this method performs the same comparison as
2317    * compareTo(Character).
2318    *
2319    * @param o object to compare
2320    * @return the comparison value
2321    * @throws ClassCastException if o is not a Character object
2322    * @throws NullPointerException if o is null
2323    * @see #compareTo(Character)
2324    * @since 1.2
2325    */
2326   public int compareTo(Object o)
2327   {
2328     return compareTo((Character) o);
2329   }
2330
2331   /**
2332    * Returns a <code>Character</code> object wrapping the value.
2333    * In contrast to the <code>Character</code> constructor, this method
2334    * will cache some values.  It is used by boxing conversion.
2335    *
2336    * @param val the value to wrap
2337    * @return the <code>Character</code>
2338    *
2339    * @since 1.5
2340    */
2341   public static Character valueOf(char val)
2342   {
2343     if (val > MAX_CACHE)
2344       return new Character(val);
2345     synchronized (charCache)
2346       {
2347     if (charCache[val - MIN_VALUE] == null)
2348       charCache[val - MIN_VALUE] = new Character(val);
2349     return charCache[val - MIN_VALUE];
2350       }
2351   }
2352
2353   /**
2354    * Reverse the bytes in val.
2355    * @since 1.5
2356    */
2357   public static char reverseBytes(char val)
2358   {
2359     return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
2360   }
2361
2362   /**
2363    * Converts a unicode code point to a UTF-16 representation of that
2364    * code point.
2365    *
2366    * @param codePoint the unicode code point
2367    *
2368    * @return the UTF-16 representation of that code point
2369    *
2370    * @throws IllegalArgumentException if the code point is not a valid
2371    *         unicode code point
2372    *
2373    * @since 1.5
2374    */
2375   public static char[] toChars(int codePoint)
2376   {
2377     char[] result = new char[charCount(codePoint)];
2378     int ignore = toChars(codePoint, result, 0);
2379     return result;
2380   }
2381
2382   /**
2383    * Converts a unicode code point to its UTF-16 representation.
2384    *
2385    * @param codePoint the unicode code point
2386    * @param dst the target char array
2387    * @param dstIndex the start index for the target
2388    *
2389    * @return number of characters written to <code>dst</code>
2390    *
2391    * @throws IllegalArgumentException if <code>codePoint</code> is not a
2392    *         valid unicode code point
2393    * @throws NullPointerException if <code>dst</code> is <code>null</code>
2394    * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
2395    *         in <code>dst</code> or if the UTF-16 representation does not
2396    *         fit into <code>dst</code>
2397    *
2398    * @since 1.5
2399    */
2400   public static int toChars(int codePoint, char[] dst, int dstIndex)
2401   {
2402     if (!isValidCodePoint(codePoint))
2403       {
2404         throw new IllegalArgumentException("not a valid code point: "
2405                                            + codePoint);
2406       }
2407
2408     int result;
2409     if (isSupplementaryCodePoint(codePoint))
2410       {
2411         // Write second char first to cause IndexOutOfBoundsException
2412         // immediately.
2413         final int cp2 = codePoint - 0x10000;
2414         dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
2415         dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
2416         result = 2;
2417       }
2418     else
2419       {
2420         dst[dstIndex] = (char) codePoint;
2421         result = 1;
2422       }
2423     return result;
2424   }
2425
2426   /**
2427    * Return number of 16-bit characters required to represent the given
2428    * code point.
2429    *
2430    * @param codePoint a unicode code point
2431    *
2432    * @return 2 if codePoint >= 0x10000, 1 otherwise.
2433    *
2434    * @since 1.5
2435    */
2436   public static int charCount(int codePoint)
2437   {
2438     return
2439       (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
2440       ? 2
2441       : 1;
2442   }
2443
2444   /**
2445    * Determines whether the specified code point is
2446    * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
2447    * supplementary character range.
2448    *
2449    * @param codePoint a Unicode code point
2450    *
2451    * @return <code>true</code> if code point is in supplementary range
2452    *
2453    * @since 1.5
2454    */
2455   public static boolean isSupplementaryCodePoint(int codePoint)
2456   {
2457     return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
2458       && codePoint <= MAX_CODE_POINT;
2459   }
2460
2461   /**
2462    * Determines whether the specified code point is
2463    * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
2464    *
2465    * @param codePoint a Unicode code point
2466    *
2467    * @return <code>true</code> if code point is valid
2468    *
2469    * @since 1.5
2470    */
2471   public static boolean isValidCodePoint(int codePoint)
2472   {
2473     return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
2474   }
2475
2476   /**
2477    * Return true if the given character is a high surrogate.
2478    * @param ch the character
2479    * @return true if the character is a high surrogate character
2480    *
2481    * @since 1.5
2482    */
2483   public static boolean isHighSurrogate(char ch)
2484   {
2485     return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
2486   }
2487
2488   /**
2489    * Return true if the given character is a low surrogate.
2490    * @param ch the character
2491    * @return true if the character is a low surrogate character
2492    *
2493    * @since 1.5
2494    */
2495   public static boolean isLowSurrogate(char ch)
2496   {
2497     return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
2498   }
2499
2500   /**
2501    * Return true if the given characters compose a surrogate pair.
2502    * This is true if the first character is a high surrogate and the
2503    * second character is a low surrogate.
2504    * @param ch1 the first character
2505    * @param ch2 the second character
2506    * @return true if the characters compose a surrogate pair
2507    *
2508    * @since 1.5
2509    */
2510   public static boolean isSurrogatePair(char ch1, char ch2)
2511   {
2512     return isHighSurrogate(ch1) && isLowSurrogate(ch2);
2513   }
2514
2515   /**
2516    * Given a valid surrogate pair, this returns the corresponding
2517    * code point.
2518    * @param high the high character of the pair
2519    * @param low the low character of the pair
2520    * @return the corresponding code point
2521    *
2522    * @since 1.5
2523    */
2524   public static int toCodePoint(char high, char low)
2525   {
2526     return ((high - MIN_HIGH_SURROGATE) * 0x400) +
2527       (low - MIN_LOW_SURROGATE) + 0x10000;
2528   }
2529
2530   /**
2531    * Get the code point at the specified index in the CharSequence.
2532    * This is like CharSequence#charAt(int), but if the character is
2533    * the start of a surrogate pair, and there is a following
2534    * character, and this character completes the pair, then the
2535    * corresponding supplementary code point is returned.  Otherwise,
2536    * the character at the index is returned.
2537    *
2538    * @param sequence the CharSequence
2539    * @param index the index of the codepoint to get, starting at 0
2540    * @return the codepoint at the specified index
2541    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2542    * @since 1.5
2543    */
2544   public static int codePointAt(CharSequence sequence, int index)
2545   {
2546     int len = sequence.length();
2547     if (index < 0 || index >= len)
2548       throw new IndexOutOfBoundsException();
2549     char high = sequence.charAt(index);
2550     if (! isHighSurrogate(high) || ++index >= len)
2551       return high;
2552     char low = sequence.charAt(index);
2553     if (! isLowSurrogate(low))
2554       return high;
2555     return toCodePoint(high, low);
2556   }
2557
2558   /**
2559    * Get the code point at the specified index in the char array.
2560    * If the character is the start of a surrogate pair, and there is a
2561    * following character, and this character completes the pair, then
2562    * the corresponding supplementary code point is returned.
2563    * Otherwise, the character at the index is returned.
2564    *
2565    * @param chars the character array in which to look
2566    * @param index the index of the codepoint to get, starting at 0
2567    * @return the codepoint at the specified index
2568    * @throws IndexOutOfBoundsException if index is negative or &gt;= chars.length
2569    * @since 1.5
2570    */
2571   public static int codePointAt(char[] chars, int index)
2572   {
2573     return codePointAt(chars, index, chars.length);
2574   }
2575
2576   /**
2577    * Get the code point at the specified index in the CharSequence.
2578    * If the character is the start of a surrogate pair, and there is a
2579    * following character within the specified range, and this
2580    * character completes the pair, then the corresponding
2581    * supplementary code point is returned.  Otherwise, the character
2582    * at the index is returned.
2583    *
2584    * @param chars the character array in which to look
2585    * @param index the index of the codepoint to get, starting at 0
2586    * @param limit the limit past which characters should not be examined
2587    * @return the codepoint at the specified index
2588    * @throws IndexOutOfBoundsException if index is negative or &gt;=
2589    * limit, or if limit is negative or &gt;= the length of the array
2590    * @since 1.5
2591    */
2592   public static int codePointAt(char[] chars, int index, int limit)
2593   {
2594     if (index < 0 || index >= limit || limit < 0 || limit >= chars.length)
2595       throw new IndexOutOfBoundsException();
2596     char high = chars[index];
2597     if (! isHighSurrogate(high) || ++index >= limit)
2598       return high;
2599     char low = chars[index];
2600     if (! isLowSurrogate(low))
2601       return high;
2602     return toCodePoint(high, low);
2603   }
2604
2605   /**
2606    * Get the code point before the specified index.  This is like
2607    * #codePointAt(char[], int), but checks the characters at
2608    * <code>index-1</code> and <code>index-2</code> to see if they form
2609    * a supplementary code point.  If they do not, the character at
2610    * <code>index-1</code> is returned.
2611    *
2612    * @param chars the character array
2613    * @param index the index just past the codepoint to get, starting at 0
2614    * @return the codepoint before the specified index
2615    * @throws IndexOutOfBoundsException if index is &lt; 1 or &gt; chars.length
2616    * @since 1.5
2617    */
2618   public static int codePointBefore(char[] chars, int index)
2619   {
2620     return codePointBefore(chars, index, 1);
2621   }
2622
2623   /**
2624    * Get the code point before the specified index.  This is like
2625    * #codePointAt(char[], int), but checks the characters at
2626    * <code>index-1</code> and <code>index-2</code> to see if they form
2627    * a supplementary code point.  If they do not, the character at
2628    * <code>index-1</code> is returned.  The start parameter is used to
2629    * limit the range of the array which may be examined.
2630    *
2631    * @param chars the character array
2632    * @param index the index just past the codepoint to get, starting at 0
2633    * @param start the index before which characters should not be examined
2634    * @return the codepoint at the specified index
2635    * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
2636    * the length of the array, or if limit is negative or &gt;= the
2637    * length of the array
2638    * @since 1.5
2639    */
2640   public static int codePointBefore(char[] chars, int index, int start)
2641   {
2642     if (index < start || index > chars.length
2643         || start < 0 || start >= chars.length)
2644       throw new IndexOutOfBoundsException();
2645     --index;
2646     char low = chars[index];
2647     if (! isLowSurrogate(low) || --index < start)
2648       return low;
2649     char high = chars[index];
2650     if (! isHighSurrogate(high))
2651       return low;
2652     return toCodePoint(high, low);
2653   }
2654
2655   /**
2656    * Get the code point before the specified index.  This is like
2657    * #codePointAt(CharSequence, int), but checks the characters at
2658    * <code>index-1</code> and <code>index-2</code> to see if they form
2659    * a supplementary code point.  If they do not, the character at
2660    * <code>index-1</code> is returned.
2661    *
2662    * @param sequence the CharSequence
2663    * @param index the index just past the codepoint to get, starting at 0
2664    * @return the codepoint before the specified index
2665    * @throws IndexOutOfBoundsException if index is &lt; 1 or &gt; length()
2666    * @since 1.5
2667    */
2668   public static int codePointBefore(CharSequence sequence, int index)
2669   {
2670     int len = sequence.length();
2671     if (index < 1 || index > len)
2672       throw new IndexOutOfBoundsException();
2673     --index;
2674     char low = sequence.charAt(index);
2675     if (! isLowSurrogate(low) || --index < 0)
2676       return low;
2677     char high = sequence.charAt(index);
2678     if (! isHighSurrogate(high))
2679       return low;
2680     return toCodePoint(high, low);
2681   }
2682 } // class Character