libjava/classpath/java/lang/Character.java

   1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
   2    Copyright (C) 1998, 1999, 2001, 2002, 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package java.lang;
  40
  41 import gnu.java.lang.CharData;
  42
  43 import java.io.Serializable;
  44 import java.text.Collator;
  45 import java.util.Locale;
  46
  47 /**
  48  * Wrapper class for the primitive char data type.  In addition, this class
  49  * allows one to retrieve property information and perform transformations
  50  * on the defined characters in the Unicode Standard, Version 4.0.0.
  51  * java.lang.Character is designed to be very dynamic, and as such, it
  52  * retrieves information on the Unicode character set from a separate
  53  * database, gnu.java.lang.CharData, which can be easily upgraded.
  54  *
  55  * <p>For predicates, boundaries are used to describe
  56  * the set of characters for which the method will return true.
  57  * This syntax uses fairly normal regular expression notation.
  58  * See 5.13 of the Unicode Standard, Version 4.0, for the
  59  * boundary specification.
  60  *
  61  * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
  62  * for more information on the Unicode Standard.
  63  *
  64  * @author Tom Tromey (tromey@cygnus.com)
  65  * @author Paul N. Fisher
  66  * @author Jochen Hoenicke
  67  * @author Eric Blake (ebb9@email.byu.edu)
  68  * @see CharData
  69  * @since 1.0
  70  * @status updated to 1.4
  71  */
  72 public final class Character implements Serializable, Comparable
  73 {
  74   /**
  75    * A subset of Unicode blocks.
  76    *
  77    * @author Paul N. Fisher
  78    * @author Eric Blake (ebb9@email.byu.edu)
  79    * @since 1.2
  80    */
  81   public static class Subset
  82   {
  83     /** The name of the subset. */
  84     private final String name;
  85
  86     /**
  87      * Construct a new subset of characters.
  88      *
  89      * @param name the name of the subset
  90      * @throws NullPointerException if name is null
  91      */
  92     protected Subset(String name)
  93     {
  94       // Note that name.toString() is name, unless name was null.
  95       this.name = name.toString();
  96     }
  97
  98     /**
  99      * Compares two Subsets for equality. This is <code>final</code>, and
 100      * restricts the comparison on the <code>==</code> operator, so it returns
 101      * true only for the same object.
 102      *
 103      * @param o the object to compare
 104      * @return true if o is this
 105      */
 106     public final boolean equals(Object o)
 107     {
 108       return o == this;
 109     }
 110
 111     /**
 112      * Makes the original hashCode of Object final, to be consistent with
 113      * equals.
 114      *
 115      * @return the hash code for this object
 116      */
 117     public final int hashCode()
 118     {
 119       return super.hashCode();
 120     }
 121
 122     /**
 123      * Returns the name of the subset.
 124      *
 125      * @return the name
 126      */
 127     public final String toString()
 128     {
 129       return name;
 130     }
 131   } // class Subset
 132
 133   /**
 134    * A family of character subsets in the Unicode specification. A character
 135    * is in at most one of these blocks.
 136    *
 137    * This inner class was generated automatically from
 138    * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts.
 139    * This Unicode definition file can be found on the
 140    * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 141    * JDK 1.5 uses Unicode version 4.0.0.
 142    *
 143    * @author scripts/unicode-blocks.pl (written by Eric Blake)
 144    * @since 1.2
 145    */
 146   public static final class UnicodeBlock extends Subset
 147   {
    /** The first code point of the block; the lookup treats it as inclusive. */
    private final int start;

    /** The last code point of the block; the lookup treats it as inclusive. */
    private final int end;

    /**
     * The canonical name of the block according to the Unicode standard,
     * e.g. "Basic Latin".  <code>forName()</code> matches against it.
     */
    private final String canonicalName;

    /**
     * <code>forName()</code> mode: the requested name contains a space and is
     * matched against the canonical name.
     */
    private static final int CANONICAL_NAME = 0;
    /**
     * <code>forName()</code> mode: the requested name has neither spaces nor
     * underscores and is matched against the canonical name with its spaces
     * removed.
     */
    private static final int NO_SPACES_NAME = 1;
    /**
     * <code>forName()</code> mode: the requested name contains an underscore
     * (and no space) and is matched against the constant-style name.
     */
    private static final int CONSTANT_NAME = 2;
 161
 162     /**
 163      * Constructor for strictly defined blocks.
 164      *
 165      * @param start the start character of the range
 166      * @param end the end character of the range
 167      * @param name the block name
 168      * @param canonicalName the name of the block as defined in the Unicode
 169      *        standard.
 170      */
 171     private UnicodeBlock(int start, int end, String name,
 172              String canonicalName)
 173     {
 174       super(name);
 175       this.start = start;
 176       this.end = end;
 177       this.canonicalName = canonicalName;
 178     }
 179
 180     /**
 181      * Returns the Unicode character block which a character belongs to.
 182      * <strong>Note</strong>: This method does not support the use of
 183      * supplementary characters.  For such support, <code>of(int)</code>
 184      * should be used instead.
 185      *
 186      * @param ch the character to look up
 187      * @return the set it belongs to, or null if it is not in one
 188      */
 189     public static UnicodeBlock of(char ch)
 190     {
 191       return of((int) ch);
 192     }
 193
 194     /**
 195      * Returns the Unicode character block which a code point belongs to.
 196      *
 197      * @param codePoint the character to look up
 198      * @return the set it belongs to, or null if it is not in one.
 199      * @throws IllegalArgumentException if the specified code point is
 200      *         invalid.
 201      * @since 1.5
 202      */
 203     public static UnicodeBlock of(int codePoint)
 204     {
 205       if (codePoint > MAX_CODE_POINT)
 206     throw new IllegalArgumentException("The supplied integer value is " +
 207                        "too large to be a codepoint.");
 208       // Simple binary search for the correct block.
 209       int low = 0;
 210       int hi = sets.length - 1;
 211       while (low <= hi)
 212         {
 213           int mid = (low + hi) >> 1;
 214           UnicodeBlock b = sets[mid];
 215           if (codePoint < b.start)
 216             hi = mid - 1;
 217           else if (codePoint > b.end)
 218             low = mid + 1;
 219           else
 220             return b;
 221         }
 222       return null;
 223     }
 224
 225     /**
 226      * <p>
 227      * Returns the <code>UnicodeBlock</code> with the given name, as defined
 228      * by the Unicode standard.  The version of Unicode in use is defined by
 229      * the <code>Character</code> class, and the names are given in the
 230      * <code>Blocks-&lt;version&gt;.txt</code> file corresponding to that version.
 231      * The name may be specified in one of three ways:
 232      * </p>
 233      * <ol>
 234      * <li>The canonical, human-readable name used by the Unicode standard.
 235      * This is the name with all spaces and hyphens retained.  For example,
 236      * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
 237      * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
 238      * <li>The name used for the constants specified by this class, which
 239      * is the canonical name with all spaces and hyphens replaced with
 240      * underscores e.g. `BASIC_LATIN'</li>
 241      * </ol>
 242      * <p>
 243      * The names are compared case-insensitively using the case comparison
 244      * associated with the U.S. English locale.  The method recognises the
 245      * previous names used for blocks as well as the current ones.  At
 246      * present, this simply means that the deprecated `SURROGATES_AREA'
 247      * will be recognised by this method (the <code>of()</code> methods
 248      * only return one of the three new surrogate blocks).
 249      * </p>
 250      *
 251      * @param blockName the name of the block to look up.
 252      * @return the specified block.
 253      * @throws NullPointerException if the <code>blockName</code> is
 254      *         <code>null</code>.
 255      * @throws IllegalArgumentException if the name does not match any Unicode
 256      *         block.
 257      * @since 1.5
 258      */
 259     public static final UnicodeBlock forName(String blockName)
 260     {
 261       int type;
 262       if (blockName.indexOf(' ') != -1)
 263         type = CANONICAL_NAME;
 264       else if (blockName.indexOf('_') != -1)
 265         type = CONSTANT_NAME;
 266       else
 267         type = NO_SPACES_NAME;
 268       Collator usCollator = Collator.getInstance(Locale.US);
 269       usCollator.setStrength(Collator.PRIMARY);
 270       /* Special case for deprecated blocks not in sets */
 271       switch (type)
 272       {
 273         case CANONICAL_NAME:
 274           if (usCollator.compare(blockName, "Surrogates Area") == 0)
 275             return SURROGATES_AREA;
 276           break;
 277         case NO_SPACES_NAME:
 278           if (usCollator.compare(blockName, "SurrogatesArea") == 0)
 279             return SURROGATES_AREA;
 280           break;
 281         case CONSTANT_NAME:
 282           if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
 283             return SURROGATES_AREA;
 284           break;
 285       }
 286       /* Other cases */
 287       int setLength = sets.length;
 288       switch (type)
 289       {
 290         case CANONICAL_NAME:
 291           for (int i = 0; i < setLength; i++)
 292             {
 293               UnicodeBlock block = sets[i];
 294               if (usCollator.compare(blockName, block.canonicalName) == 0)
 295                 return block;
 296             }
 297           break;
 298         case NO_SPACES_NAME:
 299           for (int i = 0; i < setLength; i++)
 300             {
 301               UnicodeBlock block = sets[i];
 302               String nsName = block.canonicalName.replaceAll(" ","");
 303               if (usCollator.compare(blockName, nsName) == 0)
 304                 return block;
 305             }
 306           break;
 307         case CONSTANT_NAME:
 308           for (int i = 0; i < setLength; i++)
 309             {
 310               UnicodeBlock block = sets[i];
 311               if (usCollator.compare(blockName, block.toString()) == 0)
 312                 return block;
 313             }
 314           break;
 315       }
 316       throw new IllegalArgumentException("No Unicode block found for " +
 317                                          blockName + ".");
 318     }
 319
 320     /**
 321      * Basic Latin.
 322      * 0x0000 - 0x007F.
 323      */
 324     public static final UnicodeBlock BASIC_LATIN
 325       = new UnicodeBlock(0x0000, 0x007F,
 326                          "BASIC_LATIN",
 327                          "Basic Latin");
 328
 329     /**
 330      * Latin-1 Supplement.
 331      * 0x0080 - 0x00FF.
 332      */
 333     public static final UnicodeBlock LATIN_1_SUPPLEMENT
 334       = new UnicodeBlock(0x0080, 0x00FF,
 335                          "LATIN_1_SUPPLEMENT",
 336                          "Latin-1 Supplement");
 337
 338     /**
 339      * Latin Extended-A.
 340      * 0x0100 - 0x017F.
 341      */
 342     public static final UnicodeBlock LATIN_EXTENDED_A
 343       = new UnicodeBlock(0x0100, 0x017F,
 344                          "LATIN_EXTENDED_A",
 345                          "Latin Extended-A");
 346
 347     /**
 348      * Latin Extended-B.
 349      * 0x0180 - 0x024F.
 350      */
 351     public static final UnicodeBlock LATIN_EXTENDED_B
 352       = new UnicodeBlock(0x0180, 0x024F,
 353                          "LATIN_EXTENDED_B",
 354                          "Latin Extended-B");
 355
 356     /**
 357      * IPA Extensions.
 358      * 0x0250 - 0x02AF.
 359      */
 360     public static final UnicodeBlock IPA_EXTENSIONS
 361       = new UnicodeBlock(0x0250, 0x02AF,
 362                          "IPA_EXTENSIONS",
 363                          "IPA Extensions");
 364
 365     /**
 366      * Spacing Modifier Letters.
 367      * 0x02B0 - 0x02FF.
 368      */
 369     public static final UnicodeBlock SPACING_MODIFIER_LETTERS
 370       = new UnicodeBlock(0x02B0, 0x02FF,
 371                          "SPACING_MODIFIER_LETTERS",
 372                          "Spacing Modifier Letters");
 373
 374     /**
 375      * Combining Diacritical Marks.
 376      * 0x0300 - 0x036F.
 377      */
 378     public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
 379       = new UnicodeBlock(0x0300, 0x036F,
 380                          "COMBINING_DIACRITICAL_MARKS",
 381                          "Combining Diacritical Marks");
 382
 383     /**
 384      * Greek.
 385      * 0x0370 - 0x03FF.
 386      */
 387     public static final UnicodeBlock GREEK
 388       = new UnicodeBlock(0x0370, 0x03FF,
 389                          "GREEK",
 390                          "Greek");
 391
 392     /**
 393      * Cyrillic.
 394      * 0x0400 - 0x04FF.
 395      */
 396     public static final UnicodeBlock CYRILLIC
 397       = new UnicodeBlock(0x0400, 0x04FF,
 398                          "CYRILLIC",
 399                          "Cyrillic");
 400
 401     /**
 402      * Cyrillic Supplementary.
 403      * 0x0500 - 0x052F.
 404      * @since 1.5
 405      */
 406     public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
 407       = new UnicodeBlock(0x0500, 0x052F,
 408                          "CYRILLIC_SUPPLEMENTARY",
 409                          "Cyrillic Supplementary");
 410
 411     /**
 412      * Armenian.
 413      * 0x0530 - 0x058F.
 414      */
 415     public static final UnicodeBlock ARMENIAN
 416       = new UnicodeBlock(0x0530, 0x058F,
 417                          "ARMENIAN",
 418                          "Armenian");
 419
 420     /**
 421      * Hebrew.
 422      * 0x0590 - 0x05FF.
 423      */
 424     public static final UnicodeBlock HEBREW
 425       = new UnicodeBlock(0x0590, 0x05FF,
 426                          "HEBREW",
 427                          "Hebrew");
 428
 429     /**
 430      * Arabic.
 431      * 0x0600 - 0x06FF.
 432      */
 433     public static final UnicodeBlock ARABIC
 434       = new UnicodeBlock(0x0600, 0x06FF,
 435                          "ARABIC",
 436                          "Arabic");
 437
 438     /**
 439      * Syriac.
 440      * 0x0700 - 0x074F.
 441      * @since 1.4
 442      */
 443     public static final UnicodeBlock SYRIAC
 444       = new UnicodeBlock(0x0700, 0x074F,
 445                          "SYRIAC",
 446                          "Syriac");
 447
 448     /**
 449      * Thaana.
 450      * 0x0780 - 0x07BF.
 451      * @since 1.4
 452      */
 453     public static final UnicodeBlock THAANA
 454       = new UnicodeBlock(0x0780, 0x07BF,
 455                          "THAANA",
 456                          "Thaana");
 457
 458     /**
 459      * Devanagari.
 460      * 0x0900 - 0x097F.
 461      */
 462     public static final UnicodeBlock DEVANAGARI
 463       = new UnicodeBlock(0x0900, 0x097F,
 464                          "DEVANAGARI",
 465                          "Devanagari");
 466
 467     /**
 468      * Bengali.
 469      * 0x0980 - 0x09FF.
 470      */
 471     public static final UnicodeBlock BENGALI
 472       = new UnicodeBlock(0x0980, 0x09FF,
 473                          "BENGALI",
 474                          "Bengali");
 475
 476     /**
 477      * Gurmukhi.
 478      * 0x0A00 - 0x0A7F.
 479      */
 480     public static final UnicodeBlock GURMUKHI
 481       = new UnicodeBlock(0x0A00, 0x0A7F,
 482                          "GURMUKHI",
 483                          "Gurmukhi");
 484
 485     /**
 486      * Gujarati.
 487      * 0x0A80 - 0x0AFF.
 488      */
 489     public static final UnicodeBlock GUJARATI
 490       = new UnicodeBlock(0x0A80, 0x0AFF,
 491                          "GUJARATI",
 492                          "Gujarati");
 493
 494     /**
 495      * Oriya.
 496      * 0x0B00 - 0x0B7F.
 497      */
 498     public static final UnicodeBlock ORIYA
 499       = new UnicodeBlock(0x0B00, 0x0B7F,
 500                          "ORIYA",
 501                          "Oriya");
 502
 503     /**
 504      * Tamil.
 505      * 0x0B80 - 0x0BFF.
 506      */
 507     public static final UnicodeBlock TAMIL
 508       = new UnicodeBlock(0x0B80, 0x0BFF,
 509                          "TAMIL",
 510                          "Tamil");
 511
 512     /**
 513      * Telugu.
 514      * 0x0C00 - 0x0C7F.
 515      */
 516     public static final UnicodeBlock TELUGU
 517       = new UnicodeBlock(0x0C00, 0x0C7F,
 518                          "TELUGU",
 519                          "Telugu");
 520
 521     /**
 522      * Kannada.
 523      * 0x0C80 - 0x0CFF.
 524      */
 525     public static final UnicodeBlock KANNADA
 526       = new UnicodeBlock(0x0C80, 0x0CFF,
 527                          "KANNADA",
 528                          "Kannada");
 529
 530     /**
 531      * Malayalam.
 532      * 0x0D00 - 0x0D7F.
 533      */
 534     public static final UnicodeBlock MALAYALAM
 535       = new UnicodeBlock(0x0D00, 0x0D7F,
 536                          "MALAYALAM",
 537                          "Malayalam");
 538
 539     /**
 540      * Sinhala.
 541      * 0x0D80 - 0x0DFF.
 542      * @since 1.4
 543      */
 544     public static final UnicodeBlock SINHALA
 545       = new UnicodeBlock(0x0D80, 0x0DFF,
 546                          "SINHALA",
 547                          "Sinhala");
 548
 549     /**
 550      * Thai.
 551      * 0x0E00 - 0x0E7F.
 552      */
 553     public static final UnicodeBlock THAI
 554       = new UnicodeBlock(0x0E00, 0x0E7F,
 555                          "THAI",
 556                          "Thai");
 557
 558     /**
 559      * Lao.
 560      * 0x0E80 - 0x0EFF.
 561      */
 562     public static final UnicodeBlock LAO
 563       = new UnicodeBlock(0x0E80, 0x0EFF,
 564                          "LAO",
 565                          "Lao");
 566
 567     /**
 568      * Tibetan.
 569      * 0x0F00 - 0x0FFF.
 570      */
 571     public static final UnicodeBlock TIBETAN
 572       = new UnicodeBlock(0x0F00, 0x0FFF,
 573                          "TIBETAN",
 574                          "Tibetan");
 575
 576     /**
 577      * Myanmar.
 578      * 0x1000 - 0x109F.
 579      * @since 1.4
 580      */
 581     public static final UnicodeBlock MYANMAR
 582       = new UnicodeBlock(0x1000, 0x109F,
 583                          "MYANMAR",
 584                          "Myanmar");
 585
 586     /**
 587      * Georgian.
 588      * 0x10A0 - 0x10FF.
 589      */
 590     public static final UnicodeBlock GEORGIAN
 591       = new UnicodeBlock(0x10A0, 0x10FF,
 592                          "GEORGIAN",
 593                          "Georgian");
 594
 595     /**
 596      * Hangul Jamo.
 597      * 0x1100 - 0x11FF.
 598      */
 599     public static final UnicodeBlock HANGUL_JAMO
 600       = new UnicodeBlock(0x1100, 0x11FF,
 601                          "HANGUL_JAMO",
 602                          "Hangul Jamo");
 603
 604     /**
 605      * Ethiopic.
 606      * 0x1200 - 0x137F.
 607      * @since 1.4
 608      */
 609     public static final UnicodeBlock ETHIOPIC
 610       = new UnicodeBlock(0x1200, 0x137F,
 611                          "ETHIOPIC",
 612                          "Ethiopic");
 613
 614     /**
 615      * Cherokee.
 616      * 0x13A0 - 0x13FF.
 617      * @since 1.4
 618      */
 619     public static final UnicodeBlock CHEROKEE
 620       = new UnicodeBlock(0x13A0, 0x13FF,
 621                          "CHEROKEE",
 622                          "Cherokee");
 623
 624     /**
 625      * Unified Canadian Aboriginal Syllabics.
 626      * 0x1400 - 0x167F.
 627      * @since 1.4
 628      */
 629     public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
 630       = new UnicodeBlock(0x1400, 0x167F,
 631                          "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
 632                          "Unified Canadian Aboriginal Syllabics");
 633
 634     /**
 635      * Ogham.
 636      * 0x1680 - 0x169F.
 637      * @since 1.4
 638      */
 639     public static final UnicodeBlock OGHAM
 640       = new UnicodeBlock(0x1680, 0x169F,
 641                          "OGHAM",
 642                          "Ogham");
 643
 644     /**
 645      * Runic.
 646      * 0x16A0 - 0x16FF.
 647      * @since 1.4
 648      */
 649     public static final UnicodeBlock RUNIC
 650       = new UnicodeBlock(0x16A0, 0x16FF,
 651                          "RUNIC",
 652                          "Runic");
 653
 654     /**
 655      * Tagalog.
 656      * 0x1700 - 0x171F.
 657      * @since 1.5
 658      */
 659     public static final UnicodeBlock TAGALOG
 660       = new UnicodeBlock(0x1700, 0x171F,
 661                          "TAGALOG",
 662                          "Tagalog");
 663
 664     /**
 665      * Hanunoo.
 666      * 0x1720 - 0x173F.
 667      * @since 1.5
 668      */
 669     public static final UnicodeBlock HANUNOO
 670       = new UnicodeBlock(0x1720, 0x173F,
 671                          "HANUNOO",
 672                          "Hanunoo");
 673
 674     /**
 675      * Buhid.
 676      * 0x1740 - 0x175F.
 677      * @since 1.5
 678      */
 679     public static final UnicodeBlock BUHID
 680       = new UnicodeBlock(0x1740, 0x175F,
 681                          "BUHID",
 682                          "Buhid");
 683
 684     /**
 685      * Tagbanwa.
 686      * 0x1760 - 0x177F.
 687      * @since 1.5
 688      */
 689     public static final UnicodeBlock TAGBANWA
 690       = new UnicodeBlock(0x1760, 0x177F,
 691                          "TAGBANWA",
 692                          "Tagbanwa");
 693
 694     /**
 695      * Khmer.
 696      * 0x1780 - 0x17FF.
 697      * @since 1.4
 698      */
 699     public static final UnicodeBlock KHMER
 700       = new UnicodeBlock(0x1780, 0x17FF,
 701                          "KHMER",
 702                          "Khmer");
 703
 704     /**
 705      * Mongolian.
 706      * 0x1800 - 0x18AF.
 707      * @since 1.4
 708      */
 709     public static final UnicodeBlock MONGOLIAN
 710       = new UnicodeBlock(0x1800, 0x18AF,
 711                          "MONGOLIAN",
 712                          "Mongolian");
 713
 714     /**
 715      * Limbu.
 716      * 0x1900 - 0x194F.
 717      * @since 1.5
 718      */
 719     public static final UnicodeBlock LIMBU
 720       = new UnicodeBlock(0x1900, 0x194F,
 721                          "LIMBU",
 722                          "Limbu");
 723
 724     /**
 725      * Tai Le.
 726      * 0x1950 - 0x197F.
 727      * @since 1.5
 728      */
 729     public static final UnicodeBlock TAI_LE
 730       = new UnicodeBlock(0x1950, 0x197F,
 731                          "TAI_LE",
 732                          "Tai Le");
 733
 734     /**
 735      * Khmer Symbols.
 736      * 0x19E0 - 0x19FF.
 737      * @since 1.5
 738      */
 739     public static final UnicodeBlock KHMER_SYMBOLS
 740       = new UnicodeBlock(0x19E0, 0x19FF,
 741                          "KHMER_SYMBOLS",
 742                          "Khmer Symbols");
 743
 744     /**
 745      * Phonetic Extensions.
 746      * 0x1D00 - 0x1D7F.
 747      * @since 1.5
 748      */
 749     public static final UnicodeBlock PHONETIC_EXTENSIONS
 750       = new UnicodeBlock(0x1D00, 0x1D7F,
 751                          "PHONETIC_EXTENSIONS",
 752                          "Phonetic Extensions");
 753
 754     /**
 755      * Latin Extended Additional.
 756      * 0x1E00 - 0x1EFF.
 757      */
 758     public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
 759       = new UnicodeBlock(0x1E00, 0x1EFF,
 760                          "LATIN_EXTENDED_ADDITIONAL",
 761                          "Latin Extended Additional");
 762
 763     /**
 764      * Greek Extended.
 765      * 0x1F00 - 0x1FFF.
 766      */
 767     public static final UnicodeBlock GREEK_EXTENDED
 768       = new UnicodeBlock(0x1F00, 0x1FFF,
 769                          "GREEK_EXTENDED",
 770                          "Greek Extended");
 771
 772     /**
 773      * General Punctuation.
 774      * 0x2000 - 0x206F.
 775      */
 776     public static final UnicodeBlock GENERAL_PUNCTUATION
 777       = new UnicodeBlock(0x2000, 0x206F,
 778                          "GENERAL_PUNCTUATION",
 779                          "General Punctuation");
 780
 781     /**
 782      * Superscripts and Subscripts.
 783      * 0x2070 - 0x209F.
 784      */
 785     public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
 786       = new UnicodeBlock(0x2070, 0x209F,
 787                          "SUPERSCRIPTS_AND_SUBSCRIPTS",
 788                          "Superscripts and Subscripts");
 789
 790     /**
 791      * Currency Symbols.
 792      * 0x20A0 - 0x20CF.
 793      */
 794     public static final UnicodeBlock CURRENCY_SYMBOLS
 795       = new UnicodeBlock(0x20A0, 0x20CF,
 796                          "CURRENCY_SYMBOLS",
 797                          "Currency Symbols");
 798
 799     /**
 800      * Combining Marks for Symbols.
 801      * 0x20D0 - 0x20FF.
 802      */
 803     public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
 804       = new UnicodeBlock(0x20D0, 0x20FF,
 805                          "COMBINING_MARKS_FOR_SYMBOLS",
 806                          "Combining Marks for Symbols");
 807
 808     /**
 809      * Letterlike Symbols.
 810      * 0x2100 - 0x214F.
 811      */
 812     public static final UnicodeBlock LETTERLIKE_SYMBOLS
 813       = new UnicodeBlock(0x2100, 0x214F,
 814                          "LETTERLIKE_SYMBOLS",
 815                          "Letterlike Symbols");
 816
 817     /**
 818      * Number Forms.
 819      * 0x2150 - 0x218F.
 820      */
 821     public static final UnicodeBlock NUMBER_FORMS
 822       = new UnicodeBlock(0x2150, 0x218F,
 823                          "NUMBER_FORMS",
 824                          "Number Forms");
 825
 826     /**
 827      * Arrows.
 828      * 0x2190 - 0x21FF.
 829      */
 830     public static final UnicodeBlock ARROWS
 831       = new UnicodeBlock(0x2190, 0x21FF,
 832                          "ARROWS",
 833                          "Arrows");
 834
 835     /**
 836      * Mathematical Operators.
 837      * 0x2200 - 0x22FF.
 838      */
 839     public static final UnicodeBlock MATHEMATICAL_OPERATORS
 840       = new UnicodeBlock(0x2200, 0x22FF,
 841                          "MATHEMATICAL_OPERATORS",
 842                          "Mathematical Operators");
 843
 844     /**
 845      * Miscellaneous Technical.
 846      * 0x2300 - 0x23FF.
 847      */
 848     public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
 849       = new UnicodeBlock(0x2300, 0x23FF,
 850                          "MISCELLANEOUS_TECHNICAL",
 851                          "Miscellaneous Technical");
 852
 853     /**
 854      * Control Pictures.
 855      * 0x2400 - 0x243F.
 856      */
 857     public static final UnicodeBlock CONTROL_PICTURES
 858       = new UnicodeBlock(0x2400, 0x243F,
 859                          "CONTROL_PICTURES",
 860                          "Control Pictures");
 861
 862     /**
 863      * Optical Character Recognition.
 864      * 0x2440 - 0x245F.
 865      */
 866     public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
 867       = new UnicodeBlock(0x2440, 0x245F,
 868                          "OPTICAL_CHARACTER_RECOGNITION",
 869                          "Optical Character Recognition");
 870
 871     /**
 872      * Enclosed Alphanumerics.
 873      * 0x2460 - 0x24FF.
 874      */
 875     public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
 876       = new UnicodeBlock(0x2460, 0x24FF,
 877                          "ENCLOSED_ALPHANUMERICS",
 878                          "Enclosed Alphanumerics");
 879
 880     /**
 881      * Box Drawing.
 882      * 0x2500 - 0x257F.
 883      */
 884     public static final UnicodeBlock BOX_DRAWING
 885       = new UnicodeBlock(0x2500, 0x257F,
 886                          "BOX_DRAWING",
 887                          "Box Drawing");
 888
 889     /**
 890      * Block Elements.
 891      * 0x2580 - 0x259F.
 892      */
 893     public static final UnicodeBlock BLOCK_ELEMENTS
 894       = new UnicodeBlock(0x2580, 0x259F,
 895                          "BLOCK_ELEMENTS",
 896                          "Block Elements");
 897
 898     /**
 899      * Geometric Shapes.
 900      * 0x25A0 - 0x25FF.
 901      */
 902     public static final UnicodeBlock GEOMETRIC_SHAPES
 903       = new UnicodeBlock(0x25A0, 0x25FF,
 904                          "GEOMETRIC_SHAPES",
 905                          "Geometric Shapes");
 906
 907     /**
 908      * Miscellaneous Symbols.
 909      * 0x2600 - 0x26FF.
 910      */
 911     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
 912       = new UnicodeBlock(0x2600, 0x26FF,
 913                          "MISCELLANEOUS_SYMBOLS",
 914                          "Miscellaneous Symbols");
 915
 916     /**
 917      * Dingbats.
 918      * 0x2700 - 0x27BF.
 919      */
 920     public static final UnicodeBlock DINGBATS
 921       = new UnicodeBlock(0x2700, 0x27BF,
 922                          "DINGBATS",
 923                          "Dingbats");
 924
 925     /**
 926      * Miscellaneous Mathematical Symbols-A.
 927      * 0x27C0 - 0x27EF.
 928      * @since 1.5
 929      */
 930     public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
 931       = new UnicodeBlock(0x27C0, 0x27EF,
 932                          "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
 933                          "Miscellaneous Mathematical Symbols-A");
 934
 935     /**
 936      * Supplemental Arrows-A.
 937      * 0x27F0 - 0x27FF.
 938      * @since 1.5
 939      */
 940     public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
 941       = new UnicodeBlock(0x27F0, 0x27FF,
 942                          "SUPPLEMENTAL_ARROWS_A",
 943                          "Supplemental Arrows-A");
 944
 945     /**
 946      * Braille Patterns.
 947      * 0x2800 - 0x28FF.
 948      * @since 1.4
 949      */
 950     public static final UnicodeBlock BRAILLE_PATTERNS
 951       = new UnicodeBlock(0x2800, 0x28FF,
 952                          "BRAILLE_PATTERNS",
 953                          "Braille Patterns");
 954
 955     /**
 956      * Supplemental Arrows-B.
 957      * 0x2900 - 0x297F.
 958      * @since 1.5
 959      */
 960     public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
 961       = new UnicodeBlock(0x2900, 0x297F,
 962                          "SUPPLEMENTAL_ARROWS_B",
 963                          "Supplemental Arrows-B");
 964
 965     /**
 966      * Miscellaneous Mathematical Symbols-B.
 967      * 0x2980 - 0x29FF.
 968      * @since 1.5
 969      */
 970     public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
 971       = new UnicodeBlock(0x2980, 0x29FF,
 972                          "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
 973                          "Miscellaneous Mathematical Symbols-B");
 974
 975     /**
 976      * Supplemental Mathematical Operators.
 977      * 0x2A00 - 0x2AFF.
 978      * @since 1.5
 979      */
 980     public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
 981       = new UnicodeBlock(0x2A00, 0x2AFF,
 982                          "SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
 983                          "Supplemental Mathematical Operators");
 984
 985     /**
 986      * Miscellaneous Symbols and Arrows.
 987      * 0x2B00 - 0x2BFF.
 988      * @since 1.5
 989      */
 990     public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
 991       = new UnicodeBlock(0x2B00, 0x2BFF,
 992                          "MISCELLANEOUS_SYMBOLS_AND_ARROWS",
 993                          "Miscellaneous Symbols and Arrows");
 994
 995     /**
 996      * CJK Radicals Supplement.
 997      * 0x2E80 - 0x2EFF.
 998      * @since 1.4
 999      */
1000     public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
1001       = new UnicodeBlock(0x2E80, 0x2EFF,
1002                          "CJK_RADICALS_SUPPLEMENT",
1003                          "CJK Radicals Supplement");
1004
1005     /**
1006      * Kangxi Radicals.
1007      * 0x2F00 - 0x2FDF.
1008      * @since 1.4
1009      */
1010     public static final UnicodeBlock KANGXI_RADICALS
1011       = new UnicodeBlock(0x2F00, 0x2FDF,
1012                          "KANGXI_RADICALS",
1013                          "Kangxi Radicals");
1014
1015     /**
1016      * Ideographic Description Characters.
1017      * 0x2FF0 - 0x2FFF.
1018      * @since 1.4
1019      */
1020     public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
1021       = new UnicodeBlock(0x2FF0, 0x2FFF,
1022                          "IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
1023                          "Ideographic Description Characters");
1024
1025     /**
1026      * CJK Symbols and Punctuation.
1027      * 0x3000 - 0x303F.
1028      */
1029     public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
1030       = new UnicodeBlock(0x3000, 0x303F,
1031                          "CJK_SYMBOLS_AND_PUNCTUATION",
1032                          "CJK Symbols and Punctuation");
1033
1034     /**
1035      * Hiragana.
1036      * 0x3040 - 0x309F.
1037      */
1038     public static final UnicodeBlock HIRAGANA
1039       = new UnicodeBlock(0x3040, 0x309F,
1040                          "HIRAGANA",
1041                          "Hiragana");
1042
1043     /**
1044      * Katakana.
1045      * 0x30A0 - 0x30FF.
1046      */
1047     public static final UnicodeBlock KATAKANA
1048       = new UnicodeBlock(0x30A0, 0x30FF,
1049                          "KATAKANA",
1050                          "Katakana");
1051
1052     /**
1053      * Bopomofo.
1054      * 0x3100 - 0x312F.
1055      */
1056     public static final UnicodeBlock BOPOMOFO
1057       = new UnicodeBlock(0x3100, 0x312F,
1058                          "BOPOMOFO",
1059                          "Bopomofo");
1060
1061     /**
1062      * Hangul Compatibility Jamo.
1063      * 0x3130 - 0x318F.
1064      */
1065     public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
1066       = new UnicodeBlock(0x3130, 0x318F,
1067                          "HANGUL_COMPATIBILITY_JAMO",
1068                          "Hangul Compatibility Jamo");
1069
1070     /**
1071      * Kanbun.
1072      * 0x3190 - 0x319F.
1073      */
1074     public static final UnicodeBlock KANBUN
1075       = new UnicodeBlock(0x3190, 0x319F,
1076                          "KANBUN",
1077                          "Kanbun");
1078
1079     /**
1080      * Bopomofo Extended.
1081      * 0x31A0 - 0x31BF.
1082      * @since 1.4
1083      */
1084     public static final UnicodeBlock BOPOMOFO_EXTENDED
1085       = new UnicodeBlock(0x31A0, 0x31BF,
1086                          "BOPOMOFO_EXTENDED",
1087                          "Bopomofo Extended");
1088
1089     /**
1090      * Katakana Phonetic Extensions.
1091      * 0x31F0 - 0x31FF.
1092      * @since 1.5
1093      */
1094     public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
1095       = new UnicodeBlock(0x31F0, 0x31FF,
1096                          "KATAKANA_PHONETIC_EXTENSIONS",
1097                          "Katakana Phonetic Extensions");
1098
1099     /**
1100      * Enclosed CJK Letters and Months.
1101      * 0x3200 - 0x32FF.
1102      */
1103     public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
1104       = new UnicodeBlock(0x3200, 0x32FF,
1105                          "ENCLOSED_CJK_LETTERS_AND_MONTHS",
1106                          "Enclosed CJK Letters and Months");
1107
1108     /**
1109      * CJK Compatibility.
1110      * 0x3300 - 0x33FF.
1111      */
1112     public static final UnicodeBlock CJK_COMPATIBILITY
1113       = new UnicodeBlock(0x3300, 0x33FF,
1114                          "CJK_COMPATIBILITY",
1115                          "CJK Compatibility");
1116
1117     /**
1118      * CJK Unified Ideographs Extension A.
1119      * 0x3400 - 0x4DBF.
1120      * @since 1.4
1121      */
1122     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1123       = new UnicodeBlock(0x3400, 0x4DBF,
1124                          "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
1125                          "CJK Unified Ideographs Extension A");
1126
1127     /**
1128      * Yijing Hexagram Symbols.
1129      * 0x4DC0 - 0x4DFF.
1130      * @since 1.5
1131      */
1132     public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
1133       = new UnicodeBlock(0x4DC0, 0x4DFF,
1134                          "YIJING_HEXAGRAM_SYMBOLS",
1135                          "Yijing Hexagram Symbols");
1136
1137     /**
1138      * CJK Unified Ideographs.
1139      * 0x4E00 - 0x9FFF.
1140      */
1141     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
1142       = new UnicodeBlock(0x4E00, 0x9FFF,
1143                          "CJK_UNIFIED_IDEOGRAPHS",
1144                          "CJK Unified Ideographs");
1145
1146     /**
1147      * Yi Syllables.
1148      * 0xA000 - 0xA48F.
1149      * @since 1.4
1150      */
1151     public static final UnicodeBlock YI_SYLLABLES
1152       = new UnicodeBlock(0xA000, 0xA48F,
1153                          "YI_SYLLABLES",
1154                          "Yi Syllables");
1155
1156     /**
1157      * Yi Radicals.
1158      * 0xA490 - 0xA4CF.
1159      * @since 1.4
1160      */
1161     public static final UnicodeBlock YI_RADICALS
1162       = new UnicodeBlock(0xA490, 0xA4CF,
1163                          "YI_RADICALS",
1164                          "Yi Radicals");
1165
1166     /**
1167      * Hangul Syllables.
1168      * 0xAC00 - 0xD7AF.
1169      */
1170     public static final UnicodeBlock HANGUL_SYLLABLES
1171       = new UnicodeBlock(0xAC00, 0xD7AF,
1172                          "HANGUL_SYLLABLES",
1173                          "Hangul Syllables");
1174
1175     /**
1176      * High Surrogates.
1177      * 0xD800 - 0xDB7F.
1178      * @since 1.5
1179      */
1180     public static final UnicodeBlock HIGH_SURROGATES
1181       = new UnicodeBlock(0xD800, 0xDB7F,
1182                          "HIGH_SURROGATES",
1183                          "High Surrogates");
1184
1185     /**
1186      * High Private Use Surrogates.
1187      * 0xDB80 - 0xDBFF.
1188      * @since 1.5
1189      */
1190     public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
1191       = new UnicodeBlock(0xDB80, 0xDBFF,
1192                          "HIGH_PRIVATE_USE_SURROGATES",
1193                          "High Private Use Surrogates");
1194
1195     /**
1196      * Low Surrogates.
1197      * 0xDC00 - 0xDFFF.
1198      * @since 1.5
1199      */
1200     public static final UnicodeBlock LOW_SURROGATES
1201       = new UnicodeBlock(0xDC00, 0xDFFF,
1202                          "LOW_SURROGATES",
1203                          "Low Surrogates");
1204
1205     /**
1206      * Private Use Area.
1207      * 0xE000 - 0xF8FF.
1208      */
1209     public static final UnicodeBlock PRIVATE_USE_AREA
1210       = new UnicodeBlock(0xE000, 0xF8FF,
1211                          "PRIVATE_USE_AREA",
1212                          "Private Use Area");
1213
1214     /**
1215      * CJK Compatibility Ideographs.
1216      * 0xF900 - 0xFAFF.
1217      */
1218     public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
1219       = new UnicodeBlock(0xF900, 0xFAFF,
1220                          "CJK_COMPATIBILITY_IDEOGRAPHS",
1221                          "CJK Compatibility Ideographs");
1222
1223     /**
1224      * Alphabetic Presentation Forms.
1225      * 0xFB00 - 0xFB4F.
1226      */
1227     public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
1228       = new UnicodeBlock(0xFB00, 0xFB4F,
1229                          "ALPHABETIC_PRESENTATION_FORMS",
1230                          "Alphabetic Presentation Forms");
1231
1232     /**
1233      * Arabic Presentation Forms-A.
1234      * 0xFB50 - 0xFDFF.
1235      */
1236     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
1237       = new UnicodeBlock(0xFB50, 0xFDFF,
1238                          "ARABIC_PRESENTATION_FORMS_A",
1239                          "Arabic Presentation Forms-A");
1240
1241     /**
1242      * Variation Selectors.
1243      * 0xFE00 - 0xFE0F.
1244      * @since 1.5
1245      */
1246     public static final UnicodeBlock VARIATION_SELECTORS
1247       = new UnicodeBlock(0xFE00, 0xFE0F,
1248                          "VARIATION_SELECTORS",
1249                          "Variation Selectors");
1250
1251     /**
1252      * Combining Half Marks.
1253      * 0xFE20 - 0xFE2F.
1254      */
1255     public static final UnicodeBlock COMBINING_HALF_MARKS
1256       = new UnicodeBlock(0xFE20, 0xFE2F,
1257                          "COMBINING_HALF_MARKS",
1258                          "Combining Half Marks");
1259
1260     /**
1261      * CJK Compatibility Forms.
1262      * 0xFE30 - 0xFE4F.
1263      */
1264     public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
1265       = new UnicodeBlock(0xFE30, 0xFE4F,
1266                          "CJK_COMPATIBILITY_FORMS",
1267                          "CJK Compatibility Forms");
1268
1269     /**
1270      * Small Form Variants.
1271      * 0xFE50 - 0xFE6F.
1272      */
1273     public static final UnicodeBlock SMALL_FORM_VARIANTS
1274       = new UnicodeBlock(0xFE50, 0xFE6F,
1275                          "SMALL_FORM_VARIANTS",
1276                          "Small Form Variants");
1277
1278     /**
1279      * Arabic Presentation Forms-B.
1280      * 0xFE70 - 0xFEFF.
1281      */
1282     public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
1283       = new UnicodeBlock(0xFE70, 0xFEFF,
1284                          "ARABIC_PRESENTATION_FORMS_B",
1285                          "Arabic Presentation Forms-B");
1286
1287     /**
1288      * Halfwidth and Fullwidth Forms.
1289      * 0xFF00 - 0xFFEF.
1290      */
1291     public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
1292       = new UnicodeBlock(0xFF00, 0xFFEF,
1293                          "HALFWIDTH_AND_FULLWIDTH_FORMS",
1294                          "Halfwidth and Fullwidth Forms");
1295
1296     /**
1297      * Specials.
1298      * 0xFFF0 - 0xFFFF.
1299      */
1300     public static final UnicodeBlock SPECIALS
1301       = new UnicodeBlock(0xFFF0, 0xFFFF,
1302                          "SPECIALS",
1303                          "Specials");
1304
1305     /**
1306      * Linear B Syllabary.
1307      * 0x10000 - 0x1007F.
1308      * @since 1.5
1309      */
1310     public static final UnicodeBlock LINEAR_B_SYLLABARY
1311       = new UnicodeBlock(0x10000, 0x1007F,
1312                          "LINEAR_B_SYLLABARY",
1313                          "Linear B Syllabary");
1314
1315     /**
1316      * Linear B Ideograms.
1317      * 0x10080 - 0x100FF.
1318      * @since 1.5
1319      */
1320     public static final UnicodeBlock LINEAR_B_IDEOGRAMS
1321       = new UnicodeBlock(0x10080, 0x100FF,
1322                          "LINEAR_B_IDEOGRAMS",
1323                          "Linear B Ideograms");
1324
1325     /**
1326      * Aegean Numbers.
1327      * 0x10100 - 0x1013F.
1328      * @since 1.5
1329      */
1330     public static final UnicodeBlock AEGEAN_NUMBERS
1331       = new UnicodeBlock(0x10100, 0x1013F,
1332                          "AEGEAN_NUMBERS",
1333                          "Aegean Numbers");
1334
1335     /**
1336      * Old Italic.
1337      * 0x10300 - 0x1032F.
1338      * @since 1.5
1339      */
1340     public static final UnicodeBlock OLD_ITALIC
1341       = new UnicodeBlock(0x10300, 0x1032F,
1342                          "OLD_ITALIC",
1343                          "Old Italic");
1344
1345     /**
1346      * Gothic.
1347      * 0x10330 - 0x1034F.
1348      * @since 1.5
1349      */
1350     public static final UnicodeBlock GOTHIC
1351       = new UnicodeBlock(0x10330, 0x1034F,
1352                          "GOTHIC",
1353                          "Gothic");
1354
1355     /**
1356      * Ugaritic.
1357      * 0x10380 - 0x1039F.
1358      * @since 1.5
1359      */
1360     public static final UnicodeBlock UGARITIC
1361       = new UnicodeBlock(0x10380, 0x1039F,
1362                          "UGARITIC",
1363                          "Ugaritic");
1364
1365     /**
1366      * Deseret.
1367      * 0x10400 - 0x1044F.
1368      * @since 1.5
1369      */
1370     public static final UnicodeBlock DESERET
1371       = new UnicodeBlock(0x10400, 0x1044F,
1372                          "DESERET",
1373                          "Deseret");
1374
1375     /**
1376      * Shavian.
1377      * 0x10450 - 0x1047F.
1378      * @since 1.5
1379      */
1380     public static final UnicodeBlock SHAVIAN
1381       = new UnicodeBlock(0x10450, 0x1047F,
1382                          "SHAVIAN",
1383                          "Shavian");
1384
1385     /**
1386      * Osmanya.
1387      * 0x10480 - 0x104AF.
1388      * @since 1.5
1389      */
1390     public static final UnicodeBlock OSMANYA
1391       = new UnicodeBlock(0x10480, 0x104AF,
1392                          "OSMANYA",
1393                          "Osmanya");
1394
1395     /**
1396      * Cypriot Syllabary.
1397      * 0x10800 - 0x1083F.
1398      * @since 1.5
1399      */
1400     public static final UnicodeBlock CYPRIOT_SYLLABARY
1401       = new UnicodeBlock(0x10800, 0x1083F,
1402                          "CYPRIOT_SYLLABARY",
1403                          "Cypriot Syllabary");
1404
1405     /**
1406      * Byzantine Musical Symbols.
1407      * 0x1D000 - 0x1D0FF.
1408      * @since 1.5
1409      */
1410     public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
1411       = new UnicodeBlock(0x1D000, 0x1D0FF,
1412                          "BYZANTINE_MUSICAL_SYMBOLS",
1413                          "Byzantine Musical Symbols");
1414
1415     /**
1416      * Musical Symbols.
1417      * 0x1D100 - 0x1D1FF.
1418      * @since 1.5
1419      */
1420     public static final UnicodeBlock MUSICAL_SYMBOLS
1421       = new UnicodeBlock(0x1D100, 0x1D1FF,
1422                          "MUSICAL_SYMBOLS",
1423                          "Musical Symbols");
1424
1425     /**
1426      * Tai Xuan Jing Symbols.
1427      * 0x1D300 - 0x1D35F.
1428      * @since 1.5
1429      */
1430     public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
1431       = new UnicodeBlock(0x1D300, 0x1D35F,
1432                          "TAI_XUAN_JING_SYMBOLS",
1433                          "Tai Xuan Jing Symbols");
1434
1435     /**
1436      * Mathematical Alphanumeric Symbols.
1437      * 0x1D400 - 0x1D7FF.
1438      * @since 1.5
1439      */
1440     public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
1441       = new UnicodeBlock(0x1D400, 0x1D7FF,
1442                          "MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
1443                          "Mathematical Alphanumeric Symbols");
1444
1445     /**
1446      * CJK Unified Ideographs Extension B.
1447      * 0x20000 - 0x2A6DF.
1448      * @since 1.5
1449      */
1450     public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1451       = new UnicodeBlock(0x20000, 0x2A6DF,
1452                          "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
1453                          "CJK Unified Ideographs Extension B");
1454
1455     /**
1456      * CJK Compatibility Ideographs Supplement.
1457      * 0x2F800 - 0x2FA1F.
1458      * @since 1.5
1459      */
1460     public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
1461       = new UnicodeBlock(0x2F800, 0x2FA1F,
1462                          "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
1463                          "CJK Compatibility Ideographs Supplement");
1464
1465     /**
1466      * Tags.
1467      * 0xE0000 - 0xE007F.
1468      * @since 1.5
1469      */
1470     public static final UnicodeBlock TAGS
1471       = new UnicodeBlock(0xE0000, 0xE007F,
1472                          "TAGS",
1473                          "Tags");
1474
1475     /**
1476      * Variation Selectors Supplement.
1477      * 0xE0100 - 0xE01EF.
1478      * @since 1.5
1479      */
1480     public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
1481       = new UnicodeBlock(0xE0100, 0xE01EF,
1482                          "VARIATION_SELECTORS_SUPPLEMENT",
1483                          "Variation Selectors Supplement");
1484
1485     /**
1486      * Supplementary Private Use Area-A.
1487      * 0xF0000 - 0xFFFFF.
1488      * @since 1.5
1489      */
1490     public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
1491       = new UnicodeBlock(0xF0000, 0xFFFFF,
1492                          "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
1493                          "Supplementary Private Use Area-A");
1494
1495     /**
1496      * Supplementary Private Use Area-B.
1497      * 0x100000 - 0x10FFFF.
1498      * @since 1.5
1499      */
1500     public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
1501       = new UnicodeBlock(0x100000, 0x10FFFF,
1502                          "SUPPLEMENTARY_PRIVATE_USE_AREA_B",
1503                          "Supplementary Private Use Area-B");
1504
1505     /**
1506      * Surrogates Area.
1507      * 0xD800 - 0xDFFF.
1508      * @deprecated As of 1.5, the three areas,
1509      * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>,
1510      * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a>
1511      * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined
1512      * by the Unicode standard, should be used in preference to
1513      * this.  These are also returned from calls to <code>of(int)</code>
1514      * and <code>of(char)</code>.
1515      */
1516     public static final UnicodeBlock SURROGATES_AREA
1517       = new UnicodeBlock(0xD800, 0xDFFF,
1518                          "SURROGATES_AREA",
1519              "Surrogates Area");
1520
1521     /**
1522      * The defined subsets.
1523      */
1524     private static final UnicodeBlock sets[] = {
1525       BASIC_LATIN,
1526       LATIN_1_SUPPLEMENT,
1527       LATIN_EXTENDED_A,
1528       LATIN_EXTENDED_B,
1529       IPA_EXTENSIONS,
1530       SPACING_MODIFIER_LETTERS,
1531       COMBINING_DIACRITICAL_MARKS,
1532       GREEK,
1533       CYRILLIC,
1534       CYRILLIC_SUPPLEMENTARY,
1535       ARMENIAN,
1536       HEBREW,
1537       ARABIC,
1538       SYRIAC,
1539       THAANA,
1540       DEVANAGARI,
1541       BENGALI,
1542       GURMUKHI,
1543       GUJARATI,
1544       ORIYA,
1545       TAMIL,
1546       TELUGU,
1547       KANNADA,
1548       MALAYALAM,
1549       SINHALA,
1550       THAI,
1551       LAO,
1552       TIBETAN,
1553       MYANMAR,
1554       GEORGIAN,
1555       HANGUL_JAMO,
1556       ETHIOPIC,
1557       CHEROKEE,
1558       UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
1559       OGHAM,
1560       RUNIC,
1561       TAGALOG,
1562       HANUNOO,
1563       BUHID,
1564       TAGBANWA,
1565       KHMER,
1566       MONGOLIAN,
1567       LIMBU,
1568       TAI_LE,
1569       KHMER_SYMBOLS,
1570       PHONETIC_EXTENSIONS,
1571       LATIN_EXTENDED_ADDITIONAL,
1572       GREEK_EXTENDED,
1573       GENERAL_PUNCTUATION,
1574       SUPERSCRIPTS_AND_SUBSCRIPTS,
1575       CURRENCY_SYMBOLS,
1576       COMBINING_MARKS_FOR_SYMBOLS,
1577       LETTERLIKE_SYMBOLS,
1578       NUMBER_FORMS,
1579       ARROWS,
1580       MATHEMATICAL_OPERATORS,
1581       MISCELLANEOUS_TECHNICAL,
1582       CONTROL_PICTURES,
1583       OPTICAL_CHARACTER_RECOGNITION,
1584       ENCLOSED_ALPHANUMERICS,
1585       BOX_DRAWING,
1586       BLOCK_ELEMENTS,
1587       GEOMETRIC_SHAPES,
1588       MISCELLANEOUS_SYMBOLS,
1589       DINGBATS,
1590       MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
1591       SUPPLEMENTAL_ARROWS_A,
1592       BRAILLE_PATTERNS,
1593       SUPPLEMENTAL_ARROWS_B,
1594       MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
1595       SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
1596       MISCELLANEOUS_SYMBOLS_AND_ARROWS,
1597       CJK_RADICALS_SUPPLEMENT,
1598       KANGXI_RADICALS,
1599       IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
1600       CJK_SYMBOLS_AND_PUNCTUATION,
1601       HIRAGANA,
1602       KATAKANA,
1603       BOPOMOFO,
1604       HANGUL_COMPATIBILITY_JAMO,
1605       KANBUN,
1606       BOPOMOFO_EXTENDED,
1607       KATAKANA_PHONETIC_EXTENSIONS,
1608       ENCLOSED_CJK_LETTERS_AND_MONTHS,
1609       CJK_COMPATIBILITY,
1610       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
1611       YIJING_HEXAGRAM_SYMBOLS,
1612       CJK_UNIFIED_IDEOGRAPHS,
1613       YI_SYLLABLES,
1614       YI_RADICALS,
1615       HANGUL_SYLLABLES,
1616       HIGH_SURROGATES,
1617       HIGH_PRIVATE_USE_SURROGATES,
1618       LOW_SURROGATES,
1619       PRIVATE_USE_AREA,
1620       CJK_COMPATIBILITY_IDEOGRAPHS,
1621       ALPHABETIC_PRESENTATION_FORMS,
1622       ARABIC_PRESENTATION_FORMS_A,
1623       VARIATION_SELECTORS,
1624       COMBINING_HALF_MARKS,
1625       CJK_COMPATIBILITY_FORMS,
1626       SMALL_FORM_VARIANTS,
1627       ARABIC_PRESENTATION_FORMS_B,
1628       HALFWIDTH_AND_FULLWIDTH_FORMS,
1629       SPECIALS,
1630       LINEAR_B_SYLLABARY,
1631       LINEAR_B_IDEOGRAMS,
1632       AEGEAN_NUMBERS,
1633       OLD_ITALIC,
1634       GOTHIC,
1635       UGARITIC,
1636       DESERET,
1637       SHAVIAN,
1638       OSMANYA,
1639       CYPRIOT_SYLLABARY,
1640       BYZANTINE_MUSICAL_SYMBOLS,
1641       MUSICAL_SYMBOLS,
1642       TAI_XUAN_JING_SYMBOLS,
1643       MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
1644       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
1645       CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
1646       TAGS,
1647       VARIATION_SELECTORS_SUPPLEMENT,
1648       SUPPLEMENTARY_PRIVATE_USE_AREA_A,
1649       SUPPLEMENTARY_PRIVATE_USE_AREA_B,
1650     };
1651   } // class UnicodeBlock
1652
1653   /**
1654    * Property lookups for code points in the private use blocks of the
1655    * Unicode standard.  This class extends UnassignedCharacters and hides
1656    * getType(), isDefined() and getDirectionality(), whose results differ
1657    * for private use code points; all other lookups are inherited.
1658    * @author Anthony Balkissoon abalkiss at redhat dot com
1659    *
1660    */
1661   private static class PrivateUseCharacters extends UnassignedCharacters
1662   {
1663     /**
1664      * Returns PRIVATE_USE as the type of cp, or UNASSIGNED for the upper 2 code points of a plane.
1665      */
1666     static int getType(int cp)
1667     {
1668       // The upper 2 code points in any plane (low 16 bits 0xfffe and 0xffff)
1669       // are considered unassigned, even in the private-use planes.
1670       if ((cp & 0xffff) >= 0xfffe)
1671         return UnassignedCharacters.getType(cp);
1672       return PRIVATE_USE;
1673     }
1674
1675     /**
1676      * Returns true because private use code points are defined, or false for the upper 2 code points of a plane.
1677      */
1678     static boolean isDefined(int cp)
1679     {
1680       // The upper 2 code points in any plane (low 16 bits 0xfffe and 0xffff)
1681       // are considered unassigned, even in the private-use planes.
1682       if ((cp & 0xffff) >= 0xfffe)
1683         return UnassignedCharacters.isDefined(cp);
1684       return true;
1685     }
1686
1687     /**
1688      * Returns DIRECTIONALITY_LEFT_TO_RIGHT for cp, or DIRECTIONALITY_UNDEFINED for the upper 2 code points of a plane.
1689      */
1690     static byte getDirectionality(int cp)
1691     {
1692       if ((cp & 0xffff) >= 0xfffe) // upper 2 code points of a plane are treated as unassigned
1693         return UnassignedCharacters.getDirectionality(cp);
1694       return DIRECTIONALITY_LEFT_TO_RIGHT;
1695     }
1696   }
1697
1698   /**
1699    * A class to encompass all the properties of code points that are
1700    * currently undefined in the Unicode standard.  Every lookup returns a
1701    * constant answer, except the case conversions, which return cp itself.
1702    * @author Anthony Balkissoon abalkiss at redhat dot com
1703    */
1704   private static class UnassignedCharacters
1705   {
1706     /**
1707      * Returns -1, since unassigned characters have no digit value.
1708      * @param cp the character
1709      * @param radix the radix (not used)
1710      * @return -1
1711      */
1712     static int digit(int cp, int radix)
1713     {
1714       return -1;
1715     }
1716
1717     /**
1718      * Returns the Unicode directionality property for unassigned
1719      * characters.
1720      * @param cp the character
1721      * @return DIRECTIONALITY_UNDEFINED
1722      */
1723     static byte getDirectionality(int cp)
1724     {
1725       return DIRECTIONALITY_UNDEFINED;
1726     }
1727
1728     /**
1729      * Returns -1, the numeric value for unassigned Unicode characters.
1730      * @param cp the character
1731      * @return -1
1732      */
1733     static int getNumericValue(int cp)
1734     {
1735       return -1;
1736     }
1737
1738     /**
1739      * Returns UNASSIGNED, the type of unassigned Unicode characters.
1740      * @param cp the character
1741      * @return UNASSIGNED
1742      */
1743     static int getType(int cp)
1744     {
1745       return UNASSIGNED;
1746     }
1747
1748     /**
1749      * Returns false to indicate that the character is not defined in the
1750      * Unicode standard.
1751      * @param cp the character
1752      * @return false
1753      */
1754     static boolean isDefined(int cp)
1755     {
1756       return false;
1757     }
1758
1759     /**
1760      * Returns false to indicate that the character is not a digit.
1761      * @param cp the character
1762      * @return false
1763      */
1764     static boolean isDigit(int cp)
1765     {
1766       return false;
1767     }
1768
1769     /**
1770      * Returns false to indicate that the character cannot be ignored
1771      * within an identifier.
1772      * @param cp the character
1773      * @return false
1774      */
1775     static boolean isIdentifierIgnorable(int cp)
1776     {
1777       return false;
1778     }
1779
1780     /**
1781      * Returns false to indicate that the character cannot be part of a
1782      * Java identifier.
1783      * @param cp the character
1784      * @return false
1785      */
1786     static boolean isJavaIdentifierPart(int cp)
1787     {
1788       return false;
1789     }
1790
1791     /**
1792      * Returns false to indicate that the character cannot start a Java identifier.
1793      * NOTE(review): the method name is spelled "Identifer" (sic); confirm callers before renaming.
1794      * @param cp the character
1795      * @return false
1796      */
1797     static boolean isJavaIdentiferStart(int cp)
1798     {
1799       return false;
1800     }
1801
1802     /**
1803      * Returns false to indicate that the character is not a letter.
1804      * @param cp the character
1805      * @return false
1806      */
1807     static boolean isLetter(int cp)
1808     {
1809       return false;
1810     }
1811
1812     /**
1813      * Returns false to indicate that the character is neither a letter
1814      * nor a digit.
1815      * @param cp the character
1816      * @return false
1817      */
1818     static boolean isLetterOrDigit(int cp)
1819     {
1820       return false;
1821     }
1822
1823     /**
1824      * Returns false to indicate that the character is not a lowercase letter.
1825      * @param cp the character
1826      * @return false
1827      */
1828     static boolean isLowerCase(int cp)
1829     {
1830       return false;
1831     }
1832
1833     /**
1834      * Returns false to indicate that the character is not mirrored.
1835      * @param cp the character
1836      * @return false
1837      */
1838     static boolean isMirrored(int cp)
1839     {
1840       return false;
1841     }
1842
1843     /**
1844      * Returns false to indicate that the character is not a space character.
1845      * @param cp the character
1846      * @return false
1847      */
1848     static boolean isSpaceChar(int cp)
1849     {
1850       return false;
1851     }
1852
1853     /**
1854      * Returns false to indicate that the character is not a titlecase letter.
1855      * @param cp the character
1856      * @return false
1857      */
1858     static boolean isTitleCase(int cp)
1859     {
1860       return false;
1861     }
1862
1863     /**
1864      * Returns false to indicate that the character cannot be part of a
1865      * Unicode identifier.
1866      * @param cp the character
1867      * @return false
1868      */
1869     static boolean isUnicodeIdentifierPart(int cp)
1870     {
1871       return false;
1872     }
1873
1874     /**
1875      * Returns false to indicate that the character cannot start a
1876      * Unicode identifier.
1877      * @param cp the character
1878      * @return false
1879      */
1880     static boolean isUnicodeIdentifierStart(int cp)
1881     {
1882       return false;
1883     }
1884
1885     /**
1886      * Returns false to indicate that the character is not an uppercase letter.
1887      * @param cp the character
1888      * @return false
1889      */
1890     static boolean isUpperCase(int cp)
1891     {
1892       return false;
1893     }
1894
1895     /**
1896      * Returns false to indicate that the character is not a whitespace
1897      * character.
1898      * @param cp the character
1899      * @return false
1900      */
1901     static boolean isWhiteSpace(int cp)
1902     {
1903       return false;
1904     }
1905
1906     /**
1907      * Returns cp to indicate this character has no lowercase conversion.
1908      * @param cp the character
1909      * @return cp
1910      */
1911     static int toLowerCase(int cp)
1912     {
1913       return cp;
1914     }
1915
1916     /**
1917      * Returns cp to indicate this character has no titlecase conversion.
1918      * @param cp the character
1919      * @return cp
1920      */
1921     static int toTitleCase(int cp)
1922     {
1923       return cp;
1924     }
1925
1926     /**
1927      * Returns cp to indicate this character has no uppercase conversion.
1928      * @param cp the character
1929      * @return cp
1930      */
1931     static int toUpperCase(int cp)
1932     {
1933       return cp;
1934     }
1935   }
1936
1937   /**
1938    * The immutable value of this Character.
1939    *
1940    * @serial the value of this Character
1941    */
1942   private final char value;
1943
1944   /**
1945    * Compatible with JDK 1.0+.
1946    */
1947   private static final long serialVersionUID = 3786198910865385080L;
1948
1949   /**
1950    * Smallest value allowed for radix arguments in Java. This value is 2.
1951    *
1952    * @see #digit(char, int)
1953    * @see #forDigit(int, int)
1954    * @see Integer#toString(int, int)
1955    * @see Integer#valueOf(String)
1956    */
1957   public static final int MIN_RADIX = 2;
1958
1959   /**
1960    * Largest value allowed for radix arguments in Java. This value is 36.
1961    *
1962    * @see #digit(char, int)
1963    * @see #forDigit(int, int)
1964    * @see Integer#toString(int, int)
1965    * @see Integer#valueOf(String)
1966    */
1967   public static final int MAX_RADIX = 36;
1968
1969   /**
1970    * The minimum value the char data type can hold.
1971    * This value is <code>'\\u0000'</code>.
1972    */
1973   public static final char MIN_VALUE = '\u0000';
1974
1975   /**
1976    * The maximum value the char data type can hold.
1977    * This value is <code>'\\uFFFF'</code>.
1978    */
1979   public static final char MAX_VALUE = '\uFFFF';
1980
1981   /**
1982    * Class object representing the primitive char data type.
1983    * Obtained from <code>VMClassLoader.getPrimitiveClass('C')</code>.
1984    * @since 1.1
1985    */
1986   public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');
1987
1988   /**
1989    * The number of bits needed to represent a <code>char</code>. This value is 16.
1990    * @since 1.5
1991    */
1992   public static final int SIZE = 16;
1993
1994   // This caches some Character values, and is used by boxing
1995   // conversions via valueOf().  We must cache at least 0..127;
1996   // MAX_CACHE controls how much we actually cache.
1997   private static final int MAX_CACHE = 127;
1998   private static Character[] charCache = new Character[MAX_CACHE + 1]; // slots 0..MAX_CACHE
1999
2000   /**
2001    * Lu = Letter, Uppercase (Informative).
2002    * Tested by isUpperCase(int) and isLetter(int).
2003    * @since 1.1
2004    */
2005   public static final byte UPPERCASE_LETTER = 1;
2006
2007   /**
2008    * Ll = Letter, Lowercase (Informative).
2009    * Tested by isLowerCase(int) and isLetter(int).
2010    * @since 1.1
2011    */
2012   public static final byte LOWERCASE_LETTER = 2;
2013
2014   /**
2015    * Lt = Letter, Titlecase (Informative).
2016    * Tested by isTitleCase(int) and isLetter(int).
2017    * @since 1.1
2018    */
2019   public static final byte TITLECASE_LETTER = 3;
2020
2021   /**
2022    * Mn = Mark, Non-Spacing (Normative).
2023    *
2024    * @since 1.1
2025    */
2026   public static final byte NON_SPACING_MARK = 6;
2027
2028   /**
2029    * Mc = Mark, Spacing Combining (Normative).
2030    *
2031    * @since 1.1
2032    */
2033   public static final byte COMBINING_SPACING_MARK = 8;
2034
2035   /**
2036    * Me = Mark, Enclosing (Normative).
2037    *
2038    * @since 1.1
2039    */
2040   public static final byte ENCLOSING_MARK = 7;
2041
2042   /**
2043    * Nd = Number, Decimal Digit (Normative).
2044    * Tested by isDigit(int).
2045    * @since 1.1
2046    */
2047   public static final byte DECIMAL_DIGIT_NUMBER = 9;
2048
2049   /**
2050    * Nl = Number, Letter (Normative).
2051    *
2052    * @since 1.1
2053    */
2054   public static final byte LETTER_NUMBER = 10;
2055
2056   /**
2057    * No = Number, Other (Normative).
2058    *
2059    * @since 1.1
2060    */
2061   public static final byte OTHER_NUMBER = 11;
2062
2063   /**
2064    * Zs = Separator, Space (Normative).
2065    *
2066    * @since 1.1
2067    */
2068   public static final byte SPACE_SEPARATOR = 12;
2069
2070   /**
2071    * Zl = Separator, Line (Normative).
2072    *
2073    * @since 1.1
2074    */
2075   public static final byte LINE_SEPARATOR = 13;
2076
2077   /**
2078    * Zp = Separator, Paragraph (Normative).
2079    *
2080    * @since 1.1
2081    */
2082   public static final byte PARAGRAPH_SEPARATOR = 14;
2083
2084   /**
2085    * Cc = Other, Control (Normative).
2086    *
2087    * @since 1.1
2088    */
2089   public static final byte CONTROL = 15;
2090
2091   /**
2092    * Cf = Other, Format (Normative).
2093    *
2094    * @since 1.1
2095    */
2096   public static final byte FORMAT = 16;
2097
2098   /**
2099    * Cs = Other, Surrogate (Normative).
2100    *
2101    * @since 1.1
2102    */
2103   public static final byte SURROGATE = 19;
2104
2105   /**
2106    * Co = Other, Private Use (Normative).
2107    *
2108    * @since 1.1
2109    */
2110   public static final byte PRIVATE_USE = 18;
2111
2112   /**
2113    * Cn = Other, Not Assigned (Normative).
2114    * isDefined(int) is true for every type except this one.
2115    * @since 1.1
2116    */
2117   public static final byte UNASSIGNED = 0;
2118
2119   /**
2120    * Lm = Letter, Modifier (Informative).
2121    * Tested by isLetter(int).
2122    * @since 1.1
2123    */
2124   public static final byte MODIFIER_LETTER = 4;
2125
2126   /**
2127    * Lo = Letter, Other (Informative).
2128    * Tested by isLetter(int).
2129    * @since 1.1
2130    */
2131   public static final byte OTHER_LETTER = 5;
2132
2133   /**
2134    * Pc = Punctuation, Connector (Informative).
2135    *
2136    * @since 1.1
2137    */
2138   public static final byte CONNECTOR_PUNCTUATION = 23;
2139
2140   /**
2141    * Pd = Punctuation, Dash (Informative).
2142    *
2143    * @since 1.1
2144    */
2145   public static final byte DASH_PUNCTUATION = 20;
2146
2147   /**
2148    * Ps = Punctuation, Open (Informative).
2149    *
2150    * @since 1.1
2151    */
2152   public static final byte START_PUNCTUATION = 21;
2153
2154   /**
2155    * Pe = Punctuation, Close (Informative).
2156    *
2157    * @since 1.1
2158    */
2159   public static final byte END_PUNCTUATION = 22;
2160
2161   /**
2162    * Pi = Punctuation, Initial Quote (Informative).
2163    *
2164    * @since 1.4
2165    */
2166   public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
2167
2168   /**
2169    * Pf = Punctuation, Final Quote (Informative).
2170    *
2171    * @since 1.4
2172    */
2173   public static final byte FINAL_QUOTE_PUNCTUATION = 30;
2174
2175   /**
2176    * Po = Punctuation, Other (Informative).
2177    *
2178    * @since 1.1
2179    */
2180   public static final byte OTHER_PUNCTUATION = 24;
2181
2182   /**
2183    * Sm = Symbol, Math (Informative).
2184    *
2185    * @since 1.1
2186    */
2187   public static final byte MATH_SYMBOL = 25;
2188
2189   /**
2190    * Sc = Symbol, Currency (Informative).
2191    *
2192    * @since 1.1
2193    */
2194   public static final byte CURRENCY_SYMBOL = 26;
2195
2196   /**
2197    * Sk = Symbol, Modifier (Informative).
2198    *
2199    * @since 1.1
2200    */
2201   public static final byte MODIFIER_SYMBOL = 27;
2202
2203   /**
2204    * So = Symbol, Other (Informative).
2205    *
2206    * @since 1.1
2207    */
2208   public static final byte OTHER_SYMBOL = 28;
2209
2210   /**
2211    * Undefined bidirectional character type. Undefined char values have
2212    * undefined directionality in the Unicode specification.
2213    *
2214    * @since 1.4
2215    */
2216   public static final byte DIRECTIONALITY_UNDEFINED = -1;
2217
2218   /**
2219    * Strong bidirectional character type "L".
2220    *
2221    * @since 1.4
2222    */
2223   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
2224
2225   /**
2226    * Strong bidirectional character type "R".
2227    *
2228    * @since 1.4
2229    */
2230   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
2231
2232   /**
2233    * Strong bidirectional character type "AL".
2234    *
2235    * @since 1.4
2236    */
2237   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
2238
2239   /**
2240    * Weak bidirectional character type "EN".
2241    *
2242    * @since 1.4
2243    */
2244   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
2245
2246   /**
2247    * Weak bidirectional character type "ES".
2248    *
2249    * @since 1.4
2250    */
2251   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
2252
2253   /**
2254    * Weak bidirectional character type "ET".
2255    *
2256    * @since 1.4
2257    */
2258   public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
2259
2260   /**
2261    * Weak bidirectional character type "AN".
2262    *
2263    * @since 1.4
2264    */
2265   public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
2266
2267   /**
2268    * Weak bidirectional character type "CS".
2269    *
2270    * @since 1.4
2271    */
2272   public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
2273
2274   /**
2275    * Weak bidirectional character type "NSM".
2276    *
2277    * @since 1.4
2278    */
2279   public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
2280
2281   /**
2282    * Weak bidirectional character type "BN".
2283    *
2284    * @since 1.4
2285    */
2286   public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
2287
2288   /**
2289    * Neutral bidirectional character type "B".
2290    *
2291    * @since 1.4
2292    */
2293   public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
2294
2295   /**
2296    * Neutral bidirectional character type "S".
2297    *
2298    * @since 1.4
2299    */
2300   public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
2301
2302   /**
2303    * Neutral bidirectional character type "WS".
2304    *
2305    * @since 1.4
2306    */
2307   public static final byte DIRECTIONALITY_WHITESPACE = 12;
2308
2309   /**
2310    * Neutral bidirectional character type "ON".
2311    *
2312    * @since 1.4
2313    */
2314   public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
2315
2316   /**
2317    * Strong bidirectional character type "LRE".
2318    *
2319    * @since 1.4
2320    */
2321   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
2322
2323   /**
2324    * Strong bidirectional character type "LRO".
2325    *
2326    * @since 1.4
2327    */
2328   public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
2329
2330   /**
2331    * Strong bidirectional character type "RLE".
2332    *
2333    * @since 1.4
2334    */
2335   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
2336
2337   /**
2338    * Strong bidirectional character type "RLO".
2339    *
2340    * @since 1.4
2341    */
2342   public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
2343
2344   /**
2345    * Weak bidirectional character type "PDF".
2346    *
2347    * @since 1.4
2348    */
2349   public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
2350
2351   /**
2352    * Stores Unicode block offset lookup table. Exploit package visibility of
2353    * String.value to avoid copying the array.
2354    * @see #readCodePoint(int)
2355    * @see CharData#BLOCKS
2356    */
2357   private static final char[][] blocks =
2358     new char[][]{ // 17 rows; readCodePoint picks a row with codePoint >>> 16
2359                  String.zeroBasedStringValue(CharData.BLOCKS[0]),
2360                  String.zeroBasedStringValue(CharData.BLOCKS[1]),
2361                  String.zeroBasedStringValue(CharData.BLOCKS[2]),
2362                  String.zeroBasedStringValue(CharData.BLOCKS[3]),
2363                  String.zeroBasedStringValue(CharData.BLOCKS[4]),
2364                  String.zeroBasedStringValue(CharData.BLOCKS[5]),
2365                  String.zeroBasedStringValue(CharData.BLOCKS[6]),
2366                  String.zeroBasedStringValue(CharData.BLOCKS[7]),
2367                  String.zeroBasedStringValue(CharData.BLOCKS[8]),
2368                  String.zeroBasedStringValue(CharData.BLOCKS[9]),
2369                  String.zeroBasedStringValue(CharData.BLOCKS[10]),
2370                  String.zeroBasedStringValue(CharData.BLOCKS[11]),
2371                  String.zeroBasedStringValue(CharData.BLOCKS[12]),
2372                  String.zeroBasedStringValue(CharData.BLOCKS[13]),
2373                  String.zeroBasedStringValue(CharData.BLOCKS[14]),
2374                  String.zeroBasedStringValue(CharData.BLOCKS[15]),
2375                  String.zeroBasedStringValue(CharData.BLOCKS[16])};
2376
2377   /**
2378    * Stores Unicode attribute offset lookup table. Exploit package visibility
2379    * of String.value to avoid copying the array.
2380    * @see CharData#DATA
2381    */
2382   private static final char[][] data =
2383     new char[][]{ // 17 rows; readCodePoint picks a row with codePoint >>> 16
2384                  String.zeroBasedStringValue(CharData.DATA[0]),
2385                  String.zeroBasedStringValue(CharData.DATA[1]),
2386                  String.zeroBasedStringValue(CharData.DATA[2]),
2387                  String.zeroBasedStringValue(CharData.DATA[3]),
2388                  String.zeroBasedStringValue(CharData.DATA[4]),
2389                  String.zeroBasedStringValue(CharData.DATA[5]),
2390                  String.zeroBasedStringValue(CharData.DATA[6]),
2391                  String.zeroBasedStringValue(CharData.DATA[7]),
2392                  String.zeroBasedStringValue(CharData.DATA[8]),
2393                  String.zeroBasedStringValue(CharData.DATA[9]),
2394                  String.zeroBasedStringValue(CharData.DATA[10]),
2395                  String.zeroBasedStringValue(CharData.DATA[11]),
2396                  String.zeroBasedStringValue(CharData.DATA[12]),
2397                  String.zeroBasedStringValue(CharData.DATA[13]),
2398                  String.zeroBasedStringValue(CharData.DATA[14]),
2399                  String.zeroBasedStringValue(CharData.DATA[15]),
2400                  String.zeroBasedStringValue(CharData.DATA[16])};
2401
2402   /**
2403    * Stores Unicode numeric value attribute table. Exploit package visibility
2404    * of String.value to avoid copying the array.
2405    * @see CharData#NUM_VALUE
2406    */
2407   private static final char[][] numValue =
2408     new char[][]{ // 17 rows, indices 0..16 (same row count as blocks and data)
2409                  String.zeroBasedStringValue(CharData.NUM_VALUE[0]),
2410                  String.zeroBasedStringValue(CharData.NUM_VALUE[1]),
2411                  String.zeroBasedStringValue(CharData.NUM_VALUE[2]),
2412                  String.zeroBasedStringValue(CharData.NUM_VALUE[3]),
2413                  String.zeroBasedStringValue(CharData.NUM_VALUE[4]),
2414                  String.zeroBasedStringValue(CharData.NUM_VALUE[5]),
2415                  String.zeroBasedStringValue(CharData.NUM_VALUE[6]),
2416                  String.zeroBasedStringValue(CharData.NUM_VALUE[7]),
2417                  String.zeroBasedStringValue(CharData.NUM_VALUE[8]),
2418                  String.zeroBasedStringValue(CharData.NUM_VALUE[9]),
2419                  String.zeroBasedStringValue(CharData.NUM_VALUE[10]),
2420                  String.zeroBasedStringValue(CharData.NUM_VALUE[11]),
2421                  String.zeroBasedStringValue(CharData.NUM_VALUE[12]),
2422                  String.zeroBasedStringValue(CharData.NUM_VALUE[13]),
2423                  String.zeroBasedStringValue(CharData.NUM_VALUE[14]),
2424                  String.zeroBasedStringValue(CharData.NUM_VALUE[15]),
2425                  String.zeroBasedStringValue(CharData.NUM_VALUE[16])};
2426
2427   /**
2428    * Stores Unicode uppercase attribute table. Exploit package visibility
2429    * of String.value to avoid copying the array.
2430    * @see CharData#UPPER
2431    */
2432   private static final char[][] upper =
2433     new char[][]{ // 17 rows, indices 0..16 (same row count as blocks and data)
2434                  String.zeroBasedStringValue(CharData.UPPER[0]),
2435                  String.zeroBasedStringValue(CharData.UPPER[1]),
2436                  String.zeroBasedStringValue(CharData.UPPER[2]),
2437                  String.zeroBasedStringValue(CharData.UPPER[3]),
2438                  String.zeroBasedStringValue(CharData.UPPER[4]),
2439                  String.zeroBasedStringValue(CharData.UPPER[5]),
2440                  String.zeroBasedStringValue(CharData.UPPER[6]),
2441                  String.zeroBasedStringValue(CharData.UPPER[7]),
2442                  String.zeroBasedStringValue(CharData.UPPER[8]),
2443                  String.zeroBasedStringValue(CharData.UPPER[9]),
2444                  String.zeroBasedStringValue(CharData.UPPER[10]),
2445                  String.zeroBasedStringValue(CharData.UPPER[11]),
2446                  String.zeroBasedStringValue(CharData.UPPER[12]),
2447                  String.zeroBasedStringValue(CharData.UPPER[13]),
2448                  String.zeroBasedStringValue(CharData.UPPER[14]),
2449                  String.zeroBasedStringValue(CharData.UPPER[15]),
2450                  String.zeroBasedStringValue(CharData.UPPER[16])};
2451
2452   /**
2453    * Stores Unicode lowercase attribute table. Exploit package visibility
2454    * of String.value to avoid copying the array.
2455    * @see CharData#LOWER
2456    */
2457   private static final char[][] lower =
2458     new char[][]{ // 17 rows, indices 0..16 (same row count as blocks and data)
2459                  String.zeroBasedStringValue(CharData.LOWER[0]),
2460                  String.zeroBasedStringValue(CharData.LOWER[1]),
2461                  String.zeroBasedStringValue(CharData.LOWER[2]),
2462                  String.zeroBasedStringValue(CharData.LOWER[3]),
2463                  String.zeroBasedStringValue(CharData.LOWER[4]),
2464                  String.zeroBasedStringValue(CharData.LOWER[5]),
2465                  String.zeroBasedStringValue(CharData.LOWER[6]),
2466                  String.zeroBasedStringValue(CharData.LOWER[7]),
2467                  String.zeroBasedStringValue(CharData.LOWER[8]),
2468                  String.zeroBasedStringValue(CharData.LOWER[9]),
2469                  String.zeroBasedStringValue(CharData.LOWER[10]),
2470                  String.zeroBasedStringValue(CharData.LOWER[11]),
2471                  String.zeroBasedStringValue(CharData.LOWER[12]),
2472                  String.zeroBasedStringValue(CharData.LOWER[13]),
2473                  String.zeroBasedStringValue(CharData.LOWER[14]),
2474                  String.zeroBasedStringValue(CharData.LOWER[15]),
2475                  String.zeroBasedStringValue(CharData.LOWER[16])};
2476
2477   /**
2478    * Stores Unicode direction attribute table. Exploit package visibility
2479    * of String.value to avoid copying the array.
2480    * @see CharData#DIRECTION
2481    */
2482   // Package visible (no access modifier) for use by String.
2483   static final char[][] direction =
2484     new char[][]{ // 17 rows, indices 0..16 (same row count as blocks and data)
2485                  String.zeroBasedStringValue(CharData.DIRECTION[0]),
2486                  String.zeroBasedStringValue(CharData.DIRECTION[1]),
2487                  String.zeroBasedStringValue(CharData.DIRECTION[2]),
2488                  String.zeroBasedStringValue(CharData.DIRECTION[3]),
2489                  String.zeroBasedStringValue(CharData.DIRECTION[4]),
2490                  String.zeroBasedStringValue(CharData.DIRECTION[5]),
2491                  String.zeroBasedStringValue(CharData.DIRECTION[6]),
2492                  String.zeroBasedStringValue(CharData.DIRECTION[7]),
2493                  String.zeroBasedStringValue(CharData.DIRECTION[8]),
2494                  String.zeroBasedStringValue(CharData.DIRECTION[9]),
2495                  String.zeroBasedStringValue(CharData.DIRECTION[10]),
2496                  String.zeroBasedStringValue(CharData.DIRECTION[11]),
2497                  String.zeroBasedStringValue(CharData.DIRECTION[12]),
2498                  String.zeroBasedStringValue(CharData.DIRECTION[13]),
2499                  String.zeroBasedStringValue(CharData.DIRECTION[14]),
2500                  String.zeroBasedStringValue(CharData.DIRECTION[15]),
2501                  String.zeroBasedStringValue(CharData.DIRECTION[16])};
2502
2503   /**
2504    * Stores Unicode titlecase table as one flat array. Exploit package
2505    * visibility of String.value to avoid copying the array.
2506    * @see CharData#TITLE
2507    */
2508   private static final char[] title = String.zeroBasedStringValue(CharData.TITLE);
2509
2510   /**
2511    * Mask for grabbing the type out of the contents of data.
2512    * @see CharData#DATA
2513    */
2514   private static final int TYPE_MASK = 0x1F; // bits 0-4
2515
2516   /**
2517    * Mask for grabbing the non-breaking space flag out of the contents of
2518    * data.
2519    * @see CharData#DATA
2520    */
2521   private static final int NO_BREAK_MASK = 0x20; // bit 5
2522
2523   /**
2524    * Mask for grabbing the mirrored directionality flag out of the contents
2525    * of data.
2526    * @see CharData#DATA
2527    */
2528   private static final int MIRROR_MASK = 0x40; // bit 6
2529
2530   /**
2531    * Min value for supplementary code point.
2532    *
2533    * @since 1.5
2534    */
2535   public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
2536
2537   /**
2538    * Min value for code point.
2539    *
2540    * @since 1.5
2541    */
2542   public static final int MIN_CODE_POINT = 0;
2543
2544
2545   /**
2546    * Max value for code point.
2547    * This value is 0x10FFFF.
2548    * @since 1.5
2549    */
2550   public static final int MAX_CODE_POINT = 0x010ffff;
2551
2552
2553   /**
2554    * Minimum high surrogate code in UTF-16 encoding.
2555    *
2556    * @since 1.5
2557    */
2558   public static final char MIN_HIGH_SURROGATE = '\ud800';
2559
2560   /**
2561    * Maximum high surrogate code in UTF-16 encoding.
2562    *
2563    * @since 1.5
2564    */
2565   public static final char MAX_HIGH_SURROGATE = '\udbff';
2566
2567   /**
2568    * Minimum low surrogate code in UTF-16 encoding.
2569    *
2570    * @since 1.5
2571    */
2572   public static final char MIN_LOW_SURROGATE = '\udc00';
2573
2574   /**
2575    * Maximum low surrogate code in UTF-16 encoding.
2576    *
2577    * @since 1.5
2578    */
2579   public static final char MAX_LOW_SURROGATE = '\udfff';
2580
2581   /**
2582    * Minimum surrogate code in UTF-16 encoding.
2583    * Equal to MIN_HIGH_SURROGATE.
2584    * @since 1.5
2585    */
2586   public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
2587
2588   /**
2589    * Maximum surrogate code in UTF-16 encoding.
2590    * Equal to MAX_LOW_SURROGATE.
2591    * @since 1.5
2592    */
2593   public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
2594
2595   /**
2596    * Grabs an attribute offset from the Unicode attribute database. The lower
2597    * 5 bits are the character type, the next 2 bits are flags, and the top
2598    * 9 bits are the offset into the attribute tables.
2599    * No range check is done here: codePoint >>> 16 must index the 17-row tables.
2600    * @param codePoint the character to look up
2601    * @return the character's attribute offset and type
2602    * @see #TYPE_MASK
2603    * @see #NO_BREAK_MASK
2604    * @see #MIRROR_MASK
2605    * @see CharData#DATA
2606    * @see CharData#SHIFT
2607    */
2608   static char readCodePoint(int codePoint)
2609   {
2610     int plane = codePoint >>> 16; // selects the row of blocks, data and SHIFT
2611     char offset = (char) (codePoint & 0xffff); // low 16 bits of the code point
2612     return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)];
2613   }
2614
2615   /**
2616    * Wraps up a character by storing it in the final field <code>value</code>.
2617    *
2618    * @param value the character to wrap
2619    */
2620   public Character(char value)
2621   {
2622     this.value = value;
2623   }
2624
2625   /**
2626    * Returns the character which has been wrapped by this instance.
2627    *
2628    * @return the character wrapped
2629    */
2630   public char charValue()
2631   {
2632     return value;
2633   }
2634
2635   /**
2636    * Returns the hash code: the numerical value (unsigned) of the wrapped
2637    * character. Range of returned values: 0x0000-0xFFFF.
2638    *
2639    * @return the value of the wrapped character, widened to int
2640    */
2641   public int hashCode()
2642   {
2643     return value;
2644   }
2645
2646   /**
2647    * Determines if an object is equal to this object. This is only true for
2648    * another Character object wrapping the same value.
2649    *
2650    * @param o object to compare; null is allowed and yields false
2651    * @return true if o is a Character with the same value
2652    */
2653   public boolean equals(Object o)
2654   {
2655     return o instanceof Character && value == ((Character) o).value;
2656   }
2657
2658   /**
2659    * Converts the wrapped character into a String.
2660    *
2661    * @return a String containing one character -- the wrapped character
2662    *         of this instance
2663    */
2664   public String toString()
2665   {
2666     // Fresh one-element array goes to String's package constructor: no array copy.
2667     return new String(new char[] { value }, 0, 1, true);
2668   }
2669
2670   /**
2671    * Returns a String of length 1 representing the specified character.
2672    *
2673    * @param ch the character to convert
2674    * @return a String containing only the character ch
2675    * @since 1.4
2676    */
2677   public static String toString(char ch)
2678   {
2679     // Fresh one-element array goes to String's package constructor: no array copy.
2680     return new String(new char[] { ch }, 0, 1, true);
2681   }
2682
2683   /**
2684    * Determines if a character is a Unicode lowercase letter. For example,
2685    * <code>'a'</code> is lowercase.  Returns true if getType() returns
2686    * LOWERCASE_LETTER.
2687    * <br>
2688    * lowercase = [Ll]
2689    *
2690    * @param ch character to test
2691    * @return true if ch is a Unicode lowercase letter, else false
2692    * @see #isUpperCase(char)
2693    * @see #isTitleCase(char)
2694    * @see #toLowerCase(char)
2695    * @see #getType(char)
2696    */
2697   public static boolean isLowerCase(char ch)
2698   {
2699     return isLowerCase((int)ch); // widen to int and use the code point overload
2700   }
2701
2702   /**
2703    * Determines if a character is a Unicode lowercase letter. For example,
2704    * <code>'a'</code> is lowercase.  Returns true if getType() returns
2705    * LOWERCASE_LETTER.
2706    * <br>
2707    * lowercase = [Ll]
2708    *
2709    * @param codePoint code point to test
2710    * @return true if codePoint is a Unicode lowercase letter, else false
2711    * @see #isUpperCase(int)
2712    * @see #isTitleCase(int)
2713    * @see #toLowerCase(char)
2714    * @see #getType(int)
2715    *
2716    * @since 1.5
2717    */
2718   public static boolean isLowerCase(int codePoint)
2719   {
2720     return getType(codePoint) == LOWERCASE_LETTER;
2721   }
2722
2723   /**
2724    * Determines if a character is a Unicode uppercase letter. For example,
2725    * <code>'A'</code> is uppercase.  Returns true if getType() returns
2726    * UPPERCASE_LETTER.
2727    * <br>
2728    * uppercase = [Lu]
2729    *
2730    * @param ch character to test
2731    * @return true if ch is a Unicode uppercase letter, else false
2732    * @see #isLowerCase(char)
2733    * @see #isTitleCase(char)
2734    * @see #toUpperCase(char)
2735    * @see #getType(char)
2736    */
2737   public static boolean isUpperCase(char ch)
2738   {
2739     return isUpperCase((int)ch); // widen to int and use the code point overload
2740   }
2741
2742   /**
2743    * Determines if a character is a Unicode uppercase letter. For example,
2744    * <code>'A'</code> is uppercase.  Returns true if getType() returns
2745    * UPPERCASE_LETTER.
2746    * <br>
2747    * uppercase = [Lu]
2748    *
2749    * @param codePoint code point to test
2750    * @return true if codePoint is a Unicode uppercase letter, else false
2751    * @see #isLowerCase(int)
2752    * @see #isTitleCase(int)
2753    * @see #toUpperCase(char)
2754    * @see #getType(int)
2755    *
2756    * @since 1.5
2757    */
2758   public static boolean isUpperCase(int codePoint)
2759   {
2760     return getType(codePoint) == UPPERCASE_LETTER;
2761   }
2762
2763   /**
2764    * Determines if a character is a Unicode titlecase letter. For example,
2765    * the character "Lj" (Latin capital L with small letter j) is titlecase.
2766    * True if getType() returns TITLECASE_LETTER.
2767    * <br>
2768    * titlecase = [Lt]
2769    *
2770    * @param ch character to test
2771    * @return true if ch is a Unicode titlecase letter, else false
2772    * @see #isLowerCase(char)
2773    * @see #isUpperCase(char)
2774    * @see #toTitleCase(char)
2775    * @see #getType(char)
2776    */
2777   public static boolean isTitleCase(char ch)
2778   {
2779     return isTitleCase((int)ch); // widen to int and use the code point overload
2780   }
2781
2782   /**
2783    * Determines if a character is a Unicode titlecase letter. For example,
2784    * the character "Lj" (Latin capital L with small letter j) is titlecase.
2785    * True if getType() returns TITLECASE_LETTER.
2786    * <br>
2787    * titlecase = [Lt]
2788    *
2789    * @param codePoint code point to test
2790    * @return true if codePoint is a Unicode titlecase letter, else false
2791    * @see #isLowerCase(int)
2792    * @see #isUpperCase(int)
2793    * @see #toTitleCase(char)
2794    * @see #getType(int)
2795    *
2796    * @since 1.5
2797    */
2798   public static boolean isTitleCase(int codePoint)
2799   {
2800     return getType(codePoint) == TITLECASE_LETTER;
2801   }
2802
2803
2804   /**
2805    * Determines if a character is a Unicode decimal digit. For example,
2806    * <code>'0'</code> is a digit.  A character is a Unicode digit if
2807    * getType() returns DECIMAL_DIGIT_NUMBER.
2808    * <br>
2809    * Unicode decimal digit = [Nd]
2810    *
2811    * @param ch character to test
2812    * @return true if ch is a Unicode decimal digit, else false
2813    * @see #digit(char, int)
2814    * @see #forDigit(int, int)
2815    * @see #getType(char)
2816    */
2817   public static boolean isDigit(char ch)
2818   {
2819     return isDigit((int)ch); // widen to int and use the code point overload
2820   }
2821
2822   /**
2823    * Determines if a character is a Unicode decimal digit. For example,
2824    * <code>'0'</code> is a digit. A character is a Unicode digit if
2825    * getType() returns DECIMAL_DIGIT_NUMBER.
2826    * <br>
2827    * Unicode decimal digit = [Nd]
2828    *
2829    * @param codePoint code point to test
2830    * @return true if codePoint is a Unicode decimal digit, else false
2831    * @see #digit(char, int)
2832    * @see #forDigit(int, int)
2833    * @see #getType(int)
2834    *
2835    * @since 1.5
2836    */
2837
2838   public static boolean isDigit(int codePoint)
2839   {
2840     return getType(codePoint) == DECIMAL_DIGIT_NUMBER;
2841   }
2842
2843   /**
2844    * Determines if a character is part of the Unicode Standard. This is an
2845    * evolving standard, but covers every character in the data file.
2846    * <br>
2847    * defined = not [Cn]
2848    *
2849    * @param ch character to test
2850    * @return true if ch is a Unicode character, else false
2851    * @see #isDigit(char)
2852    * @see #isLetter(char)
2853    * @see #isLetterOrDigit(char)
2854    * @see #isLowerCase(char)
2855    * @see #isTitleCase(char)
2856    * @see #isUpperCase(char)
2857    */
2858   public static boolean isDefined(char ch)
2859   {
2860     return isDefined((int)ch); // widen to int and use the code point overload
2861   }
2862
2863   /**
2864    * Determines if a character is part of the Unicode Standard. This is an
2865    * evolving standard, but covers every character in the data file.
2866    * <br>
2867    * defined = not [Cn]
2868    *
2869    * @param codePoint code point to test
2870    * @return true if getType(codePoint) is not UNASSIGNED, else false
2871    * @see #isDigit(int)
2872    * @see #isLetter(int)
2873    * @see #isLetterOrDigit(char)
2874    * @see #isLowerCase(int)
2875    * @see #isTitleCase(int)
2876    * @see #isUpperCase(int)
2877    *
2878    * @since 1.5
2879    */
2880   public static boolean isDefined(int codePoint)
2881   {
2882     return getType(codePoint) != UNASSIGNED;
2883   }
2884
2885   /**
2886    * Determines if a character is a Unicode letter. Not all letters have case,
2887    * so this may return true when isLowerCase and isUpperCase return false.
2888    * A character is a Unicode letter if getType() returns one of
2889    * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
2890    * or OTHER_LETTER.
2891    * <br>
2892    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2893    *
2894    * @param ch character to test
2895    * @return true if ch is a Unicode letter, else false
2896    * @see #isDigit(char)
2897    * @see #isJavaIdentifierStart(char)
2898    * @see #isJavaLetter(char)
2899    * @see #isJavaLetterOrDigit(char)
2900    * @see #isLetterOrDigit(char)
2901    * @see #isLowerCase(char)
2902    * @see #isTitleCase(char)
2903    * @see #isUnicodeIdentifierStart(char)
2904    * @see #isUpperCase(char)
2905    */
2906   public static boolean isLetter(char ch)
2907   {
2908     return isLetter((int)ch); // widen to int and use the code point overload
2909   }
2910
2911   /**
2912    * Determines if a character is a Unicode letter. Not all letters have case,
2913    * so this may return true when isLowerCase and isUpperCase return false.
2914    * A character is a Unicode letter if getType() returns one of
2915    * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
2916    * or OTHER_LETTER.
2917    * <br>
2918    * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2919    *
2920    * @param codePoint code point to test
2921    * @return true if codePoint is a Unicode letter, else false
2922    * @see #isDigit(int)
2923    * @see #isJavaIdentifierStart(char)
2924    * @see #isJavaLetter(char)
2925    * @see #isJavaLetterOrDigit(char)
2926    * @see #isLetterOrDigit(char)
2927    * @see #isLowerCase(int)
2928    * @see #isTitleCase(int)
2929    * @see #isUnicodeIdentifierStart(char)
2930    * @see #isUpperCase(int)
2931    *
2932    * @since 1.5
2933    */
2934   public static boolean isLetter(int codePoint)
2935   {
2936     return ((1 << getType(codePoint)) // single bit at the position of this type
2937         & ((1 << UPPERCASE_LETTER) // mask with one bit per letter category
2938             | (1 << LOWERCASE_LETTER)
2939             | (1 << TITLECASE_LETTER)
2940             | (1 << MODIFIER_LETTER)
2941             | (1 << OTHER_LETTER))) != 0;
2942   }
2943   /**
2944    * Returns the index into the given CharSequence that is offset
2945    * <code>codePointOffset</code> code points from <code>index</code>.
2946    * @param seq the CharSequence
2947    * @param index the start position in the CharSequence
2948    * @param codePointOffset the number of code points offset from the start
2949    * position
2950    * @return the index into the CharSequence that is codePointOffset code
2951    * points offset from index
2952    *
2953    * @throws NullPointerException if seq is null
2954    * @throws IndexOutOfBoundsException if index is negative or greater than the
2955    * length of the sequence.
2956    * @throws IndexOutOfBoundsException if codePointOffset is positive and the
2957    * subsequence from index to the end of seq has fewer than codePointOffset
2958    * code points
2959    * @throws IndexOutOfBoundsException if codePointOffset is negative and the
2960    * subsequence from the start of seq to index has fewer than
2961    * (-codePointOffset) code points
2962    * @since 1.5
2963    */
2964   public static int offsetByCodePoints(CharSequence seq,
2965                                        int index,
2966                                        int codePointOffset)
2967   {
2968     int len = seq.length();
2969     if (index < 0 || index > len)
2970       throw new IndexOutOfBoundsException();
2971
2972     int numToGo = codePointOffset;
2973     int offset = index;
2974     int adjust = 1;
2975     if (numToGo >= 0)
2976       {
2977         for (; numToGo > 0; offset++)
2978           {
2979             numToGo--;
2980             if (Character.isHighSurrogate(seq.charAt(offset))
2981                 && (offset + 1) < len
2982                 && Character.isLowSurrogate(seq.charAt(offset + 1)))
2983               offset++;
2984           }
2985         return offset;
2986       }
2987     else
2988       {
2989         numToGo *= -1;
2990         for (; numToGo > 0;)
2991           {
2992             numToGo--;
2993             offset--;
2994             if (Character.isLowSurrogate(seq.charAt(offset))
2995                 && (offset - 1) >= 0
2996                 && Character.isHighSurrogate(seq.charAt(offset - 1)))
2997               offset--;
2998           }
2999         return offset;
3000       }
3001   }
3002
3003   /**
3004    * Returns the index into the given char subarray that is offset
3005    * <code>codePointOffset</code> code points from <code>index</code>.
3006    * @param a the char array
3007    * @param start the start index of the subarray
3008    * @param count the length of the subarray
3009    * @param index the index to be offset
3010    * @param codePointOffset the number of code points offset from <code>index
3011    * </code>
3012    * @return the index into the char array
3013    *
3014    * @throws NullPointerException if a is null
3015    * @throws IndexOutOfBoundsException if start or count is negative or if
3016    * start + count is greater than the length of the array
3017    * @throws IndexOutOfBoundsException if index is less than start or larger
3018    * than start + count
3019    * @throws IndexOutOfBoundsException if codePointOffset is positive and the
3020    * subarray from index to start + count - 1 has fewer than codePointOffset
3021    * code points.
3022    * @throws IndexOutOfBoundsException if codePointOffset is negative and the
3023    * subarray from start to index - 1 has fewer than (-codePointOffset) code
3024    * points
3025    *
3026    * @since 1.5
3027    */
3028   public static int offsetByCodePoints(char[] a,
3029                                        int start,
3030                                        int count,
3031                                        int index,
3032                                        int codePointOffset)
3033   {
3034     int len = a.length;
3035     int end = start + count;
3036     if (start < 0 || count < 0 || end > len || index < start || index > end)
3037       throw new IndexOutOfBoundsException();
3038
3039     int numToGo = codePointOffset;
3040     int offset = index;
3041     int adjust = 1;
3042     if (numToGo >= 0)
3043       {
3044         for (; numToGo > 0; offset++)
3045           {
3046             numToGo--;
3047             if (Character.isHighSurrogate(a[offset])
3048                 && (offset + 1) < len
3049                 && Character.isLowSurrogate(a[offset + 1]))
3050               offset++;
3051           }
3052         return offset;
3053       }
3054     else
3055       {
3056         numToGo *= -1;
3057         for (; numToGo > 0;)
3058           {
3059             numToGo--;
3060             offset--;
3061             if (Character.isLowSurrogate(a[offset])
3062                 && (offset - 1) >= 0
3063                 && Character.isHighSurrogate(a[offset - 1]))
3064               offset--;
3065             if (offset < start)
3066               throw new IndexOutOfBoundsException();
3067           }
3068         return offset;
3069       }
3070
3071   }
3072
3073   /**
3074    * Returns the number of Unicode code points in the specified range of the
3075    * given CharSequence.  The first char in the range is at position
3076    * beginIndex and the last one is at position endIndex - 1.  Paired
3077    * surrogates (supplementary characters are represented by a pair of chars -
3078    * one from the high surrogates and one from the low surrogates)
3079    * count as just one code point.
3080    * @param seq the CharSequence to inspect
3081    * @param beginIndex the beginning of the range
3082    * @param endIndex the end of the range
3083    * @return the number of Unicode code points in the given range of the
3084    * sequence
3085    * @throws NullPointerException if seq is null
3086    * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is
3087    * larger than the length of seq, or if beginIndex is greater than endIndex.
3088    * @since 1.5
3089    */
3090   public static int codePointCount(CharSequence seq, int beginIndex,
3091                                    int endIndex)
3092   {
3093     int len = seq.length();
3094     if (beginIndex < 0 || endIndex > len || beginIndex > endIndex)
3095       throw new IndexOutOfBoundsException();
3096
3097     int count = 0;
3098     for (int i = beginIndex; i < endIndex; i++)
3099       {
3100         count++;
3101         // If there is a pairing, count it only once.
3102         if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex
3103             && isLowSurrogate(seq.charAt(i + 1)))
3104           i ++;
3105       }
3106     return count;
3107   }
3108
3109   /**
3110    * Returns the number of Unicode code points in the specified range of the
3111    * given char array.  The first char in the range is at position
3112    * offset and the length of the range is count.  Paired surrogates
3113    * (supplementary characters are represented by a pair of chars -
3114    * one from the high surrogates and one from the low surrogates)
3115    * count as just one code point.
3116    * @param a the char array to inspect
3117    * @param offset the beginning of the range
3118    * @param count the length of the range
3119    * @return the number of Unicode code points in the given range of the
3120    * array
3121    * @throws NullPointerException if a is null
3122    * @throws IndexOutOfBoundsException if offset or count is negative or if
3123    * offset + countendIndex is larger than the length of a.
3124    * @since 1.5
3125    */
3126   public static int codePointCount(char[] a, int offset,
3127                                    int count)
3128   {
3129     int len = a.length;
3130     int end = offset + count;
3131     if (offset < 0 || count < 0 || end > len)
3132       throw new IndexOutOfBoundsException();
3133
3134     int counter = 0;
3135     for (int i = offset; i < end; i++)
3136       {
3137         counter++;
3138         // If there is a pairing, count it only once.
3139         if (isHighSurrogate(a[i]) && (i + 1) < end
3140             && isLowSurrogate(a[i + 1]))
3141           i ++;
3142       }
3143     return counter;
3144   }
3145
3146   /**
3147    * Determines if a character is a Unicode letter or a Unicode digit. This
3148    * is the combination of isLetter and isDigit.
3149    * <br>
3150    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
3151    *
3152    * @param ch character to test
3153    * @return true if ch is a Unicode letter or a Unicode digit, else false
3154    * @see #isDigit(char)
3155    * @see #isJavaIdentifierPart(char)
3156    * @see #isJavaLetter(char)
3157    * @see #isJavaLetterOrDigit(char)
3158    * @see #isLetter(char)
3159    * @see #isUnicodeIdentifierPart(char)
3160    */
3161   public static boolean isLetterOrDigit(char ch)
3162   {
3163     return isLetterOrDigit((int)ch);
3164   }
3165
3166   /**
3167    * Determines if a character is a Unicode letter or a Unicode digit. This
3168    * is the combination of isLetter and isDigit.
3169    * <br>
3170    * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
3171    *
3172    * @param codePoint character to test
3173    * @return true if ch is a Unicode letter or a Unicode digit, else false
3174    * @see #isDigit(char)
3175    * @see #isJavaIdentifierPart(char)
3176    * @see #isJavaLetter(char)
3177    * @see #isJavaLetterOrDigit(char)
3178    * @see #isLetter(char)
3179    * @see #isUnicodeIdentifierPart(char)
3180    *
3181    * @since 1.5
3182    */
3183   public static boolean isLetterOrDigit(int codePoint)
3184   {
3185     return ((1 << getType(codePoint))
3186         & ((1 << UPPERCASE_LETTER)
3187            | (1 << LOWERCASE_LETTER)
3188            | (1 << TITLECASE_LETTER)
3189            | (1 << MODIFIER_LETTER)
3190            | (1 << OTHER_LETTER)
3191            | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
3192   }
3193
3194   /**
3195    * Determines if a character can start a Java identifier. This is the
3196    * combination of isLetter, any character where getType returns
3197    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3198    * (like '_').
3199    *
3200    * @param ch character to test
3201    * @return true if ch can start a Java identifier, else false
3202    * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
3203    * @see #isJavaLetterOrDigit(char)
3204    * @see #isJavaIdentifierStart(char)
3205    * @see #isJavaIdentifierPart(char)
3206    * @see #isLetter(char)
3207    * @see #isLetterOrDigit(char)
3208    * @see #isUnicodeIdentifierStart(char)
3209    */
3210   public static boolean isJavaLetter(char ch)
3211   {
3212     return isJavaIdentifierStart(ch);
3213   }
3214
3215   /**
3216    * Determines if a character can follow the first letter in
3217    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
3218    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3219    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3220    * or isIdentifierIgnorable.
3221    *
3222    * @param ch character to test
3223    * @return true if ch can follow the first letter in a Java identifier
3224    * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
3225    * @see #isJavaLetter(char)
3226    * @see #isJavaIdentifierStart(char)
3227    * @see #isJavaIdentifierPart(char)
3228    * @see #isLetter(char)
3229    * @see #isLetterOrDigit(char)
3230    * @see #isUnicodeIdentifierPart(char)
3231    * @see #isIdentifierIgnorable(char)
3232    */
3233   public static boolean isJavaLetterOrDigit(char ch)
3234   {
3235     return isJavaIdentifierPart(ch);
3236   }
3237
3238   /**
3239    * Determines if a character can start a Java identifier. This is the
3240    * combination of isLetter, any character where getType returns
3241    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3242    * (like '_').
3243    * <br>
3244    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
3245    *
3246    * @param ch character to test
3247    * @return true if ch can start a Java identifier, else false
3248    * @see #isJavaIdentifierPart(char)
3249    * @see #isLetter(char)
3250    * @see #isUnicodeIdentifierStart(char)
3251    * @since 1.1
3252    */
3253   public static boolean isJavaIdentifierStart(char ch)
3254   {
3255     return isJavaIdentifierStart((int)ch);
3256   }
3257
3258   /**
3259    * Determines if a character can start a Java identifier. This is the
3260    * combination of isLetter, any character where getType returns
3261    * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3262    * (like '_').
3263    * <br>
3264    * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
3265    *
3266    * @param codePoint character to test
3267    * @return true if ch can start a Java identifier, else false
3268    * @see #isJavaIdentifierPart(char)
3269    * @see #isLetter(char)
3270    * @see #isUnicodeIdentifierStart(char)
3271    * @since 1.5
3272    */
3273   public static boolean isJavaIdentifierStart(int codePoint)
3274   {
3275     return ((1 << getType(codePoint))
3276             & ((1 << UPPERCASE_LETTER)
3277                | (1 << LOWERCASE_LETTER)
3278                | (1 << TITLECASE_LETTER)
3279                | (1 << MODIFIER_LETTER)
3280                | (1 << OTHER_LETTER)
3281                | (1 << LETTER_NUMBER)
3282                | (1 << CURRENCY_SYMBOL)
3283                | (1 << CONNECTOR_PUNCTUATION))) != 0;
3284   }
3285
3286   /**
3287    * Determines if a character can follow the first letter in
3288    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
3289    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3290    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3291    * or isIdentifierIgnorable.
3292    * <br>
3293    * Java identifier extender =
3294    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
3295    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3296    *
3297    * @param ch character to test
3298    * @return true if ch can follow the first letter in a Java identifier
3299    * @see #isIdentifierIgnorable(char)
3300    * @see #isJavaIdentifierStart(char)
3301    * @see #isLetterOrDigit(char)
3302    * @see #isUnicodeIdentifierPart(char)
3303    * @since 1.1
3304    */
3305   public static boolean isJavaIdentifierPart(char ch)
3306   {
3307     return isJavaIdentifierPart((int)ch);
3308   }
3309
3310   /**
3311    * Determines if a character can follow the first letter in
3312    * a Java identifier.  This is the combination of isJavaLetter (isLetter,
3313    * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3314    * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3315    * or isIdentifierIgnorable.
3316    * <br>
3317    * Java identifier extender =
3318    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
3319    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3320    *
3321    * @param codePoint character to test
3322    * @return true if ch can follow the first letter in a Java identifier
3323    * @see #isIdentifierIgnorable(char)
3324    * @see #isJavaIdentifierStart(char)
3325    * @see #isLetterOrDigit(char)
3326    * @see #isUnicodeIdentifierPart(char)
3327    * @since 1.5
3328    */
3329   public static boolean isJavaIdentifierPart(int codePoint)
3330   {
3331     int category = getType(codePoint);
3332     return ((1 << category)
3333             & ((1 << UPPERCASE_LETTER)
3334                | (1 << LOWERCASE_LETTER)
3335                | (1 << TITLECASE_LETTER)
3336                | (1 << MODIFIER_LETTER)
3337                | (1 << OTHER_LETTER)
3338                | (1 << NON_SPACING_MARK)
3339                | (1 << COMBINING_SPACING_MARK)
3340                | (1 << DECIMAL_DIGIT_NUMBER)
3341                | (1 << LETTER_NUMBER)
3342                | (1 << CURRENCY_SYMBOL)
3343                | (1 << CONNECTOR_PUNCTUATION)
3344                | (1 << FORMAT))) != 0
3345       || (category == CONTROL && isIdentifierIgnorable(codePoint));
3346   }
3347
3348   /**
3349    * Determines if a character can start a Unicode identifier.  Only
3350    * letters can start a Unicode identifier, but this includes characters
3351    * in LETTER_NUMBER.
3352    * <br>
3353    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
3354    *
3355    * @param ch character to test
3356    * @return true if ch can start a Unicode identifier, else false
3357    * @see #isJavaIdentifierStart(char)
3358    * @see #isLetter(char)
3359    * @see #isUnicodeIdentifierPart(char)
3360    * @since 1.1
3361    */
3362   public static boolean isUnicodeIdentifierStart(char ch)
3363   {
3364     return isUnicodeIdentifierStart((int)ch);
3365   }
3366
3367   /**
3368    * Determines if a character can start a Unicode identifier.  Only
3369    * letters can start a Unicode identifier, but this includes characters
3370    * in LETTER_NUMBER.
3371    * <br>
3372    * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
3373    *
3374    * @param codePoint character to test
3375    * @return true if ch can start a Unicode identifier, else false
3376    * @see #isJavaIdentifierStart(char)
3377    * @see #isLetter(char)
3378    * @see #isUnicodeIdentifierPart(char)
3379    * @since 1.5
3380    */
3381   public static boolean isUnicodeIdentifierStart(int codePoint)
3382   {
3383     return ((1 << getType(codePoint))
3384             & ((1 << UPPERCASE_LETTER)
3385                | (1 << LOWERCASE_LETTER)
3386                | (1 << TITLECASE_LETTER)
3387                | (1 << MODIFIER_LETTER)
3388                | (1 << OTHER_LETTER)
3389                | (1 << LETTER_NUMBER))) != 0;
3390   }
3391
3392   /**
3393    * Determines if a character can follow the first letter in
3394    * a Unicode identifier. This includes letters, connecting punctuation,
3395    * digits, numeric letters, combining marks, non-spacing marks, and
3396    * isIdentifierIgnorable.
3397    * <br>
3398    * Unicode identifier extender =
3399    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
3400    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3401    *
3402    * @param ch character to test
3403    * @return true if ch can follow the first letter in a Unicode identifier
3404    * @see #isIdentifierIgnorable(char)
3405    * @see #isJavaIdentifierPart(char)
3406    * @see #isLetterOrDigit(char)
3407    * @see #isUnicodeIdentifierStart(char)
3408    * @since 1.1
3409    */
3410   public static boolean isUnicodeIdentifierPart(char ch)
3411   {
3412     return isUnicodeIdentifierPart((int)ch);
3413   }
3414
3415   /**
3416    * Determines if a character can follow the first letter in
3417    * a Unicode identifier. This includes letters, connecting punctuation,
3418    * digits, numeric letters, combining marks, non-spacing marks, and
3419    * isIdentifierIgnorable.
3420    * <br>
3421    * Unicode identifier extender =
3422    *   [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
3423    *   |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3424    *
3425    * @param codePoint character to test
3426    * @return true if ch can follow the first letter in a Unicode identifier
3427    * @see #isIdentifierIgnorable(char)
3428    * @see #isJavaIdentifierPart(char)
3429    * @see #isLetterOrDigit(char)
3430    * @see #isUnicodeIdentifierStart(char)
3431    * @since 1.5
3432    */
3433   public static boolean isUnicodeIdentifierPart(int codePoint)
3434   {
3435     int category = getType(codePoint);
3436     return ((1 << category)
3437             & ((1 << UPPERCASE_LETTER)
3438                | (1 << LOWERCASE_LETTER)
3439                | (1 << TITLECASE_LETTER)
3440                | (1 << MODIFIER_LETTER)
3441                | (1 << OTHER_LETTER)
3442                | (1 << NON_SPACING_MARK)
3443                | (1 << COMBINING_SPACING_MARK)
3444                | (1 << DECIMAL_DIGIT_NUMBER)
3445                | (1 << LETTER_NUMBER)
3446                | (1 << CONNECTOR_PUNCTUATION)
3447                | (1 << FORMAT))) != 0
3448       || (category == CONTROL && isIdentifierIgnorable(codePoint));
3449   }
3450
3451   /**
3452    * Determines if a character is ignorable in a Unicode identifier. This
3453    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
3454    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
3455    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
3456    * <code>'\u009F'</code>), and FORMAT characters.
3457    * <br>
3458    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
3459    *    |U+007F-U+009F
3460    *
3461    * @param ch character to test
3462    * @return true if ch is ignorable in a Unicode or Java identifier
3463    * @see #isJavaIdentifierPart(char)
3464    * @see #isUnicodeIdentifierPart(char)
3465    * @since 1.1
3466    */
3467   public static boolean isIdentifierIgnorable(char ch)
3468   {
3469     return isIdentifierIgnorable((int)ch);
3470   }
3471   /**
3472    * Determines if a character is ignorable in a Unicode identifier. This
3473    * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
3474    * through <code>'\u0008'</code>, <code>'\u000E'</code> through
3475    * <code>'\u001B'</code>, and <code>'\u007F'</code> through
3476    * <code>'\u009F'</code>), and FORMAT characters.
3477    * <br>
3478    * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
3479    *    |U+007F-U+009F
3480    *
3481    * @param codePoint character to test
3482    * @return true if ch is ignorable in a Unicode or Java identifier
3483    * @see #isJavaIdentifierPart(char)
3484    * @see #isUnicodeIdentifierPart(char)
3485    * @since 1.5
3486    */
3487   public static boolean isIdentifierIgnorable(int codePoint)
3488   {
3489     if ((codePoint >= 0 && codePoint <= 0x0008)
3490         || (codePoint >= 0x000E && codePoint <= 0x001B)
3491         || (codePoint >= 0x007F && codePoint <= 0x009F)
3492         || getType(codePoint) == FORMAT)
3493       return true;
3494     return false;
3495   }
3496
3497   /**
3498    * Converts a Unicode character into its lowercase equivalent mapping.
3499    * If a mapping does not exist, then the character passed is returned.
3500    * Note that isLowerCase(toLowerCase(ch)) does not always return true.
3501    *
3502    * @param ch character to convert to lowercase
3503    * @return lowercase mapping of ch, or ch if lowercase mapping does
3504    *         not exist
3505    * @see #isLowerCase(char)
3506    * @see #isUpperCase(char)
3507    * @see #toTitleCase(char)
3508    * @see #toUpperCase(char)
3509    */
3510   public static char toLowerCase(char ch)
3511   {
3512     return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch);
3513   }
3514
3515   /**
3516    * Converts a Unicode character into its lowercase equivalent mapping.
3517    * If a mapping does not exist, then the character passed is returned.
3518    * Note that isLowerCase(toLowerCase(ch)) does not always return true.
3519    *
3520    * @param codePoint character to convert to lowercase
3521    * @return lowercase mapping of ch, or ch if lowercase mapping does
3522    *         not exist
3523    * @see #isLowerCase(char)
3524    * @see #isUpperCase(char)
3525    * @see #toTitleCase(char)
3526    * @see #toUpperCase(char)
3527    *
3528    * @since 1.5
3529    */
3530   public static int toLowerCase(int codePoint)
3531   {
3532     // If the code point is unassigned or in one of the private use areas
3533     // then we delegate the call to the appropriate private static inner class.
3534     int plane = codePoint >>> 16;
3535     if (plane > 2 && plane < 14)
3536       return UnassignedCharacters.toLowerCase(codePoint);
3537     if (plane > 14)
3538       return PrivateUseCharacters.toLowerCase(codePoint);
3539
3540     // The short value stored in lower[plane] is the signed difference between
3541     // codePoint and its lowercase conversion.
3542     return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
3543   }
3544
3545   /**
3546    * Converts a Unicode character into its uppercase equivalent mapping.
3547    * If a mapping does not exist, then the character passed is returned.
3548    * Note that isUpperCase(toUpperCase(ch)) does not always return true.
3549    *
3550    * @param ch character to convert to uppercase
3551    * @return uppercase mapping of ch, or ch if uppercase mapping does
3552    *         not exist
3553    * @see #isLowerCase(char)
3554    * @see #isUpperCase(char)
3555    * @see #toLowerCase(char)
3556    * @see #toTitleCase(char)
3557    */
3558   public static char toUpperCase(char ch)
3559   {
3560     return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch);
3561   }
3562
3563   /**
3564    * Converts a Unicode character into its uppercase equivalent mapping.
3565    * If a mapping does not exist, then the character passed is returned.
3566    * Note that isUpperCase(toUpperCase(ch)) does not always return true.
3567    *
3568    * @param codePoint character to convert to uppercase
3569    * @return uppercase mapping of ch, or ch if uppercase mapping does
3570    *         not exist
3571    * @see #isLowerCase(char)
3572    * @see #isUpperCase(char)
3573    * @see #toLowerCase(char)
3574    * @see #toTitleCase(char)
3575    *
3576    * @since 1.5
3577    */
3578   public static int toUpperCase(int codePoint)
3579   {
3580     // If the code point is unassigned or in one of the private use areas
3581     // then we delegate the call to the appropriate private static inner class.
3582     int plane = codePoint >>> 16;
3583     if (plane > 2 && plane < 14)
3584       return UnassignedCharacters.toUpperCase(codePoint);
3585     if (plane > 14)
3586       return PrivateUseCharacters.toUpperCase(codePoint);
3587
3588     // The short value stored in upper[plane] is the signed difference between
3589     // codePoint and its uppercase conversion.
3590     return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
3591   }
3592
3593   /**
3594    * Converts a Unicode character into its titlecase equivalent mapping.
3595    * If a mapping does not exist, then the character passed is returned.
3596    * Note that isTitleCase(toTitleCase(ch)) does not always return true.
3597    *
3598    * @param ch character to convert to titlecase
3599    * @return titlecase mapping of ch, or ch if titlecase mapping does
3600    *         not exist
3601    * @see #isTitleCase(char)
3602    * @see #toLowerCase(char)
3603    * @see #toUpperCase(char)
3604    */
3605   public static char toTitleCase(char ch)
3606   {
3607     // As title is short, it doesn't hurt to exhaustively iterate over it.
3608     for (int i = title.length - 2; i >= 0; i -= 2)
3609       if (title[i] == ch)
3610         return title[i + 1];
3611     return toUpperCase(ch);
3612   }
3613
3614   /**
3615    * Converts a Unicode character into its titlecase equivalent mapping.
3616    * If a mapping does not exist, then the character passed is returned.
3617    * Note that isTitleCase(toTitleCase(ch)) does not always return true.
3618    *
3619    * @param codePoint character to convert to titlecase
3620    * @return titlecase mapping of ch, or ch if titlecase mapping does
3621    *         not exist
3622    * @see #isTitleCase(char)
3623    * @see #toLowerCase(char)
3624    * @see #toUpperCase(char)
3625    *
3626    * @since 1.5
3627    */
3628   public static int toTitleCase(int codePoint)
3629   {
3630     // As of Unicode 4.0.0 no characters outside of plane 0 have
3631     // titlecase mappings that are different from their uppercase
3632     // mapping.
3633     if (codePoint < 0x10000)
3634       return (int) toTitleCase((char)codePoint);
3635     return toUpperCase(codePoint);
3636   }
3637
3638   /**
3639    * Converts a character into a digit of the specified radix. If the radix
3640    * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
3641    * exceeds the radix, or if ch is not a decimal digit or in the case
3642    * insensitive set of 'a'-'z', the result is -1.
3643    * <br>
3644    * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
3645    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3646    *
3647    * @param ch character to convert into a digit
3648    * @param radix radix in which ch is a digit
3649    * @return digit which ch represents in radix, or -1 not a valid digit
3650    * @see #MIN_RADIX
3651    * @see #MAX_RADIX
3652    * @see #forDigit(int, int)
3653    * @see #isDigit(char)
3654    * @see #getNumericValue(char)
3655    */
3656   public static int digit(char ch, int radix)
3657   {
3658     if (radix < MIN_RADIX || radix > MAX_RADIX)
3659       return -1;
3660     char attr = readCodePoint((int)ch);
3661     if (((1 << (attr & TYPE_MASK))
3662          & ((1 << UPPERCASE_LETTER)
3663             | (1 << LOWERCASE_LETTER)
3664             | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
3665       {
3666         // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
3667         int digit = numValue[0][attr >> 7];
3668         return (digit < radix) ? digit : -1;
3669       }
3670     return -1;
3671   }
3672
3673   /**
3674    * Converts a character into a digit of the specified radix. If the radix
3675    * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
3676    * exceeds the radix, or if ch is not a decimal digit or in the case
3677    * insensitive set of 'a'-'z', the result is -1.
3678    * <br>
3679    * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
3680    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3681    *
3682    * @param codePoint character to convert into a digit
3683    * @param radix radix in which ch is a digit
3684    * @return digit which ch represents in radix, or -1 not a valid digit
3685    * @see #MIN_RADIX
3686    * @see #MAX_RADIX
3687    * @see #forDigit(int, int)
3688    * @see #isDigit(char)
3689    * @see #getNumericValue(char)
3690    */
3691   public static int digit(int codePoint, int radix)
3692   {
3693     if (radix < MIN_RADIX || radix > MAX_RADIX)
3694       return -1;
3695
3696     // If the code point is unassigned or in one of the private use areas
3697     // then we delegate the call to the appropriate private static inner class.
3698     int plane = codePoint >>> 16;
3699     if (plane > 2 && plane < 14)
3700       return UnassignedCharacters.digit(codePoint, radix);
3701     if (plane > 14)
3702       return PrivateUseCharacters.digit(codePoint, radix);
3703     char attr = readCodePoint(codePoint);
3704     if (((1 << (attr & TYPE_MASK))
3705          & ((1 << UPPERCASE_LETTER)
3706             | (1 << LOWERCASE_LETTER)
3707             | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
3708       {
3709         // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
3710         int digit = numValue[plane][attr >> 7];
3711
3712         // If digit is less than or equal to -3 then the numerical value was
3713         // too large to fit into numValue and is stored in CharData.LARGENUMS.
3714         if (digit <= -3)
3715           digit = CharData.LARGENUMS[-digit - 3];
3716         return (digit < radix) ? digit : -1;
3717       }
3718     return -1;
3719   }
3720
3721   /**
3722    * Returns the Unicode numeric value property of a character. For example,
3723    * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
3724    *
3725    * <p>This method also returns values for the letters A through Z, (not
3726    * specified by Unicode), in these ranges: <code>'\u0041'</code>
3727    * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
3728    * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
3729    * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
3730    * <code>'\uFF5A'</code> (full width variants).
3731    *
3732    * <p>If the character lacks a numeric value property, -1 is returned.
3733    * If the character has a numeric value property which is not representable
3734    * as a nonnegative integer, such as a fraction, -2 is returned.
3735    *
3736    * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
3737    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3738    *
3739    * @param ch character from which the numeric value property will
3740    *        be retrieved
3741    * @return the numeric value property of ch, or -1 if it does not exist, or
3742    *         -2 if it is not representable as a nonnegative integer
3743    * @see #forDigit(int, int)
3744    * @see #digit(char, int)
3745    * @see #isDigit(char)
3746    * @since 1.1
3747    */
3748   public static int getNumericValue(char ch)
3749   {
3750     // Treat numValue as signed.
3751     return (short) numValue[0][readCodePoint((int)ch) >> 7];
3752   }
3753
3754   /**
3755    * Returns the Unicode numeric value property of a character. For example,
3756    * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
3757    *
3758    * <p>This method also returns values for the letters A through Z, (not
3759    * specified by Unicode), in these ranges: <code>'\u0041'</code>
3760    * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
3761    * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
3762    * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
3763    * <code>'\uFF5A'</code> (full width variants).
3764    *
3765    * <p>If the character lacks a numeric value property, -1 is returned.
3766    * If the character has a numeric value property which is not representable
3767    * as a nonnegative integer, such as a fraction, -2 is returned.
3768    *
3769    * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
3770    *    |U+FF21-U+FF3A|U+FF41-U+FF5A
3771    *
3772    * @param codePoint character from which the numeric value property will
3773    *        be retrieved
3774    * @return the numeric value property of ch, or -1 if it does not exist, or
3775    *         -2 if it is not representable as a nonnegative integer
3776    * @see #forDigit(int, int)
3777    * @see #digit(char, int)
3778    * @see #isDigit(char)
3779    * @since 1.5
3780    */
3781   public static int getNumericValue(int codePoint)
3782   {
3783     // If the code point is unassigned or in one of the private use areas
3784     // then we delegate the call to the appropriate private static inner class.
3785     int plane = codePoint >>> 16;
3786     if (plane > 2 && plane < 14)
3787       return UnassignedCharacters.getNumericValue(codePoint);
3788     if (plane > 14)
3789       return PrivateUseCharacters.getNumericValue(codePoint);
3790
3791     // If the value N found in numValue[plane] is less than or equal to -3
3792     // then the numeric value was too big to fit into 16 bits and is
3793     // stored in CharData.LARGENUMS at offset (-N - 3).
3794     short num = (short)numValue[plane][readCodePoint(codePoint) >> 7];
3795     if (num <= -3)
3796       return CharData.LARGENUMS[-num - 3];
3797     return num;
3798   }
3799
3800   /**
   * Determines if a character is an ISO-LATIN-1 space. This is only the five
3802    * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
3803    * <code>'\r'</code>, and <code>' '</code>.
3804    * <br>
3805    * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
3806    *
3807    * @param ch character to test
3808    * @return true if ch is a space, else false
3809    * @deprecated Replaced by {@link #isWhitespace(char)}
3810    * @see #isSpaceChar(char)
3811    * @see #isWhitespace(char)
3812    */
3813   public static boolean isSpace(char ch)
3814   {
3815     // Performing the subtraction up front alleviates need to compare longs.
3816     return ch-- <= ' ' && ((1 << ch)
3817                            & ((1 << (' ' - 1))
3818                               | (1 << ('\t' - 1))
3819                               | (1 << ('\n' - 1))
3820                               | (1 << ('\r' - 1))
3821                               | (1 << ('\f' - 1)))) != 0;
3822   }
3823
3824   /**
3825    * Determines if a character is a Unicode space character. This includes
3826    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3827    * <br>
3828    * Unicode space = [Zs]|[Zp]|[Zl]
3829    *
3830    * @param ch character to test
3831    * @return true if ch is a Unicode space, else false
3832    * @see #isWhitespace(char)
3833    * @since 1.1
3834    */
3835   public static boolean isSpaceChar(char ch)
3836   {
3837     return isSpaceChar((int)ch);
3838   }
3839
3840   /**
3841    * Determines if a character is a Unicode space character. This includes
3842    * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3843    * <br>
3844    * Unicode space = [Zs]|[Zp]|[Zl]
3845    *
3846    * @param codePoint character to test
3847    * @return true if ch is a Unicode space, else false
3848    * @see #isWhitespace(char)
3849    * @since 1.5
3850    */
3851   public static boolean isSpaceChar(int codePoint)
3852   {
3853     return ((1 << getType(codePoint))
3854             & ((1 << SPACE_SEPARATOR)
3855                | (1 << LINE_SEPARATOR)
3856                | (1 << PARAGRAPH_SEPARATOR))) != 0;
3857   }
3858
3859   /**
3860    * Determines if a character is Java whitespace. This includes Unicode
3861    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3862    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3863    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3864    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3865    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3866    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3867    * and <code>'\u001F'</code>.
3868    * <br>
3869    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3870    *
3871    * @param ch character to test
3872    * @return true if ch is Java whitespace, else false
3873    * @see #isSpaceChar(char)
3874    * @since 1.1
3875    */
3876   public static boolean isWhitespace(char ch)
3877   {
3878     return isWhitespace((int) ch);
3879   }
3880
3881   /**
3882    * Determines if a character is Java whitespace. This includes Unicode
3883    * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3884    * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3885    * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3886    * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3887    * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3888    * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3889    * and <code>'\u001F'</code>.
3890    * <br>
3891    * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3892    *
3893    * @param codePoint character to test
3894    * @return true if ch is Java whitespace, else false
3895    * @see #isSpaceChar(char)
3896    * @since 1.5
3897    */
3898   public static boolean isWhitespace(int codePoint)
3899   {
3900     int plane = codePoint >>> 16;
3901     if (plane > 2 && plane < 14)
3902       return UnassignedCharacters.isWhiteSpace(codePoint);
3903     if (plane > 14)
3904       return PrivateUseCharacters.isWhiteSpace(codePoint);
3905
3906     int attr = readCodePoint(codePoint);
3907     return ((((1 << (attr & TYPE_MASK))
3908               & ((1 << SPACE_SEPARATOR)
3909                  | (1 << LINE_SEPARATOR)
3910                  | (1 << PARAGRAPH_SEPARATOR))) != 0)
3911             && (attr & NO_BREAK_MASK) == 0)
3912       || (codePoint <= '\u001F' && ((1 << codePoint)
3913                              & ((1 << '\t')
3914                                 | (1 << '\n')
3915                                 | (1 << '\u000B')
3916                                 | (1 << '\u000C')
3917                                 | (1 << '\r')
3918                                 | (1 << '\u001C')
3919                                 | (1 << '\u001D')
3920                                 | (1 << '\u001E')
3921                                 | (1 << '\u001F'))) != 0);
3922   }
3923
3924   /**
3925    * Determines if a character has the ISO Control property.
3926    * <br>
3927    * ISO Control = [Cc]
3928    *
3929    * @param ch character to test
3930    * @return true if ch is an ISO Control character, else false
3931    * @see #isSpaceChar(char)
3932    * @see #isWhitespace(char)
3933    * @since 1.1
3934    */
3935   public static boolean isISOControl(char ch)
3936   {
3937     return isISOControl((int)ch);
3938   }
3939
3940   /**
3941    * Determines if the character is an ISO Control character.  This is true
3942    * if the code point is in the range [0, 0x001F] or if it is in the range
3943    * [0x007F, 0x009F].
3944    * @param codePoint the character to check
3945    * @return true if the character is in one of the above ranges
3946    *
3947    * @since 1.5
3948    */
3949   public static boolean isISOControl(int codePoint)
3950   {
3951     if ((codePoint >= 0 && codePoint <= 0x001F)
3952         || (codePoint >= 0x007F && codePoint <= 0x009F))
3953       return true;
3954     return false;
3955   }
3956
3957   /**
3958    * Returns the Unicode general category property of a character.
3959    *
3960    * @param ch character from which the general category property will
3961    *        be retrieved
3962    * @return the character category property of ch as an integer
3963    * @see #UNASSIGNED
3964    * @see #UPPERCASE_LETTER
3965    * @see #LOWERCASE_LETTER
3966    * @see #TITLECASE_LETTER
3967    * @see #MODIFIER_LETTER
3968    * @see #OTHER_LETTER
3969    * @see #NON_SPACING_MARK
3970    * @see #ENCLOSING_MARK
3971    * @see #COMBINING_SPACING_MARK
3972    * @see #DECIMAL_DIGIT_NUMBER
3973    * @see #LETTER_NUMBER
3974    * @see #OTHER_NUMBER
3975    * @see #SPACE_SEPARATOR
3976    * @see #LINE_SEPARATOR
3977    * @see #PARAGRAPH_SEPARATOR
3978    * @see #CONTROL
3979    * @see #FORMAT
3980    * @see #PRIVATE_USE
3981    * @see #SURROGATE
3982    * @see #DASH_PUNCTUATION
3983    * @see #START_PUNCTUATION
3984    * @see #END_PUNCTUATION
3985    * @see #CONNECTOR_PUNCTUATION
3986    * @see #OTHER_PUNCTUATION
3987    * @see #MATH_SYMBOL
3988    * @see #CURRENCY_SYMBOL
3989    * @see #MODIFIER_SYMBOL
3990    * @see #INITIAL_QUOTE_PUNCTUATION
3991    * @see #FINAL_QUOTE_PUNCTUATION
3992    * @since 1.1
3993    */
3994   public static int getType(char ch)
3995   {
3996     return getType((int)ch);
3997   }
3998
3999   /**
4000    * Returns the Unicode general category property of a character.
4001    *
4002    * @param codePoint character from which the general category property will
4003    *        be retrieved
4004    * @return the character category property of ch as an integer
4005    * @see #UNASSIGNED
4006    * @see #UPPERCASE_LETTER
4007    * @see #LOWERCASE_LETTER
4008    * @see #TITLECASE_LETTER
4009    * @see #MODIFIER_LETTER
4010    * @see #OTHER_LETTER
4011    * @see #NON_SPACING_MARK
4012    * @see #ENCLOSING_MARK
4013    * @see #COMBINING_SPACING_MARK
4014    * @see #DECIMAL_DIGIT_NUMBER
4015    * @see #LETTER_NUMBER
4016    * @see #OTHER_NUMBER
4017    * @see #SPACE_SEPARATOR
4018    * @see #LINE_SEPARATOR
4019    * @see #PARAGRAPH_SEPARATOR
4020    * @see #CONTROL
4021    * @see #FORMAT
4022    * @see #PRIVATE_USE
4023    * @see #SURROGATE
4024    * @see #DASH_PUNCTUATION
4025    * @see #START_PUNCTUATION
4026    * @see #END_PUNCTUATION
4027    * @see #CONNECTOR_PUNCTUATION
4028    * @see #OTHER_PUNCTUATION
4029    * @see #MATH_SYMBOL
4030    * @see #CURRENCY_SYMBOL
4031    * @see #MODIFIER_SYMBOL
4032    * @see #INITIAL_QUOTE_PUNCTUATION
4033    * @see #FINAL_QUOTE_PUNCTUATION
4034    *
4035    * @since 1.5
4036    */
4037   public static int getType(int codePoint)
4038   {
4039     // If the codePoint is unassigned or in one of the private use areas
4040     // then we delegate the call to the appropriate private static inner class.
4041     int plane = codePoint >>> 16;
4042     if (plane > 2 && plane < 14)
4043       return UnassignedCharacters.getType(codePoint);
4044     if (plane > 14)
4045       return PrivateUseCharacters.getType(codePoint);
4046
4047     return readCodePoint(codePoint) & TYPE_MASK;
4048   }
4049
4050   /**
4051    * Converts a digit into a character which represents that digit
4052    * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
4053    * or the digit exceeds the radix, then the null character <code>'\0'</code>
4054    * is returned.  Otherwise the return value is in '0'-'9' and 'a'-'z'.
4055    * <br>
4056    * return value boundary = U+0030-U+0039|U+0061-U+007A
4057    *
4058    * @param digit digit to be converted into a character
4059    * @param radix radix of digit
4060    * @return character representing digit in radix, or '\0'
4061    * @see #MIN_RADIX
4062    * @see #MAX_RADIX
4063    * @see #digit(char, int)
4064    */
4065   public static char forDigit(int digit, int radix)
4066   {
4067     if (radix < MIN_RADIX || radix > MAX_RADIX
4068         || digit < 0 || digit >= radix)
4069       return '\0';
4070     return Number.digits[digit];
4071   }
4072
4073   /**
4074    * Returns the Unicode directionality property of the character. This
4075    * is used in the visual ordering of text.
4076    *
4077    * @param ch the character to look up
4078    * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
4079    * @see #DIRECTIONALITY_UNDEFINED
4080    * @see #DIRECTIONALITY_LEFT_TO_RIGHT
4081    * @see #DIRECTIONALITY_RIGHT_TO_LEFT
4082    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
4083    * @see #DIRECTIONALITY_EUROPEAN_NUMBER
4084    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
4085    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
4086    * @see #DIRECTIONALITY_ARABIC_NUMBER
4087    * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
4088    * @see #DIRECTIONALITY_NONSPACING_MARK
4089    * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
4090    * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
4091    * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
4092    * @see #DIRECTIONALITY_WHITESPACE
4093    * @see #DIRECTIONALITY_OTHER_NEUTRALS
4094    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
4095    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
4096    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
4097    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
4098    * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
4099    * @since 1.4
4100    */
4101   public static byte getDirectionality(char ch)
4102   {
4103     // The result will correctly be signed.
4104     return getDirectionality((int)ch);
4105   }
4106
4107   /**
4108    * Returns the Unicode directionality property of the character. This
4109    * is used in the visual ordering of text.
4110    *
4111    * @param codePoint the character to look up
4112    * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
4113    * @see #DIRECTIONALITY_UNDEFINED
4114    * @see #DIRECTIONALITY_LEFT_TO_RIGHT
4115    * @see #DIRECTIONALITY_RIGHT_TO_LEFT
4116    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
4117    * @see #DIRECTIONALITY_EUROPEAN_NUMBER
4118    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
4119    * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
4120    * @see #DIRECTIONALITY_ARABIC_NUMBER
4121    * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
4122    * @see #DIRECTIONALITY_NONSPACING_MARK
4123    * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
4124    * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
4125    * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
4126    * @see #DIRECTIONALITY_WHITESPACE
4127    * @see #DIRECTIONALITY_OTHER_NEUTRALS
4128    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
4129    * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
4130    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
4131    * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
4132    * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
4133    * @since 1.5
4134    */
4135   public static byte getDirectionality(int codePoint)
4136   {
4137     // If the code point is unassigned or in one of the private use areas
4138     // then we delegate the call to the appropriate private static inner class.
4139     int plane = codePoint >>> 16;
4140     if (plane > 2 && plane < 14)
4141       return UnassignedCharacters.getDirectionality(codePoint);
4142     if (plane > 14)
4143       return PrivateUseCharacters.getDirectionality(codePoint);
4144
4145     // The result will correctly be signed.
4146     return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2);
4147   }
4148
4149   /**
4150    * Determines whether the character is mirrored according to Unicode. For
4151    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
4152    * left-to-right text, but ')' in right-to-left text.
4153    *
4154    * @param ch the character to look up
4155    * @return true if the character is mirrored
4156    * @since 1.4
4157    */
4158   public static boolean isMirrored(char ch)
4159   {
4160     return (readCodePoint((int)ch) & MIRROR_MASK) != 0;
4161   }
4162
4163   /**
4164    * Determines whether the character is mirrored according to Unicode. For
4165    * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
4166    * left-to-right text, but ')' in right-to-left text.
4167    *
4168    * @param codePoint the character to look up
4169    * @return true if the character is mirrored
4170    * @since 1.5
4171    */
4172   public static boolean isMirrored(int codePoint)
4173   {
4174     // If the code point is unassigned or part of one of the private use areas
4175     // then we delegate the call to the appropriate private static inner class.
4176     int plane = codePoint >>> 16;
4177     if (plane > 2 && plane < 14)
4178       return UnassignedCharacters.isMirrored(codePoint);
4179     if (plane > 14)
4180       return PrivateUseCharacters.isMirrored(codePoint);
4181
4182     return (readCodePoint(codePoint) & MIRROR_MASK) != 0;
4183   }
4184
4185   /**
4186    * Compares another Character to this Character, numerically.
4187    *
4188    * @param anotherCharacter Character to compare with this Character
4189    * @return a negative integer if this Character is less than
4190    *         anotherCharacter, zero if this Character is equal, and
4191    *         a positive integer if this Character is greater
4192    * @throws NullPointerException if anotherCharacter is null
4193    * @since 1.2
4194    */
4195   public int compareTo(Character anotherCharacter)
4196   {
4197     return value - anotherCharacter.value;
4198   }
4199
4200   /**
4201    * Compares an object to this Character.  Assuming the object is a
4202    * Character object, this method performs the same comparison as
4203    * compareTo(Character).
4204    *
4205    * @param o object to compare
4206    * @return the comparison value
4207    * @throws ClassCastException if o is not a Character object
4208    * @throws NullPointerException if o is null
4209    * @see #compareTo(Character)
4210    * @since 1.2
4211    */
4212   public int compareTo(Object o)
4213   {
4214     return compareTo((Character) o);
4215   }
4216
4217   /**
   * Returns a <code>Character</code> object wrapping the value.
4219    * In contrast to the <code>Character</code> constructor, this method
4220    * will cache some values.  It is used by boxing conversion.
4221    *
4222    * @param val the value to wrap
4223    * @return the <code>Character</code>
4224    *
4225    * @since 1.5
4226    */
4227   public static Character valueOf(char val)
4228   {
4229     if (val > MAX_CACHE)
4230       return new Character(val);
4231     synchronized (charCache)
4232       {
4233     if (charCache[val - MIN_VALUE] == null)
4234       charCache[val - MIN_VALUE] = new Character(val);
4235     return charCache[val - MIN_VALUE];
4236       }
4237   }
4238
4239   /**
4240    * Reverse the bytes in val.
4241    * @since 1.5
4242    */
4243   public static char reverseBytes(char val)
4244   {
4245     return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
4246   }
4247
4248   /**
4249    * Converts a unicode code point to a UTF-16 representation of that
4250    * code point.
4251    *
4252    * @param codePoint the unicode code point
4253    *
4254    * @return the UTF-16 representation of that code point
4255    *
4256    * @throws IllegalArgumentException if the code point is not a valid
4257    *         unicode code point
4258    *
4259    * @since 1.5
4260    */
4261   public static char[] toChars(int codePoint)
4262   {
4263     if (!isValidCodePoint(codePoint))
4264       throw new IllegalArgumentException("Illegal Unicode code point : "
4265                                          + codePoint);
4266     char[] result = new char[charCount(codePoint)];
4267     int ignore = toChars(codePoint, result, 0);
4268     return result;
4269   }
4270
4271   /**
4272    * Converts a unicode code point to its UTF-16 representation.
4273    *
4274    * @param codePoint the unicode code point
4275    * @param dst the target char array
4276    * @param dstIndex the start index for the target
4277    *
4278    * @return number of characters written to <code>dst</code>
4279    *
4280    * @throws IllegalArgumentException if <code>codePoint</code> is not a
4281    *         valid unicode code point
4282    * @throws NullPointerException if <code>dst</code> is <code>null</code>
4283    * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
4284    *         in <code>dst</code> or if the UTF-16 representation does not
4285    *         fit into <code>dst</code>
4286    *
4287    * @since 1.5
4288    */
4289   public static int toChars(int codePoint, char[] dst, int dstIndex)
4290   {
4291     if (!isValidCodePoint(codePoint))
4292       {
4293         throw new IllegalArgumentException("not a valid code point: "
4294                                            + codePoint);
4295       }
4296
4297     int result;
4298     if (isSupplementaryCodePoint(codePoint))
4299       {
4300         // Write second char first to cause IndexOutOfBoundsException
4301         // immediately.
4302         final int cp2 = codePoint - 0x10000;
4303         dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
4304         dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
4305         result = 2;
4306       }
4307     else
4308       {
4309         dst[dstIndex] = (char) codePoint;
4310         result = 1;
4311       }
4312     return result;
4313   }
4314
4315   /**
4316    * Return number of 16-bit characters required to represent the given
4317    * code point.
4318    *
4319    * @param codePoint a unicode code point
4320    *
4321    * @return 2 if codePoint >= 0x10000, 1 otherwise.
4322    *
4323    * @since 1.5
4324    */
4325   public static int charCount(int codePoint)
4326   {
4327     return
4328       (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
4329       ? 2
4330       : 1;
4331   }
4332
4333   /**
4334    * Determines whether the specified code point is
4335    * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
4336    * supplementary character range.
4337    *
4338    * @param codePoint a Unicode code point
4339    *
4340    * @return <code>true</code> if code point is in supplementary range
4341    *
4342    * @since 1.5
4343    */
4344   public static boolean isSupplementaryCodePoint(int codePoint)
4345   {
4346     return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
4347       && codePoint <= MAX_CODE_POINT;
4348   }
4349
4350   /**
4351    * Determines whether the specified code point is
4352    * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
4353    *
4354    * @param codePoint a Unicode code point
4355    *
4356    * @return <code>true</code> if code point is valid
4357    *
4358    * @since 1.5
4359    */
4360   public static boolean isValidCodePoint(int codePoint)
4361   {
4362     return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
4363   }
4364
4365   /**
4366    * Return true if the given character is a high surrogate.
4367    * @param ch the character
4368    * @return true if the character is a high surrogate character
4369    *
4370    * @since 1.5
4371    */
4372   public static boolean isHighSurrogate(char ch)
4373   {
4374     return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
4375   }
4376
4377   /**
4378    * Return true if the given character is a low surrogate.
4379    * @param ch the character
4380    * @return true if the character is a low surrogate character
4381    *
4382    * @since 1.5
4383    */
4384   public static boolean isLowSurrogate(char ch)
4385   {
4386     return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
4387   }
4388
4389   /**
4390    * Return true if the given characters compose a surrogate pair.
4391    * This is true if the first character is a high surrogate and the
4392    * second character is a low surrogate.
4393    * @param ch1 the first character
   * @param ch2 the second character
4395    * @return true if the characters compose a surrogate pair
4396    *
4397    * @since 1.5
4398    */
4399   public static boolean isSurrogatePair(char ch1, char ch2)
4400   {
4401     return isHighSurrogate(ch1) && isLowSurrogate(ch2);
4402   }
4403
4404   /**
4405    * Given a valid surrogate pair, this returns the corresponding
4406    * code point.
4407    * @param high the high character of the pair
4408    * @param low the low character of the pair
4409    * @return the corresponding code point
4410    *
4411    * @since 1.5
4412    */
4413   public static int toCodePoint(char high, char low)
4414   {
4415     return ((high - MIN_HIGH_SURROGATE) * 0x400) +
4416       (low - MIN_LOW_SURROGATE) + 0x10000;
4417   }
4418
4419   /**
4420    * Get the code point at the specified index in the CharSequence.
4421    * This is like CharSequence#charAt(int), but if the character is
4422    * the start of a surrogate pair, and there is a following
4423    * character, and this character completes the pair, then the
4424    * corresponding supplementary code point is returned.  Otherwise,
4425    * the character at the index is returned.
4426    *
4427    * @param sequence the CharSequence
4428    * @param index the index of the codepoint to get, starting at 0
4429    * @return the codepoint at the specified index
4430    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4431    * @since 1.5
4432    */
4433   public static int codePointAt(CharSequence sequence, int index)
4434   {
4435     int len = sequence.length();
4436     if (index < 0 || index >= len)
4437       throw new IndexOutOfBoundsException();
4438     char high = sequence.charAt(index);
4439     if (! isHighSurrogate(high) || ++index >= len)
4440       return high;
4441     char low = sequence.charAt(index);
4442     if (! isLowSurrogate(low))
4443       return high;
4444     return toCodePoint(high, low);
4445   }
4446
4447   /**
   * Get the code point at the specified index in the character array.
4449    * If the character is the start of a surrogate pair, and there is a
4450    * following character, and this character completes the pair, then
4451    * the corresponding supplementary code point is returned.
4452    * Otherwise, the character at the index is returned.
4453    *
4454    * @param chars the character array in which to look
4455    * @param index the index of the codepoint to get, starting at 0
4456    * @return the codepoint at the specified index
4457    * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4458    * @since 1.5
4459    */
4460   public static int codePointAt(char[] chars, int index)
4461   {
4462     return codePointAt(chars, index, chars.length);
4463   }
4464
4465   /**
   * Get the code point at the specified index in the character array.
4467    * If the character is the start of a surrogate pair, and there is a
4468    * following character within the specified range, and this
4469    * character completes the pair, then the corresponding
4470    * supplementary code point is returned.  Otherwise, the character
4471    * at the index is returned.
4472    *
4473    * @param chars the character array in which to look
4474    * @param index the index of the codepoint to get, starting at 0
4475    * @param limit the limit past which characters should not be examined
4476    * @return the codepoint at the specified index
   * @throws IndexOutOfBoundsException if index is negative or &gt;=
   * limit, or if limit is negative or &gt; the length of the array
4479    * @since 1.5
4480    */
4481   public static int codePointAt(char[] chars, int index, int limit)
4482   {
4483     if (index < 0 || index >= limit || limit < 0 || limit > chars.length)
4484       throw new IndexOutOfBoundsException();
4485     char high = chars[index];
4486     if (! isHighSurrogate(high) || ++index >= limit)
4487       return high;
4488     char low = chars[index];
4489     if (! isLowSurrogate(low))
4490       return high;
4491     return toCodePoint(high, low);
4492   }
4493
4494   /**
4495    * Get the code point before the specified index.  This is like
4496    * #codePointAt(char[], int), but checks the characters at
4497    * <code>index-1</code> and <code>index-2</code> to see if they form
4498    * a supplementary code point.  If they do not, the character at
4499    * <code>index-1</code> is returned.
4500    *
4501    * @param chars the character array
4502    * @param index the index just past the codepoint to get, starting at 0
4503    * @return the codepoint at the specified index
   * @throws IndexOutOfBoundsException if index is less than 1 or greater
   *         than the length of the array
4505    * @since 1.5
4506    */
4507   public static int codePointBefore(char[] chars, int index)
4508   {
4509     return codePointBefore(chars, index, 1);
4510   }
4511
4512   /**
4513    * Get the code point before the specified index.  This is like
4514    * #codePointAt(char[], int), but checks the characters at
4515    * <code>index-1</code> and <code>index-2</code> to see if they form
4516    * a supplementary code point.  If they do not, the character at
4517    * <code>index-1</code> is returned.  The start parameter is used to
4518    * limit the range of the array which may be examined.
4519    *
4520    * @param chars the character array
4521    * @param index the index just past the codepoint to get, starting at 0
4522    * @param start the index before which characters should not be examined
4523    * @return the codepoint at the specified index
4524    * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
4525    * the length of the array, or if limit is negative or &gt;= the
4526    * length of the array
4527    * @since 1.5
4528    */
4529   public static int codePointBefore(char[] chars, int index, int start)
4530   {
4531     if (index < start || index > chars.length
4532         || start < 0 || start >= chars.length)
4533       throw new IndexOutOfBoundsException();
4534     --index;
4535     char low = chars[index];
4536     if (! isLowSurrogate(low) || --index < start)
4537       return low;
4538     char high = chars[index];
4539     if (! isHighSurrogate(high))
4540       return low;
4541     return toCodePoint(high, low);
4542   }
4543
4544   /**
4545    * Get the code point before the specified index.  This is like
4546    * #codePointAt(CharSequence, int), but checks the characters at
4547    * <code>index-1</code> and <code>index-2</code> to see if they form
4548    * a supplementary code point.  If they do not, the character at
4549    * <code>index-1</code> is returned.
4550    *
4551    * @param sequence the CharSequence
4552    * @param index the index just past the codepoint to get, starting at 0
4553    * @return the codepoint at the specified index
   * @throws IndexOutOfBoundsException if index is less than 1 or greater
   *         than length()
4555    * @since 1.5
4556    */
4557   public static int codePointBefore(CharSequence sequence, int index)
4558   {
4559     int len = sequence.length();
4560     if (index < 1 || index > len)
4561       throw new IndexOutOfBoundsException();
4562     --index;
4563     char low = sequence.charAt(index);
4564     if (! isLowSurrogate(low) || --index < 0)
4565       return low;
4566     char high = sequence.charAt(index);
4567     if (! isHighSurrogate(high))
4568       return low;
4569     return toCodePoint(high, low);
4570   }
4571 } // class Character