Dead
[official-gcc.git] / gomp-20050608-branch / libjava / classpath / java / lang / Character.java
blob3c88ff805c7ac77da368ec5452b3c4fbd476ac34
1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
2 Copyright (C) 1998, 1999, 2001, 2002, 2005 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package java.lang;
41 import gnu.java.lang.CharData;
43 import java.io.Serializable;
45 /**
46 * Wrapper class for the primitive char data type. In addition, this class
47 * allows one to retrieve property information and perform transformations
48 * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0.
49 * java.lang.Character is designed to be very dynamic, and as such, it
50 * retrieves information on the Unicode character set from a separate
51 * database, gnu.java.lang.CharData, which can be easily upgraded.
53 * <p>For predicates, boundaries are used to describe
54 * the set of characters for which the method will return true.
55 * This syntax uses fairly normal regular expression notation.
56 * See 5.13 of the Unicode Standard, Version 3.0, for the
57 * boundary specification.
59 * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
60 * for more information on the Unicode Standard.
62 * @author Tom Tromey (tromey@cygnus.com)
63 * @author Paul N. Fisher
64 * @author Jochen Hoenicke
65 * @author Eric Blake (ebb9@email.byu.edu)
66 * @see CharData
67 * @since 1.0
68 * @status updated to 1.4
70 public final class Character implements Serializable, Comparable
72 /**
73 * A subset of Unicode blocks.
75 * @author Paul N. Fisher
76 * @author Eric Blake (ebb9@email.byu.edu)
77 * @since 1.2
79 public static class Subset
81 /** The name of the subset. */
82 private final String name;
84 /**
85 * Construct a new subset of characters.
87 * @param name the name of the subset
88 * @throws NullPointerException if name is null
90 protected Subset(String name)
92 // Note that name.toString() is name, unless name was null.
93 this.name = name.toString();
96 /**
97 * Compares two Subsets for equality. This is <code>final</code>, and
98 * restricts the comparison on the <code>==</code> operator, so it returns
99 * true only for the same object.
101 * @param o the object to compare
102 * @return true if o is this
104 public final boolean equals(Object o)
106 return o == this;
110 * Makes the original hashCode of Object final, to be consistent with
111 * equals.
113 * @return the hash code for this object
115 public final int hashCode()
117 return super.hashCode();
121 * Returns the name of the subset.
123 * @return the name
125 public final String toString()
127 return name;
129 } // class Subset
132 * A family of character subsets in the Unicode specification. A character
133 * is in at most one of these blocks.
135 * This inner class was generated automatically from
136 * <code>doc/unicode/Block-3.txt</code>, by some perl scripts.
137 * This Unicode definition file can be found on the
138 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
139 * JDK 1.4 uses Unicode version 3.0.0.
141 * @author scripts/unicode-blocks.pl (written by Eric Blake)
142 * @since 1.2
144 public static final class UnicodeBlock extends Subset
146 /** The start of the subset. */
147 private final char start;
149 /** The end of the subset. */
150 private final char end;
153 * Constructor for strictly defined blocks.
155 * @param start the start character of the range
156 * @param end the end character of the range
157 * @param name the block name
159 private UnicodeBlock(char start, char end, String name)
161 super(name);
162 this.start = start;
163 this.end = end;
167 * Returns the Unicode character block which a character belongs to.
169 * @param ch the character to look up
170 * @return the set it belongs to, or null if it is not in one
172 public static UnicodeBlock of(char ch)
174 // Special case, since SPECIALS contains two ranges.
175 if (ch == '\uFEFF')
176 return SPECIALS;
177 // Simple binary search for the correct block.
178 int low = 0;
179 int hi = sets.length - 1;
180 while (low <= hi)
182 int mid = (low + hi) >> 1;
183 UnicodeBlock b = sets[mid];
184 if (ch < b.start)
185 hi = mid - 1;
186 else if (ch > b.end)
187 low = mid + 1;
188 else
189 return b;
191 return null;
195 * Basic Latin.
196 * '\u0000' - '\u007F'.
198 public static final UnicodeBlock BASIC_LATIN
199 = new UnicodeBlock('\u0000', '\u007F',
200 "BASIC_LATIN");
203 * Latin-1 Supplement.
204 * '\u0080' - '\u00FF'.
206 public static final UnicodeBlock LATIN_1_SUPPLEMENT
207 = new UnicodeBlock('\u0080', '\u00FF',
208 "LATIN_1_SUPPLEMENT");
211 * Latin Extended-A.
212 * '\u0100' - '\u017F'.
214 public static final UnicodeBlock LATIN_EXTENDED_A
215 = new UnicodeBlock('\u0100', '\u017F',
216 "LATIN_EXTENDED_A");
219 * Latin Extended-B.
220 * '\u0180' - '\u024F'.
222 public static final UnicodeBlock LATIN_EXTENDED_B
223 = new UnicodeBlock('\u0180', '\u024F',
224 "LATIN_EXTENDED_B");
227 * IPA Extensions.
228 * '\u0250' - '\u02AF'.
230 public static final UnicodeBlock IPA_EXTENSIONS
231 = new UnicodeBlock('\u0250', '\u02AF',
232 "IPA_EXTENSIONS");
235 * Spacing Modifier Letters.
236 * '\u02B0' - '\u02FF'.
238 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
239 = new UnicodeBlock('\u02B0', '\u02FF',
240 "SPACING_MODIFIER_LETTERS");
243 * Combining Diacritical Marks.
244 * '\u0300' - '\u036F'.
246 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
247 = new UnicodeBlock('\u0300', '\u036F',
248 "COMBINING_DIACRITICAL_MARKS");
251 * Greek.
252 * '\u0370' - '\u03FF'.
254 public static final UnicodeBlock GREEK
255 = new UnicodeBlock('\u0370', '\u03FF',
256 "GREEK");
259 * Cyrillic.
260 * '\u0400' - '\u04FF'.
262 public static final UnicodeBlock CYRILLIC
263 = new UnicodeBlock('\u0400', '\u04FF',
264 "CYRILLIC");
267 * Armenian.
268 * '\u0530' - '\u058F'.
270 public static final UnicodeBlock ARMENIAN
271 = new UnicodeBlock('\u0530', '\u058F',
272 "ARMENIAN");
275 * Hebrew.
276 * '\u0590' - '\u05FF'.
278 public static final UnicodeBlock HEBREW
279 = new UnicodeBlock('\u0590', '\u05FF',
280 "HEBREW");
283 * Arabic.
284 * '\u0600' - '\u06FF'.
286 public static final UnicodeBlock ARABIC
287 = new UnicodeBlock('\u0600', '\u06FF',
288 "ARABIC");
291 * Syriac.
292 * '\u0700' - '\u074F'.
293 * @since 1.4
295 public static final UnicodeBlock SYRIAC
296 = new UnicodeBlock('\u0700', '\u074F',
297 "SYRIAC");
300 * Thaana.
301 * '\u0780' - '\u07BF'.
302 * @since 1.4
304 public static final UnicodeBlock THAANA
305 = new UnicodeBlock('\u0780', '\u07BF',
306 "THAANA");
309 * Devanagari.
310 * '\u0900' - '\u097F'.
312 public static final UnicodeBlock DEVANAGARI
313 = new UnicodeBlock('\u0900', '\u097F',
314 "DEVANAGARI");
317 * Bengali.
318 * '\u0980' - '\u09FF'.
320 public static final UnicodeBlock BENGALI
321 = new UnicodeBlock('\u0980', '\u09FF',
322 "BENGALI");
325 * Gurmukhi.
326 * '\u0A00' - '\u0A7F'.
328 public static final UnicodeBlock GURMUKHI
329 = new UnicodeBlock('\u0A00', '\u0A7F',
330 "GURMUKHI");
333 * Gujarati.
334 * '\u0A80' - '\u0AFF'.
336 public static final UnicodeBlock GUJARATI
337 = new UnicodeBlock('\u0A80', '\u0AFF',
338 "GUJARATI");
341 * Oriya.
342 * '\u0B00' - '\u0B7F'.
344 public static final UnicodeBlock ORIYA
345 = new UnicodeBlock('\u0B00', '\u0B7F',
346 "ORIYA");
349 * Tamil.
350 * '\u0B80' - '\u0BFF'.
352 public static final UnicodeBlock TAMIL
353 = new UnicodeBlock('\u0B80', '\u0BFF',
354 "TAMIL");
357 * Telugu.
358 * '\u0C00' - '\u0C7F'.
360 public static final UnicodeBlock TELUGU
361 = new UnicodeBlock('\u0C00', '\u0C7F',
362 "TELUGU");
365 * Kannada.
366 * '\u0C80' - '\u0CFF'.
368 public static final UnicodeBlock KANNADA
369 = new UnicodeBlock('\u0C80', '\u0CFF',
370 "KANNADA");
373 * Malayalam.
374 * '\u0D00' - '\u0D7F'.
376 public static final UnicodeBlock MALAYALAM
377 = new UnicodeBlock('\u0D00', '\u0D7F',
378 "MALAYALAM");
381 * Sinhala.
382 * '\u0D80' - '\u0DFF'.
383 * @since 1.4
385 public static final UnicodeBlock SINHALA
386 = new UnicodeBlock('\u0D80', '\u0DFF',
387 "SINHALA");
390 * Thai.
391 * '\u0E00' - '\u0E7F'.
393 public static final UnicodeBlock THAI
394 = new UnicodeBlock('\u0E00', '\u0E7F',
395 "THAI");
398 * Lao.
399 * '\u0E80' - '\u0EFF'.
401 public static final UnicodeBlock LAO
402 = new UnicodeBlock('\u0E80', '\u0EFF',
403 "LAO");
406 * Tibetan.
407 * '\u0F00' - '\u0FFF'.
409 public static final UnicodeBlock TIBETAN
410 = new UnicodeBlock('\u0F00', '\u0FFF',
411 "TIBETAN");
414 * Myanmar.
415 * '\u1000' - '\u109F'.
416 * @since 1.4
418 public static final UnicodeBlock MYANMAR
419 = new UnicodeBlock('\u1000', '\u109F',
420 "MYANMAR");
423 * Georgian.
424 * '\u10A0' - '\u10FF'.
426 public static final UnicodeBlock GEORGIAN
427 = new UnicodeBlock('\u10A0', '\u10FF',
428 "GEORGIAN");
431 * Hangul Jamo.
432 * '\u1100' - '\u11FF'.
434 public static final UnicodeBlock HANGUL_JAMO
435 = new UnicodeBlock('\u1100', '\u11FF',
436 "HANGUL_JAMO");
439 * Ethiopic.
440 * '\u1200' - '\u137F'.
441 * @since 1.4
443 public static final UnicodeBlock ETHIOPIC
444 = new UnicodeBlock('\u1200', '\u137F',
445 "ETHIOPIC");
448 * Cherokee.
449 * '\u13A0' - '\u13FF'.
450 * @since 1.4
452 public static final UnicodeBlock CHEROKEE
453 = new UnicodeBlock('\u13A0', '\u13FF',
454 "CHEROKEE");
457 * Unified Canadian Aboriginal Syllabics.
458 * '\u1400' - '\u167F'.
459 * @since 1.4
461 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
462 = new UnicodeBlock('\u1400', '\u167F',
463 "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS");
466 * Ogham.
467 * '\u1680' - '\u169F'.
468 * @since 1.4
470 public static final UnicodeBlock OGHAM
471 = new UnicodeBlock('\u1680', '\u169F',
472 "OGHAM");
475 * Runic.
476 * '\u16A0' - '\u16FF'.
477 * @since 1.4
479 public static final UnicodeBlock RUNIC
480 = new UnicodeBlock('\u16A0', '\u16FF',
481 "RUNIC");
484 * Khmer.
485 * '\u1780' - '\u17FF'.
486 * @since 1.4
488 public static final UnicodeBlock KHMER
489 = new UnicodeBlock('\u1780', '\u17FF',
490 "KHMER");
493 * Mongolian.
494 * '\u1800' - '\u18AF'.
495 * @since 1.4
497 public static final UnicodeBlock MONGOLIAN
498 = new UnicodeBlock('\u1800', '\u18AF',
499 "MONGOLIAN");
502 * Latin Extended Additional.
503 * '\u1E00' - '\u1EFF'.
505 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
506 = new UnicodeBlock('\u1E00', '\u1EFF',
507 "LATIN_EXTENDED_ADDITIONAL");
510 * Greek Extended.
511 * '\u1F00' - '\u1FFF'.
513 public static final UnicodeBlock GREEK_EXTENDED
514 = new UnicodeBlock('\u1F00', '\u1FFF',
515 "GREEK_EXTENDED");
518 * General Punctuation.
519 * '\u2000' - '\u206F'.
521 public static final UnicodeBlock GENERAL_PUNCTUATION
522 = new UnicodeBlock('\u2000', '\u206F',
523 "GENERAL_PUNCTUATION");
526 * Superscripts and Subscripts.
527 * '\u2070' - '\u209F'.
529 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
530 = new UnicodeBlock('\u2070', '\u209F',
531 "SUPERSCRIPTS_AND_SUBSCRIPTS");
534 * Currency Symbols.
535 * '\u20A0' - '\u20CF'.
537 public static final UnicodeBlock CURRENCY_SYMBOLS
538 = new UnicodeBlock('\u20A0', '\u20CF',
539 "CURRENCY_SYMBOLS");
542 * Combining Marks for Symbols.
543 * '\u20D0' - '\u20FF'.
545 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
546 = new UnicodeBlock('\u20D0', '\u20FF',
547 "COMBINING_MARKS_FOR_SYMBOLS");
550 * Letterlike Symbols.
551 * '\u2100' - '\u214F'.
553 public static final UnicodeBlock LETTERLIKE_SYMBOLS
554 = new UnicodeBlock('\u2100', '\u214F',
555 "LETTERLIKE_SYMBOLS");
558 * Number Forms.
559 * '\u2150' - '\u218F'.
561 public static final UnicodeBlock NUMBER_FORMS
562 = new UnicodeBlock('\u2150', '\u218F',
563 "NUMBER_FORMS");
566 * Arrows.
567 * '\u2190' - '\u21FF'.
569 public static final UnicodeBlock ARROWS
570 = new UnicodeBlock('\u2190', '\u21FF',
571 "ARROWS");
574 * Mathematical Operators.
575 * '\u2200' - '\u22FF'.
577 public static final UnicodeBlock MATHEMATICAL_OPERATORS
578 = new UnicodeBlock('\u2200', '\u22FF',
579 "MATHEMATICAL_OPERATORS");
582 * Miscellaneous Technical.
583 * '\u2300' - '\u23FF'.
585 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
586 = new UnicodeBlock('\u2300', '\u23FF',
587 "MISCELLANEOUS_TECHNICAL");
590 * Control Pictures.
591 * '\u2400' - '\u243F'.
593 public static final UnicodeBlock CONTROL_PICTURES
594 = new UnicodeBlock('\u2400', '\u243F',
595 "CONTROL_PICTURES");
598 * Optical Character Recognition.
599 * '\u2440' - '\u245F'.
601 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
602 = new UnicodeBlock('\u2440', '\u245F',
603 "OPTICAL_CHARACTER_RECOGNITION");
606 * Enclosed Alphanumerics.
607 * '\u2460' - '\u24FF'.
609 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
610 = new UnicodeBlock('\u2460', '\u24FF',
611 "ENCLOSED_ALPHANUMERICS");
614 * Box Drawing.
615 * '\u2500' - '\u257F'.
617 public static final UnicodeBlock BOX_DRAWING
618 = new UnicodeBlock('\u2500', '\u257F',
619 "BOX_DRAWING");
622 * Block Elements.
623 * '\u2580' - '\u259F'.
625 public static final UnicodeBlock BLOCK_ELEMENTS
626 = new UnicodeBlock('\u2580', '\u259F',
627 "BLOCK_ELEMENTS");
630 * Geometric Shapes.
631 * '\u25A0' - '\u25FF'.
633 public static final UnicodeBlock GEOMETRIC_SHAPES
634 = new UnicodeBlock('\u25A0', '\u25FF',
635 "GEOMETRIC_SHAPES");
638 * Miscellaneous Symbols.
639 * '\u2600' - '\u26FF'.
641 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
642 = new UnicodeBlock('\u2600', '\u26FF',
643 "MISCELLANEOUS_SYMBOLS");
646 * Dingbats.
647 * '\u2700' - '\u27BF'.
649 public static final UnicodeBlock DINGBATS
650 = new UnicodeBlock('\u2700', '\u27BF',
651 "DINGBATS");
654 * Braille Patterns.
655 * '\u2800' - '\u28FF'.
656 * @since 1.4
658 public static final UnicodeBlock BRAILLE_PATTERNS
659 = new UnicodeBlock('\u2800', '\u28FF',
660 "BRAILLE_PATTERNS");
663 * CJK Radicals Supplement.
664 * '\u2E80' - '\u2EFF'.
665 * @since 1.4
667 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
668 = new UnicodeBlock('\u2E80', '\u2EFF',
669 "CJK_RADICALS_SUPPLEMENT");
672 * Kangxi Radicals.
673 * '\u2F00' - '\u2FDF'.
674 * @since 1.4
676 public static final UnicodeBlock KANGXI_RADICALS
677 = new UnicodeBlock('\u2F00', '\u2FDF',
678 "KANGXI_RADICALS");
681 * Ideographic Description Characters.
682 * '\u2FF0' - '\u2FFF'.
683 * @since 1.4
685 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
686 = new UnicodeBlock('\u2FF0', '\u2FFF',
687 "IDEOGRAPHIC_DESCRIPTION_CHARACTERS");
690 * CJK Symbols and Punctuation.
691 * '\u3000' - '\u303F'.
693 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
694 = new UnicodeBlock('\u3000', '\u303F',
695 "CJK_SYMBOLS_AND_PUNCTUATION");
698 * Hiragana.
699 * '\u3040' - '\u309F'.
701 public static final UnicodeBlock HIRAGANA
702 = new UnicodeBlock('\u3040', '\u309F',
703 "HIRAGANA");
706 * Katakana.
707 * '\u30A0' - '\u30FF'.
709 public static final UnicodeBlock KATAKANA
710 = new UnicodeBlock('\u30A0', '\u30FF',
711 "KATAKANA");
714 * Bopomofo.
715 * '\u3100' - '\u312F'.
717 public static final UnicodeBlock BOPOMOFO
718 = new UnicodeBlock('\u3100', '\u312F',
719 "BOPOMOFO");
722 * Hangul Compatibility Jamo.
723 * '\u3130' - '\u318F'.
725 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
726 = new UnicodeBlock('\u3130', '\u318F',
727 "HANGUL_COMPATIBILITY_JAMO");
730 * Kanbun.
731 * '\u3190' - '\u319F'.
733 public static final UnicodeBlock KANBUN
734 = new UnicodeBlock('\u3190', '\u319F',
735 "KANBUN");
738 * Bopomofo Extended.
739 * '\u31A0' - '\u31BF'.
740 * @since 1.4
742 public static final UnicodeBlock BOPOMOFO_EXTENDED
743 = new UnicodeBlock('\u31A0', '\u31BF',
744 "BOPOMOFO_EXTENDED");
747 * Enclosed CJK Letters and Months.
748 * '\u3200' - '\u32FF'.
750 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
751 = new UnicodeBlock('\u3200', '\u32FF',
752 "ENCLOSED_CJK_LETTERS_AND_MONTHS");
755 * CJK Compatibility.
756 * '\u3300' - '\u33FF'.
758 public static final UnicodeBlock CJK_COMPATIBILITY
759 = new UnicodeBlock('\u3300', '\u33FF',
760 "CJK_COMPATIBILITY");
763 * CJK Unified Ideographs Extension A.
764 * '\u3400' - '\u4DB5'.
765 * @since 1.4
767 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
768 = new UnicodeBlock('\u3400', '\u4DB5',
769 "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A");
772 * CJK Unified Ideographs.
773 * '\u4E00' - '\u9FFF'.
775 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
776 = new UnicodeBlock('\u4E00', '\u9FFF',
777 "CJK_UNIFIED_IDEOGRAPHS");
780 * Yi Syllables.
781 * '\uA000' - '\uA48F'.
782 * @since 1.4
784 public static final UnicodeBlock YI_SYLLABLES
785 = new UnicodeBlock('\uA000', '\uA48F',
786 "YI_SYLLABLES");
789 * Yi Radicals.
790 * '\uA490' - '\uA4CF'.
791 * @since 1.4
793 public static final UnicodeBlock YI_RADICALS
794 = new UnicodeBlock('\uA490', '\uA4CF',
795 "YI_RADICALS");
798 * Hangul Syllables.
799 * '\uAC00' - '\uD7A3'.
801 public static final UnicodeBlock HANGUL_SYLLABLES
802 = new UnicodeBlock('\uAC00', '\uD7A3',
803 "HANGUL_SYLLABLES");
806 * Surrogates Area.
807 * '\uD800' - '\uDFFF'.
809 public static final UnicodeBlock SURROGATES_AREA
810 = new UnicodeBlock('\uD800', '\uDFFF',
811 "SURROGATES_AREA");
814 * Private Use Area.
815 * '\uE000' - '\uF8FF'.
817 public static final UnicodeBlock PRIVATE_USE_AREA
818 = new UnicodeBlock('\uE000', '\uF8FF',
819 "PRIVATE_USE_AREA");
822 * CJK Compatibility Ideographs.
823 * '\uF900' - '\uFAFF'.
825 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
826 = new UnicodeBlock('\uF900', '\uFAFF',
827 "CJK_COMPATIBILITY_IDEOGRAPHS");
830 * Alphabetic Presentation Forms.
831 * '\uFB00' - '\uFB4F'.
833 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
834 = new UnicodeBlock('\uFB00', '\uFB4F',
835 "ALPHABETIC_PRESENTATION_FORMS");
838 * Arabic Presentation Forms-A.
839 * '\uFB50' - '\uFDFF'.
841 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
842 = new UnicodeBlock('\uFB50', '\uFDFF',
843 "ARABIC_PRESENTATION_FORMS_A");
846 * Combining Half Marks.
847 * '\uFE20' - '\uFE2F'.
849 public static final UnicodeBlock COMBINING_HALF_MARKS
850 = new UnicodeBlock('\uFE20', '\uFE2F',
851 "COMBINING_HALF_MARKS");
854 * CJK Compatibility Forms.
855 * '\uFE30' - '\uFE4F'.
857 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
858 = new UnicodeBlock('\uFE30', '\uFE4F',
859 "CJK_COMPATIBILITY_FORMS");
862 * Small Form Variants.
863 * '\uFE50' - '\uFE6F'.
865 public static final UnicodeBlock SMALL_FORM_VARIANTS
866 = new UnicodeBlock('\uFE50', '\uFE6F',
867 "SMALL_FORM_VARIANTS");
870 * Arabic Presentation Forms-B.
871 * '\uFE70' - '\uFEFE'.
873 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
874 = new UnicodeBlock('\uFE70', '\uFEFE',
875 "ARABIC_PRESENTATION_FORMS_B");
878 * Halfwidth and Fullwidth Forms.
879 * '\uFF00' - '\uFFEF'.
881 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
882 = new UnicodeBlock('\uFF00', '\uFFEF',
883 "HALFWIDTH_AND_FULLWIDTH_FORMS");
886 * Specials.
887 * '\uFEFF', '\uFFF0' - '\uFFFD'.
889 public static final UnicodeBlock SPECIALS
890 = new UnicodeBlock('\uFFF0', '\uFFFD',
891 "SPECIALS");
894 * The defined subsets.
896 private static final UnicodeBlock sets[] = {
897 BASIC_LATIN,
898 LATIN_1_SUPPLEMENT,
899 LATIN_EXTENDED_A,
900 LATIN_EXTENDED_B,
901 IPA_EXTENSIONS,
902 SPACING_MODIFIER_LETTERS,
903 COMBINING_DIACRITICAL_MARKS,
904 GREEK,
905 CYRILLIC,
906 ARMENIAN,
907 HEBREW,
908 ARABIC,
909 SYRIAC,
910 THAANA,
911 DEVANAGARI,
912 BENGALI,
913 GURMUKHI,
914 GUJARATI,
915 ORIYA,
916 TAMIL,
917 TELUGU,
918 KANNADA,
919 MALAYALAM,
920 SINHALA,
921 THAI,
922 LAO,
923 TIBETAN,
924 MYANMAR,
925 GEORGIAN,
926 HANGUL_JAMO,
927 ETHIOPIC,
928 CHEROKEE,
929 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
930 OGHAM,
931 RUNIC,
932 KHMER,
933 MONGOLIAN,
934 LATIN_EXTENDED_ADDITIONAL,
935 GREEK_EXTENDED,
936 GENERAL_PUNCTUATION,
937 SUPERSCRIPTS_AND_SUBSCRIPTS,
938 CURRENCY_SYMBOLS,
939 COMBINING_MARKS_FOR_SYMBOLS,
940 LETTERLIKE_SYMBOLS,
941 NUMBER_FORMS,
942 ARROWS,
943 MATHEMATICAL_OPERATORS,
944 MISCELLANEOUS_TECHNICAL,
945 CONTROL_PICTURES,
946 OPTICAL_CHARACTER_RECOGNITION,
947 ENCLOSED_ALPHANUMERICS,
948 BOX_DRAWING,
949 BLOCK_ELEMENTS,
950 GEOMETRIC_SHAPES,
951 MISCELLANEOUS_SYMBOLS,
952 DINGBATS,
953 BRAILLE_PATTERNS,
954 CJK_RADICALS_SUPPLEMENT,
955 KANGXI_RADICALS,
956 IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
957 CJK_SYMBOLS_AND_PUNCTUATION,
958 HIRAGANA,
959 KATAKANA,
960 BOPOMOFO,
961 HANGUL_COMPATIBILITY_JAMO,
962 KANBUN,
963 BOPOMOFO_EXTENDED,
964 ENCLOSED_CJK_LETTERS_AND_MONTHS,
965 CJK_COMPATIBILITY,
966 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
967 CJK_UNIFIED_IDEOGRAPHS,
968 YI_SYLLABLES,
969 YI_RADICALS,
970 HANGUL_SYLLABLES,
971 SURROGATES_AREA,
972 PRIVATE_USE_AREA,
973 CJK_COMPATIBILITY_IDEOGRAPHS,
974 ALPHABETIC_PRESENTATION_FORMS,
975 ARABIC_PRESENTATION_FORMS_A,
976 COMBINING_HALF_MARKS,
977 CJK_COMPATIBILITY_FORMS,
978 SMALL_FORM_VARIANTS,
979 ARABIC_PRESENTATION_FORMS_B,
980 HALFWIDTH_AND_FULLWIDTH_FORMS,
981 SPECIALS,
983 } // class UnicodeBlock
986 * The immutable value of this Character.
988 * @serial the value of this Character
990 private final char value;
993 * Compatible with JDK 1.0+.
995 private static final long serialVersionUID = 3786198910865385080L;
998 * Smallest value allowed for radix arguments in Java. This value is 2.
1000 * @see #digit(char, int)
1001 * @see #forDigit(int, int)
1002 * @see Integer#toString(int, int)
1003 * @see Integer#valueOf(String)
1005 public static final int MIN_RADIX = 2;
1008 * Largest value allowed for radix arguments in Java. This value is 36.
1010 * @see #digit(char, int)
1011 * @see #forDigit(int, int)
1012 * @see Integer#toString(int, int)
1013 * @see Integer#valueOf(String)
1015 public static final int MAX_RADIX = 36;
1018 * The minimum value the char data type can hold.
1019 * This value is <code>'\\u0000'</code>.
1021 public static final char MIN_VALUE = '\u0000';
1024 * The maximum value the char data type can hold.
1025 * This value is <code>'\\uFFFF'</code>.
1027 public static final char MAX_VALUE = '\uFFFF';
1030 * Class object representing the primitive char data type.
1032 * @since 1.1
1034 public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');
1037 * The number of bits needed to represent a <code>char</code>.
1038 * @since 1.5
1040 public static final int SIZE = 16;
1042 // This caches some Character values, and is used by boxing
1043 // conversions via valueOf(). We must cache at least 0..127;
1044 // this constant controls how much we actually cache.
1045 private static final int MAX_CACHE = 127;
1046 private static Character[] charCache = new Character[MAX_CACHE + 1];
1049 * Lu = Letter, Uppercase (Informative).
1051 * @since 1.1
1053 public static final byte UPPERCASE_LETTER = 1;
1056 * Ll = Letter, Lowercase (Informative).
1058 * @since 1.1
1060 public static final byte LOWERCASE_LETTER = 2;
1063 * Lt = Letter, Titlecase (Informative).
1065 * @since 1.1
1067 public static final byte TITLECASE_LETTER = 3;
1070 * Mn = Mark, Non-Spacing (Normative).
1072 * @since 1.1
1074 public static final byte NON_SPACING_MARK = 6;
1077 * Mc = Mark, Spacing Combining (Normative).
1079 * @since 1.1
1081 public static final byte COMBINING_SPACING_MARK = 8;
1084 * Me = Mark, Enclosing (Normative).
1086 * @since 1.1
1088 public static final byte ENCLOSING_MARK = 7;
1091 * Nd = Number, Decimal Digit (Normative).
1093 * @since 1.1
1095 public static final byte DECIMAL_DIGIT_NUMBER = 9;
1098 * Nl = Number, Letter (Normative).
1100 * @since 1.1
1102 public static final byte LETTER_NUMBER = 10;
1105 * No = Number, Other (Normative).
1107 * @since 1.1
1109 public static final byte OTHER_NUMBER = 11;
1112 * Zs = Separator, Space (Normative).
1114 * @since 1.1
1116 public static final byte SPACE_SEPARATOR = 12;
1119 * Zl = Separator, Line (Normative).
1121 * @since 1.1
1123 public static final byte LINE_SEPARATOR = 13;
1126 * Zp = Separator, Paragraph (Normative).
1128 * @since 1.1
1130 public static final byte PARAGRAPH_SEPARATOR = 14;
1133 * Cc = Other, Control (Normative).
1135 * @since 1.1
1137 public static final byte CONTROL = 15;
1140 * Cf = Other, Format (Normative).
1142 * @since 1.1
1144 public static final byte FORMAT = 16;
1147 * Cs = Other, Surrogate (Normative).
1149 * @since 1.1
1151 public static final byte SURROGATE = 19;
1154 * Co = Other, Private Use (Normative).
1156 * @since 1.1
1158 public static final byte PRIVATE_USE = 18;
1161 * Cn = Other, Not Assigned (Normative).
1163 * @since 1.1
1165 public static final byte UNASSIGNED = 0;
1168 * Lm = Letter, Modifier (Informative).
1170 * @since 1.1
1172 public static final byte MODIFIER_LETTER = 4;
1175 * Lo = Letter, Other (Informative).
1177 * @since 1.1
1179 public static final byte OTHER_LETTER = 5;
1182 * Pc = Punctuation, Connector (Informative).
1184 * @since 1.1
1186 public static final byte CONNECTOR_PUNCTUATION = 23;
1189 * Pd = Punctuation, Dash (Informative).
1191 * @since 1.1
1193 public static final byte DASH_PUNCTUATION = 20;
1196 * Ps = Punctuation, Open (Informative).
1198 * @since 1.1
1200 public static final byte START_PUNCTUATION = 21;
1203 * Pe = Punctuation, Close (Informative).
1205 * @since 1.1
1207 public static final byte END_PUNCTUATION = 22;
1210 * Pi = Punctuation, Initial Quote (Informative).
1212 * @since 1.4
1214 public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
1217 * Pf = Punctuation, Final Quote (Informative).
1219 * @since 1.4
1221 public static final byte FINAL_QUOTE_PUNCTUATION = 30;
1224 * Po = Punctuation, Other (Informative).
1226 * @since 1.1
1228 public static final byte OTHER_PUNCTUATION = 24;
1231 * Sm = Symbol, Math (Informative).
1233 * @since 1.1
1235 public static final byte MATH_SYMBOL = 25;
1238 * Sc = Symbol, Currency (Informative).
1240 * @since 1.1
1242 public static final byte CURRENCY_SYMBOL = 26;
1245 * Sk = Symbol, Modifier (Informative).
1247 * @since 1.1
1249 public static final byte MODIFIER_SYMBOL = 27;
1252 * So = Symbol, Other (Informative).
1254 * @since 1.1
1256 public static final byte OTHER_SYMBOL = 28;
1259 * Undefined bidirectional character type. Undefined char values have
1260 * undefined directionality in the Unicode specification.
1262 * @since 1.4
1264 public static final byte DIRECTIONALITY_UNDEFINED = -1;
1267 * Strong bidirectional character type "L".
1269 * @since 1.4
1271 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
1274 * Strong bidirectional character type "R".
1276 * @since 1.4
1278 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
1281 * Strong bidirectional character type "AL".
1283 * @since 1.4
1285 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
1288 * Weak bidirectional character type "EN".
1290 * @since 1.4
1292 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
1295 * Weak bidirectional character type "ES".
1297 * @since 1.4
1299 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
1302 * Weak bidirectional character type "ET".
1304 * @since 1.4
1306 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
1309 * Weak bidirectional character type "AN".
1311 * @since 1.4
1313 public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
1316 * Weak bidirectional character type "CS".
1318 * @since 1.4
1320 public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
1323 * Weak bidirectional character type "NSM".
1325 * @since 1.4
1327 public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
1330 * Weak bidirectional character type "BN".
1332 * @since 1.4
1334 public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
1337 * Neutral bidirectional character type "B".
1339 * @since 1.4
1341 public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
1344 * Neutral bidirectional character type "S".
1346 * @since 1.4
1348 public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
1351 * Strong bidirectional character type "WS".
1353 * @since 1.4
1355 public static final byte DIRECTIONALITY_WHITESPACE = 12;
1358 * Neutral bidirectional character type "ON".
1360 * @since 1.4
1362 public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
1365 * Strong bidirectional character type "LRE".
1367 * @since 1.4
1369 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
1372 * Strong bidirectional character type "LRO".
1374 * @since 1.4
1376 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
1379 * Strong bidirectional character type "RLE".
1381 * @since 1.4
1383 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
1386 * Strong bidirectional character type "RLO".
1388 * @since 1.4
1390 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
1393 * Weak bidirectional character type "PDF".
1395 * @since 1.4
1397 public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
1400 * Stores unicode block offset lookup table. Exploit package visibility of
1401 * String.value to avoid copying the array.
1402 * @see #readChar(char)
1403 * @see CharData#BLOCKS
1405 private static final char[] blocks = String.zeroBasedStringValue(CharData.BLOCKS);
1408 * Stores unicode attribute offset lookup table. Exploit package visibility
1409 * of String.value to avoid copying the array.
1410 * @see CharData#DATA
1412 private static final char[] data = String.zeroBasedStringValue(CharData.DATA);
1415 * Stores unicode numeric value attribute table. Exploit package visibility
1416 * of String.value to avoid copying the array.
1417 * @see CharData#NUM_VALUE
1419 private static final char[] numValue
1420 = String.zeroBasedStringValue(CharData.NUM_VALUE);
1423 * Stores unicode uppercase attribute table. Exploit package visibility
1424 * of String.value to avoid copying the array.
1425 * @see CharData#UPPER
1427 private static final char[] upper = String.zeroBasedStringValue(CharData.UPPER);
1430 * Stores unicode lowercase attribute table. Exploit package visibility
1431 * of String.value to avoid copying the array.
1432 * @see CharData#LOWER
1434 private static final char[] lower = String.zeroBasedStringValue(CharData.LOWER);
1437 * Stores unicode direction attribute table. Exploit package visibility
1438 * of String.value to avoid copying the array.
1439 * @see CharData#DIRECTION
1441 // Package visible for use by String.
1442 static final char[] direction = String.zeroBasedStringValue(CharData.DIRECTION);
1445 * Stores unicode titlecase table. Exploit package visibility of
1446 * String.value to avoid copying the array.
1447 * @see CharData#TITLE
1449 private static final char[] title = String.zeroBasedStringValue(CharData.TITLE);
1452 * Mask for grabbing the type out of the contents of data.
1453 * @see CharData#DATA
1455 private static final int TYPE_MASK = 0x1F;
1458 * Mask for grabbing the non-breaking space flag out of the contents of
1459 * data.
1460 * @see CharData#DATA
1462 private static final int NO_BREAK_MASK = 0x20;
1465 * Mask for grabbing the mirrored directionality flag out of the contents
1466 * of data.
1467 * @see CharData#DATA
1469 private static final int MIRROR_MASK = 0x40;
1472 * Min value for supplementary code point.
1474 * @since 1.5
1476 public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
1479 * Min value for code point.
1481 * @since 1.5
1483 public static final int MIN_CODE_POINT = 0;
1487 * Max value for code point.
1489 * @since 1.5
1491 public static final int MAX_CODE_POINT = 0x010ffff;
1495 * Minimum high surrogate code in UTF-16 encoding.
1497 * @since 1.5
1499 public static final char MIN_HIGH_SURROGATE = '\ud800';
1502 * Maximum high surrogate code in UTF-16 encoding.
1504 * @since 1.5
1506 public static final char MAX_HIGH_SURROGATE = '\udbff';
1509 * Minimum low surrogate code in UTF-16 encoding.
1511 * @since 1.5
1513 public static final char MIN_LOW_SURROGATE = '\udc00';
1516 * Maximum low surrogate code in UTF-16 encoding.
1518 * @since 1.5
1520 public static final char MAX_LOW_SURROGATE = '\udfff';
1523 * Minimum surrogate code in UTF-16 encoding.
1525 * @since 1.5
1527 public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
1530 * Maximum low surrogate code in UTF-16 encoding.
1532 * @since 1.5
1534 public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
1537 * Grabs an attribute offset from the Unicode attribute database. The lower
1538 * 5 bits are the character type, the next 2 bits are flags, and the top
1539 * 9 bits are the offset into the attribute tables.
1541 * @param ch the character to look up
1542 * @return the character's attribute offset and type
1543 * @see #TYPE_MASK
1544 * @see #NO_BREAK_MASK
1545 * @see #MIRROR_MASK
1546 * @see CharData#DATA
1547 * @see CharData#SHIFT
1549 // Package visible for use in String.
1550 static char readChar(char ch)
1552 // Perform 16-bit addition to find the correct entry in data.
1553 return data[(char) (blocks[ch >> CharData.SHIFT] + ch)];
1557 * Wraps up a character.
1559 * @param value the character to wrap
public Character(char value)
{
  // Remember the primitive that this object wraps.
  this.value = value;
}
1567 * Returns the character which has been wrapped by this class.
1569 * @return the character wrapped
1571 public char charValue()
1573 return value;
1577 * Returns the numerical value (unsigned) of the wrapped character.
1578 * Range of returned values: 0x0000-0xFFFF.
1580 * @return the value of the wrapped character
1582 public int hashCode()
1584 return value;
1588 * Determines if an object is equal to this object. This is only true for
1589 * another Character object wrapping the same value.
1591 * @param o object to compare
1592 * @return true if o is a Character with the same value
1594 public boolean equals(Object o)
1596 return o instanceof Character && value == ((Character) o).value;
1600 * Converts the wrapped character into a String.
1602 * @return a String containing one character -- the wrapped character
1603 * of this instance
1605 public String toString()
1607 // Package constructor avoids an array copy.
1608 return new String(new char[] { value }, 0, 1, true);
1612 * Returns a String of length 1 representing the specified character.
1614 * @param ch the character to convert
1615 * @return a String containing the character
1616 * @since 1.4
1618 public static String toString(char ch)
1620 // Package constructor avoids an array copy.
1621 return new String(new char[] { ch }, 0, 1, true);
1625 * Determines if a character is a Unicode lowercase letter. For example,
1626 * <code>'a'</code> is lowercase.
1627 * <br>
1628 * lowercase = [Ll]
1630 * @param ch character to test
1631 * @return true if ch is a Unicode lowercase letter, else false
1632 * @see #isUpperCase(char)
1633 * @see #isTitleCase(char)
1634 * @see #toLowerCase(char)
1635 * @see #getType(char)
1637 public static boolean isLowerCase(char ch)
1639 return getType(ch) == LOWERCASE_LETTER;
1643 * Determines if a character is a Unicode uppercase letter. For example,
1644 * <code>'A'</code> is uppercase.
1645 * <br>
1646 * uppercase = [Lu]
1648 * @param ch character to test
1649 * @return true if ch is a Unicode uppercase letter, else false
1650 * @see #isLowerCase(char)
1651 * @see #isTitleCase(char)
1652 * @see #toUpperCase(char)
1653 * @see #getType(char)
1655 public static boolean isUpperCase(char ch)
1657 return getType(ch) == UPPERCASE_LETTER;
1661 * Determines if a character is a Unicode titlecase letter. For example,
1662 * the character "Lj" (Latin capital L with small letter j) is titlecase.
1663 * <br>
1664 * titlecase = [Lt]
1666 * @param ch character to test
1667 * @return true if ch is a Unicode titlecase letter, else false
1668 * @see #isLowerCase(char)
1669 * @see #isUpperCase(char)
1670 * @see #toTitleCase(char)
1671 * @see #getType(char)
1673 public static boolean isTitleCase(char ch)
1675 return getType(ch) == TITLECASE_LETTER;
1679 * Determines if a character is a Unicode decimal digit. For example,
1680 * <code>'0'</code> is a digit.
1681 * <br>
1682 * Unicode decimal digit = [Nd]
1684 * @param ch character to test
1685 * @return true if ch is a Unicode decimal digit, else false
1686 * @see #digit(char, int)
1687 * @see #forDigit(int, int)
1688 * @see #getType(char)
1690 public static boolean isDigit(char ch)
1692 return getType(ch) == DECIMAL_DIGIT_NUMBER;
1696 * Determines if a character is part of the Unicode Standard. This is an
1697 * evolving standard, but covers every character in the data file.
1698 * <br>
1699 * defined = not [Cn]
1701 * @param ch character to test
1702 * @return true if ch is a Unicode character, else false
1703 * @see #isDigit(char)
1704 * @see #isLetter(char)
1705 * @see #isLetterOrDigit(char)
1706 * @see #isLowerCase(char)
1707 * @see #isTitleCase(char)
1708 * @see #isUpperCase(char)
1710 public static boolean isDefined(char ch)
1712 return getType(ch) != UNASSIGNED;
1716 * Determines if a character is a Unicode letter. Not all letters have case,
1717 * so this may return true when isLowerCase and isUpperCase return false.
1718 * <br>
1719 * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
1721 * @param ch character to test
1722 * @return true if ch is a Unicode letter, else false
1723 * @see #isDigit(char)
1724 * @see #isJavaIdentifierStart(char)
1725 * @see #isJavaLetter(char)
1726 * @see #isJavaLetterOrDigit(char)
1727 * @see #isLetterOrDigit(char)
1728 * @see #isLowerCase(char)
1729 * @see #isTitleCase(char)
1730 * @see #isUnicodeIdentifierStart(char)
1731 * @see #isUpperCase(char)
1733 public static boolean isLetter(char ch)
1735 return ((1 << getType(ch))
1736 & ((1 << UPPERCASE_LETTER)
1737 | (1 << LOWERCASE_LETTER)
1738 | (1 << TITLECASE_LETTER)
1739 | (1 << MODIFIER_LETTER)
1740 | (1 << OTHER_LETTER))) != 0;
1744 * Determines if a character is a Unicode letter or a Unicode digit. This
1745 * is the combination of isLetter and isDigit.
1746 * <br>
1747 * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
1749 * @param ch character to test
1750 * @return true if ch is a Unicode letter or a Unicode digit, else false
1751 * @see #isDigit(char)
1752 * @see #isJavaIdentifierPart(char)
1753 * @see #isJavaLetter(char)
1754 * @see #isJavaLetterOrDigit(char)
1755 * @see #isLetter(char)
1756 * @see #isUnicodeIdentifierPart(char)
1758 public static boolean isLetterOrDigit(char ch)
1760 return ((1 << getType(ch))
1761 & ((1 << UPPERCASE_LETTER)
1762 | (1 << LOWERCASE_LETTER)
1763 | (1 << TITLECASE_LETTER)
1764 | (1 << MODIFIER_LETTER)
1765 | (1 << OTHER_LETTER)
1766 | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
1770 * Determines if a character can start a Java identifier. This is the
1771 * combination of isLetter, any character where getType returns
1772 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1773 * (like '_').
1775 * @param ch character to test
1776 * @return true if ch can start a Java identifier, else false
1777 * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
1778 * @see #isJavaLetterOrDigit(char)
1779 * @see #isJavaIdentifierStart(char)
1780 * @see #isJavaIdentifierPart(char)
1781 * @see #isLetter(char)
1782 * @see #isLetterOrDigit(char)
1783 * @see #isUnicodeIdentifierStart(char)
1785 public static boolean isJavaLetter(char ch)
1787 return isJavaIdentifierStart(ch);
1791 * Determines if a character can follow the first letter in
1792 * a Java identifier. This is the combination of isJavaLetter (isLetter,
1793 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1794 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1795 * or isIdentifierIgnorable.
1797 * @param ch character to test
1798 * @return true if ch can follow the first letter in a Java identifier
1799 * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
1800 * @see #isJavaLetter(char)
1801 * @see #isJavaIdentifierStart(char)
1802 * @see #isJavaIdentifierPart(char)
1803 * @see #isLetter(char)
1804 * @see #isLetterOrDigit(char)
1805 * @see #isUnicodeIdentifierPart(char)
1806 * @see #isIdentifierIgnorable(char)
1808 public static boolean isJavaLetterOrDigit(char ch)
1810 return isJavaIdentifierPart(ch);
1814 * Determines if a character can start a Java identifier. This is the
1815 * combination of isLetter, any character where getType returns
1816 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1817 * (like '_').
1818 * <br>
1819 * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
1821 * @param ch character to test
1822 * @return true if ch can start a Java identifier, else false
1823 * @see #isJavaIdentifierPart(char)
1824 * @see #isLetter(char)
1825 * @see #isUnicodeIdentifierStart(char)
1826 * @since 1.1
1828 public static boolean isJavaIdentifierStart(char ch)
1830 return ((1 << getType(ch))
1831 & ((1 << UPPERCASE_LETTER)
1832 | (1 << LOWERCASE_LETTER)
1833 | (1 << TITLECASE_LETTER)
1834 | (1 << MODIFIER_LETTER)
1835 | (1 << OTHER_LETTER)
1836 | (1 << LETTER_NUMBER)
1837 | (1 << CURRENCY_SYMBOL)
1838 | (1 << CONNECTOR_PUNCTUATION))) != 0;
1842 * Determines if a character can follow the first letter in
1843 * a Java identifier. This is the combination of isJavaLetter (isLetter,
1844 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1845 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1846 * or isIdentifierIgnorable.
1847 * <br>
1848 * Java identifier extender =
1849 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
1850 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1852 * @param ch character to test
1853 * @return true if ch can follow the first letter in a Java identifier
1854 * @see #isIdentifierIgnorable(char)
1855 * @see #isJavaIdentifierStart(char)
1856 * @see #isLetterOrDigit(char)
1857 * @see #isUnicodeIdentifierPart(char)
1858 * @since 1.1
1860 public static boolean isJavaIdentifierPart(char ch)
1862 int category = getType(ch);
1863 return ((1 << category)
1864 & ((1 << UPPERCASE_LETTER)
1865 | (1 << LOWERCASE_LETTER)
1866 | (1 << TITLECASE_LETTER)
1867 | (1 << MODIFIER_LETTER)
1868 | (1 << OTHER_LETTER)
1869 | (1 << NON_SPACING_MARK)
1870 | (1 << COMBINING_SPACING_MARK)
1871 | (1 << DECIMAL_DIGIT_NUMBER)
1872 | (1 << LETTER_NUMBER)
1873 | (1 << CURRENCY_SYMBOL)
1874 | (1 << CONNECTOR_PUNCTUATION)
1875 | (1 << FORMAT))) != 0
1876 || (category == CONTROL && isIdentifierIgnorable(ch));
1880 * Determines if a character can start a Unicode identifier. Only
1881 * letters can start a Unicode identifier, but this includes characters
1882 * in LETTER_NUMBER.
1883 * <br>
1884 * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
1886 * @param ch character to test
1887 * @return true if ch can start a Unicode identifier, else false
1888 * @see #isJavaIdentifierStart(char)
1889 * @see #isLetter(char)
1890 * @see #isUnicodeIdentifierPart(char)
1891 * @since 1.1
1893 public static boolean isUnicodeIdentifierStart(char ch)
1895 return ((1 << getType(ch))
1896 & ((1 << UPPERCASE_LETTER)
1897 | (1 << LOWERCASE_LETTER)
1898 | (1 << TITLECASE_LETTER)
1899 | (1 << MODIFIER_LETTER)
1900 | (1 << OTHER_LETTER)
1901 | (1 << LETTER_NUMBER))) != 0;
1905 * Determines if a character can follow the first letter in
1906 * a Unicode identifier. This includes letters, connecting punctuation,
1907 * digits, numeric letters, combining marks, non-spacing marks, and
1908 * isIdentifierIgnorable.
1909 * <br>
1910 * Unicode identifier extender =
1911 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]
1912 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1914 * @param ch character to test
1915 * @return true if ch can follow the first letter in a Unicode identifier
1916 * @see #isIdentifierIgnorable(char)
1917 * @see #isJavaIdentifierPart(char)
1918 * @see #isLetterOrDigit(char)
1919 * @see #isUnicodeIdentifierStart(char)
1920 * @since 1.1
1922 public static boolean isUnicodeIdentifierPart(char ch)
1924 int category = getType(ch);
1925 return ((1 << category)
1926 & ((1 << UPPERCASE_LETTER)
1927 | (1 << LOWERCASE_LETTER)
1928 | (1 << TITLECASE_LETTER)
1929 | (1 << MODIFIER_LETTER)
1930 | (1 << OTHER_LETTER)
1931 | (1 << NON_SPACING_MARK)
1932 | (1 << COMBINING_SPACING_MARK)
1933 | (1 << DECIMAL_DIGIT_NUMBER)
1934 | (1 << LETTER_NUMBER)
1935 | (1 << CONNECTOR_PUNCTUATION)
1936 | (1 << FORMAT))) != 0
1937 || (category == CONTROL && isIdentifierIgnorable(ch));
1941 * Determines if a character is ignorable in a Unicode identifier. This
1942 * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
1943 * through <code>'\u0008'</code>, <code>'\u000E'</code> through
1944 * <code>'\u001B'</code>, and <code>'\u007F'</code> through
1945 * <code>'\u009F'</code>), and FORMAT characters.
1946 * <br>
1947 * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
1948 * |U+007F-U+009F
1950 * @param ch character to test
1951 * @return true if ch is ignorable in a Unicode or Java identifier
1952 * @see #isJavaIdentifierPart(char)
1953 * @see #isUnicodeIdentifierPart(char)
1954 * @since 1.1
1956 public static boolean isIdentifierIgnorable(char ch)
1958 return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F'
1959 || (ch <= '\u001B' && ch >= '\u000E')))
1960 || getType(ch) == FORMAT;
1964 * Converts a Unicode character into its lowercase equivalent mapping.
1965 * If a mapping does not exist, then the character passed is returned.
1966 * Note that isLowerCase(toLowerCase(ch)) does not always return true.
1968 * @param ch character to convert to lowercase
1969 * @return lowercase mapping of ch, or ch if lowercase mapping does
1970 * not exist
1971 * @see #isLowerCase(char)
1972 * @see #isUpperCase(char)
1973 * @see #toTitleCase(char)
1974 * @see #toUpperCase(char)
1976 public static char toLowerCase(char ch)
1978 // Signedness doesn't matter, as result is cast back to char.
1979 return (char) (ch + lower[readChar(ch) >> 7]);
1983 * Converts a Unicode character into its uppercase equivalent mapping.
1984 * If a mapping does not exist, then the character passed is returned.
1985 * Note that isUpperCase(toUpperCase(ch)) does not always return true.
1987 * @param ch character to convert to uppercase
1988 * @return uppercase mapping of ch, or ch if uppercase mapping does
1989 * not exist
1990 * @see #isLowerCase(char)
1991 * @see #isUpperCase(char)
1992 * @see #toLowerCase(char)
1993 * @see #toTitleCase(char)
1995 public static char toUpperCase(char ch)
1997 // Signedness doesn't matter, as result is cast back to char.
1998 return (char) (ch + upper[readChar(ch) >> 7]);
2002 * Converts a Unicode character into its titlecase equivalent mapping.
2003 * If a mapping does not exist, then the character passed is returned.
2004 * Note that isTitleCase(toTitleCase(ch)) does not always return true.
2006 * @param ch character to convert to titlecase
2007 * @return titlecase mapping of ch, or ch if titlecase mapping does
2008 * not exist
2009 * @see #isTitleCase(char)
2010 * @see #toLowerCase(char)
2011 * @see #toUpperCase(char)
2013 public static char toTitleCase(char ch)
2015 // As title is short, it doesn't hurt to exhaustively iterate over it.
2016 for (int i = title.length - 2; i >= 0; i -= 2)
2017 if (title[i] == ch)
2018 return title[i + 1];
2019 return toUpperCase(ch);
2023 * Converts a character into a digit of the specified radix. If the radix
2024 * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
2025 * exceeds the radix, or if ch is not a decimal digit or in the case
2026 * insensitive set of 'a'-'z', the result is -1.
2027 * <br>
2028 * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
2029 * |U+FF21-U+FF3A|U+FF41-U+FF5A
2031 * @param ch character to convert into a digit
2032 * @param radix radix in which ch is a digit
2033 * @return digit which ch represents in radix, or -1 not a valid digit
2034 * @see #MIN_RADIX
2035 * @see #MAX_RADIX
2036 * @see #forDigit(int, int)
2037 * @see #isDigit(char)
2038 * @see #getNumericValue(char)
2040 public static int digit(char ch, int radix)
2042 if (radix < MIN_RADIX || radix > MAX_RADIX)
2043 return -1;
2044 char attr = readChar(ch);
2045 if (((1 << (attr & TYPE_MASK))
2046 & ((1 << UPPERCASE_LETTER)
2047 | (1 << LOWERCASE_LETTER)
2048 | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
2050 // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
2051 int digit = numValue[attr >> 7];
2052 return (digit < radix) ? digit : -1;
2054 return -1;
2058 * Returns the Unicode numeric value property of a character. For example,
2059 * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
2061 * <p>This method also returns values for the letters A through Z, (not
2062 * specified by Unicode), in these ranges: <code>'\u0041'</code>
2063 * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
2064 * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
2065 * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
2066 * <code>'\uFF5A'</code> (full width variants).
2068 * <p>If the character lacks a numeric value property, -1 is returned.
2069 * If the character has a numeric value property which is not representable
2070 * as a nonnegative integer, such as a fraction, -2 is returned.
2072 * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
2073 * |U+FF21-U+FF3A|U+FF41-U+FF5A
2075 * @param ch character from which the numeric value property will
2076 * be retrieved
2077 * @return the numeric value property of ch, or -1 if it does not exist, or
2078 * -2 if it is not representable as a nonnegative integer
2079 * @see #forDigit(int, int)
2080 * @see #digit(char, int)
2081 * @see #isDigit(char)
2082 * @since 1.1
2084 public static int getNumericValue(char ch)
2086 // Treat numValue as signed.
2087 return (short) numValue[readChar(ch) >> 7];
2091 * Determines if a character is a ISO-LATIN-1 space. This is only the five
2092 * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
2093 * <code>'\r'</code>, and <code>' '</code>.
2094 * <br>
2095 * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
2097 * @param ch character to test
2098 * @return true if ch is a space, else false
2099 * @deprecated Replaced by {@link #isWhitespace(char)}
2100 * @see #isSpaceChar(char)
2101 * @see #isWhitespace(char)
2103 public static boolean isSpace(char ch)
2105 // Performing the subtraction up front alleviates need to compare longs.
2106 return ch-- <= ' ' && ((1 << ch)
2107 & ((1 << (' ' - 1))
2108 | (1 << ('\t' - 1))
2109 | (1 << ('\n' - 1))
2110 | (1 << ('\r' - 1))
2111 | (1 << ('\f' - 1)))) != 0;
2115 * Determines if a character is a Unicode space character. This includes
2116 * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
2117 * <br>
2118 * Unicode space = [Zs]|[Zp]|[Zl]
2120 * @param ch character to test
2121 * @return true if ch is a Unicode space, else false
2122 * @see #isWhitespace(char)
2123 * @since 1.1
2125 public static boolean isSpaceChar(char ch)
2127 return ((1 << getType(ch))
2128 & ((1 << SPACE_SEPARATOR)
2129 | (1 << LINE_SEPARATOR)
2130 | (1 << PARAGRAPH_SEPARATOR))) != 0;
2134 * Determines if a character is Java whitespace. This includes Unicode
2135 * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
2136 * PARAGRAPH_SEPARATOR) except the non-breaking spaces
2137 * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
2138 * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
2139 * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
2140 * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
2141 * and <code>'\u001F'</code>.
2142 * <br>
2143 * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
2145 * @param ch character to test
2146 * @return true if ch is Java whitespace, else false
2147 * @see #isSpaceChar(char)
2148 * @since 1.1
2150 public static boolean isWhitespace(char ch)
2152 int attr = readChar(ch);
2153 return ((((1 << (attr & TYPE_MASK))
2154 & ((1 << SPACE_SEPARATOR)
2155 | (1 << LINE_SEPARATOR)
2156 | (1 << PARAGRAPH_SEPARATOR))) != 0)
2157 && (attr & NO_BREAK_MASK) == 0)
2158 || (ch <= '\u001F' && ((1 << ch)
2159 & ((1 << '\t')
2160 | (1 << '\n')
2161 | (1 << '\u000B')
2162 | (1 << '\u000C')
2163 | (1 << '\r')
2164 | (1 << '\u001C')
2165 | (1 << '\u001D')
2166 | (1 << '\u001E')
2167 | (1 << '\u001F'))) != 0);
2171 * Determines if a character has the ISO Control property.
2172 * <br>
2173 * ISO Control = [Cc]
2175 * @param ch character to test
2176 * @return true if ch is an ISO Control character, else false
2177 * @see #isSpaceChar(char)
2178 * @see #isWhitespace(char)
2179 * @since 1.1
2181 public static boolean isISOControl(char ch)
2183 return getType(ch) == CONTROL;
2187 * Returns the Unicode general category property of a character.
2189 * @param ch character from which the general category property will
2190 * be retrieved
2191 * @return the character category property of ch as an integer
2192 * @see #UNASSIGNED
2193 * @see #UPPERCASE_LETTER
2194 * @see #LOWERCASE_LETTER
2195 * @see #TITLECASE_LETTER
2196 * @see #MODIFIER_LETTER
2197 * @see #OTHER_LETTER
2198 * @see #NON_SPACING_MARK
2199 * @see #ENCLOSING_MARK
2200 * @see #COMBINING_SPACING_MARK
2201 * @see #DECIMAL_DIGIT_NUMBER
2202 * @see #LETTER_NUMBER
2203 * @see #OTHER_NUMBER
2204 * @see #SPACE_SEPARATOR
2205 * @see #LINE_SEPARATOR
2206 * @see #PARAGRAPH_SEPARATOR
2207 * @see #CONTROL
2208 * @see #FORMAT
2209 * @see #PRIVATE_USE
2210 * @see #SURROGATE
2211 * @see #DASH_PUNCTUATION
2212 * @see #START_PUNCTUATION
2213 * @see #END_PUNCTUATION
2214 * @see #CONNECTOR_PUNCTUATION
2215 * @see #OTHER_PUNCTUATION
2216 * @see #MATH_SYMBOL
2217 * @see #CURRENCY_SYMBOL
2218 * @see #MODIFIER_SYMBOL
2219 * @see #INITIAL_QUOTE_PUNCTUATION
2220 * @see #FINAL_QUOTE_PUNCTUATION
2221 * @since 1.1
2223 public static int getType(char ch)
2225 return readChar(ch) & TYPE_MASK;
2229 * Converts a digit into a character which represents that digit
2230 * in a specified radix. If the radix is outside MIN_RADIX..MAX_RADIX, or the
2231 * digit is negative or not less than the radix, the null character <code>'\0'</code>
2232 * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'.
2233 * <br>
2234 * return value boundary = U+0030-U+0039|U+0061-U+007A
2236 * @param digit digit to be converted into a character
2237 * @param radix radix of digit
2238 * @return character representing digit in radix, or '\0'
2239 * @see #MIN_RADIX
2240 * @see #MAX_RADIX
2241 * @see #digit(char, int)
2243 public static char forDigit(int digit, int radix)
2245 if (radix < MIN_RADIX || radix > MAX_RADIX
2246 || digit < 0 || digit >= radix)
2247 return '\0';
2248 return Number.digits[digit];
2252 * Returns the Unicode directionality property of the character. This
2253 * is used in the visual ordering of text.
2255 * @param ch the character to look up
2256 * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
2257 * @see #DIRECTIONALITY_UNDEFINED
2258 * @see #DIRECTIONALITY_LEFT_TO_RIGHT
2259 * @see #DIRECTIONALITY_RIGHT_TO_LEFT
2260 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
2261 * @see #DIRECTIONALITY_EUROPEAN_NUMBER
2262 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
2263 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
2264 * @see #DIRECTIONALITY_ARABIC_NUMBER
2265 * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
2266 * @see #DIRECTIONALITY_NONSPACING_MARK
2267 * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
2268 * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
2269 * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
2270 * @see #DIRECTIONALITY_WHITESPACE
2271 * @see #DIRECTIONALITY_OTHER_NEUTRALS
2272 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
2273 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
2274 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
2275 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
2276 * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
2277 * @since 1.4
2279 public static byte getDirectionality(char ch)
2281 // The result will correctly be signed.
2282 return (byte) (direction[readChar(ch) >> 7] >> 2);
2286 * Determines whether the character is mirrored according to Unicode. For
2287 * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
2288 * left-to-right text, but ')' in right-to-left text.
2290 * @param ch the character to look up
2291 * @return true if the character is mirrored
2292 * @since 1.4
2294 public static boolean isMirrored(char ch)
2296 return (readChar(ch) & MIRROR_MASK) != 0;
2300 * Compares another Character to this Character, numerically.
2302 * @param anotherCharacter Character to compare with this Character
2303 * @return a negative integer if this Character is less than
2304 * anotherCharacter, zero if this Character is equal, and
2305 * a positive integer if this Character is greater
2306 * @throws NullPointerException if anotherCharacter is null
2307 * @since 1.2
2309 public int compareTo(Character anotherCharacter)
2311 return value - anotherCharacter.value;
2315 * Compares an object to this Character. Assuming the object is a
2316 * Character object, this method performs the same comparison as
2317 * compareTo(Character).
2319 * @param o object to compare
2320 * @return the comparison value
2321 * @throws ClassCastException if o is not a Character object
2322 * @throws NullPointerException if o is null
2323 * @see #compareTo(Character)
2324 * @since 1.2
2326 public int compareTo(Object o)
2328 return compareTo((Character) o);
2332 * Returns a <code>Character</code> object wrapping the value.
2333 * In contrast to the <code>Character</code> constructor, this method
2334 * will cache some values. It is used by boxing conversion.
2336 * @param val the value to wrap
2337 * @return the <code>Character</code>
2339 * @since 1.5
2341 public static Character valueOf(char val)
2343 if (val > MAX_CACHE)
2344 return new Character(val);
2345 synchronized (charCache)
2347 if (charCache[val - MIN_VALUE] == null)
2348 charCache[val - MIN_VALUE] = new Character(val);
2349 return charCache[val - MIN_VALUE];
2354 * Reverse the bytes in val.
2355 * @since 1.5
2357 public static char reverseBytes(char val)
2359 return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
2363 * Converts a unicode code point to a UTF-16 representation of that
2364 * code point.
2366 * @param codePoint the unicode code point
2368 * @return the UTF-16 representation of that code point
2370 * @throws IllegalArgumentException if the code point is not a valid
2371 * unicode code point
2373 * @since 1.5
2375 public static char[] toChars(int codePoint)
2377 char[] result = new char[charCount(codePoint)];
2378 int ignore = toChars(codePoint, result, 0);
2379 return result;
2383 * Converts a unicode code point to its UTF-16 representation.
2385 * @param codePoint the unicode code point
2386 * @param dst the target char array
2387 * @param dstIndex the start index for the target
2389 * @return number of characters written to <code>dst</code>
2391 * @throws IllegalArgumentException if <code>codePoint</code> is not a
2392 * valid unicode code point
2393 * @throws NullPointerException if <code>dst</code> is <code>null</code>
2394 * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
2395 * in <code>dst</code> or if the UTF-16 representation does not
2396 * fit into <code>dst</code>
2398 * @since 1.5
2400 public static int toChars(int codePoint, char[] dst, int dstIndex)
2402 if (!isValidCodePoint(codePoint))
2404 throw new IllegalArgumentException("not a valid code point: "
2405 + codePoint);
2408 int result;
2409 if (isSupplementaryCodePoint(codePoint))
2411 // Write second char first to cause IndexOutOfBoundsException
2412 // immediately.
2413 final int cp2 = codePoint - 0x10000;
2414 dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
2415 dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
2416 result = 2;
2418 else
2420 dst[dstIndex] = (char) codePoint;
2421 result = 1;
2423 return result;
2427 * Return number of 16-bit characters required to represent the given
2428 * code point.
2430 * @param codePoint a unicode code point
2432 * @return 2 if codePoint >= 0x10000, 1 otherwise.
2434 * @since 1.5
2436 public static int charCount(int codePoint)
2438 return
2439 (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
2440 ? 2
2441 : 1;
2445 * Determines whether the specified code point is
2446 * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
2447 * supplementary character range.
2449 * @param codePoint a Unicode code point
2451 * @return <code>true</code> if code point is in supplementary range
2453 * @since 1.5
2455 public static boolean isSupplementaryCodePoint(int codePoint)
2457 return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
2458 && codePoint <= MAX_CODE_POINT;
2462 * Determines whether the specified code point is
2463 * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
2465 * @param codePoint a Unicode code point
2467 * @return <code>true</code> if code point is valid
2469 * @since 1.5
2471 public static boolean isValidCodePoint(int codePoint)
2473 return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
2477 * Return true if the given character is a high surrogate.
2478 * @param ch the character
2479 * @return true if the character is a high surrogate character
2481 * @since 1.5
2483 public static boolean isHighSurrogate(char ch)
2485 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
2489 * Return true if the given character is a low surrogate.
2490 * @param ch the character
2491 * @return true if the character is a low surrogate character
2493 * @since 1.5
2495 public static boolean isLowSurrogate(char ch)
2497 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
2501 * Return true if the given characters compose a surrogate pair.
2502 * This is true if the first character is a high surrogate and the
2503 * second character is a low surrogate.
2504 * @param ch1 the first character
2505 * @param ch2 the second character
2506 * @return true if the characters compose a surrogate pair
2508 * @since 1.5
2510 public static boolean isSurrogatePair(char ch1, char ch2)
2512 return isHighSurrogate(ch1) && isLowSurrogate(ch2);
2516 * Given a valid surrogate pair, this returns the corresponding
2517 * code point.
2518 * @param high the high character of the pair
2519 * @param low the low character of the pair
2520 * @return the corresponding code point
2522 * @since 1.5
2524 public static int toCodePoint(char high, char low)
2526 return ((high - MIN_HIGH_SURROGATE) * 0x400) +
2527 (low - MIN_LOW_SURROGATE) + 0x10000;
2531 * Get the code point at the specified index in the CharSequence.
2532 * This is like CharSequence#charAt(int), but if the character is
2533 * the start of a surrogate pair, and there is a following
2534 * character, and this character completes the pair, then the
2535 * corresponding supplementary code point is returned. Otherwise,
2536 * the character at the index is returned.
2538 * @param sequence the CharSequence
2539 * @param index the index of the codepoint to get, starting at 0
2540 * @return the codepoint at the specified index
2541 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2542 * @since 1.5
2544 public static int codePointAt(CharSequence sequence, int index)
2546 int len = sequence.length();
2547 if (index < 0 || index >= len)
2548 throw new IndexOutOfBoundsException();
2549 char high = sequence.charAt(index);
2550 if (! isHighSurrogate(high) || ++index >= len)
2551 return high;
2552 char low = sequence.charAt(index);
2553 if (! isLowSurrogate(low))
2554 return high;
2555 return toCodePoint(high, low);
2559 * Get the code point at the specified index in the CharSequence.
2560 * If the character is the start of a surrogate pair, and there is a
2561 * following character, and this character completes the pair, then
2562 * the corresponding supplementary code point is returned.
2563 * Otherwise, the character at the index is returned.
2565 * @param chars the character array in which to look
2566 * @param index the index of the codepoint to get, starting at 0
2567 * @return the codepoint at the specified index
2568 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2569 * @since 1.5
2571 public static int codePointAt(char[] chars, int index)
2573 return codePointAt(chars, index, chars.length);
2577 * Get the code point at the specified index in the CharSequence.
2578 * If the character is the start of a surrogate pair, and there is a
2579 * following character within the specified range, and this
2580 * character completes the pair, then the corresponding
2581 * supplementary code point is returned. Otherwise, the character
2582 * at the index is returned.
2584 * @param chars the character array in which to look
2585 * @param index the index of the codepoint to get, starting at 0
2586 * @param limit the limit past which characters should not be examined
2587 * @return the codepoint at the specified index
2588 * @throws IndexOutOfBoundsException if index is negative or &gt;=
2589 * limit, or if limit is negative or &gt;= the length of the array
2590 * @since 1.5
2592 public static int codePointAt(char[] chars, int index, int limit)
2594 if (index < 0 || index >= limit || limit < 0 || limit >= chars.length)
2595 throw new IndexOutOfBoundsException();
2596 char high = chars[index];
2597 if (! isHighSurrogate(high) || ++index >= limit)
2598 return high;
2599 char low = chars[index];
2600 if (! isLowSurrogate(low))
2601 return high;
2602 return toCodePoint(high, low);
2606 * Get the code point before the specified index. This is like
2607 * #codePointAt(char[], int), but checks the characters at
2608 * <code>index-1</code> and <code>index-2</code> to see if they form
2609 * a supplementary code point. If they do not, the character at
2610 * <code>index-1</code> is returned.
2612 * @param chars the character array
2613 * @param index the index just past the codepoint to get, starting at 0
2614 * @return the codepoint at the specified index
2615 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2616 * @since 1.5
2618 public static int codePointBefore(char[] chars, int index)
2620 return codePointBefore(chars, index, 1);
2624 * Get the code point before the specified index. This is like
2625 * #codePointAt(char[], int), but checks the characters at
2626 * <code>index-1</code> and <code>index-2</code> to see if they form
2627 * a supplementary code point. If they do not, the character at
2628 * <code>index-1</code> is returned. The start parameter is used to
2629 * limit the range of the array which may be examined.
2631 * @param chars the character array
2632 * @param index the index just past the codepoint to get, starting at 0
2633 * @param start the index before which characters should not be examined
2634 * @return the codepoint at the specified index
2635 * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
2636 * the length of the array, or if limit is negative or &gt;= the
2637 * length of the array
2638 * @since 1.5
2640 public static int codePointBefore(char[] chars, int index, int start)
2642 if (index < start || index > chars.length
2643 || start < 0 || start >= chars.length)
2644 throw new IndexOutOfBoundsException();
2645 --index;
2646 char low = chars[index];
2647 if (! isLowSurrogate(low) || --index < start)
2648 return low;
2649 char high = chars[index];
2650 if (! isHighSurrogate(high))
2651 return low;
2652 return toCodePoint(high, low);
2656 * Get the code point before the specified index. This is like
2657 * #codePointAt(CharSequence, int), but checks the characters at
2658 * <code>index-1</code> and <code>index-2</code> to see if they form
2659 * a supplementary code point. If they do not, the character at
2660 * <code>index-1</code> is returned.
2662 * @param sequence the CharSequence
2663 * @param index the index just past the codepoint to get, starting at 0
2664 * @return the codepoint at the specified index
2665 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2666 * @since 1.5
2668 public static int codePointBefore(CharSequence sequence, int index)
2670 int len = sequence.length();
2671 if (index < 1 || index > len)
2672 throw new IndexOutOfBoundsException();
2673 --index;
2674 char low = sequence.charAt(index);
2675 if (! isLowSurrogate(low) || --index < 0)
2676 return low;
2677 char high = sequence.charAt(index);
2678 if (! isHighSurrogate(high))
2679 return low;
2680 return toCodePoint(high, low);
2682 } // class Character