Merge from mainline
[official-gcc.git] / libjava / java / lang / Character.java
blobf56117fdb3108c3ea1e7983f5dcd2ffa72973826
1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
2 Copyright (C) 1998, 1999, 2001, 2002, 2005, 2006 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 * Note: This class must not be merged with Classpath. Gcj uses C-style
40 * arrays (see include/java-chartables.h) to store the Unicode character
41 * database, whereas Classpath uses Java objects (char[] extracted from
42 * String constants) in gnu.java.lang.CharData. Gcj's approach is more
43 * efficient, because there is no vtable or data relocation to worry about.
44 * However, despite the difference in the database interface, the two
45 * versions share identical algorithms.
48 package java.lang;
50 import java.io.Serializable;
52 /**
53 * Wrapper class for the primitive char data type. In addition, this class
54 * allows one to retrieve property information and perform transformations
55 * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0.
56 * java.lang.Character is designed to be very dynamic, and as such, it
57 * retrieves information on the Unicode character set from a separate
58 * database (for gcj, the C tables in include/java-chartables.h), which can be easily upgraded.
60 * <p>For predicates, boundaries are used to describe
61 * the set of characters for which the method will return true.
62 * This syntax uses fairly normal regular expression notation.
63 * See 5.13 of the Unicode Standard, Version 3.0, for the
64 * boundary specification.
66 * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
67 * for more information on the Unicode Standard.
69 * @author Tom Tromey (tromey@cygnus.com)
70 * @author Paul N. Fisher
71 * @author Jochen Hoenicke
72 * @author Eric Blake (ebb9@email.byu.edu)
73 * @since 1.0
74 * @status updated to 1.4
76 public final class Character implements Serializable, Comparable
78 /**
79 * A subset of Unicode blocks.
81 * @author Paul N. Fisher
82 * @author Eric Blake (ebb9@email.byu.edu)
83 * @since 1.2
85 public static class Subset
87 /** The name of the subset. */
88 private final String name;
90 /**
91 * Construct a new subset of characters.
93 * @param name the name of the subset
94 * @throws NullPointerException if name is null
96 protected Subset(String name)
98 // Note that name.toString() is name, unless name was null.
99 this.name = name.toString();
103 * Compares two Subsets for equality. This is <code>final</code>, and
104 * restricts the comparison on the <code>==</code> operator, so it returns
105 * true only for the same object.
107 * @param o the object to compare
108 * @return true if o is this
110 public final boolean equals(Object o)
112 return o == this;
116 * Makes the original hashCode of Object final, to be consistent with
117 * equals.
119 * @return the hash code for this object
121 public final int hashCode()
123 return super.hashCode();
127 * Returns the name of the subset.
129 * @return the name
131 public final String toString()
133 return name;
135 } // class Subset
138 * A family of character subsets in the Unicode specification. A character
139 * is in at most one of these blocks.
141 * This inner class was generated automatically from
142 * <code>libjava/gnu/gcj/convert/Blocks-3.txt</code>, by some perl scripts.
143 * This Unicode definition file can be found on the
144 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
145 * JDK 1.4 uses Unicode version 3.0.0.
147 * @author scripts/unicode-blocks.pl (written by Eric Blake)
148 * @since 1.2
150 public static final class UnicodeBlock extends Subset
152 /** The start of the subset. */
153 private final char start;
155 /** The end of the subset. */
156 private final char end;
/**
 * Constructor for strictly defined blocks, i.e. blocks described by a
 * single contiguous range of characters.
 *
 * @param start the start character of the range
 * @param end the end character of the range
 * @param name the block name, handed to the Subset constructor
 */
private UnicodeBlock(char start, char end, String name)
{
  super(name);
  this.end = end;
  this.start = start;
}
173 * Returns the Unicode character block which a character belongs to.
175 * @param ch the character to look up
176 * @return the set it belongs to, or null if it is not in one
178 public static UnicodeBlock of(char ch)
180 // Special case, since SPECIALS contains two ranges.
181 if (ch == '\uFEFF')
182 return SPECIALS;
183 // Simple binary search for the correct block.
184 int low = 0;
185 int hi = sets.length - 1;
186 while (low <= hi)
188 int mid = (low + hi) >> 1;
189 UnicodeBlock b = sets[mid];
190 if (ch < b.start)
191 hi = mid - 1;
192 else if (ch > b.end)
193 low = mid + 1;
194 else
195 return b;
197 return null;
201 * Basic Latin.
202 * '\u0000' - '\u007F'.
204 public static final UnicodeBlock BASIC_LATIN
205 = new UnicodeBlock('\u0000', '\u007F',
206 "BASIC_LATIN");
209 * Latin-1 Supplement.
210 * '\u0080' - '\u00FF'.
212 public static final UnicodeBlock LATIN_1_SUPPLEMENT
213 = new UnicodeBlock('\u0080', '\u00FF',
214 "LATIN_1_SUPPLEMENT");
217 * Latin Extended-A.
218 * '\u0100' - '\u017F'.
220 public static final UnicodeBlock LATIN_EXTENDED_A
221 = new UnicodeBlock('\u0100', '\u017F',
222 "LATIN_EXTENDED_A");
225 * Latin Extended-B.
226 * '\u0180' - '\u024F'.
228 public static final UnicodeBlock LATIN_EXTENDED_B
229 = new UnicodeBlock('\u0180', '\u024F',
230 "LATIN_EXTENDED_B");
233 * IPA Extensions.
234 * '\u0250' - '\u02AF'.
236 public static final UnicodeBlock IPA_EXTENSIONS
237 = new UnicodeBlock('\u0250', '\u02AF',
238 "IPA_EXTENSIONS");
241 * Spacing Modifier Letters.
242 * '\u02B0' - '\u02FF'.
244 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
245 = new UnicodeBlock('\u02B0', '\u02FF',
246 "SPACING_MODIFIER_LETTERS");
249 * Combining Diacritical Marks.
250 * '\u0300' - '\u036F'.
252 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
253 = new UnicodeBlock('\u0300', '\u036F',
254 "COMBINING_DIACRITICAL_MARKS");
257 * Greek.
258 * '\u0370' - '\u03FF'.
260 public static final UnicodeBlock GREEK
261 = new UnicodeBlock('\u0370', '\u03FF',
262 "GREEK");
265 * Cyrillic.
266 * '\u0400' - '\u04FF'.
268 public static final UnicodeBlock CYRILLIC
269 = new UnicodeBlock('\u0400', '\u04FF',
270 "CYRILLIC");
273 * Armenian.
274 * '\u0530' - '\u058F'.
276 public static final UnicodeBlock ARMENIAN
277 = new UnicodeBlock('\u0530', '\u058F',
278 "ARMENIAN");
281 * Hebrew.
282 * '\u0590' - '\u05FF'.
284 public static final UnicodeBlock HEBREW
285 = new UnicodeBlock('\u0590', '\u05FF',
286 "HEBREW");
289 * Arabic.
290 * '\u0600' - '\u06FF'.
292 public static final UnicodeBlock ARABIC
293 = new UnicodeBlock('\u0600', '\u06FF',
294 "ARABIC");
297 * Syriac.
298 * '\u0700' - '\u074F'.
299 * @since 1.4
301 public static final UnicodeBlock SYRIAC
302 = new UnicodeBlock('\u0700', '\u074F',
303 "SYRIAC");
306 * Thaana.
307 * '\u0780' - '\u07BF'.
308 * @since 1.4
310 public static final UnicodeBlock THAANA
311 = new UnicodeBlock('\u0780', '\u07BF',
312 "THAANA");
315 * Devanagari.
316 * '\u0900' - '\u097F'.
318 public static final UnicodeBlock DEVANAGARI
319 = new UnicodeBlock('\u0900', '\u097F',
320 "DEVANAGARI");
323 * Bengali.
324 * '\u0980' - '\u09FF'.
326 public static final UnicodeBlock BENGALI
327 = new UnicodeBlock('\u0980', '\u09FF',
328 "BENGALI");
331 * Gurmukhi.
332 * '\u0A00' - '\u0A7F'.
334 public static final UnicodeBlock GURMUKHI
335 = new UnicodeBlock('\u0A00', '\u0A7F',
336 "GURMUKHI");
339 * Gujarati.
340 * '\u0A80' - '\u0AFF'.
342 public static final UnicodeBlock GUJARATI
343 = new UnicodeBlock('\u0A80', '\u0AFF',
344 "GUJARATI");
347 * Oriya.
348 * '\u0B00' - '\u0B7F'.
350 public static final UnicodeBlock ORIYA
351 = new UnicodeBlock('\u0B00', '\u0B7F',
352 "ORIYA");
355 * Tamil.
356 * '\u0B80' - '\u0BFF'.
358 public static final UnicodeBlock TAMIL
359 = new UnicodeBlock('\u0B80', '\u0BFF',
360 "TAMIL");
363 * Telugu.
364 * '\u0C00' - '\u0C7F'.
366 public static final UnicodeBlock TELUGU
367 = new UnicodeBlock('\u0C00', '\u0C7F',
368 "TELUGU");
371 * Kannada.
372 * '\u0C80' - '\u0CFF'.
374 public static final UnicodeBlock KANNADA
375 = new UnicodeBlock('\u0C80', '\u0CFF',
376 "KANNADA");
379 * Malayalam.
380 * '\u0D00' - '\u0D7F'.
382 public static final UnicodeBlock MALAYALAM
383 = new UnicodeBlock('\u0D00', '\u0D7F',
384 "MALAYALAM");
387 * Sinhala.
388 * '\u0D80' - '\u0DFF'.
389 * @since 1.4
391 public static final UnicodeBlock SINHALA
392 = new UnicodeBlock('\u0D80', '\u0DFF',
393 "SINHALA");
396 * Thai.
397 * '\u0E00' - '\u0E7F'.
399 public static final UnicodeBlock THAI
400 = new UnicodeBlock('\u0E00', '\u0E7F',
401 "THAI");
404 * Lao.
405 * '\u0E80' - '\u0EFF'.
407 public static final UnicodeBlock LAO
408 = new UnicodeBlock('\u0E80', '\u0EFF',
409 "LAO");
412 * Tibetan.
413 * '\u0F00' - '\u0FFF'.
415 public static final UnicodeBlock TIBETAN
416 = new UnicodeBlock('\u0F00', '\u0FFF',
417 "TIBETAN");
420 * Myanmar.
421 * '\u1000' - '\u109F'.
422 * @since 1.4
424 public static final UnicodeBlock MYANMAR
425 = new UnicodeBlock('\u1000', '\u109F',
426 "MYANMAR");
429 * Georgian.
430 * '\u10A0' - '\u10FF'.
432 public static final UnicodeBlock GEORGIAN
433 = new UnicodeBlock('\u10A0', '\u10FF',
434 "GEORGIAN");
437 * Hangul Jamo.
438 * '\u1100' - '\u11FF'.
440 public static final UnicodeBlock HANGUL_JAMO
441 = new UnicodeBlock('\u1100', '\u11FF',
442 "HANGUL_JAMO");
445 * Ethiopic.
446 * '\u1200' - '\u137F'.
447 * @since 1.4
449 public static final UnicodeBlock ETHIOPIC
450 = new UnicodeBlock('\u1200', '\u137F',
451 "ETHIOPIC");
454 * Cherokee.
455 * '\u13A0' - '\u13FF'.
456 * @since 1.4
458 public static final UnicodeBlock CHEROKEE
459 = new UnicodeBlock('\u13A0', '\u13FF',
460 "CHEROKEE");
463 * Unified Canadian Aboriginal Syllabics.
464 * '\u1400' - '\u167F'.
465 * @since 1.4
467 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
468 = new UnicodeBlock('\u1400', '\u167F',
469 "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS");
472 * Ogham.
473 * '\u1680' - '\u169F'.
474 * @since 1.4
476 public static final UnicodeBlock OGHAM
477 = new UnicodeBlock('\u1680', '\u169F',
478 "OGHAM");
481 * Runic.
482 * '\u16A0' - '\u16FF'.
483 * @since 1.4
485 public static final UnicodeBlock RUNIC
486 = new UnicodeBlock('\u16A0', '\u16FF',
487 "RUNIC");
490 * Khmer.
491 * '\u1780' - '\u17FF'.
492 * @since 1.4
494 public static final UnicodeBlock KHMER
495 = new UnicodeBlock('\u1780', '\u17FF',
496 "KHMER");
499 * Mongolian.
500 * '\u1800' - '\u18AF'.
501 * @since 1.4
503 public static final UnicodeBlock MONGOLIAN
504 = new UnicodeBlock('\u1800', '\u18AF',
505 "MONGOLIAN");
508 * Latin Extended Additional.
509 * '\u1E00' - '\u1EFF'.
511 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
512 = new UnicodeBlock('\u1E00', '\u1EFF',
513 "LATIN_EXTENDED_ADDITIONAL");
516 * Greek Extended.
517 * '\u1F00' - '\u1FFF'.
519 public static final UnicodeBlock GREEK_EXTENDED
520 = new UnicodeBlock('\u1F00', '\u1FFF',
521 "GREEK_EXTENDED");
524 * General Punctuation.
525 * '\u2000' - '\u206F'.
527 public static final UnicodeBlock GENERAL_PUNCTUATION
528 = new UnicodeBlock('\u2000', '\u206F',
529 "GENERAL_PUNCTUATION");
532 * Superscripts and Subscripts.
533 * '\u2070' - '\u209F'.
535 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
536 = new UnicodeBlock('\u2070', '\u209F',
537 "SUPERSCRIPTS_AND_SUBSCRIPTS");
540 * Currency Symbols.
541 * '\u20A0' - '\u20CF'.
543 public static final UnicodeBlock CURRENCY_SYMBOLS
544 = new UnicodeBlock('\u20A0', '\u20CF',
545 "CURRENCY_SYMBOLS");
548 * Combining Marks for Symbols.
549 * '\u20D0' - '\u20FF'.
551 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
552 = new UnicodeBlock('\u20D0', '\u20FF',
553 "COMBINING_MARKS_FOR_SYMBOLS");
556 * Letterlike Symbols.
557 * '\u2100' - '\u214F'.
559 public static final UnicodeBlock LETTERLIKE_SYMBOLS
560 = new UnicodeBlock('\u2100', '\u214F',
561 "LETTERLIKE_SYMBOLS");
564 * Number Forms.
565 * '\u2150' - '\u218F'.
567 public static final UnicodeBlock NUMBER_FORMS
568 = new UnicodeBlock('\u2150', '\u218F',
569 "NUMBER_FORMS");
572 * Arrows.
573 * '\u2190' - '\u21FF'.
575 public static final UnicodeBlock ARROWS
576 = new UnicodeBlock('\u2190', '\u21FF',
577 "ARROWS");
580 * Mathematical Operators.
581 * '\u2200' - '\u22FF'.
583 public static final UnicodeBlock MATHEMATICAL_OPERATORS
584 = new UnicodeBlock('\u2200', '\u22FF',
585 "MATHEMATICAL_OPERATORS");
588 * Miscellaneous Technical.
589 * '\u2300' - '\u23FF'.
591 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
592 = new UnicodeBlock('\u2300', '\u23FF',
593 "MISCELLANEOUS_TECHNICAL");
596 * Control Pictures.
597 * '\u2400' - '\u243F'.
599 public static final UnicodeBlock CONTROL_PICTURES
600 = new UnicodeBlock('\u2400', '\u243F',
601 "CONTROL_PICTURES");
604 * Optical Character Recognition.
605 * '\u2440' - '\u245F'.
607 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
608 = new UnicodeBlock('\u2440', '\u245F',
609 "OPTICAL_CHARACTER_RECOGNITION");
612 * Enclosed Alphanumerics.
613 * '\u2460' - '\u24FF'.
615 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
616 = new UnicodeBlock('\u2460', '\u24FF',
617 "ENCLOSED_ALPHANUMERICS");
620 * Box Drawing.
621 * '\u2500' - '\u257F'.
623 public static final UnicodeBlock BOX_DRAWING
624 = new UnicodeBlock('\u2500', '\u257F',
625 "BOX_DRAWING");
628 * Block Elements.
629 * '\u2580' - '\u259F'.
631 public static final UnicodeBlock BLOCK_ELEMENTS
632 = new UnicodeBlock('\u2580', '\u259F',
633 "BLOCK_ELEMENTS");
636 * Geometric Shapes.
637 * '\u25A0' - '\u25FF'.
639 public static final UnicodeBlock GEOMETRIC_SHAPES
640 = new UnicodeBlock('\u25A0', '\u25FF',
641 "GEOMETRIC_SHAPES");
644 * Miscellaneous Symbols.
645 * '\u2600' - '\u26FF'.
647 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
648 = new UnicodeBlock('\u2600', '\u26FF',
649 "MISCELLANEOUS_SYMBOLS");
652 * Dingbats.
653 * '\u2700' - '\u27BF'.
655 public static final UnicodeBlock DINGBATS
656 = new UnicodeBlock('\u2700', '\u27BF',
657 "DINGBATS");
660 * Braille Patterns.
661 * '\u2800' - '\u28FF'.
662 * @since 1.4
664 public static final UnicodeBlock BRAILLE_PATTERNS
665 = new UnicodeBlock('\u2800', '\u28FF',
666 "BRAILLE_PATTERNS");
669 * CJK Radicals Supplement.
670 * '\u2E80' - '\u2EFF'.
671 * @since 1.4
673 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
674 = new UnicodeBlock('\u2E80', '\u2EFF',
675 "CJK_RADICALS_SUPPLEMENT");
678 * Kangxi Radicals.
679 * '\u2F00' - '\u2FDF'.
680 * @since 1.4
682 public static final UnicodeBlock KANGXI_RADICALS
683 = new UnicodeBlock('\u2F00', '\u2FDF',
684 "KANGXI_RADICALS");
687 * Ideographic Description Characters.
688 * '\u2FF0' - '\u2FFF'.
689 * @since 1.4
691 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
692 = new UnicodeBlock('\u2FF0', '\u2FFF',
693 "IDEOGRAPHIC_DESCRIPTION_CHARACTERS");
696 * CJK Symbols and Punctuation.
697 * '\u3000' - '\u303F'.
699 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
700 = new UnicodeBlock('\u3000', '\u303F',
701 "CJK_SYMBOLS_AND_PUNCTUATION");
704 * Hiragana.
705 * '\u3040' - '\u309F'.
707 public static final UnicodeBlock HIRAGANA
708 = new UnicodeBlock('\u3040', '\u309F',
709 "HIRAGANA");
712 * Katakana.
713 * '\u30A0' - '\u30FF'.
715 public static final UnicodeBlock KATAKANA
716 = new UnicodeBlock('\u30A0', '\u30FF',
717 "KATAKANA");
720 * Bopomofo.
721 * '\u3100' - '\u312F'.
723 public static final UnicodeBlock BOPOMOFO
724 = new UnicodeBlock('\u3100', '\u312F',
725 "BOPOMOFO");
728 * Hangul Compatibility Jamo.
729 * '\u3130' - '\u318F'.
731 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
732 = new UnicodeBlock('\u3130', '\u318F',
733 "HANGUL_COMPATIBILITY_JAMO");
736 * Kanbun.
737 * '\u3190' - '\u319F'.
739 public static final UnicodeBlock KANBUN
740 = new UnicodeBlock('\u3190', '\u319F',
741 "KANBUN");
744 * Bopomofo Extended.
745 * '\u31A0' - '\u31BF'.
746 * @since 1.4
748 public static final UnicodeBlock BOPOMOFO_EXTENDED
749 = new UnicodeBlock('\u31A0', '\u31BF',
750 "BOPOMOFO_EXTENDED");
753 * Enclosed CJK Letters and Months.
754 * '\u3200' - '\u32FF'.
756 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
757 = new UnicodeBlock('\u3200', '\u32FF',
758 "ENCLOSED_CJK_LETTERS_AND_MONTHS");
761 * CJK Compatibility.
762 * '\u3300' - '\u33FF'.
764 public static final UnicodeBlock CJK_COMPATIBILITY
765 = new UnicodeBlock('\u3300', '\u33FF',
766 "CJK_COMPATIBILITY");
769 * CJK Unified Ideographs Extension A.
770 * '\u3400' - '\u4DB5'.
771 * @since 1.4
773 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
774 = new UnicodeBlock('\u3400', '\u4DB5',
775 "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A");
778 * CJK Unified Ideographs.
779 * '\u4E00' - '\u9FFF'.
781 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
782 = new UnicodeBlock('\u4E00', '\u9FFF',
783 "CJK_UNIFIED_IDEOGRAPHS");
786 * Yi Syllables.
787 * '\uA000' - '\uA48F'.
788 * @since 1.4
790 public static final UnicodeBlock YI_SYLLABLES
791 = new UnicodeBlock('\uA000', '\uA48F',
792 "YI_SYLLABLES");
795 * Yi Radicals.
796 * '\uA490' - '\uA4CF'.
797 * @since 1.4
799 public static final UnicodeBlock YI_RADICALS
800 = new UnicodeBlock('\uA490', '\uA4CF',
801 "YI_RADICALS");
804 * Hangul Syllables.
805 * '\uAC00' - '\uD7A3'.
807 public static final UnicodeBlock HANGUL_SYLLABLES
808 = new UnicodeBlock('\uAC00', '\uD7A3',
809 "HANGUL_SYLLABLES");
812 * Surrogates Area.
813 * '\uD800' - '\uDFFF'.
815 public static final UnicodeBlock SURROGATES_AREA
816 = new UnicodeBlock('\uD800', '\uDFFF',
817 "SURROGATES_AREA");
820 * Private Use Area.
821 * '\uE000' - '\uF8FF'.
823 public static final UnicodeBlock PRIVATE_USE_AREA
824 = new UnicodeBlock('\uE000', '\uF8FF',
825 "PRIVATE_USE_AREA");
828 * CJK Compatibility Ideographs.
829 * '\uF900' - '\uFAFF'.
831 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
832 = new UnicodeBlock('\uF900', '\uFAFF',
833 "CJK_COMPATIBILITY_IDEOGRAPHS");
836 * Alphabetic Presentation Forms.
837 * '\uFB00' - '\uFB4F'.
839 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
840 = new UnicodeBlock('\uFB00', '\uFB4F',
841 "ALPHABETIC_PRESENTATION_FORMS");
844 * Arabic Presentation Forms-A.
845 * '\uFB50' - '\uFDFF'.
847 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
848 = new UnicodeBlock('\uFB50', '\uFDFF',
849 "ARABIC_PRESENTATION_FORMS_A");
852 * Combining Half Marks.
853 * '\uFE20' - '\uFE2F'.
855 public static final UnicodeBlock COMBINING_HALF_MARKS
856 = new UnicodeBlock('\uFE20', '\uFE2F',
857 "COMBINING_HALF_MARKS");
860 * CJK Compatibility Forms.
861 * '\uFE30' - '\uFE4F'.
863 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
864 = new UnicodeBlock('\uFE30', '\uFE4F',
865 "CJK_COMPATIBILITY_FORMS");
868 * Small Form Variants.
869 * '\uFE50' - '\uFE6F'.
871 public static final UnicodeBlock SMALL_FORM_VARIANTS
872 = new UnicodeBlock('\uFE50', '\uFE6F',
873 "SMALL_FORM_VARIANTS");
876 * Arabic Presentation Forms-B.
877 * '\uFE70' - '\uFEFE'.
879 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
880 = new UnicodeBlock('\uFE70', '\uFEFE',
881 "ARABIC_PRESENTATION_FORMS_B");
884 * Halfwidth and Fullwidth Forms.
885 * '\uFF00' - '\uFFEF'.
887 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
888 = new UnicodeBlock('\uFF00', '\uFFEF',
889 "HALFWIDTH_AND_FULLWIDTH_FORMS");
892 * Specials.
893 * '\uFEFF', '\uFFF0' - '\uFFFD'.
895 public static final UnicodeBlock SPECIALS
896 = new UnicodeBlock('\uFFF0', '\uFFFD',
897 "SPECIALS");
900 * The defined subsets.
902 private static final UnicodeBlock sets[] = {
903 BASIC_LATIN,
904 LATIN_1_SUPPLEMENT,
905 LATIN_EXTENDED_A,
906 LATIN_EXTENDED_B,
907 IPA_EXTENSIONS,
908 SPACING_MODIFIER_LETTERS,
909 COMBINING_DIACRITICAL_MARKS,
910 GREEK,
911 CYRILLIC,
912 ARMENIAN,
913 HEBREW,
914 ARABIC,
915 SYRIAC,
916 THAANA,
917 DEVANAGARI,
918 BENGALI,
919 GURMUKHI,
920 GUJARATI,
921 ORIYA,
922 TAMIL,
923 TELUGU,
924 KANNADA,
925 MALAYALAM,
926 SINHALA,
927 THAI,
928 LAO,
929 TIBETAN,
930 MYANMAR,
931 GEORGIAN,
932 HANGUL_JAMO,
933 ETHIOPIC,
934 CHEROKEE,
935 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
936 OGHAM,
937 RUNIC,
938 KHMER,
939 MONGOLIAN,
940 LATIN_EXTENDED_ADDITIONAL,
941 GREEK_EXTENDED,
942 GENERAL_PUNCTUATION,
943 SUPERSCRIPTS_AND_SUBSCRIPTS,
944 CURRENCY_SYMBOLS,
945 COMBINING_MARKS_FOR_SYMBOLS,
946 LETTERLIKE_SYMBOLS,
947 NUMBER_FORMS,
948 ARROWS,
949 MATHEMATICAL_OPERATORS,
950 MISCELLANEOUS_TECHNICAL,
951 CONTROL_PICTURES,
952 OPTICAL_CHARACTER_RECOGNITION,
953 ENCLOSED_ALPHANUMERICS,
954 BOX_DRAWING,
955 BLOCK_ELEMENTS,
956 GEOMETRIC_SHAPES,
957 MISCELLANEOUS_SYMBOLS,
958 DINGBATS,
959 BRAILLE_PATTERNS,
960 CJK_RADICALS_SUPPLEMENT,
961 KANGXI_RADICALS,
962 IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
963 CJK_SYMBOLS_AND_PUNCTUATION,
964 HIRAGANA,
965 KATAKANA,
966 BOPOMOFO,
967 HANGUL_COMPATIBILITY_JAMO,
968 KANBUN,
969 BOPOMOFO_EXTENDED,
970 ENCLOSED_CJK_LETTERS_AND_MONTHS,
971 CJK_COMPATIBILITY,
972 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
973 CJK_UNIFIED_IDEOGRAPHS,
974 YI_SYLLABLES,
975 YI_RADICALS,
976 HANGUL_SYLLABLES,
977 SURROGATES_AREA,
978 PRIVATE_USE_AREA,
979 CJK_COMPATIBILITY_IDEOGRAPHS,
980 ALPHABETIC_PRESENTATION_FORMS,
981 ARABIC_PRESENTATION_FORMS_A,
982 COMBINING_HALF_MARKS,
983 CJK_COMPATIBILITY_FORMS,
984 SMALL_FORM_VARIANTS,
985 ARABIC_PRESENTATION_FORMS_B,
986 HALFWIDTH_AND_FULLWIDTH_FORMS,
987 SPECIALS,
989 } // class UnicodeBlock
992 * The immutable value of this Character.
994 * @serial the value of this Character
996 private final char value;
999 * Compatible with JDK 1.0+.
1001 private static final long serialVersionUID = 3786198910865385080L;
1004 * Smallest value allowed for radix arguments in Java. This value is 2.
1006 * @see #digit(char, int)
1007 * @see #forDigit(int, int)
1008 * @see Integer#toString(int, int)
1009 * @see Integer#valueOf(String)
1011 public static final int MIN_RADIX = 2;
1014 * Largest value allowed for radix arguments in Java. This value is 36.
1016 * @see #digit(char, int)
1017 * @see #forDigit(int, int)
1018 * @see Integer#toString(int, int)
1019 * @see Integer#valueOf(String)
1021 public static final int MAX_RADIX = 36;
1024 * The minimum value the char data type can hold.
1025 * This value is <code>'\\u0000'</code>.
1027 public static final char MIN_VALUE = '\u0000';
1030 * The maximum value the char data type can hold.
1031 * This value is <code>'\\uFFFF'</code>.
1033 public static final char MAX_VALUE = '\uFFFF';
1036 * Class object representing the primitive char data type.
1038 * @since 1.1
1040 public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');
1043 * The number of bits needed to represent a <code>char</code>.
1044 * @since 1.5
1046 public static final int SIZE = 16;
1048 // This caches some Character values, and is used by boxing
1049 // conversions via valueOf(). We must cache at least 0..127;
1050 // this constant controls how much we actually cache.
1051 private static final int MAX_CACHE = 127;
1052 private static Character[] charCache = new Character[MAX_CACHE + 1];
1055 * Lu = Letter, Uppercase (Informative).
1057 * @since 1.1
1059 public static final byte UPPERCASE_LETTER = 1;
1062 * Ll = Letter, Lowercase (Informative).
1064 * @since 1.1
1066 public static final byte LOWERCASE_LETTER = 2;
1069 * Lt = Letter, Titlecase (Informative).
1071 * @since 1.1
1073 public static final byte TITLECASE_LETTER = 3;
1076 * Mn = Mark, Non-Spacing (Normative).
1078 * @since 1.1
1080 public static final byte NON_SPACING_MARK = 6;
1083 * Mc = Mark, Spacing Combining (Normative).
1085 * @since 1.1
1087 public static final byte COMBINING_SPACING_MARK = 8;
1090 * Me = Mark, Enclosing (Normative).
1092 * @since 1.1
1094 public static final byte ENCLOSING_MARK = 7;
1097 * Nd = Number, Decimal Digit (Normative).
1099 * @since 1.1
1101 public static final byte DECIMAL_DIGIT_NUMBER = 9;
1104 * Nl = Number, Letter (Normative).
1106 * @since 1.1
1108 public static final byte LETTER_NUMBER = 10;
1111 * No = Number, Other (Normative).
1113 * @since 1.1
1115 public static final byte OTHER_NUMBER = 11;
1118 * Zs = Separator, Space (Normative).
1120 * @since 1.1
1122 public static final byte SPACE_SEPARATOR = 12;
1125 * Zl = Separator, Line (Normative).
1127 * @since 1.1
1129 public static final byte LINE_SEPARATOR = 13;
1132 * Zp = Separator, Paragraph (Normative).
1134 * @since 1.1
1136 public static final byte PARAGRAPH_SEPARATOR = 14;
1139 * Cc = Other, Control (Normative).
1141 * @since 1.1
1143 public static final byte CONTROL = 15;
1146 * Cf = Other, Format (Normative).
1148 * @since 1.1
1150 public static final byte FORMAT = 16;
1153 * Cs = Other, Surrogate (Normative).
1155 * @since 1.1
1157 public static final byte SURROGATE = 19;
1160 * Co = Other, Private Use (Normative).
1162 * @since 1.1
1164 public static final byte PRIVATE_USE = 18;
1167 * Cn = Other, Not Assigned (Normative).
1169 * @since 1.1
1171 public static final byte UNASSIGNED = 0;
1174 * Lm = Letter, Modifier (Informative).
1176 * @since 1.1
1178 public static final byte MODIFIER_LETTER = 4;
1181 * Lo = Letter, Other (Informative).
1183 * @since 1.1
1185 public static final byte OTHER_LETTER = 5;
1188 * Pc = Punctuation, Connector (Informative).
1190 * @since 1.1
1192 public static final byte CONNECTOR_PUNCTUATION = 23;
1195 * Pd = Punctuation, Dash (Informative).
1197 * @since 1.1
1199 public static final byte DASH_PUNCTUATION = 20;
1202 * Ps = Punctuation, Open (Informative).
1204 * @since 1.1
1206 public static final byte START_PUNCTUATION = 21;
1209 * Pe = Punctuation, Close (Informative).
1211 * @since 1.1
1213 public static final byte END_PUNCTUATION = 22;
1216 * Pi = Punctuation, Initial Quote (Informative).
1218 * @since 1.4
1220 public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
1223 * Pf = Punctuation, Final Quote (Informative).
1225 * @since 1.4
1227 public static final byte FINAL_QUOTE_PUNCTUATION = 30;
1230 * Po = Punctuation, Other (Informative).
1232 * @since 1.1
1234 public static final byte OTHER_PUNCTUATION = 24;
1237 * Sm = Symbol, Math (Informative).
1239 * @since 1.1
1241 public static final byte MATH_SYMBOL = 25;
1244 * Sc = Symbol, Currency (Informative).
1246 * @since 1.1
1248 public static final byte CURRENCY_SYMBOL = 26;
1251 * Sk = Symbol, Modifier (Informative).
1253 * @since 1.1
1255 public static final byte MODIFIER_SYMBOL = 27;
1258 * So = Symbol, Other (Informative).
1260 * @since 1.1
1262 public static final byte OTHER_SYMBOL = 28;
1265 * Undefined bidirectional character type. Undefined char values have
1266 * undefined directionality in the Unicode specification.
1268 * @since 1.4
1270 public static final byte DIRECTIONALITY_UNDEFINED = -1;
1273 * Strong bidirectional character type "L".
1275 * @since 1.4
1277 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
1280 * Strong bidirectional character type "R".
1282 * @since 1.4
1284 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
1287 * Strong bidirectional character type "AL".
1289 * @since 1.4
1291 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
1294 * Weak bidirectional character type "EN".
1296 * @since 1.4
1298 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
1301 * Weak bidirectional character type "ES".
1303 * @since 1.4
1305 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
1308 * Weak bidirectional character type "ET".
1310 * @since 1.4
1312 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
1315 * Weak bidirectional character type "AN".
1317 * @since 1.4
1319 public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
1322 * Weak bidirectional character type "CS".
1324 * @since 1.4
1326 public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
1329 * Weak bidirectional character type "NSM".
1331 * @since 1.4
1333 public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
1336 * Weak bidirectional character type "BN".
1338 * @since 1.4
1340 public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
1343 * Neutral bidirectional character type "B".
1345 * @since 1.4
1347 public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
1350 * Neutral bidirectional character type "S".
1352 * @since 1.4
1354 public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
1357 * Neutral bidirectional character type "WS".
1359 * @since 1.4
1361 public static final byte DIRECTIONALITY_WHITESPACE = 12;
1364 * Neutral bidirectional character type "ON".
1366 * @since 1.4
1368 public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
1371 * Strong bidirectional character type "LRE".
1373 * @since 1.4
1375 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
1378 * Strong bidirectional character type "LRO".
1380 * @since 1.4
1382 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
1385 * Strong bidirectional character type "RLE".
1387 * @since 1.4
1389 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
1392 * Strong bidirectional character type "RLO".
1394 * @since 1.4
1396 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
1399 * Weak bidirectional character type "PDF".
1401 * @since 1.4
1403 public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
1406 * Mask for grabbing the type out of the result of readChar.
1407 * @see #readChar(char)
1409 private static final int TYPE_MASK = 0x1F;
1412 * Mask for grabbing the non-breaking space flag out of the result of
1413 * readChar.
1414 * @see #readChar(char)
1416 private static final int NO_BREAK_MASK = 0x20;
1419 * Mask for grabbing the mirrored directionality flag out of the result
1420 * of readChar.
1421 * @see #readChar(char)
1423 private static final int MIRROR_MASK = 0x40;
/**
 * Smallest supplementary code point, i.e. the first code point that
 * needs two UTF-16 code units.
 *
 * @since 1.5
 */
public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

/**
 * Smallest code point.
 *
 * @since 1.5
 */
public static final int MIN_CODE_POINT = 0;

/**
 * Largest code point.
 *
 * @since 1.5
 */
public static final int MAX_CODE_POINT = 0x010ffff;

/**
 * Minimum high surrogate code unit in the UTF-16 encoding.
 *
 * @since 1.5
 */
public static final char MIN_HIGH_SURROGATE = '\ud800';

/**
 * Maximum high surrogate code unit in the UTF-16 encoding.
 *
 * @since 1.5
 */
public static final char MAX_HIGH_SURROGATE = '\udbff';

/**
 * Minimum low surrogate code unit in the UTF-16 encoding.
 *
 * @since 1.5
 */
public static final char MIN_LOW_SURROGATE = '\udc00';

/**
 * Maximum low surrogate code unit in the UTF-16 encoding.
 *
 * @since 1.5
 */
public static final char MAX_LOW_SURROGATE = '\udfff';

/**
 * Minimum surrogate code unit in the UTF-16 encoding; identical to
 * MIN_HIGH_SURROGATE.
 *
 * @since 1.5
 */
public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;

/**
 * Maximum surrogate code unit in the UTF-16 encoding; identical to
 * MAX_LOW_SURROGATE.
 *
 * @since 1.5
 */
public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
/**
 * Looks up a character's packed attribute word in the Unicode attribute
 * database. The low 5 bits hold the character type, the next 2 bits are
 * flags, and the top 9 bits are an offset into the attribute tables.
 * The top 9 bits carry no meaning for Java callers; only the native
 * code uses them.
 *
 * @param ch the character to look up
 * @return the character's attribute offset, flags and type
 * @see #TYPE_MASK
 * @see #NO_BREAK_MASK
 * @see #MIRROR_MASK
 */
private static native char readChar(char ch);
/**
 * Creates a wrapper object around a primitive character.
 *
 * @param value the character to wrap
 */
public Character(char value)
{
  this.value = value;
}
1516 * Returns the character which has been wrapped by this class.
1518 * @return the character wrapped
1520 public char charValue()
1522 return value;
1526 * Returns the numerical value (unsigned) of the wrapped character.
1527 * Range of returned values: 0x0000-0xFFFF.
1529 * @return the value of the wrapped character
1531 public int hashCode()
1533 return value;
1537 * Determines if an object is equal to this object. This is only true for
1538 * another Character object wrapping the same value.
1540 * @param o object to compare
1541 * @return true if o is a Character with the same value
1543 public boolean equals(Object o)
1545 return o instanceof Character && value == ((Character) o).value;
1549 * Converts the wrapped character into a String.
1551 * @return a String containing one character -- the wrapped character
1552 * of this instance
1554 public String toString()
1556 // This assumes that String.valueOf(char) can create a single-character
1557 // String more efficiently than through the public API.
1558 return String.valueOf(value);
1562 * Returns a String of length 1 representing the specified character.
1564 * @param ch the character to convert
1565 * @return a String containing the character
1566 * @since 1.4
1568 public static String toString(char ch)
1570 // This assumes that String.valueOf(char) can create a single-character
1571 // String more efficiently than through the public API.
1572 return String.valueOf(ch);
1576 * Determines if a character is a Unicode lowercase letter. For example,
1577 * <code>'a'</code> is lowercase.
1578 * <br>
1579 * lowercase = [Ll]
1581 * @param ch character to test
1582 * @return true if ch is a Unicode lowercase letter, else false
1583 * @see #isUpperCase(char)
1584 * @see #isTitleCase(char)
1585 * @see #toLowerCase(char)
1586 * @see #getType(char)
1588 public static boolean isLowerCase(char ch)
1590 return getType(ch) == LOWERCASE_LETTER;
1594 * Determines if a character is a Unicode uppercase letter. For example,
1595 * <code>'A'</code> is uppercase.
1596 * <br>
1597 * uppercase = [Lu]
1599 * @param ch character to test
1600 * @return true if ch is a Unicode uppercase letter, else false
1601 * @see #isLowerCase(char)
1602 * @see #isTitleCase(char)
1603 * @see #toUpperCase(char)
1604 * @see #getType(char)
1606 public static boolean isUpperCase(char ch)
1608 return getType(ch) == UPPERCASE_LETTER;
1612 * Determines if a character is a Unicode titlecase letter. For example,
1613 * the character "Lj" (Latin capital L with small letter j) is titlecase.
1614 * <br>
1615 * titlecase = [Lt]
1617 * @param ch character to test
1618 * @return true if ch is a Unicode titlecase letter, else false
1619 * @see #isLowerCase(char)
1620 * @see #isUpperCase(char)
1621 * @see #toTitleCase(char)
1622 * @see #getType(char)
1624 public static boolean isTitleCase(char ch)
1626 return getType(ch) == TITLECASE_LETTER;
1630 * Determines if a character is a Unicode decimal digit. For example,
1631 * <code>'0'</code> is a digit.
1632 * <br>
1633 * Unicode decimal digit = [Nd]
1635 * @param ch character to test
1636 * @return true if ch is a Unicode decimal digit, else false
1637 * @see #digit(char, int)
1638 * @see #forDigit(int, int)
1639 * @see #getType(char)
1641 public static boolean isDigit(char ch)
1643 return getType(ch) == DECIMAL_DIGIT_NUMBER;
1647 * Determines if a character is part of the Unicode Standard. This is an
1648 * evolving standard, but covers every character in the data file.
1649 * <br>
1650 * defined = not [Cn]
1652 * @param ch character to test
1653 * @return true if ch is a Unicode character, else false
1654 * @see #isDigit(char)
1655 * @see #isLetter(char)
1656 * @see #isLetterOrDigit(char)
1657 * @see #isLowerCase(char)
1658 * @see #isTitleCase(char)
1659 * @see #isUpperCase(char)
1661 public static boolean isDefined(char ch)
1663 return getType(ch) != UNASSIGNED;
1667 * Determines if a character is a Unicode letter. Not all letters have case,
1668 * so this may return true when isLowerCase and isUpperCase return false.
1669 * <br>
1670 * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
1672 * @param ch character to test
1673 * @return true if ch is a Unicode letter, else false
1674 * @see #isDigit(char)
1675 * @see #isJavaIdentifierStart(char)
1676 * @see #isJavaLetter(char)
1677 * @see #isJavaLetterOrDigit(char)
1678 * @see #isLetterOrDigit(char)
1679 * @see #isLowerCase(char)
1680 * @see #isTitleCase(char)
1681 * @see #isUnicodeIdentifierStart(char)
1682 * @see #isUpperCase(char)
1684 public static boolean isLetter(char ch)
1686 return ((1 << getType(ch))
1687 & ((1 << UPPERCASE_LETTER)
1688 | (1 << LOWERCASE_LETTER)
1689 | (1 << TITLECASE_LETTER)
1690 | (1 << MODIFIER_LETTER)
1691 | (1 << OTHER_LETTER))) != 0;
1695 * Determines if a character is a Unicode letter or a Unicode digit. This
1696 * is the combination of isLetter and isDigit.
1697 * <br>
1698 * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
1700 * @param ch character to test
1701 * @return true if ch is a Unicode letter or a Unicode digit, else false
1702 * @see #isDigit(char)
1703 * @see #isJavaIdentifierPart(char)
1704 * @see #isJavaLetter(char)
1705 * @see #isJavaLetterOrDigit(char)
1706 * @see #isLetter(char)
1707 * @see #isUnicodeIdentifierPart(char)
1709 public static boolean isLetterOrDigit(char ch)
1711 return ((1 << getType(ch))
1712 & ((1 << UPPERCASE_LETTER)
1713 | (1 << LOWERCASE_LETTER)
1714 | (1 << TITLECASE_LETTER)
1715 | (1 << MODIFIER_LETTER)
1716 | (1 << OTHER_LETTER)
1717 | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
1721 * Determines if a character can start a Java identifier. This is the
1722 * combination of isLetter, any character where getType returns
1723 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1724 * (like '_').
1726 * @param ch character to test
1727 * @return true if ch can start a Java identifier, else false
1728 * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
1729 * @see #isJavaLetterOrDigit(char)
1730 * @see #isJavaIdentifierStart(char)
1731 * @see #isJavaIdentifierPart(char)
1732 * @see #isLetter(char)
1733 * @see #isLetterOrDigit(char)
1734 * @see #isUnicodeIdentifierStart(char)
1736 public static boolean isJavaLetter(char ch)
1738 return isJavaIdentifierStart(ch);
1742 * Determines if a character can follow the first letter in
1743 * a Java identifier. This is the combination of isJavaLetter (isLetter,
1744 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1745 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1746 * or isIdentifierIgnorable.
1748 * @param ch character to test
1749 * @return true if ch can follow the first letter in a Java identifier
1750 * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
1751 * @see #isJavaLetter(char)
1752 * @see #isJavaIdentifierStart(char)
1753 * @see #isJavaIdentifierPart(char)
1754 * @see #isLetter(char)
1755 * @see #isLetterOrDigit(char)
1756 * @see #isUnicodeIdentifierPart(char)
1757 * @see #isIdentifierIgnorable(char)
1759 public static boolean isJavaLetterOrDigit(char ch)
1761 return isJavaIdentifierPart(ch);
1765 * Determines if a character can start a Java identifier. This is the
1766 * combination of isLetter, any character where getType returns
1767 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
1768 * (like '_').
1769 * <br>
1770 * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
1772 * @param ch character to test
1773 * @return true if ch can start a Java identifier, else false
1774 * @see #isJavaIdentifierPart(char)
1775 * @see #isLetter(char)
1776 * @see #isUnicodeIdentifierStart(char)
1777 * @since 1.1
1779 public static boolean isJavaIdentifierStart(char ch)
1781 return ((1 << getType(ch))
1782 & ((1 << UPPERCASE_LETTER)
1783 | (1 << LOWERCASE_LETTER)
1784 | (1 << TITLECASE_LETTER)
1785 | (1 << MODIFIER_LETTER)
1786 | (1 << OTHER_LETTER)
1787 | (1 << LETTER_NUMBER)
1788 | (1 << CURRENCY_SYMBOL)
1789 | (1 << CONNECTOR_PUNCTUATION))) != 0;
1793 * Determines if a character can follow the first letter in
1794 * a Java identifier. This is the combination of isJavaLetter (isLetter,
1795 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
1796 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
1797 * or isIdentifierIgnorable.
1798 * <br>
1799 * Java identifier extender =
1800 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
1801 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1803 * @param ch character to test
1804 * @return true if ch can follow the first letter in a Java identifier
1805 * @see #isIdentifierIgnorable(char)
1806 * @see #isJavaIdentifierStart(char)
1807 * @see #isLetterOrDigit(char)
1808 * @see #isUnicodeIdentifierPart(char)
1809 * @since 1.1
1811 public static boolean isJavaIdentifierPart(char ch)
1813 int category = getType(ch);
1814 return ((1 << category)
1815 & ((1 << UPPERCASE_LETTER)
1816 | (1 << LOWERCASE_LETTER)
1817 | (1 << TITLECASE_LETTER)
1818 | (1 << MODIFIER_LETTER)
1819 | (1 << OTHER_LETTER)
1820 | (1 << NON_SPACING_MARK)
1821 | (1 << COMBINING_SPACING_MARK)
1822 | (1 << DECIMAL_DIGIT_NUMBER)
1823 | (1 << LETTER_NUMBER)
1824 | (1 << CURRENCY_SYMBOL)
1825 | (1 << CONNECTOR_PUNCTUATION)
1826 | (1 << FORMAT))) != 0
1827 || (category == CONTROL && isIdentifierIgnorable(ch));
1831 * Determines if a character can start a Unicode identifier. Only
1832 * letters can start a Unicode identifier, but this includes characters
1833 * in LETTER_NUMBER.
1834 * <br>
1835 * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
1837 * @param ch character to test
1838 * @return true if ch can start a Unicode identifier, else false
1839 * @see #isJavaIdentifierStart(char)
1840 * @see #isLetter(char)
1841 * @see #isUnicodeIdentifierPart(char)
1842 * @since 1.1
1844 public static boolean isUnicodeIdentifierStart(char ch)
1846 return ((1 << getType(ch))
1847 & ((1 << UPPERCASE_LETTER)
1848 | (1 << LOWERCASE_LETTER)
1849 | (1 << TITLECASE_LETTER)
1850 | (1 << MODIFIER_LETTER)
1851 | (1 << OTHER_LETTER)
1852 | (1 << LETTER_NUMBER))) != 0;
1856 * Determines if a character can follow the first letter in
1857 * a Unicode identifier. This includes letters, connecting punctuation,
1858 * digits, numeric letters, combining marks, non-spacing marks, and
1859 * isIdentifierIgnorable.
1860 * <br>
1861 * Unicode identifier extender =
1862 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
1863 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
1865 * @param ch character to test
1866 * @return true if ch can follow the first letter in a Unicode identifier
1867 * @see #isIdentifierIgnorable(char)
1868 * @see #isJavaIdentifierPart(char)
1869 * @see #isLetterOrDigit(char)
1870 * @see #isUnicodeIdentifierStart(char)
1871 * @since 1.1
1873 public static boolean isUnicodeIdentifierPart(char ch)
1875 int category = getType(ch);
1876 return ((1 << category)
1877 & ((1 << UPPERCASE_LETTER)
1878 | (1 << LOWERCASE_LETTER)
1879 | (1 << TITLECASE_LETTER)
1880 | (1 << MODIFIER_LETTER)
1881 | (1 << OTHER_LETTER)
1882 | (1 << NON_SPACING_MARK)
1883 | (1 << COMBINING_SPACING_MARK)
1884 | (1 << DECIMAL_DIGIT_NUMBER)
1885 | (1 << LETTER_NUMBER)
1886 | (1 << CONNECTOR_PUNCTUATION)
1887 | (1 << FORMAT))) != 0
1888 || (category == CONTROL && isIdentifierIgnorable(ch));
1892 * Determines if a character is ignorable in a Unicode identifier. This
1893 * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
1894 * through <code>'\u0008'</code>, <code>'\u000E'</code> through
1895 * <code>'\u001B'</code>, and <code>'\u007F'</code> through
1896 * <code>'\u009F'</code>), and FORMAT characters.
1897 * <br>
1898 * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
1899 * |U+007F-U+009F
1901 * @param ch character to test
1902 * @return true if ch is ignorable in a Unicode or Java identifier
1903 * @see #isJavaIdentifierPart(char)
1904 * @see #isUnicodeIdentifierPart(char)
1905 * @since 1.1
1907 public static boolean isIdentifierIgnorable(char ch)
1909 return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F'
1910 || (ch <= '\u001B' && ch >= '\u000E')))
1911 || getType(ch) == FORMAT;
/**
 * Maps a Unicode character to its lowercase equivalent. A character
 * without a lowercase mapping is returned unchanged. Note that
 * isLowerCase(toLowerCase(ch)) is not guaranteed to be true.
 *
 * @param ch the character to convert to lowercase
 * @return the lowercase mapping of ch, or ch itself if no lowercase
 *         mapping exists
 * @see #isLowerCase(char)
 * @see #isUpperCase(char)
 * @see #toTitleCase(char)
 * @see #toUpperCase(char)
 */
public static native char toLowerCase(char ch);
/**
 * Maps a Unicode character to its uppercase equivalent. A character
 * without an uppercase mapping is returned unchanged. Note that
 * isUpperCase(toUpperCase(ch)) is not guaranteed to be true.
 *
 * @param ch the character to convert to uppercase
 * @return the uppercase mapping of ch, or ch itself if no uppercase
 *         mapping exists
 * @see #isLowerCase(char)
 * @see #isUpperCase(char)
 * @see #toLowerCase(char)
 * @see #toTitleCase(char)
 */
public static native char toUpperCase(char ch);
/**
 * Maps a Unicode character to its titlecase equivalent. A character
 * without a titlecase mapping is returned unchanged. Note that
 * isTitleCase(toTitleCase(ch)) is not guaranteed to be true.
 *
 * @param ch the character to convert to titlecase
 * @return the titlecase mapping of ch, or ch itself if no titlecase
 *         mapping exists
 * @see #isTitleCase(char)
 * @see #toLowerCase(char)
 * @see #toUpperCase(char)
 */
public static native char toTitleCase(char ch);
/**
 * Converts a character into a digit of the specified radix. The result
 * is -1 if the radix lies outside the range MIN_RADIX through MAX_RADIX,
 * if the result of getNumericValue(ch) exceeds the radix, or if ch is
 * neither a decimal digit nor in the case-insensitive set 'a'-'z'.
 * <br>
 * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
 *                               |U+FF21-U+FF3A|U+FF41-U+FF5A
 *
 * @param ch the character to convert into a digit
 * @param radix the radix in which ch is a digit
 * @return the digit which ch represents in radix, or -1 if ch is not a
 *         valid digit
 * @see #MIN_RADIX
 * @see #MAX_RADIX
 * @see #forDigit(int, int)
 * @see #isDigit(char)
 * @see #getNumericValue(char)
 */
public static native int digit(char ch, int radix);
/**
 * Returns the Unicode numeric value property of a character. For
 * example, U+216C (the Roman numeral fifty) returns 50.
 *
 * <p>This method also returns values for the letters A through Z (not
 * specified by Unicode) in these ranges: U+0041 through U+005A
 * (uppercase); U+0061 through U+007A (lowercase); and U+FF21 through
 * U+FF3A, U+FF41 through U+FF5A (full width variants).
 *
 * <p>If the character lacks a numeric value property, -1 is returned.
 * If the character has a numeric value property which is not
 * representable as a nonnegative integer, such as a fraction, -2 is
 * returned.
 *
 * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
 *                               |U+FF21-U+FF3A|U+FF41-U+FF5A
 *
 * @param ch the character whose numeric value property is wanted
 * @return the numeric value property of ch, or -1 if it does not exist,
 *         or -2 if it is not representable as a nonnegative integer
 * @see #forDigit(int, int)
 * @see #digit(char, int)
 * @see #isDigit(char)
 * @since 1.1
 */
public static native int getNumericValue(char ch);
2008 * Determines if a character is a ISO-LATIN-1 space. This is only the five
2009 * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
2010 * <code>'\r'</code>, and <code>' '</code>.
2011 * <br>
2012 * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
2014 * @param ch character to test
2015 * @return true if ch is a space, else false
2016 * @deprecated Replaced by {@link #isWhitespace(char)}
2017 * @see #isSpaceChar(char)
2018 * @see #isWhitespace(char)
2020 public static boolean isSpace(char ch)
2022 // Performing the subtraction up front alleviates need to compare longs.
2023 return ch-- <= ' ' && ((1 << ch)
2024 & ((1 << (' ' - 1))
2025 | (1 << ('\t' - 1))
2026 | (1 << ('\n' - 1))
2027 | (1 << ('\r' - 1))
2028 | (1 << ('\f' - 1)))) != 0;
2032 * Determines if a character is a Unicode space character. This includes
2033 * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
2034 * <br>
2035 * Unicode space = [Zs]|[Zp]|[Zl]
2037 * @param ch character to test
2038 * @return true if ch is a Unicode space, else false
2039 * @see #isWhitespace(char)
2040 * @since 1.1
2042 public static boolean isSpaceChar(char ch)
2044 return ((1 << getType(ch))
2045 & ((1 << SPACE_SEPARATOR)
2046 | (1 << LINE_SEPARATOR)
2047 | (1 << PARAGRAPH_SEPARATOR))) != 0;
2051 * Determines if a character is Java whitespace. This includes Unicode
2052 * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
2053 * PARAGRAPH_SEPARATOR) except the non-breaking spaces
2054 * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
2055 * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
2056 * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
2057 * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
2058 * and <code>'\u001F'</code>.
2059 * <br>
2060 * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
2062 * @param ch character to test
2063 * @return true if ch is Java whitespace, else false
2064 * @see #isSpaceChar(char)
2065 * @since 1.1
2067 public static boolean isWhitespace(char ch)
2069 int attr = readChar(ch);
2070 return ((((1 << (attr & TYPE_MASK))
2071 & ((1 << SPACE_SEPARATOR)
2072 | (1 << LINE_SEPARATOR)
2073 | (1 << PARAGRAPH_SEPARATOR))) != 0)
2074 && (attr & NO_BREAK_MASK) == 0)
2075 || (ch <= '\u001F' && ((1 << ch)
2076 & ((1 << '\t')
2077 | (1 << '\n')
2078 | (1 << '\u000B')
2079 | (1 << '\u000C')
2080 | (1 << '\r')
2081 | (1 << '\u001C')
2082 | (1 << '\u001D')
2083 | (1 << '\u001E')
2084 | (1 << '\u001F'))) != 0);
2088 * Determines if a character has the ISO Control property.
2089 * <br>
2090 * ISO Control = [Cc]
2092 * @param ch character to test
2093 * @return true if ch is an ISO Control character, else false
2094 * @see #isSpaceChar(char)
2095 * @see #isWhitespace(char)
2096 * @since 1.1
2098 public static boolean isISOControl(char ch)
2100 return getType(ch) == CONTROL;
/**
 * Returns the Unicode general category property of a character, as one
 * of the category constants listed below.
 *
 * @param ch the character whose general category is wanted
 * @return the character category property of ch as an integer
 * @see #UNASSIGNED
 * @see #UPPERCASE_LETTER
 * @see #LOWERCASE_LETTER
 * @see #TITLECASE_LETTER
 * @see #MODIFIER_LETTER
 * @see #OTHER_LETTER
 * @see #NON_SPACING_MARK
 * @see #ENCLOSING_MARK
 * @see #COMBINING_SPACING_MARK
 * @see #DECIMAL_DIGIT_NUMBER
 * @see #LETTER_NUMBER
 * @see #OTHER_NUMBER
 * @see #SPACE_SEPARATOR
 * @see #LINE_SEPARATOR
 * @see #PARAGRAPH_SEPARATOR
 * @see #CONTROL
 * @see #FORMAT
 * @see #PRIVATE_USE
 * @see #SURROGATE
 * @see #DASH_PUNCTUATION
 * @see #START_PUNCTUATION
 * @see #END_PUNCTUATION
 * @see #CONNECTOR_PUNCTUATION
 * @see #OTHER_PUNCTUATION
 * @see #MATH_SYMBOL
 * @see #CURRENCY_SYMBOL
 * @see #MODIFIER_SYMBOL
 * @see #INITIAL_QUOTE_PUNCTUATION
 * @see #FINAL_QUOTE_PUNCTUATION
 * @since 1.1
 */
public static native int getType(char ch);
2143 * Converts a digit into a character which represents that digit
2144 * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
2145 * or the digit exceeds the radix, then the null character <code>'\0'</code>
2146 * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'.
2147 * <br>
2148 * return value boundary = U+0030-U+0039|U+0061-U+007A
2150 * @param digit digit to be converted into a character
2151 * @param radix radix of digit
2152 * @return character representing digit in radix, or '\0'
2153 * @see #MIN_RADIX
2154 * @see #MAX_RADIX
2155 * @see #digit(char, int)
2157 public static char forDigit(int digit, int radix)
2159 if (radix < MIN_RADIX || radix > MAX_RADIX
2160 || digit < 0 || digit >= radix)
2161 return '\0';
2162 return (char) (digit < 10 ? ('0' + digit) : ('a' - 10 + digit));
/**
 * Returns the Unicode directionality property of a character. This
 * property is used in the visual ordering of text.
 *
 * @param ch the character to look up
 * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
 * @see #DIRECTIONALITY_UNDEFINED
 * @see #DIRECTIONALITY_LEFT_TO_RIGHT
 * @see #DIRECTIONALITY_RIGHT_TO_LEFT
 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
 * @see #DIRECTIONALITY_EUROPEAN_NUMBER
 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
 * @see #DIRECTIONALITY_ARABIC_NUMBER
 * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
 * @see #DIRECTIONALITY_NONSPACING_MARK
 * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
 * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
 * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
 * @see #DIRECTIONALITY_WHITESPACE
 * @see #DIRECTIONALITY_OTHER_NEUTRALS
 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
 * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
 * @since 1.4
 */
public static native byte getDirectionality(char ch);
2196 * Determines whether the character is mirrored according to Unicode. For
2197 * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
2198 * left-to-right text, but ')' in right-to-left text.
2200 * @param ch the character to look up
2201 * @return true if the character is mirrored
2202 * @since 1.4
2204 public static boolean isMirrored(char ch)
2206 return (readChar(ch) & MIRROR_MASK) != 0;
2210 * Compares another Character to this Character, numerically.
2212 * @param anotherCharacter Character to compare with this Character
2213 * @return a negative integer if this Character is less than
2214 * anotherCharacter, zero if this Character is equal, and
2215 * a positive integer if this Character is greater
2216 * @throws NullPointerException if anotherCharacter is null
2217 * @since 1.2
2219 public int compareTo(Character anotherCharacter)
2221 return value - anotherCharacter.value;
2225 * Compares an object to this Character. Assuming the object is a
2226 * Character object, this method performs the same comparison as
2227 * compareTo(Character).
2229 * @param o object to compare
2230 * @return the comparison value
2231 * @throws ClassCastException if o is not a Character object
2232 * @throws NullPointerException if o is null
2233 * @see #compareTo(Character)
2234 * @since 1.2
2236 public int compareTo(Object o)
2238 return compareTo((Character) o);
2242 * Returns an <code>Character</code> object wrapping the value.
2243 * In contrast to the <code>Character</code> constructor, this method
2244 * will cache some values. It is used by boxing conversion.
2246 * @param val the value to wrap
2247 * @return the <code>Character</code>
2249 * @since 1.5
2251 public static Character valueOf(char val)
2253 if (val > MAX_CACHE)
2254 return new Character(val);
2255 synchronized (charCache)
2257 if (charCache[val - MIN_VALUE] == null)
2258 charCache[val - MIN_VALUE] = new Character(val);
2259 return charCache[val - MIN_VALUE];
2264 * Reverse the bytes in val.
2265 * @since 1.5
2267 public static char reverseBytes(char val)
2269 return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
2273 * Converts a unicode code point to a UTF-16 representation of that
2274 * code point.
2276 * @param codePoint the unicode code point
2278 * @return the UTF-16 representation of that code point
2280 * @throws IllegalArgumentException if the code point is not a valid
2281 * unicode code point
2283 * @since 1.5
2285 public static char[] toChars(int codePoint)
2287 char[] result = new char[charCount(codePoint)];
2288 int ignore = toChars(codePoint, result, 0);
2289 return result;
2293 * Converts a unicode code point to its UTF-16 representation.
2295 * @param codePoint the unicode code point
2296 * @param dst the target char array
2297 * @param dstIndex the start index for the target
2299 * @return number of characters written to <code>dst</code>
2301 * @throws IllegalArgumentException if <code>codePoint</code> is not a
2302 * valid unicode code point
2303 * @throws NullPointerException if <code>dst</code> is <code>null</code>
2304 * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
2305 * in <code>dst</code> or if the UTF-16 representation does not
2306 * fit into <code>dst</code>
2308 * @since 1.5
2310 public static int toChars(int codePoint, char[] dst, int dstIndex)
2312 if (!isValidCodePoint(codePoint))
2314 throw new IllegalArgumentException("not a valid code point: "
2315 + codePoint);
2318 int result;
2319 if (isSupplementaryCodePoint(codePoint))
2321 // Write second char first to cause IndexOutOfBoundsException
2322 // immediately.
2323 final int cp2 = codePoint - 0x10000;
2324 dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
2325 dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
2326 result = 2;
2328 else
2330 dst[dstIndex] = (char) codePoint;
2331 result = 1;
2333 return result;
2337 * Return number of 16-bit characters required to represent the given
2338 * code point.
2340 * @param codePoint a unicode code point
2342 * @return 2 if codePoint >= 0x10000, 1 otherwise.
2344 * @since 1.5
2346 public static int charCount(int codePoint)
2348 return
2349 (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
2350 ? 2
2351 : 1;
2355 * Determines whether the specified code point is
2356 * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
2357 * supplementary character range.
2359 * @param codePoint a Unicode code point
2361 * @return <code>true</code> if code point is in supplementary range
2363 * @since 1.5
2365 public static boolean isSupplementaryCodePoint(int codePoint)
2367 return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
2368 && codePoint <= MAX_CODE_POINT;
2372 * Determines whether the specified code point is
2373 * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
2375 * @param codePoint a Unicode code point
2377 * @return <code>true</code> if code point is valid
2379 * @since 1.5
2381 public static boolean isValidCodePoint(int codePoint)
2383 return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
2387 * Return true if the given character is a high surrogate.
2388 * @param ch the character
2389 * @return true if the character is a high surrogate character
2391 * @since 1.5
2393 public static boolean isHighSurrogate(char ch)
2395 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
2399 * Return true if the given character is a low surrogate.
2400 * @param ch the character
2401 * @return true if the character is a low surrogate character
2403 * @since 1.5
2405 public static boolean isLowSurrogate(char ch)
2407 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
2411 * Return true if the given characters compose a surrogate pair.
2412 * This is true if the first character is a high surrogate and the
2413 * second character is a low surrogate.
2414 * @param ch1 the first character
2415 * @param ch2 the first character
2416 * @return true if the characters compose a surrogate pair
2418 * @since 1.5
2420 public static boolean isSurrogatePair(char ch1, char ch2)
2422 return isHighSurrogate(ch1) && isLowSurrogate(ch2);
2426 * Given a valid surrogate pair, this returns the corresponding
2427 * code point.
2428 * @param high the high character of the pair
2429 * @param low the low character of the pair
2430 * @return the corresponding code point
2432 * @since 1.5
2434 public static int toCodePoint(char high, char low)
2436 return ((high - MIN_HIGH_SURROGATE) * 0x400) +
2437 (low - MIN_LOW_SURROGATE) + 0x10000;
2441 * Get the code point at the specified index in the CharSequence.
2442 * This is like CharSequence#charAt(int), but if the character is
2443 * the start of a surrogate pair, and there is a following
2444 * character, and this character completes the pair, then the
2445 * corresponding supplementary code point is returned. Otherwise,
2446 * the character at the index is returned.
2448 * @param sequence the CharSequence
2449 * @param index the index of the codepoint to get, starting at 0
2450 * @return the codepoint at the specified index
2451 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2452 * @since 1.5
2454 public static int codePointAt(CharSequence sequence, int index)
2456 int len = sequence.length();
2457 if (index < 0 || index >= len)
2458 throw new IndexOutOfBoundsException();
2459 char high = sequence.charAt(index);
2460 if (! isHighSurrogate(high) || ++index >= len)
2461 return high;
2462 char low = sequence.charAt(index);
2463 if (! isLowSurrogate(low))
2464 return high;
2465 return toCodePoint(high, low);
2469 * Get the code point at the specified index in the CharSequence.
2470 * If the character is the start of a surrogate pair, and there is a
2471 * following character, and this character completes the pair, then
2472 * the corresponding supplementary code point is returned.
2473 * Otherwise, the character at the index is returned.
2475 * @param chars the character array in which to look
2476 * @param index the index of the codepoint to get, starting at 0
2477 * @return the codepoint at the specified index
2478 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2479 * @since 1.5
2481 public static int codePointAt(char[] chars, int index)
2483 return codePointAt(chars, index, chars.length);
2487 * Get the code point at the specified index in the CharSequence.
2488 * If the character is the start of a surrogate pair, and there is a
2489 * following character within the specified range, and this
2490 * character completes the pair, then the corresponding
2491 * supplementary code point is returned. Otherwise, the character
2492 * at the index is returned.
2494 * @param chars the character array in which to look
2495 * @param index the index of the codepoint to get, starting at 0
2496 * @param limit the limit past which characters should not be examined
2497 * @return the codepoint at the specified index
2498 * @throws IndexOutOfBoundsException if index is negative or &gt;=
2499 * limit, or if limit is negative or &gt;= the length of the array
2500 * @since 1.5
2502 public static int codePointAt(char[] chars, int index, int limit)
2504 if (index < 0 || index >= limit || limit < 0 || limit >= chars.length)
2505 throw new IndexOutOfBoundsException();
2506 char high = chars[index];
2507 if (! isHighSurrogate(high) || ++index >= limit)
2508 return high;
2509 char low = chars[index];
2510 if (! isLowSurrogate(low))
2511 return high;
2512 return toCodePoint(high, low);
2516 * Get the code point before the specified index. This is like
2517 * #codePointAt(char[], int), but checks the characters at
2518 * <code>index-1</code> and <code>index-2</code> to see if they form
2519 * a supplementary code point. If they do not, the character at
2520 * <code>index-1</code> is returned.
2522 * @param chars the character array
2523 * @param index the index just past the codepoint to get, starting at 0
2524 * @return the codepoint at the specified index
2525 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2526 * @since 1.5
2528 public static int codePointBefore(char[] chars, int index)
2530 return codePointBefore(chars, index, 1);
2534 * Get the code point before the specified index. This is like
2535 * #codePointAt(char[], int), but checks the characters at
2536 * <code>index-1</code> and <code>index-2</code> to see if they form
2537 * a supplementary code point. If they do not, the character at
2538 * <code>index-1</code> is returned. The start parameter is used to
2539 * limit the range of the array which may be examined.
2541 * @param chars the character array
2542 * @param index the index just past the codepoint to get, starting at 0
2543 * @param start the index before which characters should not be examined
2544 * @return the codepoint at the specified index
2545 * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
2546 * the length of the array, or if limit is negative or &gt;= the
2547 * length of the array
2548 * @since 1.5
2550 public static int codePointBefore(char[] chars, int index, int start)
2552 if (index < start || index > chars.length
2553 || start < 0 || start >= chars.length)
2554 throw new IndexOutOfBoundsException();
2555 --index;
2556 char low = chars[index];
2557 if (! isLowSurrogate(low) || --index < start)
2558 return low;
2559 char high = chars[index];
2560 if (! isHighSurrogate(high))
2561 return low;
2562 return toCodePoint(high, low);
2566 * Get the code point before the specified index. This is like
2567 * #codePointAt(CharSequence, int), but checks the characters at
2568 * <code>index-1</code> and <code>index-2</code> to see if they form
2569 * a supplementary code point. If they do not, the character at
2570 * <code>index-1</code> is returned.
2572 * @param sequence the CharSequence
2573 * @param index the index just past the codepoint to get, starting at 0
2574 * @return the codepoint at the specified index
2575 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
2576 * @since 1.5
2578 public static int codePointBefore(CharSequence sequence, int index)
2580 int len = sequence.length();
2581 if (index < 1 || index > len)
2582 throw new IndexOutOfBoundsException();
2583 --index;
2584 char low = sequence.charAt(index);
2585 if (! isLowSurrogate(low) || --index < 0)
2586 return low;
2587 char high = sequence.charAt(index);
2588 if (! isHighSurrogate(high))
2589 return low;
2590 return toCodePoint(high, low);
2592 } // class Character