Imported GNU Classpath 0.90
[official-gcc.git] / libjava / classpath / java / lang / Character.java
blob59ae12f7790a48ed441b07b6c7b89137a5b41245
1 /* java.lang.Character -- Wrapper class for char, and Unicode subsets
2 Copyright (C) 1998, 1999, 2001, 2002, 2005 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package java.lang;
41 import gnu.java.lang.CharData;
43 import java.io.Serializable;
44 import java.text.Collator;
45 import java.util.Locale;
47 /**
48 * Wrapper class for the primitive char data type. In addition, this class
49 * allows one to retrieve property information and perform transformations
50 * on the defined characters in the Unicode Standard, Version 4.0.0.
51 * java.lang.Character is designed to be very dynamic, and as such, it
52 * retrieves information on the Unicode character set from a separate
53 * database, gnu.java.lang.CharData, which can be easily upgraded.
55 * <p>For predicates, boundaries are used to describe
56 * the set of characters for which the method will return true.
57 * This syntax uses fairly normal regular expression notation.
58 * See 5.13 of the Unicode Standard, Version 4.0, for the
59 * boundary specification.
61 * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
62 * for more information on the Unicode Standard.
64 * @author Tom Tromey (tromey@cygnus.com)
65 * @author Paul N. Fisher
66 * @author Jochen Hoenicke
67 * @author Eric Blake (ebb9@email.byu.edu)
68 * @see CharData
69 * @since 1.0
70 * @status updated to 1.4
72 public final class Character implements Serializable, Comparable
74 /**
75 * A subset of Unicode blocks.
77 * @author Paul N. Fisher
78 * @author Eric Blake (ebb9@email.byu.edu)
79 * @since 1.2
81 public static class Subset
83 /** The name of the subset. */
84 private final String name;
86 /**
87 * Construct a new subset of characters.
89 * @param name the name of the subset
90 * @throws NullPointerException if name is null
92 protected Subset(String name)
94 // Note that name.toString() is name, unless name was null.
95 this.name = name.toString();
98 /**
99 * Compares two Subsets for equality. This is <code>final</code>, and
100 * restricts the comparison on the <code>==</code> operator, so it returns
101 * true only for the same object.
103 * @param o the object to compare
104 * @return true if o is this
106 public final boolean equals(Object o)
108 return o == this;
112 * Makes the original hashCode of Object final, to be consistent with
113 * equals.
115 * @return the hash code for this object
117 public final int hashCode()
119 return super.hashCode();
123 * Returns the name of the subset.
125 * @return the name
127 public final String toString()
129 return name;
131 } // class Subset
134 * A family of character subsets in the Unicode specification. A character
135 * is in at most one of these blocks.
137 * This inner class was generated automatically from
138 * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts.
139 * This Unicode definition file can be found on the
140 * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
141 * JDK 1.5 uses Unicode version 4.0.0.
143 * @author scripts/unicode-blocks.pl (written by Eric Blake)
144 * @since 1.2
146 public static final class UnicodeBlock extends Subset
148 /** The start of the subset. */
149 private final int start;
151 /** The end of the subset. */
152 private final int end;
154 /** The canonical name of the block according to the Unicode standard. */
155 private final String canonicalName;
157 /** Constants for the <code>forName()</code> method */
158 private static final int CANONICAL_NAME = 0;
159 private static final int NO_SPACES_NAME = 1;
160 private static final int CONSTANT_NAME = 2;
/**
 * Creates a block covering a fixed, inclusive range of code points.
 *
 * @param start the first code point of the range
 * @param end the last code point of the range
 * @param name the block name, passed on to the <code>Subset</code>
 *        constructor
 * @param canonicalName the name of the block as defined in the Unicode
 *        standard
 */
private UnicodeBlock(int start, int end, String name,
                     String canonicalName)
{
  super(name);
  // The three assignments are independent of one another.
  this.canonicalName = canonicalName;
  this.end = end;
  this.start = start;
}
181 * Returns the Unicode character block which a character belongs to.
182 * <strong>Note</strong>: This method does not support the use of
183 * supplementary characters. For such support, <code>of(int)</code>
184 * should be used instead.
186 * @param ch the character to look up
187 * @return the set it belongs to, or null if it is not in one
189 public static UnicodeBlock of(char ch)
191 return of((int) ch);
195 * Returns the Unicode character block which a code point belongs to.
197 * @param codePoint the character to look up
198 * @return the set it belongs to, or null if it is not in one.
199 * @throws IllegalArgumentException if the specified code point is
200 * invalid.
201 * @since 1.5
203 public static UnicodeBlock of(int codePoint)
205 if (codePoint > MAX_CODE_POINT)
206 throw new IllegalArgumentException("The supplied integer value is " +
207 "too large to be a codepoint.");
208 // Simple binary search for the correct block.
209 int low = 0;
210 int hi = sets.length - 1;
211 while (low <= hi)
213 int mid = (low + hi) >> 1;
214 UnicodeBlock b = sets[mid];
215 if (codePoint < b.start)
216 hi = mid - 1;
217 else if (codePoint > b.end)
218 low = mid + 1;
219 else
220 return b;
222 return null;
226 * <p>
227 * Returns the <code>UnicodeBlock</code> with the given name, as defined
228 * by the Unicode standard. The version of Unicode in use is defined by
229 * the <code>Character</code> class, and the names are given in the
230 * <code>Blocks-<version>.txt</code> file corresponding to that version.
231 * The name may be specified in one of three ways:
232 * </p>
233 * <ol>
234 * <li>The canonical, human-readable name used by the Unicode standard.
235 * This is the name with all spaces and hyphens retained. For example,
236 * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
237 * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
238 * <li>The name used for the constants specified by this class, which
239 * is the canonical name with all spaces and hyphens replaced with
240 * underscores e.g. `BASIC_LATIN'</li>
241 * </ol>
242 * <p>
243 * The names are compared case-insensitively using the case comparison
244 * associated with the U.S. English locale. The method recognises the
245 * previous names used for blocks as well as the current ones. At
246 * present, this simply means that the deprecated `SURROGATES_AREA'
247 * will be recognised by this method (the <code>of()</code> methods
248 * only return one of the three new surrogate blocks).
249 * </p>
251 * @param blockName the name of the block to look up.
252 * @return the specified block.
253 * @throws NullPointerException if the <code>blockName</code> is
254 * <code>null</code>.
255 * @throws IllegalArgumentException if the name does not match any Unicode
256 * block.
257 * @since 1.5
259 public static final UnicodeBlock forName(String blockName)
261 int type;
262 if (blockName.indexOf(' ') != -1)
263 type = CANONICAL_NAME;
264 else if (blockName.indexOf('_') != -1)
265 type = CONSTANT_NAME;
266 else
267 type = NO_SPACES_NAME;
268 Collator usCollator = Collator.getInstance(Locale.US);
269 usCollator.setStrength(Collator.PRIMARY);
270 /* Special case for deprecated blocks not in sets */
271 switch (type)
273 case CANONICAL_NAME:
274 if (usCollator.compare(blockName, "Surrogates Area") == 0)
275 return SURROGATES_AREA;
276 break;
277 case NO_SPACES_NAME:
278 if (usCollator.compare(blockName, "SurrogatesArea") == 0)
279 return SURROGATES_AREA;
280 break;
281 case CONSTANT_NAME:
282 if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
283 return SURROGATES_AREA;
284 break;
286 /* Other cases */
287 int setLength = sets.length;
288 switch (type)
290 case CANONICAL_NAME:
291 for (int i = 0; i < setLength; i++)
293 UnicodeBlock block = sets[i];
294 if (usCollator.compare(blockName, block.canonicalName) == 0)
295 return block;
297 break;
298 case NO_SPACES_NAME:
299 for (int i = 0; i < setLength; i++)
301 UnicodeBlock block = sets[i];
302 String nsName = block.canonicalName.replaceAll(" ","");
303 if (usCollator.compare(blockName, nsName) == 0)
304 return block;
306 break;
307 case CONSTANT_NAME:
308 for (int i = 0; i < setLength; i++)
310 UnicodeBlock block = sets[i];
311 if (usCollator.compare(blockName, block.toString()) == 0)
312 return block;
314 break;
316 throw new IllegalArgumentException("No Unicode block found for " +
317 blockName + ".");
321 * Basic Latin.
322 * 0x0000 - 0x007F.
324 public static final UnicodeBlock BASIC_LATIN
325 = new UnicodeBlock(0x0000, 0x007F,
326 "BASIC_LATIN",
327 "Basic Latin");
330 * Latin-1 Supplement.
331 * 0x0080 - 0x00FF.
333 public static final UnicodeBlock LATIN_1_SUPPLEMENT
334 = new UnicodeBlock(0x0080, 0x00FF,
335 "LATIN_1_SUPPLEMENT",
336 "Latin-1 Supplement");
339 * Latin Extended-A.
340 * 0x0100 - 0x017F.
342 public static final UnicodeBlock LATIN_EXTENDED_A
343 = new UnicodeBlock(0x0100, 0x017F,
344 "LATIN_EXTENDED_A",
345 "Latin Extended-A");
348 * Latin Extended-B.
349 * 0x0180 - 0x024F.
351 public static final UnicodeBlock LATIN_EXTENDED_B
352 = new UnicodeBlock(0x0180, 0x024F,
353 "LATIN_EXTENDED_B",
354 "Latin Extended-B");
357 * IPA Extensions.
358 * 0x0250 - 0x02AF.
360 public static final UnicodeBlock IPA_EXTENSIONS
361 = new UnicodeBlock(0x0250, 0x02AF,
362 "IPA_EXTENSIONS",
363 "IPA Extensions");
366 * Spacing Modifier Letters.
367 * 0x02B0 - 0x02FF.
369 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
370 = new UnicodeBlock(0x02B0, 0x02FF,
371 "SPACING_MODIFIER_LETTERS",
372 "Spacing Modifier Letters");
375 * Combining Diacritical Marks.
376 * 0x0300 - 0x036F.
378 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
379 = new UnicodeBlock(0x0300, 0x036F,
380 "COMBINING_DIACRITICAL_MARKS",
381 "Combining Diacritical Marks");
384 * Greek.
385 * 0x0370 - 0x03FF.
387 public static final UnicodeBlock GREEK
388 = new UnicodeBlock(0x0370, 0x03FF,
389 "GREEK",
390 "Greek");
393 * Cyrillic.
394 * 0x0400 - 0x04FF.
396 public static final UnicodeBlock CYRILLIC
397 = new UnicodeBlock(0x0400, 0x04FF,
398 "CYRILLIC",
399 "Cyrillic");
402 * Cyrillic Supplementary.
403 * 0x0500 - 0x052F.
404 * @since 1.5
406 public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
407 = new UnicodeBlock(0x0500, 0x052F,
408 "CYRILLIC_SUPPLEMENTARY",
409 "Cyrillic Supplementary");
412 * Armenian.
413 * 0x0530 - 0x058F.
415 public static final UnicodeBlock ARMENIAN
416 = new UnicodeBlock(0x0530, 0x058F,
417 "ARMENIAN",
418 "Armenian");
421 * Hebrew.
422 * 0x0590 - 0x05FF.
424 public static final UnicodeBlock HEBREW
425 = new UnicodeBlock(0x0590, 0x05FF,
426 "HEBREW",
427 "Hebrew");
430 * Arabic.
431 * 0x0600 - 0x06FF.
433 public static final UnicodeBlock ARABIC
434 = new UnicodeBlock(0x0600, 0x06FF,
435 "ARABIC",
436 "Arabic");
439 * Syriac.
440 * 0x0700 - 0x074F.
441 * @since 1.4
443 public static final UnicodeBlock SYRIAC
444 = new UnicodeBlock(0x0700, 0x074F,
445 "SYRIAC",
446 "Syriac");
449 * Thaana.
450 * 0x0780 - 0x07BF.
451 * @since 1.4
453 public static final UnicodeBlock THAANA
454 = new UnicodeBlock(0x0780, 0x07BF,
455 "THAANA",
456 "Thaana");
459 * Devanagari.
460 * 0x0900 - 0x097F.
462 public static final UnicodeBlock DEVANAGARI
463 = new UnicodeBlock(0x0900, 0x097F,
464 "DEVANAGARI",
465 "Devanagari");
468 * Bengali.
469 * 0x0980 - 0x09FF.
471 public static final UnicodeBlock BENGALI
472 = new UnicodeBlock(0x0980, 0x09FF,
473 "BENGALI",
474 "Bengali");
477 * Gurmukhi.
478 * 0x0A00 - 0x0A7F.
480 public static final UnicodeBlock GURMUKHI
481 = new UnicodeBlock(0x0A00, 0x0A7F,
482 "GURMUKHI",
483 "Gurmukhi");
486 * Gujarati.
487 * 0x0A80 - 0x0AFF.
489 public static final UnicodeBlock GUJARATI
490 = new UnicodeBlock(0x0A80, 0x0AFF,
491 "GUJARATI",
492 "Gujarati");
495 * Oriya.
496 * 0x0B00 - 0x0B7F.
498 public static final UnicodeBlock ORIYA
499 = new UnicodeBlock(0x0B00, 0x0B7F,
500 "ORIYA",
501 "Oriya");
504 * Tamil.
505 * 0x0B80 - 0x0BFF.
507 public static final UnicodeBlock TAMIL
508 = new UnicodeBlock(0x0B80, 0x0BFF,
509 "TAMIL",
510 "Tamil");
513 * Telugu.
514 * 0x0C00 - 0x0C7F.
516 public static final UnicodeBlock TELUGU
517 = new UnicodeBlock(0x0C00, 0x0C7F,
518 "TELUGU",
519 "Telugu");
522 * Kannada.
523 * 0x0C80 - 0x0CFF.
525 public static final UnicodeBlock KANNADA
526 = new UnicodeBlock(0x0C80, 0x0CFF,
527 "KANNADA",
528 "Kannada");
531 * Malayalam.
532 * 0x0D00 - 0x0D7F.
534 public static final UnicodeBlock MALAYALAM
535 = new UnicodeBlock(0x0D00, 0x0D7F,
536 "MALAYALAM",
537 "Malayalam");
540 * Sinhala.
541 * 0x0D80 - 0x0DFF.
542 * @since 1.4
544 public static final UnicodeBlock SINHALA
545 = new UnicodeBlock(0x0D80, 0x0DFF,
546 "SINHALA",
547 "Sinhala");
550 * Thai.
551 * 0x0E00 - 0x0E7F.
553 public static final UnicodeBlock THAI
554 = new UnicodeBlock(0x0E00, 0x0E7F,
555 "THAI",
556 "Thai");
559 * Lao.
560 * 0x0E80 - 0x0EFF.
562 public static final UnicodeBlock LAO
563 = new UnicodeBlock(0x0E80, 0x0EFF,
564 "LAO",
565 "Lao");
568 * Tibetan.
569 * 0x0F00 - 0x0FFF.
571 public static final UnicodeBlock TIBETAN
572 = new UnicodeBlock(0x0F00, 0x0FFF,
573 "TIBETAN",
574 "Tibetan");
577 * Myanmar.
578 * 0x1000 - 0x109F.
579 * @since 1.4
581 public static final UnicodeBlock MYANMAR
582 = new UnicodeBlock(0x1000, 0x109F,
583 "MYANMAR",
584 "Myanmar");
587 * Georgian.
588 * 0x10A0 - 0x10FF.
590 public static final UnicodeBlock GEORGIAN
591 = new UnicodeBlock(0x10A0, 0x10FF,
592 "GEORGIAN",
593 "Georgian");
596 * Hangul Jamo.
597 * 0x1100 - 0x11FF.
599 public static final UnicodeBlock HANGUL_JAMO
600 = new UnicodeBlock(0x1100, 0x11FF,
601 "HANGUL_JAMO",
602 "Hangul Jamo");
605 * Ethiopic.
606 * 0x1200 - 0x137F.
607 * @since 1.4
609 public static final UnicodeBlock ETHIOPIC
610 = new UnicodeBlock(0x1200, 0x137F,
611 "ETHIOPIC",
612 "Ethiopic");
615 * Cherokee.
616 * 0x13A0 - 0x13FF.
617 * @since 1.4
619 public static final UnicodeBlock CHEROKEE
620 = new UnicodeBlock(0x13A0, 0x13FF,
621 "CHEROKEE",
622 "Cherokee");
625 * Unified Canadian Aboriginal Syllabics.
626 * 0x1400 - 0x167F.
627 * @since 1.4
629 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
630 = new UnicodeBlock(0x1400, 0x167F,
631 "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
632 "Unified Canadian Aboriginal Syllabics");
635 * Ogham.
636 * 0x1680 - 0x169F.
637 * @since 1.4
639 public static final UnicodeBlock OGHAM
640 = new UnicodeBlock(0x1680, 0x169F,
641 "OGHAM",
642 "Ogham");
645 * Runic.
646 * 0x16A0 - 0x16FF.
647 * @since 1.4
649 public static final UnicodeBlock RUNIC
650 = new UnicodeBlock(0x16A0, 0x16FF,
651 "RUNIC",
652 "Runic");
655 * Tagalog.
656 * 0x1700 - 0x171F.
657 * @since 1.5
659 public static final UnicodeBlock TAGALOG
660 = new UnicodeBlock(0x1700, 0x171F,
661 "TAGALOG",
662 "Tagalog");
665 * Hanunoo.
666 * 0x1720 - 0x173F.
667 * @since 1.5
669 public static final UnicodeBlock HANUNOO
670 = new UnicodeBlock(0x1720, 0x173F,
671 "HANUNOO",
672 "Hanunoo");
675 * Buhid.
676 * 0x1740 - 0x175F.
677 * @since 1.5
679 public static final UnicodeBlock BUHID
680 = new UnicodeBlock(0x1740, 0x175F,
681 "BUHID",
682 "Buhid");
685 * Tagbanwa.
686 * 0x1760 - 0x177F.
687 * @since 1.5
689 public static final UnicodeBlock TAGBANWA
690 = new UnicodeBlock(0x1760, 0x177F,
691 "TAGBANWA",
692 "Tagbanwa");
695 * Khmer.
696 * 0x1780 - 0x17FF.
697 * @since 1.4
699 public static final UnicodeBlock KHMER
700 = new UnicodeBlock(0x1780, 0x17FF,
701 "KHMER",
702 "Khmer");
705 * Mongolian.
706 * 0x1800 - 0x18AF.
707 * @since 1.4
709 public static final UnicodeBlock MONGOLIAN
710 = new UnicodeBlock(0x1800, 0x18AF,
711 "MONGOLIAN",
712 "Mongolian");
715 * Limbu.
716 * 0x1900 - 0x194F.
717 * @since 1.5
719 public static final UnicodeBlock LIMBU
720 = new UnicodeBlock(0x1900, 0x194F,
721 "LIMBU",
722 "Limbu");
725 * Tai Le.
726 * 0x1950 - 0x197F.
727 * @since 1.5
729 public static final UnicodeBlock TAI_LE
730 = new UnicodeBlock(0x1950, 0x197F,
731 "TAI_LE",
732 "Tai Le");
735 * Khmer Symbols.
736 * 0x19E0 - 0x19FF.
737 * @since 1.5
739 public static final UnicodeBlock KHMER_SYMBOLS
740 = new UnicodeBlock(0x19E0, 0x19FF,
741 "KHMER_SYMBOLS",
742 "Khmer Symbols");
745 * Phonetic Extensions.
746 * 0x1D00 - 0x1D7F.
747 * @since 1.5
749 public static final UnicodeBlock PHONETIC_EXTENSIONS
750 = new UnicodeBlock(0x1D00, 0x1D7F,
751 "PHONETIC_EXTENSIONS",
752 "Phonetic Extensions");
755 * Latin Extended Additional.
756 * 0x1E00 - 0x1EFF.
758 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
759 = new UnicodeBlock(0x1E00, 0x1EFF,
760 "LATIN_EXTENDED_ADDITIONAL",
761 "Latin Extended Additional");
764 * Greek Extended.
765 * 0x1F00 - 0x1FFF.
767 public static final UnicodeBlock GREEK_EXTENDED
768 = new UnicodeBlock(0x1F00, 0x1FFF,
769 "GREEK_EXTENDED",
770 "Greek Extended");
773 * General Punctuation.
774 * 0x2000 - 0x206F.
776 public static final UnicodeBlock GENERAL_PUNCTUATION
777 = new UnicodeBlock(0x2000, 0x206F,
778 "GENERAL_PUNCTUATION",
779 "General Punctuation");
782 * Superscripts and Subscripts.
783 * 0x2070 - 0x209F.
785 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
786 = new UnicodeBlock(0x2070, 0x209F,
787 "SUPERSCRIPTS_AND_SUBSCRIPTS",
788 "Superscripts and Subscripts");
791 * Currency Symbols.
792 * 0x20A0 - 0x20CF.
794 public static final UnicodeBlock CURRENCY_SYMBOLS
795 = new UnicodeBlock(0x20A0, 0x20CF,
796 "CURRENCY_SYMBOLS",
797 "Currency Symbols");
800 * Combining Marks for Symbols.
801 * 0x20D0 - 0x20FF.
803 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
804 = new UnicodeBlock(0x20D0, 0x20FF,
805 "COMBINING_MARKS_FOR_SYMBOLS",
806 "Combining Marks for Symbols");
809 * Letterlike Symbols.
810 * 0x2100 - 0x214F.
812 public static final UnicodeBlock LETTERLIKE_SYMBOLS
813 = new UnicodeBlock(0x2100, 0x214F,
814 "LETTERLIKE_SYMBOLS",
815 "Letterlike Symbols");
818 * Number Forms.
819 * 0x2150 - 0x218F.
821 public static final UnicodeBlock NUMBER_FORMS
822 = new UnicodeBlock(0x2150, 0x218F,
823 "NUMBER_FORMS",
824 "Number Forms");
827 * Arrows.
828 * 0x2190 - 0x21FF.
830 public static final UnicodeBlock ARROWS
831 = new UnicodeBlock(0x2190, 0x21FF,
832 "ARROWS",
833 "Arrows");
836 * Mathematical Operators.
837 * 0x2200 - 0x22FF.
839 public static final UnicodeBlock MATHEMATICAL_OPERATORS
840 = new UnicodeBlock(0x2200, 0x22FF,
841 "MATHEMATICAL_OPERATORS",
842 "Mathematical Operators");
845 * Miscellaneous Technical.
846 * 0x2300 - 0x23FF.
848 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
849 = new UnicodeBlock(0x2300, 0x23FF,
850 "MISCELLANEOUS_TECHNICAL",
851 "Miscellaneous Technical");
854 * Control Pictures.
855 * 0x2400 - 0x243F.
857 public static final UnicodeBlock CONTROL_PICTURES
858 = new UnicodeBlock(0x2400, 0x243F,
859 "CONTROL_PICTURES",
860 "Control Pictures");
863 * Optical Character Recognition.
864 * 0x2440 - 0x245F.
866 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
867 = new UnicodeBlock(0x2440, 0x245F,
868 "OPTICAL_CHARACTER_RECOGNITION",
869 "Optical Character Recognition");
872 * Enclosed Alphanumerics.
873 * 0x2460 - 0x24FF.
875 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
876 = new UnicodeBlock(0x2460, 0x24FF,
877 "ENCLOSED_ALPHANUMERICS",
878 "Enclosed Alphanumerics");
881 * Box Drawing.
882 * 0x2500 - 0x257F.
884 public static final UnicodeBlock BOX_DRAWING
885 = new UnicodeBlock(0x2500, 0x257F,
886 "BOX_DRAWING",
887 "Box Drawing");
890 * Block Elements.
891 * 0x2580 - 0x259F.
893 public static final UnicodeBlock BLOCK_ELEMENTS
894 = new UnicodeBlock(0x2580, 0x259F,
895 "BLOCK_ELEMENTS",
896 "Block Elements");
899 * Geometric Shapes.
900 * 0x25A0 - 0x25FF.
902 public static final UnicodeBlock GEOMETRIC_SHAPES
903 = new UnicodeBlock(0x25A0, 0x25FF,
904 "GEOMETRIC_SHAPES",
905 "Geometric Shapes");
908 * Miscellaneous Symbols.
909 * 0x2600 - 0x26FF.
911 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
912 = new UnicodeBlock(0x2600, 0x26FF,
913 "MISCELLANEOUS_SYMBOLS",
914 "Miscellaneous Symbols");
917 * Dingbats.
918 * 0x2700 - 0x27BF.
920 public static final UnicodeBlock DINGBATS
921 = new UnicodeBlock(0x2700, 0x27BF,
922 "DINGBATS",
923 "Dingbats");
926 * Miscellaneous Mathematical Symbols-A.
927 * 0x27C0 - 0x27EF.
928 * @since 1.5
930 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
931 = new UnicodeBlock(0x27C0, 0x27EF,
932 "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
933 "Miscellaneous Mathematical Symbols-A");
936 * Supplemental Arrows-A.
937 * 0x27F0 - 0x27FF.
938 * @since 1.5
940 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
941 = new UnicodeBlock(0x27F0, 0x27FF,
942 "SUPPLEMENTAL_ARROWS_A",
943 "Supplemental Arrows-A");
946 * Braille Patterns.
947 * 0x2800 - 0x28FF.
948 * @since 1.4
950 public static final UnicodeBlock BRAILLE_PATTERNS
951 = new UnicodeBlock(0x2800, 0x28FF,
952 "BRAILLE_PATTERNS",
953 "Braille Patterns");
956 * Supplemental Arrows-B.
957 * 0x2900 - 0x297F.
958 * @since 1.5
960 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
961 = new UnicodeBlock(0x2900, 0x297F,
962 "SUPPLEMENTAL_ARROWS_B",
963 "Supplemental Arrows-B");
966 * Miscellaneous Mathematical Symbols-B.
967 * 0x2980 - 0x29FF.
968 * @since 1.5
970 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
971 = new UnicodeBlock(0x2980, 0x29FF,
972 "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
973 "Miscellaneous Mathematical Symbols-B");
976 * Supplemental Mathematical Operators.
977 * 0x2A00 - 0x2AFF.
978 * @since 1.5
980 public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
981 = new UnicodeBlock(0x2A00, 0x2AFF,
982 "SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
983 "Supplemental Mathematical Operators");
986 * Miscellaneous Symbols and Arrows.
987 * 0x2B00 - 0x2BFF.
988 * @since 1.5
990 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
991 = new UnicodeBlock(0x2B00, 0x2BFF,
992 "MISCELLANEOUS_SYMBOLS_AND_ARROWS",
993 "Miscellaneous Symbols and Arrows");
996 * CJK Radicals Supplement.
997 * 0x2E80 - 0x2EFF.
998 * @since 1.4
1000 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
1001 = new UnicodeBlock(0x2E80, 0x2EFF,
1002 "CJK_RADICALS_SUPPLEMENT",
1003 "CJK Radicals Supplement");
1006 * Kangxi Radicals.
1007 * 0x2F00 - 0x2FDF.
1008 * @since 1.4
1010 public static final UnicodeBlock KANGXI_RADICALS
1011 = new UnicodeBlock(0x2F00, 0x2FDF,
1012 "KANGXI_RADICALS",
1013 "Kangxi Radicals");
1016 * Ideographic Description Characters.
1017 * 0x2FF0 - 0x2FFF.
1018 * @since 1.4
1020 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
1021 = new UnicodeBlock(0x2FF0, 0x2FFF,
1022 "IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
1023 "Ideographic Description Characters");
1026 * CJK Symbols and Punctuation.
1027 * 0x3000 - 0x303F.
1029 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
1030 = new UnicodeBlock(0x3000, 0x303F,
1031 "CJK_SYMBOLS_AND_PUNCTUATION",
1032 "CJK Symbols and Punctuation");
1035 * Hiragana.
1036 * 0x3040 - 0x309F.
1038 public static final UnicodeBlock HIRAGANA
1039 = new UnicodeBlock(0x3040, 0x309F,
1040 "HIRAGANA",
1041 "Hiragana");
1044 * Katakana.
1045 * 0x30A0 - 0x30FF.
1047 public static final UnicodeBlock KATAKANA
1048 = new UnicodeBlock(0x30A0, 0x30FF,
1049 "KATAKANA",
1050 "Katakana");
1053 * Bopomofo.
1054 * 0x3100 - 0x312F.
1056 public static final UnicodeBlock BOPOMOFO
1057 = new UnicodeBlock(0x3100, 0x312F,
1058 "BOPOMOFO",
1059 "Bopomofo");
1062 * Hangul Compatibility Jamo.
1063 * 0x3130 - 0x318F.
1065 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
1066 = new UnicodeBlock(0x3130, 0x318F,
1067 "HANGUL_COMPATIBILITY_JAMO",
1068 "Hangul Compatibility Jamo");
1071 * Kanbun.
1072 * 0x3190 - 0x319F.
1074 public static final UnicodeBlock KANBUN
1075 = new UnicodeBlock(0x3190, 0x319F,
1076 "KANBUN",
1077 "Kanbun");
1080 * Bopomofo Extended.
1081 * 0x31A0 - 0x31BF.
1082 * @since 1.4
1084 public static final UnicodeBlock BOPOMOFO_EXTENDED
1085 = new UnicodeBlock(0x31A0, 0x31BF,
1086 "BOPOMOFO_EXTENDED",
1087 "Bopomofo Extended");
1090 * Katakana Phonetic Extensions.
1091 * 0x31F0 - 0x31FF.
1092 * @since 1.5
1094 public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
1095 = new UnicodeBlock(0x31F0, 0x31FF,
1096 "KATAKANA_PHONETIC_EXTENSIONS",
1097 "Katakana Phonetic Extensions");
1100 * Enclosed CJK Letters and Months.
1101 * 0x3200 - 0x32FF.
1103 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
1104 = new UnicodeBlock(0x3200, 0x32FF,
1105 "ENCLOSED_CJK_LETTERS_AND_MONTHS",
1106 "Enclosed CJK Letters and Months");
1109 * CJK Compatibility.
1110 * 0x3300 - 0x33FF.
1112 public static final UnicodeBlock CJK_COMPATIBILITY
1113 = new UnicodeBlock(0x3300, 0x33FF,
1114 "CJK_COMPATIBILITY",
1115 "CJK Compatibility");
1118 * CJK Unified Ideographs Extension A.
1119 * 0x3400 - 0x4DBF.
1120 * @since 1.4
1122 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1123 = new UnicodeBlock(0x3400, 0x4DBF,
1124 "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
1125 "CJK Unified Ideographs Extension A");
1128 * Yijing Hexagram Symbols.
1129 * 0x4DC0 - 0x4DFF.
1130 * @since 1.5
1132 public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
1133 = new UnicodeBlock(0x4DC0, 0x4DFF,
1134 "YIJING_HEXAGRAM_SYMBOLS",
1135 "Yijing Hexagram Symbols");
1138 * CJK Unified Ideographs.
1139 * 0x4E00 - 0x9FFF.
1141 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
1142 = new UnicodeBlock(0x4E00, 0x9FFF,
1143 "CJK_UNIFIED_IDEOGRAPHS",
1144 "CJK Unified Ideographs");
1147 * Yi Syllables.
1148 * 0xA000 - 0xA48F.
1149 * @since 1.4
1151 public static final UnicodeBlock YI_SYLLABLES
1152 = new UnicodeBlock(0xA000, 0xA48F,
1153 "YI_SYLLABLES",
1154 "Yi Syllables");
1157 * Yi Radicals.
1158 * 0xA490 - 0xA4CF.
1159 * @since 1.4
1161 public static final UnicodeBlock YI_RADICALS
1162 = new UnicodeBlock(0xA490, 0xA4CF,
1163 "YI_RADICALS",
1164 "Yi Radicals");
1167 * Hangul Syllables.
1168 * 0xAC00 - 0xD7AF.
1170 public static final UnicodeBlock HANGUL_SYLLABLES
1171 = new UnicodeBlock(0xAC00, 0xD7AF,
1172 "HANGUL_SYLLABLES",
1173 "Hangul Syllables");
1176 * High Surrogates.
1177 * 0xD800 - 0xDB7F.
1178 * @since 1.5
1180 public static final UnicodeBlock HIGH_SURROGATES
1181 = new UnicodeBlock(0xD800, 0xDB7F,
1182 "HIGH_SURROGATES",
1183 "High Surrogates");
1186 * High Private Use Surrogates.
1187 * 0xDB80 - 0xDBFF.
1188 * @since 1.5
1190 public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
1191 = new UnicodeBlock(0xDB80, 0xDBFF,
1192 "HIGH_PRIVATE_USE_SURROGATES",
1193 "High Private Use Surrogates");
1196 * Low Surrogates.
1197 * 0xDC00 - 0xDFFF.
1198 * @since 1.5
1200 public static final UnicodeBlock LOW_SURROGATES
1201 = new UnicodeBlock(0xDC00, 0xDFFF,
1202 "LOW_SURROGATES",
1203 "Low Surrogates");
1206 * Private Use Area.
1207 * 0xE000 - 0xF8FF.
1209 public static final UnicodeBlock PRIVATE_USE_AREA
1210 = new UnicodeBlock(0xE000, 0xF8FF,
1211 "PRIVATE_USE_AREA",
1212 "Private Use Area");
1215 * CJK Compatibility Ideographs.
1216 * 0xF900 - 0xFAFF.
1218 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
1219 = new UnicodeBlock(0xF900, 0xFAFF,
1220 "CJK_COMPATIBILITY_IDEOGRAPHS",
1221 "CJK Compatibility Ideographs");
1224 * Alphabetic Presentation Forms.
1225 * 0xFB00 - 0xFB4F.
1227 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
1228 = new UnicodeBlock(0xFB00, 0xFB4F,
1229 "ALPHABETIC_PRESENTATION_FORMS",
1230 "Alphabetic Presentation Forms");
1233 * Arabic Presentation Forms-A.
1234 * 0xFB50 - 0xFDFF.
1236 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
1237 = new UnicodeBlock(0xFB50, 0xFDFF,
1238 "ARABIC_PRESENTATION_FORMS_A",
1239 "Arabic Presentation Forms-A");
1242 * Variation Selectors.
1243 * 0xFE00 - 0xFE0F.
1244 * @since 1.5
1246 public static final UnicodeBlock VARIATION_SELECTORS
1247 = new UnicodeBlock(0xFE00, 0xFE0F,
1248 "VARIATION_SELECTORS",
1249 "Variation Selectors");
1252 * Combining Half Marks.
1253 * 0xFE20 - 0xFE2F.
1255 public static final UnicodeBlock COMBINING_HALF_MARKS
1256 = new UnicodeBlock(0xFE20, 0xFE2F,
1257 "COMBINING_HALF_MARKS",
1258 "Combining Half Marks");
1261 * CJK Compatibility Forms.
1262 * 0xFE30 - 0xFE4F.
1264 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
1265 = new UnicodeBlock(0xFE30, 0xFE4F,
1266 "CJK_COMPATIBILITY_FORMS",
1267 "CJK Compatibility Forms");
1270 * Small Form Variants.
1271 * 0xFE50 - 0xFE6F.
1273 public static final UnicodeBlock SMALL_FORM_VARIANTS
1274 = new UnicodeBlock(0xFE50, 0xFE6F,
1275 "SMALL_FORM_VARIANTS",
1276 "Small Form Variants");
1279 * Arabic Presentation Forms-B.
1280 * 0xFE70 - 0xFEFF.
1282 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
1283 = new UnicodeBlock(0xFE70, 0xFEFF,
1284 "ARABIC_PRESENTATION_FORMS_B",
1285 "Arabic Presentation Forms-B");
1288 * Halfwidth and Fullwidth Forms.
1289 * 0xFF00 - 0xFFEF.
1291 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
1292 = new UnicodeBlock(0xFF00, 0xFFEF,
1293 "HALFWIDTH_AND_FULLWIDTH_FORMS",
1294 "Halfwidth and Fullwidth Forms");
1297 * Specials.
1298 * 0xFFF0 - 0xFFFF.
1300 public static final UnicodeBlock SPECIALS
1301 = new UnicodeBlock(0xFFF0, 0xFFFF,
1302 "SPECIALS",
1303 "Specials");
1306 * Linear B Syllabary.
1307 * 0x10000 - 0x1007F.
1308 * @since 1.5
1310 public static final UnicodeBlock LINEAR_B_SYLLABARY
1311 = new UnicodeBlock(0x10000, 0x1007F,
1312 "LINEAR_B_SYLLABARY",
1313 "Linear B Syllabary");
1316 * Linear B Ideograms.
1317 * 0x10080 - 0x100FF.
1318 * @since 1.5
1320 public static final UnicodeBlock LINEAR_B_IDEOGRAMS
1321 = new UnicodeBlock(0x10080, 0x100FF,
1322 "LINEAR_B_IDEOGRAMS",
1323 "Linear B Ideograms");
1326 * Aegean Numbers.
1327 * 0x10100 - 0x1013F.
1328 * @since 1.5
1330 public static final UnicodeBlock AEGEAN_NUMBERS
1331 = new UnicodeBlock(0x10100, 0x1013F,
1332 "AEGEAN_NUMBERS",
1333 "Aegean Numbers");
1336 * Old Italic.
1337 * 0x10300 - 0x1032F.
1338 * @since 1.5
1340 public static final UnicodeBlock OLD_ITALIC
1341 = new UnicodeBlock(0x10300, 0x1032F,
1342 "OLD_ITALIC",
1343 "Old Italic");
1346 * Gothic.
1347 * 0x10330 - 0x1034F.
1348 * @since 1.5
1350 public static final UnicodeBlock GOTHIC
1351 = new UnicodeBlock(0x10330, 0x1034F,
1352 "GOTHIC",
1353 "Gothic");
1356 * Ugaritic.
1357 * 0x10380 - 0x1039F.
1358 * @since 1.5
1360 public static final UnicodeBlock UGARITIC
1361 = new UnicodeBlock(0x10380, 0x1039F,
1362 "UGARITIC",
1363 "Ugaritic");
1366 * Deseret.
1367 * 0x10400 - 0x1044F.
1368 * @since 1.5
1370 public static final UnicodeBlock DESERET
1371 = new UnicodeBlock(0x10400, 0x1044F,
1372 "DESERET",
1373 "Deseret");
1376 * Shavian.
1377 * 0x10450 - 0x1047F.
1378 * @since 1.5
1380 public static final UnicodeBlock SHAVIAN
1381 = new UnicodeBlock(0x10450, 0x1047F,
1382 "SHAVIAN",
1383 "Shavian");
1386 * Osmanya.
1387 * 0x10480 - 0x104AF.
1388 * @since 1.5
1390 public static final UnicodeBlock OSMANYA
1391 = new UnicodeBlock(0x10480, 0x104AF,
1392 "OSMANYA",
1393 "Osmanya");
1396 * Cypriot Syllabary.
1397 * 0x10800 - 0x1083F.
1398 * @since 1.5
1400 public static final UnicodeBlock CYPRIOT_SYLLABARY
1401 = new UnicodeBlock(0x10800, 0x1083F,
1402 "CYPRIOT_SYLLABARY",
1403 "Cypriot Syllabary");
1406 * Byzantine Musical Symbols.
1407 * 0x1D000 - 0x1D0FF.
1408 * @since 1.5
1410 public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
1411 = new UnicodeBlock(0x1D000, 0x1D0FF,
1412 "BYZANTINE_MUSICAL_SYMBOLS",
1413 "Byzantine Musical Symbols");
1416 * Musical Symbols.
1417 * 0x1D100 - 0x1D1FF.
1418 * @since 1.5
1420 public static final UnicodeBlock MUSICAL_SYMBOLS
1421 = new UnicodeBlock(0x1D100, 0x1D1FF,
1422 "MUSICAL_SYMBOLS",
1423 "Musical Symbols");
1426 * Tai Xuan Jing Symbols.
1427 * 0x1D300 - 0x1D35F.
1428 * @since 1.5
1430 public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
1431 = new UnicodeBlock(0x1D300, 0x1D35F,
1432 "TAI_XUAN_JING_SYMBOLS",
1433 "Tai Xuan Jing Symbols");
1436 * Mathematical Alphanumeric Symbols.
1437 * 0x1D400 - 0x1D7FF.
1438 * @since 1.5
1440 public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
1441 = new UnicodeBlock(0x1D400, 0x1D7FF,
1442 "MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
1443 "Mathematical Alphanumeric Symbols");
1446 * CJK Unified Ideographs Extension B.
1447 * 0x20000 - 0x2A6DF.
1448 * @since 1.5
1450 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1451 = new UnicodeBlock(0x20000, 0x2A6DF,
1452 "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
1453 "CJK Unified Ideographs Extension B");
1456 * CJK Compatibility Ideographs Supplement.
1457 * 0x2F800 - 0x2FA1F.
1458 * @since 1.5
1460 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
1461 = new UnicodeBlock(0x2F800, 0x2FA1F,
1462 "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
1463 "CJK Compatibility Ideographs Supplement");
1466 * Tags.
1467 * 0xE0000 - 0xE007F.
1468 * @since 1.5
1470 public static final UnicodeBlock TAGS
1471 = new UnicodeBlock(0xE0000, 0xE007F,
1472 "TAGS",
1473 "Tags");
1476 * Variation Selectors Supplement.
1477 * 0xE0100 - 0xE01EF.
1478 * @since 1.5
1480 public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
1481 = new UnicodeBlock(0xE0100, 0xE01EF,
1482 "VARIATION_SELECTORS_SUPPLEMENT",
1483 "Variation Selectors Supplement");
1486 * Supplementary Private Use Area-A.
1487 * 0xF0000 - 0xFFFFF.
1488 * @since 1.5
1490 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
1491 = new UnicodeBlock(0xF0000, 0xFFFFF,
1492 "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
1493 "Supplementary Private Use Area-A");
1496 * Supplementary Private Use Area-B.
1497 * 0x100000 - 0x10FFFF.
1498 * @since 1.5
1500 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
1501 = new UnicodeBlock(0x100000, 0x10FFFF,
1502 "SUPPLEMENTARY_PRIVATE_USE_AREA_B",
1503 "Supplementary Private Use Area-B");
1506 * Surrogates Area.
1507 * 'D800' - 'DFFF'.
1508 * @deprecated As of 1.5, the three areas,
1509 * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>,
1510 * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a>
1511 * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined
1512 * by the Unicode standard, should be used in preference to
1513 * this. These are also returned from calls to <code>of(int)</code>
1514 * and <code>of(char)</code>.
1516 public static final UnicodeBlock SURROGATES_AREA
1517 = new UnicodeBlock(0xD800, 0xDFFF,
1518 "SURROGATES_AREA",
1519 "Surrogates Area");
1522 * The defined subsets.
1524 private static final UnicodeBlock sets[] = {
1525 BASIC_LATIN,
1526 LATIN_1_SUPPLEMENT,
1527 LATIN_EXTENDED_A,
1528 LATIN_EXTENDED_B,
1529 IPA_EXTENSIONS,
1530 SPACING_MODIFIER_LETTERS,
1531 COMBINING_DIACRITICAL_MARKS,
1532 GREEK,
1533 CYRILLIC,
1534 CYRILLIC_SUPPLEMENTARY,
1535 ARMENIAN,
1536 HEBREW,
1537 ARABIC,
1538 SYRIAC,
1539 THAANA,
1540 DEVANAGARI,
1541 BENGALI,
1542 GURMUKHI,
1543 GUJARATI,
1544 ORIYA,
1545 TAMIL,
1546 TELUGU,
1547 KANNADA,
1548 MALAYALAM,
1549 SINHALA,
1550 THAI,
1551 LAO,
1552 TIBETAN,
1553 MYANMAR,
1554 GEORGIAN,
1555 HANGUL_JAMO,
1556 ETHIOPIC,
1557 CHEROKEE,
1558 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
1559 OGHAM,
1560 RUNIC,
1561 TAGALOG,
1562 HANUNOO,
1563 BUHID,
1564 TAGBANWA,
1565 KHMER,
1566 MONGOLIAN,
1567 LIMBU,
1568 TAI_LE,
1569 KHMER_SYMBOLS,
1570 PHONETIC_EXTENSIONS,
1571 LATIN_EXTENDED_ADDITIONAL,
1572 GREEK_EXTENDED,
1573 GENERAL_PUNCTUATION,
1574 SUPERSCRIPTS_AND_SUBSCRIPTS,
1575 CURRENCY_SYMBOLS,
1576 COMBINING_MARKS_FOR_SYMBOLS,
1577 LETTERLIKE_SYMBOLS,
1578 NUMBER_FORMS,
1579 ARROWS,
1580 MATHEMATICAL_OPERATORS,
1581 MISCELLANEOUS_TECHNICAL,
1582 CONTROL_PICTURES,
1583 OPTICAL_CHARACTER_RECOGNITION,
1584 ENCLOSED_ALPHANUMERICS,
1585 BOX_DRAWING,
1586 BLOCK_ELEMENTS,
1587 GEOMETRIC_SHAPES,
1588 MISCELLANEOUS_SYMBOLS,
1589 DINGBATS,
1590 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
1591 SUPPLEMENTAL_ARROWS_A,
1592 BRAILLE_PATTERNS,
1593 SUPPLEMENTAL_ARROWS_B,
1594 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
1595 SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
1596 MISCELLANEOUS_SYMBOLS_AND_ARROWS,
1597 CJK_RADICALS_SUPPLEMENT,
1598 KANGXI_RADICALS,
1599 IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
1600 CJK_SYMBOLS_AND_PUNCTUATION,
1601 HIRAGANA,
1602 KATAKANA,
1603 BOPOMOFO,
1604 HANGUL_COMPATIBILITY_JAMO,
1605 KANBUN,
1606 BOPOMOFO_EXTENDED,
1607 KATAKANA_PHONETIC_EXTENSIONS,
1608 ENCLOSED_CJK_LETTERS_AND_MONTHS,
1609 CJK_COMPATIBILITY,
1610 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
1611 YIJING_HEXAGRAM_SYMBOLS,
1612 CJK_UNIFIED_IDEOGRAPHS,
1613 YI_SYLLABLES,
1614 YI_RADICALS,
1615 HANGUL_SYLLABLES,
1616 HIGH_SURROGATES,
1617 HIGH_PRIVATE_USE_SURROGATES,
1618 LOW_SURROGATES,
1619 PRIVATE_USE_AREA,
1620 CJK_COMPATIBILITY_IDEOGRAPHS,
1621 ALPHABETIC_PRESENTATION_FORMS,
1622 ARABIC_PRESENTATION_FORMS_A,
1623 VARIATION_SELECTORS,
1624 COMBINING_HALF_MARKS,
1625 CJK_COMPATIBILITY_FORMS,
1626 SMALL_FORM_VARIANTS,
1627 ARABIC_PRESENTATION_FORMS_B,
1628 HALFWIDTH_AND_FULLWIDTH_FORMS,
1629 SPECIALS,
1630 LINEAR_B_SYLLABARY,
1631 LINEAR_B_IDEOGRAMS,
1632 AEGEAN_NUMBERS,
1633 OLD_ITALIC,
1634 GOTHIC,
1635 UGARITIC,
1636 DESERET,
1637 SHAVIAN,
1638 OSMANYA,
1639 CYPRIOT_SYLLABARY,
1640 BYZANTINE_MUSICAL_SYMBOLS,
1641 MUSICAL_SYMBOLS,
1642 TAI_XUAN_JING_SYMBOLS,
1643 MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
1644 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
1645 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
1646 TAGS,
1647 VARIATION_SELECTORS_SUPPLEMENT,
1648 SUPPLEMENTARY_PRIVATE_USE_AREA_A,
1649 SUPPLEMENTARY_PRIVATE_USE_AREA_B,
1651 } // class UnicodeBlock
1654 * A class to encompass all the properties of characters in the
1655 * private use blocks in the Unicode standard. This class extends
1656 * UnassignedCharacters because the return type from getType() is
1657 * different.
1658 * @author Anthony Balkissoon abalkiss at redhat dot com
1661 private static class PrivateUseCharacters extends UnassignedCharacters
1664 * Returns the type of the character cp.
1666 static int getType(int cp)
1668 // The upper 2 code points in any plane are considered unassigned,
1669 // even in the private-use planes.
1670 if ((cp & 0xffff) >= 0xfffe)
1671 return UnassignedCharacters.getType(cp);
1672 return PRIVATE_USE;
1676 * Returns true if the character cp is defined.
1678 static boolean isDefined(int cp)
1680 // The upper 2 code points in any plane are considered unassigned,
1681 // even in the private-use planes.
1682 if ((cp & 0xffff) >= 0xfffe)
1683 return UnassignedCharacters.isDefined(cp);
1684 return true;
1688 * Gets the directionality for the character cp.
1690 static byte getDirectionality(int cp)
1692 if ((cp & 0xffff) >= 0xfffe)
1693 return UnassignedCharacters.getDirectionality(cp);
1694 return DIRECTIONALITY_LEFT_TO_RIGHT;
1699 * A class to encompass all the properties of code points that are
1700 * currently undefined in the Unicode standard.
1701 * @author Anthony Balkissoon abalkiss at redhat dot com
1704 private static class UnassignedCharacters
1707 * Returns the numeric value for the unassigned characters.
1708 * @param cp the character
1709 * @param radix the radix (not used)
1710 * @return the numeric value of this character in this radix
1712 static int digit(int cp, int radix)
1714 return -1;
1718 * Returns the Unicode directionality property for unassigned
1719 * characters.
1720 * @param cp the character
1721 * @return DIRECTIONALITY_UNDEFINED
1723 static byte getDirectionality(int cp)
1725 return DIRECTIONALITY_UNDEFINED;
1729 * Returns -1, the numeric value for unassigned Unicode characters.
1730 * @param cp the character
1731 * @return -1
1733 static int getNumericValue(int cp)
1735 return -1;
1739 * Returns UNASSIGNED, the type of unassigned Unicode characters.
1740 * @param cp the character
1741 * @return UNASSIGNED
1743 static int getType(int cp)
1745 return UNASSIGNED;
1749 * Returns false to indicate that the character is not defined in the
1750 * Unicode standard.
1751 * @param cp the character
1752 * @return false
1754 static boolean isDefined(int cp)
1756 return false;
1760 * Returns false to indicate that the character is not a digit.
1761 * @param cp the character
1762 * @return false
1764 static boolean isDigit(int cp)
1766 return false;
1770 * Returns false to indicate that the character cannot be ignored
1771 * within an identifier.
1772 * @param cp the character
1773 * @return false
1775 static boolean isIdentifierIgnorable(int cp)
1777 return false;
1781 * Returns false to indicate that the character cannot be part of a
1782 * Java identifier.
1783 * @param cp the character
1784 * @return false
1786 static boolean isJavaIdentifierPart(int cp)
1788 return false;
1792 * Returns false to indicate that the character cannot start a
1793 * Java identifier.
1794 * @param cp the character
1795 * @return false
1797 static boolean isJavaIdentiferStart(int cp)
1799 return false;
1803 * Returns false to indicate that the character is not a letter.
1804 * @param cp the character
1805 * @return false
1807 static boolean isLetter(int cp)
1809 return false;
1813 * Returns false to indicate that the character is neither a letter
1814 * nor a digit.
1815 * @param cp the character
1816 * @return false
1818 static boolean isLetterOrDigit(int cp)
1820 return false;
1824 * Returns false to indicate that the character is not a lowercase letter.
1825 * @param cp the character
1826 * @return false
1828 static boolean isLowerCase(int cp)
1830 return false;
1834 * Returns false to indicate that the character is not mirrored.
1835 * @param cp the character
1836 * @return false
1838 static boolean isMirrored(int cp)
1840 return false;
1844 * Returns false to indicate that the character is not a space character.
1845 * @param cp the character
1846 * @return false
1848 static boolean isSpaceChar(int cp)
1850 return false;
1854 * Returns false to indicate that the character is not a titlecase letter.
1855 * @param cp the character
1856 * @return false
1858 static boolean isTitleCase(int cp)
1860 return false;
1864 * Returns false to indicate that the character cannot be part of a
1865 * Unicode identifier.
1866 * @param cp the character
1867 * @return false
1869 static boolean isUnicodeIdentifierPart(int cp)
1871 return false;
1875 * Returns false to indicate that the character cannot start a
1876 * Unicode identifier.
1877 * @param cp the character
1878 * @return false
1880 static boolean isUnicodeIdentifierStart(int cp)
1882 return false;
1886 * Returns false to indicate that the character is not an uppercase letter.
1887 * @param cp the character
1888 * @return false
1890 static boolean isUpperCase(int cp)
1892 return false;
1896 * Returns false to indicate that the character is not a whitespace
1897 * character.
1898 * @param cp the character
1899 * @return false
1901 static boolean isWhiteSpace(int cp)
1903 return false;
1907 * Returns cp to indicate this character has no lowercase conversion.
1908 * @param cp the character
1909 * @return cp
1911 static int toLowerCase(int cp)
1913 return cp;
1917 * Returns cp to indicate this character has no titlecase conversion.
1918 * @param cp the character
1919 * @return cp
1921 static int toTitleCase(int cp)
1923 return cp;
1927 * Returns cp to indicate this character has no uppercase conversion.
1928 * @param cp the character
1929 * @return cp
1931 static int toUpperCase(int cp)
1933 return cp;
1938 * The immutable value of this Character.
1940 * @serial the value of this Character
1942 private final char value;
1945 * Compatible with JDK 1.0+.
1947 private static final long serialVersionUID = 3786198910865385080L;
1950 * Smallest value allowed for radix arguments in Java. This value is 2.
1952 * @see #digit(char, int)
1953 * @see #forDigit(int, int)
1954 * @see Integer#toString(int, int)
1955 * @see Integer#valueOf(String)
1957 public static final int MIN_RADIX = 2;
1960 * Largest value allowed for radix arguments in Java. This value is 36.
1962 * @see #digit(char, int)
1963 * @see #forDigit(int, int)
1964 * @see Integer#toString(int, int)
1965 * @see Integer#valueOf(String)
1967 public static final int MAX_RADIX = 36;
1970 * The minimum value the char data type can hold.
1971 * This value is <code>'\\u0000'</code>.
1973 public static final char MIN_VALUE = '\u0000';
1976 * The maximum value the char data type can hold.
1977 * This value is <code>'\\uFFFF'</code>.
1979 public static final char MAX_VALUE = '\uFFFF';
1982 * Class object representing the primitive char data type.
1984 * @since 1.1
1986 public static final Class TYPE = VMClassLoader.getPrimitiveClass('C');
1989 * The number of bits needed to represent a <code>char</code>.
1990 * @since 1.5
1992 public static final int SIZE = 16;
1994 // This caches some Character values, and is used by boxing
1995 // conversions via valueOf(). We must cache at least 0..127;
1996 // this constant controls how much we actually cache.
1997 private static final int MAX_CACHE = 127;
1998 private static Character[] charCache = new Character[MAX_CACHE + 1];
2001 * Lu = Letter, Uppercase (Informative).
2003 * @since 1.1
2005 public static final byte UPPERCASE_LETTER = 1;
2008 * Ll = Letter, Lowercase (Informative).
2010 * @since 1.1
2012 public static final byte LOWERCASE_LETTER = 2;
2015 * Lt = Letter, Titlecase (Informative).
2017 * @since 1.1
2019 public static final byte TITLECASE_LETTER = 3;
2022 * Mn = Mark, Non-Spacing (Normative).
2024 * @since 1.1
2026 public static final byte NON_SPACING_MARK = 6;
2029 * Mc = Mark, Spacing Combining (Normative).
2031 * @since 1.1
2033 public static final byte COMBINING_SPACING_MARK = 8;
2036 * Me = Mark, Enclosing (Normative).
2038 * @since 1.1
2040 public static final byte ENCLOSING_MARK = 7;
2043 * Nd = Number, Decimal Digit (Normative).
2045 * @since 1.1
2047 public static final byte DECIMAL_DIGIT_NUMBER = 9;
2050 * Nl = Number, Letter (Normative).
2052 * @since 1.1
2054 public static final byte LETTER_NUMBER = 10;
2057 * No = Number, Other (Normative).
2059 * @since 1.1
2061 public static final byte OTHER_NUMBER = 11;
2064 * Zs = Separator, Space (Normative).
2066 * @since 1.1
2068 public static final byte SPACE_SEPARATOR = 12;
2071 * Zl = Separator, Line (Normative).
2073 * @since 1.1
2075 public static final byte LINE_SEPARATOR = 13;
2078 * Zp = Separator, Paragraph (Normative).
2080 * @since 1.1
2082 public static final byte PARAGRAPH_SEPARATOR = 14;
2085 * Cc = Other, Control (Normative).
2087 * @since 1.1
2089 public static final byte CONTROL = 15;
2092 * Cf = Other, Format (Normative).
2094 * @since 1.1
2096 public static final byte FORMAT = 16;
2099 * Cs = Other, Surrogate (Normative).
2101 * @since 1.1
2103 public static final byte SURROGATE = 19;
2106 * Co = Other, Private Use (Normative).
2108 * @since 1.1
2110 public static final byte PRIVATE_USE = 18;
2113 * Cn = Other, Not Assigned (Normative).
2115 * @since 1.1
2117 public static final byte UNASSIGNED = 0;
2120 * Lm = Letter, Modifier (Informative).
2122 * @since 1.1
2124 public static final byte MODIFIER_LETTER = 4;
2127 * Lo = Letter, Other (Informative).
2129 * @since 1.1
2131 public static final byte OTHER_LETTER = 5;
2134 * Pc = Punctuation, Connector (Informative).
2136 * @since 1.1
2138 public static final byte CONNECTOR_PUNCTUATION = 23;
2141 * Pd = Punctuation, Dash (Informative).
2143 * @since 1.1
2145 public static final byte DASH_PUNCTUATION = 20;
2148 * Ps = Punctuation, Open (Informative).
2150 * @since 1.1
2152 public static final byte START_PUNCTUATION = 21;
2155 * Pe = Punctuation, Close (Informative).
2157 * @since 1.1
2159 public static final byte END_PUNCTUATION = 22;
2162 * Pi = Punctuation, Initial Quote (Informative).
2164 * @since 1.4
2166 public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
2169 * Pf = Punctuation, Final Quote (Informative).
2171 * @since 1.4
2173 public static final byte FINAL_QUOTE_PUNCTUATION = 30;
2176 * Po = Punctuation, Other (Informative).
2178 * @since 1.1
2180 public static final byte OTHER_PUNCTUATION = 24;
2183 * Sm = Symbol, Math (Informative).
2185 * @since 1.1
2187 public static final byte MATH_SYMBOL = 25;
2190 * Sc = Symbol, Currency (Informative).
2192 * @since 1.1
2194 public static final byte CURRENCY_SYMBOL = 26;
2197 * Sk = Symbol, Modifier (Informative).
2199 * @since 1.1
2201 public static final byte MODIFIER_SYMBOL = 27;
2204 * So = Symbol, Other (Informative).
2206 * @since 1.1
2208 public static final byte OTHER_SYMBOL = 28;
2211 * Undefined bidirectional character type. Undefined char values have
2212 * undefined directionality in the Unicode specification.
2214 * @since 1.4
2216 public static final byte DIRECTIONALITY_UNDEFINED = -1;
2219 * Strong bidirectional character type "L".
2221 * @since 1.4
2223 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
2226 * Strong bidirectional character type "R".
2228 * @since 1.4
2230 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
2233 * Strong bidirectional character type "AL".
2235 * @since 1.4
2237 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
2240 * Weak bidirectional character type "EN".
2242 * @since 1.4
2244 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
2247 * Weak bidirectional character type "ES".
2249 * @since 1.4
2251 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
2254 * Weak bidirectional character type "ET".
2256 * @since 1.4
2258 public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
2261 * Weak bidirectional character type "AN".
2263 * @since 1.4
2265 public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
2268 * Weak bidirectional character type "CS".
2270 * @since 1.4
2272 public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
2275 * Weak bidirectional character type "NSM".
2277 * @since 1.4
2279 public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
2282 * Weak bidirectional character type "BN".
2284 * @since 1.4
2286 public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
2289 * Neutral bidirectional character type "B".
2291 * @since 1.4
2293 public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
2296 * Neutral bidirectional character type "S".
2298 * @since 1.4
2300 public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
2303 * Neutral bidirectional character type "WS".
2305 * @since 1.4
2307 public static final byte DIRECTIONALITY_WHITESPACE = 12;
2310 * Neutral bidirectional character type "ON".
2312 * @since 1.4
2314 public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
2317 * Strong bidirectional character type "LRE".
2319 * @since 1.4
2321 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
2324 * Strong bidirectional character type "LRO".
2326 * @since 1.4
2328 public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
2331 * Strong bidirectional character type "RLE".
2333 * @since 1.4
2335 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
2338 * Strong bidirectional character type "RLO".
2340 * @since 1.4
2342 public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
2345 * Weak bidirectional character type "PDF".
2347 * @since 1.4
2349 public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
2352 * Stores unicode block offset lookup table. Exploit package visibility of
2353 * String.value to avoid copying the array.
2354 * @see #readCodePoint(int)
2355 * @see CharData#BLOCKS
2357 private static final char[][] blocks =
2358 new char[][]{
2359 String.zeroBasedStringValue(CharData.BLOCKS[0]),
2360 String.zeroBasedStringValue(CharData.BLOCKS[1]),
2361 String.zeroBasedStringValue(CharData.BLOCKS[2]),
2362 String.zeroBasedStringValue(CharData.BLOCKS[3]),
2363 String.zeroBasedStringValue(CharData.BLOCKS[4]),
2364 String.zeroBasedStringValue(CharData.BLOCKS[5]),
2365 String.zeroBasedStringValue(CharData.BLOCKS[6]),
2366 String.zeroBasedStringValue(CharData.BLOCKS[7]),
2367 String.zeroBasedStringValue(CharData.BLOCKS[8]),
2368 String.zeroBasedStringValue(CharData.BLOCKS[9]),
2369 String.zeroBasedStringValue(CharData.BLOCKS[10]),
2370 String.zeroBasedStringValue(CharData.BLOCKS[11]),
2371 String.zeroBasedStringValue(CharData.BLOCKS[12]),
2372 String.zeroBasedStringValue(CharData.BLOCKS[13]),
2373 String.zeroBasedStringValue(CharData.BLOCKS[14]),
2374 String.zeroBasedStringValue(CharData.BLOCKS[15]),
2375 String.zeroBasedStringValue(CharData.BLOCKS[16])};
2378 * Stores unicode attribute offset lookup table. Exploit package visibility
2379 * of String.value to avoid copying the array.
2380 * @see CharData#DATA
2382 private static final char[][] data =
2383 new char[][]{
2384 String.zeroBasedStringValue(CharData.DATA[0]),
2385 String.zeroBasedStringValue(CharData.DATA[1]),
2386 String.zeroBasedStringValue(CharData.DATA[2]),
2387 String.zeroBasedStringValue(CharData.DATA[3]),
2388 String.zeroBasedStringValue(CharData.DATA[4]),
2389 String.zeroBasedStringValue(CharData.DATA[5]),
2390 String.zeroBasedStringValue(CharData.DATA[6]),
2391 String.zeroBasedStringValue(CharData.DATA[7]),
2392 String.zeroBasedStringValue(CharData.DATA[8]),
2393 String.zeroBasedStringValue(CharData.DATA[9]),
2394 String.zeroBasedStringValue(CharData.DATA[10]),
2395 String.zeroBasedStringValue(CharData.DATA[11]),
2396 String.zeroBasedStringValue(CharData.DATA[12]),
2397 String.zeroBasedStringValue(CharData.DATA[13]),
2398 String.zeroBasedStringValue(CharData.DATA[14]),
2399 String.zeroBasedStringValue(CharData.DATA[15]),
2400 String.zeroBasedStringValue(CharData.DATA[16])};
2403 * Stores unicode numeric value attribute table. Exploit package visibility
2404 * of String.value to avoid copying the array.
2405 * @see CharData#NUM_VALUE
2407 private static final char[][] numValue =
2408 new char[][]{
2409 String.zeroBasedStringValue(CharData.NUM_VALUE[0]),
2410 String.zeroBasedStringValue(CharData.NUM_VALUE[1]),
2411 String.zeroBasedStringValue(CharData.NUM_VALUE[2]),
2412 String.zeroBasedStringValue(CharData.NUM_VALUE[3]),
2413 String.zeroBasedStringValue(CharData.NUM_VALUE[4]),
2414 String.zeroBasedStringValue(CharData.NUM_VALUE[5]),
2415 String.zeroBasedStringValue(CharData.NUM_VALUE[6]),
2416 String.zeroBasedStringValue(CharData.NUM_VALUE[7]),
2417 String.zeroBasedStringValue(CharData.NUM_VALUE[8]),
2418 String.zeroBasedStringValue(CharData.NUM_VALUE[9]),
2419 String.zeroBasedStringValue(CharData.NUM_VALUE[10]),
2420 String.zeroBasedStringValue(CharData.NUM_VALUE[11]),
2421 String.zeroBasedStringValue(CharData.NUM_VALUE[12]),
2422 String.zeroBasedStringValue(CharData.NUM_VALUE[13]),
2423 String.zeroBasedStringValue(CharData.NUM_VALUE[14]),
2424 String.zeroBasedStringValue(CharData.NUM_VALUE[15]),
2425 String.zeroBasedStringValue(CharData.NUM_VALUE[16])};
2428 * Stores unicode uppercase attribute table. Exploit package visibility
2429 * of String.value to avoid copying the array.
2430 * @see CharData#UPPER
2432 private static final char[][] upper =
2433 new char[][]{
2434 String.zeroBasedStringValue(CharData.UPPER[0]),
2435 String.zeroBasedStringValue(CharData.UPPER[1]),
2436 String.zeroBasedStringValue(CharData.UPPER[2]),
2437 String.zeroBasedStringValue(CharData.UPPER[3]),
2438 String.zeroBasedStringValue(CharData.UPPER[4]),
2439 String.zeroBasedStringValue(CharData.UPPER[5]),
2440 String.zeroBasedStringValue(CharData.UPPER[6]),
2441 String.zeroBasedStringValue(CharData.UPPER[7]),
2442 String.zeroBasedStringValue(CharData.UPPER[8]),
2443 String.zeroBasedStringValue(CharData.UPPER[9]),
2444 String.zeroBasedStringValue(CharData.UPPER[10]),
2445 String.zeroBasedStringValue(CharData.UPPER[11]),
2446 String.zeroBasedStringValue(CharData.UPPER[12]),
2447 String.zeroBasedStringValue(CharData.UPPER[13]),
2448 String.zeroBasedStringValue(CharData.UPPER[14]),
2449 String.zeroBasedStringValue(CharData.UPPER[15]),
2450 String.zeroBasedStringValue(CharData.UPPER[16])};
2453 * Stores unicode lowercase attribute table. Exploit package visibility
2454 * of String.value to avoid copying the array.
2455 * @see CharData#LOWER
2457 private static final char[][] lower =
2458 new char[][]{
2459 String.zeroBasedStringValue(CharData.LOWER[0]),
2460 String.zeroBasedStringValue(CharData.LOWER[1]),
2461 String.zeroBasedStringValue(CharData.LOWER[2]),
2462 String.zeroBasedStringValue(CharData.LOWER[3]),
2463 String.zeroBasedStringValue(CharData.LOWER[4]),
2464 String.zeroBasedStringValue(CharData.LOWER[5]),
2465 String.zeroBasedStringValue(CharData.LOWER[6]),
2466 String.zeroBasedStringValue(CharData.LOWER[7]),
2467 String.zeroBasedStringValue(CharData.LOWER[8]),
2468 String.zeroBasedStringValue(CharData.LOWER[9]),
2469 String.zeroBasedStringValue(CharData.LOWER[10]),
2470 String.zeroBasedStringValue(CharData.LOWER[11]),
2471 String.zeroBasedStringValue(CharData.LOWER[12]),
2472 String.zeroBasedStringValue(CharData.LOWER[13]),
2473 String.zeroBasedStringValue(CharData.LOWER[14]),
2474 String.zeroBasedStringValue(CharData.LOWER[15]),
2475 String.zeroBasedStringValue(CharData.LOWER[16])};
2478 * Stores unicode direction attribute table. Exploit package visibility
2479 * of String.value to avoid copying the array.
2480 * @see CharData#DIRECTION
2482 // Package visible for use by String.
2483 static final char[][] direction =
2484 new char[][]{
2485 String.zeroBasedStringValue(CharData.DIRECTION[0]),
2486 String.zeroBasedStringValue(CharData.DIRECTION[1]),
2487 String.zeroBasedStringValue(CharData.DIRECTION[2]),
2488 String.zeroBasedStringValue(CharData.DIRECTION[3]),
2489 String.zeroBasedStringValue(CharData.DIRECTION[4]),
2490 String.zeroBasedStringValue(CharData.DIRECTION[5]),
2491 String.zeroBasedStringValue(CharData.DIRECTION[6]),
2492 String.zeroBasedStringValue(CharData.DIRECTION[7]),
2493 String.zeroBasedStringValue(CharData.DIRECTION[8]),
2494 String.zeroBasedStringValue(CharData.DIRECTION[9]),
2495 String.zeroBasedStringValue(CharData.DIRECTION[10]),
2496 String.zeroBasedStringValue(CharData.DIRECTION[11]),
2497 String.zeroBasedStringValue(CharData.DIRECTION[12]),
2498 String.zeroBasedStringValue(CharData.DIRECTION[13]),
2499 String.zeroBasedStringValue(CharData.DIRECTION[14]),
2500 String.zeroBasedStringValue(CharData.DIRECTION[15]),
2501 String.zeroBasedStringValue(CharData.DIRECTION[16])};
2504 * Stores unicode titlecase table. Exploit package visibility of
2505 * String.value to avoid copying the array.
2506 * @see CharData#TITLE
2508 private static final char[] title = String.zeroBasedStringValue(CharData.TITLE);
2511 * Mask for grabbing the type out of the contents of data.
2512 * @see CharData#DATA
2514 private static final int TYPE_MASK = 0x1F;
2517 * Mask for grabbing the non-breaking space flag out of the contents of
2518 * data.
2519 * @see CharData#DATA
2521 private static final int NO_BREAK_MASK = 0x20;
2524 * Mask for grabbing the mirrored directionality flag out of the contents
2525 * of data.
2526 * @see CharData#DATA
2528 private static final int MIRROR_MASK = 0x40;
2531 * Min value for supplementary code point.
2533 * @since 1.5
2535 public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
2538 * Min value for code point.
2540 * @since 1.5
2542 public static final int MIN_CODE_POINT = 0;
2546 * Max value for code point.
2548 * @since 1.5
2550 public static final int MAX_CODE_POINT = 0x010ffff;
2554 * Minimum high surrogate code in UTF-16 encoding.
2556 * @since 1.5
2558 public static final char MIN_HIGH_SURROGATE = '\ud800';
2561 * Maximum high surrogate code in UTF-16 encoding.
2563 * @since 1.5
2565 public static final char MAX_HIGH_SURROGATE = '\udbff';
2568 * Minimum low surrogate code in UTF-16 encoding.
2570 * @since 1.5
2572 public static final char MIN_LOW_SURROGATE = '\udc00';
2575 * Maximum low surrogate code in UTF-16 encoding.
2577 * @since 1.5
2579 public static final char MAX_LOW_SURROGATE = '\udfff';
2582 * Minimum surrogate code in UTF-16 encoding.
2584 * @since 1.5
2586 public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
2589 * Maximum surrogate code in UTF-16 encoding.
2591 * @since 1.5
2593 public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
2596 * Grabs an attribute offset from the Unicode attribute database. The lower
2597 * 5 bits are the character type, the next 2 bits are flags, and the top
2598 * 9 bits are the offset into the attribute tables.
2600 * @param codePoint the character to look up
2601 * @return the character's attribute offset and type
2602 * @see #TYPE_MASK
2603 * @see #NO_BREAK_MASK
2604 * @see #MIRROR_MASK
2605 * @see CharData#DATA
2606 * @see CharData#SHIFT
2608 static char readCodePoint(int codePoint)
2610 int plane = codePoint >>> 16;
2611 char offset = (char) (codePoint & 0xffff);
2612 return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)];
2616 * Wraps up a character.
2618 * @param value the character to wrap
2620 public Character(char value)
2622 this.value = value;
2626 * Returns the character which has been wrapped by this class.
2628 * @return the character wrapped
2630 public char charValue()
2632 return value;
2636 * Returns the numerical value (unsigned) of the wrapped character.
2637 * Range of returned values: 0x0000-0xFFFF.
2639 * @return the value of the wrapped character
2641 public int hashCode()
2643 return value;
2647 * Determines if an object is equal to this object. This is only true for
2648 * another Character object wrapping the same value.
2650 * @param o object to compare
2651 * @return true if o is a Character with the same value
2653 public boolean equals(Object o)
2655 return o instanceof Character && value == ((Character) o).value;
2659 * Converts the wrapped character into a String.
2661 * @return a String containing one character -- the wrapped character
2662 * of this instance
2664 public String toString()
2666 // Package constructor avoids an array copy.
2667 return new String(new char[] { value }, 0, 1, true);
2671 * Returns a String of length 1 representing the specified character.
2673 * @param ch the character to convert
2674 * @return a String containing the character
2675 * @since 1.4
2677 public static String toString(char ch)
2679 // Package constructor avoids an array copy.
2680 return new String(new char[] { ch }, 0, 1, true);
2684 * Determines if a character is a Unicode lowercase letter. For example,
2685 * <code>'a'</code> is lowercase. Returns true if getType() returns
2686 * LOWERCASE_LETTER.
2687 * <br>
2688 * lowercase = [Ll]
2690 * @param ch character to test
2691 * @return true if ch is a Unicode lowercase letter, else false
2692 * @see #isUpperCase(char)
2693 * @see #isTitleCase(char)
2694 * @see #toLowerCase(char)
2695 * @see #getType(char)
2697 public static boolean isLowerCase(char ch)
2699 return isLowerCase((int)ch);
2703 * Determines if a character is a Unicode lowercase letter. For example,
2704 * <code>'a'</code> is lowercase. Returns true if getType() returns
2705 * LOWERCASE_LETTER.
2706 * <br>
2707 * lowercase = [Ll]
2709 * @param codePoint character to test
2710 * @return true if ch is a Unicode lowercase letter, else false
2711 * @see #isUpperCase(char)
2712 * @see #isTitleCase(char)
2713 * @see #toLowerCase(char)
2714 * @see #getType(char)
2716 * @since 1.5
2718 public static boolean isLowerCase(int codePoint)
2720 return getType(codePoint) == LOWERCASE_LETTER;
2724 * Determines if a character is a Unicode uppercase letter. For example,
2725 * <code>'A'</code> is uppercase. Returns true if getType() returns
2726 * UPPERCASE_LETTER.
2727 * <br>
2728 * uppercase = [Lu]
2730 * @param ch character to test
2731 * @return true if ch is a Unicode uppercase letter, else false
2732 * @see #isLowerCase(char)
2733 * @see #isTitleCase(char)
2734 * @see #toUpperCase(char)
2735 * @see #getType(char)
2737 public static boolean isUpperCase(char ch)
2739 return isUpperCase((int)ch);
2743 * Determines if a character is a Unicode uppercase letter. For example,
2744 * <code>'A'</code> is uppercase. Returns true if getType() returns
2745 * UPPERCASE_LETTER.
2746 * <br>
2747 * uppercase = [Lu]
2749 * @param codePoint character to test
2750 * @return true if ch is a Unicode uppercase letter, else false
2751 * @see #isLowerCase(char)
2752 * @see #isTitleCase(char)
2753 * @see #toUpperCase(char)
2754 * @see #getType(char)
2756 * @since 1.5
2758 public static boolean isUpperCase(int codePoint)
2760 return getType(codePoint) == UPPERCASE_LETTER;
2764 * Determines if a character is a Unicode titlecase letter. For example,
2765 * the character "Lj" (Latin capital L with small letter j) is titlecase.
2766 * True if getType() returns TITLECASE_LETTER.
2767 * <br>
2768 * titlecase = [Lt]
2770 * @param ch character to test
2771 * @return true if ch is a Unicode titlecase letter, else false
2772 * @see #isLowerCase(char)
2773 * @see #isUpperCase(char)
2774 * @see #toTitleCase(char)
2775 * @see #getType(char)
2777 public static boolean isTitleCase(char ch)
2779 return isTitleCase((int)ch);
2783 * Determines if a character is a Unicode titlecase letter. For example,
2784 * the character "Lj" (Latin capital L with small letter j) is titlecase.
2785 * True if getType() returns TITLECASE_LETTER.
2786 * <br>
2787 * titlecase = [Lt]
2789 * @param codePoint character to test
2790 * @return true if ch is a Unicode titlecase letter, else false
2791 * @see #isLowerCase(char)
2792 * @see #isUpperCase(char)
2793 * @see #toTitleCase(char)
2794 * @see #getType(char)
2796 * @since 1.5
2798 public static boolean isTitleCase(int codePoint)
2800 return getType(codePoint) == TITLECASE_LETTER;
2805 * Determines if a character is a Unicode decimal digit. For example,
2806 * <code>'0'</code> is a digit. A character is a Unicode digit if
2807 * getType() returns DECIMAL_DIGIT_NUMBER.
2808 * <br>
2809 * Unicode decimal digit = [Nd]
2811 * @param ch character to test
2812 * @return true if ch is a Unicode decimal digit, else false
2813 * @see #digit(char, int)
2814 * @see #forDigit(int, int)
2815 * @see #getType(char)
2817 public static boolean isDigit(char ch)
2819 return isDigit((int)ch);
2823 * Determines if a character is a Unicode decimal digit. For example,
2824 * <code>'0'</code> is a digit. A character is a Unicode digit if
2825 * getType() returns DECIMAL_DIGIT_NUMBER.
2826 * <br>
2827 * Unicode decimal digit = [Nd]
2829 * @param codePoint character to test
2830 * @return true if ch is a Unicode decimal digit, else false
2831 * @see #digit(char, int)
2832 * @see #forDigit(int, int)
2833 * @see #getType(char)
2835 * @since 1.5
2838 public static boolean isDigit(int codePoint)
2840 return getType(codePoint) == DECIMAL_DIGIT_NUMBER;
2844 * Determines if a character is part of the Unicode Standard. This is an
2845 * evolving standard, but covers every character in the data file.
2846 * <br>
2847 * defined = not [Cn]
2849 * @param ch character to test
2850 * @return true if ch is a Unicode character, else false
2851 * @see #isDigit(char)
2852 * @see #isLetter(char)
2853 * @see #isLetterOrDigit(char)
2854 * @see #isLowerCase(char)
2855 * @see #isTitleCase(char)
2856 * @see #isUpperCase(char)
2858 public static boolean isDefined(char ch)
2860 return isDefined((int)ch);
2864 * Determines if a character is part of the Unicode Standard. This is an
2865 * evolving standard, but covers every character in the data file.
2866 * <br>
2867 * defined = not [Cn]
2869 * @param codePoint character to test
2870 * @return true if ch is a Unicode character, else false
2871 * @see #isDigit(char)
2872 * @see #isLetter(char)
2873 * @see #isLetterOrDigit(char)
2874 * @see #isLowerCase(char)
2875 * @see #isTitleCase(char)
2876 * @see #isUpperCase(char)
2878 * @since 1.5
2880 public static boolean isDefined(int codePoint)
2882 return getType(codePoint) != UNASSIGNED;
2886 * Determines if a character is a Unicode letter. Not all letters have case,
2887 * so this may return true when isLowerCase and isUpperCase return false.
2888 * A character is a Unicode letter if getType() returns one of
2889 * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
2890 * or OTHER_LETTER.
2891 * <br>
2892 * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2894 * @param ch character to test
2895 * @return true if ch is a Unicode letter, else false
2896 * @see #isDigit(char)
2897 * @see #isJavaIdentifierStart(char)
2898 * @see #isJavaLetter(char)
2899 * @see #isJavaLetterOrDigit(char)
2900 * @see #isLetterOrDigit(char)
2901 * @see #isLowerCase(char)
2902 * @see #isTitleCase(char)
2903 * @see #isUnicodeIdentifierStart(char)
2904 * @see #isUpperCase(char)
2906 public static boolean isLetter(char ch)
2908 return isLetter((int)ch);
2912 * Determines if a character is a Unicode letter. Not all letters have case,
2913 * so this may return true when isLowerCase and isUpperCase return false.
2914 * A character is a Unicode letter if getType() returns one of
2915 * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
2916 * or OTHER_LETTER.
2917 * <br>
2918 * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
2920 * @param codePoint character to test
2921 * @return true if ch is a Unicode letter, else false
2922 * @see #isDigit(char)
2923 * @see #isJavaIdentifierStart(char)
2924 * @see #isJavaLetter(char)
2925 * @see #isJavaLetterOrDigit(char)
2926 * @see #isLetterOrDigit(char)
2927 * @see #isLowerCase(char)
2928 * @see #isTitleCase(char)
2929 * @see #isUnicodeIdentifierStart(char)
2930 * @see #isUpperCase(char)
2932 * @since 1.5
2934 public static boolean isLetter(int codePoint)
2936 return ((1 << getType(codePoint))
2937 & ((1 << UPPERCASE_LETTER)
2938 | (1 << LOWERCASE_LETTER)
2939 | (1 << TITLECASE_LETTER)
2940 | (1 << MODIFIER_LETTER)
2941 | (1 << OTHER_LETTER))) != 0;
2944 * Returns the index into the given CharSequence that is offset
2945 * <code>codePointOffset</code> code points from <code>index</code>.
2946 * @param seq the CharSequence
2947 * @param index the start position in the CharSequence
2948 * @param codePointOffset the number of code points offset from the start
2949 * position
2950 * @return the index into the CharSequence that is codePointOffset code
2951 * points offset from index
2953 * @throws NullPointerException if seq is null
2954 * @throws IndexOutOfBoundsException if index is negative or greater than the
2955 * length of the sequence.
2956 * @throws IndexOutOfBoundsException if codePointOffset is positive and the
2957 * subsequence from index to the end of seq has fewer than codePointOffset
2958 * code points
2959 * @throws IndexOutOfBoundsException if codePointOffset is negative and the
2960 * subsequence from the start of seq to index has fewer than
2961 * (-codePointOffset) code points
2962 * @since 1.5
2964 public static int offsetByCodePoints(CharSequence seq,
2965 int index,
2966 int codePointOffset)
2968 int len = seq.length();
2969 if (index < 0 || index > len)
2970 throw new IndexOutOfBoundsException();
2972 int numToGo = codePointOffset;
2973 int offset = index;
2974 int adjust = 1;
2975 if (numToGo >= 0)
2977 for (; numToGo > 0; offset++)
2979 numToGo--;
2980 if (Character.isHighSurrogate(seq.charAt(offset))
2981 && (offset + 1) < len
2982 && Character.isLowSurrogate(seq.charAt(offset + 1)))
2983 offset++;
2985 return offset;
2987 else
2989 numToGo *= -1;
2990 for (; numToGo > 0;)
2992 numToGo--;
2993 offset--;
2994 if (Character.isLowSurrogate(seq.charAt(offset))
2995 && (offset - 1) >= 0
2996 && Character.isHighSurrogate(seq.charAt(offset - 1)))
2997 offset--;
2999 return offset;
3004 * Returns the index into the given char subarray that is offset
3005 * <code>codePointOffset</code> code points from <code>index</code>.
3006 * @param a the char array
3007 * @param start the start index of the subarray
3008 * @param count the length of the subarray
3009 * @param index the index to be offset
3010 * @param codePointOffset the number of code points offset from <code>index
3011 * </code>
3012 * @return the index into the char array
3014 * @throws NullPointerException if a is null
3015 * @throws IndexOutOfBoundsException if start or count is negative or if
3016 * start + count is greater than the length of the array
3017 * @throws IndexOutOfBoundsException if index is less than start or larger
3018 * than start + count
3019 * @throws IndexOutOfBoundsException if codePointOffset is positive and the
3020 * subarray from index to start + count - 1 has fewer than codePointOffset
3021 * code points.
3022 * @throws IndexOutOfBoundsException if codePointOffset is negative and the
3023 * subarray from start to index - 1 has fewer than (-codePointOffset) code
3024 * points
3026 * @since 1.5
3028 public static int offsetByCodePoints(char[] a,
3029 int start,
3030 int count,
3031 int index,
3032 int codePointOffset)
3034 int len = a.length;
3035 int end = start + count;
3036 if (start < 0 || count < 0 || end > len || index < start || index > end)
3037 throw new IndexOutOfBoundsException();
3039 int numToGo = codePointOffset;
3040 int offset = index;
3041 int adjust = 1;
3042 if (numToGo >= 0)
3044 for (; numToGo > 0; offset++)
3046 numToGo--;
3047 if (Character.isHighSurrogate(a[offset])
3048 && (offset + 1) < len
3049 && Character.isLowSurrogate(a[offset + 1]))
3050 offset++;
3052 return offset;
3054 else
3056 numToGo *= -1;
3057 for (; numToGo > 0;)
3059 numToGo--;
3060 offset--;
3061 if (Character.isLowSurrogate(a[offset])
3062 && (offset - 1) >= 0
3063 && Character.isHighSurrogate(a[offset - 1]))
3064 offset--;
3065 if (offset < start)
3066 throw new IndexOutOfBoundsException();
3068 return offset;
3074 * Returns the number of Unicode code points in the specified range of the
3075 * given CharSequence. The first char in the range is at position
3076 * beginIndex and the last one is at position endIndex - 1. Paired
3077 * surrogates (supplementary characters are represented by a pair of chars -
3078 * one from the high surrogates and one from the low surrogates)
3079 * count as just one code point.
3080 * @param seq the CharSequence to inspect
3081 * @param beginIndex the beginning of the range
3082 * @param endIndex the end of the range
3083 * @return the number of Unicode code points in the given range of the
3084 * sequence
3085 * @throws NullPointerException if seq is null
3086 * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is
3087 * larger than the length of seq, or if beginIndex is greater than endIndex.
3088 * @since 1.5
3090 public static int codePointCount(CharSequence seq, int beginIndex,
3091 int endIndex)
3093 int len = seq.length();
3094 if (beginIndex < 0 || endIndex > len || beginIndex > endIndex)
3095 throw new IndexOutOfBoundsException();
3097 int count = 0;
3098 for (int i = beginIndex; i < endIndex; i++)
3100 count++;
3101 // If there is a pairing, count it only once.
3102 if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex
3103 && isLowSurrogate(seq.charAt(i + 1)))
3104 i ++;
3106 return count;
3110 * Returns the number of Unicode code points in the specified range of the
3111 * given char array. The first char in the range is at position
3112 * offset and the length of the range is count. Paired surrogates
3113 * (supplementary characters are represented by a pair of chars -
3114 * one from the high surrogates and one from the low surrogates)
3115 * count as just one code point.
3116 * @param a the char array to inspect
3117 * @param offset the beginning of the range
3118 * @param count the length of the range
3119 * @return the number of Unicode code points in the given range of the
3120 * array
3121 * @throws NullPointerException if a is null
3122 * @throws IndexOutOfBoundsException if offset or count is negative or if
3123 * offset + countendIndex is larger than the length of a.
3124 * @since 1.5
3126 public static int codePointCount(char[] a, int offset,
3127 int count)
3129 int len = a.length;
3130 int end = offset + count;
3131 if (offset < 0 || count < 0 || end > len)
3132 throw new IndexOutOfBoundsException();
3134 int counter = 0;
3135 for (int i = offset; i < end; i++)
3137 counter++;
3138 // If there is a pairing, count it only once.
3139 if (isHighSurrogate(a[i]) && (i + 1) < end
3140 && isLowSurrogate(a[i + 1]))
3141 i ++;
3143 return counter;
3147 * Determines if a character is a Unicode letter or a Unicode digit. This
3148 * is the combination of isLetter and isDigit.
3149 * <br>
3150 * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
3152 * @param ch character to test
3153 * @return true if ch is a Unicode letter or a Unicode digit, else false
3154 * @see #isDigit(char)
3155 * @see #isJavaIdentifierPart(char)
3156 * @see #isJavaLetter(char)
3157 * @see #isJavaLetterOrDigit(char)
3158 * @see #isLetter(char)
3159 * @see #isUnicodeIdentifierPart(char)
3161 public static boolean isLetterOrDigit(char ch)
3163 return isLetterOrDigit((int)ch);
3167 * Determines if a character is a Unicode letter or a Unicode digit. This
3168 * is the combination of isLetter and isDigit.
3169 * <br>
3170 * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
3172 * @param codePoint character to test
3173 * @return true if ch is a Unicode letter or a Unicode digit, else false
3174 * @see #isDigit(char)
3175 * @see #isJavaIdentifierPart(char)
3176 * @see #isJavaLetter(char)
3177 * @see #isJavaLetterOrDigit(char)
3178 * @see #isLetter(char)
3179 * @see #isUnicodeIdentifierPart(char)
3181 * @since 1.5
3183 public static boolean isLetterOrDigit(int codePoint)
3185 return ((1 << getType(codePoint))
3186 & ((1 << UPPERCASE_LETTER)
3187 | (1 << LOWERCASE_LETTER)
3188 | (1 << TITLECASE_LETTER)
3189 | (1 << MODIFIER_LETTER)
3190 | (1 << OTHER_LETTER)
3191 | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
3195 * Determines if a character can start a Java identifier. This is the
3196 * combination of isLetter, any character where getType returns
3197 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3198 * (like '_').
3200 * @param ch character to test
3201 * @return true if ch can start a Java identifier, else false
3202 * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
3203 * @see #isJavaLetterOrDigit(char)
3204 * @see #isJavaIdentifierStart(char)
3205 * @see #isJavaIdentifierPart(char)
3206 * @see #isLetter(char)
3207 * @see #isLetterOrDigit(char)
3208 * @see #isUnicodeIdentifierStart(char)
3210 public static boolean isJavaLetter(char ch)
3212 return isJavaIdentifierStart(ch);
3216 * Determines if a character can follow the first letter in
3217 * a Java identifier. This is the combination of isJavaLetter (isLetter,
3218 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3219 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3220 * or isIdentifierIgnorable.
3222 * @param ch character to test
3223 * @return true if ch can follow the first letter in a Java identifier
3224 * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
3225 * @see #isJavaLetter(char)
3226 * @see #isJavaIdentifierStart(char)
3227 * @see #isJavaIdentifierPart(char)
3228 * @see #isLetter(char)
3229 * @see #isLetterOrDigit(char)
3230 * @see #isUnicodeIdentifierPart(char)
3231 * @see #isIdentifierIgnorable(char)
3233 public static boolean isJavaLetterOrDigit(char ch)
3235 return isJavaIdentifierPart(ch);
3239 * Determines if a character can start a Java identifier. This is the
3240 * combination of isLetter, any character where getType returns
3241 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3242 * (like '_').
3243 * <br>
3244 * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
3246 * @param ch character to test
3247 * @return true if ch can start a Java identifier, else false
3248 * @see #isJavaIdentifierPart(char)
3249 * @see #isLetter(char)
3250 * @see #isUnicodeIdentifierStart(char)
3251 * @since 1.1
3253 public static boolean isJavaIdentifierStart(char ch)
3255 return isJavaIdentifierStart((int)ch);
3259 * Determines if a character can start a Java identifier. This is the
3260 * combination of isLetter, any character where getType returns
3261 * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
3262 * (like '_').
3263 * <br>
3264 * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
3266 * @param codePoint character to test
3267 * @return true if ch can start a Java identifier, else false
3268 * @see #isJavaIdentifierPart(char)
3269 * @see #isLetter(char)
3270 * @see #isUnicodeIdentifierStart(char)
3271 * @since 1.5
3273 public static boolean isJavaIdentifierStart(int codePoint)
3275 return ((1 << getType(codePoint))
3276 & ((1 << UPPERCASE_LETTER)
3277 | (1 << LOWERCASE_LETTER)
3278 | (1 << TITLECASE_LETTER)
3279 | (1 << MODIFIER_LETTER)
3280 | (1 << OTHER_LETTER)
3281 | (1 << LETTER_NUMBER)
3282 | (1 << CURRENCY_SYMBOL)
3283 | (1 << CONNECTOR_PUNCTUATION))) != 0;
3287 * Determines if a character can follow the first letter in
3288 * a Java identifier. This is the combination of isJavaLetter (isLetter,
3289 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3290 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3291 * or isIdentifierIgnorable.
3292 * <br>
3293 * Java identifier extender =
3294 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
3295 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3297 * @param ch character to test
3298 * @return true if ch can follow the first letter in a Java identifier
3299 * @see #isIdentifierIgnorable(char)
3300 * @see #isJavaIdentifierStart(char)
3301 * @see #isLetterOrDigit(char)
3302 * @see #isUnicodeIdentifierPart(char)
3303 * @since 1.1
3305 public static boolean isJavaIdentifierPart(char ch)
3307 return isJavaIdentifierPart((int)ch);
3311 * Determines if a character can follow the first letter in
3312 * a Java identifier. This is the combination of isJavaLetter (isLetter,
3313 * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
3314 * numeric letter (like Roman numerals), combining marks, non-spacing marks,
3315 * or isIdentifierIgnorable.
3316 * <br>
3317 * Java identifier extender =
3318 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
3319 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3321 * @param codePoint character to test
3322 * @return true if ch can follow the first letter in a Java identifier
3323 * @see #isIdentifierIgnorable(char)
3324 * @see #isJavaIdentifierStart(char)
3325 * @see #isLetterOrDigit(char)
3326 * @see #isUnicodeIdentifierPart(char)
3327 * @since 1.5
3329 public static boolean isJavaIdentifierPart(int codePoint)
3331 int category = getType(codePoint);
3332 return ((1 << category)
3333 & ((1 << UPPERCASE_LETTER)
3334 | (1 << LOWERCASE_LETTER)
3335 | (1 << TITLECASE_LETTER)
3336 | (1 << MODIFIER_LETTER)
3337 | (1 << OTHER_LETTER)
3338 | (1 << NON_SPACING_MARK)
3339 | (1 << COMBINING_SPACING_MARK)
3340 | (1 << DECIMAL_DIGIT_NUMBER)
3341 | (1 << LETTER_NUMBER)
3342 | (1 << CURRENCY_SYMBOL)
3343 | (1 << CONNECTOR_PUNCTUATION)
3344 | (1 << FORMAT))) != 0
3345 || (category == CONTROL && isIdentifierIgnorable(codePoint));
3349 * Determines if a character can start a Unicode identifier. Only
3350 * letters can start a Unicode identifier, but this includes characters
3351 * in LETTER_NUMBER.
3352 * <br>
3353 * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
3355 * @param ch character to test
3356 * @return true if ch can start a Unicode identifier, else false
3357 * @see #isJavaIdentifierStart(char)
3358 * @see #isLetter(char)
3359 * @see #isUnicodeIdentifierPart(char)
3360 * @since 1.1
3362 public static boolean isUnicodeIdentifierStart(char ch)
3364 return isUnicodeIdentifierStart((int)ch);
3368 * Determines if a character can start a Unicode identifier. Only
3369 * letters can start a Unicode identifier, but this includes characters
3370 * in LETTER_NUMBER.
3371 * <br>
3372 * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
3374 * @param codePoint character to test
3375 * @return true if ch can start a Unicode identifier, else false
3376 * @see #isJavaIdentifierStart(char)
3377 * @see #isLetter(char)
3378 * @see #isUnicodeIdentifierPart(char)
3379 * @since 1.5
3381 public static boolean isUnicodeIdentifierStart(int codePoint)
3383 return ((1 << getType(codePoint))
3384 & ((1 << UPPERCASE_LETTER)
3385 | (1 << LOWERCASE_LETTER)
3386 | (1 << TITLECASE_LETTER)
3387 | (1 << MODIFIER_LETTER)
3388 | (1 << OTHER_LETTER)
3389 | (1 << LETTER_NUMBER))) != 0;
3393 * Determines if a character can follow the first letter in
3394 * a Unicode identifier. This includes letters, connecting punctuation,
3395 * digits, numeric letters, combining marks, non-spacing marks, and
3396 * isIdentifierIgnorable.
3397 * <br>
3398 * Unicode identifier extender =
3399 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
3400 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3402 * @param ch character to test
3403 * @return true if ch can follow the first letter in a Unicode identifier
3404 * @see #isIdentifierIgnorable(char)
3405 * @see #isJavaIdentifierPart(char)
3406 * @see #isLetterOrDigit(char)
3407 * @see #isUnicodeIdentifierStart(char)
3408 * @since 1.1
3410 public static boolean isUnicodeIdentifierPart(char ch)
3412 return isUnicodeIdentifierPart((int)ch);
3416 * Determines if a character can follow the first letter in
3417 * a Unicode identifier. This includes letters, connecting punctuation,
3418 * digits, numeric letters, combining marks, non-spacing marks, and
3419 * isIdentifierIgnorable.
3420 * <br>
3421 * Unicode identifier extender =
3422 * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
3423 * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
3425 * @param codePoint character to test
3426 * @return true if ch can follow the first letter in a Unicode identifier
3427 * @see #isIdentifierIgnorable(char)
3428 * @see #isJavaIdentifierPart(char)
3429 * @see #isLetterOrDigit(char)
3430 * @see #isUnicodeIdentifierStart(char)
3431 * @since 1.5
3433 public static boolean isUnicodeIdentifierPart(int codePoint)
3435 int category = getType(codePoint);
3436 return ((1 << category)
3437 & ((1 << UPPERCASE_LETTER)
3438 | (1 << LOWERCASE_LETTER)
3439 | (1 << TITLECASE_LETTER)
3440 | (1 << MODIFIER_LETTER)
3441 | (1 << OTHER_LETTER)
3442 | (1 << NON_SPACING_MARK)
3443 | (1 << COMBINING_SPACING_MARK)
3444 | (1 << DECIMAL_DIGIT_NUMBER)
3445 | (1 << LETTER_NUMBER)
3446 | (1 << CONNECTOR_PUNCTUATION)
3447 | (1 << FORMAT))) != 0
3448 || (category == CONTROL && isIdentifierIgnorable(codePoint));
3452 * Determines if a character is ignorable in a Unicode identifier. This
3453 * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
3454 * through <code>'\u0008'</code>, <code>'\u000E'</code> through
3455 * <code>'\u001B'</code>, and <code>'\u007F'</code> through
3456 * <code>'\u009F'</code>), and FORMAT characters.
3457 * <br>
3458 * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
3459 * |U+007F-U+009F
3461 * @param ch character to test
3462 * @return true if ch is ignorable in a Unicode or Java identifier
3463 * @see #isJavaIdentifierPart(char)
3464 * @see #isUnicodeIdentifierPart(char)
3465 * @since 1.1
3467 public static boolean isIdentifierIgnorable(char ch)
3469 return isIdentifierIgnorable((int)ch);
3472 * Determines if a character is ignorable in a Unicode identifier. This
3473 * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
3474 * through <code>'\u0008'</code>, <code>'\u000E'</code> through
3475 * <code>'\u001B'</code>, and <code>'\u007F'</code> through
3476 * <code>'\u009F'</code>), and FORMAT characters.
3477 * <br>
3478 * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
3479 * |U+007F-U+009F
3481 * @param codePoint character to test
3482 * @return true if ch is ignorable in a Unicode or Java identifier
3483 * @see #isJavaIdentifierPart(char)
3484 * @see #isUnicodeIdentifierPart(char)
3485 * @since 1.5
3487 public static boolean isIdentifierIgnorable(int codePoint)
3489 if ((codePoint >= 0 && codePoint <= 0x0008)
3490 || (codePoint >= 0x000E && codePoint <= 0x001B)
3491 || (codePoint >= 0x007F && codePoint <= 0x009F)
3492 || getType(codePoint) == FORMAT)
3493 return true;
3494 return false;
3498 * Converts a Unicode character into its lowercase equivalent mapping.
3499 * If a mapping does not exist, then the character passed is returned.
3500 * Note that isLowerCase(toLowerCase(ch)) does not always return true.
3502 * @param ch character to convert to lowercase
3503 * @return lowercase mapping of ch, or ch if lowercase mapping does
3504 * not exist
3505 * @see #isLowerCase(char)
3506 * @see #isUpperCase(char)
3507 * @see #toTitleCase(char)
3508 * @see #toUpperCase(char)
3510 public static char toLowerCase(char ch)
3512 return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch);
3516 * Converts a Unicode character into its lowercase equivalent mapping.
3517 * If a mapping does not exist, then the character passed is returned.
3518 * Note that isLowerCase(toLowerCase(ch)) does not always return true.
3520 * @param codePoint character to convert to lowercase
3521 * @return lowercase mapping of ch, or ch if lowercase mapping does
3522 * not exist
3523 * @see #isLowerCase(char)
3524 * @see #isUpperCase(char)
3525 * @see #toTitleCase(char)
3526 * @see #toUpperCase(char)
3528 * @since 1.5
3530 public static int toLowerCase(int codePoint)
3532 // If the code point is unassigned or in one of the private use areas
3533 // then we delegate the call to the appropriate private static inner class.
3534 int plane = codePoint >>> 16;
3535 if (plane > 2 && plane < 14)
3536 return UnassignedCharacters.toLowerCase(codePoint);
3537 if (plane > 14)
3538 return PrivateUseCharacters.toLowerCase(codePoint);
3540 // The short value stored in lower[plane] is the signed difference between
3541 // codePoint and its lowercase conversion.
3542 return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
3546 * Converts a Unicode character into its uppercase equivalent mapping.
3547 * If a mapping does not exist, then the character passed is returned.
3548 * Note that isUpperCase(toUpperCase(ch)) does not always return true.
3550 * @param ch character to convert to uppercase
3551 * @return uppercase mapping of ch, or ch if uppercase mapping does
3552 * not exist
3553 * @see #isLowerCase(char)
3554 * @see #isUpperCase(char)
3555 * @see #toLowerCase(char)
3556 * @see #toTitleCase(char)
3558 public static char toUpperCase(char ch)
3560 return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch);
3564 * Converts a Unicode character into its uppercase equivalent mapping.
3565 * If a mapping does not exist, then the character passed is returned.
3566 * Note that isUpperCase(toUpperCase(ch)) does not always return true.
3568 * @param codePoint character to convert to uppercase
3569 * @return uppercase mapping of ch, or ch if uppercase mapping does
3570 * not exist
3571 * @see #isLowerCase(char)
3572 * @see #isUpperCase(char)
3573 * @see #toLowerCase(char)
3574 * @see #toTitleCase(char)
3576 * @since 1.5
3578 public static int toUpperCase(int codePoint)
3580 // If the code point is unassigned or in one of the private use areas
3581 // then we delegate the call to the appropriate private static inner class.
3582 int plane = codePoint >>> 16;
3583 if (plane > 2 && plane < 14)
3584 return UnassignedCharacters.toUpperCase(codePoint);
3585 if (plane > 14)
3586 return PrivateUseCharacters.toUpperCase(codePoint);
3588 // The short value stored in upper[plane] is the signed difference between
3589 // codePoint and its uppercase conversion.
3590 return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
3594 * Converts a Unicode character into its titlecase equivalent mapping.
3595 * If a mapping does not exist, then the character passed is returned.
3596 * Note that isTitleCase(toTitleCase(ch)) does not always return true.
3598 * @param ch character to convert to titlecase
3599 * @return titlecase mapping of ch, or ch if titlecase mapping does
3600 * not exist
3601 * @see #isTitleCase(char)
3602 * @see #toLowerCase(char)
3603 * @see #toUpperCase(char)
3605 public static char toTitleCase(char ch)
3607 // As title is short, it doesn't hurt to exhaustively iterate over it.
3608 for (int i = title.length - 2; i >= 0; i -= 2)
3609 if (title[i] == ch)
3610 return title[i + 1];
3611 return toUpperCase(ch);
3615 * Converts a Unicode character into its titlecase equivalent mapping.
3616 * If a mapping does not exist, then the character passed is returned.
3617 * Note that isTitleCase(toTitleCase(ch)) does not always return true.
3619 * @param codePoint character to convert to titlecase
3620 * @return titlecase mapping of ch, or ch if titlecase mapping does
3621 * not exist
3622 * @see #isTitleCase(char)
3623 * @see #toLowerCase(char)
3624 * @see #toUpperCase(char)
3626 * @since 1.5
3628 public static int toTitleCase(int codePoint)
3630 // As of Unicode 4.0.0 no characters outside of plane 0 have
3631 // titlecase mappings that are different from their uppercase
3632 // mapping.
3633 if (codePoint < 0x10000)
3634 return (int) toTitleCase((char)codePoint);
3635 return toUpperCase(codePoint);
3639 * Converts a character into a digit of the specified radix. If the radix
3640 * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
3641 * exceeds the radix, or if ch is not a decimal digit or in the case
3642 * insensitive set of 'a'-'z', the result is -1.
3643 * <br>
3644 * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
3645 * |U+FF21-U+FF3A|U+FF41-U+FF5A
3647 * @param ch character to convert into a digit
3648 * @param radix radix in which ch is a digit
3649 * @return digit which ch represents in radix, or -1 not a valid digit
3650 * @see #MIN_RADIX
3651 * @see #MAX_RADIX
3652 * @see #forDigit(int, int)
3653 * @see #isDigit(char)
3654 * @see #getNumericValue(char)
3656 public static int digit(char ch, int radix)
3658 if (radix < MIN_RADIX || radix > MAX_RADIX)
3659 return -1;
3660 char attr = readCodePoint((int)ch);
3661 if (((1 << (attr & TYPE_MASK))
3662 & ((1 << UPPERCASE_LETTER)
3663 | (1 << LOWERCASE_LETTER)
3664 | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
3666 // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
3667 int digit = numValue[0][attr >> 7];
3668 return (digit < radix) ? digit : -1;
3670 return -1;
3674 * Converts a character into a digit of the specified radix. If the radix
3675 * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
3676 * exceeds the radix, or if ch is not a decimal digit or in the case
3677 * insensitive set of 'a'-'z', the result is -1.
3678 * <br>
3679 * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
3680 * |U+FF21-U+FF3A|U+FF41-U+FF5A
3682 * @param codePoint character to convert into a digit
3683 * @param radix radix in which ch is a digit
3684 * @return digit which ch represents in radix, or -1 not a valid digit
3685 * @see #MIN_RADIX
3686 * @see #MAX_RADIX
3687 * @see #forDigit(int, int)
3688 * @see #isDigit(char)
3689 * @see #getNumericValue(char)
3691 public static int digit(int codePoint, int radix)
3693 if (radix < MIN_RADIX || radix > MAX_RADIX)
3694 return -1;
3696 // If the code point is unassigned or in one of the private use areas
3697 // then we delegate the call to the appropriate private static inner class.
3698 int plane = codePoint >>> 16;
3699 if (plane > 2 && plane < 14)
3700 return UnassignedCharacters.digit(codePoint, radix);
3701 if (plane > 14)
3702 return PrivateUseCharacters.digit(codePoint, radix);
3703 char attr = readCodePoint(codePoint);
3704 if (((1 << (attr & TYPE_MASK))
3705 & ((1 << UPPERCASE_LETTER)
3706 | (1 << LOWERCASE_LETTER)
3707 | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
3709 // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
3710 int digit = numValue[plane][attr >> 7];
3712 // If digit is less than or equal to -3 then the numerical value was
3713 // too large to fit into numValue and is stored in CharData.LARGENUMS.
3714 if (digit <= -3)
3715 digit = CharData.LARGENUMS[-digit - 3];
3716 return (digit < radix) ? digit : -1;
3718 return -1;
3722 * Returns the Unicode numeric value property of a character. For example,
3723 * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
3725 * <p>This method also returns values for the letters A through Z, (not
3726 * specified by Unicode), in these ranges: <code>'\u0041'</code>
3727 * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
3728 * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
3729 * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
3730 * <code>'\uFF5A'</code> (full width variants).
3732 * <p>If the character lacks a numeric value property, -1 is returned.
3733 * If the character has a numeric value property which is not representable
3734 * as a nonnegative integer, such as a fraction, -2 is returned.
3736 * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
3737 * |U+FF21-U+FF3A|U+FF41-U+FF5A
3739 * @param ch character from which the numeric value property will
3740 * be retrieved
3741 * @return the numeric value property of ch, or -1 if it does not exist, or
3742 * -2 if it is not representable as a nonnegative integer
3743 * @see #forDigit(int, int)
3744 * @see #digit(char, int)
3745 * @see #isDigit(char)
3746 * @since 1.1
3748 public static int getNumericValue(char ch)
3750 // Treat numValue as signed.
3751 return (short) numValue[0][readCodePoint((int)ch) >> 7];
3755 * Returns the Unicode numeric value property of a character. For example,
3756 * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
3758 * <p>This method also returns values for the letters A through Z, (not
3759 * specified by Unicode), in these ranges: <code>'\u0041'</code>
3760 * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
3761 * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
3762 * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
3763 * <code>'\uFF5A'</code> (full width variants).
3765 * <p>If the character lacks a numeric value property, -1 is returned.
3766 * If the character has a numeric value property which is not representable
3767 * as a nonnegative integer, such as a fraction, -2 is returned.
3769 * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
3770 * |U+FF21-U+FF3A|U+FF41-U+FF5A
3772 * @param codePoint character from which the numeric value property will
3773 * be retrieved
3774 * @return the numeric value property of ch, or -1 if it does not exist, or
3775 * -2 if it is not representable as a nonnegative integer
3776 * @see #forDigit(int, int)
3777 * @see #digit(char, int)
3778 * @see #isDigit(char)
3779 * @since 1.5
3781 public static int getNumericValue(int codePoint)
3783 // If the code point is unassigned or in one of the private use areas
3784 // then we delegate the call to the appropriate private static inner class.
3785 int plane = codePoint >>> 16;
3786 if (plane > 2 && plane < 14)
3787 return UnassignedCharacters.getNumericValue(codePoint);
3788 if (plane > 14)
3789 return PrivateUseCharacters.getNumericValue(codePoint);
3791 // If the value N found in numValue[plane] is less than or equal to -3
3792 // then the numeric value was too big to fit into 16 bits and is
3793 // stored in CharData.LARGENUMS at offset (-N - 3).
3794 short num = (short)numValue[plane][readCodePoint(codePoint) >> 7];
3795 if (num <= -3)
3796 return CharData.LARGENUMS[-num - 3];
3797 return num;
3801 * Determines if a character is a ISO-LATIN-1 space. This is only the five
3802 * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
3803 * <code>'\r'</code>, and <code>' '</code>.
3804 * <br>
3805 * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
3807 * @param ch character to test
3808 * @return true if ch is a space, else false
3809 * @deprecated Replaced by {@link #isWhitespace(char)}
3810 * @see #isSpaceChar(char)
3811 * @see #isWhitespace(char)
3813 public static boolean isSpace(char ch)
3815 // Performing the subtraction up front alleviates need to compare longs.
3816 return ch-- <= ' ' && ((1 << ch)
3817 & ((1 << (' ' - 1))
3818 | (1 << ('\t' - 1))
3819 | (1 << ('\n' - 1))
3820 | (1 << ('\r' - 1))
3821 | (1 << ('\f' - 1)))) != 0;
3825 * Determines if a character is a Unicode space character. This includes
3826 * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3827 * <br>
3828 * Unicode space = [Zs]|[Zp]|[Zl]
3830 * @param ch character to test
3831 * @return true if ch is a Unicode space, else false
3832 * @see #isWhitespace(char)
3833 * @since 1.1
3835 public static boolean isSpaceChar(char ch)
3837 return isSpaceChar((int)ch);
3841 * Determines if a character is a Unicode space character. This includes
3842 * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
3843 * <br>
3844 * Unicode space = [Zs]|[Zp]|[Zl]
3846 * @param codePoint character to test
3847 * @return true if ch is a Unicode space, else false
3848 * @see #isWhitespace(char)
3849 * @since 1.5
3851 public static boolean isSpaceChar(int codePoint)
3853 return ((1 << getType(codePoint))
3854 & ((1 << SPACE_SEPARATOR)
3855 | (1 << LINE_SEPARATOR)
3856 | (1 << PARAGRAPH_SEPARATOR))) != 0;
3860 * Determines if a character is Java whitespace. This includes Unicode
3861 * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3862 * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3863 * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3864 * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3865 * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3866 * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3867 * and <code>'\u001F'</code>.
3868 * <br>
3869 * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3871 * @param ch character to test
3872 * @return true if ch is Java whitespace, else false
3873 * @see #isSpaceChar(char)
3874 * @since 1.1
3876 public static boolean isWhitespace(char ch)
3878 return isWhitespace((int) ch);
3882 * Determines if a character is Java whitespace. This includes Unicode
3883 * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
3884 * PARAGRAPH_SEPARATOR) except the non-breaking spaces
3885 * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
3886 * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
3887 * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
3888 * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
3889 * and <code>'\u001F'</code>.
3890 * <br>
3891 * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
3893 * @param codePoint character to test
3894 * @return true if ch is Java whitespace, else false
3895 * @see #isSpaceChar(char)
3896 * @since 1.5
3898 public static boolean isWhitespace(int codePoint)
3900 int plane = codePoint >>> 16;
3901 if (plane > 2 && plane < 14)
3902 return UnassignedCharacters.isWhiteSpace(codePoint);
3903 if (plane > 14)
3904 return PrivateUseCharacters.isWhiteSpace(codePoint);
3906 int attr = readCodePoint(codePoint);
3907 return ((((1 << (attr & TYPE_MASK))
3908 & ((1 << SPACE_SEPARATOR)
3909 | (1 << LINE_SEPARATOR)
3910 | (1 << PARAGRAPH_SEPARATOR))) != 0)
3911 && (attr & NO_BREAK_MASK) == 0)
3912 || (codePoint <= '\u001F' && ((1 << codePoint)
3913 & ((1 << '\t')
3914 | (1 << '\n')
3915 | (1 << '\u000B')
3916 | (1 << '\u000C')
3917 | (1 << '\r')
3918 | (1 << '\u001C')
3919 | (1 << '\u001D')
3920 | (1 << '\u001E')
3921 | (1 << '\u001F'))) != 0);
3925 * Determines if a character has the ISO Control property.
3926 * <br>
3927 * ISO Control = [Cc]
3929 * @param ch character to test
3930 * @return true if ch is an ISO Control character, else false
3931 * @see #isSpaceChar(char)
3932 * @see #isWhitespace(char)
3933 * @since 1.1
3935 public static boolean isISOControl(char ch)
3937 return isISOControl((int)ch);
3941 * Determines if the character is an ISO Control character. This is true
3942 * if the code point is in the range [0, 0x001F] or if it is in the range
3943 * [0x007F, 0x009F].
3944 * @param codePoint the character to check
3945 * @return true if the character is in one of the above ranges
3947 * @since 1.5
3949 public static boolean isISOControl(int codePoint)
3951 if ((codePoint >= 0 && codePoint <= 0x001F)
3952 || (codePoint >= 0x007F && codePoint <= 0x009F))
3953 return true;
3954 return false;
3958 * Returns the Unicode general category property of a character.
3960 * @param ch character from which the general category property will
3961 * be retrieved
3962 * @return the character category property of ch as an integer
3963 * @see #UNASSIGNED
3964 * @see #UPPERCASE_LETTER
3965 * @see #LOWERCASE_LETTER
3966 * @see #TITLECASE_LETTER
3967 * @see #MODIFIER_LETTER
3968 * @see #OTHER_LETTER
3969 * @see #NON_SPACING_MARK
3970 * @see #ENCLOSING_MARK
3971 * @see #COMBINING_SPACING_MARK
3972 * @see #DECIMAL_DIGIT_NUMBER
3973 * @see #LETTER_NUMBER
3974 * @see #OTHER_NUMBER
3975 * @see #SPACE_SEPARATOR
3976 * @see #LINE_SEPARATOR
3977 * @see #PARAGRAPH_SEPARATOR
3978 * @see #CONTROL
3979 * @see #FORMAT
3980 * @see #PRIVATE_USE
3981 * @see #SURROGATE
3982 * @see #DASH_PUNCTUATION
3983 * @see #START_PUNCTUATION
3984 * @see #END_PUNCTUATION
3985 * @see #CONNECTOR_PUNCTUATION
3986 * @see #OTHER_PUNCTUATION
3987 * @see #MATH_SYMBOL
3988 * @see #CURRENCY_SYMBOL
3989 * @see #MODIFIER_SYMBOL
3990 * @see #INITIAL_QUOTE_PUNCTUATION
3991 * @see #FINAL_QUOTE_PUNCTUATION
3992 * @since 1.1
3994 public static int getType(char ch)
3996 return getType((int)ch);
4000 * Returns the Unicode general category property of a character.
4002 * @param codePoint character from which the general category property will
4003 * be retrieved
4004 * @return the character category property of ch as an integer
4005 * @see #UNASSIGNED
4006 * @see #UPPERCASE_LETTER
4007 * @see #LOWERCASE_LETTER
4008 * @see #TITLECASE_LETTER
4009 * @see #MODIFIER_LETTER
4010 * @see #OTHER_LETTER
4011 * @see #NON_SPACING_MARK
4012 * @see #ENCLOSING_MARK
4013 * @see #COMBINING_SPACING_MARK
4014 * @see #DECIMAL_DIGIT_NUMBER
4015 * @see #LETTER_NUMBER
4016 * @see #OTHER_NUMBER
4017 * @see #SPACE_SEPARATOR
4018 * @see #LINE_SEPARATOR
4019 * @see #PARAGRAPH_SEPARATOR
4020 * @see #CONTROL
4021 * @see #FORMAT
4022 * @see #PRIVATE_USE
4023 * @see #SURROGATE
4024 * @see #DASH_PUNCTUATION
4025 * @see #START_PUNCTUATION
4026 * @see #END_PUNCTUATION
4027 * @see #CONNECTOR_PUNCTUATION
4028 * @see #OTHER_PUNCTUATION
4029 * @see #MATH_SYMBOL
4030 * @see #CURRENCY_SYMBOL
4031 * @see #MODIFIER_SYMBOL
4032 * @see #INITIAL_QUOTE_PUNCTUATION
4033 * @see #FINAL_QUOTE_PUNCTUATION
4035 * @since 1.5
4037 public static int getType(int codePoint)
4039 // If the codePoint is unassigned or in one of the private use areas
4040 // then we delegate the call to the appropriate private static inner class.
4041 int plane = codePoint >>> 16;
4042 if (plane > 2 && plane < 14)
4043 return UnassignedCharacters.getType(codePoint);
4044 if (plane > 14)
4045 return PrivateUseCharacters.getType(codePoint);
4047 return readCodePoint(codePoint) & TYPE_MASK;
4051 * Converts a digit into a character which represents that digit
4052 * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
4053 * or the digit exceeds the radix, then the null character <code>'\0'</code>
4054 * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'.
4055 * <br>
4056 * return value boundary = U+0030-U+0039|U+0061-U+007A
4058 * @param digit digit to be converted into a character
4059 * @param radix radix of digit
4060 * @return character representing digit in radix, or '\0'
4061 * @see #MIN_RADIX
4062 * @see #MAX_RADIX
4063 * @see #digit(char, int)
4065 public static char forDigit(int digit, int radix)
4067 if (radix < MIN_RADIX || radix > MAX_RADIX
4068 || digit < 0 || digit >= radix)
4069 return '\0';
4070 return Number.digits[digit];
4074 * Returns the Unicode directionality property of the character. This
4075 * is used in the visual ordering of text.
4077 * @param ch the character to look up
4078 * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
4079 * @see #DIRECTIONALITY_UNDEFINED
4080 * @see #DIRECTIONALITY_LEFT_TO_RIGHT
4081 * @see #DIRECTIONALITY_RIGHT_TO_LEFT
4082 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
4083 * @see #DIRECTIONALITY_EUROPEAN_NUMBER
4084 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
4085 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
4086 * @see #DIRECTIONALITY_ARABIC_NUMBER
4087 * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
4088 * @see #DIRECTIONALITY_NONSPACING_MARK
4089 * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
4090 * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
4091 * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
4092 * @see #DIRECTIONALITY_WHITESPACE
4093 * @see #DIRECTIONALITY_OTHER_NEUTRALS
4094 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
4095 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
4096 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
4097 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
4098 * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
4099 * @since 1.4
4101 public static byte getDirectionality(char ch)
4103 // The result will correctly be signed.
4104 return getDirectionality((int)ch);
4108 * Returns the Unicode directionality property of the character. This
4109 * is used in the visual ordering of text.
4111 * @param codePoint the character to look up
4112 * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
4113 * @see #DIRECTIONALITY_UNDEFINED
4114 * @see #DIRECTIONALITY_LEFT_TO_RIGHT
4115 * @see #DIRECTIONALITY_RIGHT_TO_LEFT
4116 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
4117 * @see #DIRECTIONALITY_EUROPEAN_NUMBER
4118 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
4119 * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
4120 * @see #DIRECTIONALITY_ARABIC_NUMBER
4121 * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
4122 * @see #DIRECTIONALITY_NONSPACING_MARK
4123 * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
4124 * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
4125 * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
4126 * @see #DIRECTIONALITY_WHITESPACE
4127 * @see #DIRECTIONALITY_OTHER_NEUTRALS
4128 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
4129 * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
4130 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
4131 * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
4132 * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
4133 * @since 1.5
4135 public static byte getDirectionality(int codePoint)
4137 // If the code point is unassigned or in one of the private use areas
4138 // then we delegate the call to the appropriate private static inner class.
4139 int plane = codePoint >>> 16;
4140 if (plane > 2 && plane < 14)
4141 return UnassignedCharacters.getDirectionality(codePoint);
4142 if (plane > 14)
4143 return PrivateUseCharacters.getDirectionality(codePoint);
4145 // The result will correctly be signed.
4146 return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2);
4150 * Determines whether the character is mirrored according to Unicode. For
4151 * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
4152 * left-to-right text, but ')' in right-to-left text.
4154 * @param ch the character to look up
4155 * @return true if the character is mirrored
4156 * @since 1.4
4158 public static boolean isMirrored(char ch)
4160 return (readCodePoint((int)ch) & MIRROR_MASK) != 0;
4164 * Determines whether the character is mirrored according to Unicode. For
4165 * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
4166 * left-to-right text, but ')' in right-to-left text.
4168 * @param codePoint the character to look up
4169 * @return true if the character is mirrored
4170 * @since 1.5
4172 public static boolean isMirrored(int codePoint)
4174 // If the code point is unassigned or part of one of the private use areas
4175 // then we delegate the call to the appropriate private static inner class.
4176 int plane = codePoint >>> 16;
4177 if (plane > 2 && plane < 14)
4178 return UnassignedCharacters.isMirrored(codePoint);
4179 if (plane > 14)
4180 return PrivateUseCharacters.isMirrored(codePoint);
4182 return (readCodePoint(codePoint) & MIRROR_MASK) != 0;
4186 * Compares another Character to this Character, numerically.
4188 * @param anotherCharacter Character to compare with this Character
4189 * @return a negative integer if this Character is less than
4190 * anotherCharacter, zero if this Character is equal, and
4191 * a positive integer if this Character is greater
4192 * @throws NullPointerException if anotherCharacter is null
4193 * @since 1.2
4195 public int compareTo(Character anotherCharacter)
4197 return value - anotherCharacter.value;
4201 * Compares an object to this Character. Assuming the object is a
4202 * Character object, this method performs the same comparison as
4203 * compareTo(Character).
4205 * @param o object to compare
4206 * @return the comparison value
4207 * @throws ClassCastException if o is not a Character object
4208 * @throws NullPointerException if o is null
4209 * @see #compareTo(Character)
4210 * @since 1.2
4212 public int compareTo(Object o)
4214 return compareTo((Character) o);
4218 * Returns a <code>Character</code> object wrapping the value.
4219 * In contrast to the <code>Character</code> constructor, this method
4220 * will cache some values. It is used by boxing conversion.
4222 * @param val the value to wrap
4223 * @return the <code>Character</code>
4225 * @since 1.5
4227 public static Character valueOf(char val)
4229 if (val > MAX_CACHE)
4230 return new Character(val);
4231 synchronized (charCache)
4233 if (charCache[val - MIN_VALUE] == null)
4234 charCache[val - MIN_VALUE] = new Character(val);
4235 return charCache[val - MIN_VALUE];
4240 * Reverse the bytes in val.
4241 * @since 1.5
4243 public static char reverseBytes(char val)
4245 return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
4249 * Converts a unicode code point to a UTF-16 representation of that
4250 * code point.
4252 * @param codePoint the unicode code point
4254 * @return the UTF-16 representation of that code point
4256 * @throws IllegalArgumentException if the code point is not a valid
4257 * unicode code point
4259 * @since 1.5
4261 public static char[] toChars(int codePoint)
4263 if (!isValidCodePoint(codePoint))
4264 throw new IllegalArgumentException("Illegal Unicode code point : "
4265 + codePoint);
4266 char[] result = new char[charCount(codePoint)];
4267 int ignore = toChars(codePoint, result, 0);
4268 return result;
4272 * Converts a unicode code point to its UTF-16 representation.
4274 * @param codePoint the unicode code point
4275 * @param dst the target char array
4276 * @param dstIndex the start index for the target
4278 * @return number of characters written to <code>dst</code>
4280 * @throws IllegalArgumentException if <code>codePoint</code> is not a
4281 * valid unicode code point
4282 * @throws NullPointerException if <code>dst</code> is <code>null</code>
4283 * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
4284 * in <code>dst</code> or if the UTF-16 representation does not
4285 * fit into <code>dst</code>
4287 * @since 1.5
4289 public static int toChars(int codePoint, char[] dst, int dstIndex)
4291 if (!isValidCodePoint(codePoint))
4293 throw new IllegalArgumentException("not a valid code point: "
4294 + codePoint);
4297 int result;
4298 if (isSupplementaryCodePoint(codePoint))
4300 // Write second char first to cause IndexOutOfBoundsException
4301 // immediately.
4302 final int cp2 = codePoint - 0x10000;
4303 dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
4304 dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
4305 result = 2;
4307 else
4309 dst[dstIndex] = (char) codePoint;
4310 result = 1;
4312 return result;
4316 * Return number of 16-bit characters required to represent the given
4317 * code point.
4319 * @param codePoint a unicode code point
4321 * @return 2 if codePoint >= 0x10000, 1 otherwise.
4323 * @since 1.5
4325 public static int charCount(int codePoint)
4327 return
4328 (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
4329 ? 2
4330 : 1;
4334 * Determines whether the specified code point is
4335 * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
4336 * supplementary character range.
4338 * @param codePoint a Unicode code point
4340 * @return <code>true</code> if code point is in supplementary range
4342 * @since 1.5
4344 public static boolean isSupplementaryCodePoint(int codePoint)
4346 return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
4347 && codePoint <= MAX_CODE_POINT;
4351 * Determines whether the specified code point is
4352 * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
4354 * @param codePoint a Unicode code point
4356 * @return <code>true</code> if code point is valid
4358 * @since 1.5
4360 public static boolean isValidCodePoint(int codePoint)
4362 return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
4366 * Return true if the given character is a high surrogate.
4367 * @param ch the character
4368 * @return true if the character is a high surrogate character
4370 * @since 1.5
4372 public static boolean isHighSurrogate(char ch)
4374 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
4378 * Return true if the given character is a low surrogate.
4379 * @param ch the character
4380 * @return true if the character is a low surrogate character
4382 * @since 1.5
4384 public static boolean isLowSurrogate(char ch)
4386 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
4390 * Return true if the given characters compose a surrogate pair.
4391 * This is true if the first character is a high surrogate and the
4392 * second character is a low surrogate.
4393 * @param ch1 the first character
4394 * @param ch2 the second character
4395 * @return true if the characters compose a surrogate pair
4397 * @since 1.5
4399 public static boolean isSurrogatePair(char ch1, char ch2)
4401 return isHighSurrogate(ch1) && isLowSurrogate(ch2);
4405 * Given a valid surrogate pair, this returns the corresponding
4406 * code point.
4407 * @param high the high character of the pair
4408 * @param low the low character of the pair
4409 * @return the corresponding code point
4411 * @since 1.5
4413 public static int toCodePoint(char high, char low)
4415 return ((high - MIN_HIGH_SURROGATE) * 0x400) +
4416 (low - MIN_LOW_SURROGATE) + 0x10000;
4420 * Get the code point at the specified index in the CharSequence.
4421 * This is like CharSequence#charAt(int), but if the character is
4422 * the start of a surrogate pair, and there is a following
4423 * character, and this character completes the pair, then the
4424 * corresponding supplementary code point is returned. Otherwise,
4425 * the character at the index is returned.
4427 * @param sequence the CharSequence
4428 * @param index the index of the codepoint to get, starting at 0
4429 * @return the codepoint at the specified index
4430 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4431 * @since 1.5
4433 public static int codePointAt(CharSequence sequence, int index)
4435 int len = sequence.length();
4436 if (index < 0 || index >= len)
4437 throw new IndexOutOfBoundsException();
4438 char high = sequence.charAt(index);
4439 if (! isHighSurrogate(high) || ++index >= len)
4440 return high;
4441 char low = sequence.charAt(index);
4442 if (! isLowSurrogate(low))
4443 return high;
4444 return toCodePoint(high, low);
4448 * Get the code point at the specified index in the char array.
4449 * If the character is the start of a surrogate pair, and there is a
4450 * following character, and this character completes the pair, then
4451 * the corresponding supplementary code point is returned.
4452 * Otherwise, the character at the index is returned.
4454 * @param chars the character array in which to look
4455 * @param index the index of the codepoint to get, starting at 0
4456 * @return the codepoint at the specified index
4457 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4458 * @since 1.5
4460 public static int codePointAt(char[] chars, int index)
4462 return codePointAt(chars, index, chars.length);
4466 * Get the code point at the specified index in the char array.
4467 * If the character is the start of a surrogate pair, and there is a
4468 * following character within the specified range, and this
4469 * character completes the pair, then the corresponding
4470 * supplementary code point is returned. Otherwise, the character
4471 * at the index is returned.
4473 * @param chars the character array in which to look
4474 * @param index the index of the codepoint to get, starting at 0
4475 * @param limit the limit past which characters should not be examined
4476 * @return the codepoint at the specified index
4477 * @throws IndexOutOfBoundsException if index is negative or &gt;=
4478 * limit, or if limit is negative or &gt; the length of the array
4479 * @since 1.5
4481 public static int codePointAt(char[] chars, int index, int limit)
4483 if (index < 0 || index >= limit || limit < 0 || limit > chars.length)
4484 throw new IndexOutOfBoundsException();
4485 char high = chars[index];
4486 if (! isHighSurrogate(high) || ++index >= limit)
4487 return high;
4488 char low = chars[index];
4489 if (! isLowSurrogate(low))
4490 return high;
4491 return toCodePoint(high, low);
4495 * Get the code point before the specified index. This is like
4496 * #codePointAt(char[], int), but checks the characters at
4497 * <code>index-1</code> and <code>index-2</code> to see if they form
4498 * a supplementary code point. If they do not, the character at
4499 * <code>index-1</code> is returned.
4501 * @param chars the character array
4502 * @param index the index just past the codepoint to get, starting at 0
4503 * @return the codepoint at the specified index
4504 * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
4505 * @since 1.5
4507 public static int codePointBefore(char[] chars, int index)
4509 return codePointBefore(chars, index, 1);
4513 * Get the code point before the specified index. This is like
4514 * #codePointAt(char[], int), but checks the characters at
4515 * <code>index-1</code> and <code>index-2</code> to see if they form
4516 * a supplementary code point. If they do not, the character at
4517 * <code>index-1</code> is returned. The start parameter is used to
4518 * limit the range of the array which may be examined.
4520 * @param chars the character array
4521 * @param index the index just past the codepoint to get, starting at 0
4522 * @param start the index before which characters should not be examined
4523 * @return the codepoint at the specified index
4524 * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
4525 * the length of the array, or if limit is negative or &gt;= the
4526 * length of the array
4527 * @since 1.5
4529 public static int codePointBefore(char[] chars, int index, int start)
4531 if (index < start || index > chars.length
4532 || start < 0 || start >= chars.length)
4533 throw new IndexOutOfBoundsException();
4534 --index;
4535 char low = chars[index];
4536 if (! isLowSurrogate(low) || --index < start)
4537 return low;
4538 char high = chars[index];
4539 if (! isHighSurrogate(high))
4540 return low;
4541 return toCodePoint(high, low);
4545 * Get the code point before the specified index. This is like
4546 * #codePointAt(CharSequence, int), but checks the characters at
4547 * <code>index-1</code> and <code>index-2</code> to see if they form
4548 * a supplementary code point. If they do not, the character at
4549 * <code>index-1</code> is returned.
4551 * @param sequence the CharSequence
4552 * @param index the index just past the codepoint to get, starting at 0
4553 * @return the codepoint at the specified index
4554 * @throws IndexOutOfBoundsException if index is less than 1 or &gt; length()
4555 * @since 1.5
4557 public static int codePointBefore(CharSequence sequence, int index)
4559 int len = sequence.length();
4560 if (index < 1 || index > len)
4561 throw new IndexOutOfBoundsException();
4562 --index;
4563 char low = sequence.charAt(index);
4564 if (! isLowSurrogate(low) || --index < 0)
4565 return low;
4566 char high = sequence.charAt(index);
4567 if (! isHighSurrogate(high))
4568 return low;
4569 return toCodePoint(high, low);
4571 } // class Character