// Scintilla source code edit control
/** @file CharacterCategory.cxx
 ** Returns the Unicode general category of a character.
 ** Table automatically regenerated by scripts/GenerateCharacterCategory.py
 ** Should only be rarely regenerated for new versions of Unicode.
 **/
// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
#include <algorithm>

#include "StringCopy.h"
#include "CharacterCategory.h"
18 // Use an unnamed namespace to protect the declarations from name conflicts
20 const int catRanges
[] = {
21 //++Autogenerated -- start of section automatically generated
22 // Created with Python 3.6.1, Unicode 9.0.0
3677 //--Autogenerated -- end of section automatically generated
3680 const int maxUnicode
= 0x10ffff;
3681 const int maskCategory
= 0x1F;
3682 const int nRanges
= ELEMENTS(catRanges
);
3686 // Each element in catRanges is the start of a range of Unicode characters in
3687 // one general category.
3688 // The value is comprised of a 21-bit character value shifted 5 bits and a 5 bit
3689 // category matching the CharacterCategory enumeration.
3690 // Initial version has 3249 entries and adds about 13K to the executable.
3691 // The array is in ascending order so can be searched using binary search.
3692 // Therefore the average call takes log2(3249) = 12 comparisons.
3693 // For speed, it may be useful to make a linear table for the common values,
3694 // possibly for 0..0xff for most Western European text or 0..0xfff for most
3695 // alphabetic languages.
3697 CharacterCategory
CategoriseCharacter(int character
) {
3698 if (character
< 0 || character
> maxUnicode
)
3700 const int baseValue
= character
* (maskCategory
+1) + maskCategory
;
3701 const int *placeAfter
= std::lower_bound(catRanges
, catRanges
+nRanges
, baseValue
);
3702 return static_cast<CharacterCategory
>(*(placeAfter
-1) & maskCategory
);
// Implementation of character sets recommended for identifiers in Unicode Standard Annex #31.
// http://unicode.org/reports/tr31/

// Result of OtherIDOfCharacter: whether a character is specially listed as an
// identifier start, an identifier continuation, or not listed at all.
enum class OtherID { oidNone, oidStart, oidContinue };

// Some characters are treated as valid for identifiers even
// though most characters from their category are not.
// Values copied from http://www.unicode.org/Public/9.0.0/ucd/PropList.txt
// Returns oidStart or oidContinue for the listed characters and oidNone for all others.
OtherID OtherIDOfCharacter(int character) {
	if (
		(character == 0x1885) ||	// MONGOLIAN LETTER ALI GALI BALUDA
		(character == 0x1886) ||	// MONGOLIAN LETTER ALI GALI THREE BALUDA
		(character == 0x2118) ||	// SCRIPT CAPITAL P
		(character == 0x212E) ||	// ESTIMATED SYMBOL
		(character == 0x309B) ||	// KATAKANA-HIRAGANA VOICED SOUND MARK
		(character == 0x309C)) {	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
		return OtherID::oidStart;
	} else if (
		(character == 0x00B7) ||	// MIDDLE DOT
		(character == 0x0387) ||	// GREEK ANO TELEIA
		((character >= 0x1369) && (character <= 0x1371)) ||	// ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
		(character == 0x19DA)) {	// NEW TAI LUE THAM DIGIT ONE
		return OtherID::oidContinue;
	} else {
		return OtherID::oidNone;
	}
}
// Determine if a character is in Ll|Lu|Lt|Lm|Lo|Nl|Mn|Mc|Nd|Pc and has
// Pattern_Syntax|Pattern_White_Space.
// As of Unicode 9, only VERTICAL TILDE which is in Lm and has Pattern_Syntax matches.
// Should really generate from PropList.txt a list of Pattern_Syntax and Pattern_White_Space.
// Returns true only for U+2E2F.
bool IsIdPattern(int character) {
	return character == 0x2E2F;
}
// Return true for the characters listed below, which IsXidStart rejects before
// consulting IsIdStart; return false for every other value.
bool OmitXidStart(int character) {
	switch (character) {
	case 0x037A:	// GREEK YPOGEGRAMMENI
	case 0x0E33:	// THAI CHARACTER SARA AM
	case 0x0EB3:	// LAO VOWEL SIGN AM
	case 0x309B:	// KATAKANA-HIRAGANA VOICED SOUND MARK
	case 0x309C:	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
	case 0xFC5E:	// ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
	case 0xFC5F:	// ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
	case 0xFC60:	// ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
	case 0xFC61:	// ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
	case 0xFC62:	// ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
	case 0xFC63:	// ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
	case 0xFDFA:	// ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
	case 0xFDFB:	// ARABIC LIGATURE JALLAJALALOUHOU
	case 0xFE70:	// ARABIC FATHATAN ISOLATED FORM
	case 0xFE72:	// ARABIC DAMMATAN ISOLATED FORM
	case 0xFE74:	// ARABIC KASRATAN ISOLATED FORM
	case 0xFE76:	// ARABIC FATHA ISOLATED FORM
	case 0xFE78:	// ARABIC DAMMA ISOLATED FORM
	case 0xFE7A:	// ARABIC KASRA ISOLATED FORM
	case 0xFE7C:	// ARABIC SHADDA ISOLATED FORM
	case 0xFE7E:	// ARABIC SUKUN ISOLATED FORM
	case 0xFF9E:	// HALFWIDTH KATAKANA VOICED SOUND MARK
	case 0xFF9F:	// HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
		return true;
	default:
		return false;
	}
}
// Return true for the characters listed below, which IsXidContinue rejects before
// consulting IsIdContinue; return false for every other value.
bool OmitXidContinue(int character) {
	switch (character) {
	case 0x037A:	// GREEK YPOGEGRAMMENI
	case 0x309B:	// KATAKANA-HIRAGANA VOICED SOUND MARK
	case 0x309C:	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
	case 0xFC5E:	// ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
	case 0xFC5F:	// ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
	case 0xFC60:	// ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
	case 0xFC61:	// ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
	case 0xFC62:	// ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
	case 0xFC63:	// ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
	case 0xFDFA:	// ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
	case 0xFDFB:	// ARABIC LIGATURE JALLAJALALOUHOU
	case 0xFE70:	// ARABIC FATHATAN ISOLATED FORM
	case 0xFE72:	// ARABIC DAMMATAN ISOLATED FORM
	case 0xFE74:	// ARABIC KASRATAN ISOLATED FORM
	case 0xFE76:	// ARABIC FATHA ISOLATED FORM
	case 0xFE78:	// ARABIC DAMMA ISOLATED FORM
	case 0xFE7A:	// ARABIC KASRA ISOLATED FORM
	case 0xFE7C:	// ARABIC SHADDA ISOLATED FORM
	case 0xFE7E:	// ARABIC SUKUN ISOLATED FORM
		return true;
	default:
		return false;
	}
}
3803 // UAX #31 defines ID_Start as
3804 // [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
3805 bool IsIdStart(int character
) {
3806 if (IsIdPattern(character
)) {
3809 const OtherID oid
= OtherIDOfCharacter(character
);
3810 if (oid
== OtherID::oidStart
) {
3813 const CharacterCategory c
= CategoriseCharacter(character
);
3814 return (c
== ccLl
|| c
== ccLu
|| c
== ccLt
|| c
== ccLm
|| c
== ccLo
3818 // UAX #31 defines ID_Continue as
3819 // [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
3820 bool IsIdContinue(int character
) {
3821 if (IsIdPattern(character
)) {
3824 const OtherID oid
= OtherIDOfCharacter(character
);
3825 if (oid
!= OtherID::oidNone
) {
3828 const CharacterCategory c
= CategoriseCharacter(character
);
3829 return (c
== ccLl
|| c
== ccLu
|| c
== ccLt
|| c
== ccLm
|| c
== ccLo
3830 || c
== ccNl
|| c
== ccMn
|| c
== ccMc
|| c
== ccNd
|| c
== ccPc
);
3833 // XID_Start is ID_Start modified for Normalization Form KC in UAX #31
3834 bool IsXidStart(int character
) {
3835 if (OmitXidStart(character
)) {
3838 return IsIdStart(character
);
3842 // XID_Continue is ID_Continue modified for Normalization Form KC in UAX #31
3843 bool IsXidContinue(int character
) {
3844 if (OmitXidContinue(character
)) {
3847 return IsIdContinue(character
);