1 // Scintilla source code edit control
2 /** @file CharacterCategory.cxx
3 ** Returns the Unicode general category of a character.
4 ** Table automatically regenerated by scripts/GenerateCharacterCategory.py
5 ** Should only be rarely regenerated for new versions of Unicode.
7 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
8 // The License.txt file describes the conditions under which this software may be distributed.
14 #include "Scintilla.h" // for ptrdiff_t in Position.h
15 #include "CharacterCategory.h"
16 #include "Position.h" // for Sci::clamp
21 // Use an unnamed namespace to protect the declarations from name conflicts
23 const int catRanges
[] = {
24 //++Autogenerated -- start of section automatically generated
25 // Created with Python 3.7.0, Unicode 11.0.0
3797 //--Autogenerated -- end of section automatically generated
// Largest character value accepted by the category lookup.
constexpr int maxUnicode = 0x10ffff;
// Mask selecting the category bits (the low 5 bits) of a catRanges entry.
constexpr int maskCategory = 0x1F;
3805 // Each element in catRanges is the start of a range of Unicode characters in
3806 // one general category.
3807 // The value is composed of a 21-bit character value shifted left 5 bits and a 5-bit
3808 // category matching the CharacterCategory enumeration.
3809 // Initial version has 3249 entries and adds about 13K to the executable.
3810 // The array is in ascending order so can be searched using binary search.
3811 // Therefore the average call takes log2(3249) = 12 comparisons.
3812 // For speed, it may be useful to make a linear table for the common values,
3813 // possibly for 0..0xff for most Western European text or 0..0xfff for most
3814 // alphabetic languages.
// Look up the general category of 'character' in the sorted catRanges table.
// The search key is the character scaled into the upper bits with every
// category bit set; the table entry immediately before the position found
// by std::lower_bound supplies the category bits that are returned.
3816 CharacterCategory
CategoriseCharacter(int character
) {
// Characters below 0 or above maxUnicode are handled separately from the table search.
3817 if (character
< 0 || character
> maxUnicode
)
// Multiplying by (maskCategory+1) moves the character above the category bits;
// adding maskCategory sets all of the category bits in the key.
3819 const int baseValue
= character
* (maskCategory
+1) + maskCategory
;
// First table entry that is not less than the key.
3820 const int *placeAfter
= std::lower_bound(catRanges
, std::end(catRanges
), baseValue
);
// The preceding entry's category bits are the result.
3821 return static_cast<CharacterCategory
>(*(placeAfter
-1) & maskCategory
);
3824 // Implementation of character sets recommended for identifiers in Unicode Standard Annex #31.
3825 // http://unicode.org/reports/tr31/
// Result of OtherIDOfCharacter: whether a character is individually listed
// as an identifier start character, an identifier continue character, or neither.
enum class OtherID { oidNone, oidStart, oidContinue };

// Some characters are treated as valid for identifiers even
// though most characters from their category are not.
// Values copied from http://www.unicode.org/Public/9.0.0/ucd/PropList.txt
// Returns OtherID::oidStart or OtherID::oidContinue for the characters
// listed below and OtherID::oidNone for every other value.
OtherID OtherIDOfCharacter(int character) {
	if (
		(character == 0x1885) ||	// MONGOLIAN LETTER ALI GALI BALUDA
		(character == 0x1886) ||	// MONGOLIAN LETTER ALI GALI THREE BALUDA
		(character == 0x2118) ||	// SCRIPT CAPITAL P
		(character == 0x212E) ||	// ESTIMATED SYMBOL
		(character == 0x309B) ||	// KATAKANA-HIRAGANA VOICED SOUND MARK
		(character == 0x309C)) {	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
		return OtherID::oidStart;
	} else if (
		(character == 0x00B7) ||	// MIDDLE DOT
		(character == 0x0387) ||	// GREEK ANO TELEIA
		((character >= 0x1369) && (character <= 0x1371)) ||	// ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
		(character == 0x19DA)) {	// NEW TAI LUE THAM DIGIT ONE
		return OtherID::oidContinue;
	} else {
		return OtherID::oidNone;
	}
}
// Determine if a character is in Ll|Lu|Lt|Lm|Lo|Nl|Mn|Mc|Nd|Pc and has
// Pattern_Syntax|Pattern_White_Space.
// As of Unicode 9, only VERTICAL TILDE which is in Lm and has Pattern_Syntax matches,
// so the check is a single comparison against U+2E2F.
// Should really generate from PropList.txt a list of Pattern_Syntax and Pattern_White_Space.
bool IsIdPattern(int character) {
	return character == 0x2E2F;
}
// Returns true when 'character' is one of the code points listed here,
// which are the characters left out when deciding XID_Start;
// returns false for every other value.
bool OmitXidStart(int character) {
	switch (character) {
	case 0x037A:	// GREEK YPOGEGRAMMENI
	case 0x0E33:	// THAI CHARACTER SARA AM
	case 0x0EB3:	// LAO VOWEL SIGN AM
	case 0x309B:	// KATAKANA-HIRAGANA VOICED SOUND MARK
	case 0x309C:	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
	case 0xFC5E:	// ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
	case 0xFC5F:	// ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
	case 0xFC60:	// ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
	case 0xFC61:	// ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
	case 0xFC62:	// ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
	case 0xFC63:	// ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
	case 0xFDFA:	// ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
	case 0xFDFB:	// ARABIC LIGATURE JALLAJALALOUHOU
	case 0xFE70:	// ARABIC FATHATAN ISOLATED FORM
	case 0xFE72:	// ARABIC DAMMATAN ISOLATED FORM
	case 0xFE74:	// ARABIC KASRATAN ISOLATED FORM
	case 0xFE76:	// ARABIC FATHA ISOLATED FORM
	case 0xFE78:	// ARABIC DAMMA ISOLATED FORM
	case 0xFE7A:	// ARABIC KASRA ISOLATED FORM
	case 0xFE7C:	// ARABIC SHADDA ISOLATED FORM
	case 0xFE7E:	// ARABIC SUKUN ISOLATED FORM
	case 0xFF9E:	// HALFWIDTH KATAKANA VOICED SOUND MARK
	case 0xFF9F:	// HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
		return true;
	default:
		return false;
	}
}
// Returns true when 'character' is one of the code points listed here,
// which are the characters left out when deciding XID_Continue;
// returns false for every other value.
bool OmitXidContinue(int character) {
	switch (character) {
	case 0x037A:	// GREEK YPOGEGRAMMENI
	case 0x309B:	// KATAKANA-HIRAGANA VOICED SOUND MARK
	case 0x309C:	// KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
	case 0xFC5E:	// ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
	case 0xFC5F:	// ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
	case 0xFC60:	// ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
	case 0xFC61:	// ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
	case 0xFC62:	// ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
	case 0xFC63:	// ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
	case 0xFDFA:	// ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
	case 0xFDFB:	// ARABIC LIGATURE JALLAJALALOUHOU
	case 0xFE70:	// ARABIC FATHATAN ISOLATED FORM
	case 0xFE72:	// ARABIC DAMMATAN ISOLATED FORM
	case 0xFE74:	// ARABIC KASRATAN ISOLATED FORM
	case 0xFE76:	// ARABIC FATHA ISOLATED FORM
	case 0xFE78:	// ARABIC DAMMA ISOLATED FORM
	case 0xFE7A:	// ARABIC KASRA ISOLATED FORM
	case 0xFE7C:	// ARABIC SHADDA ISOLATED FORM
	case 0xFE7E:	// ARABIC SUKUN ISOLATED FORM
		return true;
	default:
		return false;
	}
}
3922 // UAX #31 defines ID_Start as
3923 // [[:L:][:Nl:][:Other_ID_Start:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
3924 bool IsIdStart(int character
) {
3925 if (IsIdPattern(character
)) {
3928 const OtherID oid
= OtherIDOfCharacter(character
);
3929 if (oid
== OtherID::oidStart
) {
3932 const CharacterCategory c
= CategoriseCharacter(character
);
3933 return (c
== ccLl
|| c
== ccLu
|| c
== ccLt
|| c
== ccLm
|| c
== ccLo
3937 // UAX #31 defines ID_Continue as
3938 // [[:ID_Start:][:Mn:][:Mc:][:Nd:][:Pc:][:Other_ID_Continue:]--[:Pattern_Syntax:]--[:Pattern_White_Space:]]
3939 bool IsIdContinue(int character
) {
3940 if (IsIdPattern(character
)) {
3943 const OtherID oid
= OtherIDOfCharacter(character
);
3944 if (oid
!= OtherID::oidNone
) {
3947 const CharacterCategory c
= CategoriseCharacter(character
);
3948 return (c
== ccLl
|| c
== ccLu
|| c
== ccLt
|| c
== ccLm
|| c
== ccLo
3949 || c
== ccNl
|| c
== ccMn
|| c
== ccMc
|| c
== ccNd
|| c
== ccPc
);
3952 // XID_Start is ID_Start modified for Normalization Form KC in UAX #31
3953 bool IsXidStart(int character
) {
3954 if (OmitXidStart(character
)) {
3957 return IsIdStart(character
);
3961 // XID_Continue is ID_Continue modified for Normalization Form KC in UAX #31
3962 bool IsXidContinue(int character
) {
3963 if (OmitXidContinue(character
)) {
3966 return IsIdContinue(character
);
// Default constructor for CharacterCategoryMap; declared noexcept.
3970 CharacterCategoryMap::CharacterCategoryMap() noexcept
{
3974 int CharacterCategoryMap::Size() const noexcept
{
3975 return static_cast<int>(dense
.size());
// Populate 'dense' with one category value per character, expanding the
// packed ranges stored in catRanges.
// countCharacters is the requested number of characters to cover.
3978 void CharacterCategoryMap::Optimize(int countCharacters
) {
// Number of characters actually covered: the request passed through
// Sci::clamp with bounds 256 and maxUnicode + 1.
3979 const int characters
= Sci::clamp(countCharacters
, 256, maxUnicode
+ 1);
// One element per covered character.
3980 dense
.resize(characters
);
// 'current' starts out holding a packed catRanges entry.
3984 int current
= catRanges
[index
];
// 'next' is the packed catRanges entry read at 'index'.
3987 const int next
= catRanges
[index
];
// The bits selected by maskCategory are the category of the range held in 'current'.
3988 const unsigned char category
= current
& maskCategory
;
// The run stops at the character value of 'next' (next >> 5), capped at 'characters'.
3990 end
= std::min(characters
, next
>> 5);
// Store the category at every index from 'current' up to, but not including, 'end'.
3991 while (current
< end
) {
3992 dense
[current
++] = category
;
// Keep going while 'characters' is still greater than 'end'.
3996 } while (characters
> end
);