1 // Scintilla source code edit control
2 /** @file CharacterCategory.cxx
3 ** Returns the Unicode general category of a character.
4 ** Table automatically regenerated by scripts/GenerateCharacterCategory.py
5 ** Should only be rarely regenerated for new versions of Unicode.
7 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
8 // The License.txt file describes the conditions under which this software may be distributed.
12 #include "StringCopy.h"
13 #include "CharacterCategory.h"
20 // Use an unnamed namespace to protect the declarations from name conflicts
22 const int catRanges
[] = {
23 //++Autogenerated -- start of section automatically generated
24 // Created with Python 3.3.0, Unicode 6.1.0
3274 //--Autogenerated -- end of section automatically generated
3277 const int maxUnicode
= 0x10ffff;
3278 const int maskCategory
= 0x1F;
3279 const int nRanges
= ELEMENTS(catRanges
);
3283 // Each element in catRanges is the start of a range of Unicode characters in
3284 // one general category.
3285 // Each value is composed of a 21-bit character value shifted left by 5 bits and a 5-bit
3286 // category matching the CharacterCategory enumeration.
3287 // Initial version has 3249 entries and adds about 13K to the executable.
3288 // The array is in ascending order so can be searched using binary search.
3289 // Therefore the average call takes about log2(3249), roughly 12, comparisons.
3290 // For speed, it may be useful to make a linear table for the common values,
3291 // possibly for 0..0xff for most Western European text or 0..0xfff for most
3292 // alphabetic languages.
3294 CharacterCategory
CategoriseCharacter(int character
) {
3295 if (character
< 0 || character
> maxUnicode
)
3297 const int baseValue
= character
* (maskCategory
+1) + maskCategory
;
3298 const int *placeAfter
= std::lower_bound(catRanges
, catRanges
+nRanges
, baseValue
);
3299 return static_cast<CharacterCategory
>(*(placeAfter
-1) & maskCategory
);
3302 #ifdef SCI_NAMESPACE