// Scintilla source code edit control
/** @file UniConversion.h
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
8 #ifndef UNICONVERSION_H
9 #define UNICONVERSION_H
11 namespace Scintilla::Internal
{
// Longest UTF-8 encoding of a single character, in bytes.
constexpr int UTF8MaxBytes = 4;

// U+FFFD REPLACEMENT CHARACTER.
constexpr int unicodeReplacementChar = 0xFFFD;
// Conversion and measurement routines between UTF-8, UTF-16 and UTF-32 text.
// Only the declarations appear in this header.

// Measuring and producing UTF-8 from wide (wchar_t) or UTF-32 input.
size_t UTF8Length(std::wstring_view wsv) noexcept;
size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept;
void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept;
void UTF8FromUTF32Character(int uch, char *putf) noexcept;

// Measuring and producing UTF-16 / UTF-32 from UTF-8 input.
// Note that the two buffer-filling conversions are not declared noexcept,
// unlike the corresponding length queries.
size_t UTF16Length(std::string_view svu8) noexcept;
size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen);
size_t UTF32Length(std::string_view svu8) noexcept;
size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen);

// WStringFromUTF8 does the right thing when wchar_t is 2 or 4 bytes so
// works on both Windows and Unix.
std::wstring WStringFromUTF8(std::string_view svu8);

unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;
bool UTF8IsValid(std::string_view svu8) noexcept;
std::string FixInvalidUTF8(const std::string &text);
// Table with one entry for each possible byte value; UnicodeFromUTF8 indexes
// it with the first byte of a sequence. Declared here, defined outside this header.
extern const unsigned char UTF8BytesOfLead[256];
34 inline int UnicodeFromUTF8(const unsigned char *us
) noexcept
{
35 switch (UTF8BytesOfLead
[us
[0]]) {
39 return ((us
[0] & 0x1F) << 6) + (us
[1] & 0x3F);
41 return ((us
[0] & 0xF) << 12) + ((us
[1] & 0x3F) << 6) + (us
[2] & 0x3F);
43 return ((us
[0] & 0x7) << 18) + ((us
[1] & 0x3F) << 12) + ((us
[2] & 0x3F) << 6) + (us
[3] & 0x3F);
// Reports whether ch lies in the range 0x80..0xBF inclusive, which is the
// range used by UTF-8 continuation (trail) bytes.
inline constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
	return (ch >= 0x80) && (ch < 0xc0);
}
51 inline constexpr bool UTF8IsAscii(unsigned char ch
) noexcept
{
55 inline constexpr bool UTF8IsAscii(char ch
) noexcept
{
56 const unsigned char uch
= ch
;
// Masks applied to the value returned by UTF8Classify.
// NOTE(review): presumably the low three bits hold the byte width and
// UTF8MaskInvalid flags an invalid sequence - confirm against the definition.
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };

// Classifies the len bytes starting at us. Declared here only.
int UTF8Classify(const unsigned char *us, size_t len) noexcept;

// Convenience overload: forwards the view's bytes and length to the
// pointer/length overload, viewing the chars as unsigned char.
inline int UTF8Classify(std::string_view sv) noexcept {
	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
}
// Similar to UTF8Classify but returns a length of 1 for invalid bytes
// instead of setting the invalid flag.
// Note that len is an int here, whereas UTF8Classify takes a size_t.
int UTF8DrawBytes(const unsigned char *us, int len) noexcept;
// Line separator is U+2028 \xe2\x80\xa8
// Paragraph separator is U+2029 \xe2\x80\xa9
constexpr int UTF8SeparatorLength = 3;

// Reports whether the bytes at us are the UTF-8 encoding of U+2028 or U+2029.
// Up to UTF8SeparatorLength bytes are read; evaluation stops at the first
// byte that does not match, so later bytes are only read after a match.
inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
	return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
}
// NEL is U+0085 \xc2\x85
constexpr int UTF8NELLength = 2;

// Reports whether the bytes at us are the UTF-8 encoding of NEL (U+0085).
// The second byte is only read when the first byte is 0xc2.
inline bool UTF8IsNEL(const unsigned char *us) noexcept {
	return (us[0] == 0xc2) && (us[1] == 0x85);
}
// Is the sequence of 3 char a UTF-8 line end? Only the last two char are tested for a NEL.
// Matches either a three-byte line/paragraph separator (e2 80 a8 or e2 80 a9)
// in ch0..ch2, or the two-byte NEL (c2 85) in ch1..ch2 with ch0 ignored.
constexpr bool UTF8IsMultibyteLineEnd(unsigned char ch0, unsigned char ch1, unsigned char ch2) noexcept {
	return
		((ch0 == 0xe2) && (ch1 == 0x80) && ((ch2 == 0xa8) || (ch2 == 0xa9))) ||
		((ch1 == 0xc2) && (ch2 == 0x85));
}
// Boundaries of the UTF-16 surrogate ranges and the first code point that
// lies beyond the Basic Multilingual Plane.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_LEAD_LAST = 0xDBFF };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };

// Number of UTF-16 code units in the character that starts with uch:
// 2 when uch is a lead surrogate, otherwise 1.
inline constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
}
100 inline constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount
) noexcept
{
101 return (byteCount
< 4) ? 1 : 2;