// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
12 #include <string_view>
14 #include "UniConversion.h"
16 namespace Scintilla::Internal
{
// Walks the wide-character view one element at a time, stopping at the end of the
// view or at the first zero element, and sorts each element into a size class:
// below 0x800, or inside the range SURROGATE_LEAD_FIRST .. SURROGATE_TRAIL_LAST.
// The first branch of the chain, the statements that accumulate the length and
// the return are not visible in this part of the file.
// NOTE(review): presumably returns the number of UTF-8 bytes needed to encode
// wsv - confirm against the complete function body.
18 size_t UTF8Length(std::wstring_view wsv
) noexcept
{
// The loop header has no increment expression, so i has to be advanced inside
// the loop body.
20 for (size_t i
= 0; i
< wsv
.length() && wsv
[i
];) {
// Widen the element to unsigned int before the range comparisons below.
21 const unsigned int uch
= wsv
[i
];
24 } else if (uch
< 0x800) {
// Any value in the surrogate range, lead or trail, takes this branch.
26 } else if ((uch
>= SURROGATE_LEAD_FIRST
) &&
27 (uch
<= SURROGATE_TRAIL_LAST
)) {
// Scans u8Text from its start, stepping one UTF-8 sequence at a time, while
// counting how many UTF-16 units the sequences seen so far correspond to.
// Scanning stops when the count reaches positionUTF16 or the text is exhausted.
// The end of the function is not visible here.
// NOTE(review): presumably returns positionUTF8, the byte offset reached - confirm.
38 size_t UTF8PositionFromUTF16Position(std::string_view u8Text
, size_t positionUTF16
) noexcept
{
39 size_t positionUTF8
= 0;
// No increment expression: both counters are advanced in the body.
40 for (size_t lengthUTF16
= 0; (positionUTF8
< u8Text
.length()) && (lengthUTF16
< positionUTF16
);) {
// Read as unsigned char so the byte can index the 256 entry table.
41 const unsigned char uch
= u8Text
[positionUTF8
];
// Sequence length is taken from the lead byte alone; trail bytes are not examined.
42 const unsigned int byteCount
= UTF8BytesOfLead
[uch
];
43 lengthUTF16
+= UTF16LengthFromUTF8ByteCount(byteCount
);
// May step beyond u8Text.length() when the final sequence is cut short;
// the loop condition then ends the scan.
44 positionUTF8
+= byteCount
;
// Encodes the wide-character text in wsv as UTF-8 bytes stored through putf.
// Input is consumed until the end of the view or the first zero element.
// The declaration of the output index k, the first branch of the chain, the
// final else, the advance of i and whatever use is made of len are not visible
// in this part of the file.
// NOTE(review): len is presumably the capacity of putf - confirm how it is used.
50 void UTF8FromUTF16(std::wstring_view wsv
, char *putf
, size_t len
) noexcept
{
// No increment expression: i is advanced in the loop body.
52 for (size_t i
= 0; i
< wsv
.length() && wsv
[i
];) {
53 const unsigned int uch
= wsv
[i
];
// One byte form: the value is stored unchanged.
55 putf
[k
++] = static_cast<char>(uch
);
56 } else if (uch
< 0x800) {
// Two byte form: 110xxxxx 10xxxxxx.
57 putf
[k
++] = static_cast<char>(0xC0 | (uch
>> 6));
58 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
59 } else if ((uch
>= SURROGATE_LEAD_FIRST
) &&
60 (uch
<= SURROGATE_TRAIL_LAST
)) {
61 // Half a surrogate pair
// Combine the low 10 bits of uch (as the high half) with the low 10 bits of
// wsv[i] and add 0x10000 to form the code point.
// NOTE(review): i is presumably incremented on a line not shown so that wsv[i]
// is the second half of the pair - confirm.
63 const unsigned int xch
= 0x10000 + ((uch
& 0x3ff) << 10) + (wsv
[i
] & 0x3ff);
// Four byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
64 putf
[k
++] = static_cast<char>(0xF0 | (xch
>> 18));
65 putf
[k
++] = static_cast<char>(0x80 | ((xch
>> 12) & 0x3f));
66 putf
[k
++] = static_cast<char>(0x80 | ((xch
>> 6) & 0x3f));
67 putf
[k
++] = static_cast<char>(0x80 | (xch
& 0x3f));
// Three byte form: 1110xxxx 10xxxxxx 10xxxxxx.
69 putf
[k
++] = static_cast<char>(0xE0 | (uch
>> 12));
70 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
71 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// Encodes the single value uch as UTF-8 bytes stored through putf, choosing a
// 1, 2, 3 or 4 byte form from the thresholds 0x800 and 0x10000 visible below.
// The declaration of the output index k, the first branch of the chain, the
// final else and the end of the function are not visible in this part of the file.
// NOTE(review): no buffer size is passed, so putf presumably must have room for
// the longest form plus anything written at the end - confirm with callers.
79 void UTF8FromUTF32Character(int uch
, char *putf
) noexcept
{
// One byte form: the value is stored unchanged.
82 putf
[k
++] = static_cast<char>(uch
);
83 } else if (uch
< 0x800) {
// Two byte form: 110xxxxx 10xxxxxx.
84 putf
[k
++] = static_cast<char>(0xC0 | (uch
>> 6));
85 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
86 } else if (uch
< 0x10000) {
// Three byte form: 1110xxxx 10xxxxxx 10xxxxxx.
87 putf
[k
++] = static_cast<char>(0xE0 | (uch
>> 12));
88 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
89 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// Four byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
91 putf
[k
++] = static_cast<char>(0xF0 | (uch
>> 18));
92 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 12) & 0x3f));
93 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
94 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// Loops over the UTF-8 bytes of svu8. For the byte at i it looks up the sequence
// length in UTF8BytesOfLead and converts that with UTF16LengthFromUTF8ByteCount.
// The declaration of ulen, the advance of i and the return are not visible in
// this part of the file.
// NOTE(review): presumably returns the number of UTF-16 units needed to hold
// svu8 - confirm against the complete function body.
99 size_t UTF16Length(std::string_view svu8
) noexcept
{
// No increment expression: i is advanced in the loop body.
101 for (size_t i
= 0; i
< svu8
.length();) {
// Read as unsigned char so the byte can index the 256 entry table.
102 const unsigned char ch
= svu8
[i
];
103 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
104 const unsigned int utf16Len
= UTF16LengthFromUTF8ByteCount(byteCount
);
// When i lies beyond the end of the input the contribution is 1 instead of utf16Len.
// NOTE(review): i was presumably advanced by byteCount on a line not shown, so
// this covers a final sequence cut short by the end of the text - confirm.
106 ulen
+= (i
> svu8
.length()) ? 1 : utf16Len
;
// Extract the payload carried by a UTF-8 trail byte.
// A trail byte has the form 0b10xx'xxxx: the top 2 bits are 0b10 to indicate a
// trail byte and the lower 6 bits contain the value, so masking with
// 0b0011'1111 yields that value.
// The argument is not validated; for any other byte the result is simply its
// low 6 bits.
constexpr unsigned char TrailByteValue(unsigned char c) {
	return c & 0b0011'1111;
}
// Decodes the UTF-8 text in svu8 into wchar_t elements stored through tbuf.
// Unlike the length functions in this file it is not declared noexcept: it throws
// std::runtime_error when an output of outLen elements would not fit, that is
// when ui + outLen > tlen.
// The declarations of ui and value, the selection between the 2, 3 and 4 byte
// forms, the lines that fetch each following byte into ch, the advance of ui and
// the return are not visible in this part of the file.
// NOTE(review): presumably returns the number of elements written - confirm.
117 size_t UTF16FromUTF8(std::string_view svu8
, wchar_t *tbuf
, size_t tlen
) {
// No increment expression: i is advanced in the loop body.
119 for (size_t i
= 0; i
< svu8
.length();) {
// ch is not const; NOTE(review): it is presumably reassigned to each following
// byte of the sequence on lines not shown - confirm.
120 unsigned char ch
= svu8
[i
];
121 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
// The lead byte announces more bytes than remain in the input.
124 if (i
+ byteCount
> svu8
.length()) {
125 // Trying to read past end but still have space to write
133 const size_t outLen
= UTF16LengthFromUTF8ByteCount(byteCount
);
// Refuse to write past the tlen elements allowed for tbuf.
134 if (ui
+ outLen
> tlen
) {
135 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
// Two byte form: low 5 bits of the lead byte, then 6 bits from a trail byte.
144 value
= (ch
& 0x1F) << 6;
146 value
+= TrailByteValue(ch
);
147 tbuf
[ui
] = static_cast<wchar_t>(value
);
// Three byte form: low 4 bits of the lead byte, then 6 bits from each of two trail bytes.
150 value
= (ch
& 0xF) << 12;
152 value
+= (TrailByteValue(ch
) << 6);
154 value
+= TrailByteValue(ch
);
155 tbuf
[ui
] = static_cast<wchar_t>(value
);
158 // Outside the BMP so need two surrogates
// Four byte form: low 3 bits of the lead byte, then 6 bits from each of three trail bytes.
159 value
= (ch
& 0x7) << 18;
161 value
+= TrailByteValue(ch
) << 12;
163 value
+= TrailByteValue(ch
) << 6;
165 value
+= TrailByteValue(ch
);
// First element: bits above the low 10 of (value - 0x10000), offset by SURROGATE_LEAD_FIRST.
166 tbuf
[ui
] = static_cast<wchar_t>(((value
- 0x10000) >> 10) + SURROGATE_LEAD_FIRST
);
// Second element: low 10 bits of value, offset by SURROGATE_TRAIL_FIRST.
// NOTE(review): ui is presumably incremented between these two stores on a
// line not shown - confirm.
168 tbuf
[ui
] = static_cast<wchar_t>((value
& 0x3ff) + SURROGATE_TRAIL_FIRST
);
// Loops over the UTF-8 bytes of svu8, looking up the sequence length for the
// byte at i in UTF8BytesOfLead.
// The counter, the advance of i and the return are not visible in this part of
// the file.
// NOTE(review): presumably returns the number of UTF-32 elements needed to hold
// svu8 - confirm against the complete function body.
176 size_t UTF32Length(std::string_view svu8
) noexcept
{
// No increment expression: i is advanced in the loop body.
178 for (size_t i
= 0; i
< svu8
.length();) {
// Read as unsigned char so the byte can index the 256 entry table.
179 const unsigned char ch
= svu8
[i
];
180 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
// Decodes the UTF-8 text in svu8 into unsigned int elements stored through tbuf.
// It is not declared noexcept and contains a throw of std::runtime_error with the
// message "UTF32FromUTF8: attempted write beyond end".
// The output index, the declaration of value, the condition guarding the throw,
// the selection between the 2, 3 and 4 byte forms, the lines that fetch each
// following byte into ch, the stores into tbuf and the return are not visible
// in this part of the file.
// NOTE(review): presumably tlen is the capacity of tbuf and the return value is
// the number of elements written - confirm.
187 size_t UTF32FromUTF8(std::string_view svu8
, unsigned int *tbuf
, size_t tlen
) {
// No increment expression: i is advanced in the loop body.
189 for (size_t i
= 0; i
< svu8
.length();) {
// ch is not const; NOTE(review): it is presumably reassigned to each following
// byte of the sequence on lines not shown - confirm.
190 unsigned char ch
= svu8
[i
];
191 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
// The lead byte announces more bytes than remain in the input.
194 if (i
+ byteCount
> svu8
.length()) {
195 // Trying to read past end but still have space to write
204 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
// Two byte form: low 5 bits of the lead byte, then 6 bits from a trail byte.
213 value
= (ch
& 0x1F) << 6;
215 value
+= TrailByteValue(ch
);
// Three byte form: low 4 bits of the lead byte, then 6 bits from each of two trail bytes.
218 value
= (ch
& 0xF) << 12;
220 value
+= TrailByteValue(ch
) << 6;
222 value
+= TrailByteValue(ch
);
// Four byte form: low 3 bits of the lead byte, then 6 bits from each of three trail bytes.
225 value
= (ch
& 0x7) << 18;
227 value
+= TrailByteValue(ch
) << 12;
229 value
+= TrailByteValue(ch
) << 6;
231 value
+= TrailByteValue(ch
);
// Builds a std::wstring from the UTF-8 text in svu8.
// The encoding is chosen at compile time from sizeof(wchar_t): when it is 2 the
// string is sized with UTF16Length and filled by UTF16FromUTF8; the other visible
// path sizes it with UTF32Length and fills it with UTF32FromUTF8.
// Each path measures first and then converts into a string of exactly that
// length, pre-filled with zeros.
// The else and the return statements are not visible in this part of the file.
240 std::wstring
WStringFromUTF8(std::string_view svu8
) {
241 if constexpr (sizeof(wchar_t) == 2) {
242 const size_t len16
= UTF16Length(svu8
);
243 std::wstring
ws(len16
, 0);
244 UTF16FromUTF8(svu8
, &ws
[0], len16
);
247 const size_t len32
= UTF32Length(svu8
);
248 std::wstring
ws(len32
, 0);
// The wchar_t storage is handed to UTF32FromUTF8 as unsigned int.
// NOTE(review): this relies on wchar_t having the same size and representation
// as unsigned int on platforms that take this path - confirm.
249 UTF32FromUTF8(svu8
, reinterpret_cast<unsigned int *>(&ws
[0]), len32
);
// Stores the value val through tbuf as either one element or a pair of surrogates.
// A value below SUPPLEMENTAL_PLANE_FIRST is stored unchanged in tbuf[0].
// The other visible path stores two elements: tbuf[0] receives the bits above the
// low 10 of (val - SUPPLEMENTAL_PLANE_FIRST) offset by SURROGATE_LEAD_FIRST, and
// tbuf[1] receives the low 10 bits of val offset by SURROGATE_TRAIL_FIRST.
// The else and the return statements are not visible in this part of the file.
// NOTE(review): presumably returns the number of elements written (1 or 2), and
// tbuf must have room for two elements - confirm.
254 unsigned int UTF16FromUTF32Character(unsigned int val
, wchar_t *tbuf
) noexcept
{
255 if (val
< SUPPLEMENTAL_PLANE_FIRST
) {
256 tbuf
[0] = static_cast<wchar_t>(val
);
259 tbuf
[0] = static_cast<wchar_t>(((val
- SUPPLEMENTAL_PLANE_FIRST
) >> 10) + SURROGATE_LEAD_FIRST
);
260 tbuf
[1] = static_cast<wchar_t>((val
& 0x3ff) + SURROGATE_TRAIL_FIRST
);
// Length in bytes of the UTF-8 sequence introduced by each possible byte value,
// indexed by that byte.
// C2-DF give 2, E0-EF give 3 and F0-F4 give 4. Every other value gives 1:
// 00-7F, 80-BF, C0, C1 and F5-FF, so code that steps through text by these
// lengths always advances by at least one byte.
const unsigned char UTF8BytesOfLead[256] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
};
// The result combines a width with the flag UTF8MaskInvalid: every return
// visible below is UTF8MaskInvalid or-ed with 1, 3 or 4.
// The returns for valid sequences, the branching on byteCount and several
// conditions are not visible in this part of the file.
284 // Return both the width of the first character in the string and a status
285 // saying whether it is valid or invalid.
286 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
287 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
288 // reasonably treated as code points in some circumstances. They will, however,
289 // not have associated glyphs.
290 int UTF8Classify(const unsigned char *us
, size_t len
) noexcept
{
291 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
// Expected sequence length, taken from the first byte alone.
297 const size_t byteCount
= UTF8BytesOfLead
[us
[0]];
// A table length of 1, or a sequence longer than the len bytes available, is
// reported as invalid with width 1.
// NOTE(review): plain ASCII bytes also have table length 1, so they are
// presumably handled by an earlier check that is not visible here - confirm.
298 if (byteCount
== 1 || byteCount
> len
) {
300 return UTF8MaskInvalid
| 1;
303 if (!UTF8IsTrailByte(us
[1])) {
304 // Invalid trail byte
305 return UTF8MaskInvalid
| 1;
// Checks for a sequence whose third byte is a trail byte.
313 if (UTF8IsTrailByte(us
[2])) {
// E0 followed by 80..9F: the three bytes would encode a value below 0x800.
314 if ((*us
== 0xe0) && ((us
[1] & 0xe0) == 0x80)) {
316 return UTF8MaskInvalid
| 1;
// ED followed by A0..BF: the three bytes would encode D800..DFFF.
318 if ((*us
== 0xed) && ((us
[1] & 0xe0) == 0xa0)) {
320 return UTF8MaskInvalid
| 1;
322 if ((*us
== 0xef) && (us
[1] == 0xbf) && (us
[2] == 0xbe)) {
323 // U+FFFE non-character - 3 bytes long
324 return UTF8MaskInvalid
| 3;
326 if ((*us
== 0xef) && (us
[1] == 0xbf) && (us
[2] == 0xbf)) {
327 // U+FFFF non-character - 3 bytes long
328 return UTF8MaskInvalid
| 3;
// EF B7 followed by 90..AF encodes FDD0..FDEF, reported invalid with width 3.
330 if ((*us
== 0xef) && (us
[1] == 0xb7) && (((us
[2] & 0xf0) == 0x90) || ((us
[2] & 0xf0) == 0xa0))) {
332 return UTF8MaskInvalid
| 3;
// Checks for a sequence whose third and fourth bytes are both trail bytes.
339 if (UTF8IsTrailByte(us
[2]) && UTF8IsTrailByte(us
[3])) {
// Low 4 bits of the second byte all set and the last two bytes BF BE or BF BF:
// the low 16 bits of the encoded value are FFFE or FFFF.
340 if (((us
[1] & 0xf) == 0xf) && (us
[2] == 0xbf) && ((us
[3] == 0xbe) || (us
[3] == 0xbf))) {
341 // *FFFE or *FFFF non-character
342 return UTF8MaskInvalid
| 4;
345 // Check if encoding a value beyond the last Unicode character 10FFFF
347 return UTF8MaskInvalid
| 1;
// F0 followed by 80..8F: the four bytes would encode a value below 0x10000.
349 } else if ((*us
== 0xf0) && ((us
[1] & 0xf0) == 0x80)) {
351 return UTF8MaskInvalid
| 1;
358 return UTF8MaskInvalid
| 1;
361 int UTF8DrawBytes(const char *s
, size_t len
) noexcept
{
362 const int utf8StatusNext
= UTF8Classify(reinterpret_cast<const unsigned char *>(s
), len
);
363 return (utf8StatusNext
& UTF8MaskInvalid
) ? 1 : (utf8StatusNext
& UTF8MaskWidth
);
// Checks the UTF-8 text in svu8 by classifying it one sequence at a time with
// UTF8Classify while tracking the number of bytes still to be examined.
// The final result is whether no bytes remain.
// The body of the invalid branch and the advance of us are not visible in this
// part of the file.
// NOTE(review): the invalid branch presumably returns false at once - confirm.
366 bool UTF8IsValid(std::string_view svu8
) noexcept
{
// View the bytes as unsigned char, the type UTF8Classify takes.
367 const unsigned char *us
= reinterpret_cast<const unsigned char *>(svu8
.data());
368 size_t remaining
= svu8
.length();
369 while (remaining
> 0) {
370 const int utf8Status
= UTF8Classify(us
, remaining
);
371 if (utf8Status
& UTF8MaskInvalid
) {
// Width of the sequence just classified, taken from the UTF8MaskWidth bits.
374 const int lenChar
= utf8Status
& UTF8MaskWidth
;
376 remaining
-= lenChar
;
379 return remaining
== 0;
// Steps through text one sequence at a time using UTF8Classify. For a sequence
// flagged with UTF8MaskInvalid the three bytes EF BF BD are appended to result;
// the other visible path appends the len bytes of the sequence unchanged.
// The declaration of result, the else, the advances of s and remaining and the
// return are not visible in this part of the file.
// NOTE(review): presumably returns result - confirm against the complete body.
382 // Replace invalid bytes in UTF-8 with the replacement character
383 std::string
FixInvalidUTF8(const std::string
&text
) {
385 const char *s
= text
.c_str();
386 size_t remaining
= text
.size();
387 while (remaining
> 0) {
388 const int utf8Status
= UTF8Classify(reinterpret_cast<const unsigned char *>(s
), remaining
);
389 if (utf8Status
& UTF8MaskInvalid
) {
390 // Replacement character 0xFFFD = UTF8:"efbfbd".
391 result
.append("\xef\xbf\xbd");
// Width of the sequence just classified, taken from the UTF8MaskWidth bits.
395 const size_t len
= utf8Status
& UTF8MaskWidth
;
396 result
.append(s
, len
);