1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8, UTF-16 and UTF-32 strings.
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
13 #include "UniConversion.h"
15 using namespace Scintilla
;
// UTF8Length: scans the wchar_t buffer `uptr`, stopping after `tlen` code units
// or at the first zero code unit, and classifies each unit by value range.
// NOTE(review): the statements that accumulate and return the length are not
// visible in this excerpt; presumably the result is the number of UTF-8 bytes
// needed to encode the input - confirm against the complete file.
19 size_t UTF8Length(const wchar_t *uptr
, size_t tlen
) {
// The loop header has no increment clause; `i` is presumably advanced inside
// the body on lines not shown here.
21 for (size_t i
= 0; i
< tlen
&& uptr
[i
];) {
// Widen the code unit to unsigned int before the range comparisons below.
22 const unsigned int uch
= uptr
[i
];
// Code units below 0x800 (after whatever the preceding, unseen branch handled).
25 } else if (uch
< 0x800) {
// Code units inside the range SURROGATE_LEAD_FIRST..SURROGATE_TRAIL_LAST.
27 } else if ((uch
>= SURROGATE_LEAD_FIRST
) &&
28 (uch
<= SURROGATE_TRAIL_LAST
)) {
// UTF8FromUTF16: encodes the UTF-16 code units in `uptr` as UTF-8 bytes stored
// into `putf` through a running output index `k`.
//   uptr / tlen : input code units; conversion stops after `tlen` units or at
//                 the first zero unit, whichever comes first.
//   putf / len  : output buffer and its stated size.
// NOTE(review): the declaration of `k`, the first branch header, the advance of
// `i`, and any use of `len` (bounds check or terminator) are on lines missing
// from this excerpt - confirm against the complete file.
39 void UTF8FromUTF16(const wchar_t *uptr
, size_t tlen
, char *putf
, size_t len
) {
41 for (size_t i
= 0; i
< tlen
&& uptr
[i
];) {
42 const unsigned int uch
= uptr
[i
];
// One-byte form: the code unit is stored unchanged as a single char.
44 putf
[k
++] = static_cast<char>(uch
);
// Two-byte form: lead byte is 0xC0 | bits above the low six, trail byte is
// 0x80 | low six bits.
45 } else if (uch
< 0x800) {
46 putf
[k
++] = static_cast<char>(0xC0 | (uch
>> 6));
47 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
48 } else if ((uch
>= SURROGATE_LEAD_FIRST
) &&
49 (uch
<= SURROGATE_TRAIL_LAST
)) {
50 // Half a surrogate pair
// Combine the low 10 bits of `uch` (shifted up by 10) with the low 10 bits of
// uptr[i] and add 0x10000 to form the code point.
// NOTE(review): the statement that moves `i` on to the second unit is not
// visible here, and no visible check confirms that uptr[i] is still inside
// `tlen` or is really a trail surrogate - verify in the complete file.
52 const unsigned int xch
= 0x10000 + ((uch
& 0x3ff) << 10) + (uptr
[i
] & 0x3ff);
// Four-byte form: 0xF0 lead followed by three 0x80-prefixed six-bit groups.
53 putf
[k
++] = static_cast<char>(0xF0 | (xch
>> 18));
54 putf
[k
++] = static_cast<char>(0x80 | ((xch
>> 12) & 0x3f));
55 putf
[k
++] = static_cast<char>(0x80 | ((xch
>> 6) & 0x3f));
56 putf
[k
++] = static_cast<char>(0x80 | (xch
& 0x3f));
// Three-byte form: 0xE0 lead followed by two 0x80-prefixed six-bit groups.
// (The branch header introducing this case is not visible in this excerpt.)
58 putf
[k
++] = static_cast<char>(0xE0 | (uch
>> 12));
59 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
60 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// UTF8FromUTF32Character: writes the UTF-8 encoding of the single value `uch`
// into `putf` through a running output index `k`, choosing a 1-, 2-, 3- or
// 4-byte form by magnitude.
// NOTE(review): the declaration of `k`, the first branch header, the final
// `else`, and anything written after the last byte are on lines missing from
// this excerpt. `uch` is a signed int and no range check (negative values,
// values above 0x10FFFF) is visible here - confirm against the complete file.
68 void UTF8FromUTF32Character(int uch
, char *putf
) {
// One-byte form: value stored unchanged.
71 putf
[k
++] = static_cast<char>(uch
);
// Two-byte form for values below 0x800.
72 } else if (uch
< 0x800) {
73 putf
[k
++] = static_cast<char>(0xC0 | (uch
>> 6));
74 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// Three-byte form for values below 0x10000.
75 } else if (uch
< 0x10000) {
76 putf
[k
++] = static_cast<char>(0xE0 | (uch
>> 12));
77 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
78 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// Four-byte form: 0xF0 lead followed by three 0x80-prefixed six-bit groups.
// (Presumably the remaining `else` case; its header is not visible here.)
80 putf
[k
++] = static_cast<char>(0xF0 | (uch
>> 18));
81 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 12) & 0x3f));
82 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
83 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// UTF16Length: walks the `len` bytes at `s`, reading each visited byte as a
// lead byte, and adds a per-sequence amount to the running total `ulen`.
// NOTE(review): the declaration of `ulen`, the statement advancing `i`, and the
// return are on lines missing from this excerpt; presumably the function
// returns `ulen` as the number of UTF-16 code units - confirm.
88 size_t UTF16Length(const char *s
, size_t len
) {
// View the bytes as unsigned so they can index the 256-entry lead table.
90 const unsigned char *us
= reinterpret_cast<const unsigned char *>(s
);
91 for (size_t i
= 0; i
< len
;) {
92 const unsigned char ch
= us
[i
];
// Sequence length implied by the lead byte.
93 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
// Number of UTF-16 units for a sequence of that byte length.
94 const unsigned int utf16Len
= UTF16LengthFromUTF8ByteCount(byteCount
);
// When `i` is beyond `len` only 1 is added instead of utf16Len.
// NOTE(review): presumably `i` has just been advanced by byteCount (line not
// shown), so this handles a sequence truncated at the end of the input.
96 ulen
+= (i
> len
) ? 1 : utf16Len
;
// TrailByteValue: returns `c` with its top two bits masked off, i.e. the low
// six bits. It does not itself check that `c` is a trail byte.
101 constexpr unsigned char TrailByteValue(unsigned char c
) {
102 // The top 2 bits are 0b10 to indicate a trail byte.
103 // The lower 6 bits contain the value.
104 return c
& 0b0011'1111;
// UTF16FromUTF8: decodes the UTF-8 bytes at `s` (length `len`) into UTF-16 code
// units stored in `tbuf` at output index `ui`.
//   tlen   : capacity of `tbuf` in code units.
//   throws : std::runtime_error when `ui + outLen` would exceed `tlen`.
// NOTE(review): the declarations of `ui` and `value`, the branch/switch
// structure selecting between the 2-, 3- and 4-byte cases, the reloads of `ch`
// from the following input bytes, the increments of `i`/`ui`, the truncated
// input handling, and the return are on lines missing from this excerpt.
107 size_t UTF16FromUTF8(const char *s
, size_t len
, wchar_t *tbuf
, size_t tlen
) {
// View the bytes as unsigned so they can index the 256-entry lead table.
109 const unsigned char *us
= reinterpret_cast<const unsigned char *>(s
);
111 for (size_t i
= 0; i
< len
;) {
111 unsigned char ch
= us
[i
];
// Sequence length implied by the lead byte.
112 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
// The lead byte announces more bytes than remain in the input.
115 if (i
+ byteCount
> len
) {
116 // Trying to read past end but still have space to write
// Number of UTF-16 units this sequence will produce; checked against the
// output capacity before anything is written.
124 const size_t outLen
= UTF16LengthFromUTF8ByteCount(byteCount
);
125 if (ui
+ outLen
> tlen
) {
126 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
// Two-byte sequence: low 5 bits of the lead byte shifted up by 6, plus the
// 6-bit payload of the trail byte.
// NOTE(review): `ch` is presumably reloaded with the next input byte between
// these statements (line not shown).
135 value
= (ch
& 0x1F) << 6;
137 value
+= TrailByteValue(ch
);
138 tbuf
[ui
] = static_cast<wchar_t>(value
);
// Three-byte sequence: low 4 bits of the lead byte shifted up by 12, then two
// 6-bit trail payloads.
141 value
= (ch
& 0xF) << 12;
143 value
+= (TrailByteValue(ch
) << 6);
145 value
+= TrailByteValue(ch
);
146 tbuf
[ui
] = static_cast<wchar_t>(value
);
149 // Outside the BMP so need two surrogates
// Four-byte sequence: low 3 bits of the lead byte shifted up by 18, then three
// 6-bit trail payloads.
150 value
= (ch
& 0x7) << 18;
152 value
+= TrailByteValue(ch
) << 12;
154 value
+= TrailByteValue(ch
) << 6;
156 value
+= TrailByteValue(ch
);
// First unit: (value - 0x10000) shifted down by 10, offset by SURROGATE_LEAD_FIRST.
157 tbuf
[ui
] = static_cast<wchar_t>(((value
- 0x10000) >> 10) + SURROGATE_LEAD_FIRST
);
// Second unit: low 10 bits of value, offset by SURROGATE_TRAIL_FIRST.
// NOTE(review): `ui` is presumably incremented between the two stores (line
// not shown); otherwise this store would overwrite the first unit.
159 tbuf
[ui
] = static_cast<wchar_t>((value
& 0x3ff) + SURROGATE_TRAIL_FIRST
);
// UTF32FromUTF8: decodes the UTF-8 bytes at `s` (length `len`) into values
// accumulated in `value`, destined for the unsigned int buffer `tbuf`.
//   tlen   : capacity of `tbuf`.
//   throws : std::runtime_error("UTF32FromUTF8: attempted write beyond end").
// NOTE(review): the condition guarding the throw, the declarations of the
// output index and `value`, the branch/switch structure, the reloads of `ch`
// from the following input bytes, the stores into `tbuf`, and the return are on
// lines missing from this excerpt - confirm against the complete file.
167 size_t UTF32FromUTF8(const char *s
, size_t len
, unsigned int *tbuf
, size_t tlen
) {
// View the bytes as unsigned so they can index the 256-entry lead table.
169 const unsigned char *us
= reinterpret_cast<const unsigned char *>(s
);
170 for (size_t i
= 0; i
< len
;) {
171 unsigned char ch
= us
[i
];
// Sequence length implied by the lead byte.
172 const unsigned int byteCount
= UTF8BytesOfLead
[ch
];
// The lead byte announces more bytes than remain in the input.
175 if (i
+ byteCount
> len
) {
176 // Trying to read past end but still have space to write
185 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
// Two-byte sequence: low 5 bits of the lead byte shifted up by 6, plus one
// 6-bit trail payload.
194 value
= (ch
& 0x1F) << 6;
196 value
+= TrailByteValue(ch
);
// Three-byte sequence: low 4 bits of the lead byte shifted up by 12, then two
// 6-bit trail payloads.
199 value
= (ch
& 0xF) << 12;
201 value
+= TrailByteValue(ch
) << 6;
203 value
+= TrailByteValue(ch
);
// Four-byte sequence: low 3 bits of the lead byte shifted up by 18, then three
// 6-bit trail payloads.
206 value
= (ch
& 0x7) << 18;
208 value
+= TrailByteValue(ch
) << 12;
210 value
+= TrailByteValue(ch
) << 6;
212 value
+= TrailByteValue(ch
);
// UTF16FromUTF32Character: stores the UTF-16 form of `val` into `tbuf`.
// Values below SUPPLEMENTAL_PLANE_FIRST are stored as a single unit in tbuf[0];
// the other visible stores write a two-unit pair into tbuf[0] and tbuf[1].
// NOTE(review): the `else` and the return statements are on lines missing from
// this excerpt; presumably the return value is the number of units written
// (1 or 2) - confirm against the complete file.
221 unsigned int UTF16FromUTF32Character(unsigned int val
, wchar_t *tbuf
) {
222 if (val
< SUPPLEMENTAL_PLANE_FIRST
) {
223 tbuf
[0] = static_cast<wchar_t>(val
);
// First unit: offset from SUPPLEMENTAL_PLANE_FIRST shifted down by 10, plus
// SURROGATE_LEAD_FIRST.
226 tbuf
[0] = static_cast<wchar_t>(((val
- SUPPLEMENTAL_PLANE_FIRST
) >> 10) + SURROGATE_LEAD_FIRST
);
// Second unit: low 10 bits of val plus SURROGATE_TRAIL_FIRST.
227 tbuf
[1] = static_cast<wchar_t>((val
& 0x3ff) + SURROGATE_TRAIL_FIRST
);
// UTF8BytesOfLead: 256-entry table indexed by a byte value, giving the sequence
// length associated with that byte when it is read as a lead byte:
//   0x00-0xC1 -> 1 (this includes 0x80-0xBF and 0xC0/0xC1)
//   0xC2-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xF4 -> 4
//   0xF5-0xFF -> 1
232 const unsigned char UTF8BytesOfLead
[256] = {
233 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
234 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
235 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
236 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
237 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
238 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
239 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
240 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
241 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
242 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
243 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
244 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
245 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
246 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
247 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
248 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
251 // Return both the width of the first character in the string and a status
252 // saying whether it is valid or invalid.
253 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
254 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
255 // reasonably treated as code points in some circumstances. They will, however,
256 // not have associated glyphs.
//   us  : bytes to classify, starting at the candidate lead byte.
//   len : number of bytes available at `us`.
// Invalid results are reported as `UTF8MaskInvalid | width`.
// NOTE(review): several lines of this function (the initial checks, the
// per-length branch headers and the returns for valid sequences) are missing
// from this excerpt; the notes below describe only the visible tests.
257 int UTF8Classify(const unsigned char *us
, int len
) {
258 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
// Sequence length implied by the lead byte.
264 const int byteCount
= UTF8BytesOfLead
[us
[0]];
// Either the table gives 1 for this byte, or the sequence would need more
// bytes than `len` provides: report an invalid single byte.
265 if (byteCount
== 1 || byteCount
> len
) {
267 return UTF8MaskInvalid
| 1;
270 if (!UTF8IsTrailByte(us
[1])) {
271 // Invalid trail byte
272 return UTF8MaskInvalid
| 1;
// Tests that also need the third byte to be a trail byte.
280 if (UTF8IsTrailByte(us
[2])) {
// Lead 0xE0 with a second byte in 0x80..0x9F: would encode a value below
// 0x800, i.e. an overlong form.
281 if ((*us
== 0xe0) && ((us
[1] & 0xe0) == 0x80)) {
283 return UTF8MaskInvalid
| 1;
// Lead 0xED with a second byte in 0xA0..0xBF: would encode U+D800..U+DFFF,
// the surrogate range.
285 if ((*us
== 0xed) && ((us
[1] & 0xe0) == 0xa0)) {
287 return UTF8MaskInvalid
| 1;
289 if ((*us
== 0xef) && (us
[1] == 0xbf) && (us
[2] == 0xbe)) {
290 // U+FFFE non-character - 3 bytes long
291 return UTF8MaskInvalid
| 3;
293 if ((*us
== 0xef) && (us
[1] == 0xbf) && (us
[2] == 0xbf)) {
294 // U+FFFF non-character - 3 bytes long
295 return UTF8MaskInvalid
| 3;
// EF B7 90..AF encodes U+FDD0..U+FDEF, the non-character block named in the
// header comment; reported as invalid with width 3.
297 if ((*us
== 0xef) && (us
[1] == 0xb7) && (((us
[2] & 0xf0) == 0x90) || ((us
[2] & 0xf0) == 0xa0))) {
299 return UTF8MaskInvalid
| 3;
// Tests that need both the third and fourth bytes to be trail bytes.
306 if (UTF8IsTrailByte(us
[2]) && UTF8IsTrailByte(us
[3])) {
307 if (((us
[1] & 0xf) == 0xf) && (us
[2] == 0xbf) && ((us
[3] == 0xbe) || (us
[3] == 0xbf))) {
308 // *FFFE or *FFFF non-character
309 return UTF8MaskInvalid
| 4;
312 // Check if encoding a value beyond the last Unicode character 10FFFF
314 return UTF8MaskInvalid
| 1;
// Lead 0xF0 with a second byte in 0x80..0x8F: would encode a value below
// 0x10000, i.e. an overlong form.
316 } else if ((*us
== 0xf0) && ((us
[1] & 0xf0) == 0x80)) {
318 return UTF8MaskInvalid
| 1;
// Remaining invalid case; the condition leading here is not visible in this
// excerpt.
325 return UTF8MaskInvalid
| 1;
// UTF8DrawBytes: classifies the sequence at `us` (with `len` bytes available)
// using UTF8Classify and returns 1 when the UTF8MaskInvalid bit is set in the
// result, otherwise the result masked with UTF8MaskWidth.
328 int UTF8DrawBytes(const unsigned char *us
, int len
) {
329 const int utf8StatusNext
= UTF8Classify(us
, len
);
// Invalid sequences always count as one byte here, even where UTF8Classify
// reports a wider width alongside the invalid flag.
330 return (utf8StatusNext
& UTF8MaskInvalid
) ? 1 : (utf8StatusNext
& UTF8MaskWidth
);
333 // Replace invalid bytes in UTF-8 with the replacement character
// Classifies the text position by position with UTF8Classify and appends to
// `result` either the three bytes EF BF BD (for an invalid classification) or
// the `len` bytes of the sequence copied from the input.
// NOTE(review): the declaration of `result`, the statements that advance `us`
// and decrease `remaining`, the `else`, and the return are on lines missing
// from this excerpt - confirm against the complete file.
334 std::string
FixInvalidUTF8(const std::string
&text
) {
336 const unsigned char *us
= reinterpret_cast<const unsigned char *>(text
.c_str());
337 size_t remaining
= text
.size();
338 while (remaining
> 0) {
// NOTE(review): `remaining` is narrowed from size_t to int for the call; for
// inputs longer than INT_MAX the cast would not preserve the value.
339 const int utf8Status
= UTF8Classify(us
, static_cast<int>(remaining
));
340 if (utf8Status
& UTF8MaskInvalid
) {
341 // Replacement character 0xFFFD = UTF8:"efbfbd".
342 result
.append("\xef\xbf\xbd");
// Copy the sequence through unchanged; its byte length is taken from the
// width bits of the status.
346 const int len
= utf8Status
&UTF8MaskWidth
;
347 result
.append(reinterpret_cast<const char *>(us
), len
);