// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
13 #include "UniConversion.h"
16 using namespace Scintilla
;
23 unsigned int UTF8Length(const wchar_t *uptr
, unsigned int tlen
) {
25 for (unsigned int i
= 0; i
< tlen
&& uptr
[i
];) {
26 unsigned int uch
= uptr
[i
];
29 } else if (uch
< 0x800) {
31 } else if ((uch
>= SURROGATE_LEAD_FIRST
) &&
32 (uch
<= SURROGATE_TRAIL_LAST
)) {
43 void UTF8FromUTF16(const wchar_t *uptr
, unsigned int tlen
, char *putf
, unsigned int len
) {
45 for (unsigned int i
= 0; i
< tlen
&& uptr
[i
];) {
46 unsigned int uch
= uptr
[i
];
48 putf
[k
++] = static_cast<char>(uch
);
49 } else if (uch
< 0x800) {
50 putf
[k
++] = static_cast<char>(0xC0 | (uch
>> 6));
51 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
52 } else if ((uch
>= SURROGATE_LEAD_FIRST
) &&
53 (uch
<= SURROGATE_TRAIL_LAST
)) {
54 // Half a surrogate pair
56 unsigned int xch
= 0x10000 + ((uch
& 0x3ff) << 10) + (uptr
[i
] & 0x3ff);
57 putf
[k
++] = static_cast<char>(0xF0 | (xch
>> 18));
58 putf
[k
++] = static_cast<char>(0x80 | ((xch
>> 12) & 0x3f));
59 putf
[k
++] = static_cast<char>(0x80 | ((xch
>> 6) & 0x3f));
60 putf
[k
++] = static_cast<char>(0x80 | (xch
& 0x3f));
62 putf
[k
++] = static_cast<char>(0xE0 | (uch
>> 12));
63 putf
[k
++] = static_cast<char>(0x80 | ((uch
>> 6) & 0x3f));
64 putf
[k
++] = static_cast<char>(0x80 | (uch
& 0x3f));
// Return the length in bytes of a UTF-8 sequence judged from its first byte.
// Only the numeric range of ch is examined, no validation is performed:
// below 0x80 -> 1, below 0xE0 -> 2, below 0xF0 -> 3, anything else -> 4.
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch < 0x80) {
		return 1;
	} else if (ch < 0x80 + 0x40 + 0x20) {
		return 2;
	} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
		return 3;
	} else {
		return 4;
	}
}
// Count the UTF-16 code units needed to represent a UTF-8 string.
// s:   UTF-8 bytes (not required to be NUL terminated).
// len: number of bytes in s.
// Sequence lengths are taken from the lead byte alone; a 4 byte sequence
// counts as two code units (a surrogate pair), every other sequence as one.
size_t UTF16Length(const char *s, size_t len) {
	size_t ulen = 0;
	size_t charLen;
	for (size_t i = 0; i < len;) {
		const unsigned char ch = static_cast<unsigned char>(s[i]);
		if (ch < 0x80) {
			charLen = 1;
		} else if (ch < 0x80 + 0x40 + 0x20) {
			charLen = 2;
		} else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
			charLen = 3;
		} else {
			charLen = 4;
			// Extra unit for the second half of the surrogate pair
			ulen++;
		}
		i += charLen;
		ulen++;
	}
	return ulen;
}
105 size_t UTF16FromUTF8(const char *s
, size_t len
, wchar_t *tbuf
, size_t tlen
) {
107 const unsigned char *us
= reinterpret_cast<const unsigned char *>(s
);
109 while ((i
<len
) && (ui
<tlen
)) {
110 unsigned char ch
= us
[i
++];
113 } else if (ch
< 0x80 + 0x40 + 0x20) {
114 tbuf
[ui
] = static_cast<wchar_t>((ch
& 0x1F) << 6);
116 tbuf
[ui
] = static_cast<wchar_t>(tbuf
[ui
] + (ch
& 0x7F));
117 } else if (ch
< 0x80 + 0x40 + 0x20 + 0x10) {
118 tbuf
[ui
] = static_cast<wchar_t>((ch
& 0xF) << 12);
120 tbuf
[ui
] = static_cast<wchar_t>(tbuf
[ui
] + ((ch
& 0x7F) << 6));
122 tbuf
[ui
] = static_cast<wchar_t>(tbuf
[ui
] + (ch
& 0x7F));
124 // Outside the BMP so need two surrogates
125 int val
= (ch
& 0x7) << 18;
127 val
+= (ch
& 0x3F) << 12;
129 val
+= (ch
& 0x3F) << 6;
132 tbuf
[ui
] = static_cast<wchar_t>(((val
- 0x10000) >> 10) + SURROGATE_LEAD_FIRST
);
134 tbuf
[ui
] = static_cast<wchar_t>((val
& 0x3ff) + SURROGATE_TRAIL_FIRST
);
// Convert UTF-8 to UTF-32.
// s:    UTF-8 bytes.
// len:  number of bytes in s.
// tbuf: destination buffer of UTF-32 values.
// tlen: capacity of tbuf in elements.
// Returns the number of elements written to tbuf.
// Sequence lengths are taken from the lead byte alone; no validation is done.
// A multi-byte lead whose trail bytes are not all present within len
// produces the value 0 for that byte, and decoding continues with the next byte.
unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
	unsigned int ui = 0;
	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
	unsigned int i = 0;
	while ((i < len) && (ui < tlen)) {
		unsigned char ch = us[i++];
		unsigned int value = 0;
		if (ch < 0x80) {
			value = ch;
		} else if (((len - i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) {
			value = (ch & 0x1F) << 6;
			ch = us[i++];
			value += ch & 0x7F;
		} else if (((len - i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) {
			value = (ch & 0xF) << 12;
			ch = us[i++];
			value += (ch & 0x7F) << 6;
			ch = us[i++];
			value += ch & 0x7F;
		} else if ((len - i) >= 3) {
			value = (ch & 0x7) << 18;
			ch = us[i++];
			value += (ch & 0x3F) << 12;
			ch = us[i++];
			value += (ch & 0x3F) << 6;
			ch = us[i++];
			value += ch & 0x3F;
		}
		tbuf[ui] = value;
		ui++;
	}
	return ui;
}
175 unsigned int UTF16FromUTF32Character(unsigned int val
, wchar_t *tbuf
) {
176 if (val
< SUPPLEMENTAL_PLANE_FIRST
) {
177 tbuf
[0] = static_cast<wchar_t>(val
);
180 tbuf
[0] = static_cast<wchar_t>(((val
- SUPPLEMENTAL_PLANE_FIRST
) >> 10) + SURROGATE_LEAD_FIRST
);
181 tbuf
[1] = static_cast<wchar_t>((val
& 0x3ff) + SURROGATE_TRAIL_FIRST
);
// Sequence length for each possible lead byte value.
// Filled in by UTF8BytesOfLeadInitialise; all zero until that has run.
int UTF8BytesOfLead[256];
// Set once UTF8BytesOfLead has been filled so the work is done only once.
static bool initialisedBytesOfLead = false;
// Return the number of bytes in the UTF-8 sequence introduced by leadByte.
// Bytes that cannot start a multi-byte sequence (ASCII, trail bytes, the
// overlong leads 0xC0/0xC1 and anything from 0xF5 up) are reported as 1.
static int BytesFromLead(int leadByte) {
	if (leadByte < 0xC2) {
		// Single byte or invalid
		return 1;
	} else if (leadByte < 0xE0) {
		return 2;
	} else if (leadByte < 0xF0) {
		return 3;
	} else if (leadByte < 0xF5) {
		return 4;
	} else {
		// Characters longer than 4 bytes not possible in current UTF-8
		return 1;
	}
}
205 void UTF8BytesOfLeadInitialise() {
206 if (!initialisedBytesOfLead
) {
207 for (int i
=0; i
<256; i
++) {
208 UTF8BytesOfLead
[i
] = BytesFromLead(i
);
210 initialisedBytesOfLead
= true;
214 // Return both the width of the first character in the string and a status
215 // saying whether it is valid or invalid.
216 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
217 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
218 // reasonably treated as code points in some circumstances. They will, however,
219 // not have associated glyphs.
220 int UTF8Classify(const unsigned char *us
, int len
) {
221 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
225 } else if (*us
> 0xf4) {
226 // Characters longer than 4 bytes not possible in current UTF-8
227 return UTF8MaskInvalid
| 1;
228 } else if (*us
>= 0xf0) {
231 return UTF8MaskInvalid
| 1;
232 if (UTF8IsTrailByte(us
[1]) && UTF8IsTrailByte(us
[2]) && UTF8IsTrailByte(us
[3])) {
233 if (((us
[1] & 0xf) == 0xf) && (us
[2] == 0xbf) && ((us
[3] == 0xbe) || (us
[3] == 0xbf))) {
234 // *FFFE or *FFFF non-character
235 return UTF8MaskInvalid
| 4;
238 // Check if encoding a value beyond the last Unicode character 10FFFF
240 return UTF8MaskInvalid
| 1;
241 } else if (us
[1] == 0x8f) {
243 return UTF8MaskInvalid
| 1;
244 } else if (us
[2] == 0xbf) {
246 return UTF8MaskInvalid
| 1;
250 } else if ((*us
== 0xf0) && ((us
[1] & 0xf0) == 0x80)) {
252 return UTF8MaskInvalid
| 1;
256 return UTF8MaskInvalid
| 1;
258 } else if (*us
>= 0xe0) {
261 return UTF8MaskInvalid
| 1;
262 if (UTF8IsTrailByte(us
[1]) && UTF8IsTrailByte(us
[2])) {
263 if ((*us
== 0xe0) && ((us
[1] & 0xe0) == 0x80)) {
265 return UTF8MaskInvalid
| 1;
267 if ((*us
== 0xed) && ((us
[1] & 0xe0) == 0xa0)) {
269 return UTF8MaskInvalid
| 1;
271 if ((*us
== 0xef) && (us
[1] == 0xbf) && (us
[2] == 0xbe)) {
272 // U+FFFE non-character - 3 bytes long
273 return UTF8MaskInvalid
| 3;
275 if ((*us
== 0xef) && (us
[1] == 0xbf) && (us
[2] == 0xbf)) {
276 // U+FFFF non-character - 3 bytes long
277 return UTF8MaskInvalid
| 3;
279 if ((*us
== 0xef) && (us
[1] == 0xb7) && (((us
[2] & 0xf0) == 0x90) || ((us
[2] & 0xf0) == 0xa0))) {
281 return UTF8MaskInvalid
| 3;
285 return UTF8MaskInvalid
| 1;
287 } else if (*us
>= 0xc2) {
290 return UTF8MaskInvalid
| 1;
291 if (UTF8IsTrailByte(us
[1])) {
294 return UTF8MaskInvalid
| 1;
297 // 0xc0 .. 0xc1 is overlong encoding
298 // 0x80 .. 0xbf is trail byte
299 return UTF8MaskInvalid
| 1;
303 int UTF8DrawBytes(const unsigned char *us
, int len
) {
304 int utf8StatusNext
= UTF8Classify(us
, len
);
305 return (utf8StatusNext
& UTF8MaskInvalid
) ? 1 : (utf8StatusNext
& UTF8MaskWidth
);
308 // Replace invalid bytes in UTF-8 with the replacement character
309 std::string
FixInvalidUTF8(const std::string
&text
) {
311 const unsigned char *us
= reinterpret_cast<const unsigned char *>(text
.c_str());
312 size_t remaining
= text
.size();
313 while (remaining
> 0) {
314 const int utf8Status
= UTF8Classify(us
, static_cast<int>(remaining
));
315 if (utf8Status
& UTF8MaskInvalid
) {
316 // Replacement character 0xFFFD = UTF8:"efbfbd".
317 result
.append("\xef\xbf\xbd");
321 const int len
= utf8Status
&UTF8MaskWidth
;
322 result
.append(reinterpret_cast<const char *>(us
), len
);