Update Scintilla to 4.0.4
[TortoiseGit.git] / ext / scintilla / src / UniConversion.cxx
blobd059f38e20b630de8367e7d293b4acd196cdfdfe
1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
8 #include <cstdlib>
10 #include <stdexcept>
11 #include <string>
13 #include "UniConversion.h"
15 using namespace Scintilla;
17 namespace Scintilla {
19 size_t UTF8Length(const wchar_t *uptr, size_t tlen) {
20 size_t len = 0;
21 for (size_t i = 0; i < tlen && uptr[i];) {
22 const unsigned int uch = uptr[i];
23 if (uch < 0x80) {
24 len++;
25 } else if (uch < 0x800) {
26 len += 2;
27 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
28 (uch <= SURROGATE_TRAIL_LAST)) {
29 len += 4;
30 i++;
31 } else {
32 len += 3;
34 i++;
36 return len;
39 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
40 size_t k = 0;
41 for (size_t i = 0; i < tlen && uptr[i];) {
42 const unsigned int uch = uptr[i];
43 if (uch < 0x80) {
44 putf[k++] = static_cast<char>(uch);
45 } else if (uch < 0x800) {
46 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
47 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
48 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
49 (uch <= SURROGATE_TRAIL_LAST)) {
50 // Half a surrogate pair
51 i++;
52 const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
53 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
54 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
55 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
56 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
57 } else {
58 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
59 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
60 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
62 i++;
64 if (k < len)
65 putf[k] = '\0';
/**
 * Encode a single character as a NUL terminated UTF-8 sequence.
 *
 * Writes between 1 and 4 encoded bytes followed by a '\0', so @a putf must
 * have room for up to 5 chars. The value is not validated.
 *
 * @param uch  Character value to encode.
 * @param putf Destination buffer.
 */
void UTF8FromUTF32Character(int uch, char *putf) {
	char *out = putf;
	if (uch >= 0x10000) {
		// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch >= 0x800) {
		// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch >= 0x80) {
		// Two bytes: 110xxxxx 10xxxxxx
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else {
		// Single byte
		*out++ = static_cast<char>(uch);
	}
	*out = '\0';
}
88 size_t UTF16Length(const char *s, size_t len) {
89 size_t ulen = 0;
90 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
91 for (size_t i = 0; i < len;) {
92 const unsigned char ch = us[i];
93 const unsigned int byteCount = UTF8BytesOfLead[ch];
94 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
95 i += byteCount;
96 ulen += (i > len) ? 1 : utf16Len;
98 return ulen;
/**
 * Extract the payload carried by a UTF-8 trail byte.
 *
 * A trail byte has the form 10xxxxxx: the two marker bits are discarded and
 * the remaining 6 value bits are returned. The marker bits are not checked.
 */
constexpr unsigned char TrailByteValue(unsigned char c) {
	constexpr unsigned char payloadMask = 0x3F;
	return c & payloadMask;
}
107 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
108 size_t ui = 0;
109 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
110 for (size_t i = 0; i < len;) {
111 unsigned char ch = us[i];
112 const unsigned int byteCount = UTF8BytesOfLead[ch];
113 unsigned int value;
115 if (i + byteCount > len) {
116 // Trying to read past end but still have space to write
117 if (ui < tlen) {
118 tbuf[ui] = ch;
119 ui++;
121 break;
124 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
125 if (ui + outLen > tlen) {
126 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
129 i++;
130 switch (byteCount) {
131 case 1:
132 tbuf[ui] = ch;
133 break;
134 case 2:
135 value = (ch & 0x1F) << 6;
136 ch = us[i++];
137 value += TrailByteValue(ch);
138 tbuf[ui] = static_cast<wchar_t>(value);
139 break;
140 case 3:
141 value = (ch & 0xF) << 12;
142 ch = us[i++];
143 value += (TrailByteValue(ch) << 6);
144 ch = us[i++];
145 value += TrailByteValue(ch);
146 tbuf[ui] = static_cast<wchar_t>(value);
147 break;
148 default:
149 // Outside the BMP so need two surrogates
150 value = (ch & 0x7) << 18;
151 ch = us[i++];
152 value += TrailByteValue(ch) << 12;
153 ch = us[i++];
154 value += TrailByteValue(ch) << 6;
155 ch = us[i++];
156 value += TrailByteValue(ch);
157 tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
158 ui++;
159 tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
160 break;
162 ui++;
164 return ui;
167 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
168 size_t ui = 0;
169 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
170 for (size_t i = 0; i < len;) {
171 unsigned char ch = us[i];
172 const unsigned int byteCount = UTF8BytesOfLead[ch];
173 unsigned int value;
175 if (i + byteCount > len) {
176 // Trying to read past end but still have space to write
177 if (ui < tlen) {
178 tbuf[ui] = ch;
179 ui++;
181 break;
184 if (ui == tlen) {
185 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
188 i++;
189 switch (byteCount) {
190 case 1:
191 value = ch;
192 break;
193 case 2:
194 value = (ch & 0x1F) << 6;
195 ch = us[i++];
196 value += TrailByteValue(ch);
197 break;
198 case 3:
199 value = (ch & 0xF) << 12;
200 ch = us[i++];
201 value += TrailByteValue(ch) << 6;
202 ch = us[i++];
203 value += TrailByteValue(ch);
204 break;
205 default:
206 value = (ch & 0x7) << 18;
207 ch = us[i++];
208 value += TrailByteValue(ch) << 12;
209 ch = us[i++];
210 value += TrailByteValue(ch) << 6;
211 ch = us[i++];
212 value += TrailByteValue(ch);
213 break;
215 tbuf[ui] = value;
216 ui++;
218 return ui;
221 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
222 if (val < SUPPLEMENTAL_PLANE_FIRST) {
223 tbuf[0] = static_cast<wchar_t>(val);
224 return 1;
225 } else {
226 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
227 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
228 return 2;
// Length in bytes of the UTF-8 sequence introduced by each possible byte value.
// Bytes that cannot start a multi-byte sequence map to 1:
// ASCII, the trail byte range 0x80..0xBF, 0xC0, 0xC1 and 0xF5..0xFF.
const unsigned char UTF8BytesOfLead[256] = {
	// 0x00 .. 0x7F: ASCII
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
	// 0x80 .. 0xBF: trail bytes, counted as isolated bytes when met as a lead
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xA0
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xB0
	// 0xC0 .. 0xDF: two byte leads, except 0xC0 and 0xC1
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0
	// 0xE0 .. 0xEF: three byte leads
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0
	// 0xF0 .. 0xF4: four byte leads; 0xF5 .. 0xFF: single bytes
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xF0
};
251 // Return both the width of the first character in the string and a status
252 // saying whether it is valid or invalid.
253 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
254 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
255 // reasonably treated as code points in some circumstances. They will, however,
256 // not have associated glyphs.
257 int UTF8Classify(const unsigned char *us, int len) {
258 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
259 if (us[0] < 0x80) {
260 // ASCII
261 return 1;
264 const int byteCount = UTF8BytesOfLead[us[0]];
265 if (byteCount == 1 || byteCount > len) {
266 // Invalid lead byte
267 return UTF8MaskInvalid | 1;
270 if (!UTF8IsTrailByte(us[1])) {
271 // Invalid trail byte
272 return UTF8MaskInvalid | 1;
275 switch (byteCount) {
276 case 2:
277 return 2;
279 case 3:
280 if (UTF8IsTrailByte(us[2])) {
281 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
282 // Overlong
283 return UTF8MaskInvalid | 1;
285 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
286 // Surrogate
287 return UTF8MaskInvalid | 1;
289 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
290 // U+FFFE non-character - 3 bytes long
291 return UTF8MaskInvalid | 3;
293 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
294 // U+FFFF non-character - 3 bytes long
295 return UTF8MaskInvalid | 3;
297 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
298 // U+FDD0 .. U+FDEF
299 return UTF8MaskInvalid | 3;
301 return 3;
303 break;
305 default:
306 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
307 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
308 // *FFFE or *FFFF non-character
309 return UTF8MaskInvalid | 4;
311 if (*us == 0xf4) {
312 // Check if encoding a value beyond the last Unicode character 10FFFF
313 if (us[1] > 0x8f) {
314 return UTF8MaskInvalid | 1;
316 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
317 // Overlong
318 return UTF8MaskInvalid | 1;
320 return 4;
322 break;
325 return UTF8MaskInvalid | 1;
328 int UTF8DrawBytes(const unsigned char *us, int len) {
329 const int utf8StatusNext = UTF8Classify(us, len);
330 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
333 // Replace invalid bytes in UTF-8 with the replacement character
334 std::string FixInvalidUTF8(const std::string &text) {
335 std::string result;
336 const unsigned char *us = reinterpret_cast<const unsigned char *>(text.c_str());
337 size_t remaining = text.size();
338 while (remaining > 0) {
339 const int utf8Status = UTF8Classify(us, static_cast<int>(remaining));
340 if (utf8Status & UTF8MaskInvalid) {
341 // Replacement character 0xFFFD = UTF8:"efbfbd".
342 result.append("\xef\xbf\xbd");
343 us++;
344 remaining--;
345 } else {
346 const int len = utf8Status&UTF8MaskWidth;
347 result.append(reinterpret_cast<const char *>(us), len);
348 us += len;
349 remaining -= len;
352 return result;