Fix action icons in the log dialog being clipped on High-DPI displays
[TortoiseGit.git] / ext / scintilla / src / UniConversion.cxx
blobc57331c9892db6839b543a4c9fd9659ca94ef43e
1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
8 #include <stdlib.h>
10 #include <stdexcept>
11 #include <string>
13 #include "UniConversion.h"
15 #ifdef SCI_NAMESPACE
16 using namespace Scintilla;
17 #endif
19 #ifdef SCI_NAMESPACE
20 namespace Scintilla {
21 #endif
23 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
24 unsigned int len = 0;
25 for (unsigned int i = 0; i < tlen && uptr[i];) {
26 unsigned int uch = uptr[i];
27 if (uch < 0x80) {
28 len++;
29 } else if (uch < 0x800) {
30 len += 2;
31 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
32 (uch <= SURROGATE_TRAIL_LAST)) {
33 len += 4;
34 i++;
35 } else {
36 len += 3;
38 i++;
40 return len;
43 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
44 unsigned int k = 0;
45 for (unsigned int i = 0; i < tlen && uptr[i];) {
46 unsigned int uch = uptr[i];
47 if (uch < 0x80) {
48 putf[k++] = static_cast<char>(uch);
49 } else if (uch < 0x800) {
50 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
51 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
52 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
53 (uch <= SURROGATE_TRAIL_LAST)) {
54 // Half a surrogate pair
55 i++;
56 unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
57 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
58 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
59 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
60 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
61 } else {
62 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
63 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
64 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
66 i++;
68 if (k < len)
69 putf[k] = '\0';
/**
 * Length in bytes of a UTF-8 sequence, judged from its first byte alone.
 * Bytes below 0x80 give 1, below 0xE0 give 2, below 0xF0 give 3 and
 * everything else gives 4. No validation of the byte is performed.
 */
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch >= 0xF0)
		return 4;
	if (ch >= 0xE0)
		return 3;
	if (ch >= 0x80)
		return 2;
	return 1;
}
/**
 * Count the UTF-16 code units needed to hold @a len bytes of UTF-8 text.
 * Each sequence is sized from its lead byte only: 1, 2 and 3 byte sequences
 * contribute one unit, 4 byte sequences contribute two (a surrogate pair).
 */
size_t UTF16Length(const char *s, size_t len) {
	size_t units = 0;
	size_t pos = 0;
	while (pos < len) {
		const unsigned char lead = static_cast<unsigned char>(s[pos]);
		// Default to the 4 byte case, which needs a surrogate pair.
		size_t step = 4;
		size_t produced = 2;
		if (lead < 0x80) {
			step = 1;
			produced = 1;
		} else if (lead < 0xE0) {
			step = 2;
			produced = 1;
		} else if (lead < 0xF0) {
			step = 3;
			produced = 1;
		}
		pos += step;
		units += produced;
	}
	return units;
}
105 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
106 size_t ui = 0;
107 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
108 size_t i = 0;
109 while ((i<len) && (ui<tlen)) {
110 unsigned char ch = us[i++];
111 if (ch < 0x80) {
112 tbuf[ui] = ch;
113 } else if (ch < 0x80 + 0x40 + 0x20) {
114 tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
115 ch = us[i++];
116 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
117 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
118 tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
119 ch = us[i++];
120 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
121 ch = us[i++];
122 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
123 } else {
124 // Outside the BMP so need two surrogates
125 int val = (ch & 0x7) << 18;
126 ch = us[i++];
127 val += (ch & 0x3F) << 12;
128 ch = us[i++];
129 val += (ch & 0x3F) << 6;
130 ch = us[i++];
131 val += (ch & 0x3F);
132 tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
133 ui++;
134 tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
136 ui++;
138 return ui;
/**
 * Decode UTF-8 into UTF-32 code points.
 * @param s     UTF-8 input bytes.
 * @param len   Number of bytes available in @a s.
 * @param tbuf  Output buffer for code points.
 * @param tlen  Capacity of @a tbuf in elements.
 * @return Number of elements written to @a tbuf.
 *
 * A multi-byte form is only decoded when enough bytes remain for it; a lead
 * byte that matches no branch produces the value 0.
 */
unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
	const unsigned char *bytes = reinterpret_cast<const unsigned char *>(s);
	unsigned int written = 0;
	unsigned int pos = 0;
	for (; (pos < len) && (written < tlen); ++written) {
		const unsigned char lead = bytes[pos++];
		// Bytes still unread after the lead byte.
		const unsigned int available = len - pos;
		unsigned int codePoint = 0;
		if (lead < 0x80) {
			codePoint = lead;
		} else if ((available >= 1) && (lead < 0xE0)) {
			codePoint = (lead & 0x1F) << 6;
			codePoint += bytes[pos++] & 0x7F;
		} else if ((available >= 2) && (lead < 0xF0)) {
			codePoint = (lead & 0xF) << 12;
			codePoint += (bytes[pos++] & 0x7F) << 6;
			codePoint += bytes[pos++] & 0x7F;
		} else if (available >= 3) {
			codePoint = (lead & 0x7) << 18;
			codePoint += (bytes[pos++] & 0x3F) << 12;
			codePoint += (bytes[pos++] & 0x3F) << 6;
			codePoint += bytes[pos++] & 0x3F;
		}
		tbuf[written] = codePoint;
	}
	return written;
}
175 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
176 if (val < SUPPLEMENTAL_PLANE_FIRST) {
177 tbuf[0] = static_cast<wchar_t>(val);
178 return 1;
179 } else {
180 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
181 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
182 return 2;
// Lookup table: sequence length for each possible lead byte value.
// Filled in by UTF8BytesOfLeadInitialise.
int UTF8BytesOfLead[256];
// Set once the table above has been filled so later calls do nothing.
static bool initialisedBytesOfLead = false;

// Sequence length implied by a lead byte. Values that cannot start a
// multi-byte sequence (below 0xC2, or 0xF5 and above) give 1.
static int BytesFromLead(int leadByte) {
	if (leadByte >= 0xF5) {
		// Characters longer than 4 bytes not possible in current UTF-8
		return 1;
	}
	if (leadByte >= 0xF0)
		return 4;
	if (leadByte >= 0xE0)
		return 3;
	if (leadByte >= 0xC2)
		return 2;
	// Single byte or invalid
	return 1;
}

// Fill UTF8BytesOfLead on the first call; subsequent calls return immediately.
void UTF8BytesOfLeadInitialise() {
	if (initialisedBytesOfLead)
		return;
	for (int lead = 0; lead < 256; ++lead) {
		UTF8BytesOfLead[lead] = BytesFromLead(lead);
	}
	initialisedBytesOfLead = true;
}
214 // Return both the width of the first character in the string and a status
215 // saying whether it is valid or invalid.
216 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
217 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
218 // reasonably treated as code points in some circumstances. They will, however,
219 // not have associated glyphs.
220 int UTF8Classify(const unsigned char *us, int len) {
221 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
222 if (*us < 0x80) {
223 // Single bytes easy
224 return 1;
225 } else if (*us > 0xf4) {
226 // Characters longer than 4 bytes not possible in current UTF-8
227 return UTF8MaskInvalid | 1;
228 } else if (*us >= 0xf0) {
229 // 4 bytes
230 if (len < 4)
231 return UTF8MaskInvalid | 1;
232 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
233 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
234 // *FFFE or *FFFF non-character
235 return UTF8MaskInvalid | 4;
237 if (*us == 0xf4) {
238 // Check if encoding a value beyond the last Unicode character 10FFFF
239 if (us[1] > 0x8f) {
240 return UTF8MaskInvalid | 1;
241 } else if (us[1] == 0x8f) {
242 if (us[2] > 0xbf) {
243 return UTF8MaskInvalid | 1;
244 } else if (us[2] == 0xbf) {
245 if (us[3] > 0xbf) {
246 return UTF8MaskInvalid | 1;
250 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
251 // Overlong
252 return UTF8MaskInvalid | 1;
254 return 4;
255 } else {
256 return UTF8MaskInvalid | 1;
258 } else if (*us >= 0xe0) {
259 // 3 bytes
260 if (len < 3)
261 return UTF8MaskInvalid | 1;
262 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
263 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
264 // Overlong
265 return UTF8MaskInvalid | 1;
267 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
268 // Surrogate
269 return UTF8MaskInvalid | 1;
271 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
272 // U+FFFE non-character - 3 bytes long
273 return UTF8MaskInvalid | 3;
275 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
276 // U+FFFF non-character - 3 bytes long
277 return UTF8MaskInvalid | 3;
279 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
280 // U+FDD0 .. U+FDEF
281 return UTF8MaskInvalid | 3;
283 return 3;
284 } else {
285 return UTF8MaskInvalid | 1;
287 } else if (*us >= 0xc2) {
288 // 2 bytes
289 if (len < 2)
290 return UTF8MaskInvalid | 1;
291 if (UTF8IsTrailByte(us[1])) {
292 return 2;
293 } else {
294 return UTF8MaskInvalid | 1;
296 } else {
297 // 0xc0 .. 0xc1 is overlong encoding
298 // 0x80 .. 0xbf is trail byte
299 return UTF8MaskInvalid | 1;
303 int UTF8DrawBytes(const unsigned char *us, int len) {
304 int utf8StatusNext = UTF8Classify(us, len);
305 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
308 // Replace invalid bytes in UTF-8 with the replacement character
309 std::string FixInvalidUTF8(const std::string &text) {
310 std::string result;
311 const unsigned char *us = reinterpret_cast<const unsigned char *>(text.c_str());
312 size_t remaining = text.size();
313 while (remaining > 0) {
314 const int utf8Status = UTF8Classify(us, static_cast<int>(remaining));
315 if (utf8Status & UTF8MaskInvalid) {
316 // Replacement character 0xFFFD = UTF8:"efbfbd".
317 result.append("\xef\xbf\xbd");
318 us++;
319 remaining--;
320 } else {
321 const int len = utf8Status&UTF8MaskWidth;
322 result.append(reinterpret_cast<const char *>(us), len);
323 us += len;
324 remaining -= len;
327 return result;
330 #ifdef SCI_NAMESPACE
332 #endif