Update Scintilla to version 3.5.2
[TortoiseGit.git] / ext / scintilla / src / UniConversion.cxx
blob58651bc2b47dd5bc14937e9726bb699dfb7b69e8
1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
8 #include <stdlib.h>
10 #include "UniConversion.h"
12 #ifdef SCI_NAMESPACE
13 using namespace Scintilla;
14 #endif
16 #ifdef SCI_NAMESPACE
17 namespace Scintilla {
18 #endif
// Boundaries of the UTF-16 surrogate ranges and the first code point
// that lies beyond the Basic Multilingual Plane.
enum {
	SURROGATE_LEAD_FIRST = 0xD800,
	SURROGATE_TRAIL_FIRST = 0xDC00,
	SURROGATE_TRAIL_LAST = 0xDFFF,
	SUPPLEMENTAL_PLANE_FIRST = 0x10000
};
25 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
26 unsigned int len = 0;
27 for (unsigned int i = 0; i < tlen && uptr[i];) {
28 unsigned int uch = uptr[i];
29 if (uch < 0x80) {
30 len++;
31 } else if (uch < 0x800) {
32 len += 2;
33 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
34 (uch <= SURROGATE_TRAIL_LAST)) {
35 len += 4;
36 i++;
37 } else {
38 len += 3;
40 i++;
42 return len;
45 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
46 int k = 0;
47 for (unsigned int i = 0; i < tlen && uptr[i];) {
48 unsigned int uch = uptr[i];
49 if (uch < 0x80) {
50 putf[k++] = static_cast<char>(uch);
51 } else if (uch < 0x800) {
52 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
53 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
54 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
55 (uch <= SURROGATE_TRAIL_LAST)) {
56 // Half a surrogate pair
57 i++;
58 unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
59 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
60 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
61 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
62 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
63 } else {
64 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
65 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
66 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
68 i++;
70 putf[len] = '\0';
// Return the number of bytes in a UTF-8 sequence judged only from its first
// byte ch: 1 below 0x80, 2 below 0xE0, 3 below 0xF0 and 4 for anything higher.
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch >= 0x80 + 0x40 + 0x20 + 0x10)
		return 4;
	if (ch >= 0x80 + 0x40 + 0x20)
		return 3;
	if (ch >= 0x80)
		return 2;
	return 1;
}
// Count the UTF-16 code units needed to hold the UTF-8 text in the first
// len bytes of s. Only lead bytes are examined: the sequence length implied
// by each lead byte is skipped without looking at the bytes in between.
// Leads below 0xF0 count as one code unit, all others as a surrogate pair.
unsigned int UTF16Length(const char *s, unsigned int len) {
	unsigned int units = 0;
	unsigned int pos = 0;
	while (pos < len) {
		const unsigned char lead = static_cast<unsigned char>(s[pos]);
		if (lead < 0x80) {
			pos += 1;
			units += 1;
		} else if (lead < 0x80 + 0x40 + 0x20) {
			pos += 2;
			units += 1;
		} else if (lead < 0x80 + 0x40 + 0x20 + 0x10) {
			pos += 3;
			units += 1;
		} else {
			// Outside the BMP so two surrogates are needed
			pos += 4;
			units += 2;
		}
	}
	return units;
}
106 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
107 unsigned int ui=0;
108 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
109 unsigned int i=0;
110 while ((i<len) && (ui<tlen)) {
111 unsigned char ch = us[i++];
112 if (ch < 0x80) {
113 tbuf[ui] = ch;
114 } else if (ch < 0x80 + 0x40 + 0x20) {
115 tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
116 ch = us[i++];
117 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
118 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
119 tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
120 ch = us[i++];
121 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
122 ch = us[i++];
123 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
124 } else {
125 // Outside the BMP so need two surrogates
126 int val = (ch & 0x7) << 18;
127 ch = us[i++];
128 val += (ch & 0x3F) << 12;
129 ch = us[i++];
130 val += (ch & 0x3F) << 6;
131 ch = us[i++];
132 val += (ch & 0x3F);
133 tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
134 ui++;
135 tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
137 ui++;
139 return ui;
// Convert the first len bytes of the UTF-8 text s into UTF-32 values stored
// in tbuf, which has room for tlen values. Returns the number of values written.
// Trail bytes are not validated. A lead byte that is not followed by enough
// bytes for its sequence is consumed on its own and stored as 0.
unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
	unsigned int ui=0;
	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
	unsigned int i=0;
	while ((i<len) && (ui<tlen)) {
		unsigned char ch = us[i++];
		// Accumulate in the element type of tbuf: wchar_t may be only 16 bits
		// wide, which would truncate values outside the BMP.
		unsigned int value = 0;
		if (ch < 0x80) {
			value = ch;
		} else if (((len-i) >= 1) && (ch < 0x80 + 0x40 + 0x20)) {
			value = (ch & 0x1F) << 6;
			ch = us[i++];
			value += ch & 0x7F;
		} else if (((len-i) >= 2) && (ch < 0x80 + 0x40 + 0x20 + 0x10)) {
			value = (ch & 0xF) << 12;
			ch = us[i++];
			value += (ch & 0x7F) << 6;
			ch = us[i++];
			value += ch & 0x7F;
		} else if ((len-i) >= 3) {
			value = (ch & 0x7) << 18;
			ch = us[i++];
			value += (ch & 0x3F) << 12;
			ch = us[i++];
			value += (ch & 0x3F) << 6;
			ch = us[i++];
			value += ch & 0x3F;
		}
		tbuf[ui] = value;
		ui++;
	}
	return ui;
}
176 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
177 if (val < SUPPLEMENTAL_PLANE_FIRST) {
178 tbuf[0] = static_cast<wchar_t>(val);
179 return 1;
180 } else {
181 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
182 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
183 return 2;
// Sequence length for each of the 256 possible lead byte values.
// Populated by UTF8BytesOfLeadInitialise.
int UTF8BytesOfLead[256];
// Becomes true once UTF8BytesOfLead has been populated.
static bool initialisedBytesOfLead = false;
// Return the length of the UTF-8 sequence started by leadByte.
// Values that can not start a multi-byte sequence yield 1.
static int BytesFromLead(int leadByte) {
	// Below 0xC2: single byte or invalid.
	// 0xF5 and above: characters longer than 4 bytes not possible in current UTF-8.
	if ((leadByte < 0xC2) || (leadByte >= 0xF5))
		return 1;
	if (leadByte >= 0xF0)
		return 4;
	if (leadByte >= 0xE0)
		return 3;
	return 2;
}
206 void UTF8BytesOfLeadInitialise() {
207 if (!initialisedBytesOfLead) {
208 for (int i=0; i<256; i++) {
209 UTF8BytesOfLead[i] = BytesFromLead(i);
211 initialisedBytesOfLead = true;
215 // Return both the width of the first character in the string and a status
216 // saying whether it is valid or invalid.
217 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
218 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
219 // reasonably treated as code points in some circumstances. They will, however,
220 // not have associated glyphs.
221 int UTF8Classify(const unsigned char *us, int len) {
222 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
223 if (*us < 0x80) {
224 // Single bytes easy
225 return 1;
226 } else if (*us > 0xf4) {
227 // Characters longer than 4 bytes not possible in current UTF-8
228 return UTF8MaskInvalid | 1;
229 } else if (*us >= 0xf0) {
230 // 4 bytes
231 if (len < 4)
232 return UTF8MaskInvalid | 1;
233 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
234 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
235 // *FFFE or *FFFF non-character
236 return UTF8MaskInvalid | 4;
238 if (*us == 0xf4) {
239 // Check if encoding a value beyond the last Unicode character 10FFFF
240 if (us[1] > 0x8f) {
241 return UTF8MaskInvalid | 1;
242 } else if (us[1] == 0x8f) {
243 if (us[2] > 0xbf) {
244 return UTF8MaskInvalid | 1;
245 } else if (us[2] == 0xbf) {
246 if (us[3] > 0xbf) {
247 return UTF8MaskInvalid | 1;
251 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
252 // Overlong
253 return UTF8MaskInvalid | 1;
255 return 4;
256 } else {
257 return UTF8MaskInvalid | 1;
259 } else if (*us >= 0xe0) {
260 // 3 bytes
261 if (len < 3)
262 return UTF8MaskInvalid | 1;
263 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
264 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
265 // Overlong
266 return UTF8MaskInvalid | 1;
268 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
269 // Surrogate
270 return UTF8MaskInvalid | 1;
272 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
273 // U+FFFE non-character - 3 bytes long
274 return UTF8MaskInvalid | 3;
276 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
277 // U+FFFF non-character - 3 bytes long
278 return UTF8MaskInvalid | 3;
280 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
281 // U+FDD0 .. U+FDEF
282 return UTF8MaskInvalid | 3;
284 return 3;
285 } else {
286 return UTF8MaskInvalid | 1;
288 } else if (*us >= 0xc2) {
289 // 2 bytes
290 if (len < 2)
291 return UTF8MaskInvalid | 1;
292 if (UTF8IsTrailByte(us[1])) {
293 return 2;
294 } else {
295 return UTF8MaskInvalid | 1;
297 } else {
298 // 0xc0 .. 0xc1 is overlong encoding
299 // 0x80 .. 0xbf is trail byte
300 return UTF8MaskInvalid | 1;
304 int UTF8DrawBytes(const unsigned char *us, int len) {
305 int utf8StatusNext = UTF8Classify(us, len);
306 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
309 #ifdef SCI_NAMESPACE
311 #endif