Merge pull request #826 from kugel-/doxygen-fixes2
[geany-mirror.git] / scintilla / src / UniConversion.cxx
blobc12ca34c2c4d637eaa928e41a193303d8d4ef6ef
1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
8 #include <stdlib.h>
10 #include <stdexcept>
12 #include "UniConversion.h"
14 #ifdef SCI_NAMESPACE
15 using namespace Scintilla;
16 #endif
18 #ifdef SCI_NAMESPACE
19 namespace Scintilla {
20 #endif
// UTF-16 surrogate ranges and the first code point beyond the Basic Multilingual Plane.
// Lead (high) surrogates occupy 0xD800..0xDBFF and trail (low) surrogates 0xDC00..0xDFFF.
// SURROGATE_LEAD_FIRST is required by the conversion functions in this file.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };
26 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
27 unsigned int len = 0;
28 for (unsigned int i = 0; i < tlen && uptr[i];) {
29 unsigned int uch = uptr[i];
30 if (uch < 0x80) {
31 len++;
32 } else if (uch < 0x800) {
33 len += 2;
34 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
35 (uch <= SURROGATE_TRAIL_LAST)) {
36 len += 4;
37 i++;
38 } else {
39 len += 3;
41 i++;
43 return len;
46 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
47 unsigned int k = 0;
48 for (unsigned int i = 0; i < tlen && uptr[i];) {
49 unsigned int uch = uptr[i];
50 if (uch < 0x80) {
51 putf[k++] = static_cast<char>(uch);
52 } else if (uch < 0x800) {
53 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
54 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
55 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
56 (uch <= SURROGATE_TRAIL_LAST)) {
57 // Half a surrogate pair
58 i++;
59 unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
60 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
61 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
62 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
63 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
64 } else {
65 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
66 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
67 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
69 i++;
71 if (k < len)
72 putf[k] = '\0';
// Give the number of bytes in a UTF-8 sequence judging only by its first byte.
// Bytes below 0x80 give 1, 0x80..0xDF give 2, 0xE0..0xEF give 3 and everything else gives 4.
// No validation is performed, so trail bytes (0x80..0xBF) also report 2.
unsigned int UTF8CharLength(unsigned char ch) {
	if (ch >= 0xF0)
		return 4;
	if (ch >= 0xE0)
		return 3;
	if (ch >= 0x80)
		return 2;
	return 1;
}
// Count the UTF-16 code units needed to hold the UTF-8 text s of len bytes.
// Sequence lengths are taken from lead bytes alone, without validation.
// A 4 byte sequence needs a surrogate pair so counts as two units; all others count as one.
size_t UTF16Length(const char *s, size_t len) {
	size_t units = 0;
	size_t pos = 0;
	while (pos < len) {
		const unsigned char lead = static_cast<unsigned char>(s[pos]);
		if (lead >= 0xF0) {
			// Outside the BMP: two code units
			pos += 4;
			units += 2;
		} else if (lead >= 0xE0) {
			pos += 3;
			units += 1;
		} else if (lead >= 0x80) {
			pos += 2;
			units += 1;
		} else {
			pos += 1;
			units += 1;
		}
	}
	return units;
}
108 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
109 size_t ui = 0;
110 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
111 size_t i = 0;
112 while ((i<len) && (ui<tlen)) {
113 unsigned char ch = us[i++];
114 if (ch < 0x80) {
115 tbuf[ui] = ch;
116 } else if (ch < 0x80 + 0x40 + 0x20) {
117 tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
118 ch = us[i++];
119 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
120 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
121 tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
122 ch = us[i++];
123 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
124 ch = us[i++];
125 tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
126 } else {
127 // Outside the BMP so need two surrogates
128 int val = (ch & 0x7) << 18;
129 ch = us[i++];
130 val += (ch & 0x3F) << 12;
131 ch = us[i++];
132 val += (ch & 0x3F) << 6;
133 ch = us[i++];
134 val += (ch & 0x3F);
135 tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
136 ui++;
137 tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
139 ui++;
141 return ui;
// Convert UTF-8 text to UTF-32.
// s/len: the UTF-8 bytes to convert; sequence lengths are taken from lead bytes without validation.
// tbuf/tlen: destination buffer and its capacity in code points.
// Returns the number of values written to tbuf.
// A multi-byte lead byte without enough bytes following it is stored as the value 0
// and only that one byte is consumed.
unsigned int UTF32FromUTF8(const char *s, unsigned int len, unsigned int *tbuf, unsigned int tlen) {
	const unsigned char *bytes = reinterpret_cast<const unsigned char *>(s);
	unsigned int written = 0;
	unsigned int pos = 0;
	for (; (pos < len) && (written < tlen); written++) {
		const unsigned char lead = bytes[pos++];
		// Bytes still available after the lead byte
		const unsigned int remaining = len - pos;
		unsigned int codePoint = 0;
		if (lead < 0x80) {
			codePoint = lead;
		} else if ((lead < 0x80 + 0x40 + 0x20) && (remaining >= 1)) {
			codePoint = (lead & 0x1F) << 6;
			codePoint += bytes[pos++] & 0x7F;
		} else if ((lead < 0x80 + 0x40 + 0x20 + 0x10) && (remaining >= 2)) {
			codePoint = (lead & 0xF) << 12;
			codePoint += (bytes[pos++] & 0x7F) << 6;
			codePoint += bytes[pos++] & 0x7F;
		} else if (remaining >= 3) {
			codePoint = (lead & 0x7) << 18;
			codePoint += (bytes[pos++] & 0x3F) << 12;
			codePoint += (bytes[pos++] & 0x3F) << 6;
			codePoint += bytes[pos++] & 0x3F;
		}
		tbuf[written] = codePoint;
	}
	return written;
}
178 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) {
179 if (val < SUPPLEMENTAL_PLANE_FIRST) {
180 tbuf[0] = static_cast<wchar_t>(val);
181 return 1;
182 } else {
183 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
184 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
185 return 2;
// Table giving, for each possible first byte, the length in bytes of the UTF-8
// sequence it starts. Filled in by UTF8BytesOfLeadInitialise.
int UTF8BytesOfLead[256];
// Set once the table has been filled so later initialise calls do nothing.
static bool initialisedBytesOfLead = false;

// Length of the sequence started by leadByte.
// Only 0xC2..0xF4 start multi-byte sequences; every other value, whether a single
// byte character, an invalid lead or a lead for a sequence longer than 4 bytes, gives 1.
static int BytesFromLead(int leadByte) {
	if ((leadByte >= 0xC2) && (leadByte < 0xF5)) {
		if (leadByte >= 0xF0)
			return 4;
		if (leadByte >= 0xE0)
			return 3;
		return 2;
	}
	return 1;
}

// Fill UTF8BytesOfLead on the first call; subsequent calls return immediately.
void UTF8BytesOfLeadInitialise() {
	if (initialisedBytesOfLead)
		return;
	for (int lead = 0; lead < 256; lead++) {
		UTF8BytesOfLead[lead] = BytesFromLead(lead);
	}
	initialisedBytesOfLead = true;
}
217 // Return both the width of the first character in the string and a status
218 // saying whether it is valid or invalid.
219 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
220 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
221 // reasonably treated as code points in some circumstances. They will, however,
222 // not have associated glyphs.
223 int UTF8Classify(const unsigned char *us, int len) {
224 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
225 if (*us < 0x80) {
226 // Single bytes easy
227 return 1;
228 } else if (*us > 0xf4) {
229 // Characters longer than 4 bytes not possible in current UTF-8
230 return UTF8MaskInvalid | 1;
231 } else if (*us >= 0xf0) {
232 // 4 bytes
233 if (len < 4)
234 return UTF8MaskInvalid | 1;
235 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
236 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
237 // *FFFE or *FFFF non-character
238 return UTF8MaskInvalid | 4;
240 if (*us == 0xf4) {
241 // Check if encoding a value beyond the last Unicode character 10FFFF
242 if (us[1] > 0x8f) {
243 return UTF8MaskInvalid | 1;
244 } else if (us[1] == 0x8f) {
245 if (us[2] > 0xbf) {
246 return UTF8MaskInvalid | 1;
247 } else if (us[2] == 0xbf) {
248 if (us[3] > 0xbf) {
249 return UTF8MaskInvalid | 1;
253 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
254 // Overlong
255 return UTF8MaskInvalid | 1;
257 return 4;
258 } else {
259 return UTF8MaskInvalid | 1;
261 } else if (*us >= 0xe0) {
262 // 3 bytes
263 if (len < 3)
264 return UTF8MaskInvalid | 1;
265 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
266 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
267 // Overlong
268 return UTF8MaskInvalid | 1;
270 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
271 // Surrogate
272 return UTF8MaskInvalid | 1;
274 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
275 // U+FFFE non-character - 3 bytes long
276 return UTF8MaskInvalid | 3;
278 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
279 // U+FFFF non-character - 3 bytes long
280 return UTF8MaskInvalid | 3;
282 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
283 // U+FDD0 .. U+FDEF
284 return UTF8MaskInvalid | 3;
286 return 3;
287 } else {
288 return UTF8MaskInvalid | 1;
290 } else if (*us >= 0xc2) {
291 // 2 bytes
292 if (len < 2)
293 return UTF8MaskInvalid | 1;
294 if (UTF8IsTrailByte(us[1])) {
295 return 2;
296 } else {
297 return UTF8MaskInvalid | 1;
299 } else {
300 // 0xc0 .. 0xc1 is overlong encoding
301 // 0x80 .. 0xbf is trail byte
302 return UTF8MaskInvalid | 1;
306 int UTF8DrawBytes(const unsigned char *us, int len) {
307 int utf8StatusNext = UTF8Classify(us, len);
308 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
311 #ifdef SCI_NAMESPACE
313 #endif