2010-04-06 Rodrigo Kumpera <rkumpera@novell.com>
[mono.git] / eglib / src / gutf8.c
blobab6c12ea69c1fc5ad614df6e2e98b73310cf008b
/*
 * gutf8.c: UTF-8 conversion
 *
 * Author:
 *	Atsushi Enomoto <atsushi@ximian.com>
 *
 * (C) 2006 Novell, Inc.
 */
10 #include <stdio.h>
11 #include <glib.h>
13 gpointer error_quark = "ERROR";
15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
18 gpointer
19 g_convert_error_quark ()
21 return error_quark;
24 static gunichar*
25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
27 glong i, u16len, u32len;
28 gunichar2 *u16str;
29 gunichar *u32str;
30 gchar *u8str;
31 GError **err = NULL;
33 u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
34 u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
35 for (i = 0; i < u32len; i++) {
36 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
38 g_free (u16str);
39 u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
40 u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
41 g_free (u32str);
42 g_free (u16str);
43 return (gunichar*)u8str;
46 gchar*
47 g_utf8_strup (const gchar *str, gssize len)
49 return (gchar*)utf8_case_conv (str, len, TRUE);
52 gchar*
53 g_utf8_strdown (const gchar *str, gssize len)
55 return (gchar*)utf8_case_conv (str, len, FALSE);
58 gunichar2*
59 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
61 /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
62 but error check is always done at utf8_to_utf16_len() so that
63 the conversion core below simply resets erroreous bits */
64 glong utf16_len;
65 gunichar2 *ret;
66 guchar ch, mb_size, mb_remain;
67 guint32 codepoint;
68 glong in_pos, out_pos;
70 utf16_len = 0;
71 mb_size = 0;
72 mb_remain = 0;
73 in_pos = 0;
74 out_pos = 0;
76 if (error)
77 *error = NULL;
79 if (items_written)
80 *items_written = 0;
81 utf16_len = utf8_to_utf16_len (str, len, items_read, error);
82 if (error)
83 if (*error)
84 return NULL;
85 if (utf16_len < 0)
86 return NULL;
88 ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
90 for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
91 ch = (guchar) str [in_pos];
92 if (mb_size == 0) {
93 if (ch < 0x80)
94 ret [out_pos++] = ch;
95 else if ((ch & 0xE0) == 0xC0) {
96 codepoint = ch & 0x1F;
97 mb_size = 2;
98 } else if ((ch & 0xF0) == 0xE0) {
99 codepoint = ch & 0x0F;
100 mb_size = 3;
101 } else if ((ch & 0xF8) == 0xF0) {
102 codepoint = ch & 7;
103 mb_size = 4;
104 } else if ((ch & 0xFC) == 0xF8) {
105 codepoint = ch & 3;
106 mb_size = 5;
107 } else if ((ch & 0xFE) == 0xFC) {
108 codepoint = ch & 3;
109 mb_size = 6;
110 } else {
111 /* invalid utf-8 sequence */
112 codepoint = 0;
113 mb_remain = mb_size = 0;
115 if (mb_size > 1)
116 mb_remain = mb_size - 1;
117 } else {
118 if ((ch & 0xC0) == 0x80) {
119 codepoint = (codepoint << 6) | (ch & 0x3F);
120 if (--mb_remain == 0) {
121 /* multi byte character is fully consumed now. */
122 if (codepoint < 0x10000) {
123 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
124 } else if (codepoint < 0x110000) {
125 /* surrogate pair */
126 codepoint -= 0x10000;
127 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
128 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
129 } else {
130 /* invalid utf-8 sequence (excess) */
131 codepoint = 0;
132 mb_remain = 0;
134 mb_size = 0;
136 } else {
137 /* invalid utf-8 sequence */
138 codepoint = 0;
139 mb_remain = mb_size = 0;
144 ret [out_pos] = 0;
145 if (items_written)
146 *items_written = out_pos;
147 return ret;
150 static glong
151 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
153 /* It is almost identical to UTF8Encoding.GetCharCount() */
154 guchar ch, mb_size, mb_remain;
155 gboolean overlong;
156 guint32 codepoint;
157 glong in_pos, ret;
159 mb_size = 0;
160 mb_remain = 0;
161 overlong = 0;
162 in_pos = 0;
163 ret = 0;
165 for (in_pos = 0; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
166 ch = str [in_pos];
167 if (mb_size == 0) {
168 if (ch < 0x80)
169 ret++;
170 else if ((ch & 0xE0) == 0xC0) {
171 codepoint = ch & 0x1F;
172 mb_size = 2;
173 } else if ((ch & 0xF0) == 0xE0) {
174 codepoint = ch & 0x0F;
175 mb_size = 3;
176 } else if ((ch & 0xF8) == 0xF0) {
177 codepoint = ch & 7;
178 mb_size = 4;
179 } else if ((ch & 0xFC) == 0xF8) {
180 codepoint = ch & 3;
181 mb_size = 5;
182 } else if ((ch & 0xFE) == 0xFC) {
183 codepoint = ch & 3;
184 mb_size = 6;
185 } else {
186 /* invalid utf-8 sequence */
187 if (error) {
188 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
189 if (items_read)
190 *items_read = in_pos;
191 return -1;
192 } else {
193 codepoint = 0;
194 mb_remain = mb_size = 0;
197 if (mb_size > 1)
198 mb_remain = mb_size - 1;
199 } else {
200 if ((ch & 0xC0) == 0x80) {
201 codepoint = (codepoint << 6) | (ch & 0x3F);
202 if (--mb_remain == 0) {
203 /* multi byte character is fully consumed now. */
204 if (codepoint < 0x10000) {
205 switch (mb_size) {
206 case 2:
207 overlong = codepoint < 0x7F;
208 break;
209 case 3:
210 overlong = codepoint < 0x7FF;
211 break;
212 case 4:
213 overlong = codepoint < 0xFFFF;
214 break;
215 case 5:
216 overlong = codepoint < 0x1FFFFF;
217 break;
218 case 6:
219 overlong = codepoint < 0x03FFFFFF;
220 break;
222 if (overlong) {
223 /* invalid utf-8 sequence (overlong) */
224 if (error) {
225 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
226 if (items_read)
227 *items_read = in_pos;
228 return -1;
229 } else {
230 codepoint = 0;
231 mb_remain = 0;
232 overlong = FALSE;
235 else
236 ret++;
237 } else if (codepoint < 0x110000) {
238 /* surrogate pair */
239 ret += 2;
240 } else {
241 /* invalid utf-8 sequence (excess) */
242 if (error) {
243 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
244 if (items_read)
245 *items_read = in_pos;
246 return -1;
247 } else {
248 codepoint = 0;
249 mb_remain = 0;
252 mb_size = 0;
254 } else {
255 /* invalid utf-8 sequence */
256 if (error) {
257 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
258 if (items_read)
259 *items_read = in_pos;
260 return -1;
261 } else {
262 codepoint = 0;
263 mb_remain = mb_size = 0;
269 if (items_read)
270 *items_read = in_pos;
271 return ret;
274 gchar*
275 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
277 /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
278 but error check is always done at utf16_to_utf8_len() so that
279 the conversion core below simply resets erroreous bits */
280 glong utf8_len;
281 gchar *ret;
282 glong in_pos, out_pos;
283 gunichar2 ch;
284 guint32 codepoint = 0;
285 gboolean surrogate;
287 in_pos = 0;
288 out_pos = 0;
289 surrogate = FALSE;
291 if (items_written)
292 *items_written = 0;
293 utf8_len = utf16_to_utf8_len (str, len, items_read, error);
294 if (error)
295 if (*error)
296 return NULL;
297 if (utf8_len < 0)
298 return NULL;
300 ret = g_malloc ((1+utf8_len) * sizeof (gchar));
302 while (len < 0 ? str [in_pos] : in_pos < len) {
303 ch = str [in_pos];
304 if (surrogate) {
305 if (ch >= 0xDC00 && ch <= 0xDFFF) {
306 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
307 surrogate = 0;
308 } else {
309 surrogate = 0;
310 /* invalid surrogate pair */
311 continue;
313 } else {
314 /* fast path optimization */
315 if (ch < 0x80) {
316 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
317 if (str [in_pos] < 0x80)
318 ret [out_pos++] = (gchar)(str [in_pos]);
319 else
320 break;
322 continue;
324 else if (ch >= 0xD800 && ch <= 0xDBFF)
325 surrogate = ch;
326 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
327 /* invalid surrogate pair */
328 continue;
330 else
331 codepoint = ch;
333 in_pos++;
335 if (surrogate != 0)
336 continue;
337 if (codepoint < 0x80)
338 ret [out_pos++] = (gchar) codepoint;
339 else if (codepoint < 0x0800) {
340 ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
341 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
342 } else if (codepoint < 0x10000) {
343 ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
344 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
345 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
346 } else {
347 ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
348 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
349 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
350 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
353 ret [out_pos] = 0;
355 if (items_written)
356 *items_written = out_pos;
357 return ret;
360 static glong
361 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
363 glong ret, in_pos;
364 gunichar2 ch;
365 gboolean surrogate;
367 ret = 0;
368 in_pos = 0;
369 surrogate = FALSE;
371 while (len < 0 ? str [in_pos] : in_pos < len) {
372 ch = str [in_pos];
373 if (surrogate) {
374 if (ch >= 0xDC00 && ch <= 0xDFFF) {
375 ret += 4;
376 } else {
377 /* invalid surrogate pair */
378 if (error) {
379 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
380 if (items_read)
381 *items_read = in_pos;
382 return -1;
383 } /* otherwise just ignore. */
385 surrogate = FALSE;
386 } else {
387 /* fast path optimization */
388 if (ch < 0x80) {
389 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
390 if (str [in_pos] < 0x80)
391 ++ret;
392 else
393 break;
395 continue;
397 else if (ch < 0x0800)
398 ret += 2;
399 else if (ch >= 0xD800 && ch <= 0xDBFF)
400 surrogate = TRUE;
401 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
402 /* invalid surrogate pair */
403 if (error) {
404 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
405 if (items_read)
406 *items_read = in_pos;
407 return -1;
408 } /* otherwise just ignore. */
410 else
411 ret += 3;
413 in_pos++;
416 if (items_read)
417 *items_read = in_pos;
418 return ret;
421 static glong
422 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
424 glong retlen = 0;
425 glong errindex = 0;
426 const gunichar *lstr = str;
428 if (!str)
429 return 0;
431 while (*lstr != '\0' && len--) {
432 gunichar ch;
433 ch = *lstr++;
434 if (ch <= 0x0000FFFF) {
435 if (ch >= 0xD800 && ch <= 0xDFFF) {
436 errindex = (glong)(lstr - str)-1;
437 if (error)
438 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
439 "Invalid sequence in conversion input");
440 if (items_read)
441 *items_read = errindex;
442 return 0;
443 } else {
444 retlen++;
446 } else if (ch > 0x10FFFF) {
447 errindex = (glong)(lstr - str)-1;
448 if (error)
449 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
450 "Character out of range for UTF-16");
451 if (items_read)
452 *items_read = errindex;
453 return 0;
455 } else {
456 retlen+=2;
460 if (items_read)
461 *items_read = (glong)(lstr - str);
462 return retlen;
465 gunichar2*
466 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
468 glong allocsz;
469 gunichar2 *retstr = 0;
470 gunichar2 *retch = 0;
471 glong nwritten = 0;
472 GError *lerror =0 ;
474 allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
476 if (!lerror) {
477 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
478 retstr[allocsz] = '\0';
480 while (*str != '\0' && len--) {
481 gunichar ch;
482 ch = *str++;
483 if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
484 *retch++ = (gunichar2)ch;
485 nwritten ++;
486 } else {
487 ch -= 0x0010000UL;
488 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
489 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
490 nwritten +=2;
495 if (items_written)
496 *items_written = nwritten;
497 if (error)
498 *error = lerror;
500 return retstr;
503 static glong
504 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
506 glong retlen = 0;
507 glong errindex = 0;
508 const gunichar2 *lstr = str;
509 gunichar2 ch,ch2;
511 if (!str)
512 return 0;
514 while (*lstr != '\0' && len--) {
515 ch = *lstr++;
516 if (ch >= 0xD800 && ch <= 0xDBFF) {
517 if (!len--) {
518 lstr--;
519 break;
521 ch2 = *lstr;
522 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
523 lstr++;
524 } else {
525 errindex = (glong)(lstr - str);
526 if (error)
527 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
528 "Invalid sequence in conversion input");
529 if (items_read)
530 *items_read = errindex;
531 return 0;
533 } else {
534 if (ch >= 0xDC00 && ch <= 0xDFFF) {
535 errindex = (glong)(lstr - str)-1;
536 if (error)
537 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
538 "Invalid sequence in conversion input");
539 if (items_read)
540 *items_read = errindex;
541 return 0;
544 retlen++;
547 if (items_read)
548 *items_read = (glong)(lstr - str);
550 return retlen;
553 gunichar*
554 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
556 glong allocsz;
557 gunichar *retstr = 0;
558 gunichar *retch = 0;
559 glong nwritten = 0;
560 GError *lerror =0 ;
561 gunichar ch,ch2;
563 allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
565 if (!lerror) {
566 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
567 retstr[allocsz] = '\0';
568 nwritten = allocsz;
570 while (*str != '\0' && allocsz--) {
571 ch = *str++;
572 if (ch >= 0xD800 && ch <= 0xDBFF) {
573 ch2 = *str++;
574 ch = ((ch - (gunichar)0xD800) << 10)
575 + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
577 *retch++ = ch;
581 if (items_written)
582 *items_written = nwritten;
583 if (error)
584 *error = lerror;
586 return retstr;