Implement 99% of what moonlight needs to run on eglib
[mono-project/dkf.git] / eglib / src / gutf8.c
blob4ed723fe74ea95cbdf3695eaa5fd2bd92378b242
1 /*
2 * gutf8.c: UTF-8 conversion
4 * Author:
5 * Atsushi Enomoto <atsushi@ximian.com>
7 * (C) 2006 Novell, Inc.
8 */
10 #include <stdio.h>
11 #include <glib.h>
13 gpointer error_quark = "ERROR";
15 static glong utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error);
16 static glong utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error);
18 gpointer
19 g_convert_error_quark ()
21 return error_quark;
24 static gunichar*
25 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
27 glong i, u16len, u32len;
28 gunichar2 *u16str;
29 gunichar *u32str;
30 gchar *u8str;
31 GError **err = NULL;
33 u16str = g_utf8_to_utf16 (str, (glong)len, NULL, &u16len, err);
34 u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
35 for (i = 0; i < u32len; i++) {
36 u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
38 g_free (u16str);
39 u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
40 u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
41 g_free (u32str);
42 g_free (u16str);
43 return (gunichar*)u8str;
46 gchar*
47 g_utf8_strup (const gchar *str, gssize len)
49 return (gchar*)utf8_case_conv (str, len, TRUE);
52 gchar*
53 g_utf8_strdown (const gchar *str, gssize len)
55 return (gchar*)utf8_case_conv (str, len, FALSE);
58 static glong
59 utf8_to_utf16_len (const gchar *str, glong len, glong *items_read, GError **error)
61 /* It is almost identical to UTF8Encoding.GetCharCount() */
62 guchar ch, mb_size, mb_remain;
63 gboolean overlong;
64 guint32 codepoint;
65 glong in_pos, ret;
67 if (len < 0)
68 len = (glong) strlen (str);
70 in_pos = 0;
71 ret = 0;
73 /* Common case */
74 for (in_pos = 0; in_pos < len && (guchar) str [in_pos] < 0x80; in_pos++)
75 ret ++;
77 if (in_pos == len) {
78 if (items_read)
79 *items_read = in_pos;
80 return ret;
83 mb_size = 0;
84 mb_remain = 0;
85 overlong = 0;
87 for (; in_pos < len; in_pos++) {
88 ch = str [in_pos];
89 if (mb_size == 0) {
90 if (ch < 0x80)
91 ret++;
92 else if ((ch & 0xE0) == 0xC0) {
93 codepoint = ch & 0x1F;
94 mb_size = 2;
95 } else if ((ch & 0xF0) == 0xE0) {
96 codepoint = ch & 0x0F;
97 mb_size = 3;
98 } else if ((ch & 0xF8) == 0xF0) {
99 codepoint = ch & 7;
100 mb_size = 4;
101 } else if ((ch & 0xFC) == 0xF8) {
102 codepoint = ch & 3;
103 mb_size = 5;
104 } else if ((ch & 0xFE) == 0xFC) {
105 codepoint = ch & 3;
106 mb_size = 6;
107 } else {
108 /* invalid utf-8 sequence */
109 if (error) {
110 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal first byte)", in_pos);
111 if (items_read)
112 *items_read = in_pos;
113 return -1;
114 } else {
115 codepoint = 0;
116 mb_remain = mb_size = 0;
119 if (mb_size > 1)
120 mb_remain = mb_size - 1;
121 } else {
122 if ((ch & 0xC0) == 0x80) {
123 codepoint = (codepoint << 6) | (ch & 0x3F);
124 if (--mb_remain == 0) {
125 /* multi byte character is fully consumed now. */
126 if (codepoint < 0x10000) {
127 switch (mb_size) {
128 case 2:
129 overlong = codepoint < 0x7F;
130 break;
131 case 3:
132 overlong = codepoint < 0x7FF;
133 break;
134 case 4:
135 overlong = codepoint < 0xFFFF;
136 break;
137 case 5:
138 overlong = codepoint < 0x1FFFFF;
139 break;
140 case 6:
141 overlong = codepoint < 0x03FFFFFF;
142 break;
144 if (overlong) {
145 /* invalid utf-8 sequence (overlong) */
146 if (error) {
147 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (overlong)", in_pos);
148 if (items_read)
149 *items_read = in_pos;
150 return -1;
151 } else {
152 codepoint = 0;
153 mb_remain = 0;
154 overlong = FALSE;
157 else
158 ret++;
159 } else if (codepoint < 0x110000) {
160 /* surrogate pair */
161 ret += 2;
162 } else {
163 /* invalid utf-8 sequence (excess) */
164 if (error) {
165 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos);
166 if (items_read)
167 *items_read = in_pos;
168 return -1;
169 } else {
170 codepoint = 0;
171 mb_remain = 0;
174 mb_size = 0;
176 } else {
177 /* invalid utf-8 sequence */
178 if (error) {
179 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos);
180 if (items_read)
181 *items_read = in_pos;
182 return -1;
183 } else {
184 codepoint = 0;
185 mb_remain = mb_size = 0;
191 if (items_read)
192 *items_read = in_pos;
193 return ret;
196 gunichar2*
197 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
199 /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
200 but error check is always done at utf8_to_utf16_len() so that
201 the conversion core below simply resets erroreous bits */
202 glong utf16_len;
203 gunichar2 *ret;
204 guchar ch, mb_size, mb_remain;
205 guint32 codepoint;
206 glong in_pos, out_pos;
208 utf16_len = 0;
209 mb_size = 0;
210 mb_remain = 0;
211 in_pos = 0;
212 out_pos = 0;
214 if (error)
215 *error = NULL;
217 if (len < 0)
218 len = (glong) strlen (str);
220 if (items_read)
221 *items_read = 0;
222 if (items_written)
223 *items_written = 0;
224 utf16_len = utf8_to_utf16_len (str, len, items_read, error);
225 if (error)
226 if (*error)
227 return NULL;
228 if (utf16_len < 0)
229 return NULL;
231 ret = g_malloc ((1 + utf16_len) * sizeof (gunichar2));
233 /* Common case */
234 for (in_pos = 0; in_pos < len; in_pos++) {
235 ch = (guchar) str [in_pos];
237 if (ch >= 0x80)
238 break;
239 ret [out_pos++] = ch;
242 for (; in_pos < len; in_pos++) {
243 ch = (guchar) str [in_pos];
244 if (mb_size == 0) {
245 if (ch < 0x80)
246 ret [out_pos++] = ch;
247 else if ((ch & 0xE0) == 0xC0) {
248 codepoint = ch & 0x1F;
249 mb_size = 2;
250 } else if ((ch & 0xF0) == 0xE0) {
251 codepoint = ch & 0x0F;
252 mb_size = 3;
253 } else if ((ch & 0xF8) == 0xF0) {
254 codepoint = ch & 7;
255 mb_size = 4;
256 } else if ((ch & 0xFC) == 0xF8) {
257 codepoint = ch & 3;
258 mb_size = 5;
259 } else if ((ch & 0xFE) == 0xFC) {
260 codepoint = ch & 3;
261 mb_size = 6;
262 } else {
263 /* invalid utf-8 sequence */
264 codepoint = 0;
265 mb_remain = mb_size = 0;
267 if (mb_size > 1)
268 mb_remain = mb_size - 1;
269 } else {
270 if ((ch & 0xC0) == 0x80) {
271 codepoint = (codepoint << 6) | (ch & 0x3F);
272 if (--mb_remain == 0) {
273 /* multi byte character is fully consumed now. */
274 if (codepoint < 0x10000) {
275 ret [out_pos++] = (gunichar2)(codepoint % 0x10000);
276 } else if (codepoint < 0x110000) {
277 /* surrogate pair */
278 codepoint -= 0x10000;
279 ret [out_pos++] = (gunichar2)((codepoint >> 10) + 0xD800);
280 ret [out_pos++] = (gunichar2)((codepoint & 0x3FF) + 0xDC00);
281 } else {
282 /* invalid utf-8 sequence (excess) */
283 codepoint = 0;
284 mb_remain = 0;
286 mb_size = 0;
288 } else {
289 /* invalid utf-8 sequence */
290 codepoint = 0;
291 mb_remain = mb_size = 0;
296 ret [out_pos] = 0;
297 if (items_written)
298 *items_written = out_pos;
299 return ret;
302 gchar*
303 g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
305 /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
306 but error check is always done at utf16_to_utf8_len() so that
307 the conversion core below simply resets erroreous bits */
308 glong utf8_len;
309 gchar *ret;
310 glong in_pos, out_pos;
311 gunichar2 ch;
312 guint32 codepoint = 0;
313 gboolean surrogate;
315 in_pos = 0;
316 out_pos = 0;
317 surrogate = FALSE;
319 if (items_read)
320 *items_read = 0;
321 if (items_written)
322 *items_written = 0;
323 utf8_len = utf16_to_utf8_len (str, len, items_read, error);
324 if (error)
325 if (*error)
326 return NULL;
327 if (utf8_len < 0)
328 return NULL;
330 ret = g_malloc ((1+utf8_len) * sizeof (gchar));
332 while (len < 0 ? str [in_pos] : in_pos < len) {
333 ch = str [in_pos];
334 if (surrogate) {
335 if (ch >= 0xDC00 && ch <= 0xDFFF) {
336 codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
337 surrogate = 0;
338 } else {
339 surrogate = 0;
340 /* invalid surrogate pair */
341 ++in_pos;
342 continue;
344 } else {
345 /* fast path optimization */
346 if (ch < 0x80) {
347 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
348 if (str [in_pos] < 0x80)
349 ret [out_pos++] = (gchar)(str [in_pos]);
350 else
351 break;
353 continue;
355 else if (ch >= 0xD800 && ch <= 0xDBFF)
356 surrogate = ch;
357 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
358 ++in_pos;
359 /* invalid surrogate pair */
360 continue;
362 else
363 codepoint = ch;
365 in_pos++;
367 if (surrogate != 0)
368 continue;
369 if (codepoint < 0x80)
370 ret [out_pos++] = (gchar) codepoint;
371 else if (codepoint < 0x0800) {
372 ret [out_pos++] = (gchar) (0xC0 | (codepoint >> 6));
373 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
374 } else if (codepoint < 0x10000) {
375 ret [out_pos++] = (gchar) (0xE0 | (codepoint >> 12));
376 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
377 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
378 } else {
379 ret [out_pos++] = (gchar) (0xF0 | (codepoint >> 18));
380 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 12) & 0x3F));
381 ret [out_pos++] = (gchar) (0x80 | ((codepoint >> 6) & 0x3F));
382 ret [out_pos++] = (gchar) (0x80 | (codepoint & 0x3F));
385 ret [out_pos] = 0;
387 if (items_written)
388 *items_written = out_pos;
389 return ret;
392 static glong
393 utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
395 glong ret, in_pos;
396 gunichar2 ch;
397 gboolean surrogate;
399 ret = 0;
400 in_pos = 0;
401 surrogate = FALSE;
403 while (len < 0 ? str [in_pos] : in_pos < len) {
404 ch = str [in_pos];
405 if (surrogate) {
406 if (ch >= 0xDC00 && ch <= 0xDFFF) {
407 ret += 4;
408 } else {
409 /* invalid surrogate pair */
410 if (error) {
411 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos);
412 if (items_read)
413 *items_read = in_pos;
414 return -1;
415 } /* otherwise just ignore. */
417 surrogate = FALSE;
418 } else {
419 /* fast path optimization */
420 if (ch < 0x80) {
421 for (; len < 0 ? str [in_pos] : in_pos < len; in_pos++) {
422 if (str [in_pos] < 0x80)
423 ++ret;
424 else
425 break;
427 continue;
429 else if (ch < 0x0800)
430 ret += 2;
431 else if (ch >= 0xD800 && ch <= 0xDBFF)
432 surrogate = TRUE;
433 else if (ch >= 0xDC00 && ch <= 0xDFFF) {
434 /* invalid surrogate pair */
435 if (error) {
436 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos);
437 if (items_read)
438 *items_read = in_pos;
439 return -1;
440 } /* otherwise just ignore. */
442 else
443 ret += 3;
445 in_pos++;
448 if (items_read)
449 *items_read = in_pos;
450 return ret;
453 gchar *
454 g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
456 gchar *outbuf, *outptr;
457 glong nwritten = 0;
458 glong i;
459 gint n;
461 if (len == -1) {
462 for (i = 0; str[i] != 0; i++) {
463 if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
464 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
465 "Invalid sequence in conversion input");
467 if (items_read)
468 *items_read = i;
470 return NULL;
473 nwritten += n;
475 } else {
476 for (i = 0; i < len; i++) {
477 if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
478 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
479 "Invalid sequence in conversion input");
481 if (items_read)
482 *items_read = i;
484 return NULL;
487 nwritten += n;
491 outptr = outbuf = g_malloc (nwritten + 1);
492 if (len == -1) {
493 for (i = 0; str[i] != 0; i++)
494 outptr += g_unichar_to_utf8 (str[i], outptr);
495 } else {
496 for (i = 0; i < len; i++)
497 outptr += g_unichar_to_utf8 (str[i], outptr);
499 *outptr = '\0';
501 if (items_written)
502 *items_written = nwritten;
504 if (items_read != 0)
505 *items_read = i;
507 return outbuf;
510 static glong
511 g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
513 glong retlen = 0;
514 glong errindex = 0;
515 const gunichar *lstr = str;
517 if (!str)
518 return 0;
520 while (*lstr != '\0' && len--) {
521 gunichar ch;
522 ch = *lstr++;
523 if (ch <= 0x0000FFFF) {
524 if (ch >= 0xD800 && ch <= 0xDFFF) {
525 errindex = (glong)(lstr - str)-1;
526 if (error)
527 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
528 "Invalid sequence in conversion input");
529 if (items_read)
530 *items_read = errindex;
531 return 0;
532 } else {
533 retlen++;
535 } else if (ch > 0x10FFFF) {
536 errindex = (glong)(lstr - str)-1;
537 if (error)
538 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
539 "Character out of range for UTF-16");
540 if (items_read)
541 *items_read = errindex;
542 return 0;
544 } else {
545 retlen+=2;
549 if (items_read)
550 *items_read = (glong)(lstr - str);
551 return retlen;
554 gunichar2*
555 g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
557 glong allocsz;
558 gunichar2 *retstr = 0;
559 gunichar2 *retch = 0;
560 glong nwritten = 0;
561 GError *lerror =0 ;
563 allocsz = g_ucs4_to_utf16_len (str, len, items_read, &lerror);
565 if (!lerror) {
566 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar2));
567 retstr[allocsz] = '\0';
569 while (*str != '\0' && len--) {
570 gunichar ch;
571 ch = *str++;
572 if (ch <= 0x0000FFFF && (ch < 0xD800 || ch > 0xDFFF)) {
573 *retch++ = (gunichar2)ch;
574 nwritten ++;
575 } else {
576 ch -= 0x0010000UL;
577 *retch++ = (gunichar2)((ch >> 10) + 0xD800);
578 *retch++ = (gunichar2)((ch & 0x3FFUL) + 0xDC00);
579 nwritten +=2;
584 if (items_written)
585 *items_written = nwritten;
586 if (error)
587 *error = lerror;
589 return retstr;
592 static glong
593 g_utf16_to_ucs4_len (const gunichar2 *str, glong len, glong *items_read, GError **error)
595 glong retlen = 0;
596 glong errindex = 0;
597 const gunichar2 *lstr = str;
598 gunichar2 ch,ch2;
600 if (!str)
601 return 0;
603 while (*lstr != '\0' && len--) {
604 ch = *lstr++;
605 if (ch >= 0xD800 && ch <= 0xDBFF) {
606 if (!len--) {
607 lstr--;
608 break;
610 ch2 = *lstr;
611 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
612 lstr++;
613 } else {
614 errindex = (glong)(lstr - str);
615 if (error)
616 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
617 "Invalid sequence in conversion input");
618 if (items_read)
619 *items_read = errindex;
620 return 0;
622 } else {
623 if (ch >= 0xDC00 && ch <= 0xDFFF) {
624 errindex = (glong)(lstr - str)-1;
625 if (error)
626 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
627 "Invalid sequence in conversion input");
628 if (items_read)
629 *items_read = errindex;
630 return 0;
633 retlen++;
636 if (items_read)
637 *items_read = (glong)(lstr - str);
639 return retlen;
642 gunichar*
643 g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error)
645 glong allocsz;
646 gunichar *retstr = 0;
647 gunichar *retch = 0;
648 glong nwritten = 0;
649 GError *lerror =0 ;
650 gunichar ch,ch2;
652 allocsz = g_utf16_to_ucs4_len (str, len, items_read, &lerror);
654 if (!lerror) {
655 retch = retstr = g_malloc ((allocsz+1) * sizeof (gunichar));
656 retstr[allocsz] = '\0';
657 nwritten = allocsz;
659 while (*str != '\0' && allocsz--) {
660 ch = *str++;
661 if (ch >= 0xD800 && ch <= 0xDBFF) {
662 ch2 = *str++;
663 ch = ((ch - (gunichar)0xD800) << 10)
664 + (ch2 - (gunichar)0xDC00) + (gunichar)0x0010000UL;
666 *retch++ = ch;
670 if (items_written)
671 *items_written = nwritten;
672 if (error)
673 *error = lerror;
675 return retstr;
678 gchar *
679 g_utf8_offset_to_pointer (const gchar *str, glong offset)
681 if (offset == 0)
682 return str;
684 if (offset > 0) {
685 gchar *p = (gchar*)str;
686 do {
687 p = g_utf8_next_char (p);
688 offset --;
689 } while (offset > 0);
691 return p;
693 else {
694 // MOONLIGHT_FIXME
695 g_assert_not_reached();
699 glong
700 g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
702 const gchar *inptr, *inend;
703 glong offset = 0;
704 glong sign = 1;
706 if (pos == str)
707 return 0;
709 if (str < pos) {
710 inptr = str;
711 inend = pos;
712 } else {
713 inptr = pos;
714 inend = str;
715 sign = -1;
718 do {
719 inptr = g_utf8_next_char (inptr);
720 offset++;
721 } while (inptr < inend);
723 return offset * sign;
726 gunichar*
727 g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
729 gunichar* ucs4;
730 int ucs4_index;
731 const char *p;
732 int mb_size;
733 gunichar codepoint;
735 g_return_val_if_fail (str != NULL, NULL);
737 if (len < 0) {
738 /* we need to find the length of str, as len < 0 means it must be 0 terminated */
740 len = 0;
741 p = str;
742 while (*p) {
743 len ++;
744 p = g_utf8_next_char(p);
748 ucs4 = g_malloc (sizeof(gunichar)*len);
749 if (items_written)
750 *items_written = len;
752 p = str;
753 ucs4_index = 0;
754 while (len) {
755 guint8 c = *p++;
758 if (c < 0x80) {
759 mb_size = 1;
761 else if ((c & 0xE0) == 0xC0) {
762 c &= 0x1f;
764 mb_size = 2;
766 else if ((c & 0xF0) == 0xE0) {
767 c &= 0x0f;
768 mb_size = 3;
770 else if ((c & 0xF8) == 0xF0) {
771 c &= 0x07;
772 mb_size = 4;
774 else if ((c & 0xFC) == 0xF8) {
775 c &= 0x03;
776 mb_size = 5;
778 else if ((c & 0xFE) == 0xFC) {
779 c &= 0x01;
780 mb_size = 6;
783 codepoint = c;
784 while (--mb_size) {
785 codepoint <<= 6 | ((*p)&0x3f);
786 p++;
789 ucs4[ucs4_index++] = codepoint;
790 len --;
793 return ucs4;
797 * from http://home.tiscali.nl/t876506/utf8tbl.html
799 * From Unicode UCS-4 to UTF-8:
800 * Start with the Unicode number expressed as a decimal number and call this ud.
802 * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
804 * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
805 * byte 1 = 192 + (ud div 64)
806 * byte 2 = 128 + (ud mod 64)
808 * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
809 * byte 1 = 224 + (ud div 4096)
810 * byte 2 = 128 + ((ud div 64) mod 64)
811 * byte 3 = 128 + (ud mod 64)
813 * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
814 * byte 1 = 240 + (ud div 262144)
815 * byte 2 = 128 + ((ud div 4096) mod 64)
816 * byte 3 = 128 + ((ud div 64) mod 64)
817 * byte 4 = 128 + (ud mod 64)
819 * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
820 * byte 1 = 248 + (ud div 16777216)
821 * byte 2 = 128 + ((ud div 262144) mod 64)
822 * byte 3 = 128 + ((ud div 4096) mod 64)
823 * byte 4 = 128 + ((ud div 64) mod 64)
824 * byte 5 = 128 + (ud mod 64)
826 * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
827 * byte 1 = 252 + (ud div 1073741824)
828 * byte 2 = 128 + ((ud div 16777216) mod 64)
829 * byte 3 = 128 + ((ud div 262144) mod 64)
830 * byte 4 = 128 + ((ud div 4096) mod 64)
831 * byte 5 = 128 + ((ud div 64) mod 64)
832 * byte 6 = 128 + (ud mod 64)
834 gint
835 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
837 gint len, i;
838 char base;
840 if (c < 128UL) {
841 base = 0;
842 len = 1;
843 } else if (c < 2048UL) {
844 base = 192;
845 len = 2;
846 } else if (c < 65536UL) {
847 base = 224;
848 len = 3;
849 } else if (c < 2097152UL) {
850 base = 240;
851 len = 4;
852 } else if (c < 67108864UL) {
853 base = 248;
854 len = 5;
855 } else if (c < 2147483648UL) {
856 base = 252;
857 len = 6;
858 } else
859 return -1;
861 if (outbuf != NULL) {
862 for (i = len - 1; i > 0; i--) {
863 /* mask off 6 bits worth and add 128 */
864 outbuf[i] = 128 + (c & 0x3f);
865 c >>= 6;
868 /* first character has a different base */
869 outbuf[0] = base + (c & 0x3f);
872 return len;