2 * gutf8.c: UTF-8 conversion
5 * Atsushi Enomoto <atsushi@ximian.com>
7 * (C) 2006 Novell, Inc.
13 gpointer error_quark
= "ERROR";
15 static glong
utf8_to_utf16_len (const gchar
*str
, glong len
, glong
*items_read
, GError
**error
);
16 static glong
utf16_to_utf8_len (const gunichar2
*str
, glong len
, glong
*items_read
, GError
**error
);
19 g_convert_error_quark ()
25 utf8_case_conv (const gchar
*str
, gssize len
, gboolean upper
)
27 glong i
, u16len
, u32len
;
33 u16str
= g_utf8_to_utf16 (str
, (glong
)len
, NULL
, &u16len
, err
);
34 u32str
= g_utf16_to_ucs4 (u16str
, u16len
, NULL
, &u32len
, err
);
35 for (i
= 0; i
< u32len
; i
++) {
36 u32str
[i
] = upper
? g_unichar_toupper (u32str
[i
]) : g_unichar_tolower (u32str
[i
]);
39 u16str
= g_ucs4_to_utf16 (u32str
, u32len
, NULL
, &u16len
, err
);
40 u8str
= g_utf16_to_utf8 (u16str
, u16len
, NULL
, NULL
, err
);
43 return (gunichar
*)u8str
;
47 g_utf8_strup (const gchar
*str
, gssize len
)
49 return (gchar
*)utf8_case_conv (str
, len
, TRUE
);
53 g_utf8_strdown (const gchar
*str
, gssize len
)
55 return (gchar
*)utf8_case_conv (str
, len
, FALSE
);
59 utf8_to_utf16_len (const gchar
*str
, glong len
, glong
*items_read
, GError
**error
)
61 /* It is almost identical to UTF8Encoding.GetCharCount() */
62 guchar ch
, mb_size
, mb_remain
;
68 len
= (glong
) strlen (str
);
74 for (in_pos
= 0; in_pos
< len
&& (guchar
) str
[in_pos
] < 0x80; in_pos
++)
87 for (; in_pos
< len
; in_pos
++) {
92 else if ((ch
& 0xE0) == 0xC0) {
93 codepoint
= ch
& 0x1F;
95 } else if ((ch
& 0xF0) == 0xE0) {
96 codepoint
= ch
& 0x0F;
98 } else if ((ch
& 0xF8) == 0xF0) {
101 } else if ((ch
& 0xFC) == 0xF8) {
104 } else if ((ch
& 0xFE) == 0xFC) {
108 /* invalid utf-8 sequence */
110 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "invalid utf-8 sequence at %d (illegal first byte)", in_pos
);
112 *items_read
= in_pos
;
116 mb_remain
= mb_size
= 0;
120 mb_remain
= mb_size
- 1;
122 if ((ch
& 0xC0) == 0x80) {
123 codepoint
= (codepoint
<< 6) | (ch
& 0x3F);
124 if (--mb_remain
== 0) {
125 /* multi byte character is fully consumed now. */
126 if (codepoint
< 0x10000) {
129 overlong
= codepoint
< 0x7F;
132 overlong
= codepoint
< 0x7FF;
135 overlong
= codepoint
< 0xFFFF;
138 overlong
= codepoint
< 0x1FFFFF;
141 overlong
= codepoint
< 0x03FFFFFF;
145 /* invalid utf-8 sequence (overlong) */
147 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "invalid utf-8 sequence at %d (overlong)", in_pos
);
149 *items_read
= in_pos
;
159 } else if (codepoint
< 0x110000) {
163 /* invalid utf-8 sequence (excess) */
165 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "invalid utf-8 sequence at %d (codepoint range excess)", in_pos
);
167 *items_read
= in_pos
;
177 /* invalid utf-8 sequence */
179 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "invalid utf-8 sequence at %d (illegal following bytes)", in_pos
);
181 *items_read
= in_pos
;
185 mb_remain
= mb_size
= 0;
192 *items_read
= in_pos
;
197 g_utf8_to_utf16 (const gchar
*str
, glong len
, glong
*items_read
, glong
*items_written
, GError
**error
)
199 /* The conversion logic is almost identical to UTF8Encoding.GetChars(),
200 but error check is always done at utf8_to_utf16_len() so that
201 the conversion core below simply resets erroreous bits */
204 guchar ch
, mb_size
, mb_remain
;
206 glong in_pos
, out_pos
;
218 len
= (glong
) strlen (str
);
224 utf16_len
= utf8_to_utf16_len (str
, len
, items_read
, error
);
231 ret
= g_malloc ((1 + utf16_len
) * sizeof (gunichar2
));
234 for (in_pos
= 0; in_pos
< len
; in_pos
++) {
235 ch
= (guchar
) str
[in_pos
];
239 ret
[out_pos
++] = ch
;
242 for (; in_pos
< len
; in_pos
++) {
243 ch
= (guchar
) str
[in_pos
];
246 ret
[out_pos
++] = ch
;
247 else if ((ch
& 0xE0) == 0xC0) {
248 codepoint
= ch
& 0x1F;
250 } else if ((ch
& 0xF0) == 0xE0) {
251 codepoint
= ch
& 0x0F;
253 } else if ((ch
& 0xF8) == 0xF0) {
256 } else if ((ch
& 0xFC) == 0xF8) {
259 } else if ((ch
& 0xFE) == 0xFC) {
263 /* invalid utf-8 sequence */
265 mb_remain
= mb_size
= 0;
268 mb_remain
= mb_size
- 1;
270 if ((ch
& 0xC0) == 0x80) {
271 codepoint
= (codepoint
<< 6) | (ch
& 0x3F);
272 if (--mb_remain
== 0) {
273 /* multi byte character is fully consumed now. */
274 if (codepoint
< 0x10000) {
275 ret
[out_pos
++] = (gunichar2
)(codepoint
% 0x10000);
276 } else if (codepoint
< 0x110000) {
278 codepoint
-= 0x10000;
279 ret
[out_pos
++] = (gunichar2
)((codepoint
>> 10) + 0xD800);
280 ret
[out_pos
++] = (gunichar2
)((codepoint
& 0x3FF) + 0xDC00);
282 /* invalid utf-8 sequence (excess) */
289 /* invalid utf-8 sequence */
291 mb_remain
= mb_size
= 0;
298 *items_written
= out_pos
;
303 g_utf16_to_utf8 (const gunichar2
*str
, glong len
, glong
*items_read
, glong
*items_written
, GError
**error
)
305 /* The conversion logic is almost identical to UTF8Encoding.GetBytes(),
306 but error check is always done at utf16_to_utf8_len() so that
307 the conversion core below simply resets erroreous bits */
310 glong in_pos
, out_pos
;
312 guint32 codepoint
= 0;
323 utf8_len
= utf16_to_utf8_len (str
, len
, items_read
, error
);
330 ret
= g_malloc ((1+utf8_len
) * sizeof (gchar
));
332 while (len
< 0 ? str
[in_pos
] : in_pos
< len
) {
335 if (ch
>= 0xDC00 && ch
<= 0xDFFF) {
336 codepoint
= 0x10000 + (ch
- 0xDC00) + ((surrogate
- 0xD800) << 10);
340 /* invalid surrogate pair */
345 /* fast path optimization */
347 for (; len
< 0 ? str
[in_pos
] : in_pos
< len
; in_pos
++) {
348 if (str
[in_pos
] < 0x80)
349 ret
[out_pos
++] = (gchar
)(str
[in_pos
]);
355 else if (ch
>= 0xD800 && ch
<= 0xDBFF)
357 else if (ch
>= 0xDC00 && ch
<= 0xDFFF) {
359 /* invalid surrogate pair */
369 if (codepoint
< 0x80)
370 ret
[out_pos
++] = (gchar
) codepoint
;
371 else if (codepoint
< 0x0800) {
372 ret
[out_pos
++] = (gchar
) (0xC0 | (codepoint
>> 6));
373 ret
[out_pos
++] = (gchar
) (0x80 | (codepoint
& 0x3F));
374 } else if (codepoint
< 0x10000) {
375 ret
[out_pos
++] = (gchar
) (0xE0 | (codepoint
>> 12));
376 ret
[out_pos
++] = (gchar
) (0x80 | ((codepoint
>> 6) & 0x3F));
377 ret
[out_pos
++] = (gchar
) (0x80 | (codepoint
& 0x3F));
379 ret
[out_pos
++] = (gchar
) (0xF0 | (codepoint
>> 18));
380 ret
[out_pos
++] = (gchar
) (0x80 | ((codepoint
>> 12) & 0x3F));
381 ret
[out_pos
++] = (gchar
) (0x80 | ((codepoint
>> 6) & 0x3F));
382 ret
[out_pos
++] = (gchar
) (0x80 | (codepoint
& 0x3F));
388 *items_written
= out_pos
;
393 utf16_to_utf8_len (const gunichar2
*str
, glong len
, glong
*items_read
, GError
**error
)
403 while (len
< 0 ? str
[in_pos
] : in_pos
< len
) {
406 if (ch
>= 0xDC00 && ch
<= 0xDFFF) {
409 /* invalid surrogate pair */
411 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "invalid utf-16 sequence at %d (missing surrogate tail)", in_pos
);
413 *items_read
= in_pos
;
415 } /* otherwise just ignore. */
419 /* fast path optimization */
421 for (; len
< 0 ? str
[in_pos
] : in_pos
< len
; in_pos
++) {
422 if (str
[in_pos
] < 0x80)
429 else if (ch
< 0x0800)
431 else if (ch
>= 0xD800 && ch
<= 0xDBFF)
433 else if (ch
>= 0xDC00 && ch
<= 0xDFFF) {
434 /* invalid surrogate pair */
436 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
, "invalid utf-16 sequence at %d (missing surrogate head)", in_pos
);
438 *items_read
= in_pos
;
440 } /* otherwise just ignore. */
449 *items_read
= in_pos
;
454 g_ucs4_to_utf8 (const gunichar
*str
, glong len
, glong
*items_read
, glong
*items_written
, GError
**error
)
456 gchar
*outbuf
, *outptr
;
462 for (i
= 0; str
[i
] != 0; i
++) {
463 if ((n
= g_unichar_to_utf8 (str
[i
], NULL
)) < 0) {
464 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
465 "Invalid sequence in conversion input");
476 for (i
= 0; i
< len
; i
++) {
477 if ((n
= g_unichar_to_utf8 (str
[i
], NULL
)) < 0) {
478 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
479 "Invalid sequence in conversion input");
491 outptr
= outbuf
= g_malloc (nwritten
+ 1);
493 for (i
= 0; str
[i
] != 0; i
++)
494 outptr
+= g_unichar_to_utf8 (str
[i
], outptr
);
496 for (i
= 0; i
< len
; i
++)
497 outptr
+= g_unichar_to_utf8 (str
[i
], outptr
);
502 *items_written
= nwritten
;
511 g_ucs4_to_utf16_len (const gunichar
*str
, glong len
, glong
*items_read
, GError
**error
)
515 const gunichar
*lstr
= str
;
520 while (*lstr
!= '\0' && len
--) {
523 if (ch
<= 0x0000FFFF) {
524 if (ch
>= 0xD800 && ch
<= 0xDFFF) {
525 errindex
= (glong
)(lstr
- str
)-1;
527 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
528 "Invalid sequence in conversion input");
530 *items_read
= errindex
;
535 } else if (ch
> 0x10FFFF) {
536 errindex
= (glong
)(lstr
- str
)-1;
538 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
539 "Character out of range for UTF-16");
541 *items_read
= errindex
;
550 *items_read
= (glong
)(lstr
- str
);
555 g_ucs4_to_utf16 (const gunichar
*str
, glong len
, glong
*items_read
, glong
*items_written
, GError
**error
)
558 gunichar2
*retstr
= 0;
559 gunichar2
*retch
= 0;
563 allocsz
= g_ucs4_to_utf16_len (str
, len
, items_read
, &lerror
);
566 retch
= retstr
= g_malloc ((allocsz
+1) * sizeof (gunichar2
));
567 retstr
[allocsz
] = '\0';
569 while (*str
!= '\0' && len
--) {
572 if (ch
<= 0x0000FFFF && (ch
< 0xD800 || ch
> 0xDFFF)) {
573 *retch
++ = (gunichar2
)ch
;
577 *retch
++ = (gunichar2
)((ch
>> 10) + 0xD800);
578 *retch
++ = (gunichar2
)((ch
& 0x3FFUL
) + 0xDC00);
585 *items_written
= nwritten
;
593 g_utf16_to_ucs4_len (const gunichar2
*str
, glong len
, glong
*items_read
, GError
**error
)
597 const gunichar2
*lstr
= str
;
603 while (*lstr
!= '\0' && len
--) {
605 if (ch
>= 0xD800 && ch
<= 0xDBFF) {
611 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
614 errindex
= (glong
)(lstr
- str
);
616 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
617 "Invalid sequence in conversion input");
619 *items_read
= errindex
;
623 if (ch
>= 0xDC00 && ch
<= 0xDFFF) {
624 errindex
= (glong
)(lstr
- str
)-1;
626 g_set_error (error
, G_CONVERT_ERROR
, G_CONVERT_ERROR_ILLEGAL_SEQUENCE
,
627 "Invalid sequence in conversion input");
629 *items_read
= errindex
;
637 *items_read
= (glong
)(lstr
- str
);
643 g_utf16_to_ucs4 (const gunichar2
*str
, glong len
, glong
*items_read
, glong
*items_written
, GError
**error
)
646 gunichar
*retstr
= 0;
652 allocsz
= g_utf16_to_ucs4_len (str
, len
, items_read
, &lerror
);
655 retch
= retstr
= g_malloc ((allocsz
+1) * sizeof (gunichar
));
656 retstr
[allocsz
] = '\0';
659 while (*str
!= '\0' && allocsz
--) {
661 if (ch
>= 0xD800 && ch
<= 0xDBFF) {
663 ch
= ((ch
- (gunichar
)0xD800) << 10)
664 + (ch2
- (gunichar
)0xDC00) + (gunichar
)0x0010000UL
;
671 *items_written
= nwritten
;
679 g_utf8_offset_to_pointer (const gchar
*str
, glong offset
)
685 gchar
*p
= (gchar
*)str
;
687 p
= g_utf8_next_char (p
);
689 } while (offset
> 0);
695 g_assert_not_reached();
700 g_utf8_pointer_to_offset (const gchar
*str
, const gchar
*pos
)
702 const gchar
*inptr
, *inend
;
719 inptr
= g_utf8_next_char (inptr
);
721 } while (inptr
< inend
);
723 return offset
* sign
;
727 g_utf8_to_ucs4_fast (const gchar
*str
, glong len
, glong
*items_written
)
735 g_return_val_if_fail (str
!= NULL
, NULL
);
738 /* we need to find the length of str, as len < 0 means it must be 0 terminated */
744 p
= g_utf8_next_char(p
);
748 ucs4
= g_malloc (sizeof(gunichar
)*len
);
750 *items_written
= len
;
761 else if ((c
& 0xE0) == 0xC0) {
766 else if ((c
& 0xF0) == 0xE0) {
770 else if ((c
& 0xF8) == 0xF0) {
774 else if ((c
& 0xFC) == 0xF8) {
778 else if ((c
& 0xFE) == 0xFC) {
785 codepoint
<<= 6 | ((*p
)&0x3f);
789 ucs4
[ucs4_index
++] = codepoint
;
797 * from http://home.tiscali.nl/t876506/utf8tbl.html
799 * From Unicode UCS-4 to UTF-8:
800 * Start with the Unicode number expressed as a decimal number and call this ud.
802 * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
804 * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
805 * byte 1 = 192 + (ud div 64)
806 * byte 2 = 128 + (ud mod 64)
808 * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
809 * byte 1 = 224 + (ud div 4096)
810 * byte 2 = 128 + ((ud div 64) mod 64)
811 * byte 3 = 128 + (ud mod 64)
813 * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
814 * byte 1 = 240 + (ud div 262144)
815 * byte 2 = 128 + ((ud div 4096) mod 64)
816 * byte 3 = 128 + ((ud div 64) mod 64)
817 * byte 4 = 128 + (ud mod 64)
819 * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
820 * byte 1 = 248 + (ud div 16777216)
821 * byte 2 = 128 + ((ud div 262144) mod 64)
822 * byte 3 = 128 + ((ud div 4096) mod 64)
823 * byte 4 = 128 + ((ud div 64) mod 64)
824 * byte 5 = 128 + (ud mod 64)
826 * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
827 * byte 1 = 252 + (ud div 1073741824)
828 * byte 2 = 128 + ((ud div 16777216) mod 64)
829 * byte 3 = 128 + ((ud div 262144) mod 64)
830 * byte 4 = 128 + ((ud div 4096) mod 64)
831 * byte 5 = 128 + ((ud div 64) mod 64)
832 * byte 6 = 128 + (ud mod 64)
835 g_unichar_to_utf8 (gunichar c
, gchar
*outbuf
)
843 } else if (c
< 2048UL) {
846 } else if (c
< 65536UL) {
849 } else if (c
< 2097152UL) {
852 } else if (c
< 67108864UL) {
855 } else if (c
< 2147483648UL) {
861 if (outbuf
!= NULL
) {
862 for (i
= len
- 1; i
> 0; i
--) {
863 /* mask off 6 bits worth and add 128 */
864 outbuf
[i
] = 128 + (c
& 0x3f);
868 /* first character has a different base */
869 outbuf
[0] = base
+ (c
& 0x3f);