2 * strenc.c: string encoding conversions
5 * Dick Porter (dick@ximian.com)
7 * (C) 2003 Ximian, Inc.
16 static const char trailingBytesForUTF8
[256] = {
17 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
24 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
28 * mono_unicode_from_external:
29 * @in: pointer to the buffer.
30 * @bytes: number of bytes in the string.
32 * Tries to turn a NULL-terminated string into UTF16.
34 * First, see if it's valid UTF8, in which case just turn it directly
35 * into UTF16. Next, run through the colon-separated encodings in
36 * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
37 * returning the first successful conversion to UTF16. If no
38 * conversion succeeds, return NULL.
40 * Callers must free the returned string if not NULL. bytes holds the number
41 * of bytes in the returned string, not including the terminator.
44 mono_unicode_from_external (const gchar
*in
, gsize
*bytes
)
48 const gchar
*encoding_list
;
56 encoding_list
=g_getenv ("MONO_EXTERNAL_ENCODINGS");
57 if(encoding_list
==NULL
) {
61 encodings
=g_strsplit (encoding_list
, ":", 0);
62 for(i
=0;encodings
[i
]!=NULL
; i
++) {
63 /* "default_locale" is a special case encoding */
64 if(!strcmp (encodings
[i
], "default_locale")) {
65 gchar
*utf8
=g_locale_to_utf8 (in
, -1, NULL
, NULL
, NULL
);
67 res
=(gchar
*) g_utf8_to_utf16 (utf8
, -1, NULL
, &lbytes
, NULL
);
68 *bytes
= (gsize
) lbytes
;
72 /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
73 res
= g_convert (in
, strlen (in
), "UTF8", encodings
[i
], NULL
, bytes
, NULL
);
76 res
= (gchar
*) g_utf8_to_utf16 (res
, -1, NULL
, &lbytes
, NULL
);
77 *bytes
= (gsize
) lbytes
;
83 g_strfreev (encodings
);
85 return((gunichar2
*)res
);
89 g_strfreev (encodings
);
91 if(g_utf8_validate (in
, -1, NULL
)) {
92 gunichar2
*unires
=g_utf8_to_utf16 (in
, -1, NULL
, (glong
*)bytes
, NULL
);
101 * mono_utf8_from_external:
102 * @in: pointer to the string buffer.
104 * Tries to turn a NULL-terminated string into UTF8.
106 * First, see if it's valid UTF8, in which case there's nothing more
107 * to be done. Next, run through the colon-separated encodings in
108 * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
109 * returning the first successful conversion to utf8. If no
110 * conversion succeeds, return NULL.
112 * Callers must free the returned string if not NULL.
114 * This function is identical to mono_unicode_from_external, apart
115 * from returning utf8 not utf16; it's handy in a few places to work
118 gchar
*mono_utf8_from_external (const gchar
*in
)
122 const gchar
*encoding_list
;
129 encoding_list
=g_getenv ("MONO_EXTERNAL_ENCODINGS");
130 if(encoding_list
==NULL
) {
134 encodings
=g_strsplit (encoding_list
, ":", 0);
135 for(i
=0;encodings
[i
]!=NULL
; i
++) {
137 /* "default_locale" is a special case encoding */
138 if(!strcmp (encodings
[i
], "default_locale")) {
139 res
=g_locale_to_utf8 (in
, -1, NULL
, NULL
, NULL
);
140 if(res
!=NULL
&& !g_utf8_validate (res
, -1, NULL
)) {
145 res
=g_convert (in
, -1, "UTF8", encodings
[i
], NULL
,
150 g_strfreev (encodings
);
155 g_strfreev (encodings
);
157 if(g_utf8_validate (in
, -1, NULL
)) {
158 return(g_strdup (in
));
165 * mono_unicode_to_external:
166 * @uni: a UTF16 string to convert to an external representation.
168 * Turns NULL-terminated UTF16 into either UTF8, or the first
169 * working item in MONO_EXTERNAL_ENCODINGS if set. If no conversions
170 * work, then UTF8 is returned.
172 * Callers must free the returned string.
174 gchar
*mono_unicode_to_external (const gunichar2
*uni
)
177 const gchar
*encoding_list
;
179 /* Turn the unicode into utf8 to start with, because it's
180 * easier to work with gchar * than gunichar2 *
182 utf8
=g_utf16_to_utf8 (uni
, -1, NULL
, NULL
, NULL
);
183 g_assert (utf8
!=NULL
);
185 encoding_list
=g_getenv ("MONO_EXTERNAL_ENCODINGS");
186 if(encoding_list
==NULL
) {
190 gchar
*res
, **encodings
;
193 encodings
=g_strsplit (encoding_list
, ":", 0);
194 for(i
=0; encodings
[i
]!=NULL
; i
++) {
195 if(!strcmp (encodings
[i
], "default_locale")) {
196 res
=g_locale_from_utf8 (utf8
, -1, NULL
, NULL
,
199 res
=g_convert (utf8
, -1, encodings
[i
], "UTF8",
205 g_strfreev (encodings
);
211 g_strfreev (encodings
);
214 /* Nothing else worked, so just return the utf8 */
219 * mono_utf8_validate_and_len
220 * @source: Pointer to putative UTF-8 encoded string.
222 * Checks @source for being valid UTF-8. @source is assumed to be
225 * Return value: true if @source is valid.
226 * oEnd : will equal the null terminator at the end of the string if valid.
227 * if not valid, it will equal the first character of the invalid sequence.
228 * oLength : will equal the length to @oEnd
231 mono_utf8_validate_and_len (const gchar
*source
, glong
* oLength
, const gchar
** oEnd
)
233 gboolean retVal
= TRUE
;
234 gboolean lastRet
= TRUE
;
235 guchar
* ptr
= (guchar
*) source
;
241 length
= trailingBytesForUTF8
[*ptr
] + 1;
242 srcPtr
= (guchar
*) ptr
+ length
;
244 default: retVal
= FALSE
;
245 /* Everything else falls through when "TRUE"... */
246 case 4: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
247 if ((a
== (guchar
) 0xBF || a
== (guchar
) 0xBE) && *(srcPtr
-1) == (guchar
) 0xBF) {
248 if (*(srcPtr
-2) == (guchar
) 0x8F || *(srcPtr
-2) == (guchar
) 0x9F ||
249 *(srcPtr
-2) == (guchar
) 0xAF || *(srcPtr
-2) == (guchar
) 0xBF)
252 case 3: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
253 case 2: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
256 /* no fall-through in this inner switch */
257 case 0xE0: if (a
< (guchar
) 0xA0) retVal
= FALSE
; break;
258 case 0xED: if (a
> (guchar
) 0x9F) retVal
= FALSE
; break;
259 case 0xEF: if (a
== (guchar
)0xB7 && (*(srcPtr
+1) > (guchar
) 0x8F && *(srcPtr
+1) < 0xB0)) retVal
= FALSE
;
260 if (a
== (guchar
)0xBF && (*(srcPtr
+1) == (guchar
) 0xBE || *(srcPtr
+1) == 0xBF)) retVal
= FALSE
; break;
261 case 0xF0: if (a
< (guchar
) 0x90) retVal
= FALSE
; break;
262 case 0xF4: if (a
> (guchar
) 0x8F) retVal
= FALSE
; break;
263 default: if (a
< (guchar
) 0x80) retVal
= FALSE
;
266 case 1: if (*ptr
>= (guchar
) 0x80 && *ptr
< (guchar
) 0xC2) retVal
= FALSE
;
268 if (*ptr
> (guchar
) 0xF4)
270 //If the string is invalid, set the end to the invalid byte.
271 if (!retVal
&& lastRet
) {
273 *oEnd
= (gchar
*) ptr
;
279 if (retVal
&& oEnd
!= NULL
)
280 *oEnd
= (gchar
*) ptr
;
286 * mono_utf8_validate_and_len_with_bounds
287 * @source: Pointer to putative UTF-8 encoded string.
288 * @max_bytes: Max number of bytes that can be decoded. This function returns FALSE if
289 * it needs to decode characters beyond that.
291 * Checks @source for being valid UTF-8. @source is assumed to be
294 * Return value: true if @source is valid.
295 * oEnd : will equal the null terminator at the end of the string if valid.
296 * if not valid, it will equal the first character of the invalid sequence.
297 * oLength : will equal the length to @oEnd
300 mono_utf8_validate_and_len_with_bounds (const gchar
*source
, glong max_bytes
, glong
* oLength
, const gchar
** oEnd
)
302 gboolean retVal
= TRUE
;
303 gboolean lastRet
= TRUE
;
304 guchar
* ptr
= (guchar
*) source
;
305 guchar
*end
= ptr
+ max_bytes
;
313 *oEnd
= (gchar
*) ptr
;
318 length
= trailingBytesForUTF8
[*ptr
] + 1;
319 srcPtr
= (guchar
*) ptr
+ length
;
321 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after.
322 srcPtr points to the first byte after the current char.*/
328 default: retVal
= FALSE
;
329 /* Everything else falls through when "TRUE"... */
330 case 4: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
331 if ((a
== (guchar
) 0xBF || a
== (guchar
) 0xBE) && *(srcPtr
-1) == (guchar
) 0xBF) {
332 if (*(srcPtr
-2) == (guchar
) 0x8F || *(srcPtr
-2) == (guchar
) 0x9F ||
333 *(srcPtr
-2) == (guchar
) 0xAF || *(srcPtr
-2) == (guchar
) 0xBF)
336 case 3: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
337 case 2: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
340 /* no fall-through in this inner switch */
341 case 0xE0: if (a
< (guchar
) 0xA0) retVal
= FALSE
; break;
342 case 0xED: if (a
> (guchar
) 0x9F) retVal
= FALSE
; break;
343 case 0xEF: if (a
== (guchar
)0xB7 && (*(srcPtr
+1) > (guchar
) 0x8F && *(srcPtr
+1) < 0xB0)) retVal
= FALSE
;
344 if (a
== (guchar
)0xBF && (*(srcPtr
+1) == (guchar
) 0xBE || *(srcPtr
+1) == 0xBF)) retVal
= FALSE
; break;
345 case 0xF0: if (a
< (guchar
) 0x90) retVal
= FALSE
; break;
346 case 0xF4: if (a
> (guchar
) 0x8F) retVal
= FALSE
; break;
347 default: if (a
< (guchar
) 0x80) retVal
= FALSE
;
350 case 1: if (*ptr
>= (guchar
) 0x80 && *ptr
< (guchar
) 0xC2) retVal
= FALSE
;
352 if (*ptr
> (guchar
) 0xF4)
354 //If the string is invalid, set the end to the invalid byte.
355 if (!retVal
&& lastRet
) {
357 *oEnd
= (gchar
*) ptr
;
363 if (retVal
&& oEnd
!= NULL
)
364 *oEnd
= (gchar
*) ptr
;