3 * string encoding conversions
6 * Dick Porter (dick@ximian.com)
8 * (C) 2003 Ximian, Inc.
17 static const char trailingBytesForUTF8
[256] = {
18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
24 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
25 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
29 * mono_unicode_from_external:
30 * \param in pointer to the buffer.
31 * \param bytes receives the number of bytes in the returned string.
32 * Tries to turn a NULL-terminated string into UTF-16.
34 * First, run through the colon-separated encodings in
35 * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
36 * returning the first successful conversion to UTF-16. If none
37 * succeeds, see if the input is valid UTF-8, in which case just turn it
38 * directly into UTF-16. If no conversion succeeds, return NULL.
40 * Callers must free the returned string if not NULL. \p bytes holds the number
41 * of bytes in the returned string, not including the terminator.
44 mono_unicode_from_external (const gchar
*in
, gsize
*bytes
)
56 encoding_list
=g_getenv ("MONO_EXTERNAL_ENCODINGS");
57 if(encoding_list
==NULL
) {
58 encoding_list
= g_strdup("");
61 encodings
=g_strsplit (encoding_list
, ":", 0);
62 g_free (encoding_list
);
63 for(i
=0;encodings
[i
]!=NULL
; i
++) {
64 /* "default_locale" is a special case encoding */
65 if(!strcmp (encodings
[i
], "default_locale")) {
66 gchar
*utf8
=g_locale_to_utf8 (in
, -1, NULL
, NULL
, NULL
);
68 res
=(gchar
*) g_utf8_to_utf16 (utf8
, -1, NULL
, &lbytes
, NULL
);
69 *bytes
= (gsize
) lbytes
;
73 /* Don't use UTF16 here. It returns the string with the <FF FE> byte-order mark prepended */
74 res
= g_convert (in
, strlen (in
), "UTF8", encodings
[i
], NULL
, bytes
, NULL
);
77 res
= (gchar
*) g_utf8_to_utf16 (res
, -1, NULL
, &lbytes
, NULL
);
78 *bytes
= (gsize
) lbytes
;
84 g_strfreev (encodings
);
86 return((gunichar2
*)res
);
90 g_strfreev (encodings
);
92 if(g_utf8_validate (in
, -1, NULL
)) {
94 gunichar2
*unires
=g_utf8_to_utf16 (in
, -1, NULL
, &items_written
, NULL
);
96 *bytes
= items_written
;
104 * mono_utf8_from_external:
105 * \param in pointer to the string buffer.
106 * Tries to turn a NULL-terminated string into UTF8.
108 * First, run through the colon-separated encodings in
109 * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
110 * returning the first successful conversion to UTF-8. If none
111 * succeeds, see if the input is already valid UTF-8, in which case a
112 * copy of it is returned. If no conversion succeeds, return NULL.
114 * Callers must free the returned string if not NULL.
116 * This function is identical to \c mono_unicode_from_external, apart
117 * from returning UTF-8 not UTF-16; it's handy in a few places to work
120 gchar
*mono_utf8_from_external (const gchar
*in
)
124 gchar
*encoding_list
;
131 encoding_list
=g_getenv ("MONO_EXTERNAL_ENCODINGS");
132 if(encoding_list
==NULL
) {
133 encoding_list
= g_strdup("");
136 encodings
=g_strsplit (encoding_list
, ":", 0);
137 g_free (encoding_list
);
138 for(i
=0;encodings
[i
]!=NULL
; i
++) {
140 /* "default_locale" is a special case encoding */
141 if(!strcmp (encodings
[i
], "default_locale")) {
142 res
=g_locale_to_utf8 (in
, -1, NULL
, NULL
, NULL
);
143 if(res
!=NULL
&& !g_utf8_validate (res
, -1, NULL
)) {
148 res
=g_convert (in
, -1, "UTF8", encodings
[i
], NULL
,
153 g_strfreev (encodings
);
158 g_strfreev (encodings
);
160 if(g_utf8_validate (in
, -1, NULL
)) {
161 return(g_strdup (in
));
168 * mono_unicode_to_external:
169 * \param uni a UTF-16 string to convert to an external representation.
170 * Turns NULL-terminated UTF-16 into either UTF-8, or the first
171 * working item in \c MONO_EXTERNAL_ENCODINGS if set. If no conversions
172 * work, then UTF-8 is returned.
173 * Callers must free the returned string.
175 gchar
*mono_unicode_to_external (const gunichar2
*uni
)
178 gchar
*encoding_list
;
180 /* Turn the unicode into utf8 to start with, because it's
181 * easier to work with gchar * than gunichar2 *
183 utf8
=g_utf16_to_utf8 (uni
, -1, NULL
, NULL
, NULL
);
184 g_assert (utf8
!=NULL
);
186 encoding_list
=g_getenv ("MONO_EXTERNAL_ENCODINGS");
187 if(encoding_list
==NULL
) {
191 gchar
*res
, **encodings
;
194 encodings
=g_strsplit (encoding_list
, ":", 0);
195 g_free (encoding_list
);
196 for(i
=0; encodings
[i
]!=NULL
; i
++) {
197 if(!strcmp (encodings
[i
], "default_locale")) {
198 res
=g_locale_from_utf8 (utf8
, -1, NULL
, NULL
,
201 res
=g_convert (utf8
, -1, encodings
[i
], "UTF8",
207 g_strfreev (encodings
);
213 g_strfreev (encodings
);
216 /* Nothing else worked, so just return the utf8 */
221 * mono_utf8_validate_and_len
222 * \param source Pointer to putative UTF-8 encoded string.
223 * Checks \p source for being valid UTF-8. \p source is assumed to be
225 * \returns TRUE if \p source is valid.
226 * \p oEnd will point to the null terminator at the end of the string if valid.
227 * If not valid, it will point to the first character of the invalid sequence.
228 * \p oLength will equal the length to \p oEnd
231 mono_utf8_validate_and_len (const gchar
*source
, glong
* oLength
, const gchar
** oEnd
)
233 gboolean retVal
= TRUE
;
234 gboolean lastRet
= TRUE
;
235 guchar
* ptr
= (guchar
*) source
;
241 length
= trailingBytesForUTF8
[*ptr
] + 1;
242 srcPtr
= (guchar
*) ptr
+ length
;
244 default: retVal
= FALSE
;
245 /* Everything else falls through when "TRUE"... */
246 case 4: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
247 if ((a
== (guchar
) 0xBF || a
== (guchar
) 0xBE) && *(srcPtr
-1) == (guchar
) 0xBF) {
248 if (*(srcPtr
-2) == (guchar
) 0x8F || *(srcPtr
-2) == (guchar
) 0x9F ||
249 *(srcPtr
-2) == (guchar
) 0xAF || *(srcPtr
-2) == (guchar
) 0xBF)
252 case 3: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
253 case 2: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
256 /* no fall-through in this inner switch */
257 case 0xE0: if (a
< (guchar
) 0xA0) retVal
= FALSE
; break;
258 case 0xED: if (a
> (guchar
) 0x9F) retVal
= FALSE
; break;
259 case 0xEF: if (a
== (guchar
)0xB7 && (*(srcPtr
+1) > (guchar
) 0x8F && *(srcPtr
+1) < 0xB0)) retVal
= FALSE
;
260 if (a
== (guchar
)0xBF && (*(srcPtr
+1) == (guchar
) 0xBE || *(srcPtr
+1) == 0xBF)) retVal
= FALSE
; break;
261 case 0xF0: if (a
< (guchar
) 0x90) retVal
= FALSE
; break;
262 case 0xF4: if (a
> (guchar
) 0x8F) retVal
= FALSE
; break;
263 default: if (a
< (guchar
) 0x80) retVal
= FALSE
;
266 case 1: if (*ptr
>= (guchar
) 0x80 && *ptr
< (guchar
) 0xC2) retVal
= FALSE
;
268 if (*ptr
> (guchar
) 0xF4)
270 //If the string is invalid, set the end to the invalid byte.
271 if (!retVal
&& lastRet
) {
273 *oEnd
= (gchar
*) ptr
;
279 if (retVal
&& oEnd
!= NULL
)
280 *oEnd
= (gchar
*) ptr
;
286 * mono_utf8_validate_and_len_with_bounds
287 * \param source Pointer to putative UTF-8 encoded string.
288 * \param max_bytes Max number of bytes that can be decoded.
290 * Checks \p source for being valid UTF-8. \p source is assumed to be
293 * This function returns FALSE if it needs to decode characters beyond \p max_bytes.
295 * \returns TRUE if \p source is valid.
296 * \p oEnd will point to the null terminator at the end of the string if valid.
297 * If not valid, it will point to the first character of the invalid sequence.
298 * \p oLength will equal the length to \p oEnd
301 mono_utf8_validate_and_len_with_bounds (const gchar
*source
, glong max_bytes
, glong
* oLength
, const gchar
** oEnd
)
303 gboolean retVal
= TRUE
;
304 gboolean lastRet
= TRUE
;
305 guchar
* ptr
= (guchar
*) source
;
306 guchar
*end
= ptr
+ max_bytes
;
314 *oEnd
= (gchar
*) ptr
;
319 length
= trailingBytesForUTF8
[*ptr
] + 1;
320 srcPtr
= (guchar
*) ptr
+ length
;
322 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
323 srcPtr points to the first byte after the current char.*/
329 default: retVal
= FALSE
;
330 /* Everything else falls through when "TRUE"... */
331 case 4: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
332 if ((a
== (guchar
) 0xBF || a
== (guchar
) 0xBE) && *(srcPtr
-1) == (guchar
) 0xBF) {
333 if (*(srcPtr
-2) == (guchar
) 0x8F || *(srcPtr
-2) == (guchar
) 0x9F ||
334 *(srcPtr
-2) == (guchar
) 0xAF || *(srcPtr
-2) == (guchar
) 0xBF)
337 case 3: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
338 case 2: if ((a
= (*--srcPtr
)) < (guchar
) 0x80 || a
> (guchar
) 0xBF) retVal
= FALSE
;
341 /* no fall-through in this inner switch */
342 case 0xE0: if (a
< (guchar
) 0xA0) retVal
= FALSE
; break;
343 case 0xED: if (a
> (guchar
) 0x9F) retVal
= FALSE
; break;
344 case 0xEF: if (a
== (guchar
)0xB7 && (*(srcPtr
+1) > (guchar
) 0x8F && *(srcPtr
+1) < 0xB0)) retVal
= FALSE
;
345 if (a
== (guchar
)0xBF && (*(srcPtr
+1) == (guchar
) 0xBE || *(srcPtr
+1) == 0xBF)) retVal
= FALSE
; break;
346 case 0xF0: if (a
< (guchar
) 0x90) retVal
= FALSE
; break;
347 case 0xF4: if (a
> (guchar
) 0x8F) retVal
= FALSE
; break;
348 default: if (a
< (guchar
) 0x80) retVal
= FALSE
;
351 case 1: if (*ptr
>= (guchar
) 0x80 && *ptr
< (guchar
) 0xC2) retVal
= FALSE
;
353 if (*ptr
> (guchar
) 0xF4)
355 //If the string is invalid, set the end to the invalid byte.
356 if (!retVal
&& lastRet
) {
358 *oEnd
= (gchar
*) ptr
;
364 if (retVal
&& oEnd
!= NULL
)
365 *oEnd
= (gchar
*) ptr
;