2010-02-13 Jb Evain <jbevain@novell.com>
[mono-project.git] / mono / utils / strenc.c
blobca5423cfbab59022afd6d503672b1cf81032284e
1 /*
2 * strenc.c: string encoding conversions
4 * Author:
5 * Dick Porter (dick@ximian.com)
7 * (C) 2003 Ximian, Inc.
8 */
10 #include <config.h>
11 #include <glib.h>
12 #include <string.h>
14 #include "strenc.h"
/*
 * Number of bytes that are expected to follow a given first byte of a UTF-8
 * sequence, indexed by the value of that first byte.  The validators in this
 * file add one to the entry to get the total length of the sequence.
 *
 * Bytes 0x80-0xBF, 0xFE and 0xFF have an entry of 0 here, so they are treated
 * as one-byte sequences by the table; rejecting them as lead bytes is left to
 * the code that consumes the table.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x20-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x40-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x60-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xA0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0-0xFF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
};
27 /**
28 * mono_unicode_from_external:
29 * @in: pointers to the buffer.
30 * @bytes: number of bytes in the string.
32 * Tries to turn a NULL-terminated string into UTF16.
34 * First, see if it's valid UTF8, in which case just turn it directly
35 * into UTF16. Next, run through the colon-separated encodings in
36 * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
37 * returning the first successful conversion to UTF16. If no
38 * conversion succeeds, return NULL.
40 * Callers must free the returned string if not NULL. bytes holds the number
41 * of bytes in the returned string, not including the terminator.
43 gunichar2 *
44 mono_unicode_from_external (const gchar *in, gsize *bytes)
46 gchar *res=NULL;
47 gchar **encodings;
48 const gchar *encoding_list;
49 int i;
50 glong lbytes;
52 if(in==NULL) {
53 return(NULL);
56 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
57 if(encoding_list==NULL) {
58 encoding_list = "";
61 encodings=g_strsplit (encoding_list, ":", 0);
62 for(i=0;encodings[i]!=NULL; i++) {
63 /* "default_locale" is a special case encoding */
64 if(!strcmp (encodings[i], "default_locale")) {
65 gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
66 if(utf8!=NULL) {
67 res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
68 *bytes = (gsize) lbytes;
70 g_free (utf8);
71 } else {
72 /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
73 res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
74 if (res != NULL) {
75 gchar *ptr = res;
76 res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
77 *bytes = (gsize) lbytes;
78 g_free (ptr);
82 if(res!=NULL) {
83 g_strfreev (encodings);
84 *bytes *= 2;
85 return((gunichar2 *)res);
89 g_strfreev (encodings);
91 if(g_utf8_validate (in, -1, NULL)) {
92 gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, (glong *)bytes, NULL);
93 *bytes *= 2;
94 return(unires);
97 return(NULL);
101 * mono_utf8_from_external:
102 * @in: pointer to the string buffer.
104 * Tries to turn a NULL-terminated string into UTF8.
106 * First, see if it's valid UTF8, in which case there's nothing more
107 * to be done. Next, run through the colon-separated encodings in
108 * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
109 * returning the first successful conversion to utf8. If no
110 * conversion succeeds, return NULL.
112 * Callers must free the returned string if not NULL.
114 * This function is identical to mono_unicode_from_external, apart
115 * from returning utf8 not utf16; it's handy in a few places to work
116 * in utf8.
118 gchar *mono_utf8_from_external (const gchar *in)
120 gchar *res=NULL;
121 gchar **encodings;
122 const gchar *encoding_list;
123 int i;
125 if(in==NULL) {
126 return(NULL);
129 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
130 if(encoding_list==NULL) {
131 encoding_list = "";
134 encodings=g_strsplit (encoding_list, ":", 0);
135 for(i=0;encodings[i]!=NULL; i++) {
137 /* "default_locale" is a special case encoding */
138 if(!strcmp (encodings[i], "default_locale")) {
139 res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
140 if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
141 g_free (res);
142 res=NULL;
144 } else {
145 res=g_convert (in, -1, "UTF8", encodings[i], NULL,
146 NULL, NULL);
149 if(res!=NULL) {
150 g_strfreev (encodings);
151 return(res);
155 g_strfreev (encodings);
157 if(g_utf8_validate (in, -1, NULL)) {
158 return(g_strdup (in));
161 return(NULL);
165 * mono_unicode_to_external:
166 * @uni: an UTF16 string to conver to an external representation.
168 * Turns NULL-terminated UTF16 into either UTF8, or the first
169 * working item in MONO_EXTERNAL_ENCODINGS if set. If no conversions
170 * work, then UTF8 is returned.
172 * Callers must free the returned string.
174 gchar *mono_unicode_to_external (const gunichar2 *uni)
176 gchar *utf8;
177 const gchar *encoding_list;
179 /* Turn the unicode into utf8 to start with, because its
180 * easier to work with gchar * than gunichar2 *
182 utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, NULL);
183 g_assert (utf8!=NULL);
185 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
186 if(encoding_list==NULL) {
187 /* Do UTF8 */
188 return(utf8);
189 } else {
190 gchar *res, **encodings;
191 int i;
193 encodings=g_strsplit (encoding_list, ":", 0);
194 for(i=0; encodings[i]!=NULL; i++) {
195 if(!strcmp (encodings[i], "default_locale")) {
196 res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
197 NULL);
198 } else {
199 res=g_convert (utf8, -1, encodings[i], "UTF8",
200 NULL, NULL, NULL);
203 if(res!=NULL) {
204 g_free (utf8);
205 g_strfreev (encodings);
207 return(res);
211 g_strfreev (encodings);
214 /* Nothing else worked, so just return the utf8 */
215 return(utf8);
219 * mono_utf8_validate_and_len
220 * @source: Pointer to putative UTF-8 encoded string.
222 * Checks @source for being valid UTF-8. @utf is assumed to be
223 * null-terminated.
225 * Return value: true if @source is valid.
226 * oEnd : will equal the null terminator at the end of the string if valid.
227 * if not valid, it will equal the first charater of the invalid sequence.
228 * oLengh : will equal the length to @oEnd
230 gboolean
231 mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
233 gboolean retVal = TRUE;
234 gboolean lastRet = TRUE;
235 guchar* ptr = (guchar*) source;
236 guchar* srcPtr;
237 guint length;
238 guchar a;
239 *oLength = 0;
240 while (*ptr != 0) {
241 length = trailingBytesForUTF8 [*ptr] + 1;
242 srcPtr = (guchar*) ptr + length;
243 switch (length) {
244 default: retVal = FALSE;
245 /* Everything else falls through when "TRUE"... */
246 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
247 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
248 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
249 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
250 retVal = FALSE;
252 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
253 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
255 switch (*ptr) {
256 /* no fall-through in this inner switch */
257 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
258 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
259 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
260 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
261 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
262 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
263 default: if (a < (guchar) 0x80) retVal = FALSE;
266 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
268 if (*ptr > (guchar) 0xF4)
269 retVal = FALSE;
270 //If the string is invalid, set the end to the invalid byte.
271 if (!retVal && lastRet) {
272 if (oEnd != NULL)
273 *oEnd = (gchar*) ptr;
274 lastRet = FALSE;
276 ptr += length;
277 (*oLength)++;
279 if (retVal && oEnd != NULL)
280 *oEnd = (gchar*) ptr;
281 return retVal;
286 * mono_utf8_validate_and_len_with_bounds
287 * @source: Pointer to putative UTF-8 encoded string.
288 * @max_bytes: Max number of bytes that can be decoded. This function returns FALSE if
289 * it needs to decode characters beyond that.
291 * Checks @source for being valid UTF-8. @utf is assumed to be
292 * null-terminated.
294 * Return value: true if @source is valid.
295 * oEnd : will equal the null terminator at the end of the string if valid.
296 * if not valid, it will equal the first charater of the invalid sequence.
297 * oLengh : will equal the length to @oEnd
299 gboolean
300 mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
302 gboolean retVal = TRUE;
303 gboolean lastRet = TRUE;
304 guchar* ptr = (guchar*) source;
305 guchar *end = ptr + max_bytes;
306 guchar* srcPtr;
307 guint length;
308 guchar a;
309 *oLength = 0;
311 if (max_bytes < 1) {
312 if (oEnd)
313 *oEnd = (gchar*) ptr;
314 return FALSE;
317 while (*ptr != 0) {
318 length = trailingBytesForUTF8 [*ptr] + 1;
319 srcPtr = (guchar*) ptr + length;
321 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
322 srcPtr points to the first byte after the current char.*/
323 if (srcPtr >= end) {
324 retVal = FALSE;
325 break;
327 switch (length) {
328 default: retVal = FALSE;
329 /* Everything else falls through when "TRUE"... */
330 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
331 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
332 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
333 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
334 retVal = FALSE;
336 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
337 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
339 switch (*ptr) {
340 /* no fall-through in this inner switch */
341 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
342 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
343 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
344 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
345 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
346 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
347 default: if (a < (guchar) 0x80) retVal = FALSE;
350 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
352 if (*ptr > (guchar) 0xF4)
353 retVal = FALSE;
354 //If the string is invalid, set the end to the invalid byte.
355 if (!retVal && lastRet) {
356 if (oEnd != NULL)
357 *oEnd = (gchar*) ptr;
358 lastRet = FALSE;
360 ptr += length;
361 (*oLength)++;
363 if (retVal && oEnd != NULL)
364 *oEnd = (gchar*) ptr;
365 return retVal;