[build] Skips RemoteExecuted bases tests on monodroid
[mono-project.git] / mono / utils / strenc.c
blob81b0ade26bc94114cf06cfc43f78da7296cd9953
/**
 * \file
 * string encoding conversions
 *
 * Author:
 *	Dick Porter (dick@ximian.com)
 *
 * (C) 2003 Ximian, Inc.
 */
11 #include <config.h>
12 #include <glib.h>
13 #include <string.h>
15 #include "strenc.h"
/*
 * Indexed by the first byte of a UTF-8 sequence; gives the number of bytes
 * that are expected to follow that first byte.  Bytes 0x00-0xBF map to 0,
 * 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3, 0xF8-0xFB to 4,
 * 0xFC-0xFD to 5 and 0xFE-0xFF to 0.  The table only classifies the lead
 * byte; it does not by itself say whether that byte is legal.
 */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
};
28 /**
29 * mono_unicode_from_external:
30 * \param in pointers to the buffer.
31 * \param bytes number of bytes in the string.
32 * Tries to turn a NULL-terminated string into UTF-16.
34 * First, see if it's valid UTF-8, in which case just turn it directly
35 * into UTF-16. Next, run through the colon-separated encodings in
36 * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
37 * returning the first successful conversion to UTF-16. If no
38 * conversion succeeds, return NULL.
40 * Callers must free the returned string if not NULL. \p bytes holds the number
41 * of bytes in the returned string, not including the terminator.
43 gunichar2 *
44 mono_unicode_from_external (const gchar *in, gsize *bytes)
46 gchar *res=NULL;
47 gchar **encodings;
48 gchar *encoding_list;
49 int i;
50 glong lbytes;
52 if(in==NULL) {
53 return(NULL);
56 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
57 if(encoding_list==NULL) {
58 encoding_list = g_strdup("");
61 encodings=g_strsplit (encoding_list, ":", 0);
62 g_free (encoding_list);
63 for(i=0;encodings[i]!=NULL; i++) {
64 /* "default_locale" is a special case encoding */
65 if(!strcmp (encodings[i], "default_locale")) {
66 gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
67 if(utf8!=NULL) {
68 res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
69 *bytes = (gsize) lbytes;
71 g_free (utf8);
72 } else {
73 /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
74 res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
75 if (res != NULL) {
76 gchar *ptr = res;
77 res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
78 *bytes = (gsize) lbytes;
79 g_free (ptr);
83 if(res!=NULL) {
84 g_strfreev (encodings);
85 *bytes *= 2;
86 return((gunichar2 *)res);
90 g_strfreev (encodings);
92 if(g_utf8_validate (in, -1, NULL)) {
93 glong items_written;
94 gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, &items_written, NULL);
95 items_written *= 2;
96 *bytes = items_written;
97 return(unires);
100 return(NULL);
104 * mono_utf8_from_external:
105 * \param in pointer to the string buffer.
106 * Tries to turn a NULL-terminated string into UTF8.
108 * First, see if it's valid UTF-8, in which case there's nothing more
109 * to be done. Next, run through the colon-separated encodings in
110 * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
111 * returning the first successful conversion to UTF-8. If no
112 * conversion succeeds, return NULL.
114 * Callers must free the returned string if not NULL.
116 * This function is identical to \c mono_unicode_from_external, apart
117 * from returning UTF-8 not UTF-16; it's handy in a few places to work
118 * in UTF-8.
120 gchar *mono_utf8_from_external (const gchar *in)
122 gchar *res=NULL;
123 gchar **encodings;
124 gchar *encoding_list;
125 int i;
127 if(in==NULL) {
128 return(NULL);
131 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
132 if(encoding_list==NULL) {
133 encoding_list = g_strdup("");
136 encodings=g_strsplit (encoding_list, ":", 0);
137 g_free (encoding_list);
138 for(i=0;encodings[i]!=NULL; i++) {
140 /* "default_locale" is a special case encoding */
141 if(!strcmp (encodings[i], "default_locale")) {
142 res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
143 if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
144 g_free (res);
145 res=NULL;
147 } else {
148 res=g_convert (in, -1, "UTF8", encodings[i], NULL,
149 NULL, NULL);
152 if(res!=NULL) {
153 g_strfreev (encodings);
154 return(res);
158 g_strfreev (encodings);
160 if(g_utf8_validate (in, -1, NULL)) {
161 return(g_strdup (in));
164 return(NULL);
168 * mono_unicode_to_external:
169 * \param uni a UTF-16 string to convert to an external representation.
170 * Turns NULL-terminated UTF-16 into either UTF-8, or the first
171 * working item in \c MONO_EXTERNAL_ENCODINGS if set. If no conversions
172 * work, then UTF-8 is returned.
173 * Callers must free the returned string.
175 gchar *mono_unicode_to_external (const gunichar2 *uni)
177 gchar *utf8;
178 gchar *encoding_list;
180 /* Turn the unicode into utf8 to start with, because its
181 * easier to work with gchar * than gunichar2 *
183 utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, NULL);
184 g_assert (utf8!=NULL);
186 encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
187 if(encoding_list==NULL) {
188 /* Do UTF8 */
189 return(utf8);
190 } else {
191 gchar *res, **encodings;
192 int i;
194 encodings=g_strsplit (encoding_list, ":", 0);
195 g_free (encoding_list);
196 for(i=0; encodings[i]!=NULL; i++) {
197 if(!strcmp (encodings[i], "default_locale")) {
198 res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
199 NULL);
200 } else {
201 res=g_convert (utf8, -1, encodings[i], "UTF8",
202 NULL, NULL, NULL);
205 if(res!=NULL) {
206 g_free (utf8);
207 g_strfreev (encodings);
209 return(res);
213 g_strfreev (encodings);
216 /* Nothing else worked, so just return the utf8 */
217 return(utf8);
221 * mono_utf8_validate_and_len
222 * \param source Pointer to putative UTF-8 encoded string.
223 * Checks \p source for being valid UTF-8. \p utf is assumed to be
224 * null-terminated.
225 * \returns TRUE if \p source is valid.
226 * \p oEnd will equal the null terminator at the end of the string if valid.
227 * if not valid, it will equal the first charater of the invalid sequence.
228 * \p oLength will equal the length to \p oEnd
230 gboolean
231 mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
233 gboolean retVal = TRUE;
234 gboolean lastRet = TRUE;
235 guchar* ptr = (guchar*) source;
236 guchar* srcPtr;
237 guint length;
238 guchar a;
239 *oLength = 0;
240 while (*ptr != 0) {
241 length = trailingBytesForUTF8 [*ptr] + 1;
242 srcPtr = (guchar*) ptr + length;
243 switch (length) {
244 default: retVal = FALSE;
245 /* Everything else falls through when "TRUE"... */
246 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
247 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
248 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
249 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
250 retVal = FALSE;
252 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
253 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
255 switch (*ptr) {
256 /* no fall-through in this inner switch */
257 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
258 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
259 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
260 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
261 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
262 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
263 default: if (a < (guchar) 0x80) retVal = FALSE;
266 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
268 if (*ptr > (guchar) 0xF4)
269 retVal = FALSE;
270 //If the string is invalid, set the end to the invalid byte.
271 if (!retVal && lastRet) {
272 if (oEnd != NULL)
273 *oEnd = (gchar*) ptr;
274 lastRet = FALSE;
276 ptr += length;
277 (*oLength)++;
279 if (retVal && oEnd != NULL)
280 *oEnd = (gchar*) ptr;
281 return retVal;
286 * mono_utf8_validate_and_len_with_bounds
287 * \param source: Pointer to putative UTF-8 encoded string.
288 * \param max_bytes: Max number of bytes that can be decoded.
290 * Checks \p source for being valid UTF-8. \p utf is assumed to be
291 * null-terminated.
293 * This function returns FALSE if it needs to decode characters beyond \p max_bytes.
295 * \returns TRUE if \p source is valid.
296 * \p oEnd will equal the null terminator at the end of the string if valid.
297 * if not valid, it will equal the first charater of the invalid sequence.
298 * \p oLength will equal the length to \p oEnd
300 gboolean
301 mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
303 gboolean retVal = TRUE;
304 gboolean lastRet = TRUE;
305 guchar* ptr = (guchar*) source;
306 guchar *end = ptr + max_bytes;
307 guchar* srcPtr;
308 guint length;
309 guchar a;
310 *oLength = 0;
312 if (max_bytes < 1) {
313 if (oEnd)
314 *oEnd = (gchar*) ptr;
315 return FALSE;
318 while (*ptr != 0) {
319 length = trailingBytesForUTF8 [*ptr] + 1;
320 srcPtr = (guchar*) ptr + length;
322 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
323 srcPtr points to the first byte after the current char.*/
324 if (srcPtr >= end) {
325 retVal = FALSE;
326 break;
328 switch (length) {
329 default: retVal = FALSE;
330 /* Everything else falls through when "TRUE"... */
331 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
332 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
333 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
334 *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
335 retVal = FALSE;
337 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
338 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
340 switch (*ptr) {
341 /* no fall-through in this inner switch */
342 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
343 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
344 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
345 if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
346 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
347 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
348 default: if (a < (guchar) 0x80) retVal = FALSE;
351 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
353 if (*ptr > (guchar) 0xF4)
354 retVal = FALSE;
355 //If the string is invalid, set the end to the invalid byte.
356 if (!retVal && lastRet) {
357 if (oEnd != NULL)
358 *oEnd = (gchar*) ptr;
359 lastRet = FALSE;
361 ptr += length;
362 (*oLength)++;
364 if (retVal && oEnd != NULL)
365 *oEnd = (gchar*) ptr;
366 return retVal;