mono/utils/strenc.c

   1 /**
   2  * \file
   3  * string encoding conversions
   4  *
   5  * Author:
   6  *      Dick Porter (dick@ximian.com)
   7  *
   8  * (C) 2003 Ximian, Inc.
   9  */
  10
  11 #include <config.h>
  12 #include <glib.h>
  13 #include <string.h>
  14
  15 #include "strenc.h"
  16
  17 static const char trailingBytesForUTF8[256] = {
  18         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  19         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  20         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  21         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  22         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  23         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  24         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  25         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
  26 };
  27
  28 /**
  29  * mono_unicode_from_external:
  30  * \param in pointers to the buffer.
  31  * \param bytes number of bytes in the string.
  32  * Tries to turn a NULL-terminated string into UTF-16.
  33  *
  34  * First, see if it's valid UTF-8, in which case just turn it directly
  35  * into UTF-16.  Next, run through the colon-separated encodings in
  36  * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
  37  * returning the first successful conversion to UTF-16.  If no
  38  * conversion succeeds, return NULL.
  39  *
  40  * Callers must free the returned string if not NULL. \p bytes holds the number
  41  * of bytes in the returned string, not including the terminator.
  42  */
  43 gunichar2 *
  44 mono_unicode_from_external (const gchar *in, gsize *bytes)
  45 {
  46         gchar *res=NULL;
  47         gchar **encodings;
  48         gchar *encoding_list;
  49         int i;
  50         glong lbytes;
  51
  52         if(in==NULL) {
  53                 return(NULL);
  54         }
  55
  56         encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
  57         if(encoding_list==NULL) {
  58                 encoding_list = g_strdup("");
  59         }
  60
  61         encodings=g_strsplit (encoding_list, ":", 0);
  62         g_free (encoding_list);
  63         for(i=0;encodings[i]!=NULL; i++) {
  64                 /* "default_locale" is a special case encoding */
  65                 if(!strcmp (encodings[i], "default_locale")) {
  66                         gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
  67                         if(utf8!=NULL) {
  68                                 res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
  69                                 *bytes = (gsize) lbytes;
  70                         }
  71                         g_free (utf8);
  72                 } else {
  73                         /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
  74                         res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
  75                         if (res != NULL) {
  76                                 gchar *ptr = res;
  77                                 res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
  78                                 *bytes = (gsize) lbytes;
  79                                 g_free (ptr);
  80                         }
  81                 }
  82
  83                 if(res!=NULL) {
  84                         g_strfreev (encodings);
  85                         *bytes *= 2;
  86                         return((gunichar2 *)res);
  87                 }
  88         }
  89
  90         g_strfreev (encodings);
  91
  92         if(g_utf8_validate (in, -1, NULL)) {
  93                 glong items_written;
  94                 gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, &items_written, NULL);
  95                 items_written *= 2;
  96                 *bytes = items_written;
  97                 return(unires);
  98         }
  99
 100         return(NULL);
 101 }
 102
 103 /**
 104  * mono_utf8_from_external:
 105  * \param in pointer to the string buffer.
 106  * Tries to turn a NULL-terminated string into UTF8.
 107  *
 108  * First, see if it's valid UTF-8, in which case there's nothing more
 109  * to be done.  Next, run through the colon-separated encodings in
 110  * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
 111  * returning the first successful conversion to UTF-8.  If no
 112  * conversion succeeds, return NULL.
 113  *
 114  * Callers must free the returned string if not NULL.
 115  *
 116  * This function is identical to \c mono_unicode_from_external, apart
 117  * from returning UTF-8 not UTF-16; it's handy in a few places to work
 118  * in UTF-8.
 119  */
 120 gchar *mono_utf8_from_external (const gchar *in)
 121 {
 122         gchar *res=NULL;
 123         gchar **encodings;
 124         gchar *encoding_list;
 125         int i;
 126
 127         if(in==NULL) {
 128                 return(NULL);
 129         }
 130
 131         encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
 132         if(encoding_list==NULL) {
 133                 encoding_list = g_strdup("");
 134         }
 135
 136         encodings=g_strsplit (encoding_list, ":", 0);
 137         g_free (encoding_list);
 138         for(i=0;encodings[i]!=NULL; i++) {
 139
 140                 /* "default_locale" is a special case encoding */
 141                 if(!strcmp (encodings[i], "default_locale")) {
 142                         res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
 143                         if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
 144                                 g_free (res);
 145                                 res=NULL;
 146                         }
 147                 } else {
 148                         res=g_convert (in, -1, "UTF8", encodings[i], NULL,
 149                                        NULL, NULL);
 150                 }
 151
 152                 if(res!=NULL) {
 153                         g_strfreev (encodings);
 154                         return(res);
 155                 }
 156         }
 157
 158         g_strfreev (encodings);
 159
 160         if(g_utf8_validate (in, -1, NULL)) {
 161                 return(g_strdup (in));
 162         }
 163
 164         return(NULL);
 165 }
 166
 167 /**
 168  * mono_unicode_to_external:
 169  * \param uni a UTF-16 string to convert to an external representation.
 170  * Turns NULL-terminated UTF-16 into either UTF-8, or the first
 171  * working item in \c MONO_EXTERNAL_ENCODINGS if set.  If no conversions
 172  * work, then UTF-8 is returned.
 173  * Callers must free the returned string.
 174  */
 175 gchar *mono_unicode_to_external (const gunichar2 *uni)
 176 {
 177         gchar *utf8;
 178         gchar *encoding_list;
 179
 180         /* Turn the unicode into utf8 to start with, because its
 181          * easier to work with gchar * than gunichar2 *
 182          */
 183         utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, NULL);
 184         g_assert (utf8!=NULL);
 185
 186         encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
 187         if(encoding_list==NULL) {
 188                 /* Do UTF8 */
 189                 return(utf8);
 190         } else {
 191                 gchar *res, **encodings;
 192                 int i;
 193
 194                 encodings=g_strsplit (encoding_list, ":", 0);
 195                 g_free (encoding_list);
 196                 for(i=0; encodings[i]!=NULL; i++) {
 197                         if(!strcmp (encodings[i], "default_locale")) {
 198                                 res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
 199                                                         NULL);
 200                         } else {
 201                                 res=g_convert (utf8, -1, encodings[i], "UTF8",
 202                                                NULL, NULL, NULL);
 203                         }
 204
 205                         if(res!=NULL) {
 206                                 g_free (utf8);
 207                                 g_strfreev (encodings);
 208
 209                                 return(res);
 210                         }
 211                 }
 212
 213                 g_strfreev (encodings);
 214         }
 215
 216         /* Nothing else worked, so just return the utf8 */
 217         return(utf8);
 218 }
 219
 220 /**
 221  * mono_utf8_validate_and_len
 222  * \param source Pointer to putative UTF-8 encoded string.
 223  * Checks \p source for being valid UTF-8. \p utf is assumed to be
 224  * null-terminated.
 225  * \returns TRUE if \p source is valid.
 226  * \p oEnd will equal the null terminator at the end of the string if valid.
 227  * if not valid, it will equal the first charater of the invalid sequence.
 228  * \p oLength will equal the length to \p oEnd
 229  **/
 230 gboolean
 231 mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
 232 {
 233         gboolean retVal = TRUE;
 234         gboolean lastRet = TRUE;
 235         guchar* ptr = (guchar*) source;
 236         guchar* srcPtr;
 237         guint length;
 238         guchar a;
 239         *oLength = 0;
 240         while (*ptr != 0) {
 241                 length = trailingBytesForUTF8 [*ptr] + 1;
 242                 srcPtr = (guchar*) ptr + length;
 243                 switch (length) {
 244                 default: retVal = FALSE;
 245                 /* Everything else falls through when "TRUE"... */
 246                 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 247                                 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
 248                                 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
 249                                         *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
 250                                         retVal = FALSE;
 251                                 }
 252                 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 253                 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 254
 255                 switch (*ptr) {
 256                 /* no fall-through in this inner switch */
 257                 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
 258                 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
 259                 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
 260                                    if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
 261                 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
 262                 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
 263                 default:   if (a < (guchar) 0x80) retVal = FALSE;
 264                 }
 265
 266                 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
 267                 }
 268                 if (*ptr > (guchar) 0xF4)
 269                         retVal = FALSE;
 270                 //If the string is invalid, set the end to the invalid byte.
 271                 if (!retVal && lastRet) {
 272                         if (oEnd != NULL)
 273                                 *oEnd = (gchar*) ptr;
 274                         lastRet = FALSE;
 275                 }
 276                 ptr += length;
 277                 (*oLength)++;
 278         }
 279         if (retVal && oEnd != NULL)
 280                 *oEnd = (gchar*) ptr;
 281         return retVal;
 282 }
 283
 284
 285 /**
 286  * mono_utf8_validate_and_len_with_bounds
 287  * \param source: Pointer to putative UTF-8 encoded string.
 288  * \param max_bytes: Max number of bytes that can be decoded.
 289  *
 290  * Checks \p source for being valid UTF-8. \p utf is assumed to be
 291  * null-terminated.
 292  *
 293  * This function returns FALSE if it needs to decode characters beyond \p max_bytes.
 294  *
 295  * \returns TRUE if \p source is valid.
 296  * \p oEnd will equal the null terminator at the end of the string if valid.
 297  * if not valid, it will equal the first charater of the invalid sequence.
 298  * \p oLength will equal the length to \p oEnd
 299  **/
 300 gboolean
 301 mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
 302 {
 303         gboolean retVal = TRUE;
 304         gboolean lastRet = TRUE;
 305         guchar* ptr = (guchar*) source;
 306         guchar *end = ptr + max_bytes;
 307         guchar* srcPtr;
 308         guint length;
 309         guchar a;
 310         *oLength = 0;
 311
 312         if (max_bytes < 1) {
 313                 if (oEnd)
 314                         *oEnd = (gchar*) ptr;
 315                 return FALSE;
 316         }
 317
 318         while (*ptr != 0) {
 319                 length = trailingBytesForUTF8 [*ptr] + 1;
 320                 srcPtr = (guchar*) ptr + length;
 321
 322                 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
 323                    srcPtr points to the first byte after the current char.*/
 324                 if (srcPtr >= end) {
 325                         retVal = FALSE;
 326                         break;
 327                 }
 328                 switch (length) {
 329                 default: retVal = FALSE;
 330                 /* Everything else falls through when "TRUE"... */
 331                 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 332                                 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
 333                                 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
 334                                         *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
 335                                         retVal = FALSE;
 336                                 }
 337                 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 338                 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 339
 340                 switch (*ptr) {
 341                 /* no fall-through in this inner switch */
 342                 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
 343                 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
 344                 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
 345                                    if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
 346                 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
 347                 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
 348                 default:   if (a < (guchar) 0x80) retVal = FALSE;
 349                 }
 350
 351                 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
 352                 }
 353                 if (*ptr > (guchar) 0xF4)
 354                         retVal = FALSE;
 355                 //If the string is invalid, set the end to the invalid byte.
 356                 if (!retVal && lastRet) {
 357                         if (oEnd != NULL)
 358                                 *oEnd = (gchar*) ptr;
 359                         lastRet = FALSE;
 360                 }
 361                 ptr += length;
 362                 (*oLength)++;
 363         }
 364         if (retVal && oEnd != NULL)
 365                 *oEnd = (gchar*) ptr;
 366         return retVal;
 367 }
 368