mono/eglib/gutf8.c

   1 /*
   2  * gutf8.c: UTF-8 conversion
   3  *
   4  * Author:
   5  *   Atsushi Enomoto  <atsushi@ximian.com>
   6  *
   7  * (C) 2006 Novell, Inc.
   8  * Copyright 2012 Xamarin Inc
   9  */
  10 #include "config.h"
  11 #include <stdio.h>
  12 #include <glib.h>
  13
  14 /*
  15  * Index into the table below with the first byte of a UTF-8 sequence to get
  16  * the number of bytes that are supposed to follow it to complete the sequence.
  17  *
  18  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left
  19  * as-is for anyone who may want to do such conversion, which was allowed in
  20  * earlier algorithms.
  21 */
  22 const guchar g_utf8_jump_table[256] = {
  23         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  24         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  25         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  26         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  27         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  28         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  29         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  30         3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
  31 };
  32
  33 static gchar *
  34 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
  35 {
  36         gunichar *ustr;
  37         glong i, ulen;
  38         gchar *utf8;
  39
  40         ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
  41         for (i = 0; i < ulen; i++)
  42                 ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
  43         utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
  44         g_free (ustr);
  45
  46         return utf8;
  47 }
  48
  49 gchar *
  50 g_utf8_strup (const gchar *str, gssize len)
  51 {
  52         return utf8_case_conv (str, len, TRUE);
  53 }
  54
  55 gchar *
  56 g_utf8_strdown (const gchar *str, gssize len)
  57 {
  58         return utf8_case_conv (str, len, FALSE);
  59 }
  60
  61 static gboolean
  62 utf8_validate (const unsigned char *inptr, size_t len)
  63 {
  64         const unsigned char *ptr = inptr + len;
  65         unsigned char c;
  66
  67         /* Everything falls through when TRUE... */
  68         switch (len) {
  69         default:
  70                 return FALSE;
  71         case 4:
  72                 if ((c = (*--ptr)) < 0x80 || c > 0xBF)
  73                         return FALSE;
  74
  75                 if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
  76                         if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
  77                             ptr[-2] == 0xAF || ptr[-2] == 0xBF)
  78                                 return FALSE;
  79                 }
  80         case 3:
  81                 if ((c = (*--ptr)) < 0x80 || c > 0xBF)
  82                         return FALSE;
  83         case 2:
  84                 if ((c = (*--ptr)) < 0x80 || c > 0xBF)
  85                         return FALSE;
  86
  87                 /* no fall-through in this inner switch */
  88                 switch (*inptr) {
  89                 case 0xE0: if (c < 0xA0) return FALSE; break;
  90                 case 0xED: if (c > 0x9F) return FALSE; break;
  91                 case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
  92                         if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
  93                         break;
  94                 case 0xF0: if (c < 0x90) return FALSE; break;
  95                 case 0xF4: if (c > 0x8F) return FALSE; break;
  96                 default:   if (c < 0x80) return FALSE; break;
  97                 }
  98         case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
  99         }
 100
 101         if (*inptr > 0xF4)
 102                 return FALSE;
 103
 104         return TRUE;
 105 }
 106
 107 /**
 108  * g_utf8_validate:
 109  * @str: a utf-8 encoded string
 110  * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
 111  * @end: output parameter to mark the end of the valid input
 112  *
 113  * Checks @utf for being valid UTF-8. @str is assumed to be
 114  * null-terminated. This function is not super-strict, as it will
 115  * allow longer UTF-8 sequences than necessary. Note that Java is
 116  * capable of producing these sequences if provoked. Also note, this
 117  * routine checks for the 4-byte maximum size, but does not check for
 118  * 0x10ffff maximum value.
 119  *
 120  * Return value: %TRUE if @str is valid or %FALSE otherwise.
 121  **/
 122 gboolean
 123 g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
 124 {
 125         guchar *inptr = (guchar *) str;
 126         gboolean valid = TRUE;
 127         guint length, min;
 128         gssize n = 0;
 129
 130         if (max_len == 0)
 131                 return FALSE;
 132
 133         if (max_len < 0) {
 134                 while (*inptr != 0) {
 135                         length = g_utf8_jump_table[*inptr];
 136                         if (!utf8_validate (inptr, length)) {
 137                                 valid = FALSE;
 138                                 break;
 139                         }
 140
 141                         inptr += length;
 142                 }
 143         } else {
 144                 while (n < max_len) {
 145                         if (*inptr == 0) {
 146                                 /* Note: return FALSE if we encounter nul-byte
 147                                  * before max_len is reached. */
 148                                 valid = FALSE;
 149                                 break;
 150                         }
 151
 152                         length = g_utf8_jump_table[*inptr];
 153                         min = MIN (length, max_len - n);
 154
 155                         if (!utf8_validate (inptr, min)) {
 156                                 valid = FALSE;
 157                                 break;
 158                         }
 159
 160                         if (min < length) {
 161                                 valid = FALSE;
 162                                 break;
 163                         }
 164
 165                         inptr += length;
 166                         n += length;
 167                 }
 168         }
 169
 170         if (end != NULL)
 171                 *end = (gchar *) inptr;
 172
 173         return valid;
 174 }
 175
 176 gunichar
 177 g_utf8_get_char_validated (const gchar *str, gssize max_len)
 178 {
 179         unsigned char *inptr = (unsigned char *) str;
 180         gunichar u = *inptr;
 181         int n, i;
 182
 183         if (max_len == 0)
 184                 return -2;
 185
 186         if (u < 0x80) {
 187                 /* simple ascii case */
 188                 return u;
 189         } else if (u < 0xc2) {
 190                 return -1;
 191         } else if (u < 0xe0) {
 192                 u &= 0x1f;
 193                 n = 2;
 194         } else if (u < 0xf0) {
 195                 u &= 0x0f;
 196                 n = 3;
 197         } else if (u < 0xf8) {
 198                 u &= 0x07;
 199                 n = 4;
 200         } else if (u < 0xfc) {
 201                 u &= 0x03;
 202                 n = 5;
 203         } else if (u < 0xfe) {
 204                 u &= 0x01;
 205                 n = 6;
 206         } else {
 207                 return -1;
 208         }
 209
 210         if (max_len > 0) {
 211                 if (!utf8_validate (inptr, MIN (max_len, n)))
 212                         return -1;
 213
 214                 if (max_len < n)
 215                         return -2;
 216         } else {
 217                 if (!utf8_validate (inptr, n))
 218                         return -1;
 219         }
 220
 221         for (i = 1; i < n; i++)
 222                 u = (u << 6) | (*++inptr ^ 0x80);
 223
 224         return u;
 225 }
 226
 227 glong
 228 g_utf8_strlen (const gchar *str, gssize max_len)
 229 {
 230         const guchar *inptr = (const guchar *) str;
 231         glong clen = 0, len = 0, n;
 232
 233         if (max_len == 0)
 234                 return 0;
 235
 236         if (max_len < 0) {
 237                 while (*inptr) {
 238                         inptr += g_utf8_jump_table[*inptr];
 239                         len++;
 240                 }
 241         } else {
 242                 while (len < max_len && *inptr) {
 243                         n = g_utf8_jump_table[*inptr];
 244                         if ((clen + n) > max_len)
 245                                 break;
 246
 247                         inptr += n;
 248                         clen += n;
 249                         len++;
 250                 }
 251         }
 252
 253         return len;
 254 }
 255
 256 gunichar
 257 g_utf8_get_char (const gchar *src)
 258 {
 259         unsigned char *inptr = (unsigned char *) src;
 260         gunichar u = *inptr;
 261         int n, i;
 262
 263         if (u < 0x80) {
 264                 /* simple ascii case */
 265                 return u;
 266         } else if (u < 0xe0) {
 267                 u &= 0x1f;
 268                 n = 2;
 269         } else if (u < 0xf0) {
 270                 u &= 0x0f;
 271                 n = 3;
 272         } else if (u < 0xf8) {
 273                 u &= 0x07;
 274                 n = 4;
 275         } else if (u < 0xfc) {
 276                 u &= 0x03;
 277                 n = 5;
 278         } else {
 279                 u &= 0x01;
 280                 n = 6;
 281         }
 282
 283         for (i = 1; i < n; i++)
 284                 u = (u << 6) | (*++inptr ^ 0x80);
 285
 286         return u;
 287 }
 288
 289 gchar *
 290 g_utf8_offset_to_pointer (const gchar *str, glong offset)
 291 {
 292         const gchar *p = str;
 293
 294         if (offset > 0) {
 295                 do {
 296                         p = g_utf8_next_char (p);
 297                         offset --;
 298                 } while (offset > 0);
 299         }
 300         else if (offset < 0) {
 301                 const gchar *jump = str;
 302                 do {
 303                         // since the minimum size of a character is 1
 304                         // we know we can step back at least offset bytes
 305                         jump = jump + offset;
 306
 307                         // if we land in the middle of a character
 308                         // walk to the beginning
 309                         while ((*jump & 0xc0) == 0x80)
 310                                 jump --;
 311
 312                         // count how many characters we've actually walked
 313                         // by going forward
 314                         p = jump;
 315                         do {
 316                                 p = g_utf8_next_char (p);
 317                                 offset ++;
 318                         } while (p < jump);
 319
 320                 } while (offset < 0);
 321         }
 322
 323         return (gchar *)p;
 324 }
 325
 326 glong
 327 g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
 328 {
 329         const gchar *inptr, *inend;
 330         glong offset = 0;
 331         glong sign = 1;
 332
 333         if (pos == str)
 334                 return 0;
 335
 336         if (str < pos) {
 337                 inptr = str;
 338                 inend = pos;
 339         } else {
 340                 inptr = pos;
 341                 inend = str;
 342                 sign = -1;
 343         }
 344
 345         do {
 346                 inptr = g_utf8_next_char (inptr);
 347                 offset++;
 348         } while (inptr < inend);
 349
 350         return offset * sign;
 351 }