firmware/common/unicode.c

   1 /*   Some conversion functions for handling UTF-8
   2  *
   3  *   copyright Marcoen Hirschberg (2004,2005)
   4  *
   5  *   I got all the info from:
   6  *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
   7  *   and
   8  *   http://en.wikipedia.org/wiki/Unicode
   9  */
  10
  11 #include <stdio.h>
  12 #include "file.h"
  13 #include "debug.h"
  14 #include "rbunicode.h"
  15 #include "config.h"
  16
  17 #ifndef O_BINARY
  18 #define O_BINARY 0
  19 #endif
  20
  21 #define CODEPAGE_DIR    "/.rockbox/codepages"
  22 static int default_codepage = 0;
  23 static int loaded_cp_table = 0;
  24
  25 #ifdef HAVE_LCD_BITMAP
  26
  27 #define MAX_CP_TABLE_SIZE  32768
  28 #define NUM_TABLES             5
  29
  30 enum {
  31     ISO_8859_1 = 0, ISO_8859_7, ISO_8859_8, WIN_1251,
  32     ISO_8859_11, WIN_1256, ISO_8859_9, ISO_8859_2,
  33     SJIS, GB_2312, KSX_1001, BIG_5, UTF_8, NUM_CODEPAGES
  34 };
  35 static const char *filename[NUM_TABLES] =
  36 {
  37     CODEPAGE_DIR"/iso.cp",
  38     CODEPAGE_DIR"/932.cp",  /* SJIS    */
  39     CODEPAGE_DIR"/936.cp",  /* GB2312  */
  40     CODEPAGE_DIR"/949.cp",  /* KSX1001 */
  41     CODEPAGE_DIR"/950.cp"   /* BIG5    */
  42 };
  43 static const char cp_2_table[NUM_CODEPAGES] =
  44 {
  45     0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
  46 };
  47
  48 #else /* !HAVE_LCD_BITMAP, reduced support */
  49
  50 #define MAX_CP_TABLE_SIZE  512
  51 #define NUM_TABLES           1
  52
  53 enum {
  54     ISO_8859_1 = 0, ISO_8859_7, WIN_1251,
  55     ISO_8859_9, ISO_8859_2, UTF_8, NUM_CODEPAGES
  56 };
  57 static const char *filename[NUM_TABLES] =
  58 {
  59     CODEPAGE_DIR"/isomini.cp",
  60 };
  61 static const char cp_2_table[NUM_CODEPAGES] =
  62 {
  63     0, 1, 1, 1, 1, 0
  64 };
  65
  66 #endif
  67
  68 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
  69
  70 static const unsigned char utf8comp[6] =
  71 {
  72     0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  73 };
  74
  75 /* Load codepage file into memory */
  76 static int load_cp_table(int cp)
  77 {
  78     int i=0;
  79     int table = cp_2_table[cp];
  80     int file, tablesize;
  81     unsigned char tmp[2];
  82
  83     if (table == 0 || table == loaded_cp_table)
  84         return 1;
  85
  86     file = open(filename[table-1], O_RDONLY|O_BINARY);
  87
  88     if (file < 0) {
  89         DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
  90         return 0;
  91     }
  92
  93     tablesize = filesize(file) / 2;
  94
  95     if (tablesize > MAX_CP_TABLE_SIZE) {
  96         DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
  97         close(file);
  98         return 0;
  99     }
 100
 101     while (i < tablesize) {
 102         if (!read(file, tmp, 2)) {
 103             DEBUGF("Can't read from codepage file: %s.cp\n",
 104                     filename[table-1]);
 105             loaded_cp_table = 0;
 106             return 0;
 107         }
 108         codepage_table[i++] = (tmp[1] << 8) | tmp[0];
 109     }
 110
 111     loaded_cp_table = table;
 112     close(file);
 113     return 1;
 114 }
 115
 116 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
 117 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
 118 {
 119     int tail = 0;
 120
 121     if (ucs > 0x7F)
 122         while (ucs >> (5*tail + 6))
 123             tail++;
 124
 125     *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
 126     while (tail--)
 127         *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
 128
 129     return utf8;
 130 }
 131
 132 /* Recode an iso encoded string to UTF-8 */
 133 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
 134                           int cp, int count)
 135 {
 136     unsigned short ucs, tmp;
 137
 138     if (cp == -1) /* use default codepage */
 139         cp = default_codepage;
 140
 141     if (!load_cp_table(cp)) cp = 0;
 142
 143     while (count--) {
 144         if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
 145             *utf8++ = *iso++;
 146
 147         else {
 148
 149             /* cp tells us which codepage to convert from */
 150             switch (cp) {
 151                 case ISO_8859_7:  /* Greek */
 152                 case WIN_1251:    /* Cyrillic */
 153                 case ISO_8859_9:  /* Turkish */
 154                 case ISO_8859_2:  /* Latin Extended */
 155 #ifdef HAVE_LCD_BITMAP
 156                 case ISO_8859_8:  /* Hebrew */
 157                 case ISO_8859_11: /* Thai */
 158                 case WIN_1256:    /* Arabic */
 159 #endif
 160                     tmp = ((cp-1)*128) + (*iso++ - 128);
 161                     ucs = codepage_table[tmp];
 162                     break;
 163
 164 #ifdef HAVE_LCD_BITMAP
 165                 case SJIS: /* Japanese */
 166                     if (*iso > 0xA0 && *iso < 0xE0) {
 167                         tmp = *iso++ | (0xA100 - 0x8000);
 168                         ucs = codepage_table[tmp];
 169                         break;
 170                     }
 171
 172                 case GB_2312:  /* Simplified Chinese */
 173                 case KSX_1001: /* Korean */
 174                 case BIG_5:    /* Traditional Chinese */
 175                     if (count < 1 || !iso[1]) {
 176                         ucs = *iso++;
 177                         break;
 178                     }
 179
 180                     /* we assume all cjk strings are written
 181                        in big endian order */
 182                     tmp = *iso++ << 8;
 183                     tmp |= *iso++;
 184                     tmp -= 0x8000;
 185                     ucs = codepage_table[tmp];
 186                     count--;
 187                     break;
 188 #endif /* HAVE_LCD_BITMAP */
 189
 190                 default:
 191                     ucs = *iso++;
 192                     break;
 193             }
 194
 195             if (ucs == 0) /* unknown char, use replacement char */
 196                 ucs = 0xfffd;
 197             utf8 = utf8encode(ucs, utf8);
 198         }
 199     }
 200     return utf8;
 201 }
 202
 203 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
 204 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
 205         int count)
 206 {
 207     unsigned long ucs;
 208
 209     while (count > 0) {
 210         /* Check for a surrogate pair */
 211         if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) {
 212             ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18)
 213                     | utf16[2] | ((utf16[3] - 0xDC) << 8));
 214             utf16 += 4;
 215             count -= 2;
 216         } else {
 217             ucs = (utf16[0] | (utf16[1] << 8));
 218             utf16 += 2;
 219             count -= 1;
 220         }
 221         utf8 = utf8encode(ucs, utf8);
 222     }
 223     return utf8;
 224 }
 225
 226 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
 227 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
 228         int count)
 229 {
 230     unsigned long ucs;
 231
 232     while (count > 0) {
 233         if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */
 234             ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10)
 235                     | ((utf16[2] - 0xDC) << 8) | utf16[3]);
 236             utf16 += 4;
 237             count -= 2;
 238         } else {
 239             ucs = (utf16[0] << 8) | utf16[1];
 240             utf16 += 2;
 241             count -= 1;
 242         }
 243         utf8 = utf8encode(ucs, utf8);
 244     }
 245     return utf8;
 246 }
 247
 248 #if 0 /* currently unused */
 249 /* Recode any UTF-16 string to UTF-8 */
 250 unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
 251         unsigned int count)
 252 {
 253     unsigned long ucs;
 254
 255     ucs = *(utf16++) << 8;
 256     ucs |= *(utf16++);
 257
 258     if (ucs == 0xFEFF) /* Check for BOM */
 259         return utf16BEdecode(utf16, utf8, count-1);
 260     else if (ucs == 0xFFFE)
 261         return utf16LEdecode(utf16, utf8, count-1);
 262     else { /* ADDME: Should default be LE or BE? */
 263         utf16 -= 2;
 264         return utf16BEdecode(utf16, utf8, count);
 265     }
 266 }
 267 #endif
 268
 269 /* Return the number of UTF-8 chars in a string */
 270 unsigned long utf8length(const unsigned char *utf8)
 271 {
 272     unsigned long l = 0;
 273
 274     while (*utf8 != 0)
 275         if ((*utf8++ & MASK) != COMP)
 276             l++;
 277
 278     return l;
 279 }
 280
 281 /* Decode 1 UTF-8 char and return a pointer to the next char. */
 282 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
 283 {
 284     unsigned char c = *utf8++;
 285     unsigned long code;
 286     int tail = 0;
 287
 288     if ((c <= 0x7f) || (c >= 0xc2)) {
 289         /* Start of new character. */
 290         if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
 291             code = c;
 292         } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
 293             tail = 1;
 294             code = c & 0x1f;
 295         } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
 296             tail = 2;
 297             code = c & 0x0f;
 298         } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
 299             tail = 3;
 300             code = c & 0x07;
 301         } else {
 302             /* Invalid size. */
 303             code = 0xfffd;
 304         }
 305
 306         while (tail-- && ((c = *utf8++) != 0)) {
 307             if ((c & 0xc0) == 0x80) {
 308                 /* Valid continuation character. */
 309                 code = (code << 6) | (c & 0x3f);
 310
 311             } else {
 312                 /* Invalid continuation char */
 313                 code = 0xfffd;
 314                 utf8--;
 315                 break;
 316             }
 317         }
 318     } else {
 319         /* Invalid UTF-8 char */
 320         code = 0xfffd;
 321     }
 322     /* currently we don't support chars above U-FFFF */
 323     *ucs = (code < 0x10000) ? code : 0xfffd;
 324     return utf8;
 325 }
 326
 327 void set_codepage(int cp)
 328 {
 329     default_codepage = cp;
 330     return;
 331 }
 332
 333 /* seek to a given char in a utf8 string and
 334    return its start position in the string */
 335 int utf8seek(const unsigned char* utf8, int offset)
 336 {
 337     int pos = 0;
 338
 339     while (offset--) {
 340         pos++;
 341         while ((utf8[pos] & MASK) == COMP)
 342             pos++;
 343     }
 344     return pos;
 345 }