firmware/common/unicode.c

   1 /*   Some conversion functions for handling UTF-8
   2  *
   3  *   copyright Marcoen Hirschberg (2004,2005)
   4  *
   5  *   I got all the info from:
   6  *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
   7  *   and
   8  *   http://en.wikipedia.org/wiki/Unicode
   9  */
  10
  11 #include <stdio.h>
  12 #include "file.h"
  13 #include "debug.h"
  14 #include "rbunicode.h"
  15 #include "config.h"
  16
  17 #ifndef O_BINARY
  18 #define O_BINARY 0
  19 #endif
  20
  21 #define CODEPAGE_DIR    "/.rockbox/codepages"
  22 static int default_codepage = 0;
  23 static int loaded_cp_table = 0;
  24
  25 #ifdef HAVE_LCD_BITMAP
  26
  27 #define MAX_CP_TABLE_SIZE  32768
  28 #define NUM_TABLES             5
  29
  30 enum {
  31     ISO_8859_1 = 0, ISO_8859_7, ISO_8859_8, WIN_1251,
  32     ISO_8859_11, WIN_1256, ISO_8859_9, ISO_8859_2, WIN_1250,
  33     SJIS, GB_2312, KSX_1001, BIG_5, UTF_8, NUM_CODEPAGES
  34 };
  35 static const char *filename[NUM_TABLES] =
  36 {
  37     CODEPAGE_DIR"/iso.cp",
  38     CODEPAGE_DIR"/932.cp",  /* SJIS    */
  39     CODEPAGE_DIR"/936.cp",  /* GB2312  */
  40     CODEPAGE_DIR"/949.cp",  /* KSX1001 */
  41     CODEPAGE_DIR"/950.cp"   /* BIG5    */
  42 };
  43 static const char cp_2_table[NUM_CODEPAGES] =
  44 {
  45     0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
  46 };
  47
  48 #else /* !HAVE_LCD_BITMAP, reduced support */
  49
  50 #define MAX_CP_TABLE_SIZE  640
  51 #define NUM_TABLES           1
  52
  53 enum {
  54     ISO_8859_1 = 0, ISO_8859_7, WIN_1251, ISO_8859_9,
  55     ISO_8859_2, WIN_1250, UTF_8, NUM_CODEPAGES
  56 };
  57 static const char *filename[NUM_TABLES] =
  58 {
  59     CODEPAGE_DIR"/isomini.cp",
  60 };
  61 static const char cp_2_table[NUM_CODEPAGES] =
  62 {
  63     0, 1, 1, 1, 1, 1, 0
  64 };
  65
  66 #endif
  67
  68 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
  69
  70 static const unsigned char utf8comp[6] =
  71 {
  72     0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  73 };
  74
  75 /* Load codepage file into memory */
  76 static int load_cp_table(int cp)
  77 {
  78     int i=0;
  79     int table = cp_2_table[cp];
  80     int file, tablesize;
  81     unsigned char tmp[2];
  82
  83     if (table == 0 || table == loaded_cp_table)
  84         return 1;
  85
  86     file = open(filename[table-1], O_RDONLY|O_BINARY);
  87
  88     if (file < 0) {
  89         DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
  90         return 0;
  91     }
  92
  93     tablesize = filesize(file) / 2;
  94
  95     if (tablesize > MAX_CP_TABLE_SIZE) {
  96         DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
  97         close(file);
  98         return 0;
  99     }
 100
 101     while (i < tablesize) {
 102         if (!read(file, tmp, 2)) {
 103             DEBUGF("Can't read from codepage file: %s.cp\n",
 104                     filename[table-1]);
 105             loaded_cp_table = 0;
 106             return 0;
 107         }
 108         codepage_table[i++] = (tmp[1] << 8) | tmp[0];
 109     }
 110
 111     loaded_cp_table = table;
 112     close(file);
 113     return 1;
 114 }
 115
 116 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
 117 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
 118 {
 119     int tail = 0;
 120
 121     if (ucs > 0x7F)
 122         while (ucs >> (5*tail + 6))
 123             tail++;
 124
 125     *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
 126     while (tail--)
 127         *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
 128
 129     return utf8;
 130 }
 131
 132 /* Recode an iso encoded string to UTF-8 */
 133 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
 134                           int cp, int count)
 135 {
 136     unsigned short ucs, tmp;
 137
 138     if (cp == -1) /* use default codepage */
 139         cp = default_codepage;
 140
 141     if (!load_cp_table(cp)) cp = 0;
 142
 143     while (count--) {
 144         if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
 145             *utf8++ = *iso++;
 146
 147         else {
 148
 149             /* cp tells us which codepage to convert from */
 150             switch (cp) {
 151                 case ISO_8859_7:  /* Greek */
 152                 case WIN_1251:    /* Cyrillic */
 153                 case ISO_8859_9:  /* Turkish */
 154                 case ISO_8859_2:  /* Latin Extended */
 155                 case WIN_1250:    /* Central European */
 156 #ifdef HAVE_LCD_BITMAP
 157                 case ISO_8859_8:  /* Hebrew */
 158                 case ISO_8859_11: /* Thai */
 159                 case WIN_1256:    /* Arabic */
 160 #endif
 161                     tmp = ((cp-1)*128) + (*iso++ - 128);
 162                     ucs = codepage_table[tmp];
 163                     break;
 164
 165 #ifdef HAVE_LCD_BITMAP
 166                 case SJIS: /* Japanese */
 167                     if (*iso > 0xA0 && *iso < 0xE0) {
 168                         tmp = *iso++ | (0xA100 - 0x8000);
 169                         ucs = codepage_table[tmp];
 170                         break;
 171                     }
 172
 173                 case GB_2312:  /* Simplified Chinese */
 174                 case KSX_1001: /* Korean */
 175                 case BIG_5:    /* Traditional Chinese */
 176                     if (count < 1 || !iso[1]) {
 177                         ucs = *iso++;
 178                         break;
 179                     }
 180
 181                     /* we assume all cjk strings are written
 182                        in big endian order */
 183                     tmp = *iso++ << 8;
 184                     tmp |= *iso++;
 185                     tmp -= 0x8000;
 186                     ucs = codepage_table[tmp];
 187                     count--;
 188                     break;
 189 #endif /* HAVE_LCD_BITMAP */
 190
 191                 default:
 192                     ucs = *iso++;
 193                     break;
 194             }
 195
 196             if (ucs == 0) /* unknown char, use replacement char */
 197                 ucs = 0xfffd;
 198             utf8 = utf8encode(ucs, utf8);
 199         }
 200     }
 201     return utf8;
 202 }
 203
 204 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
 205 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
 206         int count)
 207 {
 208     unsigned long ucs;
 209
 210     while (count > 0) {
 211         /* Check for a surrogate pair */
 212         if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) {
 213             ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18)
 214                     | utf16[2] | ((utf16[3] - 0xDC) << 8));
 215             utf16 += 4;
 216             count -= 2;
 217         } else {
 218             ucs = (utf16[0] | (utf16[1] << 8));
 219             utf16 += 2;
 220             count -= 1;
 221         }
 222         utf8 = utf8encode(ucs, utf8);
 223     }
 224     return utf8;
 225 }
 226
 227 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
 228 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
 229         int count)
 230 {
 231     unsigned long ucs;
 232
 233     while (count > 0) {
 234         if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */
 235             ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10)
 236                     | ((utf16[2] - 0xDC) << 8) | utf16[3]);
 237             utf16 += 4;
 238             count -= 2;
 239         } else {
 240             ucs = (utf16[0] << 8) | utf16[1];
 241             utf16 += 2;
 242             count -= 1;
 243         }
 244         utf8 = utf8encode(ucs, utf8);
 245     }
 246     return utf8;
 247 }
 248
 249 #if 0 /* currently unused */
 250 /* Recode any UTF-16 string to UTF-8 */
 251 unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
 252         unsigned int count)
 253 {
 254     unsigned long ucs;
 255
 256     ucs = *(utf16++) << 8;
 257     ucs |= *(utf16++);
 258
 259     if (ucs == 0xFEFF) /* Check for BOM */
 260         return utf16BEdecode(utf16, utf8, count-1);
 261     else if (ucs == 0xFFFE)
 262         return utf16LEdecode(utf16, utf8, count-1);
 263     else { /* ADDME: Should default be LE or BE? */
 264         utf16 -= 2;
 265         return utf16BEdecode(utf16, utf8, count);
 266     }
 267 }
 268 #endif
 269
 270 /* Return the number of UTF-8 chars in a string */
 271 unsigned long utf8length(const unsigned char *utf8)
 272 {
 273     unsigned long l = 0;
 274
 275     while (*utf8 != 0)
 276         if ((*utf8++ & MASK) != COMP)
 277             l++;
 278
 279     return l;
 280 }
 281
 282 /* Decode 1 UTF-8 char and return a pointer to the next char. */
 283 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
 284 {
 285     unsigned char c = *utf8++;
 286     unsigned long code;
 287     int tail = 0;
 288
 289     if ((c <= 0x7f) || (c >= 0xc2)) {
 290         /* Start of new character. */
 291         if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
 292             code = c;
 293         } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
 294             tail = 1;
 295             code = c & 0x1f;
 296         } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
 297             tail = 2;
 298             code = c & 0x0f;
 299         } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
 300             tail = 3;
 301             code = c & 0x07;
 302         } else {
 303             /* Invalid size. */
 304             code = 0xfffd;
 305         }
 306
 307         while (tail-- && ((c = *utf8++) != 0)) {
 308             if ((c & 0xc0) == 0x80) {
 309                 /* Valid continuation character. */
 310                 code = (code << 6) | (c & 0x3f);
 311
 312             } else {
 313                 /* Invalid continuation char */
 314                 code = 0xfffd;
 315                 utf8--;
 316                 break;
 317             }
 318         }
 319     } else {
 320         /* Invalid UTF-8 char */
 321         code = 0xfffd;
 322     }
 323     /* currently we don't support chars above U-FFFF */
 324     *ucs = (code < 0x10000) ? code : 0xfffd;
 325     return utf8;
 326 }
 327
 328 void set_codepage(int cp)
 329 {
 330     default_codepage = cp;
 331     return;
 332 }
 333
 334 /* seek to a given char in a utf8 string and
 335    return its start position in the string */
 336 int utf8seek(const unsigned char* utf8, int offset)
 337 {
 338     int pos = 0;
 339
 340     while (offset--) {
 341         pos++;
 342         while ((utf8[pos] & MASK) == COMP)
 343             pos++;
 344     }
 345     return pos;
 346 }