firmware/common/unicode.c

   1 /*   Some conversion functions for handling UTF-8
   2  *
   3  *   copyright Marcoen Hirschberg (2004,2005)
   4  *
   5  *   I got all the info from:
   6  *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
   7  *   and
   8  *   http://en.wikipedia.org/wiki/Unicode
   9  */
  10
  11 #include <stdio.h>
  12 #include "file.h"
  13 #include "debug.h"
  14 #include "rbunicode.h"
  15
  16 #ifndef O_BINARY
  17 #define O_BINARY 0
  18 #endif
  19
  20 #define NUM_TABLES 5
  21 #define NUM_CODEPAGES 13
  22
  23 static int default_codepage = 0;
  24 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
  25 static int loaded_cp_table = 0;
  26
  27
  28 static const unsigned char utf8comp[6] =
  29 {
  30     0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  31 };
  32
  33 static const char *filename[NUM_TABLES] =
  34 {
  35     CODEPAGE_DIR"/iso.cp",
  36     CODEPAGE_DIR"/932.cp",  /* SJIS    */
  37     CODEPAGE_DIR"/936.cp",  /* GB2312  */
  38     CODEPAGE_DIR"/949.cp",  /* KSX1001 */
  39     CODEPAGE_DIR"/950.cp"   /* BIG5    */
  40 };
  41
  42 static const char cp_2_table[NUM_CODEPAGES] =
  43 {
  44     0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5
  45 };
  46
  47 /* Load codepage file into memory */
  48 int load_cp_table(int cp)
  49 {
  50     int i=0;
  51     int table = cp_2_table[cp];
  52     int file, tablesize;
  53     unsigned char tmp[2];
  54
  55     if (cp == 0 || table == loaded_cp_table)
  56         return 1;
  57
  58     file = open(filename[table-1], O_RDONLY|O_BINARY);
  59
  60     if (file < 0) {
  61         DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
  62         return 0;
  63     }
  64
  65     tablesize = lseek(file, 0, SEEK_END) / 2;
  66     lseek(file, 0, SEEK_SET);
  67
  68     if (tablesize > MAX_CP_TABLE_SIZE) {
  69         DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
  70         close(file);
  71         return 0;
  72     }
  73
  74     while (i < tablesize) {
  75         if (!read(file, tmp, 2)) {
  76             DEBUGF("Can't read from codepage file: %s.cp\n",
  77                     filename[table-1]);
  78             loaded_cp_table = 0;
  79             return 0;
  80         }
  81         codepage_table[i++] = (tmp[1] << 8) | tmp[0];
  82     }
  83
  84     loaded_cp_table = table;
  85     close(file);
  86     return 1;
  87 }
  88
  89 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
  90 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
  91 {
  92     int tail = 0;
  93
  94     if (ucs > 0x7F)
  95         while (ucs >> (5*tail + 6))
  96             tail++;
  97
  98     *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
  99     while (tail--)
 100         *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
 101
 102     return utf8;
 103 }
 104
 105 /* Recode an iso encoded string to UTF-8 */
 106 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
 107                           int cp, int count)
 108 {
 109     unsigned short ucs, tmp;
 110
 111     if (cp == -1) /* use default codepage */
 112        cp = default_codepage;
 113
 114     if (!load_cp_table(cp)) cp = 0;
 115
 116     while (count--) {
 117         if (*iso < 128)
 118             *utf8++ = *iso++;
 119
 120         else {
 121
 122             /* cp tells us which codepage to convert from */
 123             switch (cp) {
 124                 case 0x01: /* Greek (ISO-8859-7) */
 125                 case 0x02: /* Hebrew (ISO-8859-8) */
 126                 case 0x03: /* Russian (CP1251) */
 127                 case 0x04: /* Thai (ISO-8859-11) */
 128                 case 0x05: /* Arabic (ISO-8859-6) */
 129                 case 0x06: /* Turkish (ISO-8859-9) */
 130                 case 0x07: /* Latin Extended (ISO-8859-2) */
 131                     tmp = ((cp-1)*128) + (*iso++ - 128);
 132                     ucs = codepage_table[tmp];
 133                     break;
 134
 135                 case 0x08: /* Japanese (SJIS) */
 136                     if (*iso > 0xA0 && *iso < 0xE0) {
 137                         tmp = *iso | 0xA100;
 138                         ucs = codepage_table[tmp];
 139                         break;
 140                     }
 141
 142                 case 0x09: /* Simplified Chinese (GB2312) */
 143                 case 0x0A: /* Korean (KSX1001) */
 144                 case 0x0B: /* Traditional Chinese (BIG5) */
 145                     if (count < 1 || !iso[1]) {
 146                         ucs = *iso++;
 147                         break;
 148                     }
 149
 150                     /* we assume all cjk strings are written
 151                        in big endian order */
 152                     tmp = *iso++ << 8;
 153                     tmp |= *iso++;
 154                     tmp -= 0x8000;
 155                     ucs = codepage_table[tmp];
 156                     count--;
 157                     break;
 158
 159                 case 0x0C: /* UTF-8, do nothing */
 160                 default:
 161                     ucs = *iso++;
 162                     break;
 163             }
 164
 165             if (ucs == 0) /* unknown char, assume invalid encoding */
 166                 ucs = 0xffff;
 167             utf8 = utf8encode(ucs, utf8);
 168         }
 169     }
 170     return utf8;
 171 }
 172
 173 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
 174 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
 175         unsigned int count)
 176 {
 177     unsigned long ucs;
 178
 179     while (count != 0) {
 180         /* Check for a surrogate pair */
 181         if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) {
 182             ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18)
 183                     | utf16[2] | ((utf16[3] - 0xDC) << 8));
 184             utf16 += 4;
 185             count -= 2;
 186         } else {
 187             ucs = (utf16[0] | (utf16[1] << 8));
 188             utf16 += 2;
 189             count -= 1;
 190         }
 191         utf8 = utf8encode(ucs, utf8);
 192     }
 193     return utf8;
 194 }
 195
 196 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
 197 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
 198         unsigned int count)
 199 {
 200     unsigned long ucs;
 201
 202     while (count != 0) {
 203         if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */
 204             ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10)
 205                     | ((utf16[2] - 0xDC) << 8) | utf16[3]);
 206             utf16 += 4;
 207             count -= 2;
 208         } else {
 209             ucs = (utf16[0] << 8) | utf16[1];
 210             utf16 += 2;
 211             count -= 1;
 212         }
 213         utf8 = utf8encode(ucs, utf8);
 214     }
 215     return utf8;
 216 }
 217
 218 /* Recode any UTF-16 string to UTF-8 */
 219 unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
 220         unsigned int count)
 221 {
 222     unsigned long ucs;
 223
 224     ucs = *(utf16++) << 8;
 225     ucs |= *(utf16++);
 226
 227     if (ucs == 0xFEFF) /* Check for BOM */
 228         return utf16BEdecode(utf16, utf8, count-1);
 229     else if (ucs == 0xFFFE)
 230         return utf16LEdecode(utf16, utf8, count-1);
 231     else { /* ADDME: Should default be LE or BE? */
 232         utf16 -= 2;
 233         return utf16BEdecode(utf16, utf8, count);
 234     }
 235 }
 236
 237 /* Return the number of UTF-8 chars in a string */
 238 unsigned long utf8length(const unsigned char *utf8)
 239 {
 240     unsigned long l = 0;
 241
 242     while (*utf8 != 0)
 243         if ((*utf8++ & MASK) != COMP)
 244             l++;
 245
 246     return l;
 247 }
 248
 249 /* Decode 1 UTF-8 char and return a pointer to the next char. */
 250 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
 251 {
 252     unsigned char c = *utf8++;
 253     unsigned long code;
 254     int tail = 0;
 255
 256     if ((c <= 0x7f) || (c >= 0xc2)) {
 257         /* Start of new character. */
 258         if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
 259             code = c;
 260         } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
 261             tail = 1;
 262             code = c & 0x1f;
 263         } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
 264             tail = 2;
 265             code = c & 0x0f;
 266         } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
 267             tail = 3;
 268             code = c & 0x07;
 269         } else {
 270             /* Invalid size. */
 271             code = 0xffff;
 272         }
 273
 274         while (tail-- && ((c = *utf8++) != 0)) {
 275             if ((c & 0xc0) == 0x80) {
 276                 /* Valid continuation character. */
 277                 code = (code << 6) | (c & 0x3f);
 278
 279             } else {
 280                 /* Invalid continuation char */
 281                 code = 0xffff;
 282                 utf8--;
 283                 break;
 284             }
 285         }
 286     } else {
 287         /* Invalid UTF-8 char */
 288         code = 0xffff;
 289     }
 290     /* currently we don't support chars above U-FFFF */
 291     *ucs = (code < 0x10000) ? code : 0xffff;
 292     return utf8;
 293 }
 294
 295 void set_codepage(int cp)
 296 {
 297     default_codepage = cp;
 298     return;
 299 }
 300
 301 /* seek to a given char in a utf8 string and
 302    return its start position in the string */
 303 int utf8seek(const unsigned char* utf8, int offset)
 304 {
 305     int pos = 0;
 306
 307     while (offset--) {
 308         pos++;
 309         while ((utf8[pos] & MASK) == COMP)
 310             pos++;
 311     }
 312     return pos;
 313 }