firmware/common/unicode.c

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (c) 2004,2005 by Marcoen Hirschberg
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License
  14  * as published by the Free Software Foundation; either version 2
  15  * of the License, or (at your option) any later version.
  16  *
  17  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  18  * KIND, either express or implied.
  19  *
  20  ****************************************************************************/
  21 /*   Some conversion functions for handling UTF-8
  22  *
  23  *   I got all the info from:
  24  *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
  25  *   and
  26  *   http://en.wikipedia.org/wiki/Unicode
  27  */
  28
  29 #include <stdio.h>
  30 #include "config.h"
  31 #include "file.h"
  32 #include "debug.h"
  33 #include "rbunicode.h"
  34 #include "rbpaths.h"
  35
  36 #ifndef O_BINARY
  37 #define O_BINARY 0
  38 #endif
  39
  40 static int default_codepage = 0;
  41 static int loaded_cp_table = 0;
  42
  43 #ifdef HAVE_LCD_BITMAP
  44
  45 #define MAX_CP_TABLE_SIZE  32768
  46 #define NUM_TABLES             5
  47
  48 static const char * const filename[NUM_TABLES] =
  49 {
  50     CODEPAGE_DIR"/iso.cp",
  51     CODEPAGE_DIR"/932.cp",  /* SJIS    */
  52     CODEPAGE_DIR"/936.cp",  /* GB2312  */
  53     CODEPAGE_DIR"/949.cp",  /* KSX1001 */
  54     CODEPAGE_DIR"/950.cp"   /* BIG5    */
  55 };
  56
  57 static const char cp_2_table[NUM_CODEPAGES] =
  58 {
  59     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
  60 };
  61
  62 static const char * const name_codepages[NUM_CODEPAGES+1] =
  63 {
  64     "ISO-8859-1",
  65     "ISO-8859-7",
  66     "ISO-8859-8",
  67     "CP1251",
  68     "ISO-8859-11",
  69     "CP1256",
  70     "ISO-8859-9",
  71     "ISO-8859-2",
  72     "CP1250",
  73     "CP1252",
  74     "SJIS",
  75     "GB-2312",
  76     "KSX-1001",
  77     "BIG5",
  78     "UTF-8",
  79     "unknown"
  80 };
  81
  82 #else /* !HAVE_LCD_BITMAP, reduced support */
  83
  84 #define MAX_CP_TABLE_SIZE  768
  85 #define NUM_TABLES           1
  86
  87 static const char * const filename[NUM_TABLES] = {
  88     CODEPAGE_DIR"/isomini.cp"
  89 };
  90
  91 static const char cp_2_table[NUM_CODEPAGES] =
  92 {
  93     0, 1, 1, 1, 1, 1, 1, 0
  94 };
  95
  96 static const char * const name_codepages[NUM_CODEPAGES+1] =
  97 {
  98     "ISO-8859-1",
  99     "ISO-8859-7",
 100     "CP1251",
 101     "ISO-8859-9",
 102     "ISO-8859-2",
 103     "CP1250",
 104     "CP1252",
 105     "UTF-8",
 106     "unknown"
 107 };
 108
 109 #endif
 110
 111 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
 112
 113 static const unsigned char utf8comp[6] =
 114 {
 115     0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
 116 };
 117
 118 /* Load codepage file into memory */
 119 static int load_cp_table(int cp)
 120 {
 121     int i = 0;
 122     int table = cp_2_table[cp];
 123     int file, tablesize;
 124     unsigned char tmp[2];
 125
 126     if (table == 0 || table == loaded_cp_table)
 127         return 1;
 128
 129     file = open(filename[table-1], O_RDONLY|O_BINARY);
 130
 131     if (file < 0) {
 132         DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
 133         return 0;
 134     }
 135
 136     tablesize = filesize(file) / 2;
 137
 138     if (tablesize > MAX_CP_TABLE_SIZE) {
 139         DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
 140         close(file);
 141         return 0;
 142     }
 143
 144     while (i < tablesize) {
 145         if (!read(file, tmp, 2)) {
 146             DEBUGF("Can't read from codepage file: %s.cp\n",
 147                     filename[table-1]);
 148             loaded_cp_table = 0;
 149             return 0;
 150         }
 151         codepage_table[i++] = (tmp[1] << 8) | tmp[0];
 152     }
 153
 154     loaded_cp_table = table;
 155     close(file);
 156     return 1;
 157 }
 158
 159 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
 160 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
 161 {
 162     int tail = 0;
 163
 164     if (ucs > 0x7F)
 165         while (ucs >> (5*tail + 6))
 166             tail++;
 167
 168     *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
 169     while (tail--)
 170         *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
 171
 172     return utf8;
 173 }
 174
 175 /* Recode an iso encoded string to UTF-8 */
 176 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
 177                           int cp, int count)
 178 {
 179     unsigned short ucs, tmp;
 180
 181     if (cp == -1) /* use default codepage */
 182         cp = default_codepage;
 183
 184     if (!load_cp_table(cp)) cp = 0;
 185
 186     while (count--) {
 187         if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
 188             *utf8++ = *iso++;
 189
 190         else {
 191
 192             /* cp tells us which codepage to convert from */
 193             switch (cp) {
 194                 case ISO_8859_7:  /* Greek */
 195                 case WIN_1252:    /* Western European */
 196                 case WIN_1251:    /* Cyrillic */
 197                 case ISO_8859_9:  /* Turkish */
 198                 case ISO_8859_2:  /* Latin Extended */
 199                 case WIN_1250:    /* Central European */
 200 #ifdef HAVE_LCD_BITMAP
 201                 case ISO_8859_8:  /* Hebrew */
 202                 case ISO_8859_11: /* Thai */
 203                 case WIN_1256:    /* Arabic */
 204 #endif
 205                     tmp = ((cp-1)*128) + (*iso++ - 128);
 206                     ucs = codepage_table[tmp];
 207                     break;
 208
 209 #ifdef HAVE_LCD_BITMAP
 210                 case SJIS: /* Japanese */
 211                     if (*iso > 0xA0 && *iso < 0xE0) {
 212                         tmp = *iso++ | (0xA100 - 0x8000);
 213                         ucs = codepage_table[tmp];
 214                         break;
 215                     }
 216
 217                 case GB_2312:  /* Simplified Chinese */
 218                 case KSX_1001: /* Korean */
 219                 case BIG_5:    /* Traditional Chinese */
 220                     if (count < 1 || !iso[1]) {
 221                         ucs = *iso++;
 222                         break;
 223                     }
 224
 225                     /* we assume all cjk strings are written
 226                        in big endian order */
 227                     tmp = *iso++ << 8;
 228                     tmp |= *iso++;
 229                     tmp -= 0x8000;
 230                     ucs = codepage_table[tmp];
 231                     count--;
 232                     break;
 233 #endif /* HAVE_LCD_BITMAP */
 234
 235                 default:
 236                     ucs = *iso++;
 237                     break;
 238             }
 239
 240             if (ucs == 0) /* unknown char, use replacement char */
 241                 ucs = 0xfffd;
 242             utf8 = utf8encode(ucs, utf8);
 243         }
 244     }
 245     return utf8;
 246 }
 247
 248 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
 249 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
 250         int count)
 251 {
 252     unsigned long ucs;
 253
 254     while (count > 0) {
 255         /* Check for a surrogate pair */
 256         if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) {
 257             ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18)
 258                     | utf16[2] | ((utf16[3] - 0xDC) << 8));
 259             utf16 += 4;
 260             count -= 2;
 261         } else {
 262             ucs = (utf16[0] | (utf16[1] << 8));
 263             utf16 += 2;
 264             count -= 1;
 265         }
 266         utf8 = utf8encode(ucs, utf8);
 267     }
 268     return utf8;
 269 }
 270
 271 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
 272 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
 273         int count)
 274 {
 275     unsigned long ucs;
 276
 277     while (count > 0) {
 278         if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */
 279             ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10)
 280                     | ((utf16[2] - 0xDC) << 8) | utf16[3]);
 281             utf16 += 4;
 282             count -= 2;
 283         } else {
 284             ucs = (utf16[0] << 8) | utf16[1];
 285             utf16 += 2;
 286             count -= 1;
 287         }
 288         utf8 = utf8encode(ucs, utf8);
 289     }
 290     return utf8;
 291 }
 292
 293 #if 0 /* currently unused */
 294 /* Recode any UTF-16 string to UTF-8 */
 295 unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
 296         unsigned int count)
 297 {
 298     unsigned long ucs;
 299
 300     ucs = *(utf16++) << 8;
 301     ucs |= *(utf16++);
 302
 303     if (ucs == 0xFEFF) /* Check for BOM */
 304         return utf16BEdecode(utf16, utf8, count-1);
 305     else if (ucs == 0xFFFE)
 306         return utf16LEdecode(utf16, utf8, count-1);
 307     else { /* ADDME: Should default be LE or BE? */
 308         utf16 -= 2;
 309         return utf16BEdecode(utf16, utf8, count);
 310     }
 311 }
 312 #endif
 313
 314 /* Return the number of UTF-8 chars in a string */
 315 unsigned long utf8length(const unsigned char *utf8)
 316 {
 317     unsigned long l = 0;
 318
 319     while (*utf8 != 0)
 320         if ((*utf8++ & MASK) != COMP)
 321             l++;
 322
 323     return l;
 324 }
 325
 326 /* Decode 1 UTF-8 char and return a pointer to the next char. */
 327 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
 328 {
 329     unsigned char c = *utf8++;
 330     unsigned long code;
 331     int tail = 0;
 332
 333     if ((c <= 0x7f) || (c >= 0xc2)) {
 334         /* Start of new character. */
 335         if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
 336             code = c;
 337         } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
 338             tail = 1;
 339             code = c & 0x1f;
 340         } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
 341             tail = 2;
 342             code = c & 0x0f;
 343         } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
 344             tail = 3;
 345             code = c & 0x07;
 346         } else {
 347             /* Invalid size. */
 348             code = 0xfffd;
 349         }
 350
 351         while (tail-- && ((c = *utf8++) != 0)) {
 352             if ((c & 0xc0) == 0x80) {
 353                 /* Valid continuation character. */
 354                 code = (code << 6) | (c & 0x3f);
 355
 356             } else {
 357                 /* Invalid continuation char */
 358                 code = 0xfffd;
 359                 utf8--;
 360                 break;
 361             }
 362         }
 363     } else {
 364         /* Invalid UTF-8 char */
 365         code = 0xfffd;
 366     }
 367     /* currently we don't support chars above U-FFFF */
 368     *ucs = (code < 0x10000) ? code : 0xfffd;
 369     return utf8;
 370 }
 371
 372 void set_codepage(int cp)
 373 {
 374     default_codepage = cp;
 375     return;
 376 }
 377
 378 /* seek to a given char in a utf8 string and
 379    return its start position in the string */
 380 int utf8seek(const unsigned char* utf8, int offset)
 381 {
 382     int pos = 0;
 383
 384     while (offset--) {
 385         pos++;
 386         while ((utf8[pos] & MASK) == COMP)
 387             pos++;
 388     }
 389     return pos;
 390 }
 391
 392 const char* get_codepage_name(int cp)
 393 {
 394     if (cp < 0 || cp>= NUM_CODEPAGES)
 395         return name_codepages[NUM_CODEPAGES];
 396     return name_codepages[cp];
 397 }