1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (c) 2004,2005 by Marcoen Hirschberg
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
/* Some conversion functions for handling UTF-8
 *
 * I got all the info from:
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 * http://en.wikipedia.org/wiki/Unicode
 */
32 #include "rbunicode.h"
/* Directory that holds the codepage table files. */
#define CODEPAGE_DIR    ROCKBOX_DIR"/codepages"

/* Codepage used by iso_decode() when it is called with cp == -1. */
static int default_codepage = 0;

/* Table number (a cp_2_table[] value) of the table last loaded by
   load_cp_table(); it starts out as 0. */
static int loaded_cp_table = 0;
43 #ifdef HAVE_LCD_BITMAP
45 #define MAX_CP_TABLE_SIZE 32768
48 static const char * const filename
[NUM_TABLES
] =
50 CODEPAGE_DIR
"/iso.cp",
51 CODEPAGE_DIR
"/932.cp", /* SJIS */
52 CODEPAGE_DIR
"/936.cp", /* GB2312 */
53 CODEPAGE_DIR
"/949.cp", /* KSX1001 */
54 CODEPAGE_DIR
"/950.cp" /* BIG5 */
57 static const char cp_2_table
[NUM_CODEPAGES
] =
59 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
62 static const char * const name_codepages
[NUM_CODEPAGES
+1] =
81 #else /* !HAVE_LCD_BITMAP, reduced support */
83 #define MAX_CP_TABLE_SIZE 640
86 static const char * const filename
[NUM_TABLES
] = {
87 CODEPAGE_DIR
"/isomini.cp"
90 static const char cp_2_table
[NUM_CODEPAGES
] =
95 static const char * const name_codepages
[NUM_CODEPAGES
+1] =
109 static unsigned short codepage_table
[MAX_CP_TABLE_SIZE
];
111 static const unsigned char utf8comp
[6] =
113 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
116 /* Load codepage file into memory */
117 static int load_cp_table(int cp
)
120 int table
= cp_2_table
[cp
];
122 unsigned char tmp
[2];
124 if (table
== 0 || table
== loaded_cp_table
)
127 file
= open(filename
[table
-1], O_RDONLY
|O_BINARY
);
130 DEBUGF("Can't open codepage file: %s.cp\n", filename
[table
-1]);
134 tablesize
= filesize(file
) / 2;
136 if (tablesize
> MAX_CP_TABLE_SIZE
) {
137 DEBUGF("Invalid codepage file: %s.cp\n", filename
[table
-1]);
142 while (i
< tablesize
) {
143 if (!read(file
, tmp
, 2)) {
144 DEBUGF("Can't read from codepage file: %s.cp\n",
149 codepage_table
[i
++] = (tmp
[1] << 8) | tmp
[0];
152 loaded_cp_table
= table
;
157 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
158 unsigned char* utf8encode(unsigned long ucs
, unsigned char *utf8
)
163 while (ucs
>> (5*tail
+ 6))
166 *utf8
++ = (ucs
>> (6*tail
)) | utf8comp
[tail
];
168 *utf8
++ = ((ucs
>> (6*tail
)) & (MASK
^ 0xFF)) | COMP
;
173 /* Recode an iso encoded string to UTF-8 */
174 unsigned char* iso_decode(const unsigned char *iso
, unsigned char *utf8
,
177 unsigned short ucs
, tmp
;
179 if (cp
== -1) /* use default codepage */
180 cp
= default_codepage
;
182 if (!load_cp_table(cp
)) cp
= 0;
185 if (*iso
< 128 || cp
== UTF_8
) /* Already UTF-8 */
190 /* cp tells us which codepage to convert from */
192 case ISO_8859_7
: /* Greek */
193 case WIN_1251
: /* Cyrillic */
194 case ISO_8859_9
: /* Turkish */
195 case ISO_8859_2
: /* Latin Extended */
196 case WIN_1250
: /* Central European */
197 #ifdef HAVE_LCD_BITMAP
198 case ISO_8859_8
: /* Hebrew */
199 case ISO_8859_11
: /* Thai */
200 case WIN_1256
: /* Arabic */
202 tmp
= ((cp
-1)*128) + (*iso
++ - 128);
203 ucs
= codepage_table
[tmp
];
206 #ifdef HAVE_LCD_BITMAP
207 case SJIS
: /* Japanese */
208 if (*iso
> 0xA0 && *iso
< 0xE0) {
209 tmp
= *iso
++ | (0xA100 - 0x8000);
210 ucs
= codepage_table
[tmp
];
214 case GB_2312
: /* Simplified Chinese */
215 case KSX_1001
: /* Korean */
216 case BIG_5
: /* Traditional Chinese */
217 if (count
< 1 || !iso
[1]) {
222 /* we assume all cjk strings are written
223 in big endian order */
227 ucs
= codepage_table
[tmp
];
230 #endif /* HAVE_LCD_BITMAP */
237 if (ucs
== 0) /* unknown char, use replacement char */
239 utf8
= utf8encode(ucs
, utf8
);
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * utf16  source, two bytes per code unit, low byte first
 * utf8   destination; the caller provides the space
 * count  number of 16-bit code units to convert (a surrogate pair counts
 *        as two)
 *
 * Returns a pointer just past the last byte written.  A surrogate that is
 * not part of a complete, well-formed pair is written as U+FFFD.
 *
 * NOTE(review): the trailing parameter (int count) was restored from its
 * use by the callers in this file - confirm against rbunicode.h.
 */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        unsigned long unit = utf16[0] | (utf16[1] << 8);
        int units = 1;

        ucs = unit;

        /* Check for a surrogate pair */
        if (unit >= 0xD800 && unit < 0xE000) {
            unsigned long low = 0;

            /* only a high surrogate may start a pair, and the second
               unit is read only if it is still inside the input */
            if (unit < 0xDC00 && count >= 2)
                low = utf16[2] | (utf16[3] << 8);

            if (low >= 0xDC00 && low < 0xE000) {
                ucs = 0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00));
                units = 2;
            } else {
                ucs = 0xFFFD; /* unpaired surrogate */
            }
        }

        utf16 += 2 * units;
        count -= units;
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * utf16  source, two bytes per code unit, high byte first
 * utf8   destination; the caller provides the space
 * count  number of 16-bit code units to convert (a surrogate pair counts
 *        as two)
 *
 * Returns a pointer just past the last byte written.  A surrogate that is
 * not part of a complete, well-formed pair is written as U+FFFD.
 *
 * NOTE(review): the trailing parameter (int count) was restored from its
 * use by the callers in this file - confirm against rbunicode.h.
 */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        unsigned long unit = (utf16[0] << 8) | utf16[1];
        int units = 1;

        ucs = unit;

        /* Check for a surrogate pair */
        if (unit >= 0xD800 && unit < 0xE000) {
            unsigned long low = 0;

            /* only a high surrogate may start a pair, and the second
               unit is read only if it is still inside the input */
            if (unit < 0xDC00 && count >= 2)
                low = (utf16[2] << 8) | utf16[3];

            if (low >= 0xDC00 && low < 0xE000) {
                ucs = 0x10000 + (((unit - 0xD800) << 10) | (low - 0xDC00));
                units = 2;
            } else {
                ucs = 0xFFFD; /* unpaired surrogate */
            }
        }

        utf16 += 2 * units;
        count -= units;
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
#if 0 /* currently unused */
/* Recode any UTF-16 string to UTF-8.
 *
 * The first code unit is read high byte first and compared against the
 * byte order mark: 0xFEFF hands the rest to utf16BEdecode(), 0xFFFE to
 * utf16LEdecode().  Without a mark the whole input goes to utf16BEdecode().
 *
 * NOTE(review): the trailing parameter (unsigned int count) was restored
 * from its use in the body - confirm against rbunicode.h.
 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
                           unsigned int count)
{
    unsigned long ucs;

    /* nothing to inspect, and nothing that may be read */
    if (count == 0)
        return utf8;

    ucs = (utf16[0] << 8) | utf16[1];

    if (ucs == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16 + 2, utf8, count-1);
    else if (ucs == 0xFFFE)
        return utf16LEdecode(utf16 + 2, utf8, count-1);
    else { /* ADDME: Should default be LE or BE? */
        return utf16BEdecode(utf16, utf8, count);
    }
}
#endif
311 /* Return the number of UTF-8 chars in a string */
312 unsigned long utf8length(const unsigned char *utf8
)
317 if ((*utf8
++ & MASK
) != COMP
)
/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8  points at the first byte of the character
 * ucs   receives the decoded value; 0xfffd is stored for an invalid first
 *       byte, a broken or truncated sequence, an overlong encoding, and
 *       for anything above U-FFFF
 *
 * A byte that is not a valid continuation (including the terminating 0)
 * is never consumed, so the returned pointer does not move past the end
 * of the string when a sequence is cut short.
 */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    unsigned long min = 0;  /* smallest value that needs this many bytes */
    int tail = 0;           /* continuation bytes still expected */

    if (c < 0x80) {         /* U-00000000 - U-0000007F, 1 byte */
        code = c;
    } else if (c < 0xc2) {
        /* Invalid UTF-8 char: stray continuation byte, or a 2 byte lead
           that could only produce an overlong encoding */
        code = 0xfffd;
    } else if (c < 0xe0) {  /* U-00000080 - U-000007FF, 2 bytes */
        tail = 1;
        code = c & 0x1f;
    } else if (c < 0xf0) {  /* U-00000800 - U-0000FFFF, 3 bytes */
        tail = 2;
        min = 0x800;
        code = c & 0x0f;
    } else if (c < 0xf5) {  /* U-00010000 - U-001FFFFF, 4 bytes */
        tail = 3;
        min = 0x10000;
        code = c & 0x07;
    } else {
        /* Invalid size. */
        code = 0xfffd;
    }

    while (tail > 0) {
        c = *utf8;  /* look first, consume only if it belongs to us */
        if ((c & 0xc0) != 0x80) {
            /* Invalid continuation char or end of string */
            code = 0xfffd;
            break;
        }
        /* Valid continuation character. */
        code = (code << 6) | (c & 0x3f);
        utf8++;
        tail--;
    }

    /* a value that would have fit in fewer bytes is an overlong encoding */
    if (code < min)
        code = 0xfffd;

    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? (unsigned short)code : 0xfffd;
    return utf8;
}
369 void set_codepage(int cp
)
371 default_codepage
= cp
;
375 /* seek to a given char in a utf8 string and
376 return its start position in the string */
377 int utf8seek(const unsigned char* utf8
, int offset
)
383 while ((utf8
[pos
] & MASK
) == COMP
)
389 const char* get_codepage_name(int cp
)
391 if (cp
< 0 || cp
>= NUM_CODEPAGES
)
392 return name_codepages
[NUM_CODEPAGES
];
393 return name_codepages
[cp
];