1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (c) 2004,2005 by Marcoen Hirschberg
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
21 /* Some conversion functions for handling UTF-8
23 * I got all the info from:
24 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
26 * http://en.wikipedia.org/wiki/Unicode
 */
33 #include "rbunicode.h"
/* Codepage that iso_decode() substitutes when its cp argument is -1.
 * Written by set_codepage(). */
40 static int default_codepage
= 0;
/* Table number most recently recorded by load_cp_table(). That function
 * compares the requested table against this value before opening a file. */
41 static int loaded_cp_table
= 0;
/* Build-dependent codepage tables. NOTE(review): the initializer braces,
 * the contents of name_codepages and the closing endif of this conditional
 * are not visible in this view. */
43 #ifdef HAVE_LCD_BITMAP
/* Capacity of codepage_table in 16-bit entries. load_cp_table() compares
 * (file size / 2) against this limit. */
45 #define MAX_CP_TABLE_SIZE 32768
/* Codepage file paths. load_cp_table() opens filename[table-1], so the
 * table numbers stored in cp_2_table are 1-based indexes into this array. */
48 static const char * const filename
[NUM_TABLES
] =
50 CODEPAGE_DIR
"/iso.cp",
51 CODEPAGE_DIR
"/932.cp", /* SJIS */
52 CODEPAGE_DIR
"/936.cp", /* GB2312 */
53 CODEPAGE_DIR
"/949.cp", /* KSX1001 */
54 CODEPAGE_DIR
"/950.cp" /* BIG5 */
/* Maps a codepage id (the array index) to a table number. The value 0 is
 * special-cased at the top of load_cp_table(). Ids 1 to 9 all use table 1,
 * which is the iso.cp file listed first above. */
57 static const char cp_2_table
[NUM_CODEPAGES
] =
59 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
/* Codepage names indexed by codepage id. The array has one entry more than
 * NUM_CODEPAGES, and get_codepage_name() returns that last entry for an
 * out-of-range id. */
62 static const char * const name_codepages
[NUM_CODEPAGES
+1] =
82 #else /* !HAVE_LCD_BITMAP, reduced support */
/* 768 = 6 * 128: six codepage ids map to table 1 below, and iso_decode()
 * indexes 128 entries per single-byte codepage. */
84 #define MAX_CP_TABLE_SIZE 768
/* Only one codepage file is listed in the reduced build. */
87 static const char * const filename
[NUM_TABLES
] = {
88 CODEPAGE_DIR
"/isomini.cp"
/* Same role as in the bitmap build: codepage id to 1-based table number,
 * with 0 special-cased by load_cp_table(). */
91 static const char cp_2_table
[NUM_CODEPAGES
] =
93 0, 1, 1, 1, 1, 1, 1, 0
/* Codepage names, again with one extra trailing entry that
 * get_codepage_name() uses for an out-of-range id. */
96 static const char * const name_codepages
[NUM_CODEPAGES
+1] =
/* In-memory copy of the currently loaded codepage table. Filled by
 * load_cp_table() and read by iso_decode(). Each entry is one 16-bit value. */
111 static unsigned short codepage_table
[MAX_CP_TABLE_SIZE
];
/* Lead-byte prefix bits, indexed by the number of trailing bytes.
 * utf8encode() ORs utf8comp[tail] into the first byte it writes. */
113 static const unsigned char utf8comp
[6] =
115 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
118 /* Load codepage file into memory.
 *
 * cp selects a table number through cp_2_table. The matching file,
 * filename[table-1], is read two bytes at a time into codepage_table.
 * iso_decode() treats a zero result from this function as failure and
 * falls back to cp = 0.
 *
 * NOTE(review): the local declarations, the return statements, the braces
 * and any close of the file are not visible in this view, so only the
 * visible lines are described below.
 */
119 static int load_cp_table(int cp
)
/* NOTE(review): cp is used as an index without a visible range check. */
122 int table
= cp_2_table
[cp
];
/* Scratch buffer holding the two bytes of one table entry. */
124 unsigned char tmp
[2];
/* Special case: table 0, or the table that loaded_cp_table already
 * records. The guarded statement is not visible in this view. */
126 if (table
== 0 || table
== loaded_cp_table
)
/* Table numbers are 1-based, hence the -1 when indexing filename. */
129 file
= open(filename
[table
-1], O_RDONLY
|O_BINARY
);
/* NOTE(review): the visible filename entries already end in .cp, so the
 * %s.cp format in these debug messages prints the extension twice. */
132 DEBUGF("Can't open codepage file: %s.cp\n", filename
[table
-1]);
/* Each entry occupies two bytes in the file. */
136 tablesize
= filesize(file
) / 2;
/* Reject a file holding more entries than codepage_table has room for. */
138 if (tablesize
> MAX_CP_TABLE_SIZE
) {
139 DEBUGF("Invalid codepage file: %s.cp\n", filename
[table
-1]);
/* Read the entries one by one. */
144 while (i
< tablesize
) {
/* NOTE(review): this test is true only when read() returns 0. A negative
 * error return or a short read of one byte would not take this branch.
 * Confirm that this is the intended handling. */
145 if (!read(file
, tmp
, 2)) {
146 DEBUGF("Can't read from codepage file: %s.cp\n",
/* Entries are stored low byte first: tmp[0] is the low byte and tmp[1]
 * the high byte of the 16-bit value. */
151 codepage_table
[i
++] = (tmp
[1] << 8) | tmp
[0];
/* Record which table codepage_table now holds. */
154 loaded_cp_table
= table
;
159 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char.
 *
 * ucs:  the value to encode.
 * utf8: output position. Bytes are stored through it and it is advanced
 *       past each byte written.
 *
 * NOTE(review): the declaration of tail, the statements controlled by the
 * loops and the return statement are not visible in this view.
 */
160 unsigned char* utf8encode(unsigned long ucs
, unsigned char *utf8
)
/* True while ucs still has bits set at or above bit (5*tail + 6). For
 * tail >= 1 that is the number of payload bits a sequence with tail
 * trailing bytes can carry: 11, 16, 21 and so on. Presumably the body,
 * which is not visible, increments tail - TODO confirm. */
165 while (ucs
>> (5*tail
+ 6))
/* First byte: the top payload bits of ucs combined with the prefix for
 * this sequence length. */
168 *utf8
++ = (ucs
>> (6*tail
)) | utf8comp
[tail
];
/* Trailing byte: six payload bits with the marker COMP ORed in.
 * (MASK ^ 0xFF) is the complement of MASK within one byte. MASK and COMP
 * are defined outside this view, presumably as 0xC0 and 0x80 - confirm in
 * rbunicode.h. */
170 *utf8
++ = ((ucs
>> (6*tail
)) & (MASK
^ 0xFF)) | COMP
;
175 /* Recode an iso encoded string to UTF-8.
 *
 * iso:  input bytes in the source codepage.
 * utf8: output position, advanced through utf8encode().
 * cp and count are used below, but the rest of the parameter list that
 * declares them is not visible in this view.
 *
 * NOTE(review): the loop and switch headers, the break statements, the
 * return statement and several assignments are not visible here.
 */
176 unsigned char* iso_decode(const unsigned char *iso
, unsigned char *utf8
,
/* ucs receives the value looked up in codepage_table. tmp is the index
 * used for that lookup. */
179 unsigned short ucs
, tmp
;
181 if (cp
== -1) /* use default codepage */
182 cp
= default_codepage
;
/* A zero result from load_cp_table() makes the conversion fall back to
 * codepage 0. */
184 if (!load_cp_table(cp
)) cp
= 0;
/* Bytes below 128, and everything when cp is UTF_8, are handled by a
 * statement that is not visible in this view. */
187 if (*iso
< 128 || cp
== UTF_8
) /* Already UTF-8 */
192 /* cp tells us which codepage to convert from */
/* Single-byte codepages share one table. */
194 case ISO_8859_7
: /* Greek */
195 case WIN_1252
: /* Western European */
196 case WIN_1251
: /* Cyrillic */
197 case ISO_8859_9
: /* Turkish */
198 case ISO_8859_2
: /* Latin Extended */
199 case WIN_1250
: /* Central European */
200 #ifdef HAVE_LCD_BITMAP
201 case ISO_8859_8
: /* Hebrew */
202 case ISO_8859_11
: /* Thai */
203 case WIN_1256
: /* Arabic */
/* Each codepage owns 128 consecutive entries. (cp-1)*128 is the base of
 * its slice and (byte - 128) is the offset inside it. This assumes these
 * codepage ids are numbered consecutively from 1 - TODO confirm against
 * the enum in rbunicode.h. */
205 tmp
= ((cp
-1)*128) + (*iso
++ - 128);
206 ucs
= codepage_table
[tmp
];
209 #ifdef HAVE_LCD_BITMAP
210 case SJIS
: /* Japanese */
/* Bytes 0xA1 to 0xDF are consumed as one byte. The table index is the
 * byte value ORed with 0x2100, which is 0xA100 - 0x8000. */
211 if (*iso
> 0xA0 && *iso
< 0xE0) {
212 tmp
= *iso
++ | (0xA100 - 0x8000);
213 ucs
= codepage_table
[tmp
];
217 case GB_2312
: /* Simplified Chinese */
218 case KSX_1001
: /* Korean */
219 case BIG_5
: /* Traditional Chinese */
/* Detects a missing second byte: either no input is left or the next
 * byte is 0. The guarded statements are not visible in this view. */
220 if (count
< 1 || !iso
[1]) {
225 /* we assume all cjk strings are written
226 in big endian order */
/* NOTE(review): the computation of tmp for the two-byte case is not
 * visible in this view. */
230 ucs
= codepage_table
[tmp
];
233 #endif /* HAVE_LCD_BITMAP */
/* A table value of 0 marks a byte with no mapping. The replacement
 * assignment is not visible in this view. */
240 if (ucs
== 0) /* unknown char, use replacement char */
/* Append the UTF-8 form and advance the output pointer. */
242 utf8
= utf8encode(ucs
, utf8
);
248 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * utf16: input bytes, low byte of each 16-bit unit first.
 * utf8:  output position, advanced through utf8encode().
 *
 * NOTE(review): the rest of the parameter list, the declaration of ucs,
 * the loop header, the pointer advances and the return statement are not
 * visible in this view.
 */
249 unsigned char* utf16LEdecode(const unsigned char *utf16
, unsigned char *utf8
,
255 /* Check for a surrogate pair */
/* utf16[1] is the high byte of the first unit. NOTE(review): the range
 * 0xD8 to 0xDF also accepts a low surrogate (0xDC to 0xDF) in the leading
 * position, and no check on the second unit is visible. */
256 if (utf16
[1] >= 0xD8 && utf16
[1] < 0xE0) {
/* Combine both units: the first unit minus 0xD800 supplies bits 10 and up,
 * the second unit minus 0xDC00 supplies bits 0 to 9, then 0x10000 is added.
 * NOTE(review): bytes 2 and 3 are read with no visible check that they
 * are available. */
257 ucs
= 0x10000 + ((utf16
[0] << 10) | ((utf16
[1] - 0xD8) << 18)
258 | utf16
[2] | ((utf16
[3] - 0xDC) << 8));
/* Plain 16-bit unit: low byte first, high byte second. */
262 ucs
= (utf16
[0] | (utf16
[1] << 8));
/* Append the UTF-8 form and advance the output pointer. */
266 utf8
= utf8encode(ucs
, utf8
);
271 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * utf16: input bytes, high byte of each 16-bit unit first.
 * utf8:  output position, advanced through utf8encode().
 *
 * NOTE(review): the rest of the parameter list, the declaration of ucs,
 * the loop header, the pointer advances and the return statement are not
 * visible in this view.
 */
272 unsigned char* utf16BEdecode(const unsigned char *utf16
, unsigned char *utf8
,
/* The first byte is the high byte of the unit. NOTE(review): as in the
 * little-endian variant, the range 0xD8 to 0xDF also accepts a low
 * surrogate in the leading position. */
278 if (*utf16
>= 0xD8 && *utf16
< 0xE0) { /* Check for a surrogate pair */
/* Combine both units: the first unit minus 0xD800 supplies bits 10 and up,
 * the second unit minus 0xDC00 supplies bits 0 to 9, then 0x10000 is added.
 * NOTE(review): bytes 2 and 3 are read with no visible check that they
 * are available. */
279 ucs
= 0x10000 + (((utf16
[0] - 0xD8) << 18) | (utf16
[1] << 10)
280 | ((utf16
[2] - 0xDC) << 8) | utf16
[3]);
/* Plain 16-bit unit: high byte first, low byte second. */
284 ucs
= (utf16
[0] << 8) | utf16
[1];
/* Append the UTF-8 form and advance the output pointer. */
288 utf8
= utf8encode(ucs
, utf8
);
/* Everything from here to the matching endif is compiled out. The endif
 * itself is not visible in this view. */
293 #if 0 /* currently unused */
294 /* Recode any UTF-16 string to UTF-8.
 *
 * Reads the first 16-bit unit and dispatches on it to utf16BEdecode() or
 * utf16LEdecode().
 *
 * NOTE(review): the rest of the parameter list, the declaration of ucs and
 * the statement that merges the second byte into ucs are not visible in
 * this view.
 */
295 unsigned char* utf16decode(const unsigned char *utf16
, unsigned char *utf8
,
/* The first input byte becomes the high byte of ucs. */
300 ucs
= *(utf16
++) << 8;
/* 0xFEFF read in this byte order selects the big-endian decoder, and the
 * byte-swapped value 0xFFFE selects the little-endian decoder. In both
 * cases count is reduced by one for the unit just consumed. */
303 if (ucs
== 0xFEFF) /* Check for BOM */
304 return utf16BEdecode(utf16
, utf8
, count
-1);
305 else if (ucs
== 0xFFFE)
306 return utf16LEdecode(utf16
, utf8
, count
-1);
/* Without a byte order mark the input is decoded as big-endian with the
 * full count. */
307 else { /* ADDME: Should default be LE or BE? */
309 return utf16BEdecode(utf16
, utf8
, count
);
314 /* Return the number of UTF-8 chars in a string.
 *
 * NOTE(review): the counter, the loop header and the return statement are
 * not visible in this view. Only the per-byte test is shown.
 */
315 unsigned long utf8length(const unsigned char *utf8
)
/* Advances one byte and tests whether its masked top bits differ from
 * COMP, which is the marker utf8encode() puts on trailing bytes.
 * Presumably the guarded statement increments the character count -
 * TODO confirm. */
320 if ((*utf8
++ & MASK
) != COMP
)
326 /* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8: input position.
 * ucs:  receives the decoded value as a 16-bit quantity. Values of
 *       0x10000 and above are stored as 0xfffd.
 *
 * NOTE(review): the declarations of code and tail, the assignments inside
 * each length branch, the error paths and the return statement are not
 * visible in this view.
 */
327 const unsigned char* utf8decode(const unsigned char *utf8
, unsigned short *ucs
)
/* Fetch the first byte and step past it. */
329 unsigned char c
= *utf8
++;
/* Accept only bytes that can start a character: below 0x80, or 0xc2 and
 * above. Bytes 0x80 to 0xc1 are excluded here. */
333 if ((c
<= 0x7f) || (c
>= 0xc2)) {
334 /* Start of new character. */
335 if (c
< 0x80) { /* U-00000000 - U-0000007F, 1 byte */
337 } else if (c
< 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
340 } else if (c
< 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
343 } else if (c
< 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
/* Consume up to tail further bytes, stopping early at a 0 byte.
 * NOTE(review): utf8 has already been advanced past the byte when the
 * 0 test fails. Whether the pointer is stepped back is not visible. */
351 while (tail
-- && ((c
= *utf8
++) != 0)) {
/* A trailing byte has the bit pattern 10xxxxxx. */
352 if ((c
& 0xc0) == 0x80) {
353 /* Valid continuation character. */
/* Shift in the six payload bits of this byte. */
354 code
= (code
<< 6) | (c
& 0x3f);
357 /* Invalid continuation char */
364 /* Invalid UTF-8 char */
367 /* currently we don't support chars above U-FFFF */
368 *ucs
= (code
< 0x10000) ? code
: 0xfffd;
/* Set the default codepage.
 *
 * Stores cp in default_codepage, which iso_decode() uses when it is
 * called with cp == -1. No range check on cp is visible here.
 */
372 void set_codepage(int cp
)
374 default_codepage
= cp
;
378 /* seek to a given char in a utf8 string and
379 return its start position in the string */
/* utf8:   the string to scan.
 * offset: the character index to seek to, as described above.
 *
 * NOTE(review): the declaration of pos, the outer loop over offset and the
 * return statement are not visible in this view.
 */
380 int utf8seek(const unsigned char* utf8
, int offset
)
/* True while the byte at pos carries the trailing-byte marker COMP.
 * Presumably the body advances pos so that it lands on the first byte of
 * a character - TODO confirm. */
386 while ((utf8
[pos
] & MASK
) == COMP
)
/* Return the display name for a codepage id.
 *
 * cp: codepage id. A value below 0, or at or above NUM_CODEPAGES, yields
 *     the extra last entry of name_codepages, which is declared with
 *     NUM_CODEPAGES+1 elements. Any other value yields name_codepages[cp].
 */
392 const char* get_codepage_name(int cp
)
/* Out-of-range id: use the trailing fallback entry. */
394 if (cp
< 0 || cp
>= NUM_CODEPAGES
)
395 return name_codepages
[NUM_CODEPAGES
];
396 return name_codepages
[cp
];