/* Some conversion functions for handling UTF-8
 *
 * copyright Marcoen Hirschberg (2004,2005)
 *
 * I got all the info from:
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 * http://en.wikipedia.org/wiki/Unicode
 */
14 #include "rbunicode.h"
21 #define CODEPAGE_DIR "/.rockbox/codepages"
22 static int default_codepage
= 0;
23 static int loaded_cp_table
= 0;
25 #ifdef HAVE_LCD_BITMAP
27 #define MAX_CP_TABLE_SIZE 32768
31 ISO_8859_1
= 0, ISO_8859_7
, ISO_8859_8
, WIN_1251
,
32 ISO_8859_11
, WIN_1256
, ISO_8859_9
, ISO_8859_2
, WIN_1250
,
33 SJIS
, GB_2312
, KSX_1001
, BIG_5
, UTF_8
, NUM_CODEPAGES
35 static const char *filename
[NUM_TABLES
] =
37 CODEPAGE_DIR
"/iso.cp",
38 CODEPAGE_DIR
"/932.cp", /* SJIS */
39 CODEPAGE_DIR
"/936.cp", /* GB2312 */
40 CODEPAGE_DIR
"/949.cp", /* KSX1001 */
41 CODEPAGE_DIR
"/950.cp" /* BIG5 */
/* Maps a codepage id to a table number.  load_cp_table() indexes this
   array with the codepage id, treats 0 as "no table file needed" and
   otherwise opens filename[table-1].  The entries follow the order of
   the codepage ids listed above: ISO_8859_1 -> 0, the eight single-byte
   codepages ISO_8859_7..WIN_1250 -> 1, SJIS -> 2, GB_2312 -> 3,
   KSX_1001 -> 4, BIG_5 -> 5, UTF_8 -> 0. */
43 static const char cp_2_table
[NUM_CODEPAGES
] =
45 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
48 #else /* !HAVE_LCD_BITMAP, reduced support */
50 #define MAX_CP_TABLE_SIZE 640
54 ISO_8859_1
= 0, ISO_8859_7
, WIN_1251
, ISO_8859_9
,
55 ISO_8859_2
, WIN_1250
, UTF_8
, NUM_CODEPAGES
57 static const char *filename
[NUM_TABLES
] =
59 CODEPAGE_DIR
"/isomini.cp",
61 static const char cp_2_table
[NUM_CODEPAGES
] =
/* Holds the entries of the codepage file that was loaded last, one
   16-bit value per slot.  Written by load_cp_table() and read by
   iso_decode(); MAX_CP_TABLE_SIZE is the capacity in entries. */
68 static unsigned short codepage_table
[MAX_CP_TABLE_SIZE
];
/* Prefix bits for the first byte of a UTF-8 sequence.  utf8encode() ORs
   utf8comp[tail] into the first byte it writes; `tail` is presumably
   the number of continuation bytes that follow (0..5) - confirm against
   the full utf8encode() body. */
70 static const unsigned char utf8comp
[6] =
72 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
/*
 * load_cp_table(cp): bring the conversion table for codepage `cp` into
 * codepage_table[].
 *
 *  - `cp` is turned into a table number through cp_2_table[].
 *  - Table 0, or the table already recorded in loaded_cp_table, takes
 *    the early branch and does not touch the file system.
 *  - Otherwise filename[table-1] is opened read-only in binary mode and
 *    read two bytes at a time into codepage_table[].
 *
 * iso_decode() treats a zero result as failure and falls back to
 * codepage 0.
 *
 * NOTE(review): the return statements and any close() of `file` are not
 * visible in this copy of the function - confirm that every exit path
 * closes the descriptor.
 */
75 /* Load codepage file into memory */
76 static int load_cp_table(int cp
)
/* NOTE(review): `cp` indexes cp_2_table[] without a visible range check. */
79 int table
= cp_2_table
[cp
];
/* Nothing to load for table 0 or for the table that is already resident. */
83 if (table
== 0 || table
== loaded_cp_table
)
/* Table numbers start at 1, filename[] starts at 0. */
86 file
= open(filename
[table
-1], O_RDONLY
|O_BINARY
);
/* NOTE(review): the filename[] entries already end in ".cp", so the
   "%s.cp" format in these messages prints the suffix twice. */
89 DEBUGF("Can't open codepage file: %s.cp\n", filename
[table
-1]);
/* Each entry is two bytes, so the entry count is half the file size. */
93 tablesize
= filesize(file
) / 2;
/* Reject files with more entries than codepage_table[] can hold. */
95 if (tablesize
> MAX_CP_TABLE_SIZE
) {
96 DEBUGF("Invalid codepage file: %s.cp\n", filename
[table
-1]);
/* Copy the file into codepage_table[] one entry per iteration. */
101 while (i
< tablesize
) {
/* Only a zero return from read() is handled as a failure here.
   NOTE(review): a negative (error) or one-byte result is not caught. */
102 if (!read(file
, tmp
, 2)) {
103 DEBUGF("Can't read from codepage file: %s.cp\n",
/* The file stores each entry low byte first. */
108 codepage_table
[i
++] = (tmp
[1] << 8) | tmp
[0];
/* Remember which table is resident for the check at the top. */
111 loaded_cp_table
= table
;
/*
 * utf8encode(ucs, utf8)
 *   ucs  - the value to encode.
 *   utf8 - where the encoded bytes are written; advanced past every
 *          byte that is stored.
 * No output bound is checked in the visible code, so the caller has to
 * provide enough room for the whole sequence.
 * MASK and COMP are not defined in this part of the file (presumably
 * they come from rbunicode.h).
 */
116 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
117 unsigned char* utf8encode(unsigned long ucs
, unsigned char *utf8
)
/* A sequence with `tail` continuation bytes carries 5*tail + 6 payload
   bits; the condition is true while ucs still has bits above that.
   (The loop body is not visible here; presumably it increments tail.) */
122 while (ucs
>> (5*tail
+ 6))
/* First byte: the topmost payload bits combined with the prefix taken
   from utf8comp[]. */
125 *utf8
++ = (ucs
>> (6*tail
)) | utf8comp
[tail
];
/* Following byte: the payload bits selected by (MASK ^ 0xFF), with COMP
   supplying the remaining bits. */
127 *utf8
++ = ((ucs
>> (6*tail
)) & (MASK
^ 0xFF)) | COMP
;
/*
 * iso_decode(): convert text in a legacy codepage to UTF-8.
 *   iso  - input bytes.
 *   utf8 - output position; each converted character is appended with
 *          utf8encode().
 * The body also uses `cp` (codepage id, -1 selects the default set with
 * set_codepage()) and `count`; their declarations are on a signature
 * line that is not visible in this copy.
 * Characters are looked up in codepage_table[]; a table entry of 0
 * marks a character that has no mapping.
 */
132 /* Recode an iso encoded string to UTF-8 */
133 unsigned char* iso_decode(const unsigned char *iso
, unsigned char *utf8
,
136 unsigned short ucs
, tmp
;
138 if (cp
== -1) /* use default codepage */
139 cp
= default_codepage
;
/* If the table cannot be loaded, carry on as codepage 0 (ISO_8859_1). */
141 if (!load_cp_table(cp
)) cp
= 0;
/* Bytes below 128, and all input when cp is UTF_8, take this branch
   instead of a table lookup (the branch body is not visible here). */
144 if (*iso
< 128 || cp
== UTF_8
) /* Already UTF-8 */
149 /* cp tells us which codepage to convert from */
151 case ISO_8859_7
: /* Greek */
152 case WIN_1251
: /* Cyrillic */
153 case ISO_8859_9
: /* Turkish */
154 case ISO_8859_2
: /* Latin Extended */
155 case WIN_1250
: /* Central European */
156 #ifdef HAVE_LCD_BITMAP
157 case ISO_8859_8
: /* Hebrew */
158 case ISO_8859_11
: /* Thai */
159 case WIN_1256
: /* Arabic */
/* The single-byte codepages share one table: codepage id N owns the 128
   entries starting at (N-1)*128, and the input byte minus 128 selects
   the entry inside that slice.  One input byte is consumed. */
161 tmp
= ((cp
-1)*128) + (*iso
++ - 128);
162 ucs
= codepage_table
[tmp
];
165 #ifdef HAVE_LCD_BITMAP
166 case SJIS
: /* Japanese */
/* Bytes 0xA1..0xDF are looked up on their own (one byte consumed) at
   table index byte | 0x2100; presumably this is the half-width katakana
   range - confirm against the table layout. */
167 if (*iso
> 0xA0 && *iso
< 0xE0) {
168 tmp
= *iso
++ | (0xA100 - 0x8000);
169 ucs
= codepage_table
[tmp
];
173 case GB_2312
: /* Simplified Chinese */
174 case KSX_1001
: /* Korean */
175 case BIG_5
: /* Traditional Chinese */
/* These encodings need a second byte: this branch is taken when no
   input is left or the next byte is NUL (its body is not visible). */
176 if (count
< 1 || !iso
[1]) {
181 /* we assume all cjk strings are written
182 in big endian order */
186 ucs
= codepage_table
[tmp
];
189 #endif /* HAVE_LCD_BITMAP */
196 if (ucs
== 0) /* unknown char, use replacement char */
/* Append the character to the output and advance the output pointer. */
198 utf8
= utf8encode(ucs
, utf8
);
/*
 * utf16LEdecode(): each 16-bit unit is stored low byte first, so
 * utf16[1] is the high byte of the first unit and utf16[3] the high
 * byte of the second.  Every decoded value is appended to the output
 * with utf8encode().
 * NOTE(review): the surrogate test accepts high bytes 0xD8..0xDF, which
 * also matches a low surrogate (0xDC..0xDF) in the leading position,
 * and the second unit is not checked to be a low surrogate before
 * utf16[2] and utf16[3] are read.
 */
204 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
205 unsigned char* utf16LEdecode(const unsigned char *utf16
, unsigned char *utf8
,
211 /* Check for a surrogate pair */
212 if (utf16
[1] >= 0xD8 && utf16
[1] < 0xE0) {
/* 0x10000 + ((first unit - 0xD800) << 10) + (second unit - 0xDC00),
   assembled directly from the four bytes. */
213 ucs
= 0x10000 + ((utf16
[0] << 10) | ((utf16
[1] - 0xD8) << 18)
214 | utf16
[2] | ((utf16
[3] - 0xDC) << 8));
/* Plain 16-bit unit: low byte first, then high byte. */
218 ucs
= (utf16
[0] | (utf16
[1] << 8));
222 utf8
= utf8encode(ucs
, utf8
);
/*
 * utf16BEdecode(): each 16-bit unit is stored high byte first, so
 * utf16[0] is the high byte of the first unit and utf16[2] the high
 * byte of the second.  Every decoded value is appended to the output
 * with utf8encode().
 * NOTE(review): the surrogate test accepts high bytes 0xD8..0xDF, which
 * also matches a low surrogate (0xDC..0xDF) in the leading position,
 * and the second unit is not checked to be a low surrogate before
 * utf16[2] and utf16[3] are read.
 */
227 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
228 unsigned char* utf16BEdecode(const unsigned char *utf16
, unsigned char *utf8
,
234 if (*utf16
>= 0xD8 && *utf16
< 0xE0) { /* Check for a surrogate pair */
/* 0x10000 + ((first unit - 0xD800) << 10) + (second unit - 0xDC00),
   assembled directly from the four bytes. */
235 ucs
= 0x10000 + (((utf16
[0] - 0xD8) << 18) | (utf16
[1] << 10)
236 | ((utf16
[2] - 0xDC) << 8) | utf16
[3]);
/* Plain 16-bit unit: high byte first, then low byte. */
240 ucs
= (utf16
[0] << 8) | utf16
[1];
244 utf8
= utf8encode(ucs
, utf8
);
/*
 * utf16decode(): choose the byte order from a leading byte-order mark
 * and hand the rest of the input to utf16BEdecode() or utf16LEdecode().
 * The first unit is read high byte first, so 0xFEFF means big-endian
 * input and 0xFFFE means the bytes are swapped, i.e. little-endian.
 * When a mark is found, `count - 1` is passed on so that the mark is
 * not counted.  Without a mark the input is decoded as big-endian.
 */
249 #if 0 /* currently unused */
250 /* Recode any UTF-16 string to UTF-8 */
251 unsigned char* utf16decode(const unsigned char *utf16
, unsigned char *utf8
,
/* The first input byte becomes the high half of ucs (the statement that
   adds the low half is not visible in this copy). */
256 ucs
= *(utf16
++) << 8;
259 if (ucs
== 0xFEFF) /* Check for BOM */
260 return utf16BEdecode(utf16
, utf8
, count
-1);
261 else if (ucs
== 0xFFFE)
262 return utf16LEdecode(utf16
, utf8
, count
-1);
263 else { /* ADDME: Should default be LE or BE? */
265 return utf16BEdecode(utf16
, utf8
, count
);
/*
 * utf8length(utf8): count characters rather than bytes.
 * The visible test is true for every byte whose MASK bits differ from
 * COMP; utf8seek() uses the opposite test to step over continuation
 * bytes, so such a byte is the first byte of a character (presumably
 * that is what gets counted - the statement under the `if` is not
 * visible in this copy).
 */
270 /* Return the number of UTF-8 chars in a string */
271 unsigned long utf8length(const unsigned char *utf8
)
276 if ((*utf8
++ & MASK
) != COMP
)
/*
 * utf8decode(utf8, ucs)
 *   utf8 - start of the character to decode.
 *   ucs  - receives the decoded value; it is 16 bits wide, so anything
 *          at or above 0x10000 is stored as 0xfffd.
 * The first byte selects the sequence length, then up to `tail`
 * continuation bytes each contribute six bits to `code`.
 */
282 /* Decode 1 UTF-8 char and return a pointer to the next char. */
283 const unsigned char* utf8decode(const unsigned char *utf8
, unsigned short *ucs
)
285 unsigned char c
= *utf8
++;
/* 0x80..0xC1 are not accepted as a first byte: 0x80..0xBF have the
   continuation-byte pattern tested below, and 0xC0/0xC1 could only
   start an overlong two-byte form. */
289 if ((c
<= 0x7f) || (c
>= 0xc2)) {
290 /* Start of new character. */
291 if (c
< 0x80) { /* U-00000000 - U-0000007F, 1 byte */
293 } else if (c
< 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
296 } else if (c
< 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
299 } else if (c
< 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
/* Read the continuation bytes, stopping early at a NUL byte.
   NOTE(review): when the loop stops on NUL the post-increment has
   already moved utf8 past that byte - confirm the rest of the function
   or the callers account for this. */
307 while (tail
-- && ((c
= *utf8
++) != 0)) {
308 if ((c
& 0xc0) == 0x80) {
309 /* Valid continuation character. */
310 code
= (code
<< 6) | (c
& 0x3f);
313 /* Invalid continuation char */
320 /* Invalid UTF-8 char */
323 /* currently we don't support chars above U-FFFF */
324 *ucs
= (code
< 0x10000) ? code
: 0xfffd;
/* Store the codepage that iso_decode() uses when it is called with
   cp == -1.  The value is stored without a range check here, and
   load_cp_table() later uses it directly as an index into cp_2_table[],
   so callers should pass a valid codepage id. */
328 void set_codepage(int cp
)
330 default_codepage
= cp
;
334 /* seek to a given char in a utf8 string and
335 return its start position in the string */
336 int utf8seek(const unsigned char* utf8
, int offset
)
342 while ((utf8
[pos
] & MASK
) == COMP
)