Add DM320 I²C driver, although not (yet) enabled in the sources.
[Rockbox.git] / firmware / common / unicode.c
blob2d11a388459fbe1bffb7e274bdedb22bd965b25a
/* Some conversion functions for handling UTF-8
 *
 * copyright Marcoen Hirschberg (2004,2005)
 *
 * I got all the info from:
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 * and
 * http://en.wikipedia.org/wiki/Unicode
 */
11 #include <stdio.h>
12 #include "file.h"
13 #include "debug.h"
14 #include "rbunicode.h"
15 #include "config.h"
/* Fall back to 0 when the platform headers do not provide O_BINARY, so the
   flag can still be OR-ed into the open() mode below. */
#if !defined(O_BINARY)
#define O_BINARY 0
#endif

/* Directory that holds the codepage table files */
#define CODEPAGE_DIR "/.rockbox/codepages"

/* Codepage iso_decode() uses when called with cp == -1;
   changed through set_codepage() */
static int default_codepage = 0;

/* 1-based index into filename[] of the table currently held in
   codepage_table, or 0 when no table is loaded */
static int loaded_cp_table = 0;
25 #ifdef HAVE_LCD_BITMAP
27 #define MAX_CP_TABLE_SIZE 32768
28 #define NUM_TABLES 5
30 enum {
31 ISO_8859_1 = 0, ISO_8859_7, ISO_8859_8, WIN_1251,
32 ISO_8859_11, WIN_1256, ISO_8859_9, ISO_8859_2, WIN_1250,
33 SJIS, GB_2312, KSX_1001, BIG_5, UTF_8, NUM_CODEPAGES
35 static const char *filename[NUM_TABLES] =
37 CODEPAGE_DIR"/iso.cp",
38 CODEPAGE_DIR"/932.cp", /* SJIS */
39 CODEPAGE_DIR"/936.cp", /* GB2312 */
40 CODEPAGE_DIR"/949.cp", /* KSX1001 */
41 CODEPAGE_DIR"/950.cp" /* BIG5 */
43 static const char cp_2_table[NUM_CODEPAGES] =
45 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
48 #else /* !HAVE_LCD_BITMAP, reduced support */
50 #define MAX_CP_TABLE_SIZE 640
51 #define NUM_TABLES 1
53 enum {
54 ISO_8859_1 = 0, ISO_8859_7, WIN_1251, ISO_8859_9,
55 ISO_8859_2, WIN_1250, UTF_8, NUM_CODEPAGES
57 static const char *filename[NUM_TABLES] =
59 CODEPAGE_DIR"/isomini.cp",
61 static const char cp_2_table[NUM_CODEPAGES] =
63 0, 1, 1, 1, 1, 1, 0
66 #endif
68 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
70 static const unsigned char utf8comp[6] =
72 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
75 /* Load codepage file into memory */
76 static int load_cp_table(int cp)
78 int i=0;
79 int table = cp_2_table[cp];
80 int file, tablesize;
81 unsigned char tmp[2];
83 if (table == 0 || table == loaded_cp_table)
84 return 1;
86 file = open(filename[table-1], O_RDONLY|O_BINARY);
88 if (file < 0) {
89 DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
90 return 0;
93 tablesize = filesize(file) / 2;
95 if (tablesize > MAX_CP_TABLE_SIZE) {
96 DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
97 close(file);
98 return 0;
101 while (i < tablesize) {
102 if (!read(file, tmp, 2)) {
103 DEBUGF("Can't read from codepage file: %s.cp\n",
104 filename[table-1]);
105 loaded_cp_table = 0;
106 return 0;
108 codepage_table[i++] = (tmp[1] << 8) | tmp[0];
111 loaded_cp_table = table;
112 close(file);
113 return 1;
116 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
117 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
119 int tail = 0;
121 if (ucs > 0x7F)
122 while (ucs >> (5*tail + 6))
123 tail++;
125 *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
126 while (tail--)
127 *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
129 return utf8;
132 /* Recode an iso encoded string to UTF-8 */
133 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
134 int cp, int count)
136 unsigned short ucs, tmp;
138 if (cp == -1) /* use default codepage */
139 cp = default_codepage;
141 if (!load_cp_table(cp)) cp = 0;
143 while (count--) {
144 if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
145 *utf8++ = *iso++;
147 else {
149 /* cp tells us which codepage to convert from */
150 switch (cp) {
151 case ISO_8859_7: /* Greek */
152 case WIN_1251: /* Cyrillic */
153 case ISO_8859_9: /* Turkish */
154 case ISO_8859_2: /* Latin Extended */
155 case WIN_1250: /* Central European */
156 #ifdef HAVE_LCD_BITMAP
157 case ISO_8859_8: /* Hebrew */
158 case ISO_8859_11: /* Thai */
159 case WIN_1256: /* Arabic */
160 #endif
161 tmp = ((cp-1)*128) + (*iso++ - 128);
162 ucs = codepage_table[tmp];
163 break;
165 #ifdef HAVE_LCD_BITMAP
166 case SJIS: /* Japanese */
167 if (*iso > 0xA0 && *iso < 0xE0) {
168 tmp = *iso++ | (0xA100 - 0x8000);
169 ucs = codepage_table[tmp];
170 break;
173 case GB_2312: /* Simplified Chinese */
174 case KSX_1001: /* Korean */
175 case BIG_5: /* Traditional Chinese */
176 if (count < 1 || !iso[1]) {
177 ucs = *iso++;
178 break;
181 /* we assume all cjk strings are written
182 in big endian order */
183 tmp = *iso++ << 8;
184 tmp |= *iso++;
185 tmp -= 0x8000;
186 ucs = codepage_table[tmp];
187 count--;
188 break;
189 #endif /* HAVE_LCD_BITMAP */
191 default:
192 ucs = *iso++;
193 break;
196 if (ucs == 0) /* unknown char, use replacement char */
197 ucs = 0xfffd;
198 utf8 = utf8encode(ucs, utf8);
201 return utf8;
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8
 *
 * utf16: input, two bytes per code unit, low byte first
 * utf8:  output buffer
 * count: number of 16-bit code units to convert
 *
 * A high surrogate directly followed by a low surrogate is combined into one
 * code point. A surrogate without a valid partner - including a high
 * surrogate in the last code unit - is written as U+FFFD, so no bytes
 * beyond `count` code units are ever read.
 *
 * Returns a pointer just past the last byte written to utf8. */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        unsigned char hi = utf16[1]; /* high byte of this code unit */

        /* Check for a surrogate pair; the second unit is only looked at
           when it lies inside the input */
        if (hi >= 0xD8 && hi < 0xDC && count >= 2
            && utf16[3] >= 0xDC && utf16[3] < 0xE0) {
            ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18)
                             | utf16[2] | ((utf16[3] - 0xDC) << 8));
            utf16 += 4;
            count -= 2;
        } else {
            if (hi >= 0xD8 && hi < 0xE0)
                ucs = 0xfffd; /* unpaired surrogate */
            else
                ucs = (utf16[0] | (utf16[1] << 8));
            utf16 += 2;
            count -= 1;
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8
 *
 * utf16: input, two bytes per code unit, high byte first
 * utf8:  output buffer
 * count: number of 16-bit code units to convert
 *
 * A high surrogate directly followed by a low surrogate is combined into one
 * code point. A surrogate without a valid partner - including a high
 * surrogate in the last code unit - is written as U+FFFD, so no bytes
 * beyond `count` code units are ever read.
 *
 * Returns a pointer just past the last byte written to utf8. */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        unsigned char hi = utf16[0]; /* high byte of this code unit */

        /* Check for a surrogate pair; the second unit is only looked at
           when it lies inside the input */
        if (hi >= 0xD8 && hi < 0xDC && count >= 2
            && utf16[2] >= 0xDC && utf16[2] < 0xE0) {
            ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10)
                             | ((utf16[2] - 0xDC) << 8) | utf16[3]);
            utf16 += 4;
            count -= 2;
        } else {
            if (hi >= 0xD8 && hi < 0xE0)
                ucs = 0xfffd; /* unpaired surrogate */
            else
                ucs = (utf16[0] << 8) | utf16[1];
            utf16 += 2;
            count -= 1;
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
#if 0 /* currently unused */
/* Recode any UTF-16 string to UTF-8
 *
 * The first code unit is checked for a byte order mark. When one is found
 * it selects the decoder and is skipped; otherwise the whole input is
 * decoded as big-endian. */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
                           unsigned int count)
{
    /* First code unit, read high byte first */
    unsigned long bom = (utf16[0] << 8) | utf16[1];

    if (bom == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16 + 2, utf8, count-1);

    if (bom == 0xFFFE)
        return utf16LEdecode(utf16 + 2, utf8, count-1);

    /* ADDME: Should default be LE or BE? */
    return utf16BEdecode(utf16, utf8, count);
}
#endif
270 /* Return the number of UTF-8 chars in a string */
271 unsigned long utf8length(const unsigned char *utf8)
273 unsigned long l = 0;
275 while (*utf8 != 0)
276 if ((*utf8++ & MASK) != COMP)
277 l++;
279 return l;
/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8: start of the sequence to decode
 * ucs:  receives the decoded value, or 0xfffd for an invalid sequence or a
 *       char above U-FFFF
 *
 * Decoding stops in front of the first byte that is not a valid
 * continuation byte. This includes the terminating NUL of a string that
 * ends in the middle of a sequence, so the returned pointer never moves
 * past the terminator. */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    int tail = 0;

    if ((c <= 0x7f) || (c >= 0xc2)) {
        /* Start of new character. */
        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
            code = c;
        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
            tail = 1;
            code = c & 0x1f;
        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
            tail = 2;
            code = c & 0x0f;
        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
            tail = 3;
            code = c & 0x07;
        } else {
            /* Invalid size. */
            code = 0xfffd;
        }

        while (tail--) {
            c = *utf8;
            if ((c & 0xc0) != 0x80) {
                /* Invalid continuation char (or the end of the string):
                   leave it in place for the next call */
                code = 0xfffd;
                break;
            }
            /* Valid continuation character. */
            code = (code << 6) | (c & 0x3f);
            utf8++;
        }
    } else {
        /* Invalid UTF-8 char */
        code = 0xfffd;
    }

    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? code : 0xfffd;
    return utf8;
}
328 void set_codepage(int cp)
330 default_codepage = cp;
331 return;
334 /* seek to a given char in a utf8 string and
335 return its start position in the string */
336 int utf8seek(const unsigned char* utf8, int offset)
338 int pos = 0;
340 while (offset--) {
341 pos++;
342 while ((utf8[pos] & MASK) == COMP)
343 pos++;
345 return pos;