New plugin: frotz, a Z-machine interpreter, for playing interactive fiction.
[kugel-rb.git] / firmware / common / unicode.c
blob4ef6eaae2bfbce15147e64f3b40941aa771d5b54
1 /***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
10 * Copyright (c) 2004,2005 by Marcoen Hirschberg
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
/* Some conversion functions for handling UTF-8
 *
 * I got all the info from:
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 * and
 * http://en.wikipedia.org/wiki/Unicode
 */
29 #include <stdio.h>
30 #include "file.h"
31 #include "debug.h"
32 #include "rbunicode.h"
33 #include "config.h"
/* Fallback for builds whose headers do not define O_BINARY: make the
 * flag a no-op so it can always be OR-ed into open() flags. */
#ifndef O_BINARY
#define O_BINARY 0
#endif

/* Directory that holds the codepage conversion tables. */
#define CODEPAGE_DIR ROCKBOX_DIR"/codepages"

/* Codepage used by iso_decode() when it is called with cp == -1;
 * changed through set_codepage(). */
static int default_codepage = 0;

/* Table number (value from cp_2_table) whose contents are currently held
 * in codepage_table; 0 means no table is loaded. */
static int loaded_cp_table = 0;
43 #ifdef HAVE_LCD_BITMAP
45 #define MAX_CP_TABLE_SIZE 32768
46 #define NUM_TABLES 5
48 static const char * const filename[NUM_TABLES] =
50 CODEPAGE_DIR"/iso.cp",
51 CODEPAGE_DIR"/932.cp", /* SJIS */
52 CODEPAGE_DIR"/936.cp", /* GB2312 */
53 CODEPAGE_DIR"/949.cp", /* KSX1001 */
54 CODEPAGE_DIR"/950.cp" /* BIG5 */
57 static const char cp_2_table[NUM_CODEPAGES] =
59 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
62 static const char * const name_codepages[NUM_CODEPAGES+1] =
64 "ISO-8859-1",
65 "ISO-8859-7",
66 "ISO-8859-8",
67 "CP1251",
68 "ISO-8859-11",
69 "CP1256",
70 "ISO-8859-9",
71 "ISO-8859-2",
72 "CP1250",
73 "SJIS",
74 "GB-2312",
75 "KSX-1001",
76 "BIG5",
77 "UTF-8",
78 "unknown"
81 #else /* !HAVE_LCD_BITMAP, reduced support */
83 #define MAX_CP_TABLE_SIZE 640
84 #define NUM_TABLES 1
86 static const char * const filename[NUM_TABLES] = {
87 CODEPAGE_DIR"/isomini.cp"
90 static const char cp_2_table[NUM_CODEPAGES] =
92 0, 1, 1, 1, 1, 1, 0
95 static const char * const name_codepages[NUM_CODEPAGES+1] =
97 "ISO-8859-1",
98 "ISO-8859-7",
99 "CP1251",
100 "ISO-8859-9",
101 "ISO-8859-2",
102 "CP1250",
103 "UTF-8",
104 "unknown"
107 #endif
109 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
111 static const unsigned char utf8comp[6] =
113 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
116 /* Load codepage file into memory */
117 static int load_cp_table(int cp)
119 int i = 0;
120 int table = cp_2_table[cp];
121 int file, tablesize;
122 unsigned char tmp[2];
124 if (table == 0 || table == loaded_cp_table)
125 return 1;
127 file = open(filename[table-1], O_RDONLY|O_BINARY);
129 if (file < 0) {
130 DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
131 return 0;
134 tablesize = filesize(file) / 2;
136 if (tablesize > MAX_CP_TABLE_SIZE) {
137 DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
138 close(file);
139 return 0;
142 while (i < tablesize) {
143 if (!read(file, tmp, 2)) {
144 DEBUGF("Can't read from codepage file: %s.cp\n",
145 filename[table-1]);
146 loaded_cp_table = 0;
147 return 0;
149 codepage_table[i++] = (tmp[1] << 8) | tmp[0];
152 loaded_cp_table = table;
153 close(file);
154 return 1;
157 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
158 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
160 int tail = 0;
162 if (ucs > 0x7F)
163 while (ucs >> (5*tail + 6))
164 tail++;
166 *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
167 while (tail--)
168 *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
170 return utf8;
173 /* Recode an iso encoded string to UTF-8 */
174 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
175 int cp, int count)
177 unsigned short ucs, tmp;
179 if (cp == -1) /* use default codepage */
180 cp = default_codepage;
182 if (!load_cp_table(cp)) cp = 0;
184 while (count--) {
185 if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
186 *utf8++ = *iso++;
188 else {
190 /* cp tells us which codepage to convert from */
191 switch (cp) {
192 case ISO_8859_7: /* Greek */
193 case WIN_1251: /* Cyrillic */
194 case ISO_8859_9: /* Turkish */
195 case ISO_8859_2: /* Latin Extended */
196 case WIN_1250: /* Central European */
197 #ifdef HAVE_LCD_BITMAP
198 case ISO_8859_8: /* Hebrew */
199 case ISO_8859_11: /* Thai */
200 case WIN_1256: /* Arabic */
201 #endif
202 tmp = ((cp-1)*128) + (*iso++ - 128);
203 ucs = codepage_table[tmp];
204 break;
206 #ifdef HAVE_LCD_BITMAP
207 case SJIS: /* Japanese */
208 if (*iso > 0xA0 && *iso < 0xE0) {
209 tmp = *iso++ | (0xA100 - 0x8000);
210 ucs = codepage_table[tmp];
211 break;
214 case GB_2312: /* Simplified Chinese */
215 case KSX_1001: /* Korean */
216 case BIG_5: /* Traditional Chinese */
217 if (count < 1 || !iso[1]) {
218 ucs = *iso++;
219 break;
222 /* we assume all cjk strings are written
223 in big endian order */
224 tmp = *iso++ << 8;
225 tmp |= *iso++;
226 tmp -= 0x8000;
227 ucs = codepage_table[tmp];
228 count--;
229 break;
230 #endif /* HAVE_LCD_BITMAP */
232 default:
233 ucs = *iso++;
234 break;
237 if (ucs == 0) /* unknown char, use replacement char */
238 ucs = 0xfffd;
239 utf8 = utf8encode(ucs, utf8);
242 return utf8;
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * utf16 - input, `count` 16-bit code units (2 bytes each)
 * utf8  - output buffer
 *
 * A surrogate that is not part of a complete high/low pair inside the
 * `count` units is encoded as U+FFFD and only that one unit is consumed.
 * Returns a pointer just past the last byte written to `utf8`.
 */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        const unsigned int hi = utf16[1]; /* high byte of this unit */

        if (hi >= 0xD8 && hi < 0xDC && count >= 2
            && utf16[3] >= 0xDC && utf16[3] < 0xE0) {
            /* High surrogate followed by a low surrogate; the second
               unit is only looked at when it lies within `count` */
            ucs = 0x10000 + (((unsigned long)(hi - 0xD8) << 18)
                             | ((unsigned long)utf16[0] << 10)
                             | ((unsigned long)(utf16[3] - 0xDC) << 8)
                             | utf16[2]);
            utf16 += 4;
            count -= 2;
        } else if (hi >= 0xD8 && hi < 0xE0) {
            /* Unpaired or truncated surrogate */
            ucs = 0xFFFD;
            utf16 += 2;
            count -= 1;
        } else {
            ucs = (utf16[0] | (utf16[1] << 8));
            utf16 += 2;
            count -= 1;
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * utf16 - input, `count` 16-bit code units (2 bytes each)
 * utf8  - output buffer
 *
 * A surrogate that is not part of a complete high/low pair inside the
 * `count` units is encoded as U+FFFD and only that one unit is consumed.
 * Returns a pointer just past the last byte written to `utf8`.
 */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        const unsigned int hi = utf16[0]; /* high byte of this unit */

        if (hi >= 0xD8 && hi < 0xDC && count >= 2
            && utf16[2] >= 0xDC && utf16[2] < 0xE0) {
            /* High surrogate followed by a low surrogate; the second
               unit is only looked at when it lies within `count` */
            ucs = 0x10000 + (((unsigned long)(hi - 0xD8) << 18)
                             | ((unsigned long)utf16[1] << 10)
                             | ((unsigned long)(utf16[2] - 0xDC) << 8)
                             | utf16[3]);
            utf16 += 4;
            count -= 2;
        } else if (hi >= 0xD8 && hi < 0xE0) {
            /* Unpaired or truncated surrogate */
            ucs = 0xFFFD;
            utf16 += 2;
            count -= 1;
        } else {
            ucs = (utf16[0] << 8) | utf16[1];
            utf16 += 2;
            count -= 1;
        }
        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
#if 0 /* currently unused */
/* Recode any UTF-16 string to UTF-8.
 *
 * The first code unit is checked for a byte order mark; when one is
 * found it is skipped and selects the decoder, otherwise the whole
 * input is decoded as big-endian.
 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
                           unsigned int count)
{
    /* First unit, read as big-endian */
    unsigned long first = ((unsigned long)utf16[0] << 8) | utf16[1];

    if (first == 0xFEFF) /* big-endian BOM */
        return utf16BEdecode(utf16 + 2, utf8, count-1);

    if (first == 0xFFFE) /* byte-swapped BOM: little-endian */
        return utf16LEdecode(utf16 + 2, utf8, count-1);

    /* ADDME: Should default be LE or BE? */
    return utf16BEdecode(utf16, utf8, count);
}
#endif
311 /* Return the number of UTF-8 chars in a string */
312 unsigned long utf8length(const unsigned char *utf8)
314 unsigned long l = 0;
316 while (*utf8 != 0)
317 if ((*utf8++ & MASK) != COMP)
318 l++;
320 return l;
/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8 - start of the sequence to decode
 * ucs  - receives the decoded value; U+FFFD is stored for invalid lead
 *        bytes, invalid or missing continuation bytes, and for values
 *        above U+FFFF
 *
 * A byte that is not a valid continuation (including the terminating
 * NUL of a truncated sequence) is not consumed, so the returned pointer
 * never moves past the end of a NUL-terminated string.
 */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code = 0xfffd;
    int tail = 0;

    if ((c <= 0x7f) || (c >= 0xc2)) {
        /* Start of new character. */
        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
            code = c;
        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
            tail = 1;
            code = c & 0x1f;
        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
            tail = 2;
            code = c & 0x0f;
        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
            tail = 3;
            code = c & 0x07;
        } else {
            /* Invalid size. */
            code = 0xfffd;
        }

        while (tail--) {
            /* Look at the next byte before consuming it, so that a NUL
               or any other non-continuation byte is left in place */
            c = *utf8;
            if ((c & 0xc0) != 0x80) {
                /* Invalid or missing continuation char */
                code = 0xfffd;
                break;
            }
            /* Valid continuation character. */
            utf8++;
            code = (code << 6) | (c & 0x3f);
        }
    } else {
        /* Invalid UTF-8 char */
        code = 0xfffd;
    }

    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? code : 0xfffd;
    return utf8;
}
369 void set_codepage(int cp)
371 default_codepage = cp;
372 return;
375 /* seek to a given char in a utf8 string and
376 return its start position in the string */
377 int utf8seek(const unsigned char* utf8, int offset)
379 int pos = 0;
381 while (offset--) {
382 pos++;
383 while ((utf8[pos] & MASK) == COMP)
384 pos++;
386 return pos;
389 const char* get_codepage_name(int cp)
391 if (cp < 0 || cp>= NUM_CODEPAGES)
392 return name_codepages[NUM_CODEPAGES];
393 return name_codepages[cp];