Bump version numbers for 3.13
[maemo-rb.git] / firmware / common / unicode.c
blob3ad63ee4fb7a79af816bf2e509c84d9e1772d486
1 /***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
10 * Copyright (c) 2004,2005 by Marcoen Hirschberg
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
/* Some conversion functions for handling UTF-8
 *
 * I got all the info from:
 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 * and
 * http://en.wikipedia.org/wiki/Unicode
 */
29 #include <stdio.h>
30 #include "config.h"
31 #include "file.h"
32 #include "debug.h"
33 #include "rbunicode.h"
34 #include "rbpaths.h"
/* Some platforms do not define O_BINARY; fall back to a no-op flag so that
 * open() calls can pass it unconditionally. */
#ifndef O_BINARY
#define O_BINARY 0
#endif

/* Codepage used by iso_decode() when it is called with cp == -1. */
static int default_codepage = 0;

/* Number of the table currently held in codepage_table (a 1-based index
 * into filename[]), or 0 when no table is loaded. */
static int loaded_cp_table = 0;
43 #ifdef HAVE_LCD_BITMAP
45 #define MAX_CP_TABLE_SIZE 32768
46 #define NUM_TABLES 5
48 static const char * const filename[NUM_TABLES] =
50 CODEPAGE_DIR"/iso.cp",
51 CODEPAGE_DIR"/932.cp", /* SJIS */
52 CODEPAGE_DIR"/936.cp", /* GB2312 */
53 CODEPAGE_DIR"/949.cp", /* KSX1001 */
54 CODEPAGE_DIR"/950.cp" /* BIG5 */
57 static const char cp_2_table[NUM_CODEPAGES] =
59 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
62 static const char * const name_codepages[NUM_CODEPAGES+1] =
64 "ISO-8859-1",
65 "ISO-8859-7",
66 "ISO-8859-8",
67 "CP1251",
68 "ISO-8859-11",
69 "CP1256",
70 "ISO-8859-9",
71 "ISO-8859-2",
72 "CP1250",
73 "CP1252",
74 "SJIS",
75 "GB-2312",
76 "KSX-1001",
77 "BIG5",
78 "UTF-8",
79 "unknown"
82 #else /* !HAVE_LCD_BITMAP, reduced support */
84 #define MAX_CP_TABLE_SIZE 768
85 #define NUM_TABLES 1
87 static const char * const filename[NUM_TABLES] = {
88 CODEPAGE_DIR"/isomini.cp"
91 static const char cp_2_table[NUM_CODEPAGES] =
93 0, 1, 1, 1, 1, 1, 1, 0
96 static const char * const name_codepages[NUM_CODEPAGES+1] =
98 "ISO-8859-1",
99 "ISO-8859-7",
100 "CP1251",
101 "ISO-8859-9",
102 "ISO-8859-2",
103 "CP1250",
104 "CP1252",
105 "UTF-8",
106 "unknown"
109 #endif
111 static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
113 static const unsigned char utf8comp[6] =
115 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
118 /* Load codepage file into memory */
119 static int load_cp_table(int cp)
121 int i = 0;
122 int table = cp_2_table[cp];
123 int file, tablesize;
124 unsigned char tmp[2];
126 if (table == 0 || table == loaded_cp_table)
127 return 1;
129 file = open(filename[table-1], O_RDONLY|O_BINARY);
131 if (file < 0) {
132 DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
133 return 0;
136 tablesize = filesize(file) / 2;
138 if (tablesize > MAX_CP_TABLE_SIZE) {
139 DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
140 close(file);
141 return 0;
144 while (i < tablesize) {
145 if (!read(file, tmp, 2)) {
146 DEBUGF("Can't read from codepage file: %s.cp\n",
147 filename[table-1]);
148 loaded_cp_table = 0;
149 return 0;
151 codepage_table[i++] = (tmp[1] << 8) | tmp[0];
154 loaded_cp_table = table;
155 close(file);
156 return 1;
159 /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */
160 unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
162 int tail = 0;
164 if (ucs > 0x7F)
165 while (ucs >> (5*tail + 6))
166 tail++;
168 *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
169 while (tail--)
170 *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
172 return utf8;
175 /* Recode an iso encoded string to UTF-8 */
176 unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
177 int cp, int count)
179 unsigned short ucs, tmp;
181 if (cp == -1) /* use default codepage */
182 cp = default_codepage;
184 if (!load_cp_table(cp)) cp = 0;
186 while (count--) {
187 if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
188 *utf8++ = *iso++;
190 else {
192 /* cp tells us which codepage to convert from */
193 switch (cp) {
194 case ISO_8859_7: /* Greek */
195 case WIN_1252: /* Western European */
196 case WIN_1251: /* Cyrillic */
197 case ISO_8859_9: /* Turkish */
198 case ISO_8859_2: /* Latin Extended */
199 case WIN_1250: /* Central European */
200 #ifdef HAVE_LCD_BITMAP
201 case ISO_8859_8: /* Hebrew */
202 case ISO_8859_11: /* Thai */
203 case WIN_1256: /* Arabic */
204 #endif
205 tmp = ((cp-1)*128) + (*iso++ - 128);
206 ucs = codepage_table[tmp];
207 break;
209 #ifdef HAVE_LCD_BITMAP
210 case SJIS: /* Japanese */
211 if (*iso > 0xA0 && *iso < 0xE0) {
212 tmp = *iso++ | (0xA100 - 0x8000);
213 ucs = codepage_table[tmp];
214 break;
217 case GB_2312: /* Simplified Chinese */
218 case KSX_1001: /* Korean */
219 case BIG_5: /* Traditional Chinese */
220 if (count < 1 || !iso[1]) {
221 ucs = *iso++;
222 break;
225 /* we assume all cjk strings are written
226 in big endian order */
227 tmp = *iso++ << 8;
228 tmp |= *iso++;
229 tmp -= 0x8000;
230 ucs = codepage_table[tmp];
231 count--;
232 break;
233 #endif /* HAVE_LCD_BITMAP */
235 default:
236 ucs = *iso++;
237 break;
240 if (ucs == 0) /* unknown char, use replacement char */
241 ucs = 0xfffd;
242 utf8 = utf8encode(ucs, utf8);
245 return utf8;
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8
 *
 * utf16: input, two bytes per code unit, low byte first
 * utf8:  output buffer; it is not terminated by this function
 * count: number of 16-bit code units to convert
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point. A surrogate without its partner (including a high surrogate in
 * the very last unit) is converted to the replacement character U+FFFD.
 *
 * Returns a pointer just past the last byte written to `utf8`. */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        unsigned long unit = utf16[0] | ((unsigned long)utf16[1] << 8);

        utf16 += 2;
        count--;

        if (unit >= 0xD800 && unit <= 0xDBFF) {
            /* High surrogate: look at the next unit only if there is one */
            unsigned long low = 0;

            if (count > 0)
                low = utf16[0] | ((unsigned long)utf16[1] << 8);

            if (low >= 0xDC00 && low <= 0xDFFF) {
                ucs = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                utf16 += 2;
                count--;
            } else {
                ucs = 0xFFFD; /* unpaired high surrogate */
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            ucs = 0xFFFD; /* low surrogate without a preceding high one */
        } else {
            ucs = unit;
        }

        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8
 *
 * utf16: input, two bytes per code unit, high byte first
 * utf8:  output buffer; it is not terminated by this function
 * count: number of 16-bit code units to convert
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point. A surrogate without its partner (including a high surrogate in
 * the very last unit) is converted to the replacement character U+FFFD.
 *
 * Returns a pointer just past the last byte written to `utf8`. */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;

    while (count > 0) {
        unsigned long unit = ((unsigned long)utf16[0] << 8) | utf16[1];

        utf16 += 2;
        count--;

        if (unit >= 0xD800 && unit <= 0xDBFF) {
            /* High surrogate: look at the next unit only if there is one */
            unsigned long low = 0;

            if (count > 0)
                low = ((unsigned long)utf16[0] << 8) | utf16[1];

            if (low >= 0xDC00 && low <= 0xDFFF) {
                ucs = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                utf16 += 2;
                count--;
            } else {
                ucs = 0xFFFD; /* unpaired high surrogate */
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            ucs = 0xFFFD; /* low surrogate without a preceding high one */
        } else {
            ucs = unit;
        }

        utf8 = utf8encode(ucs, utf8);
    }
    return utf8;
}
#if 0 /* currently unused */
/* Recode any UTF-16 string to UTF-8
 *
 * The first code unit is inspected for a byte order mark: 0xFE 0xFF selects
 * big-endian and 0xFF 0xFE little-endian decoding, and the mark itself is
 * skipped. Without a mark the input is decoded as big-endian.
 *
 * count is the number of 16-bit code units in `utf16`, including the mark.
 * Returns a pointer just past the last byte written to `utf8`. */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
                           unsigned int count)
{
    unsigned long ucs;

    /* No input at all: do not read a byte order mark that is not there,
       and do not let count-1 wrap around below */
    if (count == 0)
        return utf8;

    ucs = *(utf16++) << 8;
    ucs |= *(utf16++);

    if (ucs == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16, utf8, count-1);
    else if (ucs == 0xFFFE)
        return utf16LEdecode(utf16, utf8, count-1);
    else { /* ADDME: Should default be LE or BE? */
        utf16 -= 2; /* no BOM: the first unit is ordinary data */
        return utf16BEdecode(utf16, utf8, count);
    }
}
#endif
314 /* Return the number of UTF-8 chars in a string */
315 unsigned long utf8length(const unsigned char *utf8)
317 unsigned long l = 0;
319 while (*utf8 != 0)
320 if ((*utf8++ & MASK) != COMP)
321 l++;
323 return l;
/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8: start of the sequence to decode; its first byte is always consumed
 * ucs:  receives the decoded value. U+FFFD is stored for an invalid first
 *       byte, a missing or invalid continuation byte, and for any value
 *       above U+FFFF.
 *
 * Decoding stops in front of the first byte that is not a continuation
 * byte. This includes the string terminator, so a sequence that is cut
 * short by the end of the string never moves the returned pointer past
 * the terminator. */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    int tail = 0;

    if ((c <= 0x7f) || (c >= 0xc2)) {
        /* Start of new character. */
        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
            code = c;
        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
            tail = 1;
            code = c & 0x1f;
        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
            tail = 2;
            code = c & 0x0f;
        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
            tail = 3;
            code = c & 0x07;
        } else {
            /* Invalid size. */
            code = 0xfffd;
        }

        while (tail-- > 0) {
            c = *utf8;
            if ((c & 0xc0) != 0x80) {
                /* Missing continuation byte (possibly the terminator):
                   leave it for the next call to look at */
                code = 0xfffd;
                break;
            }
            /* Valid continuation character. */
            code = (code << 6) | (c & 0x3f);
            utf8++;
        }
    } else {
        /* Invalid UTF-8 char */
        code = 0xfffd;
    }

    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? code : 0xfffd;
    return utf8;
}
372 void set_codepage(int cp)
374 default_codepage = cp;
375 return;
378 /* seek to a given char in a utf8 string and
379 return its start position in the string */
380 int utf8seek(const unsigned char* utf8, int offset)
382 int pos = 0;
384 while (offset--) {
385 pos++;
386 while ((utf8[pos] & MASK) == COMP)
387 pos++;
389 return pos;
392 const char* get_codepage_name(int cp)
394 if (cp < 0 || cp>= NUM_CODEPAGES)
395 return name_codepages[NUM_CODEPAGES];
396 return name_codepages[cp];