1 /* Some conversion functions for handling UTF-8
3 * Copyright Marcoen Hirschberg (2004,2005)
5 * I got all the info from:
6 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
8 * http://en.wikipedia.org/wiki/Unicode
 */
14 #include "rbunicode.h"
/* Number of codepage identifiers; used as the length of cp_2_table. */
21 #define NUM_CODEPAGES 13
/* Codepage that iso_decode() substitutes when it is called with cp == -1.
   Starts at 0 and is changed through set_codepage(). */
23 static int default_codepage
= 0;
/* Buffer of 16-bit table entries. load_cp_table() fills it from a codepage
   file and iso_decode() reads the UCS value for a character out of it. */
24 static unsigned short codepage_table
[MAX_CP_TABLE_SIZE
];
/* Table number (a value taken from cp_2_table) that load_cp_table() last
   recorded as loaded; 0 is the initial value, before anything was loaded. */
25 static int loaded_cp_table
= 0;
/* Marker bits for the first byte of an encoded character. utf8encode()
   indexes this array with its `tail` counter and ORs the entry into the
   first byte it writes.
   NOTE(review): `tail` is presumably the number of continuation bytes that
   follow the first byte -- confirm against the full utf8encode() body. */
28 static const unsigned char utf8comp
[6] =
30 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
/* Paths of the codepage table files, each built from CODEPAGE_DIR plus a
   file name. load_cp_table() opens filename[table - 1], where `table` is
   the value cp_2_table holds for the requested codepage. */
33 static const char *filename
[NUM_TABLES
] =
35 CODEPAGE_DIR
"/iso.cp",
36 CODEPAGE_DIR
"/932.cp", /* SJIS */
37 CODEPAGE_DIR
"/936.cp", /* GB2312 */
38 CODEPAGE_DIR
"/949.cp", /* KSX1001 */
39 CODEPAGE_DIR
"/950.cp" /* BIG5 */
/* Maps a codepage number (the array index) to a table number.
   load_cp_table() looks the codepage up here and opens
   filename[table - 1] for the result; codepages 1 to 7 all map to table 1.
   NOTE(review): only 12 initializers are visible for NUM_CODEPAGES (13)
   elements, so the last element would be left at its implicit value of 0
   -- confirm that this is intended. */
42 static const char cp_2_table
[NUM_CODEPAGES
] =
44 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5
47 /* Load the codepage table that codepage number `cp` maps to (through
 * cp_2_table) from its file into codepage_table.
 *
 * The visible steps are: look up the table number, test whether cp is 0 or
 * the table is already the one recorded in loaded_cp_table, open
 * filename[table - 1], derive the entry count as half of the offset that
 * lseek(..., SEEK_END) reports, reject counts above MAX_CP_TABLE_SIZE, read
 * every entry as two bytes (low byte first), and finally record the table
 * number in loaded_cp_table.
 *
 * NOTE(review): the return statements are not visible in this view.
 * iso_decode() treats a zero result as failure and falls back to cp 0.
 */
48 int load_cp_table(int cp
)
/* NOTE(review): cp indexes cp_2_table here without any visible range
   check -- confirm that callers only pass 0 .. NUM_CODEPAGES - 1. */
51 int table
= cp_2_table
[cp
];
/* cp 0 needs no table; a table that is already loaded is not read again. */
55 if (cp
== 0 || table
== loaded_cp_table
)
58 file
= open(filename
[table
-1], O_RDONLY
|O_BINARY
);
/* NOTE(review): the filename[] entries already end in ".cp", so the
   "%s.cp" format used by the messages below prints the extension twice. */
61 DEBUGF("Can't open codepage file: %s.cp\n", filename
[table
-1]);
/* Entry count: the end-of-file offset divided by the 2 bytes per entry. */
65 tablesize
= lseek(file
, 0, SEEK_END
) / 2;
/* Rewind so that the read loop starts at the first entry. */
66 lseek(file
, 0, SEEK_SET
);
/* A file with more entries than codepage_table can hold is rejected. */
68 if (tablesize
> MAX_CP_TABLE_SIZE
) {
69 DEBUGF("Invalid codepage file: %s.cp\n", filename
[table
-1]);
74 while (i
< tablesize
) {
/* NOTE(review): `!read(...)` is true only for a return value of 0; a
   negative error return or a 1-byte short read would pass this test --
   confirm whether that can happen with this read(). */
75 if (!read(file
, tmp
, 2)) {
76 DEBUGF("Can't read from codepage file: %s.cp\n",
/* Entries are stored low byte first: tmp[0] is the low half of the value. */
81 codepage_table
[i
++] = (tmp
[1] << 8) | tmp
[0];
/* Remember which table codepage_table now holds. */
84 loaded_cp_table
= table
;
89 /* Encode the UCS value `ucs` as UTF-8, writing the bytes through `utf8`,
 * and return a pointer after this UTF-8 char.
 *
 * NOTE(review): no buffer length is passed in, so the caller presumably has
 * to guarantee that enough room is left at `utf8` -- confirm with callers.
 */
90 unsigned char* utf8encode(unsigned long ucs
, unsigned char *utf8
)
/* True while ucs still has bits set at or above position 5*tail + 6, that
   is, while the value does not fit the length selected so far. */
95 while (ucs
>> (5*tail
+ 6))
/* First byte: the bits of ucs above the lowest 6*tail bits, combined with
   the marker that utf8comp holds for this `tail`. */
98 *utf8
++ = (ucs
>> (6*tail
)) | utf8comp
[tail
];
/* Following byte: the bits selected by (MASK ^ 0xFF) from the shifted
   value, tagged with COMP. */
100 *utf8
++ = ((ucs
>> (6*tail
)) & (MASK
^ 0xFF)) | COMP
;
105 /* Recode an iso encoded string to UTF-8.
 *
 * `iso` is the input and `utf8` the output position; every converted
 * character is appended with utf8encode(). `cp` selects the source
 * codepage: -1 means default_codepage, and when load_cp_table(cp) returns
 * zero the function continues with cp 0.
 *
 * NOTE(review): the rest of the parameter list, the loop around the switch
 * and the return statement are not visible in this view.
 */
106 unsigned char* iso_decode(const unsigned char *iso
, unsigned char *utf8
,
109 unsigned short ucs
, tmp
;
111 if (cp
== -1) /* use default codepage */
112 cp
= default_codepage
;
/* Fall back to cp 0 when the table for cp could not be loaded. */
114 if (!load_cp_table(cp
)) cp
= 0;
122 /* cp tells us which codepage to convert from */
124 case 0x01: /* Greek (ISO-8859-7) */
125 case 0x02: /* Hebrew (ISO-8859-8) */
126 case 0x03: /* Russian (CP1251) */
127 case 0x04: /* Thai (ISO-8859-11) */
128 case 0x05: /* Arabic (ISO-8859-6) */
129 case 0x06: /* Turkish (ISO-8859-9) */
130 case 0x07: /* Latin Extended (ISO-8859-2) */
/* Table index: 128 entries per codepage, selected by (cp - 1), plus the
   input byte minus 128.
   NOTE(review): this assumes the byte is 128 or above; presumably a check
   on a line not visible here handles smaller bytes -- confirm. */
131 tmp
= ((cp
-1)*128) + (*iso
++ - 128);
132 ucs
= codepage_table
[tmp
];
135 case 0x08: /* Japanese (SJIS) */
/* Bytes strictly between 0xA0 and 0xE0 take a separate path.
   NOTE(review): presumably these are the single-byte characters of this
   encoding; the computation of tmp is not visible -- confirm. */
136 if (*iso
> 0xA0 && *iso
< 0xE0) {
138 ucs
= codepage_table
[tmp
];
142 case 0x09: /* Simplified Chinese (GB2312) */
143 case 0x0A: /* Korean (KSX1001) */
144 case 0x0B: /* Traditional Chinese (BIG5) */
/* Tested before a second input byte is used: the remaining count is
   below 1, or the next byte is 0. */
145 if (count
< 1 || !iso
[1]) {
150 /* we assume all cjk strings are written
151 in big endian order */
155 ucs
= codepage_table
[tmp
];
159 case 0x0C: /* UTF-8, do nothing */
165 if (ucs
== 0) /* unknown char, assume invalid encoding */
/* Append the converted character and advance the output position. */
167 utf8
= utf8encode(ucs
, utf8
);
173 /* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * Each 16-bit unit is taken as utf16[0] (low byte) and utf16[1] (high
 * byte). A unit whose high byte lies in 0xD8..0xDF is combined with the
 * following unit into one value of 0x10000 or above. Every value is
 * appended to the output with utf8encode().
 *
 * NOTE(review): the rest of the parameter list, the loop and the return
 * statement are not visible in this view.
 */
174 unsigned char* utf16LEdecode(const unsigned char *utf16
, unsigned char *utf8
,
180 /* Check for a surrogate pair */
/* NOTE(review): high bytes 0xDC..0xDF (a low surrogate in first position)
   also pass this test, and no visible check confirms that utf16[3] lies in
   0xDC..0xDF -- confirm whether malformed input needs handling. */
181 if (utf16
[1] >= 0xD8 && utf16
[1] < 0xE0) {
/* First unit minus 0xD800 supplies the upper bits (shifted left by 10
   overall); second unit minus 0xDC00 supplies the lower 10 bits. */
182 ucs
= 0x10000 + ((utf16
[0] << 10) | ((utf16
[1] - 0xD8) << 18)
183 | utf16
[2] | ((utf16
[3] - 0xDC) << 8));
/* Plain 16-bit unit: low byte first, high byte second. */
187 ucs
= (utf16
[0] | (utf16
[1] << 8));
191 utf8
= utf8encode(ucs
, utf8
);
196 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * Each 16-bit unit is taken as utf16[0] (high byte) and utf16[1] (low
 * byte). A unit whose high byte lies in 0xD8..0xDF is combined with the
 * following unit into one value of 0x10000 or above. Every value is
 * appended to the output with utf8encode().
 *
 * NOTE(review): the rest of the parameter list, the loop and the return
 * statement are not visible in this view.
 */
197 unsigned char* utf16BEdecode(const unsigned char *utf16
, unsigned char *utf8
,
/* NOTE(review): high bytes 0xDC..0xDF (a low surrogate in first position)
   also pass this test, and no visible check confirms that utf16[2] lies in
   0xDC..0xDF -- confirm whether malformed input needs handling. */
203 if (*utf16
>= 0xD8 && *utf16
< 0xE0) { /* Check for a surrogate pair */
/* First unit minus 0xD800 supplies the upper bits (shifted left by 10
   overall); second unit minus 0xDC00 supplies the lower 10 bits. */
204 ucs
= 0x10000 + (((utf16
[0] - 0xD8) << 18) | (utf16
[1] << 10)
205 | ((utf16
[2] - 0xDC) << 8) | utf16
[3]);
/* Plain 16-bit unit: high byte first, low byte second. */
209 ucs
= (utf16
[0] << 8) | utf16
[1];
213 utf8
= utf8encode(ucs
, utf8
);
218 /* Recode any UTF-16 string to UTF-8.
 *
 * The leading 16-bit value is inspected as a byte order mark: 0xFEFF hands
 * the rest to utf16BEdecode(), 0xFFFE hands it to utf16LEdecode(), both
 * with count reduced by 1. Any other value is decoded with
 * utf16BEdecode() using the unchanged count.
 *
 * NOTE(review): the rest of the parameter list and some statements of the
 * body are not visible in this view.
 */
219 unsigned char* utf16decode(const unsigned char *utf16
, unsigned char *utf8
,
/* The first byte becomes the high half of the value that is tested below.
   NOTE(review): the low half is presumably added on a line not visible
   here -- confirm. */
224 ucs
= *(utf16
++) << 8;
227 if (ucs
== 0xFEFF) /* Check for BOM */
228 return utf16BEdecode(utf16
, utf8
, count
-1);
/* The same mark seen with its two bytes swapped: little-endian input. */
229 else if (ucs
== 0xFFFE)
230 return utf16LEdecode(utf16
, utf8
, count
-1);
231 else { /* ADDME: Should default be LE or BE? */
/* No mark found: the data is treated as big-endian. */
233 return utf16BEdecode(utf16
, utf8
, count
);
237 /* Return the number of UTF-8 chars in a string.
 *
 * NOTE(review): the loop and the counter are not visible in this view;
 * presumably every byte that passes the test below is counted -- confirm.
 */
238 unsigned long utf8length(const unsigned char *utf8
)
/* True for a byte that does not carry the COMP tag that utf8encode() puts
   on every byte after the first one of a character. */
243 if ((*utf8
++ & MASK
) != COMP
)
249 /* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * The decoded value is stored through `ucs`. Values of 0x10000 and above
 * do not fit the 16-bit result and are stored as 0xffff.
 *
 * NOTE(review): the assignments inside the length branches, the values
 * used for invalid input and the return statement are not visible in this
 * view.
 */
250 const unsigned char* utf8decode(const unsigned char *utf8
, unsigned short *ucs
)
252 unsigned char c
= *utf8
++;
/* First bytes in 0x80..0xC1 fail this test and end up in the
   "Invalid UTF-8 char" branch further down. */
256 if ((c
<= 0x7f) || (c
>= 0xc2)) {
257 /* Start of new character. */
258 if (c
< 0x80) { /* U-00000000 - U-0000007F, 1 byte */
260 } else if (c
< 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
263 } else if (c
< 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
266 } else if (c
< 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
/* Consume up to `tail` following bytes, stopping early at a 0 byte.
   NOTE(review): when the loop stops on a 0 byte the pointer has already
   moved past that byte, unless a line not visible here steps it back --
   confirm. */
274 while (tail
-- && ((c
= *utf8
++) != 0)) {
275 if ((c
& 0xc0) == 0x80) {
276 /* Valid continuation character. */
/* Each following byte contributes its low 6 bits. */
277 code
= (code
<< 6) | (c
& 0x3f);
280 /* Invalid continuation char */
287 /* Invalid UTF-8 char */
290 /* currently we don't support chars above U-FFFF */
291 *ucs
= (code
< 0x10000) ? code
: 0xffff;
/* Set the default codepage, i.e. the value iso_decode() substitutes when it
 * is called with cp == -1. The value is stored as given; no range check is
 * visible here.
 */
295 void set_codepage(int cp
)
297 default_codepage
= cp
;
301 /* seek to a given char in a utf8 string and
302 return its start position in the string */
303 int utf8seek(const unsigned char* utf8
, int offset
)
309 while ((utf8
[pos
] & MASK
) == COMP
)