remove editor hints for vi
[uclibc-ng.git] / libiconv / iconv.c
blobcb4e947758c1c4ee2e18934bb0c4fda0cd563573
1 #include <iconv.h>
2 #include <errno.h>
3 #include <wchar.h>
4 #include <string.h>
5 #include <strings.h>
6 #include <stdlib.h>
7 #include <limits.h>
9 #include <dirent.h>
10 #include <fcntl.h>
11 #include <sys/mman.h>
12 #include <sys/stat.h>
13 #include <unistd.h>
14 #include <stdint.h>
16 /* builtin charmaps */
17 #include "charmaps.h"
19 /* only 0-7 are valid as dest charset */
20 #define UTF_16BE 000
21 #define UTF_16LE 001
22 #define UTF_32BE 002
23 #define UTF_32LE 003
24 #define WCHAR_T 004
25 #define UTF_8 005
26 #define US_ASCII 006
27 #define LATIN_1 007
29 /* additional charsets with algorithmic conversion */
30 #define LATIN_9 010
31 #define TIS_620 011
32 #define JIS_0201 012
34 /* some programs like php need this */
35 int _libiconv_version = _LIBICONV_VERSION;
/*
 * Name table searched by find_charset().
 * Each record is: one id byte (same octal value as the charset macros),
 * the charset name, and a terminating NUL.  Several names may share one
 * id.  A single 0377 byte ends the table.
 */
static const unsigned char charsets[] =
	"\005UTF-8\0"
	"\004WCHAR_T\0"
	"\000UTF-16BE\0"
	"\001UTF-16LE\0"
	"\002UTF-32BE\0"
	"\003UTF-32LE\0"
	"\006ASCII\0"
	"\006US-ASCII\0"
	"\007ISO-8859-1\0"
	"\007LATIN1\0"
	"\010ISO-8859-15\0"
	"\010LATIN9\0"
	"\011ISO-8859-11\0"
	"\011TIS-620\0"
	"\012JIS-0201\0"
	"\377";
/* map-type tags distinguishing sbcs/dbcs/etc charmap layouts */
#define UCS2_8BIT	000
#define UCS3_8BIT	001
#define EUC		002
#define EUC_TW		003
#define SHIFT_JIS	004
#define BIG5		005
#define GBK		006

/*
 * FIXME: these are not implemented yet
 *   EUC:  A1-FE A1-FE
 *   GBK:  81-FE 40-7E,80-FE
 *   Big5: A1-FE 40-7E,A1-FE
 */

/* per map type: 4 leading bytes plus the size of the code table */
static const unsigned short maplen[] = {
	[UCS2_8BIT] = 4 + 2 * 128,
	[UCS3_8BIT] = 4 + 3 * 128,
	[EUC]       = 4 + 2 * 94 * 94,
	[EUC_TW]    = 4 + 2 * 2 * 94 * 94,
	[SHIFT_JIS] = 4 + 2 * 94 * 94,
	[BIG5]      = 4 + 2 * 94 * 157,
	[GBK]       = 4 + 2 * 126 * 190,
};
81 static int find_charmap(const char *name)
83 int i;
84 for (i = 0; i < (sizeof(charmaps) / sizeof(charmaps[0])); i++)
85 if (!strcasecmp(charmaps[i].name, name))
86 return i;
87 return -1;
90 static int find_charset(const char *name)
92 const unsigned char *s;
93 for (s=charsets; *s<0xff && strcasecmp(s+1, name); s+=strlen(s)+1);
94 return *s;
/*
 * Create a conversion descriptor for converting `from` -> `to`.
 *
 * The descriptor is a packed integer:
 *   bit 0      set when the source is a table-driven charmap
 *   bits 1..   destination charset id
 *   bits 8..   source charset id, or index into charmaps[] when bit 0 is set
 *
 * Returns the descriptor, or (iconv_t)-1 with errno set to EINVAL when
 * the destination or the source charset is not supported.
 */
iconv_t iconv_open(const char *to, const char *from)
{
	unsigned f, t;
	int m;

	/* only the ids up to LATIN_9 (010) can be produced as output */
	if ((t = find_charset(to)) > 8)
		goto unsupported;

	if ((f = find_charset(from)) < 255)
		return 0 | (t<<1) | (f<<8);

	if ((m = find_charmap(from)) > -1)
		return 1 | (t<<1) | (m<<8);

unsupported:
	errno = EINVAL;
	return (iconv_t)-1;
}
/*
 * Release a conversion descriptor.
 *
 * Descriptors produced by iconv_open() are plain packed integers, so
 * there is nothing to free.  Always returns 0.
 */
int iconv_close(iconv_t cd)
{
	(void)cd;
	return 0;
}
/*
 * Read a 16-bit unit from s.  Only bit 0 of `endian` is used:
 * 0 selects big-endian byte order, 1 selects little-endian.
 */
static inline wchar_t get_16(const unsigned char *s, int endian)
{
	const int hi = endian & 1;
	const int lo = hi ^ 1;

	return (s[hi] << 8) | s[lo];
}
/*
 * Store the low 16 bits of c at s.  Only bit 0 of `endian` is used:
 * 0 selects big-endian byte order, 1 selects little-endian.
 */
static inline void put_16(unsigned char *s, wchar_t c, int endian)
{
	const int hi = endian & 1;
	const int lo = hi ^ 1;

	s[lo] = c;
	s[hi] = c >> 8;
}
/*
 * Encode code point c as UTF-8 into outb.
 *
 * Writes 1 to 4 bytes and returns the number written.  Values above
 * 0x10FFFF are emitted as a single '?'.  The caller must supply room
 * for up to 4 bytes.
 */
static inline int utf8enc_wchar(char *outb, wchar_t c)
{
	if (c <= 0x7F) {
		outb[0] = c;
		return 1;
	}

	if (c <= 0x7FF) {
		outb[0] = 0xC0 | ((c >> 6) & 0x1F);
		outb[1] = 0x80 | (c & 0x3F);
		return 2;
	}

	if (c <= 0xFFFF) {
		outb[0] = 0xE0 | ((c >> 12) & 0x0F);
		outb[1] = 0x80 | ((c >> 6) & 0x3F);
		outb[2] = 0x80 | (c & 0x3F);
		return 3;
	}

	if (c <= 0x10FFFF) {
		outb[0] = 0xF0 | ((c >> 18) & 0x07);
		outb[1] = 0x80 | ((c >> 12) & 0x3F);
		outb[2] = 0x80 | ((c >> 6) & 0x3F);
		outb[3] = 0x80 | (c & 0x3F);
		return 4;
	}

	/* not representable */
	outb[0] = '?';
	return 1;
}
/*
 * Tell whether the n-byte UTF-8 sequence at s is an overlong encoding,
 * i.e. encodes a value that would fit in a shorter sequence.
 *
 * s must point to at least n readable bytes.  Returns non-zero for an
 * overlong sequence and 0 otherwise, including for any n other than
 * 2, 3 or 4.
 */
static inline int utf8seq_is_overlong(char *s, int n)
{
	/*
	 * Inspect the bytes as unsigned values: with a signed plain char,
	 * bytes >= 0x80 are negative and would never equal 0xE0/0xF0 or
	 * produce the expected bit patterns after shifting.
	 */
	const unsigned char *u = (const unsigned char *)s;

	switch (n)
	{
	case 2:
		/* 1100000x (10xxxxxx) */
		return (((u[0] >> 1) == 0x60) &&
			((u[1] >> 6) == 0x02));

	case 3:
		/* 11100000 100xxxxx (10xxxxxx) */
		return ((u[0] == 0xE0) &&
			((u[1] >> 5) == 0x04) &&
			((u[2] >> 6) == 0x02));

	case 4:
		/* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
		return ((u[0] == 0xF0) &&
			((u[1] >> 4) == 0x08) &&
			((u[2] >> 6) == 0x02) &&
			((u[3] >> 6) == 0x02));
	}

	return 0;
}
/*
 * Tell whether the n-byte UTF-8 sequence at s encodes a UTF-16
 * surrogate (U+D800..U+DFFF), which is the 3-byte form ED A0..BF xx.
 *
 * s must point to at least 2 readable bytes when n is 3.
 */
static inline int utf8seq_is_surrogate(char *s, int n)
{
	/* compare as unsigned bytes; a signed plain char never equals 0xED */
	const unsigned char *u = (const unsigned char *)s;

	if (n != 3)
		return 0;

	return (u[0] == 0xED) && (u[1] >= 0xA0) && (u[1] <= 0xBF);
}
/*
 * Tell whether the n-byte UTF-8 sequence at s is EF BF BE or EF BF BF,
 * the encodings of U+FFFE and U+FFFF.
 *
 * s must point to at least 3 readable bytes when n is 3.
 */
static inline int utf8seq_is_illegal(char *s, int n)
{
	/* compare as unsigned bytes; a signed plain char never equals 0xEF */
	const unsigned char *u = (const unsigned char *)s;

	if (n != 3)
		return 0;

	return (u[0] == 0xEF) && (u[1] == 0xBF) &&
	       (u[2] >= 0xBE) && (u[2] <= 0xBF);
}
/*
 * Decode one UTF-8 sequence from `in` (at most `inb` bytes) into *c.
 *
 * Returns the number of bytes consumed (1..4) on success,
 *         -1 when the sequence is invalid (bad lead byte, 5/6-byte form,
 *            overlong form, surrogate, U+FFFE/U+FFFF, bad continuation),
 *         -2 when the input ends before the sequence is complete.
 * *c is only written on success.
 */
static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb)
{
	wchar_t wc;
	int i;
	int n;

	if (inb == 0)
		return -2;

	/* trivial char */
	if (*in <= 0x7F) {
		*c = *in;
		return 1;
	}

	/*
	 * Find the sequence length from the lead byte.  Anything that is
	 * not a 2/3/4-byte lead (a stray continuation byte, a 5/6-byte
	 * lead, 0xFE or 0xFF) is rejected right away instead of being
	 * reported as truncated input.
	 */
	if ((*in & 0xE0) == 0xC0) n = 2;
	else if ((*in & 0xF0) == 0xE0) n = 3;
	else if ((*in & 0xF8) == 0xF0) n = 4;
	else return -1;

	/* starved? */
	if ((size_t)n > inb)
		return -2;

	/* reject invalid sequences */
	if (utf8seq_is_overlong((char *)in, n) ||
	    utf8seq_is_surrogate((char *)in, n) ||
	    utf8seq_is_illegal((char *)in, n))
		return -1;

	/* payload bits of the lead byte */
	wc = *in++ & (0x7F >> n);

	for (i = 1; i < n; i++) {
		/* illegal continuation byte */
		if (*in < 0x80 || *in > 0xBF)
			return -1;

		wc = (wc << 6) | (*in++ & 0x3F);
	}

	*c = wc;
	return n;
}
/*
 * Map the code points that ISO-8859-15 adds over ISO-8859-1 to their
 * ISO-8859-15 byte.  Any other code point yields '?'.
 */
static inline char latin9_translit(wchar_t c)
{
	static const struct {
		unsigned short ucs;
		unsigned char byte;
	} pairs[] = {
		{ 0x20AC, 0xA4 },	/* Euro */
		{ 0x0160, 0xA6 },	/* S caron */
		{ 0x0161, 0xA8 },	/* s caron */
		{ 0x017D, 0xB4 },	/* Z caron */
		{ 0x017E, 0xB8 },	/* z caron */
		{ 0x0152, 0xBC },	/* OE */
		{ 0x0153, 0xBD },	/* oe */
		{ 0x0178, 0xBE },	/* Y diaeresis */
	};
	size_t i;

	for (i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++) {
		if (c == (wchar_t)pairs[i].ucs)
			return pairs[i].byte;
	}

	return '?';
}
263 size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb)
265 size_t x=0;
266 unsigned char to = (cd>>1)&127;
267 unsigned char from = 255;
268 const unsigned char *map = 0;
269 char tmp[MB_LEN_MAX];
270 wchar_t c, d;
271 size_t k, l;
272 int err;
274 if (!in || !*in || !*inb) return 0;
276 if (cd & 1)
277 map = charmaps[cd>>8].map;
278 else
279 from = cd>>8;
281 for (; *inb; *in+=l, *inb-=l) {
282 c = *(unsigned char *)*in;
283 l = 1;
284 if (from >= UTF_8 && c < 0x80) goto charok;
285 switch (from) {
286 case WCHAR_T:
287 l = sizeof(wchar_t);
288 if (*inb < l) goto starved;
289 c = *(wchar_t *)*in;
290 break;
291 case UTF_8:
292 l = utf8dec_wchar(&c, *in, *inb);
293 if (!l) l++;
294 else if (l == (size_t)-1) goto ilseq;
295 else if (l == (size_t)-2) goto starved;
296 break;
297 case US_ASCII:
298 goto ilseq;
299 case LATIN_9:
300 if ((unsigned)c - 0xa4 <= 0xbe - 0xa4) {
301 static const unsigned char map[] = {
302 0, 0x60, 0, 0x61, 0, 0, 0, 0, 0, 0, 0,
303 0, 0, 0, 0, 0x7d, 0, 0, 0, 0x7e, 0, 0, 0,
304 0x52, 0x53, 0x78
306 if (c == 0xa4) c = 0x20ac;
307 else if (map[c-0xa5]) c = 0x100 | map[c-0xa5];
309 case LATIN_1:
310 goto charok;
311 case TIS_620:
312 if (c >= 0xa1) c += 0x0e01-0xa1;
313 goto charok;
314 case JIS_0201:
315 if (c >= 0xa1) {
316 if (c <= 0xdf) c += 0xff61-0xa1;
317 else goto ilseq;
319 goto charok;
320 case UTF_16BE:
321 case UTF_16LE:
322 l = 2;
323 if (*inb < 2) goto starved;
324 c = get_16(*in, from);
325 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
326 if ((unsigned)(c-0xd800) < 0x400) {
327 l = 4;
328 if (*inb < 4) goto starved;
329 d = get_16(*in + 2, from);
330 if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq;
331 c = ((c-0xd800)<<10) | (d-0xdc00);
333 break;
334 case UTF_32BE:
335 case UTF_32LE:
336 l = 4;
337 if (*inb < 4) goto starved;
338 // FIXME
339 // c = get_32(*in, from);
340 break;
341 default:
342 /* only support ascii supersets */
343 if (c < 0x80) break;
344 switch (map[0]) {
345 case UCS2_8BIT:
346 c -= 0x80;
347 break;
348 case EUC:
349 if ((unsigned)c - 0xa1 >= 94) goto ilseq;
350 if ((unsigned)in[0][1] - 0xa1 >= 94) goto ilseq;
351 c = (c-0xa1)*94 + (in[0][1]-0xa1);
352 l = 2;
353 break;
354 case SHIFT_JIS:
355 if ((unsigned)c - 0xa1 <= 0xdf-0xa1) {
356 c += 0xff61-0xa1;
357 goto charok;
359 // FIXME...
360 l = 2;
361 break;
362 default:
363 goto badf;
365 c = get_16(map + 4 + 2*c, 0);
366 if (c == 0xffff) goto ilseq;
367 goto charok;
370 if ((unsigned)c - 0xd800 < 0x800 || (unsigned)c >= 0x110000)
371 goto ilseq;
372 charok:
373 switch (to) {
374 case WCHAR_T:
375 if (*outb < sizeof(wchar_t)) goto toobig;
376 *(wchar_t *)*out = c;
377 *out += sizeof(wchar_t);
378 *outb -= sizeof(wchar_t);
379 break;
380 case UTF_8:
381 if (*outb < 4) {
382 k = utf8enc_wchar(tmp, c);
383 if (*outb < k) goto toobig;
384 memcpy(*out, tmp, k);
385 } else k = utf8enc_wchar(*out, c);
386 *out += k;
387 *outb -= k;
388 break;
389 case US_ASCII:
390 if (c > 0x7f) c = 0xfffd;
391 /* fall thru and count replacement in latin1 case */
392 case LATIN_9:
393 if (c >= 0x100 && c != 0xfffd)
394 c = latin9_translit(c);
395 /* fall through */
396 case LATIN_1:
397 if (!*outb) goto toobig;
398 if (c < 0x100) **out = c;
399 else x++, **out = '*'; //FIXME: translit?
400 ++*out;
401 --*outb;
402 break;
403 case UTF_16BE:
404 case UTF_16LE:
405 if (c < 0x10000) {
406 if (*outb < 2) goto toobig;
407 put_16(*out, c, to);
408 *out += 2;
409 *outb -= 2;
410 break;
412 if (*outb < 4) goto toobig;
413 put_16(*out, (c>>10)|0xd800, to);
414 put_16(*out + 2, (c&0x3ff)|0xdc00, to);
415 *out += 4;
416 *outb -= 4;
417 break;
418 default:
419 goto badf;
422 return x;
423 ilseq:
424 err = EILSEQ;
425 x = -1;
426 goto end;
427 badf:
428 err = EBADF;
429 x = -1;
430 goto end;
431 toobig:
432 err = E2BIG;
433 x = -1;
434 goto end;
435 starved:
436 err = EINVAL;
437 end:
438 errno = err;
439 return x;