fstatat64: define it as a wrapper of statx if the kernel does not support fstatat64...
[uclibc-ng.git] / libiconv / iconv.c
blobec01f381dbf44b7df8a57ba5cfbc64fa85fae287
1 /*
2 * Copyright © 2018 Waldemar Brodkorb <wbx@uclibc-ng.org>
3 * Simplified port of iconv.c from musl C library including
4 * parts of libiconv-tiny.
5 */
7 /* Copyright © 2005-2018 Rich Felker, et al.
9 Permission is hereby granted, free of charge, to any person obtaining
10 a copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be
18 included in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 #include <iconv.h>
31 #include <errno.h>
32 #include <wchar.h>
33 #include <string.h>
34 #include <strings.h>
35 #include <stdlib.h>
36 #include <limits.h>
37 #include <dirent.h>
38 #include <fcntl.h>
39 #include <sys/mman.h>
40 #include <sys/stat.h>
41 #include <unistd.h>
42 #include <stdint.h>
/* Type codes for the built-in (non-table) charsets.  Each value is the
 * octal byte that follows the matching alias list in charmaps[], and is
 * what iconv() switches on.  The low bits of the UTF-16/UCS-2/UTF-32
 * codes are passed to get_16()/get_32()/put_16()/put_32() as the byte
 * order selector, so do not renumber them. */
#define UTF_32BE    0300
#define UTF_16LE    0301
#define UTF_16BE    0302
#define UTF_32LE    0303
#define UCS2BE      0304
#define UCS2LE      0305
#define WCHAR_T     0306
#define US_ASCII    0307
#define UTF_8       0310
#define UTF_16      0312
#define UTF_32      0313
#define UCS2        0314
#define EUC_JP      0320
#define SHIFT_JIS   0321
#define ISO2022_JP  0322
#define GB18030     0330
#define GBK         0331
#define GB2312      0332
#define BIG5        0340
#define EUC_KR      0350
65 /* Definitions of charmaps. Each charmap consists of:
66 * 1. Empty-string-terminated list of null-terminated aliases.
67 * 2. Special type code or number of elided quads of entries.
68 * 3. Character table (size determined by field 2), consisting
69 * of 5 bytes for every 4 characters, interpreted as 10-bit
70 * indices into the legacy_chars table. */
72 static const unsigned char charmaps[] =
73 "utf8\0char\0\0\310"
74 "wchart\0\0\306"
75 "ucs2be\0\0\304"
76 "ucs2le\0\0\305"
77 "utf16be\0\0\302"
78 "utf16le\0\0\301"
79 "ucs4be\0utf32be\0\0\300"
80 "ucs4le\0utf32le\0\0\303"
81 "ascii\0usascii\0iso646\0iso646us\0\0\307"
82 "utf16\0\0\312"
83 "ucs4\0utf32\0\0\313"
84 "ucs2\0\0\314"
85 "eucjp\0\0\320"
86 "shiftjis\0sjis\0\0\321"
87 "iso2022jp\0\0\322"
88 "gb18030\0\0\330"
89 "gbk\0\0\331"
90 "gb2312\0\0\332"
91 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
92 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
93 #include "codepages.h"
96 /* Table of characters that appear in legacy 8-bit codepages,
97 * limited to 1024 slots (10 bit indices). The first 256 entries
98 * are elided since those characters are obviously all included. */
99 static const unsigned short legacy_chars[] = {
100 #include "legacychars.h"
103 static const unsigned short jis0208[84][94] = {
104 #include "jis0208.h"
107 static const unsigned short rev_jis[] = {
108 #include "revjis.h"
/* Loosely compare a user-supplied charset name with a canonical alias.
 *
 * a: the user's name; ASCII letters are folded to lower case and any
 *    byte that is neither an ASCII letter nor an ASCII digit is skipped.
 * b: an alias from charmaps[] (already lower case, no punctuation).
 *
 * Returns 0 when the names match, nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Skip punctuation in a.  The unsigned subtractions wrap for
		 * bytes below the range, so ">= 26" / ">= 10" mean "not a
		 * letter" / "not a digit".  (The previous "> 26" / "> 10"
		 * let '[', '{' and ':' slip through as if alphanumeric.) */
		while (*a && (*a|32U)-'a' >= 26 && *a-'0' >= 10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}
120 static size_t find_charmap(const void *name)
122 const unsigned char *s;
123 if (!*(char *)name) name=charmaps; /* "utf8" */
124 for (s=charmaps; *s; ) {
125 if (!fuzzycmp(name, s)) {
126 for (; *s; s+=strlen((void *)s)+1);
127 return s+1-charmaps;
129 s += strlen((void *)s)+1;
130 if (!*s) {
131 if (s[1] > 0200) s+=2;
132 else s+=2+(64U-s[1])*5;
135 return -1;
/* Heap-allocated descriptor used for source encodings that need state
 * between characters. */
struct stateful_cd {
	iconv_t base_cd;  /* packed descriptor built by combine_to_from() */
	unsigned state;   /* 0 initially; meaning depends on the source type */
};
/* Pack two charmaps[] offsets into a pointer-sized descriptor:
 * bit 0 is always set, bits 1..15 hold t (target), bits 16+ hold f
 * (source).  iconv() uses the set low bit to tell a packed descriptor
 * from a pointer to struct stateful_cd. */
static iconv_t combine_to_from(size_t t, size_t f)
{
	return (void *)(f<<16 | t<<1 | 1);
}
/* Return the source charmaps[] offset (bits 16 and up) of a packed
 * descriptor. */
static size_t extract_from(iconv_t cd)
{
	return (size_t)cd >> 16;
}
/* Return the target charmaps[] offset (bits 1..15) of a packed
 * descriptor. */
static size_t extract_to(iconv_t cd)
{
	return (size_t)cd >> 1 & 0x7fff;
}
158 iconv_t iconv_open(const char *to, const char *from)
160 size_t f, t;
161 struct stateful_cd *scd;
163 if ((t = find_charmap(to))==-1
164 || (f = find_charmap(from))==-1
165 || (charmaps[t] >= 0330)) {
166 errno = EINVAL;
167 return (iconv_t)-1;
169 iconv_t cd = combine_to_from(t, f);
171 switch (charmaps[f]) {
172 case UTF_16:
173 case UTF_32:
174 case UCS2:
175 case ISO2022_JP:
176 scd = malloc(sizeof *scd);
177 if (!scd) return (iconv_t)-1;
178 scd->base_cd = cd;
179 scd->state = 0;
180 cd = (iconv_t)scd;
183 return cd;
/* Read a 16-bit value from s[0..1].  Only bit 0 of e is used:
 * 0 reads big-endian, 1 reads little-endian. */
static unsigned get_16(const unsigned char *s, int e)
{
	e &= 1;
	return s[e]<<8 | s[1-e];
}
/* Store the low 16 bits of c into s[0..1].  Only bit 0 of e is used:
 * 0 writes big-endian, 1 writes little-endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	e &= 1;
	s[e] = c>>8;
	s[1-e] = c;
}
/* Read a 32-bit value from s[0..3].  Only the low two bits of e are
 * used: 0 reads big-endian, 3 reads little-endian. */
static unsigned get_32(const unsigned char *s, int e)
{
	e &= 3;
	/* "+0U" promotes the top byte to unsigned before shifting it into
	 * bit 31, avoiding a signed-int overflow. */
	return (s[e]+0U)<<24 | s[e^1]<<16 | s[e^2]<<8 | s[e^3];
}
/* Store the 32-bit value c into s[0..3].  Only the low two bits of e
 * are used: 0 writes big-endian, 3 writes little-endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	e &= 3;
	s[e^0] = c>>24;
	s[e^1] = c>>16;
	s[e^2] = c>>8;
	s[e^3] = c;
}
/* Encode one code point as UTF-8.
 *
 * outb: destination; must have room for up to 4 bytes.
 * c:    code point to encode.
 *
 * Returns the number of bytes written (1..4).  Values that cannot be
 * encoded (above 0x10FFFF, or negative where wchar_t is signed) are
 * written as a single '?'. */
static inline int utf8enc_wchar(char *outb, wchar_t c)
{
	/* Compare as unsigned: a negative wchar_t would otherwise satisfy
	 * "c <= 0x7F" and be emitted as a raw byte. */
	unsigned long u = (unsigned long)c;

	if (u <= 0x7F) {
		*outb = (char)u;
		return 1;
	}
	if (u <= 0x7FF) {
		*outb++ = (char)(((u >> 6) & 0x1F) | 0xC0);
		*outb++ = (char)(( u       & 0x3F) | 0x80);
		return 2;
	}
	if (u <= 0xFFFF) {
		*outb++ = (char)(((u >> 12) & 0x0F) | 0xE0);
		*outb++ = (char)(((u >>  6) & 0x3F) | 0x80);
		*outb++ = (char)(( u        & 0x3F) | 0x80);
		return 3;
	}
	if (u <= 0x10FFFF) {
		*outb++ = (char)(((u >> 18) & 0x07) | 0xF0);
		*outb++ = (char)(((u >> 12) & 0x3F) | 0x80);
		*outb++ = (char)(((u >>  6) & 0x3F) | 0x80);
		*outb++ = (char)(( u        & 0x3F) | 0x80);
		return 4;
	}
	*outb++ = '?';
	return 1;
}
/* Report whether the n-byte UTF-8 sequence at s is an overlong encoding
 * (a value that fits in a shorter sequence).
 *
 * s: start of the sequence; at least n bytes must be readable.
 * n: sequence length taken from the lead byte; only 2, 3 and 4 are
 *    examined.
 *
 * Returns nonzero for an overlong sequence, 0 otherwise. */
static inline int utf8seq_is_overlong(char *s, int n)
{
	/* Inspect the bytes as unsigned: where plain char is signed, a
	 * comparison such as "*s == 0xE0" can never be true. */
	const unsigned char *u = (const unsigned char *)s;

	switch (n) {
	case 2:
		/* 1100000x (10xxxxxx) */
		return (u[0] >> 1) == 0x60 &&
		       (u[1] >> 6) == 0x02;

	case 3:
		/* 11100000 100xxxxx (10xxxxxx) */
		return u[0] == 0xE0 &&
		       (u[1] >> 5) == 0x04 &&
		       (u[2] >> 6) == 0x02;

	case 4:
		/* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
		return u[0] == 0xF0 &&
		       (u[1] >> 4) == 0x08 &&
		       (u[2] >> 6) == 0x02 &&
		       (u[3] >> 6) == 0x02;
	}

	return 0;
}
/* Report whether the n-byte UTF-8 sequence at s encodes a UTF-16
 * surrogate (U+D800..U+DFFF): a 3-byte sequence starting 0xED followed
 * by a byte in 0xA0..0xBF.  At least 2 bytes must be readable when
 * n == 3.  Returns nonzero if so, 0 otherwise. */
static inline int utf8seq_is_surrogate(char *s, int n)
{
	/* Inspect the bytes as unsigned: where plain char is signed, a
	 * comparison against 0xED can never be true. */
	const unsigned char *u = (const unsigned char *)s;

	return n == 3 && u[0] == 0xED && u[1] >= 0xA0 && u[1] <= 0xBF;
}
/* Report whether the n-byte UTF-8 sequence at s is EF BF BE or
 * EF BF BF, i.e. the encoding of U+FFFE or U+FFFF.  At least 3 bytes
 * must be readable when n == 3.  Returns nonzero if so, 0 otherwise. */
static inline int utf8seq_is_illegal(char *s, int n)
{
	/* Inspect the bytes as unsigned: where plain char is signed, a
	 * comparison against 0xEF can never be true. */
	const unsigned char *u = (const unsigned char *)s;

	return n == 3 && u[0] == 0xEF && u[1] == 0xBF &&
	       u[2] >= 0xBE && u[2] <= 0xBF;
}
/* Decode one UTF-8 sequence.
 *
 * c:   receives the decoded code point; written only on success.
 * in:  start of the sequence.
 * inb: number of readable bytes at in.
 *
 * Returns the number of bytes consumed (1..4) on success, -1 when the
 * bytes can never form a valid sequence, and -2 when the sequence is
 * so far valid but more input is needed. */
static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb)
{
	int i;
	int n;
	unsigned long wc;

	if (inb == 0)
		return -2;

	/* trivial char */
	if (*in <= 0x7F) {
		*c = *in;
		return 1;
	}

	/* Find the sequence length.  Anything else (a stray continuation
	 * byte, a 5/6-byte lead, 0xFE/0xFF) is rejected right here; it
	 * must not reach the length check below, where a negative length
	 * converted to size_t would be misreported as "need more input". */
	if ((*in & 0xE0) == 0xC0) n = 2;
	else if ((*in & 0xF0) == 0xE0) n = 3;
	else if ((*in & 0xF8) == 0xF0) n = 4;
	else return -1;

	/* starved? */
	if ((size_t)n > inb)
		return -2;

	/* reject invalid sequences */
	if (utf8seq_is_overlong((char *)in, n) ||
	    utf8seq_is_surrogate((char *)in, n) ||
	    utf8seq_is_illegal((char *)in, n))
		return -1;

	/* decode: payload bits of the lead byte, then 6 bits per
	 * continuation byte */
	wc = *in++ & (0x7F >> n);
	for (i = 1; i < n; i++) {
		/* illegal continuation byte */
		if (*in < 0x80 || *in > 0xBF)
			return -1;
		wc = (wc << 6) | (*in++ & 0x3F);
	}

	/* 4-byte leads 0xF4..0xF7 can encode values beyond Unicode */
	if (wc > 0x10FFFF)
		return -1;

	*c = (wchar_t)wc;
	return n;
}
330 static unsigned legacy_map(const unsigned char *map, unsigned c)
332 if (c < 4*map[-1]) return c;
333 unsigned x = c - 4*map[-1];
334 x = map[x*5/4]>>2*x%8 | map[x*5/4+1]<<8-2*x%8 & 1023;
335 return x < 256 ? x : legacy_chars[x-256];
338 static unsigned uni_to_jis(unsigned c)
340 unsigned nel = sizeof rev_jis / sizeof *rev_jis;
341 unsigned d, j, i, b = 0;
342 for (;;) {
343 i = nel/2;
344 j = rev_jis[b+i];
345 d = jis0208[j/256][j%256];
346 if (d==c) return j + 0x2121;
347 else if (nel == 1) return 0;
348 else if (c < d)
349 nel /= 2;
350 else {
351 b += i;
352 nel -= nel/2;
357 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
359 size_t x=0;
360 struct stateful_cd *scd=0;
361 if (!((size_t)cd & 1)) {
362 scd = (void *)cd;
363 cd = scd->base_cd;
365 unsigned to = extract_to(cd);
366 unsigned from = extract_from(cd);
367 const unsigned char *map = charmaps+from+1;
368 const unsigned char *tomap = charmaps+to+1;
369 char tmp[MB_LEN_MAX];
370 unsigned c, d;
371 size_t k, l;
372 int err;
373 unsigned char type = map[-1];
374 unsigned char totype = tomap[-1];
376 if (!in || !*in || !*inb) return 0;
378 for (; *inb; *in+=l, *inb-=l) {
379 c = *(unsigned char *)*in;
380 l = 1;
382 switch (type) {
383 case UTF_8:
384 if (c < 128) break;
385 l = utf8dec_wchar(&c, *in, *inb);
386 if (!l) l++;
387 else if (l == (size_t)-1) goto ilseq;
388 else if (l == (size_t)-2) goto starved;
389 break;
390 case US_ASCII:
391 if (c >= 128) goto ilseq;
392 break;
393 case WCHAR_T:
394 l = sizeof(wchar_t);
395 if (*inb < l) goto starved;
396 c = *(wchar_t *)*in;
397 if (0) {
398 case UTF_32BE:
399 case UTF_32LE:
400 l = 4;
401 if (*inb < 4) goto starved;
402 c = get_32((void *)*in, type);
404 if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
405 break;
406 case UCS2BE:
407 case UCS2LE:
408 case UTF_16BE:
409 case UTF_16LE:
410 l = 2;
411 if (*inb < 2) goto starved;
412 c = get_16((void *)*in, type);
413 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
414 if ((unsigned)(c-0xd800) < 0x400) {
415 if (type-UCS2BE < 2U) goto ilseq;
416 l = 4;
417 if (*inb < 4) goto starved;
418 d = get_16((void *)(*in + 2), type);
419 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
420 c = ((c-0xd7c0)<<10) + (d-0xdc00);
422 break;
423 case UCS2:
424 case UTF_16:
425 l = 0;
426 if (!scd->state) {
427 if (*inb < 2) goto starved;
428 c = get_16((void *)*in, 0);
429 scd->state = type==UCS2
430 ? c==0xfffe ? UCS2LE : UCS2BE
431 : c==0xfffe ? UTF_16LE : UTF_16BE;
432 if (c == 0xfffe || c == 0xfeff)
433 l = 2;
435 type = scd->state;
436 continue;
437 case UTF_32:
438 l = 0;
439 if (!scd->state) {
440 if (*inb < 4) goto starved;
441 c = get_32((void *)*in, 0);
442 scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
443 if (c == 0xfffe0000 || c == 0xfeff)
444 l = 4;
446 type = scd->state;
447 continue;
448 case SHIFT_JIS:
449 if (c < 128) break;
450 if (c-0xa1 <= 0xdf-0xa1) {
451 c += 0xff61-0xa1;
452 break;
454 l = 2;
455 if (*inb < 2) goto starved;
456 d = *((unsigned char *)*in + 1);
457 if (c-129 <= 159-129) c -= 129;
458 else if (c-224 <= 239-224) c -= 193;
459 else goto ilseq;
460 c *= 2;
461 if (d-64 <= 158-64) {
462 if (d==127) goto ilseq;
463 if (d>127) d--;
464 d -= 64;
465 } else if (d-159 <= 252-159) {
466 c++;
467 d -= 159;
469 c = jis0208[c][d];
470 if (!c) goto ilseq;
471 break;
472 case EUC_JP:
473 if (c < 128) break;
474 l = 2;
475 if (*inb < 2) goto starved;
476 d = *((unsigned char *)*in + 1);
477 if (c==0x8e) {
478 c = d;
479 if (c-0xa1 > 0xdf-0xa1) goto ilseq;
480 c += 0xff61 - 0xa1;
481 break;
483 c -= 0xa1;
484 d -= 0xa1;
485 if (c >= 84 || d >= 94) goto ilseq;
486 c = jis0208[c][d];
487 if (!c) goto ilseq;
488 break;
489 case ISO2022_JP:
490 if (c >= 128) goto ilseq;
491 if (c == '\033') {
492 l = 3;
493 if (*inb < 3) goto starved;
494 c = *((unsigned char *)*in + 1);
495 d = *((unsigned char *)*in + 2);
496 if (c != '(' && c != '$') goto ilseq;
497 switch (128*(c=='$') + d) {
498 case 'B': scd->state=0; continue;
499 case 'J': scd->state=1; continue;
500 case 'I': scd->state=4; continue;
501 case 128+'@': scd->state=2; continue;
502 case 128+'B': scd->state=3; continue;
504 goto ilseq;
506 switch (scd->state) {
507 case 1:
508 if (c=='\\') c = 0xa5;
509 if (c=='~') c = 0x203e;
510 break;
511 case 2:
512 case 3:
513 l = 2;
514 if (*inb < 2) goto starved;
515 d = *((unsigned char *)*in + 1);
516 c -= 0x21;
517 d -= 0x21;
518 if (c >= 84 || d >= 94) goto ilseq;
519 c = jis0208[c][d];
520 if (!c) goto ilseq;
521 break;
522 case 4:
523 if (c-0x60 < 0x1f) goto ilseq;
524 if (c-0x21 < 0x5e) c += 0xff61-0x21;
525 break;
527 break;
528 default:
529 if (!c) break;
530 c = legacy_map(map, c);
531 if (!c) goto ilseq;
534 switch (totype) {
535 case WCHAR_T:
536 if (*outb < sizeof(wchar_t)) goto toobig;
537 *(wchar_t *)*out = c;
538 *out += sizeof(wchar_t);
539 *outb -= sizeof(wchar_t);
540 break;
541 case UTF_8:
542 if (*outb < 4) {
543 k = utf8enc_wchar(tmp, c);
544 if (*outb < k) goto toobig;
545 memcpy(*out, tmp, k);
546 } else k = utf8enc_wchar(*out, c);
547 *out += k;
548 *outb -= k;
549 break;
550 case US_ASCII:
551 if (c > 0x7f) subst: x++, c='*';
552 default:
553 if (*outb < 1) goto toobig;
554 if (c<256 && c==legacy_map(tomap, c)) {
555 revout:
556 *(*out)++ = c;
557 *outb -= 1;
558 break;
560 d = c;
561 for (c=4*totype; c<256; c++) {
562 if (d == legacy_map(tomap, c)) {
563 goto revout;
566 goto subst;
567 case SHIFT_JIS:
568 if (c < 128) goto revout;
569 if (c == 0xa5) {
570 x++;
571 c = '\\';
572 goto revout;
574 if (c == 0x203e) {
575 x++;
576 c = '~';
577 goto revout;
579 if (c-0xff61 <= 0xdf-0xa1) {
580 c += 0xa1 - 0xff61;
581 goto revout;
583 c = uni_to_jis(c);
584 if (!c) goto subst;
585 if (*outb < 2) goto toobig;
586 d = c%256;
587 c = c/256;
588 *(*out)++ = (c+1)/2 + (c<95 ? 112 : 176);
589 *(*out)++ = c%2 ? d + 31 + d/96 : d + 126;
590 *outb -= 2;
591 break;
592 case EUC_JP:
593 if (c < 128) goto revout;
594 if (c-0xff61 <= 0xdf-0xa1) {
595 c += 0x0e00 + 0x21 - 0xff61;
596 } else {
597 c = uni_to_jis(c);
599 if (!c) goto subst;
600 if (*outb < 2) goto toobig;
601 *(*out)++ = c/256 + 0x80;
602 *(*out)++ = c%256 + 0x80;
603 *outb -= 2;
604 break;
605 case ISO2022_JP:
606 if (c < 128) goto revout;
607 if (c-0xff61 <= 0xdf-0xa1 || c==0xa5 || c==0x203e) {
608 if (*outb < 7) goto toobig;
609 *(*out)++ = '\033';
610 *(*out)++ = '(';
611 if (c==0xa5) {
612 *(*out)++ = 'J';
613 *(*out)++ = '\\';
614 } else if (c==0x203e) {
615 *(*out)++ = 'J';
616 *(*out)++ = '~';
617 } else {
618 *(*out)++ = 'I';
619 *(*out)++ = c-0xff61+0x21;
621 *(*out)++ = '\033';
622 *(*out)++ = '(';
623 *(*out)++ = 'B';
624 *outb -= 7;
625 break;
627 c = uni_to_jis(c);
628 if (!c) goto subst;
629 if (*outb < 8) goto toobig;
630 *(*out)++ = '\033';
631 *(*out)++ = '$';
632 *(*out)++ = 'B';
633 *(*out)++ = c/256;
634 *(*out)++ = c%256;
635 *(*out)++ = '\033';
636 *(*out)++ = '(';
637 *(*out)++ = 'B';
638 *outb -= 8;
639 break;
640 case UCS2:
641 totype = UCS2BE;
642 case UCS2BE:
643 case UCS2LE:
644 case UTF_16:
645 case UTF_16BE:
646 case UTF_16LE:
647 if (c < 0x10000 || totype-UCS2BE < 2U) {
648 if (c >= 0x10000) c = 0xFFFD;
649 if (*outb < 2) goto toobig;
650 put_16((void *)*out, c, totype);
651 *out += 2;
652 *outb -= 2;
653 break;
655 if (*outb < 4) goto toobig;
656 c -= 0x10000;
657 put_16((void *)*out, (c>>10)|0xd800, totype);
658 put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
659 *out += 4;
660 *outb -= 4;
661 break;
662 case UTF_32BE:
663 case UTF_32LE:
664 if (*outb < 4) goto toobig;
665 put_32((void *)*out, c, totype);
666 *out += 4;
667 *outb -= 4;
668 break;
671 return x;
672 ilseq:
673 err = EILSEQ;
674 x = -1;
675 goto end;
676 toobig:
677 err = E2BIG;
678 x = -1;
679 goto end;
680 starved:
681 err = EINVAL;
682 x = -1;
683 end:
684 errno = err;
685 return x;
/* Release a conversion descriptor obtained from iconv_open().
 *
 * Packed descriptors (low bit set, see combine_to_from()) own no
 * memory.  A descriptor with the low bit clear is the struct
 * stateful_cd that iconv_open() malloc'ed and must be freed here,
 * otherwise every stateful conversion leaks it.
 *
 * Always returns 0. */
int iconv_close(iconv_t cd)
{
	if (!((size_t)cd & 1)) free((void *)cd);
	return 0;
}