fix TLS memory leak with dlopen
[uclibc-ng.git] / libiconv / iconv.c
blob095932fd6049f5cb9a50568ccbe2ace35d2f9bf3
1 /*
2 * Copyright © 2018 Waldemar Brodkorb <wbx@uclibc-ng.org>
3 * Simplified port of iconv.c from musl C library including
4 * parts of libiconv-tiny.
5 */
7 /* Copyright © 2005-2018 Rich Felker, et al.
9 Permission is hereby granted, free of charge, to any person obtaining
10 a copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be
18 included in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 #include <iconv.h>
31 #include <errno.h>
32 #include <wchar.h>
33 #include <string.h>
34 #include <strings.h>
35 #include <stdlib.h>
36 #include <limits.h>
37 #include <dirent.h>
38 #include <fcntl.h>
39 #include <sys/mman.h>
40 #include <sys/stat.h>
41 #include <unistd.h>
42 #include <stdint.h>
/* Special charmap type codes.
 *
 * Each value is the type byte that follows the alias list of the matching
 * entry in charmaps[] (e.g. "utf8\0char\0\0\310" carries UTF_8 == 0310).
 * iconv() switches on these values; iconv_open() refuses any target whose
 * type byte is >= 0330. */
#define UTF_32BE    0300
#define UTF_16LE    0301
#define UTF_16BE    0302
#define UTF_32LE    0303
#define UCS2BE      0304
#define UCS2LE      0305
#define WCHAR_T     0306
#define US_ASCII    0307
#define UTF_8       0310
#define UTF_16      0312
#define UTF_32      0313
#define UCS2        0314
#define EUC_JP      0320
#define SHIFT_JIS   0321
#define ISO2022_JP  0322
#define GB18030     0330
#define GBK         0331
#define GB2312      0332
#define BIG5        0340
#define EUC_KR      0350
65 /* Definitions of charmaps. Each charmap consists of:
66 * 1. Empty-string-terminated list of null-terminated aliases.
67 * 2. Special type code or number of elided quads of entries.
68 * 3. Character table (size determined by field 2), consisting
69 * of 5 bytes for every 4 characters, interpreted as 10-bit
70 * indices into the legacy_chars table. */
72 static const unsigned char charmaps[] =
73 "utf8\0char\0\0\310"
74 "wchart\0\0\306"
75 "ucs2be\0\0\304"
76 "ucs2le\0\0\305"
77 "utf16be\0\0\302"
78 "utf16le\0\0\301"
79 "ucs4be\0utf32be\0\0\300"
80 "ucs4le\0utf32le\0\0\303"
81 "ascii\0usascii\0iso646\0iso646us\0\0\307"
82 "utf16\0\0\312"
83 "ucs4\0utf32\0\0\313"
84 "ucs2\0\0\314"
85 "eucjp\0\0\320"
86 "shiftjis\0sjis\0\0\321"
87 "iso2022jp\0\0\322"
88 "gb18030\0\0\330"
89 "gbk\0\0\331"
90 "gb2312\0\0\332"
91 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
92 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
93 #include "codepages.h"
96 /* Table of characters that appear in legacy 8-bit codepages,
97 * limited to 1024 slots (10 bit indices). The first 256 entries
98 * are elided since those characters are obviously all included. */
99 static const unsigned short legacy_chars[] = {
100 #include "legacychars.h"
103 static const unsigned short jis0208[84][94] = {
104 #include "jis0208.h"
107 static const unsigned short rev_jis[] = {
108 #include "revjis.h"
/* Compare a user-supplied charset name against a normalized alias.
 *
 * a: caller's name; ASCII letters are case-folded and every byte that is
 *    not an ASCII letter or digit is skipped.
 * b: alias from charmaps[] (lower-case letters and digits only).
 *
 * Returns 0 on match, non-zero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Skip punctuation in the caller's name. The range tests must be
		 * ">= 26" / ">= 10": with ">" the bytes just past 'z' and '9'
		 * ('{', '[' and ':') were mistaken for alphanumerics. */
		while (*a && (*a|32U)-'a'>=26 && *a-'0'>=10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}
120 static size_t find_charmap(const void *name)
122 const unsigned char *s;
123 if (!*(char *)name) name=charmaps; /* "utf8" */
124 for (s=charmaps; *s; ) {
125 if (!fuzzycmp(name, s)) {
126 for (; *s; s+=strlen((void *)s)+1);
127 return s+1-charmaps;
129 s += strlen((void *)s)+1;
130 if (!*s) {
131 if (s[1] > 0200) s+=2;
132 else s+=2+(64U-s[1])*5;
135 return -1;
/* Heap-allocated descriptor for source encodings that need state between
 * iconv() calls. iconv_open() allocates one for UTF_16, UTF_32, UCS2 and
 * ISO2022_JP sources and returns its address as the iconv_t. */
struct stateful_cd {
	iconv_t base_cd;   /* packed to/from descriptor (see combine_to_from) */
	unsigned state;    /* detected byte-order type, or ISO-2022-JP mode */
};
/* Pack the target (t) and source (f) charmap offsets into an iconv_t.
 *
 * Layout: bit 0 is always 1, bits 1..15 hold t, bits 16 and up hold f.
 * iconv() uses the set low bit to tell this packed form apart from a
 * pointer to a struct stateful_cd.
 * NOTE(review): t is not masked here, so an offset of 0x8000 or more would
 * spill into f's bits -- confirm charmaps[] stays below that size. */
static iconv_t combine_to_from(size_t t, size_t f)
{
	return (iconv_t)(f<<16 | t<<1 | 1);
}
/* Recover the source charmap offset (bits 16 and up) from a packed
 * descriptor built by combine_to_from(). */
static size_t extract_from(iconv_t cd)
{
	return (size_t)cd >> 16;
}
/* Recover the target charmap offset (bits 1..15) from a packed descriptor
 * built by combine_to_from(). */
static size_t extract_to(iconv_t cd)
{
	return (size_t)cd >> 1 & 0x7fff;
}
158 iconv_t iconv_open(const char *to, const char *from)
160 size_t f, t;
161 struct stateful_cd *scd;
163 if ((t = find_charmap(to))==-1
164 || (f = find_charmap(from))==-1
165 || (charmaps[t] >= 0330)) {
166 errno = EINVAL;
167 return (iconv_t)-1;
169 iconv_t cd = combine_to_from(t, f);
171 switch (charmaps[f]) {
172 case UTF_16:
173 case UTF_32:
174 case UCS2:
175 case ISO2022_JP:
176 scd = malloc(sizeof *scd);
177 if (!scd) return (iconv_t)-1;
178 scd->base_cd = cd;
179 scd->state = 0;
180 cd = (iconv_t)scd;
183 return cd;
/* Read a 16-bit value from s[0..1].
 * Only bit 0 of e is used: 0 reads big-endian, 1 reads little-endian, so a
 * type code such as UCS2BE (0304) or UCS2LE (0305) can be passed directly. */
static unsigned get_16(const unsigned char *s, int e)
{
	e &= 1;
	return s[e]<<8 | s[1-e];
}
/* Store the low 16 bits of c into s[0..1].
 * Only bit 0 of e is used: 0 writes big-endian, 1 writes little-endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	e &= 1;
	s[e] = c>>8;
	s[1-e] = c;
}
/* Read a 32-bit value from s[0..3].
 * Only the low two bits of e are used: 0 reads big-endian, 3 reads
 * little-endian, which matches UTF_32BE (0300) and UTF_32LE (0303). */
static unsigned get_32(const unsigned char *s, int e)
{
	e &= 3;
	/* "+0U" keeps the top byte's shift in unsigned arithmetic. */
	return (s[e]+0U)<<24 | s[e^1]<<16 | s[e^2]<<8 | s[e^3];
}
/* Store the 32-bit value c into s[0..3].
 * Only the low two bits of e are used: 0 writes big-endian, 3 writes
 * little-endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	e &= 3;
	s[e^0] = c>>24;
	s[e^1] = c>>16;
	s[e^2] = c>>8;
	s[e^3] = c;
}
/* Encode one code point as UTF-8.
 *
 * outb: destination with room for at least 4 bytes.
 * c:    code point to encode.
 *
 * Returns the number of bytes written (1..4). Values above 0x10FFFF are
 * written as a single '?'. */
static inline int utf8enc_wchar(char *outb, wchar_t c)
{
	/* Compare as unsigned: where wchar_t is signed, a negative value would
	 * otherwise satisfy "c <= 0x7F" and be emitted as a raw byte. */
	unsigned long u = (unsigned long)c;

	if (u <= 0x7F) {
		*outb = u;
		return 1;
	}
	else if (u <= 0x7FF) {
		*outb++ = ((u >>  6) & 0x1F) | 0xC0;
		*outb++ = ( u        & 0x3F) | 0x80;
		return 2;
	}
	else if (u <= 0xFFFF) {
		*outb++ = ((u >> 12) & 0x0F) | 0xE0;
		*outb++ = ((u >>  6) & 0x3F) | 0x80;
		*outb++ = ( u        & 0x3F) | 0x80;
		return 3;
	}
	else if (u <= 0x10FFFF) {
		*outb++ = ((u >> 18) & 0x07) | 0xF0;
		*outb++ = ((u >> 12) & 0x3F) | 0x80;
		*outb++ = ((u >>  6) & 0x3F) | 0x80;
		*outb++ = ( u        & 0x3F) | 0x80;
		return 4;
	}
	else {
		*outb++ = '?';
		return 1;
	}
}
/* Tell whether the n-byte UTF-8 sequence at s is an overlong encoding,
 * i.e. spends more bytes than its value needs.
 *
 * s: at least n readable bytes.
 * n: sequence length; only 2, 3 and 4 are examined.
 *
 * Returns non-zero for an overlong sequence, 0 otherwise (including any
 * other n). */
static inline int utf8seq_is_overlong(const unsigned char *s, int n)
{
	switch (n)
	{
	case 2:
		/* 1100000x (10xxxxxx) */
		return (((*s >> 1) == 0x60) &&
				((*(s+1) >> 6) == 0x02));

	case 3:
		/* 11100000 100xxxxx (10xxxxxx) */
		return ((*s == 0xE0) &&
				((*(s+1) >> 5) == 0x04) &&
				((*(s+2) >> 6) == 0x02));

	case 4:
		/* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
		return ((*s == 0xF0) &&
				((*(s+1) >> 4) == 0x08) &&
				((*(s+2) >> 6) == 0x02) &&
				((*(s+3) >> 6) == 0x02));
	}

	return 0;
}
/* Tell whether the n-byte UTF-8 sequence at s encodes a UTF-16 surrogate
 * (U+D800..U+DFFF): a 3-byte sequence starting 0xED with a second byte in
 * 0xA0..0xBF. Returns non-zero if so, 0 otherwise. */
static inline int utf8seq_is_surrogate(const unsigned char *s, int n)
{
	return ((n == 3) && (*s == 0xED) && (*(s+1) >= 0xA0) && (*(s+1) <= 0xBF));
}
/* Tell whether the n-byte UTF-8 sequence at s is EF BF BE or EF BF BF,
 * i.e. encodes U+FFFE or U+FFFF. Returns non-zero if so, 0 otherwise. */
static inline int utf8seq_is_illegal(const unsigned char *s, int n)
{
	return ((n == 3) && (*s == 0xEF) && (*(s+1) == 0xBF) &&
			(*(s+2) >= 0xBE) && (*(s+2) <= 0xBF));
}
/* Decode one UTF-8 sequence.
 *
 * c:   receives the decoded code point; written only on success.
 * in:  input bytes; at least one byte must be readable.
 * inb: number of readable bytes at in.
 *
 * Returns the number of bytes consumed (1..4) on success, -1 if the
 * sequence is invalid, or -2 if the lead byte is valid but fewer than the
 * required number of bytes are available. */
static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb)
{
	int i;
	int n;
	unsigned long v;

	/* trivial char */
	if (*in <= 0x7F) {
		*c = *in;
		return 1;
	}

	/* find utf8 sequence length */
	if      ((*in & 0xE0) == 0xC0) n = 2;
	else if ((*in & 0xF0) == 0xE0) n = 3;
	else if ((*in & 0xF8) == 0xF0) n = 4;
	else
		/* Stray continuation byte, 5/6-byte lead, or 0xFE/0xFF. This has
		 * to be rejected before the length test below: the previous code
		 * compared n == -1 against the unsigned inb and so reported
		 * "starved" (-2) instead of "illegal" for these bytes. */
		return -1;

	/* starved? */
	if ((size_t)n > inb)
		return -2;

	/* reject invalid sequences */
	if (utf8seq_is_overlong(in, n) ||
		utf8seq_is_surrogate(in, n) ||
		utf8seq_is_illegal(in, n))
		return -1;

	/* decode ... */
	v = *in++ & (0x7F >> n);

	for (i = 1; i < n; i++) {
		/* illegal continuation byte */
		if (*in < 0x80 || *in > 0xBF)
			return -1;

		v = (v << 6) | (*in++ & 0x3F);
	}

	/* 4-byte forms can express values past the end of Unicode. */
	if (v > 0x10FFFF)
		return -1;

	*c = (wchar_t)v;
	return n;
}
330 static unsigned legacy_map(const unsigned char *map, unsigned c)
332 if (c < 4*map[-1]) return c;
333 unsigned x = c - 4*map[-1];
334 x = map[x*5/4]>>(2*x%8) | (map[x*5/4+1]<<(8-2*x%8) & 1023);
335 return x < 256 ? x : legacy_chars[x-256];
338 static unsigned uni_to_jis(unsigned c)
340 unsigned nel = sizeof rev_jis / sizeof *rev_jis;
341 unsigned d, j, i, b = 0;
342 for (;;) {
343 i = nel/2;
344 j = rev_jis[b+i];
345 d = jis0208[j/256][j%256];
346 if (d==c) return j + 0x2121;
347 else if (nel == 1) return 0;
348 else if (c < d)
349 nel /= 2;
350 else {
351 b += i;
352 nel -= nel/2;
357 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
359 size_t x=0;
360 struct stateful_cd *scd=0;
361 if (!((size_t)cd & 1)) {
362 scd = (void *)cd;
363 cd = scd->base_cd;
365 unsigned to = extract_to(cd);
366 unsigned from = extract_from(cd);
367 const unsigned char *map = charmaps+from+1;
368 const unsigned char *tomap = charmaps+to+1;
369 char tmp[MB_LEN_MAX];
370 unsigned c, d;
371 size_t k, l;
372 int err;
373 unsigned char type = map[-1];
374 unsigned char totype = tomap[-1];
376 if (!in || !*in || !*inb) return 0;
378 for (; *inb; *in+=l, *inb-=l) {
379 c = *(unsigned char *)*in;
380 l = 1;
382 switch (type) {
383 case UTF_8:
384 if (c < 128) break;
385 else {
386 wchar_t wc;
387 l = utf8dec_wchar(&wc, (unsigned char*)(*in), *inb);
388 c = wc;
390 if (!l) l++;
391 else if (l == (size_t)-1) goto ilseq;
392 else if (l == (size_t)-2) goto starved;
393 break;
394 case US_ASCII:
395 if (c >= 128) goto ilseq;
396 break;
397 case WCHAR_T:
398 l = sizeof(wchar_t);
399 if (*inb < l) goto starved;
400 c = *(wchar_t *)*in;
401 if (0) {
402 case UTF_32BE:
403 case UTF_32LE:
404 l = 4;
405 if (*inb < 4) goto starved;
406 c = get_32((void *)*in, type);
408 if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
409 break;
410 case UCS2BE:
411 case UCS2LE:
412 case UTF_16BE:
413 case UTF_16LE:
414 l = 2;
415 if (*inb < 2) goto starved;
416 c = get_16((void *)*in, type);
417 if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
418 if ((unsigned)(c-0xd800) < 0x400) {
419 if (type-UCS2BE < 2U) goto ilseq;
420 l = 4;
421 if (*inb < 4) goto starved;
422 d = get_16((void *)(*in + 2), type);
423 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
424 c = ((c-0xd7c0)<<10) + (d-0xdc00);
426 break;
427 case UCS2:
428 case UTF_16:
429 l = 0;
430 if (!scd->state) {
431 if (*inb < 2) goto starved;
432 c = get_16((void *)*in, 0);
433 scd->state = type==UCS2
434 ? c==0xfffe ? UCS2LE : UCS2BE
435 : c==0xfffe ? UTF_16LE : UTF_16BE;
436 if (c == 0xfffe || c == 0xfeff)
437 l = 2;
439 type = scd->state;
440 continue;
441 case UTF_32:
442 l = 0;
443 if (!scd->state) {
444 if (*inb < 4) goto starved;
445 c = get_32((void *)*in, 0);
446 scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
447 if (c == 0xfffe0000 || c == 0xfeff)
448 l = 4;
450 type = scd->state;
451 continue;
452 case SHIFT_JIS:
453 if (c < 128) break;
454 if (c-0xa1 <= 0xdf-0xa1) {
455 c += 0xff61-0xa1;
456 break;
458 l = 2;
459 if (*inb < 2) goto starved;
460 d = *((unsigned char *)*in + 1);
461 if (c-129 <= 159-129) c -= 129;
462 else if (c-224 <= 239-224) c -= 193;
463 else goto ilseq;
464 c *= 2;
465 if (d-64 <= 158-64) {
466 if (d==127) goto ilseq;
467 if (d>127) d--;
468 d -= 64;
469 } else if (d-159 <= 252-159) {
470 c++;
471 d -= 159;
473 c = jis0208[c][d];
474 if (!c) goto ilseq;
475 break;
476 case EUC_JP:
477 if (c < 128) break;
478 l = 2;
479 if (*inb < 2) goto starved;
480 d = *((unsigned char *)*in + 1);
481 if (c==0x8e) {
482 c = d;
483 if (c-0xa1 > 0xdf-0xa1) goto ilseq;
484 c += 0xff61 - 0xa1;
485 break;
487 c -= 0xa1;
488 d -= 0xa1;
489 if (c >= 84 || d >= 94) goto ilseq;
490 c = jis0208[c][d];
491 if (!c) goto ilseq;
492 break;
493 case ISO2022_JP:
494 if (c >= 128) goto ilseq;
495 if (c == '\033') {
496 l = 3;
497 if (*inb < 3) goto starved;
498 c = *((unsigned char *)*in + 1);
499 d = *((unsigned char *)*in + 2);
500 if (c != '(' && c != '$') goto ilseq;
501 switch (128*(c=='$') + d) {
502 case 'B': scd->state=0; continue;
503 case 'J': scd->state=1; continue;
504 case 'I': scd->state=4; continue;
505 case 128+'@': scd->state=2; continue;
506 case 128+'B': scd->state=3; continue;
508 goto ilseq;
510 switch (scd->state) {
511 case 1:
512 if (c=='\\') c = 0xa5;
513 if (c=='~') c = 0x203e;
514 break;
515 case 2:
516 case 3:
517 l = 2;
518 if (*inb < 2) goto starved;
519 d = *((unsigned char *)*in + 1);
520 c -= 0x21;
521 d -= 0x21;
522 if (c >= 84 || d >= 94) goto ilseq;
523 c = jis0208[c][d];
524 if (!c) goto ilseq;
525 break;
526 case 4:
527 if (c-0x60 < 0x1f) goto ilseq;
528 if (c-0x21 < 0x5e) c += 0xff61-0x21;
529 break;
531 break;
532 default:
533 if (!c) break;
534 c = legacy_map(map, c);
535 if (!c) goto ilseq;
538 switch (totype) {
539 case WCHAR_T:
540 if (*outb < sizeof(wchar_t)) goto toobig;
541 *(wchar_t *)*out = c;
542 *out += sizeof(wchar_t);
543 *outb -= sizeof(wchar_t);
544 break;
545 case UTF_8:
546 if (*outb < 4) {
547 k = utf8enc_wchar(tmp, c);
548 if (*outb < k) goto toobig;
549 memcpy(*out, tmp, k);
550 } else k = utf8enc_wchar(*out, c);
551 *out += k;
552 *outb -= k;
553 break;
554 case US_ASCII:
555 if (c > 0x7f) subst: x++, c='*';
556 default:
557 if (*outb < 1) goto toobig;
558 if (c<256 && c==legacy_map(tomap, c)) {
559 revout:
560 *(*out)++ = c;
561 *outb -= 1;
562 break;
564 d = c;
565 for (c=4*totype; c<256; c++) {
566 if (d == legacy_map(tomap, c)) {
567 goto revout;
570 goto subst;
571 case SHIFT_JIS:
572 if (c < 128) goto revout;
573 if (c == 0xa5) {
574 x++;
575 c = '\\';
576 goto revout;
578 if (c == 0x203e) {
579 x++;
580 c = '~';
581 goto revout;
583 if (c-0xff61 <= 0xdf-0xa1) {
584 c += 0xa1 - 0xff61;
585 goto revout;
587 c = uni_to_jis(c);
588 if (!c) goto subst;
589 if (*outb < 2) goto toobig;
590 d = c%256;
591 c = c/256;
592 *(*out)++ = (c+1)/2 + (c<95 ? 112 : 176);
593 *(*out)++ = c%2 ? d + 31 + d/96 : d + 126;
594 *outb -= 2;
595 break;
596 case EUC_JP:
597 if (c < 128) goto revout;
598 if (c-0xff61 <= 0xdf-0xa1) {
599 c += 0x0e00 + 0x21 - 0xff61;
600 } else {
601 c = uni_to_jis(c);
603 if (!c) goto subst;
604 if (*outb < 2) goto toobig;
605 *(*out)++ = c/256 + 0x80;
606 *(*out)++ = c%256 + 0x80;
607 *outb -= 2;
608 break;
609 case ISO2022_JP:
610 if (c < 128) goto revout;
611 if (c-0xff61 <= 0xdf-0xa1 || c==0xa5 || c==0x203e) {
612 if (*outb < 7) goto toobig;
613 *(*out)++ = '\033';
614 *(*out)++ = '(';
615 if (c==0xa5) {
616 *(*out)++ = 'J';
617 *(*out)++ = '\\';
618 } else if (c==0x203e) {
619 *(*out)++ = 'J';
620 *(*out)++ = '~';
621 } else {
622 *(*out)++ = 'I';
623 *(*out)++ = c-0xff61+0x21;
625 *(*out)++ = '\033';
626 *(*out)++ = '(';
627 *(*out)++ = 'B';
628 *outb -= 7;
629 break;
631 c = uni_to_jis(c);
632 if (!c) goto subst;
633 if (*outb < 8) goto toobig;
634 *(*out)++ = '\033';
635 *(*out)++ = '$';
636 *(*out)++ = 'B';
637 *(*out)++ = c/256;
638 *(*out)++ = c%256;
639 *(*out)++ = '\033';
640 *(*out)++ = '(';
641 *(*out)++ = 'B';
642 *outb -= 8;
643 break;
644 case UCS2:
645 totype = UCS2BE;
646 case UCS2BE:
647 case UCS2LE:
648 case UTF_16:
649 case UTF_16BE:
650 case UTF_16LE:
651 if (c < 0x10000 || totype-UCS2BE < 2U) {
652 if (c >= 0x10000) c = 0xFFFD;
653 if (*outb < 2) goto toobig;
654 put_16((void *)*out, c, totype);
655 *out += 2;
656 *outb -= 2;
657 break;
659 if (*outb < 4) goto toobig;
660 c -= 0x10000;
661 put_16((void *)*out, (c>>10)|0xd800, totype);
662 put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
663 *out += 4;
664 *outb -= 4;
665 break;
666 case UTF_32BE:
667 case UTF_32LE:
668 if (*outb < 4) goto toobig;
669 put_32((void *)*out, c, totype);
670 *out += 4;
671 *outb -= 4;
672 break;
675 return x;
676 ilseq:
677 err = EILSEQ;
678 x = -1;
679 goto end;
680 toobig:
681 err = E2BIG;
682 x = -1;
683 goto end;
684 starved:
685 err = EINVAL;
686 x = -1;
687 end:
688 errno = err;
689 return x;
/* Release a conversion descriptor obtained from iconv_open().
 *
 * A descriptor with its low bit clear is a pointer to the struct
 * stateful_cd that iconv_open() allocated with malloc(); it has to be freed
 * here or every stateful conversion leaks it. A descriptor with the low bit
 * set (including (iconv_t)-1) is a packed value that owns no memory.
 *
 * Always returns 0. */
int iconv_close(iconv_t cd)
{
	if (!((size_t)cd & 1))
		free((void *)cd);
	return 0;
}