2 * Copyright © 2018 Waldemar Brodkorb <wbx@uclibc-ng.org>
3 * Simplified port of iconv.c from musl C library including
4 * parts of libiconv-tiny.
7 /* Copyright © 2005-2018 Rich Felker, et al.
9 Permission is hereby granted, free of charge, to any person obtaining
10 a copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be
18 included in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
57 #define SHIFT_JIS 0321
58 #define ISO2022_JP 0322
65 /* Definitions of charmaps. Each charmap consists of:
66 * 1. Empty-string-terminated list of null-terminated aliases.
67 * 2. Special type code or number of elided quads of entries.
68 * 3. Character table (size determined by field 2), consisting
69 * of 5 bytes for every 4 characters, interpreted as 10-bit
70 * indices into the legacy_chars table. */
72 static const unsigned char charmaps
[] =
79 "ucs4be\0utf32be\0\0\300"
80 "ucs4le\0utf32le\0\0\303"
81 "ascii\0usascii\0iso646\0iso646us\0\0\307"
86 "shiftjis\0sjis\0\0\321"
91 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
92 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
93 #include "codepages.h"
96 /* Table of characters that appear in legacy 8-bit codepages,
97 * limited to 1024 slots (10 bit indices). The first 256 entries
98 * are elided since those characters are obviously all included. */
99 static const unsigned short legacy_chars
[] = {
100 #include "legacychars.h"
103 static const unsigned short jis0208
[84][94] = {
107 static const unsigned short rev_jis
[] = {
111 static int fuzzycmp(const unsigned char *a
, const unsigned char *b
)
113 for (; *a
&& *b
; a
++, b
++) {
114 while (*a
&& (*a
|32U)-'a'>26 && *a
-'0'>10U) a
++;
115 if ((*a
|32U) != *b
) return 1;
120 static size_t find_charmap(const void *name
)
122 const unsigned char *s
;
123 if (!*(char *)name
) name
=charmaps
; /* "utf8" */
124 for (s
=charmaps
; *s
; ) {
125 if (!fuzzycmp(name
, s
)) {
126 for (; *s
; s
+=strlen((void *)s
)+1);
129 s
+= strlen((void *)s
)+1;
131 if (s
[1] > 0200) s
+=2;
132 else s
+=2+(64U-s
[1])*5;
143 static iconv_t
combine_to_from(size_t t
, size_t f
)
145 return (iconv_t
)(f
<<16 | t
<<1 | 1);
148 static size_t extract_from(iconv_t cd
)
150 return (size_t)cd
>> 16;
153 static size_t extract_to(iconv_t cd
)
155 return (size_t)cd
>> 1 & 0x7fff;
158 iconv_t
iconv_open(const char *to
, const char *from
)
161 struct stateful_cd
*scd
;
163 if ((t
= find_charmap(to
))==-1
164 || (f
= find_charmap(from
))==-1
165 || (charmaps
[t
] >= 0330)) {
169 iconv_t cd
= combine_to_from(t
, f
);
171 switch (charmaps
[f
]) {
176 scd
= malloc(sizeof *scd
);
177 if (!scd
) return (iconv_t
)-1;
186 static unsigned get_16(const unsigned char *s
, int e
)
189 return s
[e
]<<8 | s
[1-e
];
192 static void put_16(unsigned char *s
, unsigned c
, int e
)
199 static unsigned get_32(const unsigned char *s
, int e
)
202 return (s
[e
]+0U)<<24 | s
[e
^1]<<16 | s
[e
^2]<<8 | s
[e
^3];
205 static void put_32(unsigned char *s
, unsigned c
, int e
)
215 static inline int utf8enc_wchar(char *outb
, wchar_t c
)
221 else if (c
<= 0x7FF) {
222 *outb
++ = ((c
>> 6) & 0x1F) | 0xC0;
223 *outb
++ = ( c
& 0x3F) | 0x80;
226 else if (c
<= 0xFFFF) {
227 *outb
++ = ((c
>> 12) & 0x0F) | 0xE0;
228 *outb
++ = ((c
>> 6) & 0x3F) | 0x80;
229 *outb
++ = ( c
& 0x3F) | 0x80;
232 else if (c
<= 0x10FFFF) {
233 *outb
++ = ((c
>> 18) & 0x07) | 0xF0;
234 *outb
++ = ((c
>> 12) & 0x3F) | 0x80;
235 *outb
++ = ((c
>> 6) & 0x3F) | 0x80;
236 *outb
++ = ( c
& 0x3F) | 0x80;
245 static inline int utf8seq_is_overlong(unsigned char *s
, int n
)
250 /* 1100000x (10xxxxxx) */
251 return (((*s
>> 1) == 0x60) &&
252 ((*(s
+1) >> 6) == 0x02));
255 /* 11100000 100xxxxx (10xxxxxx) */
256 return ((*s
== 0xE0) &&
257 ((*(s
+1) >> 5) == 0x04) &&
258 ((*(s
+2) >> 6) == 0x02));
261 /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
262 return ((*s
== 0xF0) &&
263 ((*(s
+1) >> 4) == 0x08) &&
264 ((*(s
+2) >> 6) == 0x02) &&
265 ((*(s
+3) >> 6) == 0x02));
271 static inline int utf8seq_is_surrogate(unsigned char *s
, int n
)
273 return ((n
== 3) && (*s
== 0xED) && (*(s
+1) >= 0xA0) && (*(s
+1) <= 0xBF));
276 static inline int utf8seq_is_illegal(unsigned char *s
, int n
)
278 return ((n
== 3) && (*s
== 0xEF) && (*(s
+1) == 0xBF) &&
279 (*(s
+2) >= 0xBE) && (*(s
+2) <= 0xBF));
282 static inline int utf8dec_wchar(wchar_t *c
, unsigned char *in
, size_t inb
)
293 /* find utf8 sequence length */
294 if ((*in
& 0xE0) == 0xC0) n
= 2;
295 else if ((*in
& 0xF0) == 0xE0) n
= 3;
296 else if ((*in
& 0xF8) == 0xF0) n
= 4;
297 else if ((*in
& 0xFC) == 0xF8) n
= 5;
298 else if ((*in
& 0xFE) == 0xFC) n
= 6;
305 if (n
> 1 && n
< 5) {
306 /* reject invalid sequences */
307 if (utf8seq_is_overlong(in
, n
) ||
308 utf8seq_is_surrogate(in
, n
) ||
309 utf8seq_is_illegal(in
, n
))
313 *c
= (char)(*in
++ & (0x7F >> n
));
315 for (i
= 1; i
< n
; i
++) {
316 /* illegal continuation byte */
317 if (*in
< 0x80 || *in
> 0xBF)
320 *c
= (*c
<< 6) | (*in
++ & 0x3F);
326 /* unmapped sequence (> 4) */
330 static unsigned legacy_map(const unsigned char *map
, unsigned c
)
332 if (c
< 4*map
[-1]) return c
;
333 unsigned x
= c
- 4*map
[-1];
334 x
= map
[x
*5/4]>>(2*x
%8) | (map
[x
*5/4+1]<<(8-2*x
%8) & 1023);
335 return x
< 256 ? x
: legacy_chars
[x
-256];
338 static unsigned uni_to_jis(unsigned c
)
340 unsigned nel
= sizeof rev_jis
/ sizeof *rev_jis
;
341 unsigned d
, j
, i
, b
= 0;
345 d
= jis0208
[j
/256][j
%256];
346 if (d
==c
) return j
+ 0x2121;
347 else if (nel
== 1) return 0;
357 size_t iconv(iconv_t cd
, char **restrict in
, size_t *restrict inb
, char **restrict out
, size_t *restrict outb
)
360 struct stateful_cd
*scd
=0;
361 if (!((size_t)cd
& 1)) {
365 unsigned to
= extract_to(cd
);
366 unsigned from
= extract_from(cd
);
367 const unsigned char *map
= charmaps
+from
+1;
368 const unsigned char *tomap
= charmaps
+to
+1;
369 char tmp
[MB_LEN_MAX
];
373 unsigned char type
= map
[-1];
374 unsigned char totype
= tomap
[-1];
376 if (!in
|| !*in
|| !*inb
) return 0;
378 for (; *inb
; *in
+=l
, *inb
-=l
) {
379 c
= *(unsigned char *)*in
;
387 l
= utf8dec_wchar(&wc
, (unsigned char*)(*in
), *inb
);
391 else if (l
== (size_t)-1) goto ilseq
;
392 else if (l
== (size_t)-2) goto starved
;
395 if (c
>= 128) goto ilseq
;
399 if (*inb
< l
) goto starved
;
405 if (*inb
< 4) goto starved
;
406 c
= get_32((void *)*in
, type
);
408 if (c
-0xd800u
< 0x800u
|| c
>= 0x110000u
) goto ilseq
;
415 if (*inb
< 2) goto starved
;
416 c
= get_16((void *)*in
, type
);
417 if ((unsigned)(c
-0xdc00) < 0x400) goto ilseq
;
418 if ((unsigned)(c
-0xd800) < 0x400) {
419 if (type
-UCS2BE
< 2U) goto ilseq
;
421 if (*inb
< 4) goto starved
;
422 d
= get_16((void *)(*in
+ 2), type
);
423 if ((unsigned)(d
-0xdc00) >= 0x400) goto ilseq
;
424 c
= ((c
-0xd7c0)<<10) + (d
-0xdc00);
431 if (*inb
< 2) goto starved
;
432 c
= get_16((void *)*in
, 0);
433 scd
->state
= type
==UCS2
434 ? c
==0xfffe ? UCS2LE
: UCS2BE
435 : c
==0xfffe ? UTF_16LE
: UTF_16BE
;
436 if (c
== 0xfffe || c
== 0xfeff)
444 if (*inb
< 4) goto starved
;
445 c
= get_32((void *)*in
, 0);
446 scd
->state
= c
==0xfffe0000 ? UTF_32LE
: UTF_32BE
;
447 if (c
== 0xfffe0000 || c
== 0xfeff)
454 if (c
-0xa1 <= 0xdf-0xa1) {
459 if (*inb
< 2) goto starved
;
460 d
= *((unsigned char *)*in
+ 1);
461 if (c
-129 <= 159-129) c
-= 129;
462 else if (c
-224 <= 239-224) c
-= 193;
465 if (d
-64 <= 158-64) {
466 if (d
==127) goto ilseq
;
469 } else if (d
-159 <= 252-159) {
479 if (*inb
< 2) goto starved
;
480 d
= *((unsigned char *)*in
+ 1);
483 if (c
-0xa1 > 0xdf-0xa1) goto ilseq
;
489 if (c
>= 84 || d
>= 94) goto ilseq
;
494 if (c
>= 128) goto ilseq
;
497 if (*inb
< 3) goto starved
;
498 c
= *((unsigned char *)*in
+ 1);
499 d
= *((unsigned char *)*in
+ 2);
500 if (c
!= '(' && c
!= '$') goto ilseq
;
501 switch (128*(c
=='$') + d
) {
502 case 'B': scd
->state
=0; continue;
503 case 'J': scd
->state
=1; continue;
504 case 'I': scd
->state
=4; continue;
505 case 128+'@': scd
->state
=2; continue;
506 case 128+'B': scd
->state
=3; continue;
510 switch (scd
->state
) {
512 if (c
=='\\') c
= 0xa5;
513 if (c
=='~') c
= 0x203e;
518 if (*inb
< 2) goto starved
;
519 d
= *((unsigned char *)*in
+ 1);
522 if (c
>= 84 || d
>= 94) goto ilseq
;
527 if (c
-0x60 < 0x1f) goto ilseq
;
528 if (c
-0x21 < 0x5e) c
+= 0xff61-0x21;
534 c
= legacy_map(map
, c
);
540 if (*outb
< sizeof(wchar_t)) goto toobig
;
541 *(wchar_t *)*out
= c
;
542 *out
+= sizeof(wchar_t);
543 *outb
-= sizeof(wchar_t);
547 k
= utf8enc_wchar(tmp
, c
);
548 if (*outb
< k
) goto toobig
;
549 memcpy(*out
, tmp
, k
);
550 } else k
= utf8enc_wchar(*out
, c
);
555 if (c
> 0x7f) subst
: x
++, c
='*';
557 if (*outb
< 1) goto toobig
;
558 if (c
<256 && c
==legacy_map(tomap
, c
)) {
565 for (c
=4*totype
; c
<256; c
++) {
566 if (d
== legacy_map(tomap
, c
)) {
572 if (c
< 128) goto revout
;
583 if (c
-0xff61 <= 0xdf-0xa1) {
589 if (*outb
< 2) goto toobig
;
592 *(*out
)++ = (c
+1)/2 + (c
<95 ? 112 : 176);
593 *(*out
)++ = c
%2 ? d
+ 31 + d
/96 : d
+ 126;
597 if (c
< 128) goto revout
;
598 if (c
-0xff61 <= 0xdf-0xa1) {
599 c
+= 0x0e00 + 0x21 - 0xff61;
604 if (*outb
< 2) goto toobig
;
605 *(*out
)++ = c
/256 + 0x80;
606 *(*out
)++ = c
%256 + 0x80;
610 if (c
< 128) goto revout
;
611 if (c
-0xff61 <= 0xdf-0xa1 || c
==0xa5 || c
==0x203e) {
612 if (*outb
< 7) goto toobig
;
618 } else if (c
==0x203e) {
623 *(*out
)++ = c
-0xff61+0x21;
633 if (*outb
< 8) goto toobig
;
651 if (c
< 0x10000 || totype
-UCS2BE
< 2U) {
652 if (c
>= 0x10000) c
= 0xFFFD;
653 if (*outb
< 2) goto toobig
;
654 put_16((void *)*out
, c
, totype
);
659 if (*outb
< 4) goto toobig
;
661 put_16((void *)*out
, (c
>>10)|0xd800, totype
);
662 put_16((void *)(*out
+ 2), (c
&0x3ff)|0xdc00, totype
);
668 if (*outb
< 4) goto toobig
;
669 put_32((void *)*out
, c
, totype
);
692 int iconv_close(iconv_t cd
)