2 * Copyright © 2018 Waldemar Brodkorb <wbx@uclibc-ng.org>
3 * Simplified port of iconv.c from musl C library including
4 * parts of libiconv-tiny.
7 /* Copyright © 2005-2018 Rich Felker, et al.
9 Permission is hereby granted, free of charge, to any person obtaining
10 a copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be
18 included in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
57 #define SHIFT_JIS 0321
58 #define ISO2022_JP 0322
65 /* Definitions of charmaps. Each charmap consists of:
66 * 1. Empty-string-terminated list of null-terminated aliases.
67 * 2. Special type code or number of elided quads of entries.
68 * 3. Character table (size determined by field 2), consisting
69 * of 5 bytes for every 4 characters, interpreted as 10-bit
70 * indices into the legacy_chars table. */
72 static const unsigned char charmaps
[] =
79 "ucs4be\0utf32be\0\0\300"
80 "ucs4le\0utf32le\0\0\303"
81 "ascii\0usascii\0iso646\0iso646us\0\0\307"
86 "shiftjis\0sjis\0\0\321"
91 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
92 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
93 #include "codepages.h"
96 /* Table of characters that appear in legacy 8-bit codepages,
97 * limited to 1024 slots (10 bit indices). The first 256 entries
98 * are elided since those characters are obviously all included. */
99 static const unsigned short legacy_chars
[] = {
100 #include "legacychars.h"
103 static const unsigned short jis0208
[84][94] = {
107 static const unsigned short rev_jis
[] = {
/*
 * Loose comparison of a caller-supplied charset name `a` against a table
 * alias `b`.
 *
 * While both strings still have bytes left, the inner loop advances `a`
 * past bytes that fail both range tests (letters after folding with |32,
 * and digits).  The folded byte of `a` is then compared with the byte of
 * `b`, and 1 is returned on the first mismatch.  `b` is compared as-is,
 * without folding.
 *
 * NOTE(review): the range tests are written `>26` and `>10U`, so the value
 * one past each range ('{', or '[' after folding, and ':') is also kept as
 * significant instead of being skipped -- confirm this is intended.
 * The statements that follow the loop are not part of this excerpt.
 */
111 static int fuzzycmp(const unsigned char *a
, const unsigned char *b
)
113 for (; *a
&& *b
; a
++, b
++) {
114 while (*a
&& (*a
|32U)-'a'>26 && *a
-'0'>10U) a
++;
115 if ((*a
|32U) != *b
) return 1;
/*
 * Look up the charset `name` in the charmaps table.
 *
 * An empty name is replaced by the start of charmaps, i.e. the first alias
 * in the table (the in-line comment names it "utf8").  The loop then walks
 * the table: each alias `s` is tested with fuzzycmp(); on a match the inner
 * loop steps over the remaining aliases of that entry.  Otherwise `s` moves
 * past the current alias, and the last two visible lines step over the rest
 * of an entry: 2 bytes when s[1] > 0200, else 2 + (64 - s[1]) * 5 bytes.
 *
 * The return statements are not part of this excerpt.  iconv_open()
 * compares the result against -1, presumably the not-found value -- verify.
 */
120 static size_t find_charmap(const void *name
)
122 const unsigned char *s
;
123 if (!*(char *)name
) name
=charmaps
; /* "utf8" */
124 for (s
=charmaps
; *s
; ) {
125 if (!fuzzycmp(name
, s
)) {
126 for (; *s
; s
+=strlen((void *)s
)+1);
129 s
+= strlen((void *)s
)+1;
131 if (s
[1] > 0200) s
+=2;
132 else s
+=2+(64U-s
[1])*5;
/*
 * Pack two charmaps offsets into one opaque conversion descriptor.
 *
 * t: offset of the destination charset, placed from bit 1 upwards
 * f: offset of the source charset, placed from bit 16 upwards
 *
 * Bit 0 of the result is always set.  The packed word is returned as a
 * pointer-typed iconv_t; nothing is allocated here.
 */
static iconv_t combine_to_from(size_t t, size_t f)
{
	return (void *)(f<<16 | t<<1 | 1);
}
/*
 * Recover the source-charset offset from a packed descriptor.
 *
 * cd: descriptor value; it is reinterpreted as an integer and everything
 *     from bit 16 upwards is returned.
 */
static size_t extract_from(iconv_t cd)
{
	return (size_t)cd >> 16;
}
/*
 * Recover the destination-charset offset from a packed descriptor.
 *
 * cd: descriptor value; it is reinterpreted as an integer, bit 0 is
 *     shifted out and the next 15 bits are returned.
 */
static size_t extract_to(iconv_t cd)
{
	return (size_t)cd >> 1 & 0x7fff;
}
158 iconv_t
iconv_open(const char *to
, const char *from
)
161 struct stateful_cd
*scd
;
163 if ((t
= find_charmap(to
))==-1
164 || (f
= find_charmap(from
))==-1
165 || (charmaps
[t
] >= 0330)) {
169 iconv_t cd
= combine_to_from(t
, f
);
171 switch (charmaps
[f
]) {
176 scd
= malloc(sizeof *scd
);
177 if (!scd
) return (iconv_t
)-1;
/*
 * Assemble a 16-bit value from the two bytes at `s`: s[e] supplies the high
 * byte and s[1-e] the low byte.  With e == 0 that is a big-endian read,
 * with e == 1 a little-endian read.
 *
 * NOTE(review): the lines between the signature and the return are not part
 * of this excerpt, so whether `e` is reduced to 0/1 before being used as an
 * index cannot be confirmed from here.
 */
186 static unsigned get_16(const unsigned char *s
, int e
)
189 return s
[e
]<<8 | s
[1-e
];
192 static void put_16(unsigned char *s
, unsigned c
, int e
)
/*
 * Assemble a 32-bit value from the four bytes at `s`.  s[e] is the most
 * significant byte, followed by s[e^1], s[e^2] and s[e^3]; e == 0 therefore
 * reads big-endian and e == 3 little-endian.
 *
 * The `+0U` converts the top byte to unsigned before the shift by 24, so
 * that shift is carried out in unsigned arithmetic.
 *
 * NOTE(review): the lines between the signature and the return are not part
 * of this excerpt, so any normalisation of `e` cannot be confirmed here.
 */
199 static unsigned get_32(const unsigned char *s
, int e
)
202 return s
[e
]+0U<<24 | s
[e
^1]<<16 | s
[e
^2]<<8 | s
[e
^3];
205 static void put_32(unsigned char *s
, unsigned c
, int e
)
/*
 * Encode the code point `c` as UTF-8 into the buffer at `outb`.
 *
 * The visible branches write 2 bytes for c <= 0x7FF, 3 bytes for
 * c <= 0xFFFF and 4 bytes for c <= 0x10FFFF: a lead byte tagged 0xC0, 0xE0
 * or 0xF0, then continuation bytes tagged 0x80 that each carry 6 bits.
 *
 * The first branch and every return statement are missing from this
 * excerpt.  iconv() treats the result as the number of bytes produced
 * (it is compared with the remaining space and passed to memcpy).
 */
215 static inline int utf8enc_wchar(char *outb
, wchar_t c
)
221 else if (c
<= 0x7FF) {
222 *outb
++ = ((c
>> 6) & 0x1F) | 0xC0;
223 *outb
++ = ( c
& 0x3F) | 0x80;
226 else if (c
<= 0xFFFF) {
227 *outb
++ = ((c
>> 12) & 0x0F) | 0xE0;
228 *outb
++ = ((c
>> 6) & 0x3F) | 0x80;
229 *outb
++ = ( c
& 0x3F) | 0x80;
232 else if (c
<= 0x10FFFF) {
233 *outb
++ = ((c
>> 18) & 0x07) | 0xF0;
234 *outb
++ = ((c
>> 12) & 0x3F) | 0x80;
235 *outb
++ = ((c
>> 6) & 0x3F) | 0x80;
236 *outb
++ = ( c
& 0x3F) | 0x80;
/*
 * Test whether the UTF-8 sequence at `s` is an overlong encoding.  Three
 * bit patterns are checked, one each for a 2-, 3- and 4-byte form, as
 * spelled out in the comment above each test.
 *
 * The code that chooses between the three tests (presumably on `n`) and
 * the fall-through return are not part of this excerpt.
 *
 * NOTE(review): `s` is a plain `char *`.  Where `char` is signed, bytes
 * >= 0x80 promote to negative ints, so `*s == 0xE0`, `*s == 0xF0` and the
 * shifted comparisons cannot match and the function would never report an
 * overlong sequence -- confirm, and consider reading through
 * `unsigned char`.
 */
245 static inline int utf8seq_is_overlong(char *s
, int n
)
250 /* 1100000x (10xxxxxx) */
251 return (((*s
>> 1) == 0x60) &&
252 ((*(s
+1) >> 6) == 0x02));
255 /* 11100000 100xxxxx (10xxxxxx) */
256 return ((*s
== 0xE0) &&
257 ((*(s
+1) >> 5) == 0x04) &&
258 ((*(s
+2) >> 6) == 0x02));
261 /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
262 return ((*s
== 0xF0) &&
263 ((*(s
+1) >> 4) == 0x08) &&
264 ((*(s
+2) >> 6) == 0x02) &&
265 ((*(s
+3) >> 6) == 0x02));
/*
 * Report whether a 3-byte UTF-8 sequence encodes a UTF-16 surrogate
 * (U+D800..U+DFFF), i.e. lead byte 0xED followed by a byte in 0xA0..0xBF.
 *
 * s: start of the sequence; only read when n == 3
 * n: sequence length in bytes
 *
 * Returns 1 for a surrogate encoding, 0 otherwise.
 *
 * The bytes are inspected as unsigned char: with a signed plain char the
 * values 0xED/0xA0/0xBF are negative after promotion and would never
 * compare equal to (or above) the positive constants.
 */
static inline int utf8seq_is_surrogate(char *s, int n)
{
	const unsigned char *u = (const unsigned char *)s;

	return n == 3 && u[0] == 0xED && u[1] >= 0xA0 && u[1] <= 0xBF;
}
/*
 * Report whether a 3-byte UTF-8 sequence is EF BF BE or EF BF BF, the
 * encodings of U+FFFE and U+FFFF.
 *
 * s: start of the sequence; only read when n == 3
 * n: sequence length in bytes
 *
 * Returns 1 for one of those two sequences, 0 otherwise.
 *
 * The bytes are inspected as unsigned char: with a signed plain char the
 * values 0xEF/0xBF/0xBE are negative after promotion and would never match
 * the positive constants.
 */
static inline int utf8seq_is_illegal(char *s, int n)
{
	const unsigned char *u = (const unsigned char *)s;

	return n == 3 && u[0] == 0xEF && u[1] == 0xBF &&
	       u[2] >= 0xBE && u[2] <= 0xBF;
}
282 static inline int utf8dec_wchar(wchar_t *c
, unsigned char *in
, size_t inb
)
293 /* find utf8 sequence length */
294 if ((*in
& 0xE0) == 0xC0) n
= 2;
295 else if ((*in
& 0xF0) == 0xE0) n
= 3;
296 else if ((*in
& 0xF8) == 0xF0) n
= 4;
297 else if ((*in
& 0xFC) == 0xF8) n
= 5;
298 else if ((*in
& 0xFE) == 0xFC) n
= 6;
305 if (n
> 1 && n
< 5) {
306 /* reject invalid sequences */
307 if (utf8seq_is_overlong(in
, n
) ||
308 utf8seq_is_surrogate(in
, n
) ||
309 utf8seq_is_illegal(in
, n
))
313 *c
= (char)(*in
++ & (0x7F >> n
));
315 for (i
= 1; i
< n
; i
++) {
316 /* illegal continuation byte */
317 if (*in
< 0x80 || *in
> 0xBF)
320 *c
= (*c
<< 6) | (*in
++ & 0x3F);
326 /* unmapped sequence (> 4) */
330 static unsigned legacy_map(const unsigned char *map
, unsigned c
)
332 if (c
< 4*map
[-1]) return c
;
333 unsigned x
= c
- 4*map
[-1];
334 x
= map
[x
*5/4]>>2*x
%8 | map
[x
*5/4+1]<<8-2*x
%8 & 1023;
335 return x
< 256 ? x
: legacy_chars
[x
-256];
338 static unsigned uni_to_jis(unsigned c
)
340 unsigned nel
= sizeof rev_jis
/ sizeof *rev_jis
;
341 unsigned d
, j
, i
, b
= 0;
345 d
= jis0208
[j
/256][j
%256];
346 if (d
==c
) return j
+ 0x2121;
347 else if (nel
== 1) return 0;
357 size_t iconv(iconv_t cd
, char **restrict in
, size_t *restrict inb
, char **restrict out
, size_t *restrict outb
)
360 struct stateful_cd
*scd
=0;
361 if (!((size_t)cd
& 1)) {
365 unsigned to
= extract_to(cd
);
366 unsigned from
= extract_from(cd
);
367 const unsigned char *map
= charmaps
+from
+1;
368 const unsigned char *tomap
= charmaps
+to
+1;
369 char tmp
[MB_LEN_MAX
];
373 unsigned char type
= map
[-1];
374 unsigned char totype
= tomap
[-1];
376 if (!in
|| !*in
|| !*inb
) return 0;
378 for (; *inb
; *in
+=l
, *inb
-=l
) {
379 c
= *(unsigned char *)*in
;
385 l
= utf8dec_wchar(&c
, *in
, *inb
);
387 else if (l
== (size_t)-1) goto ilseq
;
388 else if (l
== (size_t)-2) goto starved
;
391 if (c
>= 128) goto ilseq
;
395 if (*inb
< l
) goto starved
;
401 if (*inb
< 4) goto starved
;
402 c
= get_32((void *)*in
, type
);
404 if (c
-0xd800u
< 0x800u
|| c
>= 0x110000u
) goto ilseq
;
411 if (*inb
< 2) goto starved
;
412 c
= get_16((void *)*in
, type
);
413 if ((unsigned)(c
-0xdc00) < 0x400) goto ilseq
;
414 if ((unsigned)(c
-0xd800) < 0x400) {
415 if (type
-UCS2BE
< 2U) goto ilseq
;
417 if (*inb
< 4) goto starved
;
418 d
= get_16((void *)(*in
+ 2), type
);
419 if ((unsigned)(d
-0xdc00) >= 0x400) goto ilseq
;
420 c
= ((c
-0xd7c0)<<10) + (d
-0xdc00);
427 if (*inb
< 2) goto starved
;
428 c
= get_16((void *)*in
, 0);
429 scd
->state
= type
==UCS2
430 ? c
==0xfffe ? UCS2LE
: UCS2BE
431 : c
==0xfffe ? UTF_16LE
: UTF_16BE
;
432 if (c
== 0xfffe || c
== 0xfeff)
440 if (*inb
< 4) goto starved
;
441 c
= get_32((void *)*in
, 0);
442 scd
->state
= c
==0xfffe0000 ? UTF_32LE
: UTF_32BE
;
443 if (c
== 0xfffe0000 || c
== 0xfeff)
450 if (c
-0xa1 <= 0xdf-0xa1) {
455 if (*inb
< 2) goto starved
;
456 d
= *((unsigned char *)*in
+ 1);
457 if (c
-129 <= 159-129) c
-= 129;
458 else if (c
-224 <= 239-224) c
-= 193;
461 if (d
-64 <= 158-64) {
462 if (d
==127) goto ilseq
;
465 } else if (d
-159 <= 252-159) {
475 if (*inb
< 2) goto starved
;
476 d
= *((unsigned char *)*in
+ 1);
479 if (c
-0xa1 > 0xdf-0xa1) goto ilseq
;
485 if (c
>= 84 || d
>= 94) goto ilseq
;
490 if (c
>= 128) goto ilseq
;
493 if (*inb
< 3) goto starved
;
494 c
= *((unsigned char *)*in
+ 1);
495 d
= *((unsigned char *)*in
+ 2);
496 if (c
!= '(' && c
!= '$') goto ilseq
;
497 switch (128*(c
=='$') + d
) {
498 case 'B': scd
->state
=0; continue;
499 case 'J': scd
->state
=1; continue;
500 case 'I': scd
->state
=4; continue;
501 case 128+'@': scd
->state
=2; continue;
502 case 128+'B': scd
->state
=3; continue;
506 switch (scd
->state
) {
508 if (c
=='\\') c
= 0xa5;
509 if (c
=='~') c
= 0x203e;
514 if (*inb
< 2) goto starved
;
515 d
= *((unsigned char *)*in
+ 1);
518 if (c
>= 84 || d
>= 94) goto ilseq
;
523 if (c
-0x60 < 0x1f) goto ilseq
;
524 if (c
-0x21 < 0x5e) c
+= 0xff61-0x21;
530 c
= legacy_map(map
, c
);
536 if (*outb
< sizeof(wchar_t)) goto toobig
;
537 *(wchar_t *)*out
= c
;
538 *out
+= sizeof(wchar_t);
539 *outb
-= sizeof(wchar_t);
543 k
= utf8enc_wchar(tmp
, c
);
544 if (*outb
< k
) goto toobig
;
545 memcpy(*out
, tmp
, k
);
546 } else k
= utf8enc_wchar(*out
, c
);
551 if (c
> 0x7f) subst
: x
++, c
='*';
553 if (*outb
< 1) goto toobig
;
554 if (c
<256 && c
==legacy_map(tomap
, c
)) {
561 for (c
=4*totype
; c
<256; c
++) {
562 if (d
== legacy_map(tomap
, c
)) {
568 if (c
< 128) goto revout
;
579 if (c
-0xff61 <= 0xdf-0xa1) {
585 if (*outb
< 2) goto toobig
;
588 *(*out
)++ = (c
+1)/2 + (c
<95 ? 112 : 176);
589 *(*out
)++ = c
%2 ? d
+ 31 + d
/96 : d
+ 126;
593 if (c
< 128) goto revout
;
594 if (c
-0xff61 <= 0xdf-0xa1) {
595 c
+= 0x0e00 + 0x21 - 0xff61;
600 if (*outb
< 2) goto toobig
;
601 *(*out
)++ = c
/256 + 0x80;
602 *(*out
)++ = c
%256 + 0x80;
606 if (c
< 128) goto revout
;
607 if (c
-0xff61 <= 0xdf-0xa1 || c
==0xa5 || c
==0x203e) {
608 if (*outb
< 7) goto toobig
;
614 } else if (c
==0x203e) {
619 *(*out
)++ = c
-0xff61+0x21;
629 if (*outb
< 8) goto toobig
;
647 if (c
< 0x10000 || totype
-UCS2BE
< 2U) {
648 if (c
>= 0x10000) c
= 0xFFFD;
649 if (*outb
< 2) goto toobig
;
650 put_16((void *)*out
, c
, totype
);
655 if (*outb
< 4) goto toobig
;
657 put_16((void *)*out
, (c
>>10)|0xd800, totype
);
658 put_16((void *)(*out
+ 2), (c
&0x3ff)|0xdc00, totype
);
664 if (*outb
< 4) goto toobig
;
665 put_32((void *)*out
, c
, totype
);
688 int iconv_close(iconv_t cd
)