8 #include "locale_impl.h"
23 #define SHIFT_JIS 0321
24 #define ISO2022_JP 0322
31 /* Definitions of charmaps. Each charmap consists of:
32 * 1. Empty-string-terminated list of null-terminated aliases.
33 * 2. Special type code or number of elided quads of entries.
34 * 3. Character table (size determined by field 2), consisting
35 * of 5 bytes for every 4 characters, interpreted as 10-bit
36 * indices into the legacy_chars table. */
38 static const unsigned char charmaps
[] =
45 "ucs4be\0utf32be\0\0\300"
46 "ucs4le\0utf32le\0\0\303"
47 "ascii\0usascii\0iso646\0iso646us\0\0\307"
52 "shiftjis\0sjis\0cp932\0\0\321"
57 "big5\0bigfive\0cp950\0big5hkscs\0\0\340"
58 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
59 #include "codepages.h"
62 /* Table of characters that appear in legacy 8-bit codepages,
63 * limited to 1024 slots (10 bit indices). The first 256 entries
64 * are elided since those characters are obviously all included. */
65 static const unsigned short legacy_chars
[] = {
66 #include "legacychars.h"
69 static const unsigned short jis0208
[84][94] = {
73 static const unsigned short gb18030
[126][190] = {
77 static const unsigned short big5
[89][157] = {
81 static const unsigned short hkscs
[] = {
85 static const unsigned short ksc
[93][94] = {
89 static const unsigned short rev_jis
[] = {
/* Loose comparison of the name `a` against the string `b`.
 * While both strings have bytes left, bytes of `a` that fail both range
 * tests in the inner loop (roughly: neither an ASCII letter nor a digit)
 * are skipped, then the current byte of `a` with bit 5 forced on (ASCII
 * lower-casing) is compared to the current byte of `b`, which is used
 * as-is. Returns 1 at the first mismatch.
 * NOTE(review): the statement(s) following the loop, including the final
 * return, are not visible in this excerpt. */
93 static int fuzzycmp(const unsigned char *a
, const unsigned char *b
)
95 for (; *a
&& *b
; a
++, b
++) {
96 while (*a
&& (*a
|32U)-'a'>26 && *a
-'0'>10U) a
++;
97 if ((*a
|32U) != *b
) return 1;
/* Search the charmaps table for an alias matching `name` under fuzzycmp.
 * An empty `name` is replaced by a pointer to the start of the table
 * (annotated "utf8" below). The scan walks alias by alias; on a match it
 * advances `s` to the empty string that terminates that alias list. On a
 * mismatch it steps over the alias and then over the entry's trailing
 * data: 2 bytes when s[1] > 0200, otherwise 2 bytes plus (64 - s[1])*5
 * bytes of character table.
 * NOTE(review): the return statements are not visible in this excerpt;
 * iconv_open compares the result against -1. */
102 static size_t find_charmap(const void *name
)
104 const unsigned char *s
;
105 if (!*(char *)name
) name
=charmaps
; /* "utf8" */
106 for (s
=charmaps
; *s
; ) {
107 if (!fuzzycmp(name
, s
)) {
108 for (; *s
; s
+=strlen((void *)s
)+1);
111 s
+= strlen((void *)s
)+1;
113 if (s
[1] > 0200) s
+=2;
114 else s
+=2+(64U-s
[1])*5;
/* Pack a pair of charmap offsets into a single descriptor value.
 *
 * Layout of the resulting integer (stored in the iconv_t):
 *   bit 0      always 1 (iconv() tests this bit on the descriptor)
 *   bits 1..   `t`, the destination charmap offset
 *   bits 16..  `f`, the source charmap offset
 *
 * extract_to() reads `t` back through a 15-bit mask, so `t` must be
 * below 0x8000 for the two fields not to overlap. */
static iconv_t combine_to_from(size_t t, size_t f)
{
	return (iconv_t)(f<<16 | t<<1 | 1);
}
/* Recover the source charmap offset from a descriptor built by
 * combine_to_from(): everything from bit 16 upward. */
static size_t extract_from(iconv_t cd)
{
	return (size_t)cd >> 16;
}
/* Recover the destination charmap offset from a descriptor built by
 * combine_to_from(): drop the flag in bit 0, then keep the low 15 bits
 * so that the source offset stored from bit 16 upward is excluded. */
static size_t extract_to(iconv_t cd)
{
	return ((size_t)cd >> 1) & 0x7fff;
}
/* Create a conversion descriptor for converting from charset `from` to
 * charset `to`.
 * Both names are looked up with find_charmap; the failure branch is taken
 * when either lookup yields -1 or when the destination's type byte
 * (charmaps[t]) is >= 0330. Otherwise the two offsets are packed with
 * combine_to_from. The switch on the source's type byte then selects
 * cases in which a struct stateful_cd is allocated; if that allocation
 * fails, (iconv_t)-1 is returned.
 * NOTE(review): the declarations of `t` and `f`, the body of the failure
 * branch, the case labels and the final return are not visible in this
 * excerpt. */
140 iconv_t
iconv_open(const char *to
, const char *from
)
143 struct stateful_cd
*scd
;
145 if ((t
= find_charmap(to
))==-1
146 || (f
= find_charmap(from
))==-1
147 || (charmaps
[t
] >= 0330)) {
151 iconv_t cd
= combine_to_from(t
, f
);
153 switch (charmaps
[f
]) {
158 scd
= malloc(sizeof *scd
);
159 if (!scd
) return (iconv_t
)-1;
/* Read a 16-bit value from the two bytes at `s`: s[e] supplies the high
 * byte and s[1-e] the low byte, so `e` selects the byte order.
 * NOTE(review): any adjustment of `e` before the return is not visible
 * in this excerpt. */
168 static unsigned get_16(const unsigned char *s
, int e
)
171 return s
[e
]<<8 | s
[1-e
];
174 static void put_16(unsigned char *s
, unsigned c
, int e
)
/* Read a 32-bit value from the four bytes at `s`. The bytes are taken in
 * the order s[e], s[e^1], s[e^2], s[e^3] from most to least significant,
 * so `e` selects the byte order. The `+0U` makes the top byte unsigned
 * before it is shifted into bits 24..31.
 * NOTE(review): any adjustment of `e` before the return is not visible
 * in this excerpt. */
181 static unsigned get_32(const unsigned char *s
, int e
)
184 return s
[e
]+0U<<24 | s
[e
^1]<<16 | s
[e
^2]<<8 | s
[e
^3];
187 static void put_32(unsigned char *s
, unsigned c
, int e
)
196 /* Adapt as needed */
197 #define mbrtowc_utf8 mbrtowc
198 #define wctomb_utf8 wctomb
200 static unsigned legacy_map(const unsigned char *map
, unsigned c
)
202 if (c
< 4*map
[-1]) return c
;
203 unsigned x
= c
- 4*map
[-1];
204 x
= map
[x
*5/4]>>2*x
%8 | map
[x
*5/4+1]<<8-2*x
%8 & 1023;
205 return x
< 256 ? x
: legacy_chars
[x
-256];
/* Look up `c` via the jis0208 table: a candidate position `j` is split
 * as jis0208[j/256][j%256]; if that entry equals `c`, the result is
 * j + 0x2121. If it does not match and the remaining element count `nel`
 * is 1, the result is 0. `nel` starts as the number of elements in
 * rev_jis.
 * NOTE(review): the loop that derives `j` and updates `nel`, `i` and `b`
 * is not visible in this excerpt — presumably a search over rev_jis;
 * confirm against the full source. */
208 static unsigned uni_to_jis(unsigned c
)
210 unsigned nel
= sizeof rev_jis
/ sizeof *rev_jis
;
211 unsigned d
, j
, i
, b
= 0;
215 d
= jis0208
[j
/256][j
%256];
216 if (d
==c
) return j
+ 0x2121;
217 else if (nel
== 1) return 0;
227 size_t iconv(iconv_t cd
, char **restrict in
, size_t *restrict inb
, char **restrict out
, size_t *restrict outb
)
230 struct stateful_cd
*scd
=0;
231 if (!((size_t)cd
& 1)) {
235 unsigned to
= extract_to(cd
);
236 unsigned from
= extract_from(cd
);
237 const unsigned char *map
= charmaps
+from
+1;
238 const unsigned char *tomap
= charmaps
+to
+1;
244 unsigned char type
= map
[-1];
245 unsigned char totype
= tomap
[-1];
246 locale_t
*ploc
= &CURRENT_LOCALE
, loc
= *ploc
;
248 if (!in
|| !*in
|| !*inb
) return 0;
252 for (; *inb
; *in
+=l
, *inb
-=l
) {
253 c
= *(unsigned char *)*in
;
259 l
= mbrtowc_utf8(&wc
, *in
, *inb
, &st
);
260 if (l
== (size_t)-1) goto ilseq
;
261 if (l
== (size_t)-2) goto starved
;
265 if (c
>= 128) goto ilseq
;
269 if (*inb
< l
) goto starved
;
275 if (*inb
< 4) goto starved
;
276 c
= get_32((void *)*in
, type
);
278 if (c
-0xd800u
< 0x800u
|| c
>= 0x110000u
) goto ilseq
;
285 if (*inb
< 2) goto starved
;
286 c
= get_16((void *)*in
, type
);
287 if ((unsigned)(c
-0xdc00) < 0x400) goto ilseq
;
288 if ((unsigned)(c
-0xd800) < 0x400) {
289 if (type
-UCS2BE
< 2U) goto ilseq
;
291 if (*inb
< 4) goto starved
;
292 d
= get_16((void *)(*in
+ 2), type
);
293 if ((unsigned)(d
-0xdc00) >= 0x400) goto ilseq
;
294 c
= ((c
-0xd7c0)<<10) + (d
-0xdc00);
301 if (*inb
< 2) goto starved
;
302 c
= get_16((void *)*in
, 0);
303 scd
->state
= type
==UCS2
304 ? c
==0xfffe ? UCS2LE
: UCS2BE
305 : c
==0xfffe ? UTF_16LE
: UTF_16BE
;
306 if (c
== 0xfffe || c
== 0xfeff)
314 if (*inb
< 4) goto starved
;
315 c
= get_32((void *)*in
, 0);
316 scd
->state
= c
==0xfffe0000 ? UTF_32LE
: UTF_32BE
;
317 if (c
== 0xfffe0000 || c
== 0xfeff)
324 if (c
-0xa1 <= 0xdf-0xa1) {
329 if (*inb
< 2) goto starved
;
330 d
= *((unsigned char *)*in
+ 1);
331 if (c
-129 <= 159-129) c
-= 129;
332 else if (c
-224 <= 239-224) c
-= 193;
335 if (d
-64 <= 158-64) {
336 if (d
==127) goto ilseq
;
339 } else if (d
-159 <= 252-159) {
349 if (*inb
< 2) goto starved
;
350 d
= *((unsigned char *)*in
+ 1);
353 if (c
-0xa1 > 0xdf-0xa1) goto ilseq
;
359 if (c
>= 84 || d
>= 94) goto ilseq
;
364 if (c
>= 128) goto ilseq
;
367 if (*inb
< 3) goto starved
;
368 c
= *((unsigned char *)*in
+ 1);
369 d
= *((unsigned char *)*in
+ 2);
370 if (c
!= '(' && c
!= '$') goto ilseq
;
371 switch (128*(c
=='$') + d
) {
372 case 'B': scd
->state
=0; continue;
373 case 'J': scd
->state
=1; continue;
374 case 'I': scd
->state
=4; continue;
375 case 128+'@': scd
->state
=2; continue;
376 case 128+'B': scd
->state
=3; continue;
380 switch (scd
->state
) {
382 if (c
=='\\') c
= 0xa5;
383 if (c
=='~') c
= 0x203e;
388 if (*inb
< 2) goto starved
;
389 d
= *((unsigned char *)*in
+ 1);
392 if (c
>= 84 || d
>= 94) goto ilseq
;
397 if (c
-0x60 < 0x1f) goto ilseq
;
398 if (c
-0x21 < 0x5e) c
+= 0xff61-0x21;
404 if (c
< 0xa1) goto ilseq
;
413 if (c
>= 126) goto ilseq
;
415 if (*inb
< 2) goto starved
;
416 d
= *((unsigned char *)*in
+ 1);
417 if (d
< 0xa1 && type
== GB2312
) goto ilseq
;
418 if (d
-0x40>=191 || d
==127) {
419 if (d
-'0'>9 || type
!= GB18030
)
422 if (*inb
< 4) goto starved
;
423 c
= (10*c
+ d
-'0') * 1260;
424 d
= *((unsigned char *)*in
+ 2);
425 if (d
-0x81>126) goto ilseq
;
427 d
= *((unsigned char *)*in
+ 3);
428 if (d
-'0'>9) goto ilseq
;
433 for (int i
=0; i
<126; i
++)
434 for (int j
=0; j
<190; j
++)
435 if (gb18030
[i
][j
]-d
<= c
-d
)
449 if (*inb
< 2) goto starved
;
450 d
= *((unsigned char *)*in
+ 1);
451 if (d
-0x40>=0xff-0x40 || d
-0x7f<0xa1-0x7f) goto ilseq
;
453 if (d
> 0x3e) d
-= 0x22;
454 if (c
-0xa1>=0xfa-0xa1) {
455 if (c
-0x87>=0xff-0x87) goto ilseq
;
456 if (c
< 0xa1) c
-= 0x87;
457 else c
-= 0x87 + (0xfa-0xa1);
458 c
= (hkscs
[4867+(c
*157+d
)/16]>>(c
*157+d
)%16)%2<<17
460 /* A few HKSCS characters map to pairs of UCS
461 * characters. These are mapped to surrogate
462 * range in the hkscs table then hard-coded
463 * here. Ugly, yes. */
470 size_t tmpx
= iconv(combine_to_from(to
, find_charmap("utf8")),
471 &(char *){"\303\212\314\204"
475 +c
%256}, &(size_t){4},
476 &ptmp
, &(size_t){sizeof tmp
});
477 size_t tmplen
= ptmp
- tmp
.c
;
478 if (tmplen
> *outb
) goto toobig
;
480 memcpy(*out
, &tmp
, tmplen
);
489 c
= big5
[c
][d
]|(c
==0x27&&(d
==0x3a||d
==0x3c||d
==0x42))<<17;
495 if (*inb
< 2) goto starved
;
496 d
= *((unsigned char *)*in
+ 1);
499 if (c
>= 93 || d
>= 94) {
502 if (c
>= 93 || c
>=0xc6-0x81 && d
>0x52)
504 if (d
-'A'<26) d
= d
-'A';
505 else if (d
-'a'<26) d
= d
-'a'+26;
506 else if (d
-0x81<0xff-0x81) d
= d
-0x81+52;
508 if (c
< 0x20) c
= 178*c
+ d
;
509 else c
= 178*0x20 + 84*(c
-0x20) + d
;
511 for (d
=0xac00; d
<=c
; ) {
513 for (int i
=0; i
<93; i
++)
514 for (int j
=0; j
<94; j
++)
515 if (ksc
[i
][j
]-d
<= c
-d
)
527 c
= legacy_map(map
, c
);
533 if (*outb
< sizeof(wchar_t)) goto toobig
;
534 *(wchar_t *)*out
= c
;
535 *out
+= sizeof(wchar_t);
536 *outb
-= sizeof(wchar_t);
541 k
= wctomb_utf8(tmp
, c
);
542 if (*outb
< k
) goto toobig
;
543 memcpy(*out
, tmp
, k
);
544 } else k
= wctomb_utf8(*out
, c
);
549 if (c
> 0x7f) subst
: x
++, c
='*';
551 if (*outb
< 1) goto toobig
;
552 if (c
<256 && c
==legacy_map(tomap
, c
)) {
554 if (*outb
< 1) goto toobig
;
560 for (c
=4*totype
; c
<256; c
++) {
561 if (d
== legacy_map(tomap
, c
)) {
567 if (c
< 128) goto revout
;
578 if (c
-0xff61 <= 0xdf-0xa1) {
584 if (*outb
< 2) goto toobig
;
587 *(*out
)++ = (c
+1)/2 + (c
<95 ? 112 : 176);
588 *(*out
)++ = c
%2 ? d
+ 31 + d
/96 : d
+ 126;
592 if (c
< 128) goto revout
;
593 if (c
-0xff61 <= 0xdf-0xa1) {
594 c
+= 0x0e00 + 0x21 - 0xff61;
599 if (*outb
< 2) goto toobig
;
600 *(*out
)++ = c
/256 + 0x80;
601 *(*out
)++ = c
%256 + 0x80;
605 if (c
< 128) goto revout
;
606 if (c
-0xff61 <= 0xdf-0xa1 || c
==0xa5 || c
==0x203e) {
607 if (*outb
< 7) goto toobig
;
613 } else if (c
==0x203e) {
618 *(*out
)++ = c
-0xff61+0x21;
628 if (*outb
< 8) goto toobig
;
646 if (c
< 0x10000 || totype
-UCS2BE
< 2U) {
647 if (c
>= 0x10000) c
= 0xFFFD;
648 if (*outb
< 2) goto toobig
;
649 put_16((void *)*out
, c
, totype
);
654 if (*outb
< 4) goto toobig
;
656 put_16((void *)*out
, (c
>>10)|0xd800, totype
);
657 put_16((void *)(*out
+ 2), (c
&0x3ff)|0xdc00, totype
);
665 if (*outb
< 4) goto toobig
;
666 put_32((void *)*out
, c
, totype
);