2 * Copyright (c) 2003, 2005 Ryuichiro Imura
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * $FreeBSD: head/sys/libkern/iconv_ucs.c 267291 2014-06-09 19:27:47Z jhb $
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/systm.h>
32 #include <sys/malloc.h>
33 #include <sys/iconv.h>
35 #include "iconv_converter_if.h"
/*
 * Bits stored in the converter instance's convtype field.
 */
/* secondary converters (f_ctp/t_ctp) may be opened and used */
#define	KICONV_UCS_COMBINE	0x01
/* source encoding is UTF-8 */
#define	KICONV_UCS_FROM_UTF8	0x02
/* destination encoding is UTF-8 */
#define	KICONV_UCS_TO_UTF8	0x04
/* source encoding is little-endian (UCS-2LE, UTF-16LE) */
#define	KICONV_UCS_FROM_LE	0x08
/* destination encoding is little-endian (UCS-2LE, UTF-16LE) */
#define	KICONV_UCS_TO_LE	0x10
/* source encoding is UTF-16 (either byte order) */
#define	KICONV_UCS_FROM_UTF16	0x20
/* destination encoding is UTF-16 (either byte order) */
#define	KICONV_UCS_TO_UTF16	0x40
/* set by iconv_ucs_open() when ENCODING_UNICODE equals ENCODING_UTF16 */
#define	KICONV_UCS_UCS4		0x80

/*
 * Encoding names used by this converter.
 */
#define	ENCODING_UTF16		"UTF-16BE"
#define	ENCODING_UTF8		"UTF-8"
/*
 * unicode_family: maps each Unicode-family encoding name to the convtype
 * bit(s) it contributes when it is the source (from_flag) or the
 * destination (to_flag) of a conversion.  UTF-16LE combines the UTF-16 bit
 * with the little-endian bit for each direction.
 *
 * NOTE(review): the opening of the struct (including the name member that
 * iconv_ucs_open() reads) and the end of the initializer are not present in
 * this extract.  iconv_ucs_open() stops at an entry whose name is NULL, so
 * the list presumably ends with such a sentinel -- confirm against the full
 * file.
 */
55 int from_flag
, to_flag
;
56 } unicode_family
[] = {
57 { "UTF-8", KICONV_UCS_FROM_UTF8
, KICONV_UCS_TO_UTF8
},
58 { "UCS-2LE", KICONV_UCS_FROM_LE
, KICONV_UCS_TO_LE
},
59 { "UTF-16BE", KICONV_UCS_FROM_UTF16
, KICONV_UCS_TO_UTF16
},
60 { "UTF-16LE", KICONV_UCS_FROM_UTF16
|KICONV_UCS_FROM_LE
,
61 KICONV_UCS_TO_UTF16
|KICONV_UCS_TO_LE
},
/*
 * Forward declarations of the file-local UTF-8 / UTF-16 helpers.
 */

/* Decode the UTF-8 sequence at src into a code point. */
static uint32_t	utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);

/* Encode ucs4 as UTF-8 into dst; iconv_ucs_conv() treats NULL as failure. */
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);

/* Pack a code point at or above 0x10000 into a 32-bit surrogate pair. */
static uint32_t	encode_surrogate(uint32_t code);

/* Rebuild a code point from the four bytes of a surrogate pair. */
static uint32_t	decode_surrogate(const u_char *ucs);
71 MODULE_DEPEND(iconv_ucs
, libiconv
, 2, 2, 2);
/*
 * Per-instance converter state.  d_csp and d_cspf point at the charset-pair
 * entries whose cp_refcount iconv_ucs_close() decrements.
 *
 * NOTE(review): the struct header and the other members used elsewhere in
 * this file (convtype, f_ctp, t_ctp, ctype) are not visible in this extract.
 */
75 * UCS converter instance
80 struct iconv_cspair
* d_csp
;
81 struct iconv_cspair
* d_cspf
;
/*
 * iconv_ucs_open: create a UCS converter instance for the charset pair csp
 * and the optional pair cspf.
 *
 * Steps visible in this extract:
 *  - allocate the instance with kobj_create(dcp, M_ICONV, M_WAITOK);
 *  - take the source name from cspf->cp_from when cspf is non-NULL,
 *    otherwise from csp->cp_from;
 *  - OR into convtype the from_flag / to_flag of every unicode_family entry
 *    whose name matches "from" / "to" (case-insensitive);
 *  - set KICONV_UCS_UCS4 when ENCODING_UNICODE equals ENCODING_UTF16, and
 *    clear it otherwise (presumably the else branch -- the keyword is not
 *    visible here);
 *  - with KICONV_UCS_COMBINE set, call iconv_open() to fill f_ctp when the
 *    source is neither UTF-8 nor LE, and t_ctp when the destination is
 *    neither UTF-8 nor LE;
 *  - call iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, ...) to fill ctype
 *    when either side is UTF-8.
 *
 * NOTE(review): none of the visible iconv_open() return values is checked.
 * NOTE(review): the return type, local declarations, the assignment of
 * "to", the condition guarding KICONV_UCS_COMBINE, the reference-count
 * updates and the return statement are missing from this extract.
 */
88 iconv_ucs_open(struct iconv_converter_class
*dcp
,
89 struct iconv_cspair
*csp
, struct iconv_cspair
*cspf
, void **dpp
)
93 const char *from
, *to
;
95 dp
= (struct iconv_ucs
*)kobj_create((struct kobj_class
*)dcp
, M_ICONV
, M_WAITOK
);
97 from
= cspf
? cspf
->cp_from
: csp
->cp_from
;
102 dp
->convtype
|= KICONV_UCS_COMBINE
;
103 for (i
= 0; unicode_family
[i
].name
; i
++) {
104 if (strcasecmp(from
, unicode_family
[i
].name
) == 0)
105 dp
->convtype
|= unicode_family
[i
].from_flag
;
106 if (strcasecmp(to
, unicode_family
[i
].name
) == 0)
107 dp
->convtype
|= unicode_family
[i
].to_flag
;
109 if (strcmp(ENCODING_UNICODE
, ENCODING_UTF16
) == 0)
110 dp
->convtype
|= KICONV_UCS_UCS4
;
112 dp
->convtype
&= ~KICONV_UCS_UCS4
;
114 dp
->f_ctp
= dp
->t_ctp
= NULL
;
115 if (dp
->convtype
& KICONV_UCS_COMBINE
) {
116 if ((dp
->convtype
& KICONV_UCS_FROM_UTF8
) == 0 &&
117 (dp
->convtype
& KICONV_UCS_FROM_LE
) == 0) {
118 iconv_open(ENCODING_UNICODE
, from
, &dp
->f_ctp
);
120 if ((dp
->convtype
& KICONV_UCS_TO_UTF8
) == 0 &&
121 (dp
->convtype
& KICONV_UCS_TO_LE
) == 0) {
122 iconv_open(to
, ENCODING_UNICODE
, &dp
->t_ctp
);
127 if (dp
->convtype
& (KICONV_UCS_FROM_UTF8
| KICONV_UCS_TO_UTF8
))
128 iconv_open(KICONV_WCTYPE_NAME
, ENCODING_UTF8
, &dp
->ctype
);
131 if (dp
->convtype
& (KICONV_UCS_FROM_UTF8
| KICONV_UCS_FROM_LE
)) {
138 if (dp
->convtype
& (KICONV_UCS_TO_UTF8
| KICONV_UCS_TO_LE
))
/*
 * iconv_ucs_close: tear down the UCS converter instance passed as data.
 *
 * The visible lines call iconv_close() on the f_ctp, t_ctp and ctype
 * handles, decrement cp_refcount on d_cspf and/or d_csp depending on the
 * UTF-8 / LE convtype bits, and release the instance with
 * kobj_delete(data, M_ICONV).
 *
 * NOTE(review): the guards in front of the iconv_close() calls and of the
 * first refcount decrement are missing from this extract; they presumably
 * mirror the conditions used in iconv_ucs_open() -- confirm.
 */
145 iconv_ucs_close(void *data
)
147 struct iconv_ucs
*dp
= data
;
150 iconv_close(dp
->f_ctp
);
152 iconv_close(dp
->t_ctp
);
154 iconv_close(dp
->ctype
);
156 dp
->d_cspf
->cp_refcount
--;
157 else if (dp
->convtype
& (KICONV_UCS_FROM_UTF8
| KICONV_UCS_FROM_LE
))
158 dp
->d_csp
->cp_refcount
--;
159 if (dp
->convtype
& (KICONV_UCS_TO_UTF8
| KICONV_UCS_TO_LE
))
160 dp
->d_csp
->cp_refcount
--;
161 kobj_delete((struct kobj
*)data
, M_ICONV
);
/*
 * iconv_ucs_conv: convert characters from *inbuf to *outbuf in two halves,
 * going through ENCODING_UNICODE in between.
 *
 * d2p is the converter instance (struct iconv_ucs *).  ir/or start out as
 * *inbytesleft / *outbytesleft and the main loop runs while both are
 * non-zero; on the way out the consumed and produced byte counts
 * (in - ir, on - or) are subtracted from *inbytesleft / *outbytesleft.
 *
 * First half (source -> ENCODING_UNICODE), as far as visible here:
 *  - KICONV_UCS_FROM_UTF8: decode with utf8_to_ucs4(), optionally fold case
 *    with towlower()/towupper() when casetype is KICONV_FROM_LOWER /
 *    KICONV_FROM_UPPER and a ctype table is open, test for the surrogate
 *    range 0xd800..0xdfff and for values >= 0x110000, use encode_surrogate()
 *    when KICONV_UCS_UCS4 is set, then store the value big-endian in ucs[];
 *  - KICONV_UCS_COMBINE with f_ctp: delegate to iconv_convchr_case();
 *  - otherwise the source is treated as a subset of ENCODING_UNICODE, with
 *    LE handling and high/low surrogate checks ((*q & 0xfc) == 0xd8 / 0xdc).
 *
 * Second half (ENCODING_UNICODE -> destination), as far as visible here:
 *  - KICONV_UCS_TO_UTF8: rebuild the code point (decode_surrogate() for a
 *    4-byte value with KICONV_UCS_UCS4, else two bytes big-endian),
 *    optionally fold case for KICONV_LOWER / KICONV_UPPER, and emit it with
 *    ucs4_to_utf8(), whose NULL return is checked;
 *  - KICONV_UCS_COMBINE with t_ctp: delegate to iconv_convchr_case();
 *  - otherwise the destination is treated as a subset of ENCODING_UNICODE,
 *    with LE handling.
 *
 * NOTE(review): convchar is not referenced in the visible lines.
 * NOTE(review): the return type, most local declarations, buffer pointer
 * updates, error paths and return statements are missing from this extract,
 * so the error contract cannot be stated from here.
 */
166 iconv_ucs_conv(void *d2p
, const char **inbuf
,
167 size_t *inbytesleft
, char **outbuf
, size_t *outbytesleft
,
168 int convchar
, int casetype
)
170 struct iconv_ucs
*dp
= (struct iconv_ucs
*)d2p
;
172 size_t in
, on
, ir
, or, inlen
, outlen
, ucslen
;
178 if (inbuf
== NULL
|| *inbuf
== NULL
|| outbuf
== NULL
|| *outbuf
== NULL
)
180 ir
= in
= *inbytesleft
;
181 or = on
= *outbytesleft
;
185 while (ir
> 0 && or > 0) {
188 * The first half of conversion.
189 * (convert any code into ENCODING_UNICODE)
193 if (dp
->convtype
& KICONV_UCS_FROM_UTF8
) {
194 /* convert UTF-8 to ENCODING_UNICODE */
196 code
= utf8_to_ucs4(p
, &inlen
, ir
);
202 if (casetype
== KICONV_FROM_LOWER
&& dp
->ctype
) {
203 code
= towlower(code
, dp
->ctype
);
204 } else if (casetype
== KICONV_FROM_UPPER
&& dp
->ctype
) {
205 code
= towupper(code
, dp
->ctype
);
208 if ((code
>= 0xd800 && code
< 0xe000) || code
>= 0x110000 ) {
209 /* reserved for utf-16 surrogate pair */
210 /* invalid unicode */
216 if (dp
->convtype
& KICONV_UCS_UCS4
) {
218 code
= encode_surrogate(code
);
220 /* can't handle with ucs-2 */
228 /* save UCS-4 into ucs[] */
229 for (q
= ucs
, i
= ucslen
- 1 ; i
>= 0 ; i
--)
230 *q
++ = (code
>> (i
<< 3)) & 0xff;
232 } else if (dp
->convtype
& KICONV_UCS_COMBINE
&& dp
->f_ctp
) {
233 /* convert local code to ENCODING_UNICODE */
237 ret
= iconv_convchr_case(dp
->f_ctp
, &p
, &inlen
, (char **)&q
,
238 &ucslen
, casetype
& (KICONV_FROM_LOWER
| KICONV_FROM_UPPER
));
245 /* src code is a proper subset of ENCODING_UNICODE */
247 if (dp
->convtype
& KICONV_UCS_FROM_LE
) {
255 if ((*q
& 0xfc) == 0xd8) {
256 if (dp
->convtype
& KICONV_UCS_UCS4
&&
257 dp
->convtype
& KICONV_UCS_FROM_UTF16
) {
260 /* invalid unicode */
273 if (dp
->convtype
& KICONV_UCS_FROM_LE
) {
280 if ((*q
& 0xfc) != 0xdc) {
281 /* invalid unicode */
289 * The second half of conversion.
290 * (convert ENCODING_UNICODE into any code)
293 if (dp
->convtype
& KICONV_UCS_TO_UTF8
) {
295 if (ucslen
== 4 && dp
->convtype
& KICONV_UCS_UCS4
) {
296 /* decode surrogate pair */
297 code
= decode_surrogate(p
);
299 code
= (ucs
[0] << 8) | ucs
[1];
302 if (casetype
== KICONV_LOWER
&& dp
->ctype
) {
303 code
= towlower(code
, dp
->ctype
);
304 } else if (casetype
== KICONV_UPPER
&& dp
->ctype
) {
305 code
= towupper(code
, dp
->ctype
);
309 if (ucs4_to_utf8(code
, q
, &outlen
, or) == NULL
) {
319 } else if (dp
->convtype
& KICONV_UCS_COMBINE
&& dp
->t_ctp
) {
320 ret
= iconv_convchr_case(dp
->t_ctp
, &p
, &ucslen
, &dst
,
321 &or, casetype
& (KICONV_LOWER
| KICONV_UPPER
));
329 /* dst code is a proper subset of ENCODING_UNICODE */
337 if (dp
->convtype
& KICONV_UCS_TO_LE
) {
346 if ((dp
->convtype
& KICONV_UCS_UCS4
) == 0 ||
347 (dp
->convtype
& KICONV_UCS_TO_UTF16
) == 0) {
351 if (dp
->convtype
& KICONV_UCS_TO_LE
) {
367 *inbytesleft
-= in
- ir
;
368 *outbytesleft
-= on
- or;
/*
 * iconv_ucs_init: class initialisation hook.  Calls iconv_add() twice with
 * ENCODING_UNICODE as the first argument, once for each ordering of
 * ENCODING_UNICODE and ENCODING_UTF8 in the remaining two arguments, and
 * stores each result in error.
 *
 * NOTE(review): the return type, the declaration of error, any checks of
 * error and the return statement are missing from this extract.
 */
373 iconv_ucs_init(struct iconv_converter_class
*dcp
)
377 error
= iconv_add(ENCODING_UNICODE
, ENCODING_UNICODE
, ENCODING_UTF8
);
380 error
= iconv_add(ENCODING_UNICODE
, ENCODING_UTF8
, ENCODING_UNICODE
);
/*
 * iconv_ucs_done: bound to iconv_converter_done in iconv_ucs_methods.
 *
 * NOTE(review): the return type and the whole body are missing from this
 * extract, so nothing can be stated about what it does.
 */
387 iconv_ucs_done(struct iconv_converter_class
*dcp
)
/*
 * iconv_ucs_name: returns ENCODING_UNICODE; dcp is not used in the visible
 * lines.
 *
 * NOTE(review): the return type line is missing from this extract.
 */
393 iconv_ucs_name(struct iconv_converter_class
*dcp
)
395 return (ENCODING_UNICODE
);
/*
 * Method table binding the iconv_converter_{open,close,conv,init,done,name}
 * interface methods to the iconv_ucs_* implementations in this file.
 *
 * NOTE(review): the end of the initializer (terminating entry and closing
 * brace) is missing from this extract.
 */
398 static kobj_method_t iconv_ucs_methods
[] = {
399 KOBJMETHOD(iconv_converter_open
, iconv_ucs_open
),
400 KOBJMETHOD(iconv_converter_close
, iconv_ucs_close
),
401 KOBJMETHOD(iconv_converter_conv
, iconv_ucs_conv
),
402 KOBJMETHOD(iconv_converter_init
, iconv_ucs_init
),
403 KOBJMETHOD(iconv_converter_done
, iconv_ucs_done
),
404 KOBJMETHOD(iconv_converter_name
, iconv_ucs_name
),
408 KICONV_CONVERTER(ucs
, sizeof(struct iconv_ucs
));
/*
 * utf8_to_ucs4: decode the UTF-8 sequence starting at src into a code point.
 *
 * The lead byte selects the form: 0xxxxxxx (1 byte), 110xxxxx (2 bytes),
 * 1110xxxx (3 bytes) or 11110www (4 bytes); any other lead byte falls into
 * the "out of utf-16 range or having illegal bits" branch.  Each following
 * byte must have its top two bits equal to "10", and its low six bits are
 * ORed into ucs4.
 *
 * NOTE(review): the return type, the local declarations, the assignments
 * that set the width and initial bits, the shift that precedes each OR, the
 * use of utf8width / srclen and all return statements are missing from this
 * extract; the value returned on invalid input cannot be stated from here.
 */
411 utf8_to_ucs4(const char *src
, size_t *utf8width
, size_t srclen
)
417 * get leading 1 byte from utf-8
419 if ((*src
& 0x80) == 0) {
421 * leading 1 bit is "0"
423 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
426 /* get trailing 7 bits */
428 } else if ((*src
& 0xe0) == 0xc0) {
430 * leading 3 bits are "110"
431 * utf-8: 110xxxxx 10yyyyyy
432 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
435 /* get trailing 5 bits */
437 } else if ((*src
& 0xf0) == 0xe0) {
439 * leading 4 bits are "1110"
440 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
441 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
444 /* get trailing 4 bits */
446 } else if ((*src
& 0xf8) == 0xf0) {
448 * leading 5 bits are "11110"
449 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
450 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
453 /* get trailing 3 bits */
456 /* out of utf-16 range or having illegal bits */
464 * get left parts from utf-8
466 for (i
= 1 ; i
< w
; i
++) {
467 if ((*(src
+ i
) & 0xc0) != 0x80) {
468 /* invalid: leading 2 bits are not "10" */
471 /* concatenate trailing 6 bits into ucs4 */
473 ucs4
|= *(src
+ i
) & 0x3f;
/*
 * ucs4_to_utf8: encode the code point ucs4 as UTF-8.
 *
 * The visible branches pick the lead-byte prefix by magnitude: 0xc0 below
 * 0x800, 0xe0 below 0x10000 and 0xf0 below 0x200000.  Trailing bytes are
 * then written from the last position back to index 1, each as the low six
 * bits of ucs4 with the top bit set.
 *
 * iconv_ucs_conv() treats a NULL return from this function as failure.
 *
 * NOTE(review): the return type line, the local declarations, the first
 * (single-byte) branch, the width assignments, the use of dst / dstlen /
 * utf8width, the shift of ucs4 between trailing bytes, the lead-byte store
 * and the return statements are missing from this extract.
 */
481 ucs4_to_utf8(uint32_t ucs4
, char *dst
, size_t *utf8width
, size_t dstlen
)
487 * determine utf-8 width and leading bits
492 } else if (ucs4
< 0x800) {
494 lead
= 0xc0; /* "11" */
495 } else if (ucs4
< 0x10000) {
497 lead
= 0xe0; /* "111" */
498 } else if (ucs4
< 0x200000) {
500 lead
= 0xf0; /* "1111" */
512 for (i
= w
- 1 ; i
>= 1 ; i
--) {
513 /* get trailing 6 bits and put it with leading bit as "1" */
514 *(p
+ i
) = (ucs4
& 0x3f) | 0x80;
/*
 * Pack a code point at or above 0x10000 into a UTF-16 surrogate pair held in
 * one 32-bit value: the high surrogate (0xd800 | top 10 bits) occupies the
 * upper 16 bits and the low surrogate (0xdc00 | bottom 10 bits) the lower
 * 16 bits.  No range check is performed on code.
 */
static uint32_t
encode_surrogate(register uint32_t code)
{
	uint32_t offset, high_bits, low_bits;

	offset = code - 0x10000;
	/* top 10 of the 20 offset bits, moved up to bits 16..25 */
	high_bits = (offset << 6) & 0x3ff0000;
	/* bottom 10 offset bits stay in bits 0..9 */
	low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
/*
 * Rebuild a code point from a surrogate pair stored as four bytes in ucs:
 * ucs[0..1] hold the high surrogate and ucs[2..3] the low surrogate, most
 * significant byte first.  Only the 10 payload bits of each half are used;
 * the surrogate marker bits are not validated here.
 */
static uint32_t
decode_surrogate(register const u_char *ucs)
{
	uint32_t high_bits, low_bits;

	/* payload of the high surrogate becomes bits 10..19 */
	high_bits = ((ucs[0] & 0x3) << 18) | (ucs[1] << 10);
	/* payload of the low surrogate becomes bits 0..9 */
	low_bits = ((ucs[2] & 0x3) << 8) | ucs[3];

	return ((high_bits | low_bits) + 0x10000);
}