2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
3 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
5 * Copyright (c) 2002-2004 Tim J. Robbins
8 * Copyright (c) 2011 The FreeBSD Foundation
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * WCSBIN_EOF - Indicate EOF on input buffer.
38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8
39 * has already been escaped), on bytes-to-wchars and
40 * wchars-to-bytes. Escaping of other illegal codes will
41 * still occur on input but de-escaping will not occur
42 * on output (they will remain in the surrogate space).
44 * WCSBIN_LONGCODES - Allow 4-byte >= 0x10FFFF, 5-byte and 6-byte sequences
45 * (normally illegal), otherwise escape it on input
48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail.
51 #include <sys/param.h>
61 extern int __mb_sb_limit
;
63 static size_t _UTF8_mbrtowc(wchar_t * __restrict
, const char * __restrict
,
64 size_t, mbstate_t * __restrict
);
65 static int _UTF8_mbsinit(const mbstate_t *);
66 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict
,
67 const char ** __restrict
, size_t, size_t,
68 mbstate_t * __restrict
);
69 static size_t _UTF8_wcrtomb(char * __restrict
, wchar_t,
70 mbstate_t * __restrict
);
71 static size_t _UTF8_wcsnrtombs(char * __restrict
, const wchar_t ** __restrict
,
72 size_t, size_t, mbstate_t * __restrict
);
73 static size_t _UTF8_mbintowcr(wchar_t * __restrict dst
,
74 const char * __restrict src
,
75 size_t dlen
, size_t *slen
, int flags
);
76 static size_t _UTF8_wcrtombin(char * __restrict dst
,
77 const wchar_t * __restrict src
,
78 size_t dlen
, size_t *slen
, int flags
);
87 _UTF8_init(struct xlocale_ctype
*l
, _RuneLocale
*rl
)
90 l
->__mbrtowc
= _UTF8_mbrtowc
;
91 l
->__wcrtomb
= _UTF8_wcrtomb
;
92 l
->__mbsinit
= _UTF8_mbsinit
;
93 l
->__mbsnrtowcs
= _UTF8_mbsnrtowcs
;
94 l
->__wcsnrtombs
= _UTF8_wcsnrtombs
;
95 l
->__mbintowcr
= _UTF8_mbintowcr
;
96 l
->__wcrtombin
= _UTF8_wcrtombin
;
100 * UCS-4 encoding used as the internal representation, so
101 * slots 0x0080-0x00FF are occupied and must be excluded
102 * from the single byte ctype by setting the limit.
104 l
->__mb_sb_limit
= 128;
/*
 * Returns nonzero when ps is NULL, or when the `want` field of the
 * _UTF8State that ps is cast to equals 0; returns zero otherwise.
 * NOTE(review): `want` presumably counts the bytes still expected for a
 * partially read multibyte sequence, making 0 the initial state --
 * confirm against the _UTF8State definition.
 */
110 _UTF8_mbsinit(const mbstate_t *ps
)
113 return (ps
== NULL
|| ((const _UTF8State
*)ps
)->want
== 0);
117 _UTF8_mbrtowc(wchar_t * __restrict pwc
, const char * __restrict s
, size_t n
,
118 mbstate_t * __restrict ps
)
121 int ch
, i
, mask
, want
;
124 us
= (_UTF8State
*)ps
;
126 if (us
->want
< 0 || us
->want
> 4) {
138 /* Incomplete multibyte sequence */
143 * Determine the number of octets that make up this character
144 * from the first octet, and a mask that extracts the
145 * interesting bits of the first octet. We already know
146 * the character is at least two bytes long.
148 * We also specify a lower bound for the character code to
149 * detect redundant, non-"shortest form" encodings. For
150 * example, the sequence C0 80 is _not_ a legal representation
151 * of the null character. This enforces a 1-to-1 mapping
152 * between character codes and their multibyte representations.
154 ch
= (unsigned char)*s
;
155 if ((ch
& 0x80) == 0) {
156 /* Fast path for plain ASCII characters. */
159 return (ch
!= '\0' ? 1 : 0);
161 if ((ch
& 0xe0) == 0xc0) {
165 } else if ((ch
& 0xf0) == 0xe0) {
169 } else if ((ch
& 0xf8) == 0xf0) {
175 * Malformed input; input is not UTF-8.
186 * Decode the octet sequence representing the character in chunks
187 * of 6 bits, most significant first.
190 wch
= (unsigned char)*s
++ & mask
;
194 for (i
= (us
->want
== 0) ? 1 : 0; i
< MIN(want
, n
); i
++) {
195 if ((*s
& 0xc0) != 0x80) {
197 * Malformed input; bad characters in the middle
207 /* Incomplete multibyte sequence. */
213 if (wch
< lbound
|| wch
> 0x10ffff) {
215 * Malformed input; redundant encoding or illegal
224 return (wch
== L
'\0' ? 0 : want
);
228 _UTF8_mbsnrtowcs(wchar_t * __restrict dst
, const char ** __restrict src
,
229 size_t nms
, size_t len
, mbstate_t * __restrict ps
)
237 us
= (_UTF8State
*)ps
;
244 * The fast path in the loop below is not safe if an ASCII
245 * character appears as anything but the first byte of a
246 * multibyte sequence. Check now to avoid doing it in the loop.
248 if (nms
> 0 && us
->want
> 0 && (signed char)*s
> 0) {
253 if (nms
> 0 && (signed char)*s
> 0)
255 * Fast path for plain ASCII characters
259 else if ((nb
= _UTF8_mbrtowc(&wc
, s
, nms
, ps
)) ==
261 /* Invalid sequence - mbrtowc() sets errno. */
263 else if (nb
== 0 || nb
== (size_t)-2)
273 * The fast path in the loop below is not safe if an ASCII
274 * character appears as anything but the first byte of a
275 * multibyte sequence. Check now to avoid doing it in the loop.
277 if (nms
> 0 && len
> 0 && us
->want
> 0 && (signed char)*s
> 0) {
282 if (nms
> 0 && (signed char)*s
> 0) {
284 * Fast path for plain ASCII characters
289 } else if ((nb
= _UTF8_mbrtowc(dst
, s
, nms
, ps
)) ==
293 } else if (nb
== (size_t)-2) {
296 } else if (nb
== 0) {
310 _UTF8_wcrtomb(char * __restrict s
, wchar_t wc
, mbstate_t * __restrict ps
)
316 us
= (_UTF8State
*)ps
;
324 /* Reset to initial shift state (no-op) */
328 * Determine the number of octets needed to represent this character.
329 * We always output the shortest sequence possible. Also specify the
330 * first few bits of the first octet, which contains the information
331 * about the sequence length.
333 if ((wc
& ~0x7f) == 0) {
334 /* Fast path for plain ASCII characters. */
337 } else if ((wc
& ~0x7ff) == 0) {
340 } else if ((wc
& ~0xffff) == 0) {
343 } else if (wc
<= 0x10ffff) {
352 * Output the octets representing the character in chunks
353 * of 6 bits, least significant last. The first octet is
354 * a special case because it contains the sequence length
357 for (i
= len
- 1; i
> 0; i
--) {
358 s
[i
] = (wc
& 0x3f) | 0x80;
361 *s
= (wc
& 0xff) | lead
;
367 _UTF8_wcsnrtombs(char * __restrict dst
, const wchar_t ** __restrict src
,
368 size_t nwc
, size_t len
, mbstate_t * __restrict ps
)
371 char buf
[MB_LEN_MAX
];
376 us
= (_UTF8State
*)ps
;
388 if (0 <= *s
&& *s
< 0x80)
389 /* Fast path for plain ASCII characters. */
391 else if ((nb
= _UTF8_wcrtomb(buf
, *s
, ps
)) ==
393 /* Invalid character - wcrtomb() sets errno. */
396 return (nbytes
+ nb
- 1);
403 while (len
> 0 && nwc
-- > 0) {
404 if (0 <= *s
&& *s
< 0x80) {
405 /* Fast path for plain ASCII characters. */
408 } else if (len
> (size_t)MB_CUR_MAX
) {
409 /* Enough space to translate in-place. */
410 if ((nb
= _UTF8_wcrtomb(dst
, *s
, ps
)) == (size_t)-1) {
416 * May not be enough space; use temp. buffer.
418 if ((nb
= _UTF8_wcrtomb(buf
, *s
, ps
)) == (size_t)-1) {
423 /* MB sequence for character won't fit. */
425 (void) memcpy(dst
, buf
, nb
);
429 return (nbytes
+ nb
- 1);
441 * Clean binary to wchar buffer conversions. This is basically like a normal
442 * buffer conversion but with a sane argument API and escaping. See none.c
443 * for a more complete description.
446 _UTF8_mbintowcr(wchar_t * __restrict dst
, const char * __restrict src
,
447 size_t dlen
, size_t *slen
, int flags
)
456 for (i
= j
= 0; i
< n
; ++i
) {
459 ch
= (unsigned char)src
[i
];
461 if ((ch
& 0x80) == 0) {
462 /* Fast path for plain ASCII characters. */
468 if ((ch
& 0xe0) == 0xc0) {
472 } else if ((ch
& 0xf0) == 0xe0) {
476 } else if ((ch
& 0xf8) == 0xf0) {
480 } else if ((ch
& 0xfc) == 0xf8) {
481 /* normally illegal, handled down below */
485 } else if ((ch
& 0xfe) == 0xfc) {
486 /* normally illegal, handled down below */
492 * Malformed input; input is not UTF-8, escape
495 if (flags
& WCSBIN_STRICT
) {
503 dst
[j
] = 0xDC00 | ch
;
509 * Construct wchar_t from multibyte sequence.
512 for (k
= 1; k
< want
; ++k
) {
514 * Stop if not enough input (don't do this early
515 * so we can detect illegal characters as they occur
518 * If termination is requested force-escape all chars.
521 if (flags
& WCSBIN_EOF
) {
529 if ((ch
& 0xc0) != 0x80) {
531 * Malformed input, bad characters in the
532 * middle of a multibyte sequence. Escape
535 if (flags
& WCSBIN_STRICT
) {
543 dst
[j
] = 0xDC00 | (unsigned char)src
[i
];
552 * Check validity of the wchar. If invalid we could escape
553 * just the first character and loop up, but it ought to be
554 * more readable if we escape all the chars in the sequence
555 * (since they are all >= 0x80 and might represent a legacy
556 * 5-byte or 6-byte code).
559 ((flags
& WCSBIN_LONGCODES
) == 0 && wch
> 0x10ffff)) {
564 * Check if wch is a surrogate code (which also encloses our
565 * UTF-8B escaping range). This is normally illegal in UTF8.
566 * If it is, we need to escape each character in the sequence.
567 * Breakout if there isn't enough output buffer space.
569 * If (flags & WCSBIN_SURRO) the caller wishes to accept
570 * surrogate codes, i.e. the input might potentially already
571 * be escaped UTF8-B or unchecked UTF-16 that was converted
574 if ((flags
& WCSBIN_SURRO
) == 0 &&
575 wch
>= 0xD800 && wch
<= 0xDFFF) {
579 if (flags
& WCSBIN_STRICT
) {
586 for (k
= 0; k
< want
; ++k
) {
589 (unsigned char)src
[i
+k
];
610 _UTF8_wcrtombin(char * __restrict dst
, const wchar_t * __restrict src
,
611 size_t dlen
, size_t *slen
, int flags
)
621 for (i
= j
= 0; i
< n
; ++i
) {
626 if ((wc
& ~0x7f) == 0) {
627 /* Fast path for plain ASCII characters. */
629 dst
[j
] = (unsigned char)wc
;
633 if ((wc
& ~0x7ff) == 0) {
636 } else if (wc
>= 0xDC80 && wc
<= 0xDCFF &&
637 (flags
& WCSBIN_SURRO
) == 0) {
638 if (flags
& WCSBIN_STRICT
) {
640 * STRICT without SURRO is an error for
650 dst
[j
] = (unsigned char)wc
;
653 } else if ((wc
& ~0xffff) == 0) {
654 if (wc
>= 0xD800 && wc
<= 0xDFFF &&
655 (flags
& (WCSBIN_SURRO
| WCSBIN_STRICT
)) ==
658 * Surrogates in general are an error
659 * if STRICT is specified and SURRO is not
670 } else if (wc
<= 0x10ffff) {
673 } else if ((flags
& WCSBIN_LONGCODES
) && wc
< 0x200000) {
674 /* normally illegal */
677 } else if ((flags
& WCSBIN_LONGCODES
) && wc
< 0x4000000) {
678 /* normally illegal */
681 } else if ((flags
& WCSBIN_LONGCODES
) &&
682 (uint32_t)wc
< 0x80000000U
) {
683 /* normally illegal */
691 /* stop here, process error on next loop */
696 * Output the octets representing the character in chunks
697 * of 6 bits, least significant last. The first octet is
698 * a special case because it contains the sequence length
707 dst
[k
+ len
] = (wc
& 0x3f) | 0x80;
710 dst
[k
] = (wc
& 0xff) | lead
;
/*
 * Thin wrapper: passes dst, src, dlen, slen and flags through unchanged
 * to _UTF8_mbintowcr() and returns that function's result.
 */
719 utf8towcr(wchar_t * __restrict dst
, const char * __restrict src
,
720 size_t dlen
, size_t *slen
, int flags
)
722 return _UTF8_mbintowcr(dst
, src
, dlen
, slen
, flags
);
/*
 * Thin wrapper: passes dst, src, dlen, slen and flags through unchanged
 * to _UTF8_wcrtombin() and returns that function's result.
 */
726 wcrtoutf8(char * __restrict dst
, const wchar_t * __restrict src
,
727 size_t dlen
, size_t *slen
, int flags
)
729 return _UTF8_wcrtombin(dst
, src
, dlen
, slen
, flags
);