drm/radeon: add missing ttm_eu_backoff_reservation to radeon_bo_list_validate
[dragonfly.git] / lib / libc / locale / utf8.c
blobe92792198b69b71e8e40b8ea205bfbf568e8919c
1 /*
2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
3 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
5 * Copyright (c) 2002-2004 Tim J. Robbins
6 * All rights reserved.
8 * Copyright (c) 2011 The FreeBSD Foundation
9 * All rights reserved.
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
36 * WCSBIN_EOF - Indicate EOF on input buffer.
38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8
39 * has already been escaped), on bytes-to-wchars and
40 * wchars-to-bytes. Escaping of other illegal codes will
41 * still occur on input but de-escaping will not occur
42 * on output (they will remain in the surrogate space).
44 * WCSBIN_LONGCODES - Allow 4-byte codes > 0x10FFFF, 5-byte and 6-byte sequences
45 * (normally illegal), otherwise escape it on input
46 * and fail on output.
48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail.
51 #include <sys/param.h>
53 #include <errno.h>
54 #include <limits.h>
55 #include <runetype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <wchar.h>
59 #include "mblocal.h"
/*
 * NOTE(review): this global is only declared here; the code visible in this
 * file sets the per-locale member of the same name instead - confirm whether
 * the declaration is still needed.
 */
extern int __mb_sb_limit;

/*
 * Forward declarations of the UTF-8 conversion methods.  _UTF8_init()
 * stores pointers to each of them in the locale's ctype method table.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
		    size_t, size_t, mbstate_t * __restrict);
/* Buffer-oriented converters that take the WCSBIN_* flags. */
static size_t	_UTF8_mbintowcr(wchar_t * __restrict dst,
				const char * __restrict src,
				size_t dlen, size_t *slen, int flags);
static size_t	_UTF8_wcrtombin(char * __restrict dst,
				const wchar_t * __restrict src,
				size_t dlen, size_t *slen, int flags);
/*
 * Conversion state carried in the caller's mbstate_t between calls of the
 * restartable converters.  A state with want == 0 is the initial state.
 */
typedef struct {
	wchar_t	ch;		/* bits of the character assembled so far */
	int	want;		/* octets still needed to finish the sequence */
	wchar_t	lbound;		/* lowest value legal for this sequence length */
} _UTF8State;
86 int
87 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
90 l->__mbrtowc = _UTF8_mbrtowc;
91 l->__wcrtomb = _UTF8_wcrtomb;
92 l->__mbsinit = _UTF8_mbsinit;
93 l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
94 l->__wcsnrtombs = _UTF8_wcsnrtombs;
95 l->__mbintowcr = _UTF8_mbintowcr;
96 l->__wcrtombin = _UTF8_wcrtombin;
97 l->runes = rl;
98 l->__mb_cur_max = 4;
100 * UCS-4 encoding used as the internal representation, so
101 * slots 0x0080-0x00FF are occuped and must be excluded
102 * from the single byte ctype by setting the limit.
104 l->__mb_sb_limit = 128;
106 return (0);
109 static int
110 _UTF8_mbsinit(const mbstate_t *ps)
113 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
116 static size_t
117 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
118 mbstate_t * __restrict ps)
120 _UTF8State *us;
121 int ch, i, mask, want;
122 wchar_t lbound, wch;
124 us = (_UTF8State *)ps;
126 if (us->want < 0 || us->want > 4) {
127 errno = EINVAL;
128 return ((size_t)-1);
131 if (s == NULL) {
132 s = "";
133 n = 1;
134 pwc = NULL;
137 if (n == 0)
138 /* Incomplete multibyte sequence */
139 return ((size_t)-2);
141 if (us->want == 0) {
143 * Determine the number of octets that make up this character
144 * from the first octet, and a mask that extracts the
145 * interesting bits of the first octet. We already know
146 * the character is at least two bytes long.
148 * We also specify a lower bound for the character code to
149 * detect redundant, non-"shortest form" encodings. For
150 * example, the sequence C0 80 is _not_ a legal representation
151 * of the null character. This enforces a 1-to-1 mapping
152 * between character codes and their multibyte representations.
154 ch = (unsigned char)*s;
155 if ((ch & 0x80) == 0) {
156 /* Fast path for plain ASCII characters. */
157 if (pwc != NULL)
158 *pwc = ch;
159 return (ch != '\0' ? 1 : 0);
161 if ((ch & 0xe0) == 0xc0) {
162 mask = 0x1f;
163 want = 2;
164 lbound = 0x80;
165 } else if ((ch & 0xf0) == 0xe0) {
166 mask = 0x0f;
167 want = 3;
168 lbound = 0x800;
169 } else if ((ch & 0xf8) == 0xf0) {
170 mask = 0x07;
171 want = 4;
172 lbound = 0x10000;
173 } else {
175 * Malformed input; input is not UTF-8.
177 errno = EILSEQ;
178 return ((size_t)-1);
180 } else {
181 want = us->want;
182 lbound = us->lbound;
186 * Decode the octet sequence representing the character in chunks
187 * of 6 bits, most significant first.
189 if (us->want == 0)
190 wch = (unsigned char)*s++ & mask;
191 else
192 wch = us->ch;
194 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
195 if ((*s & 0xc0) != 0x80) {
197 * Malformed input; bad characters in the middle
198 * of a character.
200 errno = EILSEQ;
201 return ((size_t)-1);
203 wch <<= 6;
204 wch |= *s++ & 0x3f;
206 if (i < want) {
207 /* Incomplete multibyte sequence. */
208 us->want = want - i;
209 us->lbound = lbound;
210 us->ch = wch;
211 return ((size_t)-2);
213 if (wch < lbound || wch > 0x10ffff) {
215 * Malformed input; redundant encoding or illegal
216 * code sequence.
218 errno = EILSEQ;
219 return ((size_t)-1);
221 if (pwc != NULL)
222 *pwc = wch;
223 us->want = 0;
224 return (wch == L'\0' ? 0 : want);
227 static size_t
228 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
229 size_t nms, size_t len, mbstate_t * __restrict ps)
231 _UTF8State *us;
232 const char *s;
233 size_t nchr;
234 wchar_t wc;
235 size_t nb;
237 us = (_UTF8State *)ps;
239 s = *src;
240 nchr = 0;
242 if (dst == NULL) {
244 * The fast path in the loop below is not safe if an ASCII
245 * character appears as anything but the first byte of a
246 * multibyte sequence. Check now to avoid doing it in the loop.
248 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
249 errno = EILSEQ;
250 return ((size_t)-1);
252 for (;;) {
253 if (nms > 0 && (signed char)*s > 0)
255 * Fast path for plain ASCII characters
256 * excluding NUL.
258 nb = 1;
259 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
260 (size_t)-1)
261 /* Invalid sequence - mbrtowc() sets errno. */
262 return ((size_t)-1);
263 else if (nb == 0 || nb == (size_t)-2)
264 return (nchr);
265 s += nb;
266 nms -= nb;
267 nchr++;
269 /*NOTREACHED*/
273 * The fast path in the loop below is not safe if an ASCII
274 * character appears as anything but the first byte of a
275 * multibyte sequence. Check now to avoid doing it in the loop.
277 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
278 errno = EILSEQ;
279 return ((size_t)-1);
281 while (len-- > 0) {
282 if (nms > 0 && (signed char)*s > 0) {
284 * Fast path for plain ASCII characters
285 * excluding NUL.
287 *dst = (wchar_t)*s;
288 nb = 1;
289 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
290 (size_t)-1) {
291 *src = s;
292 return ((size_t)-1);
293 } else if (nb == (size_t)-2) {
294 *src = s + nms;
295 return (nchr);
296 } else if (nb == 0) {
297 *src = NULL;
298 return (nchr);
300 s += nb;
301 nms -= nb;
302 nchr++;
303 dst++;
305 *src = s;
306 return (nchr);
309 static size_t
310 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
312 _UTF8State *us;
313 unsigned char lead;
314 int i, len;
316 us = (_UTF8State *)ps;
318 if (us->want != 0) {
319 errno = EINVAL;
320 return ((size_t)-1);
323 if (s == NULL)
324 /* Reset to initial shift state (no-op) */
325 return (1);
328 * Determine the number of octets needed to represent this character.
329 * We always output the shortest sequence possible. Also specify the
330 * first few bits of the first octet, which contains the information
331 * about the sequence length.
333 if ((wc & ~0x7f) == 0) {
334 /* Fast path for plain ASCII characters. */
335 *s = (char)wc;
336 return (1);
337 } else if ((wc & ~0x7ff) == 0) {
338 lead = 0xc0;
339 len = 2;
340 } else if ((wc & ~0xffff) == 0) {
341 lead = 0xe0;
342 len = 3;
343 } else if (wc <= 0x10ffff) {
344 lead = 0xf0;
345 len = 4;
346 } else {
347 errno = EILSEQ;
348 return ((size_t)-1);
352 * Output the octets representing the character in chunks
353 * of 6 bits, least significant last. The first octet is
354 * a special case because it contains the sequence length
355 * information.
357 for (i = len - 1; i > 0; i--) {
358 s[i] = (wc & 0x3f) | 0x80;
359 wc >>= 6;
361 *s = (wc & 0xff) | lead;
363 return (len);
366 static size_t
367 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
368 size_t nwc, size_t len, mbstate_t * __restrict ps)
370 _UTF8State *us;
371 char buf[MB_LEN_MAX];
372 const wchar_t *s;
373 size_t nbytes;
374 size_t nb;
376 us = (_UTF8State *)ps;
378 if (us->want != 0) {
379 errno = EINVAL;
380 return ((size_t)-1);
383 s = *src;
384 nbytes = 0;
386 if (dst == NULL) {
387 while (nwc-- > 0) {
388 if (0 <= *s && *s < 0x80)
389 /* Fast path for plain ASCII characters. */
390 nb = 1;
391 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
392 (size_t)-1)
393 /* Invalid character - wcrtomb() sets errno. */
394 return ((size_t)-1);
395 if (*s == L'\0')
396 return (nbytes + nb - 1);
397 s++;
398 nbytes += nb;
400 return (nbytes);
403 while (len > 0 && nwc-- > 0) {
404 if (0 <= *s && *s < 0x80) {
405 /* Fast path for plain ASCII characters. */
406 nb = 1;
407 *dst = *s;
408 } else if (len > (size_t)MB_CUR_MAX) {
409 /* Enough space to translate in-place. */
410 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
411 *src = s;
412 return ((size_t)-1);
414 } else {
416 * May not be enough space; use temp. buffer.
418 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
419 *src = s;
420 return ((size_t)-1);
422 if (nb > (int)len)
423 /* MB sequence for character won't fit. */
424 break;
425 (void) memcpy(dst, buf, nb);
427 if (*s == L'\0') {
428 *src = NULL;
429 return (nbytes + nb - 1);
431 s++;
432 dst += nb;
433 len -= nb;
434 nbytes += nb;
436 *src = s;
437 return (nbytes);
441 * Clean binary to wchar buffer conversions. This is basically like a normal
442 * buffer conversion but with a sane argument API and escaping. See none.c
443 * for a more complete description.
445 static size_t
446 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src,
447 size_t dlen, size_t *slen, int flags)
449 size_t i;
450 size_t j;
451 size_t k;
452 size_t n = *slen;
453 int ch, mask, want;
454 wchar_t lbound, wch;
456 for (i = j = 0; i < n; ++i) {
457 if (j == dlen)
458 break;
459 ch = (unsigned char)src[i];
461 if ((ch & 0x80) == 0) {
462 /* Fast path for plain ASCII characters. */
463 if (dst)
464 dst[j] = ch;
465 ++j;
466 continue;
468 if ((ch & 0xe0) == 0xc0) {
469 mask = 0x1f;
470 want = 2;
471 lbound = 0x80;
472 } else if ((ch & 0xf0) == 0xe0) {
473 mask = 0x0f;
474 want = 3;
475 lbound = 0x800;
476 } else if ((ch & 0xf8) == 0xf0) {
477 mask = 0x07;
478 want = 4;
479 lbound = 0x10000;
480 } else if ((ch & 0xfc) == 0xf8) {
481 /* normally illegal, handled down below */
482 mask = 0x03;
483 want = 5;
484 lbound = 0x200000;
485 } else if ((ch & 0xfe) == 0xfc) {
486 /* normally illegal, handled down below */
487 mask = 0x01;
488 want = 6;
489 lbound = 0x4000000;
490 } else {
492 * Malformed input; input is not UTF-8, escape
493 * with UTF-8B.
495 if (flags & WCSBIN_STRICT) {
496 if (i == 0) {
497 errno = EILSEQ;
498 return ((size_t)-1);
500 break;
502 if (dst)
503 dst[j] = 0xDC00 | ch;
504 ++j;
505 continue;
509 * Construct wchar_t from multibyte sequence.
511 wch = ch & mask;
512 for (k = 1; k < want; ++k) {
514 * Stop if not enough input (don't do this early
515 * so we can detect illegal characters as they occur
516 * in the stream).
518 * If termination is requested force-escape all chars.
520 if (i + k >= n) {
521 if (flags & WCSBIN_EOF) {
522 want = n - i;
523 goto forceesc;
525 goto breakout;
528 ch = src[i+k];
529 if ((ch & 0xc0) != 0x80) {
531 * Malformed input, bad characters in the
532 * middle of a multibyte sequence. Escape
533 * with UTF-8B.
535 if (flags & WCSBIN_STRICT) {
536 if (i == 0) {
537 errno = EILSEQ;
538 return ((size_t)-1);
540 goto breakout;
542 if (dst)
543 dst[j] = 0xDC00 | (unsigned char)src[i];
544 ++j;
545 goto loopup;
547 wch <<= 6;
548 wch |= ch & 0x3f;
552 * Check validity of the wchar. If invalid we could escape
553 * just the first character and loop up, but it ought to be
554 * more readable if we escape all the chars in the sequence
555 * (since they are all >= 0x80 and might represent a legacy
556 * 5-byte or 6-byte code).
558 if (wch < lbound ||
559 ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff)) {
560 goto forceesc;
564 * Check if wch is a surrogate code (which also encloses our
565 * UTF-8B escaping range). This is normally illegal in UTF8.
566 * If it is, we need to escape each characer in the sequence.
567 * Breakout if there isn't enough output buffer space.
569 * If (flags & WCSBIN_SURRO) the caller wishes to accept
570 * surrogate codes, i.e. the input might potentially already
571 * be escaped UTF8-B or unchecked UTF-16 that was converted
572 * into UTF-8.
574 if ((flags & WCSBIN_SURRO) == 0 &&
575 wch >= 0xD800 && wch <= 0xDFFF) {
576 forceesc:
577 if (j + want > dlen)
578 break;
579 if (flags & WCSBIN_STRICT) {
580 if (i == 0) {
581 errno = EILSEQ;
582 return ((size_t)-1);
584 break;
586 for (k = 0; k < want; ++k) {
587 if (dst) {
588 dst[j] = 0xDC00 |
589 (unsigned char)src[i+k];
591 ++j;
593 i += k - 1;
594 } else {
595 i += k - 1;
596 if (dst)
597 dst[j] = wch;
598 ++j;
600 loopup:
603 breakout:
604 *slen = i;
606 return j;
609 static size_t
610 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src,
611 size_t dlen, size_t *slen, int flags)
613 size_t i;
614 size_t j;
615 size_t k;
616 size_t n = *slen;
617 size_t len;
618 unsigned char lead;
619 wchar_t wc;
621 for (i = j = 0; i < n; ++i) {
622 if (j == dlen)
623 break;
624 wc = src[i];
626 if ((wc & ~0x7f) == 0) {
627 /* Fast path for plain ASCII characters. */
628 if (dst)
629 dst[j] = (unsigned char)wc;
630 ++j;
631 continue;
633 if ((wc & ~0x7ff) == 0) {
634 lead = 0xc0;
635 len = 2;
636 } else if (wc >= 0xDC80 && wc <= 0xDCFF &&
637 (flags & WCSBIN_SURRO) == 0) {
638 if (flags & WCSBIN_STRICT) {
640 * STRICT without SURRO is an error for
641 * surrogates.
643 if (i == 0) {
644 errno = EILSEQ;
645 return ((size_t)-1);
647 break;
649 if (dst)
650 dst[j] = (unsigned char)wc;
651 ++j;
652 continue;
653 } else if ((wc & ~0xffff) == 0) {
654 if (wc >= 0xD800 && wc <= 0xDFFF &&
655 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) ==
656 WCSBIN_STRICT) {
658 * Surrogates in general are an error
659 * if STRICT is specified and SURRO is not
660 * specified.
662 if (i == 0) {
663 errno = EILSEQ;
664 return ((size_t)-1);
666 break;
668 lead = 0xe0;
669 len = 3;
670 } else if (wc <= 0x10ffff) {
671 lead = 0xf0;
672 len = 4;
673 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) {
674 /* normally illegal */
675 lead = 0xf0;
676 len = 4;
677 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) {
678 /* normally illegal */
679 lead = 0xf8;
680 len = 5;
681 } else if ((flags & WCSBIN_LONGCODES) &&
682 (uint32_t)wc < 0x80000000U) {
683 /* normally illegal */
684 lead = 0xfc;
685 len = 6;
686 } else {
687 if (i == 0) {
688 errno = EILSEQ;
689 return ((size_t)-1);
691 /* stop here, process error on next loop */
692 break;
696 * Output the octets representing the character in chunks
697 * of 6 bits, least significant last. The first octet is
698 * a special case because it contains the sequence length
699 * information.
701 if (j + len > dlen)
702 break;
703 k = j;
704 j += len;
705 if (dst) {
706 while (--len > 0) {
707 dst[k + len] = (wc & 0x3f) | 0x80;
708 wc >>= 6;
710 dst[k] = (wc & 0xff) | lead;
713 *slen = i;
715 return j;
/*
 * Public entry point for UTF-8 to wchar_t buffer conversion.  Passes all
 * arguments unchanged to _UTF8_mbintowcr() and returns its result.
 */
size_t
utf8towcr(wchar_t * __restrict dst, const char * __restrict src,
    size_t dlen, size_t *slen, int flags)
{
	size_t produced;

	produced = _UTF8_mbintowcr(dst, src, dlen, slen, flags);
	return (produced);
}
/*
 * Public entry point for wchar_t to UTF-8 buffer conversion.  Passes all
 * arguments unchanged to _UTF8_wcrtombin() and returns its result.
 */
size_t
wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src,
    size_t dlen, size_t *slen, int flags)
{
	size_t produced;

	produced = _UTF8_wcrtombin(dst, src, dlen, slen, flags);
	return (produced);
}