8990 /opt/onbld/gk is useless
[unleashed.git] / usr / src / common / unicode / uconv.c
blobfd65fc99b57618a718e647941a11df567c945688
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */
37 #include <sys/types.h>
38 #ifdef _KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/systm.h>
42 #include <sys/debug.h>
43 #include <sys/kmem.h>
44 #include <sys/sunddi.h>
45 #else
46 #include <sys/u8_textprep.h>
47 #endif /* _KERNEL */
48 #include <sys/byteorder.h>
49 #include <sys/errno.h>
/*
 * The max and min values of high and low surrogate pairs of UTF-16,
 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
 *
 * Note that UCONV_U16_BIT_SHIFT is not a shift count: the conversion
 * functions in this file use it as a multiplier, divisor and modulus
 * (0x400 == 2^10) when assembling and splitting surrogate pairs.
 */
#define	UCONV_U16_HI_MIN		(0xd800U)
#define	UCONV_U16_HI_MAX		(0xdbffU)
#define	UCONV_U16_LO_MIN		(0xdc00U)
#define	UCONV_U16_LO_MAX		(0xdfffU)
#define	UCONV_U16_BIT_SHIFT		(0x0400U)
#define	UCONV_U16_BIT_MASK		(0x0fffffU)
#define	UCONV_U16_START			(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX		(0x10ffffU)
#define	UCONV_ASCII_MAX			(0x7fU)

/* The mask values for input and output endians. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros. Which flag counts as "native" is
 * selected at compile time by whether _BIG_ENDIAN is defined.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is the 16-bit byte-swapped form and
 * UCONV_BOM_SWAPPED_32 is the 32-bit byte-swapped form of UCONV_BOM_NORMAL.
 */
#define	UCONV_BOM_NORMAL		(0xfeffU)
#define	UCONV_BOM_SWAPPED		(0xfffeU)
#define	UCONV_BOM_SWAPPED_32		(0xfffe0000U)

/* UTF-32 boundaries based on UTF-8 character byte lengths. */
#define	UCONV_U8_ONE_BYTE		(0x7fU)
#define	UCONV_U8_TWO_BYTES		(0x7ffU)
#define	UCONV_U8_THREE_BYTES		(0xffffU)
#define	UCONV_U8_FOUR_BYTES		(0x10ffffU)

/* The common minimum and maximum values at the UTF-8 character bytes. */
#define	UCONV_U8_BYTE_MIN		(0x80U)
#define	UCONV_U8_BYTE_MAX		(0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes.
 */
#define	UCONV_U8_BIT_SHIFT		6
#define	UCONV_U8_BIT_MASK		0x3f
108 * The following vector shows remaining bytes in a UTF-8 character.
109 * Index will be the first byte of the character.
111 static const uchar_t remaining_bytes_tbl[0x100] = {
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
126 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
128 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
132 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
134 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
135 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
139 * The following is a vector of bit-masks to get used bits in
140 * the first byte of a UTF-8 character. Index is remaining bytes at above of
141 * the character.
143 #ifdef _KERNEL
144 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145 #else
146 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147 #endif /* _KERNEL */
150 * The following two vectors are to provide valid minimum and
151 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
152 * better illegal sequence checking. The index value must be the value of
153 * the first byte of the UTF-8 character.
155 static const uchar_t valid_min_2nd_byte[0x100] = {
156 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0,
181 /* C0 C1 C2 C3 C4 C5 C6 C7 */
182 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
184 /* C8 C9 CA CB CC CD CE CF */
185 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
187 /* D0 D1 D2 D3 D4 D5 D6 D7 */
188 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
190 /* D8 D9 DA DB DC DD DE DF */
191 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
193 /* E0 E1 E2 E3 E4 E5 E6 E7 */
194 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
196 /* E8 E9 EA EB EC ED EE EF */
197 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
199 /* F0 F1 F2 F3 F4 F5 F6 F7 */
200 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0
205 static const uchar_t valid_max_2nd_byte[0x100] = {
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0,
231 /* C0 C1 C2 C3 C4 C5 C6 C7 */
232 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
234 /* C8 C9 CA CB CC CD CE CF */
235 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
237 /* D0 D1 D2 D3 D4 D5 D6 D7 */
238 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
240 /* D8 D9 DA DB DC DD DE DF */
241 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
243 /* E0 E1 E2 E3 E4 E5 E6 E7 */
244 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
246 /* E8 E9 EA EB EC ED EE EF */
247 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
249 /* F0 F1 F2 F3 F4 F5 F6 F7 */
250 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
252 0, 0, 0, 0, 0, 0, 0, 0
256 static int
257 check_endian(int flag, int *in, int *out)
259 *in = flag & UCONV_IN_ENDIAN_MASKS;
261 /* You cannot have both. */
262 if (*in == UCONV_IN_ENDIAN_MASKS)
263 return (EBADF);
265 if (*in == 0)
266 *in = UCONV_IN_NAT_ENDIAN;
268 *out = flag & UCONV_OUT_ENDIAN_MASKS;
270 /* You cannot have both. */
271 if (*out == UCONV_OUT_ENDIAN_MASKS)
272 return (EBADF);
274 if (*out == 0)
275 *out = UCONV_OUT_NAT_ENDIAN;
277 return (0);
280 static boolean_t
281 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
283 if (u16l > 0) {
284 if (*u16s == UCONV_BOM_NORMAL) {
285 *in = UCONV_IN_NAT_ENDIAN;
286 return (B_TRUE);
288 if (*u16s == UCONV_BOM_SWAPPED) {
289 *in = UCONV_IN_REV_ENDIAN;
290 return (B_TRUE);
294 return (B_FALSE);
297 static boolean_t
298 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
300 if (u32l > 0) {
301 if (*u32s == UCONV_BOM_NORMAL) {
302 *in = UCONV_IN_NAT_ENDIAN;
303 return (B_TRUE);
305 if (*u32s == UCONV_BOM_SWAPPED_32) {
306 *in = UCONV_IN_REV_ENDIAN;
307 return (B_TRUE);
311 return (B_FALSE);
315 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
316 uint32_t *u32s, size_t *utf32len, int flag)
318 int inendian;
319 int outendian;
320 size_t u16l;
321 size_t u32l;
322 uint32_t hi;
323 uint32_t lo;
324 boolean_t do_not_ignore_null;
327 * Do preliminary validity checks on parameters and collect info on
328 * endians.
330 if (u16s == NULL || utf16len == NULL)
331 return (EILSEQ);
333 if (u32s == NULL || utf32len == NULL)
334 return (E2BIG);
336 if (check_endian(flag, &inendian, &outendian) != 0)
337 return (EBADF);
340 * Initialize input and output parameter buffer indices and
341 * temporary variables.
343 u16l = u32l = 0;
344 hi = 0;
345 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
348 * Check on the BOM at the beginning of the input buffer if required
349 * and if there is indeed one, process it.
351 if ((flag & UCONV_IN_ACCEPT_BOM) &&
352 check_bom16(u16s, *utf16len, &inendian))
353 u16l++;
356 * Reset inendian and outendian so that after this point, those can be
357 * used as condition values.
359 inendian &= UCONV_IN_NAT_ENDIAN;
360 outendian &= UCONV_OUT_NAT_ENDIAN;
363 * If there is something in the input buffer and if necessary and
364 * requested, save the BOM at the output buffer.
366 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
367 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
368 UCONV_BOM_SWAPPED_32;
371 * Do conversion; if encounter a surrogate pair, assemble high and
372 * low pair values to form a UTF-32 character. If a half of a pair
373 * exists alone, then, either it is an illegal (EILSEQ) or
374 * invalid (EINVAL) value.
376 for (; u16l < *utf16len; u16l++) {
377 if (u16s[u16l] == 0 && do_not_ignore_null)
378 break;
380 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
382 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
383 if (hi)
384 return (EILSEQ);
385 hi = lo;
386 continue;
387 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
388 if (! hi)
389 return (EILSEQ);
390 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
391 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
392 + UCONV_U16_START;
393 hi = 0;
394 } else if (hi) {
395 return (EILSEQ);
398 if (u32l >= *utf32len)
399 return (E2BIG);
401 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
405 * If high half didn't see low half, then, it's most likely the input
406 * parameter is incomplete.
408 if (hi)
409 return (EINVAL);
412 * Save the number of consumed and saved characters. They do not
413 * include terminating NULL character (U+0000) at the end of
414 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
415 * the input buffer length is big enough to include the terminating
416 * NULL character).
418 *utf16len = u16l;
419 *utf32len = u32l;
421 return (0);
425 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
426 uchar_t *u8s, size_t *utf8len, int flag)
428 int inendian;
429 int outendian;
430 size_t u16l;
431 size_t u8l;
432 uint32_t hi;
433 uint32_t lo;
434 boolean_t do_not_ignore_null;
436 if (u16s == NULL || utf16len == NULL)
437 return (EILSEQ);
439 if (u8s == NULL || utf8len == NULL)
440 return (E2BIG);
442 if (check_endian(flag, &inendian, &outendian) != 0)
443 return (EBADF);
445 u16l = u8l = 0;
446 hi = 0;
447 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
449 if ((flag & UCONV_IN_ACCEPT_BOM) &&
450 check_bom16(u16s, *utf16len, &inendian))
451 u16l++;
453 inendian &= UCONV_IN_NAT_ENDIAN;
455 for (; u16l < *utf16len; u16l++) {
456 if (u16s[u16l] == 0 && do_not_ignore_null)
457 break;
459 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
461 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
462 if (hi)
463 return (EILSEQ);
464 hi = lo;
465 continue;
466 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
467 if (! hi)
468 return (EILSEQ);
469 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
470 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
471 + UCONV_U16_START;
472 hi = 0;
473 } else if (hi) {
474 return (EILSEQ);
478 * Now we convert a UTF-32 character into a UTF-8 character.
479 * Unicode coding space is between U+0000 and U+10FFFF;
480 * anything bigger is an illegal character.
482 if (lo <= UCONV_U8_ONE_BYTE) {
483 if (u8l >= *utf8len)
484 return (E2BIG);
485 u8s[u8l++] = (uchar_t)lo;
486 } else if (lo <= UCONV_U8_TWO_BYTES) {
487 if ((u8l + 1) >= *utf8len)
488 return (E2BIG);
489 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
490 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
491 } else if (lo <= UCONV_U8_THREE_BYTES) {
492 if ((u8l + 2) >= *utf8len)
493 return (E2BIG);
494 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
495 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
496 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
497 } else if (lo <= UCONV_U8_FOUR_BYTES) {
498 if ((u8l + 3) >= *utf8len)
499 return (E2BIG);
500 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
501 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
502 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
503 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
504 } else {
505 return (EILSEQ);
509 if (hi)
510 return (EINVAL);
512 *utf16len = u16l;
513 *utf8len = u8l;
515 return (0);
519 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
520 uint16_t *u16s, size_t *utf16len, int flag)
522 int inendian;
523 int outendian;
524 size_t u16l;
525 size_t u32l;
526 uint32_t hi;
527 uint32_t lo;
528 boolean_t do_not_ignore_null;
530 if (u32s == NULL || utf32len == NULL)
531 return (EILSEQ);
533 if (u16s == NULL || utf16len == NULL)
534 return (E2BIG);
536 if (check_endian(flag, &inendian, &outendian) != 0)
537 return (EBADF);
539 u16l = u32l = 0;
540 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
542 if ((flag & UCONV_IN_ACCEPT_BOM) &&
543 check_bom32(u32s, *utf32len, &inendian))
544 u32l++;
546 inendian &= UCONV_IN_NAT_ENDIAN;
547 outendian &= UCONV_OUT_NAT_ENDIAN;
549 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
550 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
551 UCONV_BOM_SWAPPED;
553 for (; u32l < *utf32len; u32l++) {
554 if (u32s[u32l] == 0 && do_not_ignore_null)
555 break;
557 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
560 * Anything bigger than the Unicode coding space, i.e.,
561 * Unicode scalar value bigger than U+10FFFF, is an illegal
562 * character.
564 if (hi > UCONV_UNICODE_MAX)
565 return (EILSEQ);
568 * Anything bigger than U+FFFF must be converted into
569 * a surrogate pair in UTF-16.
571 if (hi >= UCONV_U16_START) {
572 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
573 UCONV_U16_LO_MIN;
574 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
575 UCONV_U16_HI_MIN;
577 if ((u16l + 1) >= *utf16len)
578 return (E2BIG);
580 if (outendian) {
581 u16s[u16l++] = (uint16_t)hi;
582 u16s[u16l++] = (uint16_t)lo;
583 } else {
584 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
585 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
587 } else {
588 if (u16l >= *utf16len)
589 return (E2BIG);
590 u16s[u16l++] = (outendian) ? (uint16_t)hi :
591 BSWAP_16(((uint16_t)hi));
595 *utf16len = u16l;
596 *utf32len = u32l;
598 return (0);
602 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
603 uchar_t *u8s, size_t *utf8len, int flag)
605 int inendian;
606 int outendian;
607 size_t u32l;
608 size_t u8l;
609 uint32_t lo;
610 boolean_t do_not_ignore_null;
612 if (u32s == NULL || utf32len == NULL)
613 return (EILSEQ);
615 if (u8s == NULL || utf8len == NULL)
616 return (E2BIG);
618 if (check_endian(flag, &inendian, &outendian) != 0)
619 return (EBADF);
621 u32l = u8l = 0;
622 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
624 if ((flag & UCONV_IN_ACCEPT_BOM) &&
625 check_bom32(u32s, *utf32len, &inendian))
626 u32l++;
628 inendian &= UCONV_IN_NAT_ENDIAN;
630 for (; u32l < *utf32len; u32l++) {
631 if (u32s[u32l] == 0 && do_not_ignore_null)
632 break;
634 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
636 if (lo <= UCONV_U8_ONE_BYTE) {
637 if (u8l >= *utf8len)
638 return (E2BIG);
639 u8s[u8l++] = (uchar_t)lo;
640 } else if (lo <= UCONV_U8_TWO_BYTES) {
641 if ((u8l + 1) >= *utf8len)
642 return (E2BIG);
643 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
644 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
645 } else if (lo <= UCONV_U8_THREE_BYTES) {
646 if ((u8l + 2) >= *utf8len)
647 return (E2BIG);
648 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
649 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
650 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
651 } else if (lo <= UCONV_U8_FOUR_BYTES) {
652 if ((u8l + 3) >= *utf8len)
653 return (E2BIG);
654 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
655 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
656 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
657 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
658 } else {
659 return (EILSEQ);
663 *utf32len = u32l;
664 *utf8len = u8l;
666 return (0);
670 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
671 uint16_t *u16s, size_t *utf16len, int flag)
673 int inendian;
674 int outendian;
675 size_t u16l;
676 size_t u8l;
677 uint32_t hi;
678 uint32_t lo;
679 int remaining_bytes;
680 int first_b;
681 boolean_t do_not_ignore_null;
683 if (u8s == NULL || utf8len == NULL)
684 return (EILSEQ);
686 if (u16s == NULL || utf16len == NULL)
687 return (E2BIG);
689 if (check_endian(flag, &inendian, &outendian) != 0)
690 return (EBADF);
692 u16l = u8l = 0;
693 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
695 outendian &= UCONV_OUT_NAT_ENDIAN;
697 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
698 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
699 UCONV_BOM_SWAPPED;
701 for (; u8l < *utf8len; ) {
702 if (u8s[u8l] == 0 && do_not_ignore_null)
703 break;
706 * Collect a UTF-8 character and convert it to a UTF-32
707 * character. In doing so, we screen out illegally formed
708 * UTF-8 characters and treat such as illegal characters.
709 * The algorithm at below also screens out anything bigger
710 * than the U+10FFFF.
712 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
713 * more details on the illegal values of UTF-8 character
714 * bytes.
716 hi = (uint32_t)u8s[u8l++];
718 if (hi > UCONV_ASCII_MAX) {
719 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
720 return (EILSEQ);
722 first_b = hi;
723 hi = hi & u8_masks_tbl[remaining_bytes];
725 for (; remaining_bytes > 0; remaining_bytes--) {
727 * If we have no more bytes, the current
728 * UTF-8 character is incomplete.
730 if (u8l >= *utf8len)
731 return (EINVAL);
733 lo = (uint32_t)u8s[u8l++];
735 if (first_b) {
736 if (lo < valid_min_2nd_byte[first_b] ||
737 lo > valid_max_2nd_byte[first_b])
738 return (EILSEQ);
739 first_b = 0;
740 } else if (lo < UCONV_U8_BYTE_MIN ||
741 lo > UCONV_U8_BYTE_MAX) {
742 return (EILSEQ);
744 hi = (hi << UCONV_U8_BIT_SHIFT) |
745 (lo & UCONV_U8_BIT_MASK);
749 if (hi >= UCONV_U16_START) {
750 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
751 UCONV_U16_LO_MIN;
752 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
753 UCONV_U16_HI_MIN;
755 if ((u16l + 1) >= *utf16len)
756 return (E2BIG);
758 if (outendian) {
759 u16s[u16l++] = (uint16_t)hi;
760 u16s[u16l++] = (uint16_t)lo;
761 } else {
762 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
763 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
765 } else {
766 if (u16l >= *utf16len)
767 return (E2BIG);
769 u16s[u16l++] = (outendian) ? (uint16_t)hi :
770 BSWAP_16(((uint16_t)hi));
774 *utf16len = u16l;
775 *utf8len = u8l;
777 return (0);
781 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
782 uint32_t *u32s, size_t *utf32len, int flag)
784 int inendian;
785 int outendian;
786 size_t u32l;
787 size_t u8l;
788 uint32_t hi;
789 uint32_t c;
790 int remaining_bytes;
791 int first_b;
792 boolean_t do_not_ignore_null;
794 if (u8s == NULL || utf8len == NULL)
795 return (EILSEQ);
797 if (u32s == NULL || utf32len == NULL)
798 return (E2BIG);
800 if (check_endian(flag, &inendian, &outendian) != 0)
801 return (EBADF);
803 u32l = u8l = 0;
804 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
806 outendian &= UCONV_OUT_NAT_ENDIAN;
808 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
809 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
810 UCONV_BOM_SWAPPED_32;
812 for (; u8l < *utf8len; ) {
813 if (u8s[u8l] == 0 && do_not_ignore_null)
814 break;
816 hi = (uint32_t)u8s[u8l++];
818 if (hi > UCONV_ASCII_MAX) {
819 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
820 return (EILSEQ);
822 first_b = hi;
823 hi = hi & u8_masks_tbl[remaining_bytes];
825 for (; remaining_bytes > 0; remaining_bytes--) {
826 if (u8l >= *utf8len)
827 return (EINVAL);
829 c = (uint32_t)u8s[u8l++];
831 if (first_b) {
832 if (c < valid_min_2nd_byte[first_b] ||
833 c > valid_max_2nd_byte[first_b])
834 return (EILSEQ);
835 first_b = 0;
836 } else if (c < UCONV_U8_BYTE_MIN ||
837 c > UCONV_U8_BYTE_MAX) {
838 return (EILSEQ);
840 hi = (hi << UCONV_U8_BIT_SHIFT) |
841 (c & UCONV_U8_BIT_MASK);
845 if (u32l >= *utf32len)
846 return (E2BIG);
848 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
851 *utf32len = u32l;
852 *utf8len = u8l;
854 return (0);