usr/src/common/unicode/uconv.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28 /*
  29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
  30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
  31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
  32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
  33  * the section 3C man pages.
  34  * Interface stability: Committed
  35  */
  36
  37 #include <sys/types.h>
  38 #ifdef  _KERNEL
  39 #include <sys/param.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/systm.h>
  42 #include <sys/debug.h>
  43 #include <sys/kmem.h>
  44 #include <sys/sunddi.h>
  45 #else
  46 #include <sys/u8_textprep.h>
  47 #endif  /* _KERNEL */
  48 #include <sys/byteorder.h>
  49 #include <sys/errno.h>
  50
  51
  52 /*
  53  * The max and min values of high and low surrogate pairs of UTF-16,
  54  * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
  55  */
  56 #define UCONV_U16_HI_MIN        (0xd800U)
  57 #define UCONV_U16_HI_MAX        (0xdbffU)
  58 #define UCONV_U16_LO_MIN        (0xdc00U)
  59 #define UCONV_U16_LO_MAX        (0xdfffU)
  60 #define UCONV_U16_BIT_SHIFT     (0x0400U)
  61 #define UCONV_U16_BIT_MASK      (0x0fffffU)
  62 #define UCONV_U16_START         (0x010000U)
  63
  64 /* The maximum value of Unicode coding space and ASCII coding space. */
  65 #define UCONV_UNICODE_MAX       (0x10ffffU)
  66 #define UCONV_ASCII_MAX         (0x7fU)
  67
  68 /* The mask values for input and output endians. */
  69 #define UCONV_IN_ENDIAN_MASKS   (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
  70 #define UCONV_OUT_ENDIAN_MASKS  (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
  71
  72 /* Native and reversed endian macros. */
  73 #ifdef  _BIG_ENDIAN
  74 #define UCONV_IN_NAT_ENDIAN     UCONV_IN_BIG_ENDIAN
  75 #define UCONV_IN_REV_ENDIAN     UCONV_IN_LITTLE_ENDIAN
  76 #define UCONV_OUT_NAT_ENDIAN    UCONV_OUT_BIG_ENDIAN
  77 #define UCONV_OUT_REV_ENDIAN    UCONV_OUT_LITTLE_ENDIAN
  78 #else
  79 #define UCONV_IN_NAT_ENDIAN     UCONV_IN_LITTLE_ENDIAN
  80 #define UCONV_IN_REV_ENDIAN     UCONV_IN_BIG_ENDIAN
  81 #define UCONV_OUT_NAT_ENDIAN    UCONV_OUT_LITTLE_ENDIAN
  82 #define UCONV_OUT_REV_ENDIAN    UCONV_OUT_BIG_ENDIAN
  83 #endif  /* _BIG_ENDIAN */
  84
  85 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
  86 #define UCONV_BOM_NORMAL        (0xfeffU)
  87 #define UCONV_BOM_SWAPPED       (0xfffeU)
  88 #define UCONV_BOM_SWAPPED_32    (0xfffe0000U)
  89
  90 /* UTF-32 boundaries based on UTF-8 character byte lengths. */
  91 #define UCONV_U8_ONE_BYTE       (0x7fU)
  92 #define UCONV_U8_TWO_BYTES      (0x7ffU)
  93 #define UCONV_U8_THREE_BYTES    (0xffffU)
  94 #define UCONV_U8_FOUR_BYTES     (0x10ffffU)
  95
  96 /* The common minimum and maximum values at the UTF-8 character bytes. */
  97 #define UCONV_U8_BYTE_MIN       (0x80U)
  98 #define UCONV_U8_BYTE_MAX       (0xbfU)
  99
 100 /*
 101  * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 102  * UTF-8 character bytes.
 103  */
 104 #define UCONV_U8_BIT_SHIFT      6
 105 #define UCONV_U8_BIT_MASK       0x3f
 106
 107 /*
 108  * The following vector shows remaining bytes in a UTF-8 character.
 109  * Index will be the first byte of the character.
 110  */
 111 static const uchar_t remaining_bytes_tbl[0x100] = {
 112         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 113         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 114         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 115         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 116         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 117         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 118         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 119         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 120         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 121         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 122         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 123         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 124
 125 /*      C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
 126         0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 127
 128 /*      D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
 129         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 130
 131 /*      E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
 132         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 133
 134 /*      F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
 135         3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 136 };
 137
 138 /*
 139  * The following is a vector of bit-masks to get used bits in
 140  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
 141  * the character.
 142  */
 143 #ifdef  _KERNEL
 144 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 145 #else
 146 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 147 #endif  /* _KERNEL */
 148
 149 /*
 150  * The following two vectors are to provide valid minimum and
 151  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
 152  * better illegal sequence checking. The index value must be the value of
 153  * the first byte of the UTF-8 character.
 154  */
 155 static const uchar_t valid_min_2nd_byte[0x100] = {
 156         0,    0,    0,    0,    0,    0,    0,    0,
 157         0,    0,    0,    0,    0,    0,    0,    0,
 158         0,    0,    0,    0,    0,    0,    0,    0,
 159         0,    0,    0,    0,    0,    0,    0,    0,
 160         0,    0,    0,    0,    0,    0,    0,    0,
 161         0,    0,    0,    0,    0,    0,    0,    0,
 162         0,    0,    0,    0,    0,    0,    0,    0,
 163         0,    0,    0,    0,    0,    0,    0,    0,
 164         0,    0,    0,    0,    0,    0,    0,    0,
 165         0,    0,    0,    0,    0,    0,    0,    0,
 166         0,    0,    0,    0,    0,    0,    0,    0,
 167         0,    0,    0,    0,    0,    0,    0,    0,
 168         0,    0,    0,    0,    0,    0,    0,    0,
 169         0,    0,    0,    0,    0,    0,    0,    0,
 170         0,    0,    0,    0,    0,    0,    0,    0,
 171         0,    0,    0,    0,    0,    0,    0,    0,
 172         0,    0,    0,    0,    0,    0,    0,    0,
 173         0,    0,    0,    0,    0,    0,    0,    0,
 174         0,    0,    0,    0,    0,    0,    0,    0,
 175         0,    0,    0,    0,    0,    0,    0,    0,
 176         0,    0,    0,    0,    0,    0,    0,    0,
 177         0,    0,    0,    0,    0,    0,    0,    0,
 178         0,    0,    0,    0,    0,    0,    0,    0,
 179         0,    0,    0,    0,    0,    0,    0,    0,
 180
 181 /*      C0    C1    C2    C3    C4    C5    C6    C7 */
 182         0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 183
 184 /*      C8    C9    CA    CB    CC    CD    CE    CF */
 185         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 186
 187 /*      D0    D1    D2    D3    D4    D5    D6    D7 */
 188         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 189
 190 /*      D8    D9    DA    DB    DC    DD    DE    DF */
 191         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 192
 193 /*      E0    E1    E2    E3    E4    E5    E6    E7 */
 194         0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 195
 196 /*      E8    E9    EA    EB    EC    ED    EE    EF */
 197         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 198
 199 /*      F0    F1    F2    F3    F4    F5    F6    F7 */
 200         0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
 201
 202         0,    0,    0,    0,    0,    0,    0,    0
 203 };
 204
 205 static const uchar_t valid_max_2nd_byte[0x100] = {
 206         0,    0,    0,    0,    0,    0,    0,    0,
 207         0,    0,    0,    0,    0,    0,    0,    0,
 208         0,    0,    0,    0,    0,    0,    0,    0,
 209         0,    0,    0,    0,    0,    0,    0,    0,
 210         0,    0,    0,    0,    0,    0,    0,    0,
 211         0,    0,    0,    0,    0,    0,    0,    0,
 212         0,    0,    0,    0,    0,    0,    0,    0,
 213         0,    0,    0,    0,    0,    0,    0,    0,
 214         0,    0,    0,    0,    0,    0,    0,    0,
 215         0,    0,    0,    0,    0,    0,    0,    0,
 216         0,    0,    0,    0,    0,    0,    0,    0,
 217         0,    0,    0,    0,    0,    0,    0,    0,
 218         0,    0,    0,    0,    0,    0,    0,    0,
 219         0,    0,    0,    0,    0,    0,    0,    0,
 220         0,    0,    0,    0,    0,    0,    0,    0,
 221         0,    0,    0,    0,    0,    0,    0,    0,
 222         0,    0,    0,    0,    0,    0,    0,    0,
 223         0,    0,    0,    0,    0,    0,    0,    0,
 224         0,    0,    0,    0,    0,    0,    0,    0,
 225         0,    0,    0,    0,    0,    0,    0,    0,
 226         0,    0,    0,    0,    0,    0,    0,    0,
 227         0,    0,    0,    0,    0,    0,    0,    0,
 228         0,    0,    0,    0,    0,    0,    0,    0,
 229         0,    0,    0,    0,    0,    0,    0,    0,
 230
 231 /*      C0    C1    C2    C3    C4    C5    C6    C7 */
 232         0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 233
 234 /*      C8    C9    CA    CB    CC    CD    CE    CF */
 235         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 236
 237 /*      D0    D1    D2    D3    D4    D5    D6    D7 */
 238         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 239
 240 /*      D8    D9    DA    DB    DC    DD    DE    DF */
 241         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 242
 243 /*      E0    E1    E2    E3    E4    E5    E6    E7 */
 244         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 245
 246 /*      E8    E9    EA    EB    EC    ED    EE    EF */
 247         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
 248
 249 /*      F0    F1    F2    F3    F4    F5    F6    F7 */
 250         0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
 251
 252         0,    0,    0,    0,    0,    0,    0,    0
 253 };
 254
 255
 256 static int
 257 check_endian(int flag, int *in, int *out)
 258 {
 259         *in = flag & UCONV_IN_ENDIAN_MASKS;
 260
 261         /* You cannot have both. */
 262         if (*in == UCONV_IN_ENDIAN_MASKS)
 263                 return (EBADF);
 264
 265         if (*in == 0)
 266                 *in = UCONV_IN_NAT_ENDIAN;
 267
 268         *out = flag & UCONV_OUT_ENDIAN_MASKS;
 269
 270         /* You cannot have both. */
 271         if (*out == UCONV_OUT_ENDIAN_MASKS)
 272                 return (EBADF);
 273
 274         if (*out == 0)
 275                 *out = UCONV_OUT_NAT_ENDIAN;
 276
 277         return (0);
 278 }
 279
 280 static boolean_t
 281 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
 282 {
 283         if (u16l > 0) {
 284                 if (*u16s == UCONV_BOM_NORMAL) {
 285                         *in = UCONV_IN_NAT_ENDIAN;
 286                         return (B_TRUE);
 287                 }
 288                 if (*u16s == UCONV_BOM_SWAPPED) {
 289                         *in = UCONV_IN_REV_ENDIAN;
 290                         return (B_TRUE);
 291                 }
 292         }
 293
 294         return (B_FALSE);
 295 }
 296
 297 static boolean_t
 298 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
 299 {
 300         if (u32l > 0) {
 301                 if (*u32s == UCONV_BOM_NORMAL) {
 302                         *in = UCONV_IN_NAT_ENDIAN;
 303                         return (B_TRUE);
 304                 }
 305                 if (*u32s == UCONV_BOM_SWAPPED_32) {
 306                         *in = UCONV_IN_REV_ENDIAN;
 307                         return (B_TRUE);
 308                 }
 309         }
 310
 311         return (B_FALSE);
 312 }
 313
 314 int
 315 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
 316     uint32_t *u32s, size_t *utf32len, int flag)
 317 {
 318         int inendian;
 319         int outendian;
 320         size_t u16l;
 321         size_t u32l;
 322         uint32_t hi;
 323         uint32_t lo;
 324         boolean_t do_not_ignore_null;
 325
 326         /*
 327          * Do preliminary validity checks on parameters and collect info on
 328          * endians.
 329          */
 330         if (u16s == NULL || utf16len == NULL)
 331                 return (EILSEQ);
 332
 333         if (u32s == NULL || utf32len == NULL)
 334                 return (E2BIG);
 335
 336         if (check_endian(flag, &inendian, &outendian) != 0)
 337                 return (EBADF);
 338
 339         /*
 340          * Initialize input and output parameter buffer indices and
 341          * temporary variables.
 342          */
 343         u16l = u32l = 0;
 344         hi = 0;
 345         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 346
 347         /*
 348          * Check on the BOM at the beginning of the input buffer if required
 349          * and if there is indeed one, process it.
 350          */
 351         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 352             check_bom16(u16s, *utf16len, &inendian))
 353                 u16l++;
 354
 355         /*
 356          * Reset inendian and outendian so that after this point, those can be
 357          * used as condition values.
 358          */
 359         inendian &= UCONV_IN_NAT_ENDIAN;
 360         outendian &= UCONV_OUT_NAT_ENDIAN;
 361
 362         /*
 363          * If there is something in the input buffer and if necessary and
 364          * requested, save the BOM at the output buffer.
 365          */
 366         if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 367                 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
 368                     UCONV_BOM_SWAPPED_32;
 369
 370         /*
 371          * Do conversion; if encounter a surrogate pair, assemble high and
 372          * low pair values to form a UTF-32 character. If a half of a pair
 373          * exists alone, then, either it is an illegal (EILSEQ) or
 374          * invalid (EINVAL) value.
 375          */
 376         for (; u16l < *utf16len; u16l++) {
 377                 if (u16s[u16l] == 0 && do_not_ignore_null)
 378                         break;
 379
 380                 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
 381
 382                 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
 383                         if (hi)
 384                                 return (EILSEQ);
 385                         hi = lo;
 386                         continue;
 387                 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
 388                         if (! hi)
 389                                 return (EILSEQ);
 390                         lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
 391                             lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
 392                             + UCONV_U16_START;
 393                         hi = 0;
 394                 } else if (hi) {
 395                         return (EILSEQ);
 396                 }
 397
 398                 if (u32l >= *utf32len)
 399                         return (E2BIG);
 400
 401                 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
 402         }
 403
 404         /*
 405          * If high half didn't see low half, then, it's most likely the input
 406          * parameter is incomplete.
 407          */
 408         if (hi)
 409                 return (EINVAL);
 410
 411         /*
 412          * Save the number of consumed and saved characters. They do not
 413          * include terminating NULL character (U+0000) at the end of
 414          * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
 415          * the input buffer length is big enough to include the terminating
 416          * NULL character).
 417          */
 418         *utf16len = u16l;
 419         *utf32len = u32l;
 420
 421         return (0);
 422 }
 423
 424 int
 425 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
 426     uchar_t *u8s, size_t *utf8len, int flag)
 427 {
 428         int inendian;
 429         int outendian;
 430         size_t u16l;
 431         size_t u8l;
 432         uint32_t hi;
 433         uint32_t lo;
 434         boolean_t do_not_ignore_null;
 435
 436         if (u16s == NULL || utf16len == NULL)
 437                 return (EILSEQ);
 438
 439         if (u8s == NULL || utf8len == NULL)
 440                 return (E2BIG);
 441
 442         if (check_endian(flag, &inendian, &outendian) != 0)
 443                 return (EBADF);
 444
 445         u16l = u8l = 0;
 446         hi = 0;
 447         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 448
 449         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 450             check_bom16(u16s, *utf16len, &inendian))
 451                 u16l++;
 452
 453         inendian &= UCONV_IN_NAT_ENDIAN;
 454
 455         for (; u16l < *utf16len; u16l++) {
 456                 if (u16s[u16l] == 0 && do_not_ignore_null)
 457                         break;
 458
 459                 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
 460
 461                 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
 462                         if (hi)
 463                                 return (EILSEQ);
 464                         hi = lo;
 465                         continue;
 466                 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
 467                         if (! hi)
 468                                 return (EILSEQ);
 469                         lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
 470                             lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
 471                             + UCONV_U16_START;
 472                         hi = 0;
 473                 } else if (hi) {
 474                         return (EILSEQ);
 475                 }
 476
 477                 /*
 478                  * Now we convert a UTF-32 character into a UTF-8 character.
 479                  * Unicode coding space is between U+0000 and U+10FFFF;
 480                  * anything bigger is an illegal character.
 481                  */
 482                 if (lo <= UCONV_U8_ONE_BYTE) {
 483                         if (u8l >= *utf8len)
 484                                 return (E2BIG);
 485                         u8s[u8l++] = (uchar_t)lo;
 486                 } else if (lo <= UCONV_U8_TWO_BYTES) {
 487                         if ((u8l + 1) >= *utf8len)
 488                                 return (E2BIG);
 489                         u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
 490                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
 491                 } else if (lo <= UCONV_U8_THREE_BYTES) {
 492                         if ((u8l + 2) >= *utf8len)
 493                                 return (E2BIG);
 494                         u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
 495                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
 496                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
 497                 } else if (lo <= UCONV_U8_FOUR_BYTES) {
 498                         if ((u8l + 3) >= *utf8len)
 499                                 return (E2BIG);
 500                         u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
 501                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
 502                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
 503                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
 504                 } else {
 505                         return (EILSEQ);
 506                 }
 507         }
 508
 509         if (hi)
 510                 return (EINVAL);
 511
 512         *utf16len = u16l;
 513         *utf8len = u8l;
 514
 515         return (0);
 516 }
 517
 518 int
 519 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
 520     uint16_t *u16s, size_t *utf16len, int flag)
 521 {
 522         int inendian;
 523         int outendian;
 524         size_t u16l;
 525         size_t u32l;
 526         uint32_t hi;
 527         uint32_t lo;
 528         boolean_t do_not_ignore_null;
 529
 530         if (u32s == NULL || utf32len == NULL)
 531                 return (EILSEQ);
 532
 533         if (u16s == NULL || utf16len == NULL)
 534                 return (E2BIG);
 535
 536         if (check_endian(flag, &inendian, &outendian) != 0)
 537                 return (EBADF);
 538
 539         u16l = u32l = 0;
 540         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 541
 542         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 543             check_bom32(u32s, *utf32len, &inendian))
 544                 u32l++;
 545
 546         inendian &= UCONV_IN_NAT_ENDIAN;
 547         outendian &= UCONV_OUT_NAT_ENDIAN;
 548
 549         if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 550                 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
 551                     UCONV_BOM_SWAPPED;
 552
 553         for (; u32l < *utf32len; u32l++) {
 554                 if (u32s[u32l] == 0 && do_not_ignore_null)
 555                         break;
 556
 557                 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
 558
 559                 /*
 560                  * Anything bigger than the Unicode coding space, i.e.,
 561                  * Unicode scalar value bigger than U+10FFFF, is an illegal
 562                  * character.
 563                  */
 564                 if (hi > UCONV_UNICODE_MAX)
 565                         return (EILSEQ);
 566
 567                 /*
 568                  * Anything bigger than U+FFFF must be converted into
 569                  * a surrogate pair in UTF-16.
 570                  */
 571                 if (hi >= UCONV_U16_START) {
 572                         lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
 573                             UCONV_U16_LO_MIN;
 574                         hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
 575                             UCONV_U16_HI_MIN;
 576
 577                         if ((u16l + 1) >= *utf16len)
 578                                 return (E2BIG);
 579
 580                         if (outendian) {
 581                                 u16s[u16l++] = (uint16_t)hi;
 582                                 u16s[u16l++] = (uint16_t)lo;
 583                         } else {
 584                                 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
 585                                 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
 586                         }
 587                 } else {
 588                         if (u16l >= *utf16len)
 589                                 return (E2BIG);
 590                         u16s[u16l++] = (outendian) ? (uint16_t)hi :
 591                             BSWAP_16(((uint16_t)hi));
 592                 }
 593         }
 594
 595         *utf16len = u16l;
 596         *utf32len = u32l;
 597
 598         return (0);
 599 }
 600
 601 int
 602 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
 603     uchar_t *u8s, size_t *utf8len, int flag)
 604 {
 605         int inendian;
 606         int outendian;
 607         size_t u32l;
 608         size_t u8l;
 609         uint32_t lo;
 610         boolean_t do_not_ignore_null;
 611
 612         if (u32s == NULL || utf32len == NULL)
 613                 return (EILSEQ);
 614
 615         if (u8s == NULL || utf8len == NULL)
 616                 return (E2BIG);
 617
 618         if (check_endian(flag, &inendian, &outendian) != 0)
 619                 return (EBADF);
 620
 621         u32l = u8l = 0;
 622         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 623
 624         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 625             check_bom32(u32s, *utf32len, &inendian))
 626                 u32l++;
 627
 628         inendian &= UCONV_IN_NAT_ENDIAN;
 629
 630         for (; u32l < *utf32len; u32l++) {
 631                 if (u32s[u32l] == 0 && do_not_ignore_null)
 632                         break;
 633
 634                 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
 635
 636                 if (lo <= UCONV_U8_ONE_BYTE) {
 637                         if (u8l >= *utf8len)
 638                                 return (E2BIG);
 639                         u8s[u8l++] = (uchar_t)lo;
 640                 } else if (lo <= UCONV_U8_TWO_BYTES) {
 641                         if ((u8l + 1) >= *utf8len)
 642                                 return (E2BIG);
 643                         u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
 644                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
 645                 } else if (lo <= UCONV_U8_THREE_BYTES) {
 646                         if ((u8l + 2) >= *utf8len)
 647                                 return (E2BIG);
 648                         u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
 649                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
 650                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
 651                 } else if (lo <= UCONV_U8_FOUR_BYTES) {
 652                         if ((u8l + 3) >= *utf8len)
 653                                 return (E2BIG);
 654                         u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
 655                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
 656                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
 657                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
 658                 } else {
 659                         return (EILSEQ);
 660                 }
 661         }
 662
 663         *utf32len = u32l;
 664         *utf8len = u8l;
 665
 666         return (0);
 667 }
 668
 669 int
 670 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
 671     uint16_t *u16s, size_t *utf16len, int flag)
 672 {
 673         int inendian;
 674         int outendian;
 675         size_t u16l;
 676         size_t u8l;
 677         uint32_t hi;
 678         uint32_t lo;
 679         int remaining_bytes;
 680         int first_b;
 681         boolean_t do_not_ignore_null;
 682
 683         if (u8s == NULL || utf8len == NULL)
 684                 return (EILSEQ);
 685
 686         if (u16s == NULL || utf16len == NULL)
 687                 return (E2BIG);
 688
 689         if (check_endian(flag, &inendian, &outendian) != 0)
 690                 return (EBADF);
 691
 692         u16l = u8l = 0;
 693         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 694
 695         outendian &= UCONV_OUT_NAT_ENDIAN;
 696
 697         if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 698                 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
 699                     UCONV_BOM_SWAPPED;
 700
 701         for (; u8l < *utf8len; ) {
 702                 if (u8s[u8l] == 0 && do_not_ignore_null)
 703                         break;
 704
 705                 /*
 706                  * Collect a UTF-8 character and convert it to a UTF-32
 707                  * character. In doing so, we screen out illegally formed
 708                  * UTF-8 characters and treat such as illegal characters.
 709                  * The algorithm at below also screens out anything bigger
 710                  * than the U+10FFFF.
 711                  *
 712                  * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
 713                  * more details on the illegal values of UTF-8 character
 714                  * bytes.
 715                  */
 716                 hi = (uint32_t)u8s[u8l++];
 717
 718                 if (hi > UCONV_ASCII_MAX) {
 719                         if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
 720                                 return (EILSEQ);
 721
 722                         first_b = hi;
 723                         hi = hi & u8_masks_tbl[remaining_bytes];
 724
 725                         for (; remaining_bytes > 0; remaining_bytes--) {
 726                                 /*
 727                                  * If we have no more bytes, the current
 728                                  * UTF-8 character is incomplete.
 729                                  */
 730                                 if (u8l >= *utf8len)
 731                                         return (EINVAL);
 732
 733                                 lo = (uint32_t)u8s[u8l++];
 734
 735                                 if (first_b) {
 736                                         if (lo < valid_min_2nd_byte[first_b] ||
 737                                             lo > valid_max_2nd_byte[first_b])
 738                                                 return (EILSEQ);
 739                                         first_b = 0;
 740                                 } else if (lo < UCONV_U8_BYTE_MIN ||
 741                                     lo > UCONV_U8_BYTE_MAX) {
 742                                         return (EILSEQ);
 743                                 }
 744                                 hi = (hi << UCONV_U8_BIT_SHIFT) |
 745                                     (lo & UCONV_U8_BIT_MASK);
 746                         }
 747                 }
 748
 749                 if (hi >= UCONV_U16_START) {
 750                         lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
 751                             UCONV_U16_LO_MIN;
 752                         hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
 753                             UCONV_U16_HI_MIN;
 754
 755                         if ((u16l + 1) >= *utf16len)
 756                                 return (E2BIG);
 757
 758                         if (outendian) {
 759                                 u16s[u16l++] = (uint16_t)hi;
 760                                 u16s[u16l++] = (uint16_t)lo;
 761                         } else {
 762                                 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
 763                                 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
 764                         }
 765                 } else {
 766                         if (u16l >= *utf16len)
 767                                 return (E2BIG);
 768
 769                         u16s[u16l++] = (outendian) ? (uint16_t)hi :
 770                             BSWAP_16(((uint16_t)hi));
 771                 }
 772         }
 773
 774         *utf16len = u16l;
 775         *utf8len = u8l;
 776
 777         return (0);
 778 }
 779
 780 int
 781 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
 782     uint32_t *u32s, size_t *utf32len, int flag)
 783 {
 784         int inendian;
 785         int outendian;
 786         size_t u32l;
 787         size_t u8l;
 788         uint32_t hi;
 789         uint32_t c;
 790         int remaining_bytes;
 791         int first_b;
 792         boolean_t do_not_ignore_null;
 793
 794         if (u8s == NULL || utf8len == NULL)
 795                 return (EILSEQ);
 796
 797         if (u32s == NULL || utf32len == NULL)
 798                 return (E2BIG);
 799
 800         if (check_endian(flag, &inendian, &outendian) != 0)
 801                 return (EBADF);
 802
 803         u32l = u8l = 0;
 804         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 805
 806         outendian &= UCONV_OUT_NAT_ENDIAN;
 807
 808         if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 809                 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
 810                     UCONV_BOM_SWAPPED_32;
 811
 812         for (; u8l < *utf8len; ) {
 813                 if (u8s[u8l] == 0 && do_not_ignore_null)
 814                         break;
 815
 816                 hi = (uint32_t)u8s[u8l++];
 817
 818                 if (hi > UCONV_ASCII_MAX) {
 819                         if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
 820                                 return (EILSEQ);
 821
 822                         first_b = hi;
 823                         hi = hi & u8_masks_tbl[remaining_bytes];
 824
 825                         for (; remaining_bytes > 0; remaining_bytes--) {
 826                                 if (u8l >= *utf8len)
 827                                         return (EINVAL);
 828
 829                                 c = (uint32_t)u8s[u8l++];
 830
 831                                 if (first_b) {
 832                                         if (c < valid_min_2nd_byte[first_b] ||
 833                                             c > valid_max_2nd_byte[first_b])
 834                                                 return (EILSEQ);
 835                                         first_b = 0;
 836                                 } else if (c < UCONV_U8_BYTE_MIN ||
 837                                     c > UCONV_U8_BYTE_MAX) {
 838                                         return (EILSEQ);
 839                                 }
 840                                 hi = (hi << UCONV_U8_BIT_SHIFT) |
 841                                     (c & UCONV_U8_BIT_MASK);
 842                         }
 843                 }
 844
 845                 if (u32l >= *utf32len)
 846                         return (E2BIG);
 847
 848                 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
 849         }
 850
 851         *utf32len = u32l;
 852         *utf8len = u8l;
 853
 854         return (0);
 855 }