usr/src/common/unicode/u8_textprep.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28
  29 /*
  30  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
  31  *
  32  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
  33  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
  34  * the section 3C man pages.
  35  * Interface stability: Committed.
  36  */
  37
  38 #include <sys/types.h>
  39 #ifdef  _KERNEL
  40 #include <sys/param.h>
  41 #include <sys/sysmacros.h>
  42 #include <sys/systm.h>
  43 #include <sys/debug.h>
  44 #include <sys/kmem.h>
  45 #include <sys/ddi.h>
  46 #include <sys/sunddi.h>
  47 #else
  48 #include <sys/u8_textprep.h>
  49 #include <strings.h>
  50 #endif  /* _KERNEL */
  51 #include <sys/byteorder.h>
  52 #include <sys/errno.h>
  53 #include <sys/u8_textprep_data.h>
  54
  55
  56 /* The maximum possible number of bytes in a UTF-8 character. */
  57 #define U8_MB_CUR_MAX                   (4)
  58
  59 /*
  60  * The maximum number of bytes needed for a UTF-8 character to cover
  61  * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
  62  */
  63 #define U8_MAX_BYTES_UCS2               (3)
  64
  65 /* The maximum possible number of bytes in a Stream-Safe Text. */
  66 #define U8_STREAM_SAFE_TEXT_MAX         (128)
  67
  68 /*
  69  * The maximum number of characters in a combining/conjoining sequence and
  70  * the actual upperbound limit of a combining/conjoining sequence.
  71  */
  72 #define U8_MAX_CHARS_A_SEQ              (32)
  73 #define U8_UPPER_LIMIT_IN_A_SEQ         (31)
  74
  75 /* The combining class value for Starter. */
  76 #define U8_COMBINING_CLASS_STARTER      (0)
  77
  78 /*
  79  * Some Hangul related macros at below.
  80  *
  81  * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
  82  * Vowels, and optional Trailing consonants in Unicode scalar values.
  83  *
  84  * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not
  85  * the actual U+11A8. This is due to that the trailing consonant is optional
  86  * and thus we are doing a pre-calculation of subtracting one.
  87  *
  88  * Each of 19 modern leading consonants has total 588 possible syllables since
  89  * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
  90  * no trailing consonant case, i.e., 21 x 28 = 588.
  91  *
  92  * We also have bunch of Hangul related macros at below. Please bear in mind
  93  * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is
  94  * a Hangul Jamo or not but the value does not guarantee that it is a Hangul
  95  * Jamo; it just guarantee that it will be most likely.
  96  */
  97 #define U8_HANGUL_SYL_FIRST             (0xAC00U)
  98 #define U8_HANGUL_SYL_LAST              (0xD7A3U)
  99
 100 #define U8_HANGUL_JAMO_L_FIRST          (0x1100U)
 101 #define U8_HANGUL_JAMO_L_LAST           (0x1112U)
 102 #define U8_HANGUL_JAMO_V_FIRST          (0x1161U)
 103 #define U8_HANGUL_JAMO_V_LAST           (0x1175U)
 104 #define U8_HANGUL_JAMO_T_FIRST          (0x11A7U)
 105 #define U8_HANGUL_JAMO_T_LAST           (0x11C2U)
 106
 107 #define U8_HANGUL_V_COUNT               (21)
 108 #define U8_HANGUL_VT_COUNT              (588)
 109 #define U8_HANGUL_T_COUNT               (28)
 110
 111 #define U8_HANGUL_JAMO_1ST_BYTE         (0xE1U)
 112
 113 #define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
 114         (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
 115         (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
 116         (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
 117
 118 #define U8_HANGUL_JAMO_L(u) \
 119         ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
 120
 121 #define U8_HANGUL_JAMO_V(u) \
 122         ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
 123
 124 #define U8_HANGUL_JAMO_T(u) \
 125         ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
 126
 127 #define U8_HANGUL_JAMO(u) \
 128         ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
 129
 130 #define U8_HANGUL_SYLLABLE(u) \
 131         ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
 132
 133 #define U8_HANGUL_COMPOSABLE_L_V(s, u) \
 134         ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
 135
 136 #define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
 137         ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
 138
 139 /* The types of decomposition mappings. */
 140 #define U8_DECOMP_BOTH                  (0xF5U)
 141 #define U8_DECOMP_CANONICAL             (0xF6U)
 142
 143 /* The indicator for 16-bit table. */
 144 #define U8_16BIT_TABLE_INDICATOR        (0x8000U)
 145
 146 /* The following are some convenience macros. */
 147 #define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
 148         (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
 149                 (uint32_t)(b3) & 0x3F;
 150
 151 #define U8_SIMPLE_SWAP(a, b, t) \
 152         (t) = (a); \
 153         (a) = (b); \
 154         (b) = (t);
 155
 156 #define U8_ASCII_TOUPPER(c) \
 157         (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
 158
 159 #define U8_ASCII_TOLOWER(c) \
 160         (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
 161
 162 #define U8_ISASCII(c)                   (((uchar_t)(c)) < 0x80U)
 163 /*
 164  * The following macro assumes that the two characters that are to be
 165  * swapped are adjacent to each other and 'a' comes before 'b'.
 166  *
 167  * If the assumptions are not met, then, the macro will fail.
 168  */
 169 #define U8_SWAP_COMB_MARKS(a, b) \
 170         for (k = 0; k < disp[(a)]; k++) \
 171                 u8t[k] = u8s[start[(a)] + k]; \
 172         for (k = 0; k < disp[(b)]; k++) \
 173                 u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
 174         start[(b)] = start[(a)] + disp[(b)]; \
 175         for (k = 0; k < disp[(a)]; k++) \
 176                 u8s[start[(b)] + k] = u8t[k]; \
 177         U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
 178         U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
 179
 180 /* The possible states during normalization. */
 181 typedef enum {
 182         U8_STATE_START = 0,
 183         U8_STATE_HANGUL_L = 1,
 184         U8_STATE_HANGUL_LV = 2,
 185         U8_STATE_HANGUL_LVT = 3,
 186         U8_STATE_HANGUL_V = 4,
 187         U8_STATE_HANGUL_T = 5,
 188         U8_STATE_COMBINING_MARK = 6
 189 } u8_normalization_states_t;
 190
 191 /*
 192  * The three vectors at below are used to check bytes of a given UTF-8
 193  * character are valid and not containing any malformed byte values.
 194  *
 195  * We used to have a quite relaxed UTF-8 binary representation but then there
 196  * was some security related issues and so the Unicode Consortium defined
 197  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
 198  * one more time at the Unicode 3.2. The following three tables are based on
 199  * that.
 200  */
 201
 202 #define U8_ILLEGAL_NEXT_BYTE_COMMON(c)  ((c) < 0x80 || (c) > 0xBF)
 203
 204 #define I_                              U8_ILLEGAL_CHAR
 205 #define O_                              U8_OUT_OF_RANGE_CHAR
 206
 207 const int8_t u8_number_of_bytes[0x100] = {
 208         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 209         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 210         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 211         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 212         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 213         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 214         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 215         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 216
 217 /*      80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
 218         I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 219
 220 /*      90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
 221         I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 222
 223 /*      A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
 224         I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 225
 226 /*      B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
 227         I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 228
 229 /*      C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
 230         I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 231
 232 /*      D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
 233         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 234
 235 /*      E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
 236         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
 237
 238 /*      F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
 239         4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
 240 };
 241
 242 #undef  I_
 243 #undef  O_
 244
 245 const uint8_t u8_valid_min_2nd_byte[0x100] = {
 246         0,    0,    0,    0,    0,    0,    0,    0,
 247         0,    0,    0,    0,    0,    0,    0,    0,
 248         0,    0,    0,    0,    0,    0,    0,    0,
 249         0,    0,    0,    0,    0,    0,    0,    0,
 250         0,    0,    0,    0,    0,    0,    0,    0,
 251         0,    0,    0,    0,    0,    0,    0,    0,
 252         0,    0,    0,    0,    0,    0,    0,    0,
 253         0,    0,    0,    0,    0,    0,    0,    0,
 254         0,    0,    0,    0,    0,    0,    0,    0,
 255         0,    0,    0,    0,    0,    0,    0,    0,
 256         0,    0,    0,    0,    0,    0,    0,    0,
 257         0,    0,    0,    0,    0,    0,    0,    0,
 258         0,    0,    0,    0,    0,    0,    0,    0,
 259         0,    0,    0,    0,    0,    0,    0,    0,
 260         0,    0,    0,    0,    0,    0,    0,    0,
 261         0,    0,    0,    0,    0,    0,    0,    0,
 262         0,    0,    0,    0,    0,    0,    0,    0,
 263         0,    0,    0,    0,    0,    0,    0,    0,
 264         0,    0,    0,    0,    0,    0,    0,    0,
 265         0,    0,    0,    0,    0,    0,    0,    0,
 266         0,    0,    0,    0,    0,    0,    0,    0,
 267         0,    0,    0,    0,    0,    0,    0,    0,
 268         0,    0,    0,    0,    0,    0,    0,    0,
 269         0,    0,    0,    0,    0,    0,    0,    0,
 270 /*      C0    C1    C2    C3    C4    C5    C6    C7    */
 271         0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 272 /*      C8    C9    CA    CB    CC    CD    CE    CF    */
 273         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 274 /*      D0    D1    D2    D3    D4    D5    D6    D7    */
 275         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 276 /*      D8    D9    DA    DB    DC    DD    DE    DF    */
 277         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 278 /*      E0    E1    E2    E3    E4    E5    E6    E7    */
 279         0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 280 /*      E8    E9    EA    EB    EC    ED    EE    EF    */
 281         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 282 /*      F0    F1    F2    F3    F4    F5    F6    F7    */
 283         0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
 284         0,    0,    0,    0,    0,    0,    0,    0,
 285 };
 286
 287 const uint8_t u8_valid_max_2nd_byte[0x100] = {
 288         0,    0,    0,    0,    0,    0,    0,    0,
 289         0,    0,    0,    0,    0,    0,    0,    0,
 290         0,    0,    0,    0,    0,    0,    0,    0,
 291         0,    0,    0,    0,    0,    0,    0,    0,
 292         0,    0,    0,    0,    0,    0,    0,    0,
 293         0,    0,    0,    0,    0,    0,    0,    0,
 294         0,    0,    0,    0,    0,    0,    0,    0,
 295         0,    0,    0,    0,    0,    0,    0,    0,
 296         0,    0,    0,    0,    0,    0,    0,    0,
 297         0,    0,    0,    0,    0,    0,    0,    0,
 298         0,    0,    0,    0,    0,    0,    0,    0,
 299         0,    0,    0,    0,    0,    0,    0,    0,
 300         0,    0,    0,    0,    0,    0,    0,    0,
 301         0,    0,    0,    0,    0,    0,    0,    0,
 302         0,    0,    0,    0,    0,    0,    0,    0,
 303         0,    0,    0,    0,    0,    0,    0,    0,
 304         0,    0,    0,    0,    0,    0,    0,    0,
 305         0,    0,    0,    0,    0,    0,    0,    0,
 306         0,    0,    0,    0,    0,    0,    0,    0,
 307         0,    0,    0,    0,    0,    0,    0,    0,
 308         0,    0,    0,    0,    0,    0,    0,    0,
 309         0,    0,    0,    0,    0,    0,    0,    0,
 310         0,    0,    0,    0,    0,    0,    0,    0,
 311         0,    0,    0,    0,    0,    0,    0,    0,
 312 /*      C0    C1    C2    C3    C4    C5    C6    C7    */
 313         0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 314 /*      C8    C9    CA    CB    CC    CD    CE    CF    */
 315         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 316 /*      D0    D1    D2    D3    D4    D5    D6    D7    */
 317         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 318 /*      D8    D9    DA    DB    DC    DD    DE    DF    */
 319         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 320 /*      E0    E1    E2    E3    E4    E5    E6    E7    */
 321         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 322 /*      E8    E9    EA    EB    EC    ED    EE    EF    */
 323         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
 324 /*      F0    F1    F2    F3    F4    F5    F6    F7    */
 325         0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
 326         0,    0,    0,    0,    0,    0,    0,    0,
 327 };
 328
 329
 330 /*
 331  * The u8_validate() validates on the given UTF-8 character string and
 332  * calculate the byte length. It is quite similar to mblen(3C) except that
 333  * this will validate against the list of characters if required and
 334  * specific to UTF-8 and Unicode.
 335  */
 336 int
 337 u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
 338 {
 339         uchar_t *ib;
 340         uchar_t *ibtail;
 341         uchar_t **p;
 342         uchar_t *s1;
 343         uchar_t *s2;
 344         uchar_t f;
 345         int sz;
 346         size_t i;
 347         int ret_val;
 348         boolean_t second;
 349         boolean_t no_need_to_validate_entire;
 350         boolean_t check_additional;
 351         boolean_t validate_ucs2_range_only;
 352
 353         if (! u8str)
 354                 return (0);
 355
 356         ib = (uchar_t *)u8str;
 357         ibtail = ib + n;
 358
 359         ret_val = 0;
 360
 361         no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
 362         check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
 363         validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
 364
 365         while (ib < ibtail) {
 366                 /*
 367                  * The first byte of a UTF-8 character tells how many
 368                  * bytes will follow for the character. If the first byte
 369                  * is an illegal byte value or out of range value, we just
 370                  * return -1 with an appropriate error number.
 371                  */
 372                 sz = u8_number_of_bytes[*ib];
 373                 if (sz == U8_ILLEGAL_CHAR) {
 374                         *errnum = EILSEQ;
 375                         return (-1);
 376                 }
 377
 378                 if (sz == U8_OUT_OF_RANGE_CHAR ||
 379                     (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
 380                         *errnum = ERANGE;
 381                         return (-1);
 382                 }
 383
 384                 /*
 385                  * If we don't have enough bytes to check on, that's also
 386                  * an error. As you can see, we give illegal byte sequence
 387                  * checking higher priority then EINVAL cases.
 388                  */
 389                 if ((ibtail - ib) < sz) {
 390                         *errnum = EINVAL;
 391                         return (-1);
 392                 }
 393
 394                 if (sz == 1) {
 395                         ib++;
 396                         ret_val++;
 397                 } else {
 398                         /*
 399                          * Check on the multi-byte UTF-8 character. For more
 400                          * details on this, see comment added for the used
 401                          * data structures at the beginning of the file.
 402                          */
 403                         f = *ib++;
 404                         ret_val++;
 405                         second = B_TRUE;
 406                         for (i = 1; i < sz; i++) {
 407                                 if (second) {
 408                                         if (*ib < u8_valid_min_2nd_byte[f] ||
 409                                             *ib > u8_valid_max_2nd_byte[f]) {
 410                                                 *errnum = EILSEQ;
 411                                                 return (-1);
 412                                         }
 413                                         second = B_FALSE;
 414                                 } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
 415                                         *errnum = EILSEQ;
 416                                         return (-1);
 417                                 }
 418                                 ib++;
 419                                 ret_val++;
 420                         }
 421                 }
 422
 423                 if (check_additional) {
 424                         for (p = (uchar_t **)list, i = 0; p[i]; i++) {
 425                                 s1 = ib - sz;
 426                                 s2 = p[i];
 427                                 while (s1 < ib) {
 428                                         if (*s1 != *s2 || *s2 == '\0')
 429                                                 break;
 430                                         s1++;
 431                                         s2++;
 432                                 }
 433
 434                                 if (s1 >= ib && *s2 == '\0') {
 435                                         *errnum = EBADF;
 436                                         return (-1);
 437                                 }
 438                         }
 439                 }
 440
 441                 if (no_need_to_validate_entire)
 442                         break;
 443         }
 444
 445         return (ret_val);
 446 }
 447
 448 /*
 449  * The do_case_conv() looks at the mapping tables and returns found
 450  * bytes if any. If not found, the input bytes are returned. The function
 451  * always terminate the return bytes with a null character assuming that
 452  * there are plenty of room to do so.
 453  *
 454  * The case conversions are simple case conversions mapping a character to
 455  * another character as specified in the Unicode data. The byte size of
 456  * the mapped character could be different from that of the input character.
 457  *
 458  * The return value is the byte length of the returned character excluding
 459  * the terminating null byte.
 460  */
 461 static size_t
 462 do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
 463 {
 464         size_t i;
 465         uint16_t b1 = 0;
 466         uint16_t b2 = 0;
 467         uint16_t b3 = 0;
 468         uint16_t b3_tbl;
 469         uint16_t b3_base;
 470         uint16_t b4 = 0;
 471         size_t start_id;
 472         size_t end_id;
 473
 474         /*
 475          * At this point, the only possible values for sz are 2, 3, and 4.
 476          * The u8s should point to a vector that is well beyond the size of
 477          * 5 bytes.
 478          */
 479         if (sz == 2) {
 480                 b3 = u8s[0] = s[0];
 481                 b4 = u8s[1] = s[1];
 482         } else if (sz == 3) {
 483                 b2 = u8s[0] = s[0];
 484                 b3 = u8s[1] = s[1];
 485                 b4 = u8s[2] = s[2];
 486         } else if (sz == 4) {
 487                 b1 = u8s[0] = s[0];
 488                 b2 = u8s[1] = s[1];
 489                 b3 = u8s[2] = s[2];
 490                 b4 = u8s[3] = s[3];
 491         } else {
 492                 /* This is not possible but just in case as a fallback. */
 493                 if (is_it_toupper)
 494                         *u8s = U8_ASCII_TOUPPER(*s);
 495                 else
 496                         *u8s = U8_ASCII_TOLOWER(*s);
 497                 u8s[1] = '\0';
 498
 499                 return (1);
 500         }
 501         u8s[sz] = '\0';
 502
 503         /*
 504          * Let's find out if we have a corresponding character.
 505          */
 506         b1 = u8_common_b1_tbl[uv][b1];
 507         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
 508                 return ((size_t)sz);
 509
 510         b2 = u8_case_common_b2_tbl[uv][b1][b2];
 511         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
 512                 return ((size_t)sz);
 513
 514         if (is_it_toupper) {
 515                 b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
 516                 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
 517                         return ((size_t)sz);
 518
 519                 start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
 520                 end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
 521
 522                 /* Either there is no match or an error at the table. */
 523                 if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
 524                         return ((size_t)sz);
 525
 526                 b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
 527
 528                 for (i = 0; start_id < end_id; start_id++)
 529                         u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
 530         } else {
 531                 b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
 532                 if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
 533                         return ((size_t)sz);
 534
 535                 start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
 536                 end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
 537
 538                 if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
 539                         return ((size_t)sz);
 540
 541                 b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
 542
 543                 for (i = 0; start_id < end_id; start_id++)
 544                         u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
 545         }
 546
 547         /*
 548          * If i is still zero, that means there is no corresponding character.
 549          */
 550         if (i == 0)
 551                 return ((size_t)sz);
 552
 553         u8s[i] = '\0';
 554
 555         return (i);
 556 }
 557
 558 /*
 559  * The do_case_compare() function compares the two input strings, s1 and s2,
 560  * one character at a time doing case conversions if applicable and return
 561  * the comparison result as like strcmp().
 562  *
 563  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
 564  * we treat the 7-bit ASCII characters as a special case trying to yield
 565  * faster processing time.
 566  */
 567 static int
 568 do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
 569         size_t n2, boolean_t is_it_toupper, int *errnum)
 570 {
 571         int f;
 572         int sz1;
 573         int sz2;
 574         size_t j;
 575         size_t i1;
 576         size_t i2;
 577         uchar_t u8s1[U8_MB_CUR_MAX + 1];
 578         uchar_t u8s2[U8_MB_CUR_MAX + 1];
 579
 580         i1 = i2 = 0;
 581         while (i1 < n1 && i2 < n2) {
 582                 /*
 583                  * Find out what would be the byte length for this UTF-8
 584                  * character at string s1 and also find out if this is
 585                  * an illegal start byte or not and if so, issue a proper
 586                  * error number and yet treat this byte as a character.
 587                  */
 588                 sz1 = u8_number_of_bytes[*s1];
 589                 if (sz1 < 0) {
 590                         *errnum = EILSEQ;
 591                         sz1 = 1;
 592                 }
 593
 594                 /*
 595                  * For 7-bit ASCII characters mainly, we do a quick case
 596                  * conversion right at here.
 597                  *
 598                  * If we don't have enough bytes for this character, issue
 599                  * an EINVAL error and use what are available.
 600                  *
 601                  * If we have enough bytes, find out if there is
 602                  * a corresponding uppercase character and if so, copy over
 603                  * the bytes for a comparison later. If there is no
 604                  * corresponding uppercase character, then, use what we have
 605                  * for the comparison.
 606                  */
 607                 if (sz1 == 1) {
 608                         if (is_it_toupper)
 609                                 u8s1[0] = U8_ASCII_TOUPPER(*s1);
 610                         else
 611                                 u8s1[0] = U8_ASCII_TOLOWER(*s1);
 612                         s1++;
 613                         u8s1[1] = '\0';
 614                 } else if ((i1 + sz1) > n1) {
 615                         *errnum = EINVAL;
 616                         for (j = 0; (i1 + j) < n1; )
 617                                 u8s1[j++] = *s1++;
 618                         u8s1[j] = '\0';
 619                 } else {
 620                         (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
 621                         s1 += sz1;
 622                 }
 623
 624                 /* Do the same for the string s2. */
 625                 sz2 = u8_number_of_bytes[*s2];
 626                 if (sz2 < 0) {
 627                         *errnum = EILSEQ;
 628                         sz2 = 1;
 629                 }
 630
 631                 if (sz2 == 1) {
 632                         if (is_it_toupper)
 633                                 u8s2[0] = U8_ASCII_TOUPPER(*s2);
 634                         else
 635                                 u8s2[0] = U8_ASCII_TOLOWER(*s2);
 636                         s2++;
 637                         u8s2[1] = '\0';
 638                 } else if ((i2 + sz2) > n2) {
 639                         *errnum = EINVAL;
 640                         for (j = 0; (i2 + j) < n2; )
 641                                 u8s2[j++] = *s2++;
 642                         u8s2[j] = '\0';
 643                 } else {
 644                         (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
 645                         s2 += sz2;
 646                 }
 647
 648                 /* Now compare the two characters. */
 649                 if (sz1 == 1 && sz2 == 1) {
 650                         if (*u8s1 > *u8s2)
 651                                 return (1);
 652                         if (*u8s1 < *u8s2)
 653                                 return (-1);
 654                 } else {
 655                         f = strcmp((const char *)u8s1, (const char *)u8s2);
 656                         if (f != 0)
 657                                 return (f);
 658                 }
 659
 660                 /*
 661                  * They were the same. Let's move on to the next
 662                  * characters then.
 663                  */
 664                 i1 += sz1;
 665                 i2 += sz2;
 666         }
 667
 668         /*
 669          * We compared until the end of either or both strings.
 670          *
 671          * If we reached to or went over the ends for the both, that means
 672          * they are the same.
 673          *
 674          * If we reached only one of the two ends, that means the other string
 675          * has something which then the fact can be used to determine
 676          * the return value.
 677          */
 678         if (i1 >= n1) {
 679                 if (i2 >= n2)
 680                         return (0);
 681                 return (-1);
 682         }
 683         return (1);
 684 }
 685
 686 /*
 687  * The combining_class() function checks on the given bytes and find out
 688  * the corresponding Unicode combining class value. The return value 0 means
 689  * it is a Starter. Any illegal UTF-8 character will also be treated as
 690  * a Starter.
 691  */
 692 static uchar_t
 693 combining_class(size_t uv, uchar_t *s, size_t sz)
 694 {
 695         uint16_t b1 = 0;
 696         uint16_t b2 = 0;
 697         uint16_t b3 = 0;
 698         uint16_t b4 = 0;
 699
 700         if (sz == 1 || sz > 4)
 701                 return (0);
 702
 703         if (sz == 2) {
 704                 b3 = s[0];
 705                 b4 = s[1];
 706         } else if (sz == 3) {
 707                 b2 = s[0];
 708                 b3 = s[1];
 709                 b4 = s[2];
 710         } else if (sz == 4) {
 711                 b1 = s[0];
 712                 b2 = s[1];
 713                 b3 = s[2];
 714                 b4 = s[3];
 715         }
 716
 717         b1 = u8_common_b1_tbl[uv][b1];
 718         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
 719                 return (0);
 720
 721         b2 = u8_combining_class_b2_tbl[uv][b1][b2];
 722         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
 723                 return (0);
 724
 725         b3 = u8_combining_class_b3_tbl[uv][b2][b3];
 726         if (b3 == U8_TBL_ELEMENT_NOT_DEF)
 727                 return (0);
 728
 729         return (u8_combining_class_b4_tbl[uv][b3][b4]);
 730 }
 731
 732 /*
 733  * The do_decomp() function finds out a matching decomposition if any
 734  * and return. If there is no match, the input bytes are copied and returned.
 735  * The function also checks if there is a Hangul, decomposes it if necessary
 736  * and returns.
 737  *
 738  * To save time, a single byte 7-bit ASCII character should be handled by
 739  * the caller.
 740  *
 741  * The function returns the number of bytes returned sans always terminating
 742  * the null byte. It will also return a state that will tell if there was
 743  * a Hangul character decomposed which then will be used by the caller.
 744  */
 745 static size_t
 746 do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
 747         boolean_t canonical_decomposition, u8_normalization_states_t *state)
 748 {
 749         uint16_t b1 = 0;
 750         uint16_t b2 = 0;
 751         uint16_t b3 = 0;
 752         uint16_t b3_tbl;
 753         uint16_t b3_base;
 754         uint16_t b4 = 0;
 755         size_t start_id;
 756         size_t end_id;
 757         size_t i;
 758         uint32_t u1;
 759
 760         if (sz == 2) {
 761                 b3 = u8s[0] = s[0];
 762                 b4 = u8s[1] = s[1];
 763                 u8s[2] = '\0';
 764         } else if (sz == 3) {
 765                 /* Convert it to a Unicode scalar value. */
 766                 U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
 767
 768                 /*
 769                  * If this is a Hangul syllable, we decompose it into
 770                  * a leading consonant, a vowel, and an optional trailing
 771                  * consonant and then return.
 772                  */
 773                 if (U8_HANGUL_SYLLABLE(u1)) {
 774                         u1 -= U8_HANGUL_SYL_FIRST;
 775
 776                         b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
 777                         b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
 778                             / U8_HANGUL_T_COUNT;
 779                         b3 = u1 % U8_HANGUL_T_COUNT;
 780
 781                         U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
 782                         U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
 783                         if (b3) {
 784                                 b3 += U8_HANGUL_JAMO_T_FIRST;
 785                                 U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
 786
 787                                 u8s[9] = '\0';
 788                                 *state = U8_STATE_HANGUL_LVT;
 789                                 return (9);
 790                         }
 791
 792                         u8s[6] = '\0';
 793                         *state = U8_STATE_HANGUL_LV;
 794                         return (6);
 795                 }
 796
 797                 b2 = u8s[0] = s[0];
 798                 b3 = u8s[1] = s[1];
 799                 b4 = u8s[2] = s[2];
 800                 u8s[3] = '\0';
 801
 802                 /*
 803                  * If this is a Hangul Jamo, we know there is nothing
 804                  * further that we can decompose.
 805                  */
 806                 if (U8_HANGUL_JAMO_L(u1)) {
 807                         *state = U8_STATE_HANGUL_L;
 808                         return (3);
 809                 }
 810
 811                 if (U8_HANGUL_JAMO_V(u1)) {
 812                         if (*state == U8_STATE_HANGUL_L)
 813                                 *state = U8_STATE_HANGUL_LV;
 814                         else
 815                                 *state = U8_STATE_HANGUL_V;
 816                         return (3);
 817                 }
 818
 819                 if (U8_HANGUL_JAMO_T(u1)) {
 820                         if (*state == U8_STATE_HANGUL_LV)
 821                                 *state = U8_STATE_HANGUL_LVT;
 822                         else
 823                                 *state = U8_STATE_HANGUL_T;
 824                         return (3);
 825                 }
 826         } else if (sz == 4) {
 827                 b1 = u8s[0] = s[0];
 828                 b2 = u8s[1] = s[1];
 829                 b3 = u8s[2] = s[2];
 830                 b4 = u8s[3] = s[3];
 831                 u8s[4] = '\0';
 832         } else {
 833                 /*
 834                  * This is a fallback and should not happen if the function
 835                  * was called properly.
 836                  */
 837                 u8s[0] = s[0];
 838                 u8s[1] = '\0';
 839                 *state = U8_STATE_START;
 840                 return (1);
 841         }
 842
 843         /*
 844          * At this point, this rountine does not know what it would get.
 845          * The caller should sort it out if the state isn't a Hangul one.
 846          */
 847         *state = U8_STATE_START;
 848
 849         /* Try to find matching decomposition mapping byte sequence. */
 850         b1 = u8_common_b1_tbl[uv][b1];
 851         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
 852                 return ((size_t)sz);
 853
 854         b2 = u8_decomp_b2_tbl[uv][b1][b2];
 855         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
 856                 return ((size_t)sz);
 857
 858         b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
 859         if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
 860                 return ((size_t)sz);
 861
 862         /*
 863          * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
 864          * which is 0x8000, this means we couldn't fit the mappings into
 865          * the cardinality of a unsigned byte.
 866          */
 867         if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
 868                 b3_tbl -= U8_16BIT_TABLE_INDICATOR;
 869                 start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
 870                 end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
 871         } else {
 872                 start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
 873                 end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
 874         }
 875
 876         /* This also means there wasn't any matching decomposition. */
 877         if (start_id >= end_id)
 878                 return ((size_t)sz);
 879
 880         /*
 881          * The final table for decomposition mappings has three types of
 882          * byte sequences depending on whether a mapping is for compatibility
 883          * decomposition, canonical decomposition, or both like the following:
 884          *
 885          * (1) Compatibility decomposition mappings:
 886          *
 887          *      +---+---+-...-+---+
 888          *      | B0| B1| ... | Bm|
 889          *      +---+---+-...-+---+
 890          *
 891          *      The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
 892          *
 893          * (2) Canonical decomposition mappings:
 894          *
 895          *      +---+---+---+-...-+---+
 896          *      | T | b0| b1| ... | bn|
 897          *      +---+---+---+-...-+---+
 898          *
 899          *      where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
 900          *
 901          * (3) Both mappings:
 902          *
 903          *      +---+---+---+---+-...-+---+---+---+-...-+---+
 904          *      | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
 905          *      +---+---+---+---+-...-+---+---+---+-...-+---+
 906          *
 907          *      where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
 908          *      byte, b0 to bn are canonical mapping bytes and B0 to Bm are
 909          *      compatibility mapping bytes.
 910          *
 911          * Note that compatibility decomposition means doing recursive
 912          * decompositions using both compatibility decomposition mappings and
 913          * canonical decomposition mappings. On the other hand, canonical
 914          * decomposition means doing recursive decompositions using only
 915          * canonical decomposition mappings. Since the table we have has gone
 916          * through the recursions already, we do not need to do so during
 917          * runtime, i.e., the table has been completely flattened out
 918          * already.
 919          */
 920
 921         b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
 922
 923         /* Get the type, T, of the byte sequence. */
 924         b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
 925
 926         /*
 927          * If necessary, adjust start_id, end_id, or both. Note that if
 928          * this is compatibility decomposition mapping, there is no
 929          * adjustment.
 930          */
 931         if (canonical_decomposition) {
 932                 /* Is the mapping only for compatibility decomposition? */
 933                 if (b1 < U8_DECOMP_BOTH)
 934                         return ((size_t)sz);
 935
 936                 start_id++;
 937
 938                 if (b1 == U8_DECOMP_BOTH) {
 939                         end_id = start_id +
 940                             u8_decomp_final_tbl[uv][b3_base + start_id];
 941                         start_id++;
 942                 }
 943         } else {
 944                 /*
 945                  * Unless this is a compatibility decomposition mapping,
 946                  * we adjust the start_id.
 947                  */
 948                 if (b1 == U8_DECOMP_BOTH) {
 949                         start_id++;
 950                         start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
 951                 } else if (b1 == U8_DECOMP_CANONICAL) {
 952                         start_id++;
 953                 }
 954         }
 955
 956         for (i = 0; start_id < end_id; start_id++)
 957                 u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
 958         u8s[i] = '\0';
 959
 960         return (i);
 961 }
 962
 963 /*
 964  * The find_composition_start() function uses the character bytes given and
 965  * find out the matching composition mappings if any and return the address
 966  * to the composition mappings as explained in the do_composition().
 967  */
 968 static uchar_t *
 969 find_composition_start(size_t uv, uchar_t *s, size_t sz)
 970 {
 971         uint16_t b1 = 0;
 972         uint16_t b2 = 0;
 973         uint16_t b3 = 0;
 974         uint16_t b3_tbl;
 975         uint16_t b3_base;
 976         uint16_t b4 = 0;
 977         size_t start_id;
 978         size_t end_id;
 979
 980         if (sz == 1) {
 981                 b4 = s[0];
 982         } else if (sz == 2) {
 983                 b3 = s[0];
 984                 b4 = s[1];
 985         } else if (sz == 3) {
 986                 b2 = s[0];
 987                 b3 = s[1];
 988                 b4 = s[2];
 989         } else if (sz == 4) {
 990                 b1 = s[0];
 991                 b2 = s[1];
 992                 b3 = s[2];
 993                 b4 = s[3];
 994         } else {
 995                 /*
 996                  * This is a fallback and should not happen if the function
 997                  * was called properly.
 998                  */
 999                 return (NULL);
1000         }
1001
1002         b1 = u8_composition_b1_tbl[uv][b1];
1003         if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1004                 return (NULL);
1005
1006         b2 = u8_composition_b2_tbl[uv][b1][b2];
1007         if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1008                 return (NULL);
1009
1010         b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1011         if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1012                 return (NULL);
1013
1014         if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1015                 b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1016                 start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1017                 end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1018         } else {
1019                 start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1020                 end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1021         }
1022
1023         if (start_id >= end_id)
1024                 return (NULL);
1025
1026         b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1027
1028         return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1029 }
1030
1031 /*
1032  * The blocked() function checks on the combining class values of previous
1033  * characters in this sequence and return whether it is blocked or not.
1034  */
1035 static boolean_t
1036 blocked(uchar_t *comb_class, size_t last)
1037 {
1038         uchar_t my_comb_class;
1039         size_t i;
1040
1041         my_comb_class = comb_class[last];
1042         for (i = 1; i < last; i++)
1043                 if (comb_class[i] >= my_comb_class ||
1044                     comb_class[i] == U8_COMBINING_CLASS_STARTER)
1045                         return (B_TRUE);
1046
1047         return (B_FALSE);
1048 }
1049
1050 /*
1051  * The do_composition() reads the character string pointed by 's' and
1052  * do necessary canonical composition and then copy over the result back to
1053  * the 's'.
1054  *
1055  * The input argument 's' cannot contain more than 32 characters.
1056  */
1057 static size_t
1058 do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
1059         uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
1060 {
1061         uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
1062         uchar_t tc[U8_MB_CUR_MAX];
1063         uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
1064         size_t saved_marks_count;
1065         uchar_t *p;
1066         uchar_t *saved_p;
1067         uchar_t *q;
1068         size_t i;
1069         size_t saved_i;
1070         size_t j;
1071         size_t k;
1072         size_t l;
1073         size_t C;
1074         size_t saved_l;
1075         size_t size;
1076         uint32_t u1;
1077         uint32_t u2;
1078         boolean_t match_not_found = B_TRUE;
1079
1080         /*
1081          * This should never happen unless the callers are doing some strange
1082          * and unexpected things.
1083          *
1084          * The "last" is the index pointing to the last character not last + 1.
1085          */
1086         if (last >= U8_MAX_CHARS_A_SEQ)
1087                 last = U8_UPPER_LIMIT_IN_A_SEQ;
1088
1089         for (i = l = 0; i <= last; i++) {
1090                 /*
1091                  * The last or any non-Starters at the beginning, we don't
1092                  * have any chance to do composition and so we just copy them
1093                  * to the temporary buffer.
1094                  */
1095                 if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
1096 SAVE_THE_CHAR:
1097                         p = s + start[i];
1098                         size = disp[i];
1099                         for (k = 0; k < size; k++)
1100                                 t[l++] = *p++;
1101                         continue;
1102                 }
1103
1104                 /*
1105                  * If this could be a start of Hangul Jamos, then, we try to
1106                  * conjoin them.
1107                  */
1108                 if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
1109                         U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
1110                             s[start[i] + 1], s[start[i] + 2]);
1111                         U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
1112                             s[start[i] + 4], s[start[i] + 5]);
1113
1114                         if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
1115                                 u1 -= U8_HANGUL_JAMO_L_FIRST;
1116                                 u2 -= U8_HANGUL_JAMO_V_FIRST;
1117                                 u1 = U8_HANGUL_SYL_FIRST +
1118                                     (u1 * U8_HANGUL_V_COUNT + u2) *
1119                                     U8_HANGUL_T_COUNT;
1120
1121                                 i += 2;
1122                                 if (i <= last) {
1123                                         U8_PUT_3BYTES_INTO_UTF32(u2,
1124                                             s[start[i]], s[start[i] + 1],
1125                                             s[start[i] + 2]);
1126
1127                                         if (U8_HANGUL_JAMO_T(u2)) {
1128                                                 u1 += u2 -
1129                                                     U8_HANGUL_JAMO_T_FIRST;
1130                                                 i++;
1131                                         }
1132                                 }
1133
1134                                 U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
1135                                 i--;
1136                                 l += 3;
1137                                 continue;
1138                         }
1139                 }
1140
1141                 /*
1142                  * Let's then find out if this Starter has composition
1143                  * mapping.
1144                  */
1145                 p = find_composition_start(uv, s + start[i], disp[i]);
1146                 if (p == NULL)
1147                         goto SAVE_THE_CHAR;
1148
1149                 /*
1150                  * We have a Starter with composition mapping and the next
1151                  * character is a non-Starter. Let's try to find out if
1152                  * we can do composition.
1153                  */
1154
1155                 saved_p = p;
1156                 saved_i = i;
1157                 saved_l = l;
1158                 saved_marks_count = 0;
1159
1160 TRY_THE_NEXT_MARK:
1161                 q = s + start[++i];
1162                 size = disp[i];
1163
1164                 /*
1165                  * The next for() loop compares the non-Starter pointed by
1166                  * 'q' with the possible (joinable) characters pointed by 'p'.
1167                  *
1168                  * The composition final table entry pointed by the 'p'
1169                  * looks like the following:
1170                  *
1171                  * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1172                  * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
1173                  * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1174                  *
1175                  * where C is the count byte indicating the number of
1176                  * mapping pairs where each pair would be look like
1177                  * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
1178                  * character of a canonical decomposition and the B0-Bm are
1179                  * the bytes of a matching composite character. The F is
1180                  * a filler byte after each character as the separator.
1181                  */
1182
1183                 match_not_found = B_TRUE;
1184
1185                 for (C = *p++; C > 0; C--) {
1186                         for (k = 0; k < size; p++, k++)
1187                                 if (*p != q[k])
1188                                         break;
1189
1190                         /* Have we found it? */
1191                         if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
1192                                 match_not_found = B_FALSE;
1193
1194                                 l = saved_l;
1195
1196                                 while (*++p != U8_TBL_ELEMENT_FILLER)
1197                                         t[l++] = *p;
1198
1199                                 break;
1200                         }
1201
1202                         /* We didn't find; skip to the next pair. */
1203                         if (*p != U8_TBL_ELEMENT_FILLER)
1204                                 while (*++p != U8_TBL_ELEMENT_FILLER)
1205                                         ;
1206                         while (*++p != U8_TBL_ELEMENT_FILLER)
1207                                 ;
1208                         p++;
1209                 }
1210
1211                 /*
1212                  * If there was no match, we will need to save the combining
1213                  * mark for later appending. After that, if the next one
1214                  * is a non-Starter and not blocked, then, we try once
1215                  * again to do composition with the next non-Starter.
1216                  *
1217                  * If there was no match and this was a Starter, then,
1218                  * this is a new start.
1219                  *
1220                  * If there was a match and a composition done and we have
1221                  * more to check on, then, we retrieve a new composition final
1222                  * table entry for the composite and then try to do the
1223                  * composition again.
1224                  */
1225
1226                 if (match_not_found) {
1227                         if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
1228                                 i--;
1229                                 goto SAVE_THE_CHAR;
1230                         }
1231
1232                         saved_marks[saved_marks_count++] = i;
1233                 }
1234
1235                 if (saved_l == l) {
1236                         while (i < last) {
1237                                 if (blocked(comb_class, i + 1))
1238                                         saved_marks[saved_marks_count++] = ++i;
1239                                 else
1240                                         break;
1241                         }
1242                         if (i < last) {
1243                                 p = saved_p;
1244                                 goto TRY_THE_NEXT_MARK;
1245                         }
1246                 } else if (i < last) {
1247                         p = find_composition_start(uv, t + saved_l,
1248                             l - saved_l);
1249                         if (p != NULL) {
1250                                 saved_p = p;
1251                                 goto TRY_THE_NEXT_MARK;
1252                         }
1253                 }
1254
1255                 /*
1256                  * There is no more composition possible.
1257                  *
1258                  * If there was no composition what so ever then we copy
1259                  * over the original Starter and then append any non-Starters
1260                  * remaining at the target string sequentially after that.
1261                  */
1262
1263                 if (saved_l == l) {
1264                         p = s + start[saved_i];
1265                         size = disp[saved_i];
1266                         for (j = 0; j < size; j++)
1267                                 t[l++] = *p++;
1268                 }
1269
1270                 for (k = 0; k < saved_marks_count; k++) {
1271                         p = s + start[saved_marks[k]];
1272                         size = disp[saved_marks[k]];
1273                         for (j = 0; j < size; j++)
1274                                 t[l++] = *p++;
1275                 }
1276         }
1277
1278         /*
1279          * If the last character is a Starter and if we have a character
1280          * (possibly another Starter) that can be turned into a composite,
1281          * we do so and we do so until there is no more of composition
1282          * possible.
1283          */
1284         if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
1285                 p = *os;
1286                 saved_l = l - disp[last];
1287
1288                 while (p < oslast) {
1289                         size = u8_number_of_bytes[*p];
1290                         if (size <= 1 || (p + size) > oslast)
1291                                 break;
1292
1293                         saved_p = p;
1294
1295                         for (i = 0; i < size; i++)
1296                                 tc[i] = *p++;
1297
1298                         q = find_composition_start(uv, t + saved_l,
1299                             l - saved_l);
1300                         if (q == NULL) {
1301                                 p = saved_p;
1302                                 break;
1303                         }
1304
1305                         match_not_found = B_TRUE;
1306
1307                         for (C = *q++; C > 0; C--) {
1308                                 for (k = 0; k < size; q++, k++)
1309                                         if (*q != tc[k])
1310                                                 break;
1311
1312                                 if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
1313                                         match_not_found = B_FALSE;
1314
1315                                         l = saved_l;
1316
1317                                         while (*++q != U8_TBL_ELEMENT_FILLER) {
1318                                                 /*
1319                                                  * This is practically
1320                                                  * impossible but we don't
1321                                                  * want to take any chances.
1322                                                  */
1323                                                 if (l >=
1324                                                     U8_STREAM_SAFE_TEXT_MAX) {
1325                                                         p = saved_p;
1326                                                         goto SAFE_RETURN;
1327                                                 }
1328                                                 t[l++] = *q;
1329                                         }
1330
1331                                         break;
1332                                 }
1333
1334                                 if (*q != U8_TBL_ELEMENT_FILLER)
1335                                         while (*++q != U8_TBL_ELEMENT_FILLER)
1336                                                 ;
1337                                 while (*++q != U8_TBL_ELEMENT_FILLER)
1338                                         ;
1339                                 q++;
1340                         }
1341
1342                         if (match_not_found) {
1343                                 p = saved_p;
1344                                 break;
1345                         }
1346                 }
1347 SAFE_RETURN:
1348                 *os = p;
1349         }
1350
1351         /*
1352          * Now we copy over the temporary string to the target string.
1353          * Since composition always reduces the number of characters or
1354          * the number of characters stay, we don't need to worry about
1355          * the buffer overflow here.
1356          */
1357         for (i = 0; i < l; i++)
1358                 s[i] = t[i];
1359         s[l] = '\0';
1360
1361         return (l);
1362 }
1363
1364 /*
1365  * The collect_a_seq() function checks on the given string s, collect
1366  * a sequence of characters at u8s, and return the sequence. While it collects
1367  * a sequence, it also applies case conversion, canonical or compatibility
1368  * decomposition, canonical composition, or some or all of them and
1369  * in that order.
1370  *
1371  * The collected sequence cannot be bigger than 32 characters since if
1372  * it is having more than 31 characters, the sequence will be terminated
1373  * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
1374  * a Stream-Safe Text. The collected sequence is always terminated with
1375  * a null byte and the return value is the byte length of the sequence
1376  * including 0. The return value does not include the terminating
1377  * null byte.
1378  */
1379 static size_t
1380 collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
1381         boolean_t is_it_toupper,
1382         boolean_t is_it_tolower,
1383         boolean_t canonical_decomposition,
1384         boolean_t compatibility_decomposition,
1385         boolean_t canonical_composition,
1386         int *errnum, u8_normalization_states_t *state)
1387 {
1388         uchar_t *s;
1389         int sz;
1390         int saved_sz;
1391         size_t i;
1392         size_t j;
1393         size_t k;
1394         size_t l;
1395         uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
1396         uchar_t disp[U8_MAX_CHARS_A_SEQ];
1397         uchar_t start[U8_MAX_CHARS_A_SEQ];
1398         uchar_t u8t[U8_MB_CUR_MAX];
1399         uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
1400         uchar_t tc;
1401         size_t last;
1402         size_t saved_last;
1403         uint32_t u1;
1404
1405         /*
1406          * Save the source string pointer which we will return a changed
1407          * pointer if we do processing.
1408          */
1409         s = *source;
1410
1411         /*
1412          * The following is a fallback for just in case callers are not
1413          * checking the string boundaries before the calling.
1414          */
1415         if (s >= slast) {
1416                 u8s[0] = '\0';
1417
1418                 return (0);
1419         }
1420
1421         /*
1422          * As the first thing, let's collect a character and do case
1423          * conversion if necessary.
1424          */
1425
1426         sz = u8_number_of_bytes[*s];
1427
1428         if (sz < 0) {
1429                 *errnum = EILSEQ;
1430
1431                 u8s[0] = *s++;
1432                 u8s[1] = '\0';
1433
1434                 *source = s;
1435
1436                 return (1);
1437         }
1438
1439         if (sz == 1) {
1440                 if (is_it_toupper)
1441                         u8s[0] = U8_ASCII_TOUPPER(*s);
1442                 else if (is_it_tolower)
1443                         u8s[0] = U8_ASCII_TOLOWER(*s);
1444                 else
1445                         u8s[0] = *s;
1446                 s++;
1447                 u8s[1] = '\0';
1448         } else if ((s + sz) > slast) {
1449                 *errnum = EINVAL;
1450
1451                 for (i = 0; s < slast; )
1452                         u8s[i++] = *s++;
1453                 u8s[i] = '\0';
1454
1455                 *source = s;
1456
1457                 return (i);
1458         } else {
1459                 if (is_it_toupper || is_it_tolower) {
1460                         i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
1461                         s += sz;
1462                         sz = i;
1463                 } else {
1464                         for (i = 0; i < sz; )
1465                                 u8s[i++] = *s++;
1466                         u8s[i] = '\0';
1467                 }
1468         }
1469
1470         /*
1471          * And then canonical/compatibility decomposition followed by
1472          * an optional canonical composition. Please be noted that
1473          * canonical composition is done only when a decomposition is
1474          * done.
1475          */
1476         if (canonical_decomposition || compatibility_decomposition) {
1477                 if (sz == 1) {
1478                         *state = U8_STATE_START;
1479
1480                         saved_sz = 1;
1481
1482                         comb_class[0] = 0;
1483                         start[0] = 0;
1484                         disp[0] = 1;
1485
1486                         last = 1;
1487                 } else {
1488                         saved_sz = do_decomp(uv, u8s, u8s, sz,
1489                             canonical_decomposition, state);
1490
1491                         last = 0;
1492
1493                         for (i = 0; i < saved_sz; ) {
1494                                 sz = u8_number_of_bytes[u8s[i]];
1495
1496                                 comb_class[last] = combining_class(uv,
1497                                     u8s + i, sz);
1498                                 start[last] = i;
1499                                 disp[last] = sz;
1500
1501                                 last++;
1502                                 i += sz;
1503                         }
1504
1505                         /*
1506                          * Decomposition yields various Hangul related
1507                          * states but not on combining marks. We need to
1508                          * find out at here by checking on the last
1509                          * character.
1510                          */
1511                         if (*state == U8_STATE_START) {
1512                                 if (comb_class[last - 1])
1513                                         *state = U8_STATE_COMBINING_MARK;
1514                         }
1515                 }
1516
1517                 saved_last = last;
1518
1519                 while (s < slast) {
1520                         sz = u8_number_of_bytes[*s];
1521
1522                         /*
1523                          * If this is an illegal character, an incomplete
1524                          * character, or an 7-bit ASCII Starter character,
1525                          * then we have collected a sequence; break and let
1526                          * the next call deal with the two cases.
1527                          *
1528                          * Note that this is okay only if you are using this
1529                          * function with a fixed length string, not on
1530                          * a buffer with multiple calls of one chunk at a time.
1531                          */
1532                         if (sz <= 1) {
1533                                 break;
1534                         } else if ((s + sz) > slast) {
1535                                 break;
1536                         } else {
1537                                 /*
1538                                  * If the previous character was a Hangul Jamo
1539                                  * and this character is a Hangul Jamo that
1540                                  * can be conjoined, we collect the Jamo.
1541                                  */
1542                                 if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
1543                                         U8_PUT_3BYTES_INTO_UTF32(u1,
1544                                             *s, *(s + 1), *(s + 2));
1545
1546                                         if (U8_HANGUL_COMPOSABLE_L_V(*state,
1547                                             u1)) {
1548                                                 i = 0;
1549                                                 *state = U8_STATE_HANGUL_LV;
1550                                                 goto COLLECT_A_HANGUL;
1551                                         }
1552
1553                                         if (U8_HANGUL_COMPOSABLE_LV_T(*state,
1554                                             u1)) {
1555                                                 i = 0;
1556                                                 *state = U8_STATE_HANGUL_LVT;
1557                                                 goto COLLECT_A_HANGUL;
1558                                         }
1559                                 }
1560
1561                                 /*
1562                                  * Regardless of whatever it was, if this is
1563                                  * a Starter, we don't collect the character
1564                                  * since that's a new start and we will deal
1565                                  * with it at the next time.
1566                                  */
1567                                 i = combining_class(uv, s, sz);
1568                                 if (i == U8_COMBINING_CLASS_STARTER)
1569                                         break;
1570
1571                                 /*
1572                                  * We know the current character is a combining
1573                                  * mark. If the previous character wasn't
1574                                  * a Starter (not Hangul) or a combining mark,
1575                                  * then, we don't collect this combining mark.
1576                                  */
1577                                 if (*state != U8_STATE_START &&
1578                                     *state != U8_STATE_COMBINING_MARK)
1579                                         break;
1580
1581                                 *state = U8_STATE_COMBINING_MARK;
1582 COLLECT_A_HANGUL:
1583                                 /*
1584                                  * If we collected a Starter and combining
1585                                  * marks up to 30, i.e., total 31 characters,
1586                                  * then, we terminate this degenerately long
1587                                  * combining sequence with a U+034F COMBINING
1588                                  * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
1589                                  * UTF-8 and turn this into a Stream-Safe
1590                                  * Text. This will be extremely rare but
1591                                  * possible.
1592                                  *
1593                                  * The following will also guarantee that
1594                                  * we are not writing more than 32 characters
1595                                  * plus a NULL at u8s[].
1596                                  */
1597                                 if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
1598 TURN_STREAM_SAFE:
1599                                         *state = U8_STATE_START;
1600                                         comb_class[last] = 0;
1601                                         start[last] = saved_sz;
1602                                         disp[last] = 2;
1603                                         last++;
1604
1605                                         u8s[saved_sz++] = 0xCD;
1606                                         u8s[saved_sz++] = 0x8F;
1607
1608                                         break;
1609                                 }
1610
1611                                 /*
1612                                  * Some combining marks also do decompose into
1613                                  * another combining mark or marks.
1614                                  */
1615                                 if (*state == U8_STATE_COMBINING_MARK) {
1616                                         k = last;
1617                                         l = sz;
1618                                         i = do_decomp(uv, uts, s, sz,
1619                                             canonical_decomposition, state);
1620                                         for (j = 0; j < i; ) {
1621                                                 sz = u8_number_of_bytes[uts[j]];
1622
1623                                                 comb_class[last] =
1624                                                     combining_class(uv,
1625                                                     uts + j, sz);
1626                                                 start[last] = saved_sz + j;
1627                                                 disp[last] = sz;
1628
1629                                                 last++;
1630                                                 if (last >=
1631                                                     U8_UPPER_LIMIT_IN_A_SEQ) {
1632                                                         last = k;
1633                                                         goto TURN_STREAM_SAFE;
1634                                                 }
1635                                                 j += sz;
1636                                         }
1637
1638                                         *state = U8_STATE_COMBINING_MARK;
1639                                         sz = i;
1640                                         s += l;
1641
1642                                         for (i = 0; i < sz; i++)
1643                                                 u8s[saved_sz++] = uts[i];
1644                                 } else {
1645                                         comb_class[last] = i;
1646                                         start[last] = saved_sz;
1647                                         disp[last] = sz;
1648                                         last++;
1649
1650                                         for (i = 0; i < sz; i++)
1651                                                 u8s[saved_sz++] = *s++;
1652                                 }
1653
1654                                 /*
1655                                  * If this is U+0345 COMBINING GREEK
1656                                  * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
1657                                  * iota subscript, and need to be converted to
1658                                  * uppercase letter, convert it to U+0399 GREEK
1659                                  * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
1660                                  * i.e., convert to capital adscript form as
1661                                  * specified in the Unicode standard.
1662                                  *
1663                                  * This is the only special case of (ambiguous)
1664                                  * case conversion at combining marks and
1665                                  * probably the standard will never have
1666                                  * anything similar like this in future.
1667                                  */
1668                                 if (is_it_toupper && sz >= 2 &&
1669                                     u8s[saved_sz - 2] == 0xCD &&
1670                                     u8s[saved_sz - 1] == 0x85) {
1671                                         u8s[saved_sz - 2] = 0xCE;
1672                                         u8s[saved_sz - 1] = 0x99;
1673                                 }
1674                         }
1675                 }
1676
1677                 /*
1678                  * Let's try to ensure a canonical ordering for the collected
1679                  * combining marks. We do this only if we have collected
1680                  * at least one more non-Starter. (The decomposition mapping
1681                  * data tables have fully (and recursively) expanded and
1682                  * canonically ordered decompositions.)
1683                  *
1684                  * The U8_SWAP_COMB_MARKS() convenience macro has some
1685                  * assumptions and we are meeting the assumptions.
1686                  */
1687                 last--;
1688                 if (last >= saved_last) {
1689                         for (i = 0; i < last; i++)
1690                                 for (j = last; j > i; j--)
1691                                         if (comb_class[j] &&
1692                                             comb_class[j - 1] > comb_class[j]) {
1693                                                 U8_SWAP_COMB_MARKS(j - 1, j);
1694                                         }
1695                 }
1696
1697                 *source = s;
1698
1699                 if (! canonical_composition) {
1700                         u8s[saved_sz] = '\0';
1701                         return (saved_sz);
1702                 }
1703
1704                 /*
1705                  * Now do the canonical composition. Note that we do this
1706                  * only after a canonical or compatibility decomposition to
1707                  * finish up NFC or NFKC.
1708                  */
1709                 sz = do_composition(uv, u8s, comb_class, start, disp, last,
1710                     &s, slast);
1711         }
1712
1713         *source = s;
1714
1715         return ((size_t)sz);
1716 }
1717
1718 /*
1719  * The do_norm_compare() function does string comparion based on Unicode
1720  * simple case mappings and Unicode Normalization definitions.
1721  *
1722  * It does so by collecting a sequence of character at a time and comparing
1723  * the collected sequences from the strings.
1724  *
1725  * The meanings on the return values are the same as the usual strcmp().
1726  */
1727 static int
1728 do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1729         int flag, int *errnum)
1730 {
1731         int result;
1732         size_t sz1;
1733         size_t sz2;
1734         uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1735         uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1736         uchar_t *s1last;
1737         uchar_t *s2last;
1738         boolean_t is_it_toupper;
1739         boolean_t is_it_tolower;
1740         boolean_t canonical_decomposition;
1741         boolean_t compatibility_decomposition;
1742         boolean_t canonical_composition;
1743         u8_normalization_states_t state;
1744
1745         s1last = s1 + n1;
1746         s2last = s2 + n2;
1747
1748         is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1749         is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1750         canonical_decomposition = flag & U8_CANON_DECOMP;
1751         compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1752         canonical_composition = flag & U8_CANON_COMP;
1753
1754         while (s1 < s1last && s2 < s2last) {
1755                 /*
1756                  * If the current character is a 7-bit ASCII and the last
1757                  * character, or, if the current character and the next
1758                  * character are both some 7-bit ASCII characters then
1759                  * we treat the current character as a sequence.
1760                  *
1761                  * In any other cases, we need to call collect_a_seq().
1762                  */
1763
1764                 if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1765                     ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1766                         if (is_it_toupper)
1767                                 u8s1[0] = U8_ASCII_TOUPPER(*s1);
1768                         else if (is_it_tolower)
1769                                 u8s1[0] = U8_ASCII_TOLOWER(*s1);
1770                         else
1771                                 u8s1[0] = *s1;
1772                         u8s1[1] = '\0';
1773                         sz1 = 1;
1774                         s1++;
1775                 } else {
1776                         state = U8_STATE_START;
1777                         sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1778                             is_it_toupper, is_it_tolower,
1779                             canonical_decomposition,
1780                             compatibility_decomposition,
1781                             canonical_composition, errnum, &state);
1782                 }
1783
1784                 if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1785                     ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1786                         if (is_it_toupper)
1787                                 u8s2[0] = U8_ASCII_TOUPPER(*s2);
1788                         else if (is_it_tolower)
1789                                 u8s2[0] = U8_ASCII_TOLOWER(*s2);
1790                         else
1791                                 u8s2[0] = *s2;
1792                         u8s2[1] = '\0';
1793                         sz2 = 1;
1794                         s2++;
1795                 } else {
1796                         state = U8_STATE_START;
1797                         sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1798                             is_it_toupper, is_it_tolower,
1799                             canonical_decomposition,
1800                             compatibility_decomposition,
1801                             canonical_composition, errnum, &state);
1802                 }
1803
1804                 /*
1805                  * Now compare the two characters. If they are the same,
1806                  * we move on to the next character sequences.
1807                  */
1808                 if (sz1 == 1 && sz2 == 1) {
1809                         if (*u8s1 > *u8s2)
1810                                 return (1);
1811                         if (*u8s1 < *u8s2)
1812                                 return (-1);
1813                 } else {
1814                         result = strcmp((const char *)u8s1, (const char *)u8s2);
1815                         if (result != 0)
1816                                 return (result);
1817                 }
1818         }
1819
1820         /*
1821          * We compared until the end of either or both strings.
1822          *
1823          * If we reached to or went over the ends for the both, that means
1824          * they are the same.
1825          *
1826          * If we reached only one end, that means the other string has
1827          * something which then can be used to determine the return value.
1828          */
1829         if (s1 >= s1last) {
1830                 if (s2 >= s2last)
1831                         return (0);
1832                 return (-1);
1833         }
1834         return (1);
1835 }
1836
1837 /*
1838  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1839  * the strcmp(). For the comparison, however, Unicode Normalization specific
1840  * equivalency and Unicode simple case conversion mappings based equivalency
1841  * can be requested and checked against.
1842  */
1843 int
1844 u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1845                 int *errnum)
1846 {
1847         int f;
1848         size_t n1;
1849         size_t n2;
1850
1851         *errnum = 0;
1852
1853         /*
1854          * Check on the requested Unicode version, case conversion, and
1855          * normalization flag values.
1856          */
1857
1858         if (uv > U8_UNICODE_LATEST) {
1859                 *errnum = ERANGE;
1860                 uv = U8_UNICODE_LATEST;
1861         }
1862
1863         if (flag == 0) {
1864                 flag = U8_STRCMP_CS;
1865         } else {
1866                 f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
1867                     U8_STRCMP_CI_LOWER);
1868                 if (f == 0) {
1869                         flag |= U8_STRCMP_CS;
1870                 } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1871                     f != U8_STRCMP_CI_LOWER) {
1872                         *errnum = EBADF;
1873                         flag = U8_STRCMP_CS;
1874                 }
1875
1876                 f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1877                 if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1878                     f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1879                         *errnum = EBADF;
1880                         flag = U8_STRCMP_CS;
1881                 }
1882         }
1883
1884         if (flag == U8_STRCMP_CS) {
1885                 return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1886         }
1887
1888         n1 = strlen(s1);
1889         n2 = strlen(s2);
1890         if (n != 0) {
1891                 if (n < n1)
1892                         n1 = n;
1893                 if (n < n2)
1894                         n2 = n;
1895         }
1896
1897         /*
1898          * Simple case conversion can be done much faster and so we do
1899          * them separately here.
1900          */
1901         if (flag == U8_STRCMP_CI_UPPER) {
1902                 return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1903                     n1, n2, B_TRUE, errnum));
1904         } else if (flag == U8_STRCMP_CI_LOWER) {
1905                 return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906                     n1, n2, B_FALSE, errnum));
1907         }
1908
1909         return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1910             flag, errnum));
1911 }
1912
1913 size_t
1914 u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1915         int flag, size_t unicode_version, int *errnum)
1916 {
1917         int f;
1918         int sz;
1919         uchar_t *ib;
1920         uchar_t *ibtail;
1921         uchar_t *ob;
1922         uchar_t *obtail;
1923         boolean_t do_not_ignore_null;
1924         boolean_t do_not_ignore_invalid;
1925         boolean_t is_it_toupper;
1926         boolean_t is_it_tolower;
1927         boolean_t canonical_decomposition;
1928         boolean_t compatibility_decomposition;
1929         boolean_t canonical_composition;
1930         size_t ret_val;
1931         size_t i;
1932         size_t j;
1933         uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1934         u8_normalization_states_t state;
1935
1936         if (unicode_version > U8_UNICODE_LATEST) {
1937                 *errnum = ERANGE;
1938                 return ((size_t)-1);
1939         }
1940
1941         f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1942         if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1943                 *errnum = EBADF;
1944                 return ((size_t)-1);
1945         }
1946
1947         f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1948         if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1949             f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1950                 *errnum = EBADF;
1951                 return ((size_t)-1);
1952         }
1953
1954         if (inarray == NULL || *inlen == 0)
1955                 return (0);
1956
1957         if (outarray == NULL) {
1958                 *errnum = E2BIG;
1959                 return ((size_t)-1);
1960         }
1961
1962         ib = (uchar_t *)inarray;
1963         ob = (uchar_t *)outarray;
1964         ibtail = ib + *inlen;
1965         obtail = ob + *outlen;
1966
1967         do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1968         do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1969         is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1970         is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1971
1972         ret_val = 0;
1973
1974         /*
1975          * If we don't have a normalization flag set, we do the simple case
1976          * conversion based text preparation separately below. Text
1977          * preparation involving Normalization will be done in the false task
1978          * block, again, separately since it will take much more time and
1979          * resource than doing simple case conversions.
1980          */
1981         if (f == 0) {
1982                 while (ib < ibtail) {
1983                         if (*ib == '\0' && do_not_ignore_null)
1984                                 break;
1985
1986                         sz = u8_number_of_bytes[*ib];
1987
1988                         if (sz < 0) {
1989                                 if (do_not_ignore_invalid) {
1990                                         *errnum = EILSEQ;
1991                                         ret_val = (size_t)-1;
1992                                         break;
1993                                 }
1994
1995                                 sz = 1;
1996                                 ret_val++;
1997                         }
1998
1999                         if (sz == 1) {
2000                                 if (ob >= obtail) {
2001                                         *errnum = E2BIG;
2002                                         ret_val = (size_t)-1;
2003                                         break;
2004                                 }
2005
2006                                 if (is_it_toupper)
2007                                         *ob = U8_ASCII_TOUPPER(*ib);
2008                                 else if (is_it_tolower)
2009                                         *ob = U8_ASCII_TOLOWER(*ib);
2010                                 else
2011                                         *ob = *ib;
2012                                 ib++;
2013                                 ob++;
2014                         } else if ((ib + sz) > ibtail) {
2015                                 if (do_not_ignore_invalid) {
2016                                         *errnum = EINVAL;
2017                                         ret_val = (size_t)-1;
2018                                         break;
2019                                 }
2020
2021                                 if ((obtail - ob) < (ibtail - ib)) {
2022                                         *errnum = E2BIG;
2023                                         ret_val = (size_t)-1;
2024                                         break;
2025                                 }
2026
2027                                 /*
2028                                  * We treat the remaining incomplete character
2029                                  * bytes as a character.
2030                                  */
2031                                 ret_val++;
2032
2033                                 while (ib < ibtail)
2034                                         *ob++ = *ib++;
2035                         } else {
2036                                 if (is_it_toupper || is_it_tolower) {
2037                                         i = do_case_conv(unicode_version, u8s,
2038                                             ib, sz, is_it_toupper);
2039
2040                                         if ((obtail - ob) < i) {
2041                                                 *errnum = E2BIG;
2042                                                 ret_val = (size_t)-1;
2043                                                 break;
2044                                         }
2045
2046                                         ib += sz;
2047
2048                                         for (sz = 0; sz < i; sz++)
2049                                                 *ob++ = u8s[sz];
2050                                 } else {
2051                                         if ((obtail - ob) < sz) {
2052                                                 *errnum = E2BIG;
2053                                                 ret_val = (size_t)-1;
2054                                                 break;
2055                                         }
2056
2057                                         for (i = 0; i < sz; i++)
2058                                                 *ob++ = *ib++;
2059                                 }
2060                         }
2061                 }
2062         } else {
2063                 canonical_decomposition = flag & U8_CANON_DECOMP;
2064                 compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2065                 canonical_composition = flag & U8_CANON_COMP;
2066
2067                 while (ib < ibtail) {
2068                         if (*ib == '\0' && do_not_ignore_null)
2069                                 break;
2070
2071                         /*
2072                          * If the current character is a 7-bit ASCII
2073                          * character and it is the last character, or,
2074                          * if the current character is a 7-bit ASCII
2075                          * character and the next character is also a 7-bit
2076                          * ASCII character, then, we copy over this
2077                          * character without going through collect_a_seq().
2078                          *
2079                          * In any other cases, we need to look further with
2080                          * the collect_a_seq() function.
2081                          */
2082                         if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2083                             ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2084                                 if (ob >= obtail) {
2085                                         *errnum = E2BIG;
2086                                         ret_val = (size_t)-1;
2087                                         break;
2088                                 }
2089
2090                                 if (is_it_toupper)
2091                                         *ob = U8_ASCII_TOUPPER(*ib);
2092                                 else if (is_it_tolower)
2093                                         *ob = U8_ASCII_TOLOWER(*ib);
2094                                 else
2095                                         *ob = *ib;
2096                                 ib++;
2097                                 ob++;
2098                         } else {
2099                                 *errnum = 0;
2100                                 state = U8_STATE_START;
2101
2102                                 j = collect_a_seq(unicode_version, u8s,
2103                                     &ib, ibtail,
2104                                     is_it_toupper,
2105                                     is_it_tolower,
2106                                     canonical_decomposition,
2107                                     compatibility_decomposition,
2108                                     canonical_composition,
2109                                     errnum, &state);
2110
2111                                 if (*errnum && do_not_ignore_invalid) {
2112                                         ret_val = (size_t)-1;
2113                                         break;
2114                                 }
2115
2116                                 if ((obtail - ob) < j) {
2117                                         *errnum = E2BIG;
2118                                         ret_val = (size_t)-1;
2119                                         break;
2120                                 }
2121
2122                                 for (i = 0; i < j; i++)
2123                                         *ob++ = u8s[i];
2124                         }
2125                 }
2126         }
2127
2128         *inlen = ibtail - ib;
2129         *outlen = obtail - ob;
2130
2131         return (ret_val);
2132 }