sys/libiconv/iconv_ucs.c

   1 /*-
   2  * Copyright (c) 2003, 2005 Ryuichiro Imura
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD: head/sys/libkern/iconv_ucs.c 267291 2014-06-09 19:27:47Z jhb $
  27  */
  28
  29 #include <sys/param.h>
  30 #include <sys/kernel.h>
  31 #include <sys/systm.h>
  32 #include <sys/malloc.h>
  33 #include <sys/iconv.h>
  34
  35 #include "iconv_converter_if.h"
  36
  37 /*
  38  * "UCS" converter
  39  */
  40
  41 #define KICONV_UCS_COMBINE      0x1
  42 #define KICONV_UCS_FROM_UTF8    0x2
  43 #define KICONV_UCS_TO_UTF8      0x4
  44 #define KICONV_UCS_FROM_LE      0x8
  45 #define KICONV_UCS_TO_LE        0x10
  46 #define KICONV_UCS_FROM_UTF16   0x20
  47 #define KICONV_UCS_TO_UTF16     0x40
  48 #define KICONV_UCS_UCS4         0x80
  49
  50 #define ENCODING_UTF16  "UTF-16BE"
  51 #define ENCODING_UTF8   "UTF-8"
  52
  53 static struct {
  54         const char *name;
  55         int from_flag, to_flag;
  56 } unicode_family[] = {
  57         { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
  58         { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
  59         { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
  60         { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
  61             KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
  62         { NULL,         0,      0 }
  63 };
  64
  65 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
  66 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
  67 static uint32_t encode_surrogate(uint32_t code);
  68 static uint32_t decode_surrogate(const u_char *ucs);
  69
  70 #ifdef MODULE_DEPEND
  71 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
  72 #endif
  73
  74 /*
  75  * UCS converter instance
  76  */
  77 struct iconv_ucs {
  78         KOBJ_FIELDS;
  79         int                     convtype;
  80         struct iconv_cspair *   d_csp;
  81         struct iconv_cspair *   d_cspf;
  82         void *                  f_ctp;
  83         void *                  t_ctp;
  84         void *                  ctype;
  85 };
  86
  87 static int
  88 iconv_ucs_open(struct iconv_converter_class *dcp,
  89         struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
  90 {
  91         struct iconv_ucs *dp;
  92         int i;
  93         const char *from, *to;
  94
  95         dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
  96         to = csp->cp_to;
  97         from = cspf ? cspf->cp_from : csp->cp_from;
  98
  99         dp->convtype = 0;
 100
 101         if (cspf)
 102                 dp->convtype |= KICONV_UCS_COMBINE;
 103         for (i = 0; unicode_family[i].name; i++) {
 104                 if (strcasecmp(from, unicode_family[i].name) == 0)
 105                         dp->convtype |= unicode_family[i].from_flag;
 106                 if (strcasecmp(to, unicode_family[i].name) == 0)
 107                         dp->convtype |= unicode_family[i].to_flag;
 108         }
 109         if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
 110                 dp->convtype |= KICONV_UCS_UCS4;
 111         else
 112                 dp->convtype &= ~KICONV_UCS_UCS4;
 113
 114         dp->f_ctp = dp->t_ctp = NULL;
 115         if (dp->convtype & KICONV_UCS_COMBINE) {
 116                 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
 117                     (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
 118                         iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
 119                 }
 120                 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
 121                     (dp->convtype & KICONV_UCS_TO_LE) == 0) {
 122                         iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
 123                 }
 124         }
 125
 126         dp->ctype = NULL;
 127         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
 128                 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
 129
 130         dp->d_csp = csp;
 131         if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
 132                 if (cspf) {
 133                         dp->d_cspf = cspf;
 134                         cspf->cp_refcount++;
 135                 } else
 136                         csp->cp_refcount++;
 137         }
 138         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
 139                 csp->cp_refcount++;
 140         *dpp = (void*)dp;
 141         return 0;
 142 }
 143
 144 static int
 145 iconv_ucs_close(void *data)
 146 {
 147         struct iconv_ucs *dp = data;
 148
 149         if (dp->f_ctp)
 150                 iconv_close(dp->f_ctp);
 151         if (dp->t_ctp)
 152                 iconv_close(dp->t_ctp);
 153         if (dp->ctype)
 154                 iconv_close(dp->ctype);
 155         if (dp->d_cspf)
 156                 dp->d_cspf->cp_refcount--;
 157         else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
 158                 dp->d_csp->cp_refcount--;
 159         if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
 160                 dp->d_csp->cp_refcount--;
 161         kobj_delete((struct kobj*)data, M_ICONV);
 162         return 0;
 163 }
 164
 165 static int
 166 iconv_ucs_conv(void *d2p, const char **inbuf,
 167         size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
 168         int convchar, int casetype)
 169 {
 170         struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
 171         int ret = 0, i;
 172         size_t in, on, ir, or, inlen, outlen, ucslen;
 173         const char *src, *p;
 174         char *dst;
 175         u_char ucs[4], *q;
 176         uint32_t code;
 177
 178         if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
 179                 return 0;
 180         ir = in = *inbytesleft;
 181         or = on = *outbytesleft;
 182         src = *inbuf;
 183         dst = *outbuf;
 184
 185         while (ir > 0 && or > 0) {
 186
 187                 /*
 188                  * The first half of conversion.
 189                  * (convert any code into ENCODING_UNICODE)
 190                  */
 191                 code = 0;
 192                 p = src;
 193                 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
 194                         /* convert UTF-8 to ENCODING_UNICODE */
 195                         inlen = 0;
 196                         code = utf8_to_ucs4(p, &inlen, ir);
 197                         if (code == 0) {
 198                                 ret = -1;
 199                                 break;
 200                         }
 201
 202                         if (casetype == KICONV_FROM_LOWER && dp->ctype) {
 203                                 code = towlower(code, dp->ctype);
 204                         } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
 205                                 code = towupper(code, dp->ctype);
 206                         }
 207
 208                         if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
 209                                 /* reserved for utf-16 surrogate pair */
 210                                 /* invalid unicode */
 211                                 ret = -1;
 212                                 break;
 213                         }
 214
 215                         if (inlen == 4) {
 216                                 if (dp->convtype & KICONV_UCS_UCS4) {
 217                                         ucslen = 4;
 218                                         code = encode_surrogate(code);
 219                                 } else {
 220                                         /* can't handle with ucs-2 */
 221                                         ret = -1;
 222                                         break;
 223                                 }
 224                         } else {
 225                                 ucslen = 2;
 226                         }
 227
 228                         /* save UCS-4 into ucs[] */
 229                         for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
 230                                 *q++ = (code >> (i << 3)) & 0xff;
 231
 232                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
 233                         /* convert local code to ENCODING_UNICODE */
 234                         ucslen = 4;
 235                         inlen = ir;
 236                         q = ucs;
 237                         ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
 238                             &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
 239                         if (ret)
 240                                 break;
 241                         inlen = ir - inlen;
 242                         ucslen = 4 - ucslen;
 243
 244                 } else {
 245                         /* src code is a proper subset of ENCODING_UNICODE */
 246                         q = ucs;
 247                         if (dp->convtype & KICONV_UCS_FROM_LE) {
 248                                 *q = *(p + 1);
 249                                 *(q + 1) = *p;
 250                                 p += 2;
 251                         } else {
 252                                 *q = *p++;
 253                                 *(q + 1) = *p++;
 254                         }
 255                         if ((*q & 0xfc) == 0xd8) {
 256                                 if (dp->convtype & KICONV_UCS_UCS4 &&
 257                                     dp->convtype & KICONV_UCS_FROM_UTF16) {
 258                                         inlen = ucslen = 4;
 259                                 } else {
 260                                         /* invalid unicode */
 261                                         ret = -1;
 262                                         break;
 263                                 }
 264                         } else {
 265                                 inlen = ucslen = 2;
 266                         }
 267                         if (ir < inlen) {
 268                                 ret = -1;
 269                                 break;
 270                         }
 271                         if (ucslen == 4) {
 272                                 q += 2;
 273                                 if (dp->convtype & KICONV_UCS_FROM_LE) {
 274                                         *q = *(p + 1);
 275                                         *(q + 1) = *p;
 276                                 } else {
 277                                         *q = *p++;
 278                                         *(q + 1) = *p;
 279                                 }
 280                                 if ((*q & 0xfc) != 0xdc) {
 281                                         /* invalid unicode */
 282                                         ret = -1;
 283                                         break;
 284                                 }
 285                         }
 286                 }
 287
 288                 /*
 289                  * The second half of conversion.
 290                  * (convert ENCODING_UNICODE into any code)
 291                  */
 292                 p = ucs;
 293                 if (dp->convtype & KICONV_UCS_TO_UTF8) {
 294                         q = (u_char *)dst;
 295                         if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
 296                                 /* decode surrogate pair */
 297                                 code = decode_surrogate(p);
 298                         } else {
 299                                 code = (ucs[0] << 8) | ucs[1];
 300                         }
 301
 302                         if (casetype == KICONV_LOWER && dp->ctype) {
 303                                 code = towlower(code, dp->ctype);
 304                         } else if (casetype == KICONV_UPPER && dp->ctype) {
 305                                 code = towupper(code, dp->ctype);
 306                         }
 307
 308                         outlen = 0;
 309                         if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
 310                                 ret = -1;
 311                                 break;
 312                         }
 313
 314                         src += inlen;
 315                         ir -= inlen;
 316                         dst += outlen;
 317                         or -= outlen;
 318
 319                 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
 320                         ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
 321                             &or, casetype & (KICONV_LOWER | KICONV_UPPER));
 322                         if (ret)
 323                                 break;
 324
 325                         src += inlen;
 326                         ir -= inlen;
 327
 328                 } else {
 329                         /* dst code is a proper subset of ENCODING_UNICODE */
 330                         if (or < ucslen) {
 331                                 ret = -1;
 332                                 break;
 333                         }
 334                         src += inlen;
 335                         ir -= inlen;
 336                         or -= ucslen;
 337                         if (dp->convtype & KICONV_UCS_TO_LE) {
 338                                 *dst++ = *(p + 1);
 339                                 *dst++ = *p;
 340                                 p += 2;
 341                         } else {
 342                                 *dst++ = *p++;
 343                                 *dst++ = *p++;
 344                         }
 345                         if (ucslen == 4) {
 346                                 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
 347                                     (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
 348                                         ret = -1;
 349                                         break;
 350                                 }
 351                                 if (dp->convtype & KICONV_UCS_TO_LE) {
 352                                         *dst++ = *(p + 1);
 353                                         *dst++ = *p;
 354                                 } else {
 355                                         *dst++ = *p++;
 356                                         *dst++ = *p;
 357                                 }
 358                         }
 359                 }
 360
 361                 if (convchar == 1)
 362                         break;
 363         }
 364
 365         *inbuf += in - ir;
 366         *outbuf += on - or;
 367         *inbytesleft -= in - ir;
 368         *outbytesleft -= on - or;
 369         return (ret);
 370 }
 371
 372 static int
 373 iconv_ucs_init(struct iconv_converter_class *dcp)
 374 {
 375         int error;
 376
 377         error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
 378         if (error)
 379                 return (error);
 380         error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
 381         if (error)
 382                 return (error);
 383         return (0);
 384 }
 385
 386 static int
 387 iconv_ucs_done(struct iconv_converter_class *dcp)
 388 {
 389         return (0);
 390 }
 391
 392 static const char *
 393 iconv_ucs_name(struct iconv_converter_class *dcp)
 394 {
 395         return (ENCODING_UNICODE);
 396 }
 397
 398 static kobj_method_t iconv_ucs_methods[] = {
 399         KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
 400         KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
 401         KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
 402         KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
 403         KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
 404         KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
 405         KOBJMETHOD_END
 406 };
 407
 408 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
 409
 410 static uint32_t
 411 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
 412 {
 413         size_t i, w = 0;
 414         uint32_t ucs4 = 0;
 415
 416         /*
 417          * get leading 1 byte from utf-8
 418          */
 419         if ((*src & 0x80) == 0) {
 420                 /*
 421                  * leading 1 bit is "0"
 422                  *  utf-8: 0xxxxxxx
 423                  *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
 424                  */
 425                 w = 1;
 426                 /* get trailing 7 bits */
 427                 ucs4 = *src & 0x7f;
 428         } else if ((*src & 0xe0) == 0xc0) {
 429                 /*
 430                  * leading 3 bits are "110"
 431                  *  utf-8: 110xxxxx 10yyyyyy
 432                  *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
 433                  */
 434                 w = 2;
 435                 /* get trailing 5 bits */
 436                 ucs4 = *src & 0x1f;
 437         } else if ((*src & 0xf0) == 0xe0) {
 438                 /*
 439                  * leading 4 bits are "1110"
 440                  *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
 441                  *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
 442                  */
 443                 w = 3;
 444                 /* get trailing 4 bits */
 445                 ucs4 = *src & 0x0f;
 446         } else if ((*src & 0xf8) == 0xf0) {
 447                 /*
 448                  * leading 5 bits are "11110"
 449                  *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
 450                  *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
 451                  */
 452                 w = 4;
 453                 /* get trailing 3 bits */
 454                 ucs4 = *src & 0x07;
 455         } else {
 456                 /* out of utf-16 range or having illegal bits */
 457                 return (0);
 458         }
 459
 460         if (srclen < w)
 461                 return (0);
 462
 463         /*
 464          * get left parts from utf-8
 465          */
 466         for (i = 1 ; i < w ; i++) {
 467                 if ((*(src + i) & 0xc0) != 0x80) {
 468                         /* invalid: leading 2 bits are not "10" */
 469                         return (0);
 470                 }
 471                 /* concatenate trailing 6 bits into ucs4 */
 472                 ucs4 <<= 6;
 473                 ucs4 |= *(src + i) & 0x3f;
 474         }
 475
 476         *utf8width = w;
 477         return (ucs4);
 478 }
 479
 480 static u_char *
 481 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
 482 {
 483         u_char lead, *p;
 484         size_t i, w;
 485
 486         /*
 487          * determine utf-8 width and leading bits
 488          */
 489         if (ucs4 < 0x80) {
 490                 w = 1;
 491                 lead = 0;       /* "0" */
 492         } else if (ucs4 < 0x800) {
 493                 w = 2;
 494                 lead = 0xc0;    /* "11" */
 495         } else if (ucs4 < 0x10000) {
 496                 w = 3;
 497                 lead = 0xe0;    /* "111" */
 498         } else if (ucs4 < 0x200000) {
 499                 w = 4;
 500                 lead = 0xf0;    /* "1111" */
 501         } else {
 502                 return (NULL);
 503         }
 504
 505         if (dstlen < w)
 506                 return (NULL);
 507
 508         /*
 509          * construct utf-8
 510          */
 511         p = dst;
 512         for (i = w - 1 ; i >= 1 ; i--) {
 513                 /* get trailing 6 bits and put it with leading bit as "1" */
 514                 *(p + i) = (ucs4 & 0x3f) | 0x80;
 515                 ucs4 >>= 6;
 516         }
 517         *p = ucs4 | lead;
 518
 519         *utf8width = w;
 520
 521         return (p);
 522 }
 523
 524 static uint32_t
 525 encode_surrogate(register uint32_t code)
 526 {
 527         return ((((code - 0x10000) << 6) & 0x3ff0000) |
 528             ((code - 0x10000) & 0x3ff) | 0xd800dc00);
 529 }
 530
 531 static uint32_t
 532 decode_surrogate(register const u_char *ucs)
 533 {
 534         return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
 535             ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
 536 }
 537