lib/wind/utf8.c

   1 /*
   2  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
   3  * (Royal Institute of Technology, Stockholm, Sweden).
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the Institute nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 #include <config.h>
  35 #include "windlocl.h"
  36
  37 static int
  38 utf8toutf32(const unsigned char **pp, uint32_t *out)
  39 {
  40     const unsigned char *p = *pp;
  41     unsigned c = *p;
  42
  43     if (c & 0x80) {
  44         if ((c & 0xE0) == 0xC0) {
  45             const unsigned c2 = *++p;
  46             if ((c2 & 0xC0) == 0x80) {
  47                 *out =  ((c  & 0x1F) << 6)
  48                     | (c2 & 0x3F);
  49             } else {
  50                 return WIND_ERR_INVALID_UTF8;
  51             }
  52         } else if ((c & 0xF0) == 0xE0) {
  53             const unsigned c2 = *++p;
  54             if ((c2 & 0xC0) == 0x80) {
  55                 const unsigned c3 = *++p;
  56                 if ((c3 & 0xC0) == 0x80) {
  57                     *out =   ((c  & 0x0F) << 12)
  58                         | ((c2 & 0x3F) << 6)
  59                         |  (c3 & 0x3F);
  60                 } else {
  61                     return WIND_ERR_INVALID_UTF8;
  62                 }
  63             } else {
  64                 return WIND_ERR_INVALID_UTF8;
  65             }
  66         } else if ((c & 0xF8) == 0xF0) {
  67             const unsigned c2 = *++p;
  68             if ((c2 & 0xC0) == 0x80) {
  69                 const unsigned c3 = *++p;
  70                 if ((c3 & 0xC0) == 0x80) {
  71                     const unsigned c4 = *++p;
  72                     if ((c4 & 0xC0) == 0x80) {
  73                         *out =   ((c  & 0x07) << 18)
  74                             | ((c2 & 0x3F) << 12)
  75                             | ((c3 & 0x3F) <<  6)
  76                             |  (c4 & 0x3F);
  77                     } else {
  78                         return WIND_ERR_INVALID_UTF8;
  79                     }
  80                 } else {
  81                     return WIND_ERR_INVALID_UTF8;
  82                 }
  83             } else {
  84                 return WIND_ERR_INVALID_UTF8;
  85             }
  86         } else {
  87             return WIND_ERR_INVALID_UTF8;
  88         }
  89     } else {
  90         *out = c;
  91     }
  92
  93     *pp = p;
  94
  95     return 0;
  96 }
  97
  98 /**
  99  * Convert an UTF-8 string to an UCS4 string.
 100  *
 101  * @param in an UTF-8 string to convert.
 102  * @param out the resulting UCS4 strint, must be at least
 103  * wind_utf8ucs4_length() long.  If out is NULL, the function will
 104  * calculate the needed space for the out variable (just like
 105  * wind_utf8ucs4_length()).
 106  * @param out_len before processing out_len should be the length of
 107  * the out variable, after processing it will be the length of the out
 108  * string.
 109  *
 110  * @return returns 0 on success, an wind error code otherwise
 111  * @ingroup wind
 112  */
 113
 114 int
 115 wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
 116 {
 117     const unsigned char *p;
 118     size_t o = 0;
 119     int ret;
 120
 121     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 122         uint32_t u;
 123
 124         ret = utf8toutf32(&p, &u);
 125         if (ret)
 126             return ret;
 127
 128         if (out) {
 129             if (o >= *out_len)
 130                 return WIND_ERR_OVERRUN;
 131             out[o] = u;
 132         }
 133         o++;
 134     }
 135     *out_len = o;
 136     return 0;
 137 }
 138
 139 /**
 140  * Calculate the length of from converting a UTF-8 string to a UCS4
 141  * string.
 142  *
 143  * @param in an UTF-8 string to convert.
 144  * @param out_len the length of the resulting UCS4 string.
 145  *
 146  * @return returns 0 on success, an wind error code otherwise
 147  * @ingroup wind
 148  */
 149
 150 int
 151 wind_utf8ucs4_length(const char *in, size_t *out_len)
 152 {
 153     return wind_utf8ucs4(in, NULL, out_len);
 154 }
 155
 156 static const char first_char[4] =
 157     { 0x00, 0xC0, 0xE0, 0xF0 };
 158
 159 /**
 160  * Convert an UCS4 string to a UTF-8 string.
 161  *
 162  * @param in an UCS4 string to convert.
 163  * @param in_len the length input array.
 164
 165  * @param out the resulting UTF-8 strint, must be at least
 166  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
 167  * out is NULL, the function will calculate the needed space for the
 168  * out variable (just like wind_ucs4utf8_length()).
 169
 170  * @param out_len before processing out_len should be the length of
 171  * the out variable, after processing it will be the length of the out
 172  * string.
 173  *
 174  * @return returns 0 on success, an wind error code otherwise
 175  * @ingroup wind
 176  */
 177
 178 int
 179 wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
 180 {
 181     uint32_t ch;
 182     size_t i, len, o;
 183
 184     for (o = 0, i = 0; i < in_len; i++) {
 185         ch = in[i];
 186
 187         if (ch < 0x80) {
 188             len = 1;
 189         } else if (ch < 0x800) {
 190             len = 2;
 191         } else if (ch < 0x10000) {
 192             len = 3;
 193         } else if (ch <= 0x10FFFF) {
 194             len = 4;
 195         } else
 196             return WIND_ERR_INVALID_UTF32;
 197
 198         o += len;
 199
 200         if (out) {
 201             if (o >= *out_len)
 202                 return WIND_ERR_OVERRUN;
 203
 204             switch(len) {
 205             case 4:
 206                 out[3] = (ch | 0x80) & 0xbf;
 207                 ch = ch >> 6;
 208                 HEIM_FALLTHROUGH;
 209             case 3:
 210                 out[2] = (ch | 0x80) & 0xbf;
 211                 ch = ch >> 6;
 212                 HEIM_FALLTHROUGH;
 213             case 2:
 214                 out[1] = (ch | 0x80) & 0xbf;
 215                 ch = ch >> 6;
 216                 HEIM_FALLTHROUGH;
 217             case 1:
 218                 out[0] = ch | first_char[len - 1];
 219                 HEIM_FALLTHROUGH;
 220             default:
 221                 break;
 222             }
 223         }
 224         out += len;
 225     }
 226     if (out) {
 227         if (o + 1 >= *out_len)
 228             return WIND_ERR_OVERRUN;
 229         *out = '\0';
 230     }
 231     *out_len = o;
 232     return 0;
 233 }
 234
 235 /**
 236  * Calculate the length of from converting a UCS4 string to an UTF-8 string.
 237  *
 238  * @param in an UCS4 string to convert.
 239  * @param in_len the length of UCS4 string to convert.
 240  * @param out_len the length of the resulting UTF-8 string.
 241  *
 242  * @return returns 0 on success, an wind error code otherwise
 243  * @ingroup wind
 244  */
 245
 246 int
 247 wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
 248 {
 249     return wind_ucs4utf8(in, in_len, NULL, out_len);
 250 }
 251
 252 /**
 253  * Read in an UCS2 from a buffer.
 254  *
 255  * @param ptr The input buffer to read from.
 256  * @param len the length of the input buffer.
 257  * @param flags Flags to control the behavior of the function.
 258  * @param out the output UCS2, the array must be at least out/2 long.
 259  * @param out_len the output length
 260  *
 261  * @return returns 0 on success, an wind error code otherwise.
 262  * @ingroup wind
 263  */
 264
 265 int
 266 wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
 267               uint16_t *out, size_t *out_len)
 268 {
 269     const unsigned char *p = ptr;
 270     int little = ((*flags) & WIND_RW_LE);
 271     size_t olen = *out_len;
 272
 273     /** if len is zero, flags are unchanged */
 274     if (len == 0) {
 275         *out_len = 0;
 276         return 0;
 277     }
 278
 279     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
 280     if (len & 1)
 281         return WIND_ERR_LENGTH_NOT_MOD2;
 282
 283     /**
 284      * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
 285      * found, check is LE/BE flag is already and use that otherwise
 286      * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
 287      * the LE/BE flag and set the resulting LE/BE flag.
 288      */
 289     if ((*flags) & WIND_RW_BOM) {
 290         uint16_t bom = (p[0] << 8) + p[1];
 291         if (bom == 0xfffe || bom == 0xfeff) {
 292             little = (bom == 0xfffe);
 293             p += 2;
 294             len -= 2;
 295         } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
 296             /* little already set */
 297         } else
 298             return WIND_ERR_NO_BOM;
 299         *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
 300         *flags |= little ? WIND_RW_LE : WIND_RW_BE;
 301     }
 302
 303     while (len) {
 304         if (olen < 1)
 305             return WIND_ERR_OVERRUN;
 306         if (little)
 307             *out = (p[1] << 8) + p[0];
 308         else
 309             *out = (p[0] << 8) + p[1];
 310         out++; p += 2; len -= 2; olen--;
 311     }
 312     *out_len -= olen;
 313     return 0;
 314 }
 315
 316 /**
 317  * Write an UCS2 string to a buffer.
 318  *
 319  * @param in The input UCS2 string.
 320  * @param in_len the length of the input buffer.
 321  * @param flags Flags to control the behavior of the function.
 322  * @param ptr The input buffer to write to, the array must be at least
 323  * (in + 1) * 2 bytes long.
 324  * @param out_len the output length
 325  *
 326  * @return returns 0 on success, an wind error code otherwise.
 327  * @ingroup wind
 328  */
 329
 330 int
 331 wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
 332                void *ptr, size_t *out_len)
 333 {
 334     unsigned char *p = ptr;
 335     size_t len = *out_len;
 336
 337     /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
 338     if (len & 1)
 339         return WIND_ERR_LENGTH_NOT_MOD2;
 340
 341     /** On zero input length, flags are preserved */
 342     if (in_len == 0) {
 343         *out_len = 0;
 344         return 0;
 345     }
 346     /** If flags have WIND_RW_BOM set, the byte order mark is written
 347      * first to the output data */
 348     if ((*flags) & WIND_RW_BOM) {
 349         uint16_t bom = 0xfffe;
 350
 351         if (len < 2)
 352             return WIND_ERR_OVERRUN;
 353
 354         if ((*flags) & WIND_RW_LE) {
 355             p[0] = (bom     ) & 0xff;
 356             p[1] = (bom >> 8) & 0xff;
 357         } else {
 358             p[1] = (bom     ) & 0xff;
 359             p[0] = (bom >> 8) & 0xff;
 360         }
 361         len -= 2;
 362     }
 363
 364     while (in_len) {
 365         /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
 366         if (len < 2)
 367             return WIND_ERR_OVERRUN;
 368         if ((*flags) & WIND_RW_LE) {
 369             p[0] = (in[0]     ) & 0xff;
 370             p[1] = (in[0] >> 8) & 0xff;
 371         } else {
 372             p[1] = (in[0]     ) & 0xff;
 373             p[0] = (in[0] >> 8) & 0xff;
 374         }
 375         len -= 2;
 376         in_len--;
 377         p += 2;
 378         in++;
 379     }
 380     *out_len -= len;
 381     return 0;
 382 }
 383
 384
 385 /**
 386  * Convert an UTF-8 string to an UCS2 string.
 387  *
 388  * @param in an UTF-8 string to convert.
 389  * @param out the resulting UCS2 strint, must be at least
 390  * wind_utf8ucs2_length() long.  If out is NULL, the function will
 391  * calculate the needed space for the out variable (just like
 392  * wind_utf8ucs2_length()).
 393  * @param out_len before processing out_len should be the length of
 394  * the out variable, after processing it will be the length of the out
 395  * string.
 396  *
 397  * @return returns 0 on success, an wind error code otherwise
 398  * @ingroup wind
 399  */
 400
 401 int
 402 wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
 403 {
 404     const unsigned char *p;
 405     size_t o = 0;
 406     int ret;
 407
 408     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 409         uint32_t u;
 410
 411         ret = utf8toutf32(&p, &u);
 412         if (ret)
 413             return ret;
 414
 415         if (u & 0xffff0000)
 416             return WIND_ERR_NOT_UTF16;
 417
 418         if (out) {
 419             if (o >= *out_len)
 420                 return WIND_ERR_OVERRUN;
 421             out[o] = u;
 422         }
 423         o++;
 424     }
 425     *out_len = o;
 426     return 0;
 427 }
 428
 429 /**
 430  * Calculate the length of from converting a UTF-8 string to a UCS2
 431  * string.
 432  *
 433  * @param in an UTF-8 string to convert.
 434  * @param out_len the length of the resulting UCS4 string.
 435  *
 436  * @return returns 0 on success, an wind error code otherwise
 437  * @ingroup wind
 438  */
 439
 440 int
 441 wind_utf8ucs2_length(const char *in, size_t *out_len)
 442 {
 443     return wind_utf8ucs2(in, NULL, out_len);
 444 }
 445
 446 /**
 447  * Convert an UCS2 string to a UTF-8 string.
 448  *
 449  * @param in an UCS2 string to convert.
 450  * @param in_len the length of the in UCS2 string.
 451  * @param out the resulting UTF-8 strint, must be at least
 452  * wind_ucs2utf8_length() long.  If out is NULL, the function will
 453  * calculate the needed space for the out variable (just like
 454  * wind_ucs2utf8_length()).
 455  * @param out_len before processing out_len should be the length of
 456  * the out variable, after processing it will be the length of the out
 457  * string.
 458  *
 459  * @return returns 0 on success, an wind error code otherwise
 460  * @ingroup wind
 461  */
 462
 463 int
 464 wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
 465 {
 466     uint16_t ch;
 467     size_t i, len, o;
 468
 469     for (o = 0, i = 0; i < in_len; i++) {
 470         ch = in[i];
 471
 472         if (ch < 0x80) {
 473             len = 1;
 474         } else if (ch < 0x800) {
 475             len = 2;
 476         } else
 477             len = 3;
 478
 479         o += len;
 480
 481         if (out) {
 482             if (o >= *out_len)
 483                 return WIND_ERR_OVERRUN;
 484
 485             switch(len) {
 486             case 3:
 487                 out[2] = (ch | 0x80) & 0xbf;
 488                 ch = ch >> 6;
 489                 HEIM_FALLTHROUGH;
 490             case 2:
 491                 out[1] = (ch | 0x80) & 0xbf;
 492                 ch = ch >> 6;
 493                 HEIM_FALLTHROUGH;
 494             case 1:
 495                 out[0] = ch | first_char[len - 1];
 496                 HEIM_FALLTHROUGH;
 497             default:
 498                 break;
 499             }
 500             out += len;
 501         }
 502     }
 503     if (out) {
 504         if (o >= *out_len)
 505             return WIND_ERR_OVERRUN;
 506         *out = '\0';
 507     }
 508     *out_len = o;
 509     return 0;
 510 }
 511
 512 /**
 513  * Calculate the length of from converting a UCS2 string to an UTF-8 string.
 514  *
 515  * @param in an UCS2 string to convert.
 516  * @param in_len an UCS2 string length to convert.
 517  * @param out_len the length of the resulting UTF-8 string.
 518  *
 519  * @return returns 0 on success, an wind error code otherwise
 520  * @ingroup wind
 521  */
 522
 523 int
 524 wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
 525 {
 526     return wind_ucs2utf8(in, in_len, NULL, out_len);
 527 }