lib/wind/utf8.c

   1 /*
   2  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
   3  * (Royal Institute of Technology, Stockholm, Sweden).
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the Institute nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 #include <config.h>
  35 #include "windlocl.h"
  36
  37 static int
  38 utf8toutf32(const unsigned char **pp, uint32_t *out)
  39 {
  40     const unsigned char *p = *pp;
  41     unsigned c = *p;
  42
  43     if (c & 0x80) {
  44         if ((c & 0xE0) == 0xC0) {
  45             const unsigned c2 = *++p;
  46             if ((c2 & 0xC0) == 0x80) {
  47                 *out =  ((c  & 0x1F) << 6)
  48                     | (c2 & 0x3F);
  49             } else {
  50                 return WIND_ERR_INVALID_UTF8;
  51             }
  52         } else if ((c & 0xF0) == 0xE0) {
  53             const unsigned c2 = *++p;
  54             if ((c2 & 0xC0) == 0x80) {
  55                 const unsigned c3 = *++p;
  56                 if ((c3 & 0xC0) == 0x80) {
  57                     *out =   ((c  & 0x0F) << 12)
  58                         | ((c2 & 0x3F) << 6)
  59                         |  (c3 & 0x3F);
  60                 } else {
  61                     return WIND_ERR_INVALID_UTF8;
  62                 }
  63             } else {
  64                 return WIND_ERR_INVALID_UTF8;
  65             }
  66         } else if ((c & 0xF8) == 0xF0) {
  67             const unsigned c2 = *++p;
  68             if ((c2 & 0xC0) == 0x80) {
  69                 const unsigned c3 = *++p;
  70                 if ((c3 & 0xC0) == 0x80) {
  71                     const unsigned c4 = *++p;
  72                     if ((c4 & 0xC0) == 0x80) {
  73                         *out =   ((c  & 0x07) << 18)
  74                             | ((c2 & 0x3F) << 12)
  75                             | ((c3 & 0x3F) <<  6)
  76                             |  (c4 & 0x3F);
  77                     } else {
  78                         return WIND_ERR_INVALID_UTF8;
  79                     }
  80                 } else {
  81                     return WIND_ERR_INVALID_UTF8;
  82                 }
  83             } else {
  84                 return WIND_ERR_INVALID_UTF8;
  85             }
  86         } else {
  87             return WIND_ERR_INVALID_UTF8;
  88         }
  89     } else {
  90         *out = c;
  91     }
  92
  93     *pp = p;
  94
  95     return 0;
  96 }
  97
  98 /**
  99  * Convert an UTF-8 string to an UCS4 string.
 100  *
 101  * @param in an UTF-8 string to convert.
 102  * @param out the resulting UCS4 strint, must be at least
 103  * wind_utf8ucs4_length() long.  If out is NULL, the function will
 104  * calculate the needed space for the out variable (just like
 105  * wind_utf8ucs4_length()).
 106  * @param out_len before processing out_len should be the length of
 107  * the out variable, after processing it will be the length of the out
 108  * string.
 109  *
 110  * @return returns 0 on success, an wind error code otherwise
 111  * @ingroup wind
 112  */
 113
 114 int
 115 wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
 116 {
 117     const unsigned char *p;
 118     size_t o = 0;
 119     int ret;
 120
 121     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 122         uint32_t u;
 123
 124         ret = utf8toutf32(&p, &u);
 125         if (ret)
 126             return ret;
 127
 128         if (out) {
 129             if (o >= *out_len)
 130                 return WIND_ERR_OVERRUN;
 131             out[o] = u;
 132         }
 133         o++;
 134     }
 135     *out_len = o;
 136     return 0;
 137 }
 138
 139 /**
 140  * Calculate the length of from converting a UTF-8 string to a UCS4
 141  * string.
 142  *
 143  * @param in an UTF-8 string to convert.
 144  * @param out_len the length of the resulting UCS4 string.
 145  *
 146  * @return returns 0 on success, an wind error code otherwise
 147  * @ingroup wind
 148  */
 149
 150 int
 151 wind_utf8ucs4_length(const char *in, size_t *out_len)
 152 {
 153     return wind_utf8ucs4(in, NULL, out_len);
 154 }
 155
 156 static const char first_char[4] =
 157     { 0x00, 0xC0, 0xE0, 0xF0 };
 158
 159 /**
 160  * Convert an UCS4 string to a UTF-8 string.
 161  *
 162  * @param in an UCS4 string to convert.
 163  * @param in_len the length input array.
 164
 165  * @param out the resulting UTF-8 strint, must be at least
 166  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
 167  * out is NULL, the function will calculate the needed space for the
 168  * out variable (just like wind_ucs4utf8_length()).
 169
 170  * @param out_len before processing out_len should be the length of
 171  * the out variable, after processing it will be the length of the out
 172  * string.
 173  *
 174  * @return returns 0 on success, an wind error code otherwise
 175  * @ingroup wind
 176  */
 177
 178 int
 179 wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
 180 {
 181     uint32_t ch;
 182     size_t i, len, o;
 183
 184     for (o = 0, i = 0; i < in_len; i++) {
 185         ch = in[i];
 186
 187         if (ch < 0x80) {
 188             len = 1;
 189         } else if (ch < 0x800) {
 190             len = 2;
 191         } else if (ch < 0x10000) {
 192             len = 3;
 193         } else if (ch <= 0x10FFFF) {
 194             len = 4;
 195         } else
 196             return WIND_ERR_INVALID_UTF32;
 197
 198         o += len;
 199
 200         if (out) {
 201             if (o >= *out_len)
 202                 return WIND_ERR_OVERRUN;
 203
 204             switch(len) {
 205             case 4:
 206                 out[3] = (ch | 0x80) & 0xbf;
 207                 ch = ch << 6;
 208             case 3:
 209                 out[2] = (ch | 0x80) & 0xbf;
 210                 ch = ch << 6;
 211             case 2:
 212                 out[1] = (ch | 0x80) & 0xbf;
 213                 ch = ch << 6;
 214             case 1:
 215                 out[0] = ch | first_char[len - 1];
 216             }
 217         }
 218         out += len;
 219     }
 220     if (out) {
 221         if (o + 1 >= *out_len)
 222             return WIND_ERR_OVERRUN;
 223         *out = '\0';
 224     }
 225     *out_len = o;
 226     return 0;
 227 }
 228
 229 /**
 230  * Calculate the length of from converting a UCS4 string to an UTF-8 string.
 231  *
 232  * @param in an UCS4 string to convert.
 233  * @param in_len the length of UCS4 string to convert.
 234  * @param out_len the length of the resulting UTF-8 string.
 235  *
 236  * @return returns 0 on success, an wind error code otherwise
 237  * @ingroup wind
 238  */
 239
 240 int
 241 wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
 242 {
 243     return wind_ucs4utf8(in, in_len, NULL, out_len);
 244 }
 245
 246 /**
 247  * Read in an UCS2 from a buffer.
 248  *
 249  * @param ptr The input buffer to read from.
 250  * @param len the length of the input buffer.
 251  * @param flags Flags to control the behavior of the function.
 252  * @param out the output UCS2, the array must be at least out/2 long.
 253  * @param out_len the output length
 254  *
 255  * @return returns 0 on success, an wind error code otherwise.
 256  * @ingroup wind
 257  */
 258
 259 int
 260 wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
 261               uint16_t *out, size_t *out_len)
 262 {
 263     const unsigned char *p = ptr;
 264     int little = ((*flags) & WIND_RW_LE);
 265     size_t olen = *out_len;
 266
 267     /** if len is zero, flags are unchanged */
 268     if (len == 0) {
 269         *out_len = 0;
 270         return 0;
 271     }
 272
 273     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
 274     if (len & 1)
 275         return WIND_ERR_LENGTH_NOT_MOD2;
 276
 277     /**
 278      * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
 279      * found, check is LE/BE flag is already and use that otherwise
 280      * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
 281      * the LE/BE flag and set the resulting LE/BE flag.
 282      */
 283     if ((*flags) & WIND_RW_BOM) {
 284         uint16_t bom = (p[0] << 8) + p[1];
 285         if (bom == 0xfffe || bom == 0xfeff) {
 286             little = (bom == 0xfffe);
 287             p += 2;
 288             len -= 2;
 289         } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
 290             /* little already set */
 291         } else
 292             return WIND_ERR_NO_BOM;
 293         *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
 294         *flags |= little ? WIND_RW_LE : WIND_RW_BE;
 295     }
 296
 297     while (len) {
 298         if (olen < 1)
 299             return WIND_ERR_OVERRUN;
 300         if (little)
 301             *out = (p[1] << 8) + p[0];
 302         else
 303             *out = (p[0] << 8) + p[1];
 304         out++; p += 2; len -= 2; olen--;
 305     }
 306     *out_len -= olen;
 307     return 0;
 308 }
 309
 310 /**
 311  * Write an UCS2 string to a buffer.
 312  *
 313  * @param in The input UCS2 string.
 314  * @param in_len the length of the input buffer.
 315  * @param flags Flags to control the behavior of the function.
 316  * @param ptr The input buffer to write to, the array must be at least
 317  * (in + 1) * 2 bytes long.
 318  * @param out_len the output length
 319  *
 320  * @return returns 0 on success, an wind error code otherwise.
 321  * @ingroup wind
 322  */
 323
 324 int
 325 wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
 326                void *ptr, size_t *out_len)
 327 {
 328     unsigned char *p = ptr;
 329     size_t len = *out_len;
 330
 331     /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
 332     if (len & 1)
 333         return WIND_ERR_LENGTH_NOT_MOD2;
 334
 335     /** On zero input length, flags are preserved */
 336     if (in_len == 0) {
 337         *out_len = 0;
 338         return 0;
 339     }
 340     /** If flags have WIND_RW_BOM set, the byte order mark is written
 341      * first to the output data */
 342     if ((*flags) & WIND_RW_BOM) {
 343         uint16_t bom = 0xfffe;
 344
 345         if (len < 2)
 346             return WIND_ERR_OVERRUN;
 347
 348         if ((*flags) & WIND_RW_LE) {
 349             p[0] = (bom >> 8) & 0xff;
 350             p[1] = (bom     ) & 0xff;
 351         } else {
 352             p[1] = (bom     ) & 0xff;
 353             p[0] = (bom >> 8) & 0xff;
 354         }
 355         len -= 2;
 356     }
 357
 358     while (in_len) {
 359         /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
 360         if (len < 2)
 361             return WIND_ERR_OVERRUN;
 362         if ((*flags) & WIND_RW_LE) {
 363             p[0] = (in[0] >> 8) & 0xff;
 364             p[1] = (in[0]     ) & 0xff;
 365         } else {
 366             p[1] = (in[0]     ) & 0xff;
 367             p[0] = (in[0] >> 8) & 0xff;
 368         }
 369         len -= 2;
 370         in_len--;
 371         p += 2;
 372         in++;
 373     }
 374     *out_len -= len;
 375     return 0;
 376 }
 377
 378
 379 /**
 380  * Convert an UTF-8 string to an UCS2 string.
 381  *
 382  * @param in an UTF-8 string to convert.
 383  * @param out the resulting UCS2 strint, must be at least
 384  * wind_utf8ucs2_length() long.  If out is NULL, the function will
 385  * calculate the needed space for the out variable (just like
 386  * wind_utf8ucs2_length()).
 387  * @param out_len before processing out_len should be the length of
 388  * the out variable, after processing it will be the length of the out
 389  * string.
 390  *
 391  * @return returns 0 on success, an wind error code otherwise
 392  * @ingroup wind
 393  */
 394
 395 int
 396 wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
 397 {
 398     const unsigned char *p;
 399     size_t o = 0;
 400     int ret;
 401
 402     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
 403         uint32_t u;
 404
 405         ret = utf8toutf32(&p, &u);
 406         if (ret)
 407             return ret;
 408
 409         if (u & 0xffff0000)
 410             return WIND_ERR_NOT_UTF16;
 411
 412         if (out) {
 413             if (o >= *out_len)
 414                 return WIND_ERR_OVERRUN;
 415             out[o] = u;
 416         }
 417         o++;
 418     }
 419     *out_len = o;
 420     return 0;
 421 }
 422
 423 /**
 424  * Calculate the length of from converting a UTF-8 string to a UCS2
 425  * string.
 426  *
 427  * @param in an UTF-8 string to convert.
 428  * @param out_len the length of the resulting UCS4 string.
 429  *
 430  * @return returns 0 on success, an wind error code otherwise
 431  * @ingroup wind
 432  */
 433
 434 int
 435 wind_utf8ucs2_length(const char *in, size_t *out_len)
 436 {
 437     return wind_utf8ucs2(in, NULL, out_len);
 438 }
 439
 440 /**
 441  * Convert an UCS2 string to a UTF-8 string.
 442  *
 443  * @param in an UCS2 string to convert.
 444  * @param in_len the length of the in UCS2 string.
 445  * @param out the resulting UTF-8 strint, must be at least
 446  * wind_ucs2utf8_length() long.  If out is NULL, the function will
 447  * calculate the needed space for the out variable (just like
 448  * wind_ucs2utf8_length()).
 449  * @param out_len before processing out_len should be the length of
 450  * the out variable, after processing it will be the length of the out
 451  * string.
 452  *
 453  * @return returns 0 on success, an wind error code otherwise
 454  * @ingroup wind
 455  */
 456
 457 int
 458 wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
 459 {
 460     uint16_t ch;
 461     size_t i, len, o;
 462
 463     for (o = 0, i = 0; i < in_len; i++) {
 464         ch = in[i];
 465
 466         if (ch < 0x80) {
 467             len = 1;
 468         } else if (ch < 0x800) {
 469             len = 2;
 470         } else
 471             len = 3;
 472
 473         o += len;
 474
 475         if (out) {
 476             if (o >= *out_len)
 477                 return WIND_ERR_OVERRUN;
 478
 479             switch(len) {
 480             case 3:
 481                 out[2] = (ch | 0x80) & 0xbf;
 482                 ch = ch << 6;
 483             case 2:
 484                 out[1] = (ch | 0x80) & 0xbf;
 485                 ch = ch << 6;
 486             case 1:
 487                 out[0] = ch | first_char[len - 1];
 488             }
 489             out += len;
 490         }
 491     }
 492     if (out) {
 493         if (o >= *out_len)
 494             return WIND_ERR_OVERRUN;
 495         *out = '\0';
 496     }
 497     *out_len = o;
 498     return 0;
 499 }
 500
 501 /**
 502  * Calculate the length of from converting a UCS2 string to an UTF-8 string.
 503  *
 504  * @param in an UCS2 string to convert.
 505  * @param in_len an UCS2 string length to convert.
 506  * @param out_len the length of the resulting UTF-8 string.
 507  *
 508  * @return returns 0 on success, an wind error code otherwise
 509  * @ingroup wind
 510  */
 511
 512 int
 513 wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
 514 {
 515     return wind_ucs2utf8(in, in_len, NULL, out_len);
 516 }