usr/src/lib/libsmbfs/smb/utf_str.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*
  28  * Unicode conversions (yet more)
  29  */
  30
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <errno.h>
  35 #include <iconv.h>
  36 #include <libintl.h>
  37
  38 #include <sys/u8_textprep.h>
  39
  40 #include <netsmb/smb_lib.h>
  41 #include "charsets.h"
  42
  43
  44 /*
  45  * Number of unicode symbols in the string,
  46  * not including the 2-byte null terminator.
  47  * (multiply by two for storage size)
  48  */
  49 size_t
  50 unicode_strlen(const uint16_t *us)
  51 {
  52         size_t len = 0;
  53         while (*us++)
  54                 len++;
  55         return (len);
  56 }
  57
  58 static char *convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);
  59
  60 /*
  61  * Convert (native) Unicode string to UTF-8.
  62  * Returns allocated memory.
  63  */
  64 char *
  65 convert_unicode_to_utf8(uint16_t *us)
  66 {
  67         static iconv_t cd1 = (iconv_t)-1;
  68
  69         /* Get conversion descriptor (to, from) */
  70         if (cd1 == (iconv_t)-1)
  71                 cd1 = iconv_open("UTF-8", "UCS-2");
  72
  73         return (convert_ucs2xx_to_utf8(cd1, us));
  74 }
  75
  76 /*
  77  * Convert little-endian Unicode string to UTF-8.
  78  * Returns allocated memory.
  79  */
  80 char *
  81 convert_leunicode_to_utf8(unsigned short *us)
  82 {
  83         static iconv_t cd2 = (iconv_t)-1;
  84
  85         /* Get conversion descriptor (to, from) */
  86         if (cd2 == (iconv_t)-1)
  87                 cd2 = iconv_open("UTF-8", "UCS-2LE");
  88
  89         return (convert_ucs2xx_to_utf8(cd2, us));
  90 }
  91
  92 static char *
  93 convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
  94 {
  95         char *obuf, *optr;
  96         const char *iptr;
  97         size_t  ileft, obsize, oleft, ret;
  98
  99         if (cd == (iconv_t)-1) {
 100                 smb_error(dgettext(TEXT_DOMAIN,
 101                     "iconv_open(UTF-8/UCS-2)"), -1);
 102                 return (NULL);
 103         }
 104
 105         iptr = (const char *)us;
 106         ileft = unicode_strlen(us);
 107         ileft *= 2; /* now bytes */
 108
 109         /* Worst-case output size is 2x input size. */
 110         oleft = ileft * 2;
 111         obsize = oleft + 2; /* room for null */
 112         obuf = malloc(obsize);
 113         if (!obuf)
 114                 return (NULL);
 115         optr = obuf;
 116
 117         ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
 118         *optr = '\0';
 119         if (ret == (size_t)-1) {
 120                 smb_error(dgettext(TEXT_DOMAIN,
 121                     "iconv(%s) failed"), errno, obuf);
 122         }
 123         if (ileft) {
 124                 smb_error(dgettext(TEXT_DOMAIN,
 125                     "iconv(%s) failed"), -1, obuf);
 126                 /*
 127                  * XXX: What's better?  return NULL?
 128                  * The truncated string? << for now
 129                  */
 130         }
 131
 132         return (obuf);
 133 }
 134
 135 static uint16_t *convert_utf8_to_ucs2xx(iconv_t, const char *);
 136
 137 /*
 138  * Convert UTF-8 string to Unicode.
 139  * Returns allocated memory.
 140  */
 141 uint16_t *
 142 convert_utf8_to_unicode(const char *utf8_string)
 143 {
 144         static iconv_t cd3 = (iconv_t)-1;
 145
 146         /* Get conversion descriptor (to, from) */
 147         if (cd3 == (iconv_t)-1)
 148                 cd3 = iconv_open("UCS-2", "UTF-8");
 149         return (convert_utf8_to_ucs2xx(cd3, utf8_string));
 150 }
 151
 152 /*
 153  * Convert UTF-8 string to little-endian Unicode.
 154  * Returns allocated memory.
 155  */
 156 uint16_t *
 157 convert_utf8_to_leunicode(const char *utf8_string)
 158 {
 159         static iconv_t cd4 = (iconv_t)-1;
 160
 161         /* Get conversion descriptor (to, from) */
 162         if (cd4 == (iconv_t)-1)
 163                 cd4 = iconv_open("UCS-2LE", "UTF-8");
 164         return (convert_utf8_to_ucs2xx(cd4, utf8_string));
 165 }
 166
 167 static uint16_t *
 168 convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
 169 {
 170         uint16_t *obuf, *optr;
 171         const char *iptr;
 172         size_t  ileft, obsize, oleft, ret;
 173
 174         if (cd == (iconv_t)-1) {
 175                 smb_error(dgettext(TEXT_DOMAIN,
 176                     "iconv_open(UCS-2/UTF-8)"), -1);
 177                 return (NULL);
 178         }
 179
 180         iptr = utf8_string;
 181         ileft = strlen(iptr);
 182
 183         /* Worst-case output size is 2x input size. */
 184         oleft = ileft * 2;
 185         obsize = oleft + 2; /* room for null */
 186         obuf = malloc(obsize);
 187         if (!obuf)
 188                 return (NULL);
 189         optr = obuf;
 190
 191         ret = iconv(cd, &iptr, &ileft, (char **)&optr, &oleft);
 192         *optr = '\0';
 193         if (ret == (size_t)-1) {
 194                 smb_error(dgettext(TEXT_DOMAIN,
 195                     "iconv(%s) failed"), errno, utf8_string);
 196         }
 197         if (ileft) {
 198                 smb_error(dgettext(TEXT_DOMAIN,
 199                     "iconv(%s) failed"), -1, utf8_string);
 200                 /*
 201                  * XXX: What's better?  return NULL?
 202                  * The truncated string? << for now
 203                  */
 204         }
 205
 206         return (obuf);
 207 }
 208
 209
 210 /*
 211  * A simple wrapper around u8_textprep_str() that returns the Unicode
 212  * upper-case version of some string.  Returns memory from malloc.
 213  * Borrowed from idmapd.
 214  */
 215 static char *
 216 utf8_str_to_upper_or_lower(const char *s, int upper_lower)
 217 {
 218         char *res = NULL;
 219         char *outs;
 220         size_t inlen, outlen, inbleft, outbleft;
 221         int rc, err;
 222
 223         /*
 224          * u8_textprep_str() does not allocate memory.  The input and
 225          * output buffers may differ in size (though that would be more
 226          * likely when normalization is done).  We have to loop over it...
 227          *
 228          * To improve the chances that we can avoid looping we add 10
 229          * bytes of output buffer room the first go around.
 230          */
 231         inlen = inbleft = strlen(s);
 232         outlen = outbleft = inlen + 10;
 233         if ((res = malloc(outlen)) == NULL)
 234                 return (NULL);
 235         outs = res;
 236
 237         while ((rc = u8_textprep_str((char *)s, &inbleft, outs,
 238             &outbleft, upper_lower, U8_UNICODE_LATEST, &err)) < 0 &&
 239             err == E2BIG) {
 240                 if ((res = realloc(res, outlen + inbleft)) == NULL)
 241                         return (NULL);
 242                 /* adjust input/output buffer pointers */
 243                 s += (inlen - inbleft);
 244                 outs = res + outlen - outbleft;
 245                 /* adjust outbleft and outlen */
 246                 outlen += inbleft;
 247                 outbleft += inbleft;
 248         }
 249
 250         if (rc < 0) {
 251                 free(res);
 252                 res = NULL;
 253                 return (NULL);
 254         }
 255
 256         res[outlen - outbleft] = '\0';
 257
 258         return (res);
 259 }
 260
 261 char *
 262 utf8_str_toupper(const char *s)
 263 {
 264         return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER));
 265 }
 266
 267 char *
 268 utf8_str_tolower(const char *s)
 269 {
 270         return (utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER));
 271 }