/* gl/mbrtowc.c */

/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include <wchar.h>
  22
  23 #if GNULIB_defined_mbstate_t
  24 /* Implement mbrtowc() on top of mbtowc().  */
  25
  26 # include <errno.h>
  27 # include <stdlib.h>
  28
  29 # include "localcharset.h"
  30 # include "streq.h"
  31 # include "verify.h"
  32
  33
  34 verify (sizeof (mbstate_t) >= 4);
  35
  36 static char internal_state[4];
  37
  38 size_t
  39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  40 {
  41   char *pstate = (char *)ps;
  42
  43   if (pstate == NULL)
  44     pstate = internal_state;
  45
  46   if (s == NULL)
  47     {
  48       pwc = NULL;
  49       s = "";
  50       n = 1;
  51     }
  52
  53   if (n == 0)
  54     return (size_t)(-2);
  55
  56   /* Here n > 0.  */
  57   {
  58     size_t nstate = pstate[0];
  59     char buf[4];
  60     const char *p;
  61     size_t m;
  62
  63     switch (nstate)
  64       {
  65       case 0:
  66         p = s;
  67         m = n;
  68         break;
  69       case 3:
  70         buf[2] = pstate[3];
  71         /*FALLTHROUGH*/
  72       case 2:
  73         buf[1] = pstate[2];
  74         /*FALLTHROUGH*/
  75       case 1:
  76         buf[0] = pstate[1];
  77         p = buf;
  78         m = nstate;
  79         buf[m++] = s[0];
  80         if (n >= 2 && m < 4)
  81           {
  82             buf[m++] = s[1];
  83             if (n >= 3 && m < 4)
  84               buf[m++] = s[2];
  85           }
  86         break;
  87       default:
  88         errno = EINVAL;
  89         return (size_t)(-1);
  90       }
  91
  92     /* Here m > 0.  */
  93
  94 # if __GLIBC__
  95     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
  96     mbtowc (NULL, NULL, 0);
  97 # endif
  98     {
  99       int res = mbtowc (pwc, p, m);
 100
 101       if (res >= 0)
 102         {
 103           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
 104             abort ();
 105           if (nstate >= (res > 0 ? res : 1))
 106             abort ();
 107           res -= nstate;
 108           pstate[0] = 0;
 109           return res;
 110         }
 111
 112       /* mbtowc does not distinguish between invalid and incomplete multibyte
 113          sequences.  But mbrtowc needs to make this distinction.
 114          There are two possible approaches:
 115            - Use iconv() and its return value.
 116            - Use built-in knowledge about the possible encodings.
 117          Given the low quality of implementation of iconv() on the systems that
 118          lack mbrtowc(), we use the second approach.
 119          The possible encodings are:
 120            - 8-bit encodings,
 121            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
 122            - UTF-8.
 123          Use specialized code for each.  */
 124       if (m >= 4 || m >= MB_CUR_MAX)
 125         goto invalid;
 126       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 127       {
 128         const char *encoding = locale_charset ();
 129
 130         if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 131           {
 132             /* Cf. unistr/u8-mblen.c.  */
 133             unsigned char c = (unsigned char) p[0];
 134
 135             if (c >= 0xc2)
 136               {
 137                 if (c < 0xe0)
 138                   {
 139                     if (m == 1)
 140                       goto incomplete;
 141                   }
 142                 else if (c < 0xf0)
 143                   {
 144                     if (m == 1)
 145                       goto incomplete;
 146                     if (m == 2)
 147                       {
 148                         unsigned char c2 = (unsigned char) p[1];
 149
 150                         if ((c2 ^ 0x80) < 0x40
 151                             && (c >= 0xe1 || c2 >= 0xa0)
 152                             && (c != 0xed || c2 < 0xa0))
 153                           goto incomplete;
 154                       }
 155                   }
 156                 else if (c <= 0xf4)
 157                   {
 158                     if (m == 1)
 159                       goto incomplete;
 160                     else /* m == 2 || m == 3 */
 161                       {
 162                         unsigned char c2 = (unsigned char) p[1];
 163
 164                         if ((c2 ^ 0x80) < 0x40
 165                             && (c >= 0xf1 || c2 >= 0x90)
 166                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
 167                           {
 168                             if (m == 2)
 169                               goto incomplete;
 170                             else /* m == 3 */
 171                               {
 172                                 unsigned char c3 = (unsigned char) p[2];
 173
 174                                 if ((c3 ^ 0x80) < 0x40)
 175                                   goto incomplete;
 176                               }
 177                           }
 178                       }
 179                   }
 180               }
 181             goto invalid;
 182           }
 183
 184         /* As a reference for this code, you can use the GNU libiconv
 185            implementation.  Look for uses of the RET_TOOFEW macro.  */
 186
 187         if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
 188           {
 189             if (m == 1)
 190               {
 191                 unsigned char c = (unsigned char) p[0];
 192
 193                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 194                   goto incomplete;
 195               }
 196             if (m == 2)
 197               {
 198                 unsigned char c = (unsigned char) p[0];
 199
 200                 if (c == 0x8f)
 201                   {
 202                     unsigned char c2 = (unsigned char) p[1];
 203
 204                     if (c2 >= 0xa1 && c2 < 0xff)
 205                       goto incomplete;
 206                   }
 207               }
 208             goto invalid;
 209           }
 210         if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 211             || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 212             || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
 213           {
 214             if (m == 1)
 215               {
 216                 unsigned char c = (unsigned char) p[0];
 217
 218                 if (c >= 0xa1 && c < 0xff)
 219                   goto incomplete;
 220               }
 221             goto invalid;
 222           }
 223         if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
 224           {
 225             if (m == 1)
 226               {
 227                 unsigned char c = (unsigned char) p[0];
 228
 229                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 230                   goto incomplete;
 231               }
 232             else /* m == 2 || m == 3 */
 233               {
 234                 unsigned char c = (unsigned char) p[0];
 235
 236                 if (c == 0x8e)
 237                   goto incomplete;
 238               }
 239             goto invalid;
 240           }
 241         if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
 242           {
 243             if (m == 1)
 244               {
 245                 unsigned char c = (unsigned char) p[0];
 246
 247                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
 248                   goto incomplete;
 249               }
 250             else /* m == 2 || m == 3 */
 251               {
 252                 unsigned char c = (unsigned char) p[0];
 253
 254                 if (c >= 0x90 && c <= 0xe3)
 255                   {
 256                     unsigned char c2 = (unsigned char) p[1];
 257
 258                     if (c2 >= 0x30 && c2 <= 0x39)
 259                       {
 260                         if (m == 2)
 261                           goto incomplete;
 262                         else /* m == 3 */
 263                           {
 264                             unsigned char c3 = (unsigned char) p[2];
 265
 266                             if (c3 >= 0x81 && c3 <= 0xfe)
 267                               goto incomplete;
 268                           }
 269                       }
 270                   }
 271               }
 272             goto invalid;
 273           }
 274         if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
 275           {
 276             if (m == 1)
 277               {
 278                 unsigned char c = (unsigned char) p[0];
 279
 280                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 281                     || (c >= 0xf0 && c <= 0xf9))
 282                   goto incomplete;
 283               }
 284             goto invalid;
 285           }
 286
 287         /* An unknown multibyte encoding.  */
 288         goto incomplete;
 289       }
 290
 291      incomplete:
 292       {
 293         size_t k = nstate;
 294         /* Here 0 <= k < m < 4.  */
 295         pstate[++k] = s[0];
 296         if (k < m)
 297           {
 298             pstate[++k] = s[1];
 299             if (k < m)
 300               pstate[++k] = s[2];
 301           }
 302         if (k != m)
 303           abort ();
 304       }
 305       pstate[0] = m;
 306       return (size_t)(-2);
 307
 308      invalid:
 309       errno = EILSEQ;
 310       /* The conversion state is undefined, says POSIX.  */
 311       return (size_t)(-1);
 312     }
 313   }
 314 }
 315
 316 #else
 317 /* Override the system's mbrtowc() function.  */
 318
 319 # undef mbrtowc
 320
 321 size_t
 322 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 323 {
 324 # if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
 325   if (s == NULL)
 326     {
 327       pwc = NULL;
 328       s = "";
 329       n = 1;
 330     }
 331 # endif
 332
 333 # if MBRTOWC_RETVAL_BUG
 334   {
 335     static mbstate_t internal_state;
 336
 337     /* Override mbrtowc's internal state.  We can not call mbsinit() on the
 338        hidden internal state, but we can call it on our variable.  */
 339     if (ps == NULL)
 340       ps = &internal_state;
 341
 342     if (!mbsinit (ps))
 343       {
 344         /* Parse the rest of the multibyte character byte for byte.  */
 345         size_t count = 0;
 346         for (; n > 0; s++, n--)
 347           {
 348             wchar_t wc;
 349             size_t ret = mbrtowc (&wc, s, 1, ps);
 350
 351             if (ret == (size_t)(-1))
 352               return (size_t)(-1);
 353             count++;
 354             if (ret != (size_t)(-2))
 355               {
 356                 /* The multibyte character has been completed.  */
 357                 if (pwc != NULL)
 358                   *pwc = wc;
 359                 return (wc == 0 ? 0 : count);
 360               }
 361           }
 362         return (size_t)(-2);
 363       }
 364   }
 365 # endif
 366
 367 # if MBRTOWC_NUL_RETVAL_BUG
 368   {
 369     wchar_t wc;
 370     size_t ret = mbrtowc (&wc, s, n, ps);
 371
 372     if (ret != (size_t)(-1) && ret != (size_t)(-2))
 373       {
 374         if (pwc != NULL)
 375           *pwc = wc;
 376         if (wc == 0)
 377           ret = 0;
 378       }
 379     return ret;
 380   }
 381 # else
 382   return mbrtowc (pwc, s, n, ps);
 383 # endif
 384 }
 385
 386 #endif