1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc(). */
34 # include "localcharset.h"
/* FALLTHROUGH has two alternative definitions: a no-op expression, or the
   compiler's __fallthrough__ statement attribute.
   NOTE(review): the #if/#else lines that select between the two are not
   visible in this copy; presumably the choice depends on the compiler
   version.  */
40 # define FALLTHROUGH ((void) 0)
42 # define FALLTHROUGH __attribute__ ((__fallthrough__))
/* mbrtowc() below accesses an mbstate_t through a char pointer, so the type
   must provide at least 4 bytes.  verify() is presumably gnulib's
   compile-time assertion macro.  */
46 verify (sizeof (mbstate_t) >= 4);
/* File-static 4-byte state buffer that mbrtowc() can point its state
   pointer at instead of the caller's object (presumably when the caller
   passes a null state pointer; the guarding test is not visible in this
   copy).  */
48 static char internal_state
[4];
/* Replacement mbrtowc() built on top of mbtowc().
   PWC is the destination handed to mbtowc(); PS is the conversion state,
   accessed here as a char array.  S and N are presumably the input bytes
   and their count (their use is not visible in this copy).
   NOTE(review): this copy of the function is missing lines (return type,
   braces, several conditions and all return/goto statements), so the
   comments below describe only what the visible fragments establish.  */
51 mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
/* View the caller's state object as raw bytes.  */
53 char *pstate
= (char *)ps
;
/* Redirect the state pointer to the file-static buffer.  The condition
   guarding this assignment is not visible here (presumably ps == NULL).  */
68 pstate
= internal_state
;
/* The first state byte is read as a count (presumably the number of bytes
   held over from earlier calls).  */
71 size_t nstate
= pstate
[0];
107 # if __GLIBC__ || defined __UCLIBC__
108 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
/* Per ISO C, calling mbtowc() with a null string pointer resets its
   internal shift state.  */
109 mbtowc (NULL
, NULL
, 0);
/* Attempt the conversion of m bytes starting at p; p and m are set up in
   lines that are not visible in this copy.  */
112 int res
= mbtowc (pwc
, p
, m
);
/* Consistency check: a NUL wide character must coincide with a zero return
   value from mbtowc().  The action taken on a mismatch is not visible.  */
116 if (pwc
!= NULL
&& ((*pwc
== 0) != (res
== 0)))
/* Compare the saved-byte count with the number of bytes mbtowc() consumed,
   counting a non-positive result as 1.  The branch body is not visible.  */
118 if (nstate
>= (res
> 0 ? res
: 1))
125 /* mbtowc does not distinguish between invalid and incomplete multibyte
126 sequences. But mbrtowc needs to make this distinction.
127 There are two possible approaches:
128 - Use iconv() and its return value.
129 - Use built-in knowledge about the possible encodings.
130 Given the low quality of implementation of iconv() on the systems that
131 lack mbrtowc(), we use the second approach.
132 The possible encodings are:
134 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
136 Use specialized code for each. */
/* NOTE(review): the list of encodings above looks truncated in this copy
   (it ends with a comma); the code below additionally has a dedicated
   UTF-8 branch.  */
/* With at least 4 bytes, or at least MB_CUR_MAX bytes, available no longer
   character is possible.  The statement taken in this case is not visible
   (presumably the sequence is reported as invalid).  */
137 if (m
>= 4 || m
>= MB_CUR_MAX
)
139 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
/* Obtain the character encoding name; locale_charset() is presumably the
   function declared in "localcharset.h".  */
141 const char *encoding
= locale_charset ();
/* STREQ_OPT is presumably a string-equality macro that also receives the
   literal's characters individually - TODO confirm against its definition.  */
143 if (STREQ_OPT (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
145 /* Cf. unistr/u8-mblen.c. */
/* Lead byte of the sequence.  */
146 unsigned char c
= (unsigned char) p
[0];
/* Second byte.  The enclosing lead-byte and length tests are not visible
   here (presumably a 3-byte lead with m == 2).  */
161 unsigned char c2
= (unsigned char) p
[1];
/* (c2 ^ 0x80) < 0x40 holds exactly when c2 is in 0x80..0xBF, i.e. a UTF-8
   continuation byte.  The remaining tests match the UTF-8 rules that lead
   0xE0 requires c2 >= 0xA0 (no overlong forms) and lead 0xED requires
   c2 < 0xA0 (no surrogates).  */
163 if ((c2
^ 0x80) < 0x40
164 && (c
>= 0xe1 || c2
>= 0xa0)
165 && (c
!= 0xed || c2
< 0xa0))
173 else /* m == 2 || m == 3 */
/* Second byte (presumably of a 4-byte sequence).  */
175 unsigned char c2
= (unsigned char) p
[1];
/* c2 must be a continuation byte; lead 0xF0 requires c2 >= 0x90 (no
   overlong forms), and the lead must be below 0xF4, or equal to 0xF4 with
   c2 < 0x90 (which matches the U+10FFFF upper limit).  */
177 if ((c2
^ 0x80) < 0x40
178 && (c
>= 0xf1 || c2
>= 0x90)
179 && (c
< 0xf4 || (c
== 0xf4 && c2
< 0x90)))
/* Third byte; only needs to be a continuation byte (0x80..0xBF).  */
185 unsigned char c3
= (unsigned char) p
[2];
187 if ((c3
^ 0x80) < 0x40)
197 /* As a reference for this code, you can use the GNU libiconv
198 implementation. Look for uses of the RET_TOOFEW macro. */
200 if (STREQ_OPT (encoding
,
201 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
205 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0xA1..0xFE, or one of the bytes 0x8E / 0x8F.  */
207 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e || c
== 0x8f)
212 unsigned char c
= (unsigned char) p
[0];
216 unsigned char c2
= (unsigned char) p
[1];
/* Second byte in 0xA1..0xFE.  The test on c that guards this check is not
   visible in this copy.  */
218 if (c2
>= 0xa1 && c2
< 0xff)
/* EUC-KR, GB2312 and BIG5 share one branch.  */
224 if (STREQ_OPT (encoding
,
225 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
226 || STREQ_OPT (encoding
,
227 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
228 || STREQ_OPT (encoding
,
229 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
233 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0xA1..0xFE.  */
235 if (c
>= 0xa1 && c
< 0xff)
240 if (STREQ_OPT (encoding
,
241 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
245 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0xA1..0xFE, or the byte 0x8E.  */
247 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e)
250 else /* m == 2 || m == 3 */
/* The test applied to this byte is not visible in this copy.  */
252 unsigned char c
= (unsigned char) p
[0];
259 if (STREQ_OPT (encoding
,
260 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
264 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0x90..0xE3 or 0xF8..0xFE.  */
266 if ((c
>= 0x90 && c
<= 0xe3) || (c
>= 0xf8 && c
<= 0xfe))
269 else /* m == 2 || m == 3 */
271 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0x90..0xE3.  */
273 if (c
>= 0x90 && c
<= 0xe3)
275 unsigned char c2
= (unsigned char) p
[1];
/* Second byte in 0x30..0x39 (the ASCII digits).  */
277 if (c2
>= 0x30 && c2
<= 0x39)
283 unsigned char c3
= (unsigned char) p
[2];
/* Third byte in 0x81..0xFE.  */
285 if (c3
>= 0x81 && c3
<= 0xfe)
293 if (STREQ_OPT (encoding
, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
297 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0x81..0x9F, 0xE0..0xEA or 0xF0..0xF9.  */
299 if ((c
>= 0x81 && c
<= 0x9f) || (c
>= 0xe0 && c
<= 0xea)
300 || (c
>= 0xf0 && c
<= 0xf9))
306 /* An unknown multibyte encoding. */
313 /* Here 0 <= k < m < 4. */
329 /* The conversion state is undefined, says POSIX. */
336 /* Override the system's mbrtowc() function. */
341 rpl_mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
346 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
355 # if MBRTOWC_EMPTY_INPUT_BUG
363 # if MBRTOWC_RETVAL_BUG
365 static mbstate_t internal_state
;
367 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
368 hidden internal state, but we can call it on our variable. */
370 ps
= &internal_state
;
374 /* Parse the rest of the multibyte character byte for byte. */
376 for (; n
> 0; s
++, n
--)
378 ret
= mbrtowc (&wc
, s
, 1, ps
);
380 if (ret
== (size_t)(-1))
383 if (ret
!= (size_t)(-2))
385 /* The multibyte character has been completed. */
387 return (wc
== 0 ? 0 : count
);
395 ret
= mbrtowc (pwc
, s
, n
, ps
);
397 # if MBRTOWC_NUL_RETVAL_BUG
398 if (ret
< (size_t) -2 && !*pwc
)
402 # if C_LOCALE_MAYBE_EILSEQ
403 if ((size_t) -2 <= ret
&& n
!= 0 && ! hard_locale (LC_CTYPE
))
405 unsigned char uc
= *s
;