warnings: fix compilation with old autoconf
[gnulib/ericb.git] / lib / mbrtowc.c
blob7415ffffd528fc545282e3ec09a07aee25707e01
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

18 #include <config.h>
20 /* Specification. */
21 #include <wchar.h>
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
25 # include <locale.h>
26 #endif
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc(). */
31 # include <errno.h>
32 # include <stdlib.h>
34 # include "localcharset.h"
35 # include "streq.h"
36 # include "verify.h"
/* FALLTHROUGH marks an intentional fall through from one switch case into
   the next.  Compilers that understand the 'fallthrough' statement
   attribute (GCC 7 and newer, Clang 10 and newer) get the attribute, so
   that -Wimplicit-fallthrough stays quiet; all other compilers get a no-op
   expression.  A definition that is already in effect is left alone.  */
#ifndef FALLTHROUGH
# if 7 <= __GNUC__ || 10 <= __clang_major__
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# else
#  define FALLTHROUGH ((void) 0)
# endif
#endif

46 verify (sizeof (mbstate_t) >= 4);
48 static char internal_state[4];
50 size_t
51 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
53 char *pstate = (char *)ps;
55 if (s == NULL)
57 pwc = NULL;
58 s = "";
59 n = 1;
62 if (n == 0)
63 return (size_t)(-2);
65 /* Here n > 0. */
67 if (pstate == NULL)
68 pstate = internal_state;
71 size_t nstate = pstate[0];
72 char buf[4];
73 const char *p;
74 size_t m;
76 switch (nstate)
78 case 0:
79 p = s;
80 m = n;
81 break;
82 case 3:
83 buf[2] = pstate[3];
84 FALLTHROUGH;
85 case 2:
86 buf[1] = pstate[2];
87 FALLTHROUGH;
88 case 1:
89 buf[0] = pstate[1];
90 p = buf;
91 m = nstate;
92 buf[m++] = s[0];
93 if (n >= 2 && m < 4)
95 buf[m++] = s[1];
96 if (n >= 3 && m < 4)
97 buf[m++] = s[2];
99 break;
100 default:
101 errno = EINVAL;
102 return (size_t)(-1);
105 /* Here m > 0. */
107 # if __GLIBC__ || defined __UCLIBC__
108 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
109 mbtowc (NULL, NULL, 0);
110 # endif
112 int res = mbtowc (pwc, p, m);
114 if (res >= 0)
116 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
117 abort ();
118 if (nstate >= (res > 0 ? res : 1))
119 abort ();
120 res -= nstate;
121 pstate[0] = 0;
122 return res;
125 /* mbtowc does not distinguish between invalid and incomplete multibyte
126 sequences. But mbrtowc needs to make this distinction.
127 There are two possible approaches:
128 - Use iconv() and its return value.
129 - Use built-in knowledge about the possible encodings.
130 Given the low quality of implementation of iconv() on the systems that
131 lack mbrtowc(), we use the second approach.
132 The possible encodings are:
133 - 8-bit encodings,
134 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
135 - UTF-8.
136 Use specialized code for each. */
137 if (m >= 4 || m >= MB_CUR_MAX)
138 goto invalid;
139 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
141 const char *encoding = locale_charset ();
143 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
145 /* Cf. unistr/u8-mblen.c. */
146 unsigned char c = (unsigned char) p[0];
148 if (c >= 0xc2)
150 if (c < 0xe0)
152 if (m == 1)
153 goto incomplete;
155 else if (c < 0xf0)
157 if (m == 1)
158 goto incomplete;
159 if (m == 2)
161 unsigned char c2 = (unsigned char) p[1];
163 if ((c2 ^ 0x80) < 0x40
164 && (c >= 0xe1 || c2 >= 0xa0)
165 && (c != 0xed || c2 < 0xa0))
166 goto incomplete;
169 else if (c <= 0xf4)
171 if (m == 1)
172 goto incomplete;
173 else /* m == 2 || m == 3 */
175 unsigned char c2 = (unsigned char) p[1];
177 if ((c2 ^ 0x80) < 0x40
178 && (c >= 0xf1 || c2 >= 0x90)
179 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
181 if (m == 2)
182 goto incomplete;
183 else /* m == 3 */
185 unsigned char c3 = (unsigned char) p[2];
187 if ((c3 ^ 0x80) < 0x40)
188 goto incomplete;
194 goto invalid;
197 /* As a reference for this code, you can use the GNU libiconv
198 implementation. Look for uses of the RET_TOOFEW macro. */
200 if (STREQ_OPT (encoding,
201 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
203 if (m == 1)
205 unsigned char c = (unsigned char) p[0];
207 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
208 goto incomplete;
210 if (m == 2)
212 unsigned char c = (unsigned char) p[0];
214 if (c == 0x8f)
216 unsigned char c2 = (unsigned char) p[1];
218 if (c2 >= 0xa1 && c2 < 0xff)
219 goto incomplete;
222 goto invalid;
224 if (STREQ_OPT (encoding,
225 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
226 || STREQ_OPT (encoding,
227 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
228 || STREQ_OPT (encoding,
229 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
231 if (m == 1)
233 unsigned char c = (unsigned char) p[0];
235 if (c >= 0xa1 && c < 0xff)
236 goto incomplete;
238 goto invalid;
240 if (STREQ_OPT (encoding,
241 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
243 if (m == 1)
245 unsigned char c = (unsigned char) p[0];
247 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
248 goto incomplete;
250 else /* m == 2 || m == 3 */
252 unsigned char c = (unsigned char) p[0];
254 if (c == 0x8e)
255 goto incomplete;
257 goto invalid;
259 if (STREQ_OPT (encoding,
260 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
262 if (m == 1)
264 unsigned char c = (unsigned char) p[0];
266 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
267 goto incomplete;
269 else /* m == 2 || m == 3 */
271 unsigned char c = (unsigned char) p[0];
273 if (c >= 0x90 && c <= 0xe3)
275 unsigned char c2 = (unsigned char) p[1];
277 if (c2 >= 0x30 && c2 <= 0x39)
279 if (m == 2)
280 goto incomplete;
281 else /* m == 3 */
283 unsigned char c3 = (unsigned char) p[2];
285 if (c3 >= 0x81 && c3 <= 0xfe)
286 goto incomplete;
291 goto invalid;
293 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
295 if (m == 1)
297 unsigned char c = (unsigned char) p[0];
299 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
300 || (c >= 0xf0 && c <= 0xf9))
301 goto incomplete;
303 goto invalid;
306 /* An unknown multibyte encoding. */
307 goto incomplete;
310 incomplete:
312 size_t k = nstate;
313 /* Here 0 <= k < m < 4. */
314 pstate[++k] = s[0];
315 if (k < m)
317 pstate[++k] = s[1];
318 if (k < m)
319 pstate[++k] = s[2];
321 if (k != m)
322 abort ();
324 pstate[0] = m;
325 return (size_t)(-2);
327 invalid:
328 errno = EILSEQ;
329 /* The conversion state is undefined, says POSIX. */
330 return (size_t)(-1);
335 #else
/* Override the system's mbrtowc() function.  */

# undef mbrtowc

/* Replacement for mbrtowc() that calls the system's mbrtowc() and
   compensates for the defects selected at configuration time through the
   MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG,
   MBRTOWC_NUL_RETVAL_BUG and C_LOCALE_MAYBE_EILSEQ macros.

   PWC  receives the wide character, unless it is NULL.
   S    is the input; with the relevant workaround enabled, a NULL S is
        treated like the call (NULL, "", 1, PS).
   N    is the number of bytes of S that may be examined.
   PS   is the conversion state, passed on to the system's mbrtowc().

   Return the result of the system's mbrtowc(), adjusted as described at
   each workaround below.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the code below a place to store and inspect the result.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* The system's mbrtowc() reported an invalid or incomplete character,
     but hard_locale (LC_CTYPE) is false: treat the first byte of S as a
     character of its own.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}

414 #endif