c32tob: prefer https: URLs
[gnulib.git] / lib / mbrtoc32.c
blob f2cf71ec11b4bf00cc2054851749cf9513e26093
1 /* Convert multibyte character to 32-bit wide character.
2 Copyright (C) 2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2020. */
19 #include <config.h>
21 /* Specification. */
22 #include <uchar.h>
24 #include <errno.h>
25 #include <stdlib.h>
/* FALLTHROUGH marks an intentional fall-through from one switch case into
   the next.  It is defined here only when nothing has defined it already:
   as a no-op expression when __GNUC__ is below 7, and as the
   __fallthrough__ statement attribute otherwise.  */
27 # ifndef FALLTHROUGH
28 # if __GNUC__ < 7
29 # define FALLTHROUGH ((void) 0)
30 # else
31 # define FALLTHROUGH __attribute__ ((__fallthrough__))
32 # endif
33 # endif
35 #if GNULIB_defined_mbstate_t /* AIX, IRIX */
36 /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales
37 and directly for the UTF-8 locales. */
/* Pick the threading header for this platform:
   - native Windows (not Cygwin): <windows.h>, trimmed by
     WIN32_LEAN_AND_MEAN;
   - otherwise, when HAVE_PTHREAD_API: <pthread.h>, and c11_threads_in_use()
     is defined.  With both HAVE_THREADS_H and HAVE_WEAK_SYMBOLS it tests
     whether the weakly declared thrd_exit has a non-null address; without
     them it is the constant 0;
   - otherwise, when HAVE_THREADS_H: <threads.h>.
   NOTE(review): nothing in this file uses these directly; presumably they
   are needed by mbtowc-lock.h, included further down -- confirm.  */
39 # if defined _WIN32 && !defined __CYGWIN__
41 # define WIN32_LEAN_AND_MEAN /* avoid including junk */
42 # include <windows.h>
44 # elif HAVE_PTHREAD_API
46 # include <pthread.h>
47 # if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS
48 # include <threads.h>
49 # pragma weak thrd_exit
50 # define c11_threads_in_use() (thrd_exit != NULL)
51 # else
52 # define c11_threads_in_use() 0
53 # endif
55 # elif HAVE_THREADS_H
57 # include <threads.h>
59 # endif
61 # include "verify.h"
62 # include "lc-charset-dispatch.h"
63 # include "mbtowc-lock.h"
/* Build-time check that mbstate_t is at least 4 bytes large, followed by a
   4-byte file-local state buffer.
   NOTE(review): presumably the buffer is what the included implementation
   uses when the caller passes ps == NULL -- confirm in mbrtowc-impl.h.  */
65 verify (sizeof (mbstate_t) >= 4);
66 static char internal_state[4];
/* mbrtoc32 variant compiled when GNULIB_defined_mbstate_t is set.
   The function body is not written out here: it is pulled in textually
   from mbrtowc-impl.h, with FITS_IN_CHAR_TYPE (wc) defined to be always
   true.
   NOTE(review): the brace-only lines around the body are absent from this
   listing; confirm against the repository copy.  */
68 size_t
69 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
71 # define FITS_IN_CHAR_TYPE(wc) 1
72 # include "mbrtowc-impl.h"
75 #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */
77 /* Implement mbrtoc32() based on mbrtowc(). */
79 # include <wchar.h>
81 # include "localcharset.h"
82 # include "streq.h"
/* Conversion state that mbrtoc32 uses when its caller passes ps == NULL.  */
84 static mbstate_t internal_state;
/* Convert the next multibyte character of S (at most N bytes) into a
   char32_t, built on top of mbrtowc().

   PWC  where the converted character is stored; may be NULL.
   S    input bytes.  NULL is handled as the empty string "" with N = 1
        and no output.
   N    number of bytes available at S.
   PS   conversion state; NULL selects the file-local internal_state.

   Return value:
   - (size_t) -2 at once when N == 0, if MBRTOC32_EMPTY_INPUT_BUG or
     _GL_LARGE_CHAR32_T is set.
   - In the mbrtowc-based paths: whatever mbrtowc returned; the wide
     character is copied to *PWC only when that value is below (size_t) -2.
   - In the UTF-8 path (only with _GL_LARGE_CHAR32_T): on success, the
     result length minus the bytes that were already buffered in the state;
     (size_t) -2 when the input is incomplete; (size_t) -1 with errno set to
     EILSEQ for invalid input, or to EINVAL when the state's count byte is
     not in 0..3.

   NOTE(review): the UTF-8 decoding itself, and the jumps to the success /
   incomplete / invalid labels, come from mbrtowc-impl-utf8.h, which is not
   visible here.  The brace-only lines of this function are also absent
   from this listing; confirm the nesting against the repository copy.  */
86 size_t
87 mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps)
89 /* It's simpler to handle the case s == NULL upfront, than to worry about
90 this case later, before every test of pwc and n. */
91 if (s == NULL)
93 pwc = NULL;
94 s = "";
95 n = 1;
98 # if MBRTOC32_EMPTY_INPUT_BUG || _GL_LARGE_CHAR32_T
99 if (n == 0)
100 return (size_t) -2;
101 # endif
103 if (ps == NULL)
104 ps = &internal_state;
106 # if _GL_LARGE_CHAR32_T
108 /* Special-case all encodings that may produce wide character values
109 > WCHAR_MAX. */
110 const char *encoding = locale_charset ();
111 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
113 /* Special-case the UTF-8 encoding. Assume that the wide-character
114 encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16. */
115 /* Here n > 0. */
/* The state is used as a byte array: byte 0 holds the number of buffered
   bytes, bytes 1..3 hold the buffered bytes themselves.  */
116 char *pstate = (char *)ps;
117 size_t nstate = pstate[0];
118 char buf[4];
119 const char *p;
120 size_t m;
121 int res;
123 switch (nstate)
125 case 0:
126 p = s;
127 m = n;
128 break;
/* Cases 3, 2, 1: reload the buffered bytes into buf, then append s[0] and
   up to two more bytes of s while n allows it and buf still has room.  */
129 case 3:
130 buf[2] = pstate[3];
131 FALLTHROUGH;
132 case 2:
133 buf[1] = pstate[2];
134 FALLTHROUGH;
135 case 1:
136 buf[0] = pstate[1];
137 p = buf;
138 m = nstate;
139 buf[m++] = s[0];
140 if (n >= 2 && m < 4)
142 buf[m++] = s[1];
143 if (n >= 3 && m < 4)
144 buf[m++] = s[2];
146 break;
/* Any other count byte means the state is not one this code produced.  */
147 default:
148 errno = EINVAL;
149 return (size_t)(-1);
152 /* Here m > 0. */
155 # define FITS_IN_CHAR_TYPE(wc) 1
156 # include "mbrtowc-impl-utf8.h"
159 success:
/* A successful result must be longer than the part that was already
   buffered; anything else is treated as an internal error.  */
160 if (nstate >= (res > 0 ? res : 1))
161 abort ();
162 res -= nstate;
163 /* Set *ps to the initial state. */
164 # if defined _WIN32 && !defined __CYGWIN__
165 /* Native Windows. */
166 /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter.
167 On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
168 as an 8-byte struct, of which the first 4 bytes matter. */
169 *(unsigned int *)pstate = 0;
170 # elif defined __CYGWIN__
171 /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes
172 matter. */
173 ps->__count = 0;
174 # else
175 pstate[0] = 0;
176 # endif
177 return res;
179 incomplete:
/* Append the bytes newly taken from s to the state, then record the new
   count m in byte 0.  */
181 size_t k = nstate;
182 /* Here 0 <= k < m < 4. */
183 pstate[++k] = s[0];
184 if (k < m)
186 pstate[++k] = s[1];
187 if (k < m)
188 pstate[++k] = s[2];
190 if (k != m)
191 abort ();
193 pstate[0] = m;
194 return (size_t)(-2);
196 invalid:
197 errno = EILSEQ;
198 /* The conversion state is undefined, says POSIX. */
199 return (size_t)(-1);
/* Non-UTF-8 encoding: delegate to mbrtowc and widen its result.  */
201 else
203 wchar_t wc;
204 size_t ret = mbrtowc (&wc, s, n, ps);
205 if (ret < (size_t) -2 && pwc != NULL)
206 *pwc = wc;
207 return ret;
210 # else
212 /* char32_t and wchar_t are equivalent.
213 Two implementations are possible:
214 - We can call the original mbrtoc32 (if it exists) and handle
215 MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ.
216 - We can call mbrtowc.
217 The latter is simpler. */
218 wchar_t wc;
219 size_t ret = mbrtowc (&wc, s, n, ps);
220 if (ret < (size_t) -2 && pwc != NULL)
221 *pwc = wc;
222 return ret;
224 # endif
227 #endif