exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / mbrtoc16.c
blob1fd0fbf24297d5ebc446621002ae7a0ddf65046b
1 /* Convert multibyte character and return next 16-bit wide character.
2 Copyright (C) 2020-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2023. */
19 #include <config.h>
21 /* Specification. */
22 #include <uchar.h>
24 #include <stdlib.h>
25 #include <wchar.h>
27 /* We must find room for a two-byte char16_t in an mbstate_t, without
28 interfering with the existing use of the mbstate_t in mbrtoc32. */
29 static_assert (sizeof (mbstate_t) >= 4);
/* Platform-specific storage of one pending 16-bit unit inside an mbstate_t.

   SET_EXTRA_STATE (ps, c16)  stores the 16-bit value C16 in *PS.
   GET_EXTRA_STATE (ps)       yields the stored value; mbrtoc16 treats a
                              result of 0 as "nothing is stored".
   RESET_EXTRA_STATE (ps)     removes the stored value again.

   PS may be evaluated more than once by each of these macros.  */
#if GNULIB_defined_mbstate_t /* AIX, IRIX */
/* mbstate_t has at least 4 bytes.  They are used as coded in
   gnulib/lib/mbrtowc.c.  */
/* Byte 0 is set to the tag value 8; bytes 1 and 2 hold the high and low
   byte of the unit.  */
# define SET_EXTRA_STATE(ps, c16) \
    (((char *)(ps))[0] = 8, \
     ((char *)(ps))[1] = (unsigned char) ((c16) >> 8), \
     ((char *)(ps))[2] = (unsigned char) ((c16) & 0xff))
# define GET_EXTRA_STATE(ps) \
    (((char *)(ps))[0] == 8 \
     ? ((unsigned char) ((char *)(ps))[1] << 8) | (unsigned char) ((char *)(ps))[2] \
     : 0)
# define RESET_EXTRA_STATE(ps) \
    (((char *)(ps))[0] = 0)
#elif __GLIBC__ >= 2
/* mbstate_t is defined in <bits/types/__mbstate_t.h>.
   For more details, see glibc/iconv/skeleton.c.  */
/* The unit is kept in the upper 16 bits of __count; the lower 16 bits are
   left untouched.
   The shift is done on an unsigned int: C16 would otherwise be promoted to
   int, and shifting a value >= 0x8000 left by 16 bits overflows int, which
   is undefined behavior.  The argument is parenthesized so that an
   expression such as 'a | b' is shifted as a whole.  */
# define SET_EXTRA_STATE(ps, c16) \
    ((ps)->__count |= (int) ((unsigned int) (c16) << 16))
# define GET_EXTRA_STATE(ps) \
    (((unsigned int) (ps)->__count) >> 16)
# define RESET_EXTRA_STATE(ps) \
    ((ps)->__count &= 0xffff)
#elif (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __OpenBSD__ || defined __minix
/* macOS, FreeBSD, NetBSD, OpenBSD, Minix */
/* On macOS, mbstate_t is defined in <machine/_types.h>.
   It is an opaque aligned 128-byte struct, of which at most the first
   12 bytes are used.
   For more details, see the __mbsinit implementations in
   Libc-<version>/locale/FreeBSD/
   {ascii,none,euc,mskanji,big5,gb2312,gbk,gb18030,utf8,utf2}.c.  */
/* On FreeBSD, mbstate_t is defined in src/sys/sys/_types.h.
   It is an opaque aligned 128-byte struct, of which at most the first
   12 bytes are used.
   For more details, see the __mbsinit implementations in
   src/lib/libc/locale/
   {ascii,none,euc,mskanji,big5,gb2312,gbk,gb18030,utf8}.c.  */
/* On NetBSD, mbstate_t is defined in src/sys/sys/ansi.h.
   It is an opaque aligned 128-byte struct, of which at most the first
   28 bytes are used.
   For more details, see the *State types in
   src/lib/libc/citrus/modules/citrus_*.c
   (ignoring citrus_{hz,iso2022,utf7,viqr,zw}.c, since these implement
   stateful encodings, not usable as locale encodings).  */
/* On OpenBSD, mbstate_t is defined in src/sys/sys/_types.h.
   It is an opaque aligned 128-byte struct, of which at most the first
   12 bytes are used.
   For more details, see src/lib/libc/citrus/citrus_*.c.  */
/* Minix has borrowed its mbstate_t type and mbrtowc implementation from the
   BSDs.  */
/* The unit is kept in the unsigned short at index 16 of the struct.  */
# define SET_EXTRA_STATE(ps, c16) \
    (((unsigned short *)(ps))[16] = (c16))
# define GET_EXTRA_STATE(ps) \
    (((unsigned short *)(ps))[16])
# define RESET_EXTRA_STATE(ps) \
    (((unsigned short *)(ps))[16] = 0)
#elif defined __sun /* Solaris */
/* On Solaris, mbstate_t is defined in <wchar_impl.h>.
   It is an opaque aligned 24-byte or 32-byte struct, of which at most the first
   20 or 28 bytes are used.
   For more details on OpenSolaris derivatives, see the *State types in
   illumos-gate/usr/src/lib/libc/port/locale/
   {none,euc,mskanji,big5,gb2312,gbk,gb18030,utf8}.c.  */
/* The unit is kept in the unsigned short at index 10 of the struct.  */
# define SET_EXTRA_STATE(ps, c16) \
    (((unsigned short *)(ps))[10] = (c16))
# define GET_EXTRA_STATE(ps) \
    (((unsigned short *)(ps))[10])
# define RESET_EXTRA_STATE(ps) \
    (((unsigned short *)(ps))[10] = 0)
#elif defined __CYGWIN__
/* On Cygwin, mbstate_t is defined in <sys/_types.h>.
   For more details, see newlib/libc/stdlib/mbtowc_r.c and
   winsup/cygwin/strfuncs.cc.  */
/* __count is set to the tag value 8 while __value.__wch holds the unit.  */
# define SET_EXTRA_STATE(ps, c16) \
    ((ps)->__count = 8, \
     (ps)->__value.__wch = (c16))
# define GET_EXTRA_STATE(ps) \
    ((ps)->__count == 8 ? (ps)->__value.__wch : 0)
# define RESET_EXTRA_STATE(ps) \
    ((ps)->__count = 0)
#elif defined _WIN32 && !defined __CYGWIN__ /* Native Windows.  */
/* MSVC defines 'mbstate_t' as an aligned 8-byte struct.
   On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined
   as an aligned 8-byte struct, of which the first 4 bytes matter.  */
/* Byte 3 is set to the tag value 4 while the first unsigned short holds
   the unit.  */
# define SET_EXTRA_STATE(ps, c16) \
    (((char *)(ps))[3] = 4, \
     ((unsigned short *)(ps))[0] = (c16))
# define GET_EXTRA_STATE(ps) \
    (((char *)(ps))[3] == 4 \
     ? ((unsigned short *)(ps))[0] \
     : 0)
# define RESET_EXTRA_STATE(ps) \
    (((char *)(ps))[3] = 0, \
     ((unsigned short *)(ps))[0] = 0)
#elif defined __ANDROID__ /* Android */
/* Android defines 'mbstate_t' in <bits/mbstate_t.h>.
   It is an opaque 4-byte or 8-byte struct.
   For more details, see
   bionic/libc/private/bionic_mbstate.h
   bionic/libc/bionic/mbrtoc32.cpp
   bionic/libc/bionic/mbrtoc16.cpp
 */
/* Byte 3 is set to the tag value 4; bytes 0 and 1 hold the low and high
   byte of the unit.  RESET clears all of the first four bytes.  */
# define SET_EXTRA_STATE(ps, c16) \
    (((char *)(ps))[3] = 4, \
     ((char *)(ps))[0] = (unsigned char) ((c16) & 0xff), \
     ((char *)(ps))[1] = (unsigned char) ((c16) >> 8))
# define GET_EXTRA_STATE(ps) \
    (((char *)(ps))[3] == 4 \
     ? ((unsigned char) ((char *)(ps))[1] << 8) | (unsigned char) ((char *)(ps))[0] \
     : 0)
# define RESET_EXTRA_STATE(ps) \
    (((char *)(ps))[0] = ((char *)(ps))[1] = ((char *)(ps))[2] = ((char *)(ps))[3] = 0)
#else
/* This is just a wild guess, for other platforms.  It likely causes unit test
   failures.  */
/* Bytes 1 and 2 hold the high and low byte of the unit; there is no tag.  */
# define SET_EXTRA_STATE(ps, c16) \
    (((char *)(ps))[1] = (unsigned char) ((c16) >> 8), \
     ((char *)(ps))[2] = (unsigned char) ((c16) & 0xff))
# define GET_EXTRA_STATE(ps) \
    (((unsigned char) ((char *)(ps))[1] << 8) | (unsigned char) ((char *)(ps))[2])
# define RESET_EXTRA_STATE(ps) \
    (((char *)(ps))[1] = ((char *)(ps))[2] = 0)
#endif
/* Conversion state used when the caller passes ps == NULL.  */
static mbstate_t internal_state;

/* Convert the next multibyte character in S (looking at no more than N
   bytes) and store one 16-bit unit of the result in *PWC, unless PWC is NULL.

   A character below 0x10000 is stored as a single unit.  A character in
   the range 0x10000..0x10FFFF is split into two surrogates: the high
   surrogate is stored by this call, and the low surrogate is stashed in *PS
   and delivered by the next call.

   If S is NULL, the call behaves as if PWC were NULL, S were "" and N were 1.
   If PS is NULL, a state object private to this file is used.

   Return value:
     (size_t) -3  the unit stashed by a previous call was stored; no bytes
                  of S were looked at.
     (size_t) -2  N is 0 and no unit was stashed.
     otherwise    whatever mbrtoc32 returned for (S, N, PS).

   Calls abort () if mbrtoc32 produces a value >= 0x110000, or returns
   (size_t) -3 together with a value outside the BMP.  */
size_t
mbrtoc16 (char16_t *pwc, const char *s, size_t n, mbstate_t *ps)
/* NOTE(review): presumably mbrtoc16 may be a macro alias supplied by the
   replacement header, which is dropped here once the name of the definition
   has been expanded -- confirm against the uchar.h replacement.  */
#undef mbrtoc16
{
  /* It's simpler to handle the case s == NULL upfront, than to worry about
     this case later, before every test of pwc and n.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }

  if (ps == NULL)
    ps = &internal_state;

  if (GET_EXTRA_STATE (ps) != 0)
    {
      /* A low surrogate is still pending from the previous call: hand it
         out and forget it, without looking at the input.  */
      if (pwc != NULL)
        *pwc = GET_EXTRA_STATE (ps);
      RESET_EXTRA_STATE (ps);
      return (size_t)(-3);
    }

  if (n == 0)
    return (size_t) -2;

  char32_t c32;
  size_t ret = mbrtoc32 (&c32, s, n, ps);

  /* On these two results c32 is not examined and *pwc is left alone.  */
  if (ret == (size_t)(-1) || ret == (size_t)(-2))
    return ret;

  if (ret == (size_t)(-3))
    {
      /* When mbrtoc32 returns several char32_t values for a single
         multibyte character, they are all in the Unicode BMP range.  */
      if (c32 >= 0x10000)
        abort ();
      if (pwc != NULL)
        *pwc = c32;
    }
  else if (c32 < 0x10000)
    {
      if (pwc != NULL)
        *pwc = c32;
    }
  else
    {
      if (c32 >= 0x110000)
        abort ();
      /* Decompose a Unicode character into a high surrogate and a low
         surrogate.  */
      char16_t surr1 = 0xd800 + ((c32 - 0x10000) >> 10);
      char16_t surr2 = 0xdc00 + ((c32 - 0x10000) & 0x3ff);
      if (pwc != NULL)
        *pwc = surr1;
      /* Keep the low surrogate for the next call.  */
      SET_EXTRA_STATE (ps, surr2);
    }
  return ret;
}