exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / lc-charset-unicode.c
blobf6529c1600fa77e6786b4b448da79843c2065436
1 /* Conversion between the current locale's character encoding and Unicode.
2 Copyright (C) 2023-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2023. */
19 #include <config.h>
21 /* Specification. */
22 #include "lc-charset-unicode.h"
24 #if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
26 /* We use iconv() to convert between a 'wchar_t' value and a Unicode code point.
27 For performance reasons, we don't allocate an iconv_t for each conversion,
28 but instead cache it for subsequent conversions. Since an iconv_t descriptor
29 can only be used in a single thread at a time, this cache must be
30 per-thread. */
32 # include <iconv.h>
33 # include <stdlib.h>
34 # include <string.h>
35 # include <wchar.h>
37 # include "localcharset.h"
38 # include "streq.h"
39 # include "glthread/lock.h"
40 # include "glthread/tls.h"
41 # include "unistr.h"
/* Longest encoding name that fits in the per-thread cache, not counting
   the terminating NUL.  "ISO-8859-15" has exactly this length.  */
# define MAX_ENCODING_LEN 11

/* A pair of iconv descriptors for one locale encoding, together with the
   name of the encoding they were opened for.  */
struct converters
{
  /* Descriptor opened as iconv_open ("UTF-8", encoding).  */
  iconv_t cd_locale_to_utf8;
  /* Descriptor opened as iconv_open (encoding, "UTF-8").  */
  iconv_t cd_utf8_to_locale;
  /* Encoding name, always NUL-terminated.  */
  char encoding[MAX_ENCODING_LEN + 1];
};
54 static gl_tls_key_t converters_key; /* TLS key for a 'struct converters *' */
56 /* Frees a 'struct converters *', for example when a thread terminates. */
57 static void
58 free_converters (void *p)
60 if (p != NULL)
62 struct converters *conv = p;
63 iconv_close (conv->cd_locale_to_utf8);
64 iconv_close (conv->cd_utf8_to_locale);
65 free (conv);
69 static void
70 key_init (void)
72 gl_tls_key_init (converters_key, free_converters);
73 /* The per-thread initial value is NULL. */
76 /* Ensure that key_init is called once only. */
77 gl_once_define(static, key_init_once)
79 /* Returns the per-thread 'struct converters *' that contains converters for the
80 given encoding. Returns NULL upon failure. */
81 static struct converters *
82 get_converters (const char *encoding)
84 if (strlen (encoding) > MAX_ENCODING_LEN)
85 /* If this happens, increase MAX_ENCODING_LEN. */
86 return NULL;
88 gl_once (key_init_once, key_init);
89 struct converters *conv = gl_tls_get (converters_key);
90 if (conv == NULL)
92 conv = (struct converters *) malloc (sizeof (struct converters));
93 if (conv == NULL)
94 /* Out of memory. */
95 return NULL;
96 conv->cd_locale_to_utf8 = iconv_open ("UTF-8", encoding);
97 conv->cd_utf8_to_locale = iconv_open (encoding, "UTF-8");
98 if (conv->cd_locale_to_utf8 == (iconv_t)(-1)
99 || conv->cd_utf8_to_locale == (iconv_t)(-1))
101 /* iconv does not support this encoding. */
102 if (conv->cd_locale_to_utf8 != (iconv_t)(-1))
103 iconv_close (conv->cd_locale_to_utf8);
104 if (conv->cd_utf8_to_locale != (iconv_t)(-1))
105 iconv_close (conv->cd_utf8_to_locale);
106 free (conv);
107 return NULL;
109 strcpy (conv->encoding, encoding);
111 else if (strcmp (conv->encoding, encoding) != 0)
113 /* The locale encoding of this thread changed. */
114 iconv_t new_cd_locale_to_utf8 = iconv_open ("UTF-8", encoding);
115 iconv_t new_cd_utf8_to_locale = iconv_open (encoding, "UTF-8");
116 if (new_cd_locale_to_utf8 == (iconv_t)(-1)
117 || new_cd_utf8_to_locale == (iconv_t)(-1))
119 /* iconv does not support this encoding. */
120 if (new_cd_locale_to_utf8 != (iconv_t)(-1))
121 iconv_close (new_cd_locale_to_utf8);
122 if (new_cd_utf8_to_locale != (iconv_t)(-1))
123 iconv_close (new_cd_utf8_to_locale);
124 return NULL;
126 iconv_close (conv->cd_locale_to_utf8);
127 iconv_close (conv->cd_utf8_to_locale);
128 conv->cd_locale_to_utf8 = new_cd_locale_to_utf8;
129 conv->cd_utf8_to_locale = new_cd_utf8_to_locale;
130 strcpy (conv->encoding, encoding);
132 return conv;
135 char32_t
136 locale_encoding_to_unicode (wchar_t wc)
138 /* This function is like a simplified variant of u32_conv_from_encoding,
139 that uses a cached per-thread iconv_t instead of allocating an iconv_t
140 at each call. */
141 if (wc == 0)
142 /* Invalid argument. */
143 abort ();
145 const char *encoding = locale_charset ();
146 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
147 /* Assume that if the locale encoding is UTF-8, the wchar_t encoding is
148 Unicode. */
149 return wc;
150 if (STREQ_OPT (encoding, "ASCII", 'A', 'S', 'C', 'I', 'I', 0, 0, 0, 0))
151 /* In the POSIX locale, avoid conversion errors. */
152 return wc;
154 struct converters *conv = get_converters (encoding);
155 if (conv == NULL)
156 return 0;
158 char mbbuf[64];
159 size_t mbcnt;
161 mbstate_t state;
162 mbszero (&state);
163 mbcnt = wcrtomb (mbbuf, wc, &state);
164 if (mbcnt > sizeof (mbbuf))
165 /* wcrtomb did not recognize the wide character wc. */
166 abort ();
169 char utf8buf[6];
170 size_t utf8cnt;
172 char *mbptr = mbbuf;
173 size_t mbsize = mbcnt;
174 char *utf8ptr = utf8buf;
175 size_t utf8size = sizeof (utf8buf);
176 size_t ret = iconv (conv->cd_locale_to_utf8,
177 &mbptr, &mbsize,
178 &utf8ptr, &utf8size);
179 if (ret == (size_t)(-1))
180 /* Conversion error. */
181 return 0;
182 if (mbsize != 0)
183 /* The input was not entirely converted. */
184 return 0;
185 utf8cnt = sizeof (utf8buf) - utf8size; /* = utf8ptr - utf8buf */
186 if (utf8cnt == 0)
187 /* The conversion produced no output. */
188 return 0;
191 ucs4_t uc;
192 if (u8_mbtouc (&uc, (const uint8_t *) utf8buf, utf8cnt) != utf8cnt)
193 /* iconv produced an invalid UTF-8 byte sequence. */
194 abort ();
196 return uc;
199 wchar_t
200 unicode_to_locale_encoding (char32_t uc)
202 if (uc == 0)
203 /* Invalid argument. */
204 abort ();
206 /* This function is like a simplified variant of u32_conv_to_encoding
207 that uses a cached per-thread iconv_t instead of allocating an iconv_t
208 at each call. */
209 const char *encoding = locale_charset ();
210 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
211 /* Assume that if the locale encoding is UTF-8, the wchar_t encoding is
212 Unicode. */
213 return uc;
214 if (STREQ_OPT (encoding, "ASCII", 'A', 'S', 'C', 'I', 'I', 0, 0, 0, 0))
215 /* In the POSIX locale, avoid conversion errors. */
216 return uc;
218 struct converters *conv = get_converters (encoding);
219 if (conv == NULL)
220 return 0;
222 char utf8buf[6];
223 int utf8cnt = u8_uctomb ((uint8_t *) utf8buf, uc, sizeof (utf8buf));
224 if (utf8cnt < 0)
225 /* Out-of-range Unicode character. */
226 return 0;
228 char mbbuf[64];
229 size_t mbcnt;
231 char *utf8ptr = utf8buf;
232 size_t utf8size = utf8cnt;
233 char *mbptr = mbbuf;
234 size_t mbsize = sizeof (mbbuf);
235 size_t ret = iconv (conv->cd_utf8_to_locale,
236 &utf8ptr, &utf8size,
237 &mbptr, &mbsize);
238 if (ret == (size_t)(-1))
239 /* Conversion error. */
240 return 0;
241 if (utf8size != 0)
242 /* The input was not entirely converted. */
243 return 0;
244 mbcnt = sizeof (mbbuf) - mbsize; /* = mbptr - mbbuf */
245 if (mbcnt == 0)
246 /* The conversion produced no output. */
247 return 0;
250 wchar_t wc;
252 mbstate_t state;
253 mbszero (&state);
254 if (mbrtowc (&wc, mbbuf, mbcnt, &state) != mbcnt)
255 /* iconv produced an invalid multibyte sequence. */
256 return 0;
259 return wc;
262 #else
/* Placeholder declaration: its only purpose is to keep this translation
   unit from being empty after preprocessing when the conversion code above
   is compiled out.  */
typedef int dummy;
268 #endif