1 /* Conversion between the current locale's character encoding and Unicode.
2 Copyright (C) 2023-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2023. */
22 #include "lc-charset-unicode.h"
24 #if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION
26 /* We use iconv() to convert between a 'wchar_t' value and a Unicode code point.
27 For performance reasons, we don't allocate an iconv_t for each conversion,
28 but instead cache it for subsequent conversions. Since an iconv_t descriptor
29 can only be used in a single thread at a time, this cache must be per-thread. */
37 # include "localcharset.h"
39 # include "glthread/lock.h"
40 # include "glthread/tls.h"
43 /* Maximum length of encoding. Attained for "ISO-8859-15". */
/* get_converters checks encoding names against this limit, and the encoding
   member of the converter cache is sized as this value plus one byte. */
44 # define MAX_ENCODING_LEN 11
/* Members of the converter cache (struct converters; the line that opens
   the struct is not visible in this view). */
/* Descriptor that get_converters opens with UTF-8 as target and the locale
   encoding as source. */
48 iconv_t cd_locale_to_utf8
;
/* Descriptor that get_converters opens with the locale encoding as target
   and UTF-8 as source. */
49 iconv_t cd_utf8_to_locale
;
50 /* NUL-terminated encoding name. */
/* Name both descriptors were opened for; get_converters compares it with
   strcmp to detect that the locale encoding has changed. */
51 char encoding
[MAX_ENCODING_LEN
+ 1];
/* File-scope key: set up in key_init together with free_converters, and read
   through gl_tls_get in get_converters. */
54 static gl_tls_key_t converters_key
; /* TLS key for a 'struct converters *' */
56 /* Frees a 'struct converters *', for example when a thread terminates. */
/* P is treated as a pointer to a struct converters; this function is the one
   handed to gl_tls_key_init in key_init (presumably as the cleanup callback
   for converters_key -- confirm against the glthread/tls API).
   NOTE(review): the return type, the braces, any NULL check on P and the
   release of the struct storage itself are not visible in this view. */
58 free_converters (void *p
)
62 struct converters
*conv
= p
;
/* Close both cached iconv descriptors, one per conversion direction. */
63 iconv_close (conv
->cd_locale_to_utf8
);
64 iconv_close (conv
->cd_utf8_to_locale
);
/* Body of key_init (its signature line is not visible in this view):
   initializes converters_key and associates free_converters with it.
   get_converters runs it through gl_once with key_init_once. */
72 gl_tls_key_init (converters_key
, free_converters
);
73 /* The per-thread initial value is NULL. */
76 /* Ensure that key_init is called once only. */
/* key_init_once is the object that get_converters passes to gl_once together
   with key_init. */
77 gl_once_define(static, key_init_once
)
79 /* Returns the per-thread 'struct converters *' that contains converters for the
80 given encoding. Returns NULL upon failure. */
/* ENCODING is an encoding name; both callers in this file pass the result of
   locale_charset ().
   Three situations are visible below:
     - the name is longer than MAX_ENCODING_LEN;
     - a cache entry is allocated and both iconv descriptors are opened;
     - the cached entry carries a different encoding name, so replacement
       descriptors are opened, and only afterwards the old ones are closed
       and overwritten.
   NOTE(review): the braces, the NULL checks, the return statements and the
   code that stores a new entry under converters_key are not visible in this
   view -- confirm against the upstream file. */
81 static struct converters
*
82 get_converters (const char *encoding
)
/* Length guard: the encoding member holds at most MAX_ENCODING_LEN
   characters plus the terminating NUL, and is filled with strcpy below. */
84 if (strlen (encoding
) > MAX_ENCODING_LEN
)
85 /* If this happens, increase MAX_ENCODING_LEN. */
/* Run key_init (at most once) before converters_key is read. */
88 gl_once (key_init_once
, key_init
);
89 struct converters
*conv
= gl_tls_get (converters_key
);
/* Allocation of a new cache entry; presumably reached only when this thread
   has no entry yet (the guarding condition is not visible here). */
92 conv
= (struct converters
*) malloc (sizeof (struct converters
));
/* Open one descriptor per conversion direction. */
96 conv
->cd_locale_to_utf8
= iconv_open ("UTF-8", encoding
);
97 conv
->cd_utf8_to_locale
= iconv_open (encoding
, "UTF-8");
/* If either open failed, the pair is unusable. */
98 if (conv
->cd_locale_to_utf8
== (iconv_t
)(-1)
99 || conv
->cd_utf8_to_locale
== (iconv_t
)(-1))
101 /* iconv does not support this encoding. */
/* Close whichever of the two descriptors did open successfully. */
102 if (conv
->cd_locale_to_utf8
!= (iconv_t
)(-1))
103 iconv_close (conv
->cd_locale_to_utf8
);
104 if (conv
->cd_utf8_to_locale
!= (iconv_t
)(-1))
105 iconv_close (conv
->cd_utf8_to_locale
);
/* Record which encoding the freshly opened descriptors belong to. */
109 strcpy (conv
->encoding
, encoding
);
/* An entry exists but was opened for a different encoding name. */
111 else if (strcmp (conv
->encoding
, encoding
) != 0)
113 /* The locale encoding of this thread changed. */
/* Open the replacements into locals first; the cached descriptors are not
   touched until both opens have been checked. */
114 iconv_t new_cd_locale_to_utf8
= iconv_open ("UTF-8", encoding
);
115 iconv_t new_cd_utf8_to_locale
= iconv_open (encoding
, "UTF-8");
116 if (new_cd_locale_to_utf8
== (iconv_t
)(-1)
117 || new_cd_utf8_to_locale
== (iconv_t
)(-1))
119 /* iconv does not support this encoding. */
/* Close whichever replacement descriptor did open successfully. */
120 if (new_cd_locale_to_utf8
!= (iconv_t
)(-1))
121 iconv_close (new_cd_locale_to_utf8
);
122 if (new_cd_utf8_to_locale
!= (iconv_t
)(-1))
123 iconv_close (new_cd_utf8_to_locale
);
/* Close the old descriptors, install the new ones and record the new
   encoding name. */
126 iconv_close (conv
->cd_locale_to_utf8
);
127 iconv_close (conv
->cd_utf8_to_locale
);
128 conv
->cd_locale_to_utf8
= new_cd_locale_to_utf8
;
129 conv
->cd_utf8_to_locale
= new_cd_utf8_to_locale
;
130 strcpy (conv
->encoding
, encoding
);
/* Maps the wide character WC towards a Unicode code point in three visible
   steps: wcrtomb turns WC into a multibyte sequence, iconv (through the
   cached cd_locale_to_utf8 descriptor) turns that into UTF-8, and u8_mbtouc
   decodes the UTF-8 bytes.  The encoding name comes from locale_charset ()
   and is compared against UTF-8 and ASCII before a converter is fetched.
   NOTE(review): the return type, braces, local declarations (mbbuf, state,
   utf8buf, uc, ...), the return statements and the closing delimiters of two
   comments in this body are not visible in this view -- confirm against the
   upstream file before relying on the result of any individual branch. */
136 locale_encoding_to_unicode (wchar_t wc
)
138 /* This function is like a simplified variant of u32_conv_from_encoding,
139 that uses a cached per-thread iconv_t instead of allocating an iconv_t
142 /* Invalid argument. */
145 const char *encoding
= locale_charset ();
146 if (STREQ_OPT (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
147 /* Assume that if the locale encoding is UTF-8, the wchar_t encoding is
150 if (STREQ_OPT (encoding
, "ASCII", 'A', 'S', 'C', 'I', 'I', 0, 0, 0, 0))
151 /* In the POSIX locale, avoid conversion errors. */
/* Fetch the cached converter pair for this encoding. */
154 struct converters
*conv
= get_converters (encoding
);
/* Step 1: wide character to multibyte sequence.  wcrtomb reports failure as
   (size_t)(-1), which the size comparison below also catches. */
163 mbcnt
= wcrtomb (mbbuf
, wc
, &state
);
164 if (mbcnt
> sizeof (mbbuf
))
165 /* wcrtomb did not recognize the wide character wc. */
/* Step 2: multibyte sequence to UTF-8 through the cached descriptor. */
173 size_t mbsize
= mbcnt
;
174 char *utf8ptr
= utf8buf
;
175 size_t utf8size
= sizeof (utf8buf
);
176 size_t ret
= iconv (conv
->cd_locale_to_utf8
,
178 &utf8ptr
, &utf8size
);
179 if (ret
== (size_t)(-1))
180 /* Conversion error. */
183 /* The input was not entirely converted. */
/* Bytes written = buffer capacity minus the space iconv left unused. */
185 utf8cnt
= sizeof (utf8buf
) - utf8size
; /* = utf8ptr - utf8buf */
187 /* The conversion produced no output. */
/* Step 3: decode the UTF-8 bytes; the decoder must consume exactly utf8cnt
   bytes, i.e. the output must be a single complete character. */
192 if (u8_mbtouc (&uc
, (const uint8_t *) utf8buf
, utf8cnt
) != utf8cnt
)
193 /* iconv produced an invalid UTF-8 byte sequence. */
/* Maps the code point UC towards a wide character in three visible steps:
   u8_uctomb encodes UC as UTF-8, iconv (through the cached cd_utf8_to_locale
   descriptor) turns that into a multibyte sequence, and mbrtowc decodes the
   multibyte sequence.  This mirrors locale_encoding_to_unicode in the
   opposite direction.
   NOTE(review): the return type, braces, local declarations (utf8buf, mbbuf,
   mbptr, state, wc, ...), the return statements, part of the iconv argument
   list and the closing delimiters of two comments in this body are not
   visible in this view -- confirm against the upstream file before relying
   on the result of any individual branch. */
200 unicode_to_locale_encoding (char32_t uc
)
203 /* Invalid argument. */
206 /* This function is like a simplified variant of u32_conv_to_encoding
207 that uses a cached per-thread iconv_t instead of allocating an iconv_t
209 const char *encoding
= locale_charset ();
210 if (STREQ_OPT (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
211 /* Assume that if the locale encoding is UTF-8, the wchar_t encoding is
214 if (STREQ_OPT (encoding
, "ASCII", 'A', 'S', 'C', 'I', 'I', 0, 0, 0, 0))
215 /* In the POSIX locale, avoid conversion errors. */
/* Fetch the cached converter pair for this encoding. */
218 struct converters
*conv
= get_converters (encoding
);
/* Step 1: encode UC as UTF-8 into utf8buf. */
223 int utf8cnt
= u8_uctomb ((uint8_t *) utf8buf
, uc
, sizeof (utf8buf
));
225 /* Out-of-range Unicode character. */
/* Step 2: UTF-8 to multibyte sequence through the cached descriptor. */
231 char *utf8ptr
= utf8buf
;
232 size_t utf8size
= utf8cnt
;
234 size_t mbsize
= sizeof (mbbuf
);
235 size_t ret
= iconv (conv
->cd_utf8_to_locale
,
238 if (ret
== (size_t)(-1))
239 /* Conversion error. */
242 /* The input was not entirely converted. */
/* Bytes written = buffer capacity minus the space iconv left unused. */
244 mbcnt
= sizeof (mbbuf
) - mbsize
; /* = mbptr - mbbuf */
246 /* The conversion produced no output. */
/* Step 3: decode the multibyte sequence; mbrtowc must consume exactly mbcnt
   bytes, i.e. the output must be a single complete character. */
254 if (mbrtowc (&wc
, mbbuf
, mbcnt
, &state
) != mbcnt
)
255 /* iconv produced an invalid multibyte sequence. */
264 /* This declaration is solely to ensure that after preprocessing
265 this file is never empty. */