1 /* Character set conversion with error handling and autodetection.
2 Copyright (C) 2002, 2005, 2007, 2009-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
21 #include "striconveha.h"
28 #include "c-strcase.h"
29 #include "striconveh.h"
/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized before indexing so that array-valued
   expressions such as '*(T (*)[N]) p' are measured correctly.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
34 /* Autodetection list. */
/* One entry of the autodetection list: an alias name that may be passed as
   the source encoding, together with the real encodings to try for it.  */
struct autodetect_alias
{
  /* Next entry of the singly linked list, or NULL for the last entry.  */
  struct autodetect_alias *next;
  /* Alias name, compared with strcmp against the requested from_codeset.  */
  const char *name;
  /* NULL-terminated list of encoding names, tried in order.  The traversal
     loops read the first element unconditionally, so it must not be empty.  */
  const char * const *encodings_to_try;
};
/* Encodings tried, in order, for the "autodetect_utf8" alias.
   The list is NULL-terminated.  */
static const char * const autodetect_utf8_try[] =
{
  /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
     be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
  "UTF-8", "ISO-8859-1",
  NULL
};
/* Encodings tried, in order, for the "autodetect_jp" alias.
   The list is NULL-terminated.  */
static const char * const autodetect_jp_try[] =
{
  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
     it will fail.
     Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
     is unavoidable. People will condemn SHIFT_JIS.
     If we tried SHIFT_JIS first, then some short EUC-JP inputs would
     come out wrong, and people would condemn EUC-JP and Unix, which
     would not be good.
     Finally try SHIFT_JIS.  */
  "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
  NULL
};
/* Encodings tried, in order, for the "autodetect_kr" alias.
   The list is NULL-terminated.  */
static const char * const autodetect_kr_try[] =
{
  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
     it will fail.
     Finally try EUC-KR.  */
  "ISO-2022-KR", "EUC-KR",
  NULL
};
72 static struct autodetect_alias autodetect_predefined
[] =
74 { &autodetect_predefined
[1], "autodetect_utf8", autodetect_utf8_try
},
75 { &autodetect_predefined
[2], "autodetect_jp", autodetect_jp_try
},
76 { NULL
, "autodetect_kr", autodetect_kr_try
}
79 static struct autodetect_alias
*autodetect_list
= &autodetect_predefined
[0];
80 static struct autodetect_alias
**autodetect_list_end
=
81 &autodetect_predefined
[SIZEOF(autodetect_predefined
)-1].next
;
84 uniconv_register_autodetect (const char *name
,
85 const char * const *try_in_order
)
92 struct autodetect_alias
*new_alias
;
94 const char **new_try_in_order
;
96 /* The TRY_IN_ORDER list must not be empty. */
97 if (try_in_order
[0] == NULL
)
103 /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
104 with dynamic extent. */
105 namelen
= strlen (name
) + 1;
106 memneed
= sizeof (struct autodetect_alias
) + namelen
+ sizeof (char *);
107 for (i
= 0; try_in_order
[i
] != NULL
; i
++)
108 memneed
+= sizeof (char *) + strlen (try_in_order
[i
]) + 1;
111 memory
= (char *) malloc (memneed
);
114 new_alias
= (struct autodetect_alias
*) memory
;
115 memory
+= sizeof (struct autodetect_alias
);
117 new_try_in_order
= (const char **) memory
;
118 memory
+= (listlen
+ 1) * sizeof (char *);
120 new_name
= (char *) memory
;
121 memcpy (new_name
, name
, namelen
);
124 for (i
= 0; i
< listlen
; i
++)
126 size_t len
= strlen (try_in_order
[i
]) + 1;
127 memcpy (memory
, try_in_order
[i
], len
);
128 new_try_in_order
[i
] = (const char *) memory
;
131 new_try_in_order
[i
] = NULL
;
133 /* Now insert the new alias. */
134 new_alias
->name
= new_name
;
135 new_alias
->encodings_to_try
= new_try_in_order
;
136 new_alias
->next
= NULL
;
137 /* FIXME: Not multithread-safe. */
138 *autodetect_list_end
= new_alias
;
139 autodetect_list_end
= &new_alias
->next
;
149 /* Like mem_iconveha, except no handling of transliteration. */
151 mem_iconveha_notranslit (const char *src
, size_t srclen
,
152 const char *from_codeset
, const char *to_codeset
,
153 enum iconv_ilseq_handler handler
,
155 char **resultp
, size_t *lengthp
)
157 int retval
= mem_iconveh (src
, srclen
, from_codeset
, to_codeset
, handler
,
158 offsets
, resultp
, lengthp
);
159 if (retval
>= 0 || errno
!= EINVAL
)
163 struct autodetect_alias
*alias
;
165 /* Unsupported from_codeset or to_codeset. Check whether the caller
166 requested autodetection. */
167 for (alias
= autodetect_list
; alias
!= NULL
; alias
= alias
->next
)
168 if (strcmp (from_codeset
, alias
->name
) == 0)
170 const char * const *encodings
;
172 if (handler
!= iconveh_error
)
174 /* First try all encodings without any forgiving. */
175 encodings
= alias
->encodings_to_try
;
178 retval
= mem_iconveha_notranslit (src
, srclen
,
179 *encodings
, to_codeset
,
180 iconveh_error
, offsets
,
182 if (!(retval
< 0 && errno
== EILSEQ
))
186 while (*encodings
!= NULL
);
189 encodings
= alias
->encodings_to_try
;
192 retval
= mem_iconveha_notranslit (src
, srclen
,
193 *encodings
, to_codeset
,
196 if (!(retval
< 0 && errno
== EILSEQ
))
200 while (*encodings
!= NULL
);
202 /* Return the last call's result. */
206 /* It wasn't an autodetection name. */
213 mem_iconveha (const char *src
, size_t srclen
,
214 const char *from_codeset
, const char *to_codeset
,
216 enum iconv_ilseq_handler handler
,
218 char **resultp
, size_t *lengthp
)
222 /* Nothing to convert. */
227 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
228 we want to use transliteration. */
229 #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
230 && !defined __UCLIBC__) \
231 || _LIBICONV_VERSION >= 0x0105
235 size_t len
= strlen (to_codeset
);
236 char *to_codeset_suffixed
= (char *) malloca (len
+ 10 + 1);
237 memcpy (to_codeset_suffixed
, to_codeset
, len
);
238 memcpy (to_codeset_suffixed
+ len
, "//TRANSLIT", 10 + 1);
240 retval
= mem_iconveha_notranslit (src
, srclen
,
241 from_codeset
, to_codeset_suffixed
,
242 handler
, offsets
, resultp
, lengthp
);
244 freea (to_codeset_suffixed
);
250 return mem_iconveha_notranslit (src
, srclen
,
251 from_codeset
, to_codeset
,
252 handler
, offsets
, resultp
, lengthp
);
255 /* Like str_iconveha, except no handling of transliteration. */
257 str_iconveha_notranslit (const char *src
,
258 const char *from_codeset
, const char *to_codeset
,
259 enum iconv_ilseq_handler handler
)
261 char *result
= str_iconveh (src
, from_codeset
, to_codeset
, handler
);
263 if (result
!= NULL
|| errno
!= EINVAL
)
267 struct autodetect_alias
*alias
;
269 /* Unsupported from_codeset or to_codeset. Check whether the caller
270 requested autodetection. */
271 for (alias
= autodetect_list
; alias
!= NULL
; alias
= alias
->next
)
272 if (strcmp (from_codeset
, alias
->name
) == 0)
274 const char * const *encodings
;
276 if (handler
!= iconveh_error
)
278 /* First try all encodings without any forgiving. */
279 encodings
= alias
->encodings_to_try
;
282 result
= str_iconveha_notranslit (src
,
283 *encodings
, to_codeset
,
285 if (!(result
== NULL
&& errno
== EILSEQ
))
289 while (*encodings
!= NULL
);
292 encodings
= alias
->encodings_to_try
;
295 result
= str_iconveha_notranslit (src
,
296 *encodings
, to_codeset
,
298 if (!(result
== NULL
&& errno
== EILSEQ
))
302 while (*encodings
!= NULL
);
304 /* Return the last call's result. */
308 /* It wasn't an autodetection name. */
315 str_iconveha (const char *src
,
316 const char *from_codeset
, const char *to_codeset
,
318 enum iconv_ilseq_handler handler
)
320 if (*src
== '\0' || c_strcasecmp (from_codeset
, to_codeset
) == 0)
322 char *result
= strdup (src
);
329 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
330 we want to use transliteration. */
331 #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
332 && !defined __UCLIBC__) \
333 || _LIBICONV_VERSION >= 0x0105
337 size_t len
= strlen (to_codeset
);
338 char *to_codeset_suffixed
= (char *) malloca (len
+ 10 + 1);
339 memcpy (to_codeset_suffixed
, to_codeset
, len
);
340 memcpy (to_codeset_suffixed
+ len
, "//TRANSLIT", 10 + 1);
342 result
= str_iconveha_notranslit (src
, from_codeset
, to_codeset_suffixed
,
345 freea (to_codeset_suffixed
);
351 return str_iconveha_notranslit (src
, from_codeset
, to_codeset
, handler
);