/* Test iconv's TRANSLIT and IGNORE option handling

   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <iconv.h>
#include <locale.h>
#include <errno.h>
#include <string.h>
#include <support/support.h>
#include <support/check.h>
/* Run one iconv test.  Arguments:
   to: destination character set and options
   from: source character set
   input: input string to be converted
   exp_in: expected number of bytes consumed
   exp_ret: expected return value (error or number of irreversible conversions)
   exp_out: expected output string
   exp_err: expected value of `errno' after iconv returns.

   The whole of INPUT (up to its terminating NUL) is fed to a single iconv
   call and every expectation is checked through the TEST_* macros.  */
static void
test_iconv (const char *to, const char *from, char *input, size_t exp_in,
            size_t exp_ret, const char *exp_out, int exp_err)
{
  iconv_t cd;
  char outbuf[500];
  size_t inlen, outlen;
  char *inptr, *outptr;
  size_t n;
  int saved_errno;

  cd = iconv_open (to, from);
  TEST_VERIFY (cd != (iconv_t) -1);
  if (cd == (iconv_t) -1)
    /* The failure has been reported above; there is no descriptor to
       convert with or to close.  */
    return;

  inlen = strlen (input);
  outlen = sizeof (outbuf);
  inptr = input;
  outptr = outbuf;

  /* Clear errno first so that a value left over from an earlier call cannot
     be mistaken for the result of this conversion, and capture it right
     after iconv returns, before any other library call can change it.  */
  errno = 0;
  n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  saved_errno = errno;

  TEST_COMPARE (n, exp_ret);
  TEST_VERIFY (inptr == input + exp_in);
  TEST_COMPARE (saved_errno, exp_err);
  TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
  TEST_VERIFY (iconv_close (cd) == 0);
}
/* We test option parsing by converting UTF-8 inputs to ASCII under various
   option combinations.  The UTF-8 inputs fall into three categories:
   - ASCII-only,
   - non-ASCII,
   - non-ASCII with invalid UTF-8 characters.  */
/* 1. ASCII-only input; it is also its own expected output.  */
char ascii[] = "Just some ASCII text";

/* 2. Valid UTF-8 input and some corresponding expected outputs with various
   options.  The two non-ASCII characters below are accented alphabets:
   an `a' then an `o'.  */
char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
char u2a[] = "UTF-8 text with ";
char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
/* Dropping the accented `a' leaves both of its neighbouring spaces in place,
   hence the two consecutive spaces after "with".  */
char u2a_ignore[] = "UTF-8 text with  couple f non-ASCII characters";

/* 3. Invalid UTF-8 input and some corresponding expected outputs.  \xff is
   invalid UTF-8.  It's followed by some valid but non-ASCII UTF-8.  */
char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
char iu2a[] = "Invalid UTF-8 ";
char iu2a_ignore[] = "Invalid UTF-8 text";
char iu2a_both[] = "Invalid UTF-8 [|text|]";

/* 4. Another invalid UTF-8 input and corresponding expected outputs.  This
   time the valid non-ASCII UTF-8 characters appear before the invalid
   \xff.  */
char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
char ju2a[] = "Invalid ";
char ju2a_translit[] = "Invalid [|UTF-8|] ";
char ju2a_ignore[] = "Invalid UTF-8 text";
char ju2a_both[] = "Invalid [|UTF-8|] text";

/* We also test option handling for character set names that have the form
   "A/B".  In this test, we test conversions "ISO-10646/UTF-8", and either
   ISO-8859-1 or ASCII.  */

/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
   equivalent ASCII transliteration.  The ISO-8859-1 array is explicitly
   NUL-terminated because the tests take its length with strlen.  */
char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's.  */
                      0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's.  */
                      0x00};
char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
                "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
char ascii_a[] = "AAAAAAaaaaaa";

/* 6. An invalid ASCII string where [0] is invalid and [1] is '~'.  */
char iascii[] = {0x80, '~', '\0'};
/* Expected output (and, via strlen, expected consumed length) when the
   conversion stops at the very first byte.  */
char empty[] = "";
char ia2u_ignore[] = "~";
119 xsetlocale (LC_ALL
, "en_US.UTF-8");
122 /* 0. iconv_open should gracefully fail for invalid character sets. */
124 TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t
) -1);
125 TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t
) -1);
126 TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t
) -1);
129 /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */
131 test_iconv ("ASCII", "UTF-8", ascii
, strlen (ascii
), 0, ascii
, 0);
132 test_iconv ("ASCII//", "UTF-8", ascii
, strlen (ascii
), 0, ascii
, 0);
133 test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii
, strlen (ascii
), 0, ascii
, 0);
134 test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii
, strlen (ascii
), 0, ascii
,
136 test_iconv ("ASCII//IGNORE", "UTF-8", ascii
, strlen (ascii
), 0, ascii
, 0);
137 test_iconv ("ASCII//IGNORE//", "UTF-8", ascii
, strlen (ascii
), 0, ascii
, 0);
140 /* 2. Valid UTF-8 input with non-ASCII characters: */
142 /* EILSEQ when converted to ASCII. */
143 test_iconv ("ASCII", "UTF-8", utf8
, strlen (u2a
), (size_t) -1, u2a
, EILSEQ
);
145 /* Converted without error with TRANSLIT enabled. */
146 test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8
, strlen (utf8
), 2, u2a_translit
,
149 /* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */
150 test_iconv ("ASCII//IGNORE", "UTF-8", utf8
, strlen (utf8
), (size_t) -1,
153 /* With TRANSLIT and IGNORE enabled, transliterated without error. We test
154 four combinations. */
156 test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8
, strlen (utf8
), 2,
158 test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8
, strlen (utf8
), 2,
160 test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8
, strlen (utf8
), 2,
162 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
163 test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8
, strlen (utf8
), 2,
166 /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
167 works while respecting any other correctly spelled options. */
169 test_iconv ("ASCII//T", "UTF-8", utf8
, strlen (u2a
), (size_t) -1, u2a
,
171 test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8
, strlen (u2a
), (size_t) -1,
173 test_iconv ("ASCII//I", "UTF-8", utf8
, strlen (u2a
), (size_t) -1, u2a
,
175 test_iconv ("ASCII//IGNORED", "UTF-8", utf8
, strlen (u2a
), (size_t) -1, u2a
,
177 test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8
, strlen (u2a
),
178 (size_t) -1, u2a
, EILSEQ
);
179 test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8
, strlen (u2a
),
180 (size_t) -1, u2a
, EILSEQ
);
181 test_iconv ("ASCII//T//I", "UTF-8", utf8
, strlen (u2a
), (size_t) -1, u2a
,
184 test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8
, strlen (utf8
), 2,
186 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
187 test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8
, strlen (utf8
), 2,
189 test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8
, strlen (utf8
), 2,
191 test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8
, strlen (utf8
), 2,
194 test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8
, strlen (utf8
), (size_t) -1,
196 test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8
, strlen (utf8
), (size_t) -1,
198 /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
199 test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8
, strlen (utf8
),
200 (size_t) -1, u2a_ignore
, EILSEQ
);
201 test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8
, strlen (utf8
),
202 (size_t) -1, u2a_ignore
, EILSEQ
);
205 /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */
207 /* EILSEQ; output is truncated at the first invalid UTF-8 character. */
208 test_iconv ("ASCII", "UTF-8", iutf8
, strlen (iu2a
), (size_t) -1, iu2a
,
211 /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
213 test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8
, strlen (iu2a
), (size_t) -1,
216 /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
217 valid UTF-8 non-ASCII characters. */
218 test_iconv ("ASCII//IGNORE", "UTF-8", iutf8
, strlen (iutf8
), (size_t) -1,
219 iu2a_ignore
, EILSEQ
);
221 /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
222 characters and transliterates valid non-ASCII UTF-8 characters. We test
223 four combinations. */
225 test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8
, strlen (iutf8
), 2,
227 /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
228 test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8
, strlen (iutf8
), 2,
230 test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8
, strlen (iutf8
), 2,
232 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
233 test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8
, strlen (iutf8
), 2,
237 /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */
239 /* EILSEQ; output is truncated at the first non-ASCII character. */
240 test_iconv ("ASCII", "UTF-8", jutf8
, strlen (ju2a
), (size_t) -1, ju2a
,
243 /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
245 test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8
, strlen (jutf8
) - 5,
246 (size_t) -1, ju2a_translit
, EILSEQ
);
247 test_iconv ("ASCII//translit", "UTF-8", jutf8
, strlen (jutf8
) - 5,
248 (size_t) -1, ju2a_translit
, EILSEQ
);
250 /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
251 valid UTF-8 non-ASCII characters. */
252 test_iconv ("ASCII//IGNORE", "UTF-8", jutf8
, strlen (jutf8
), (size_t) -1,
253 ju2a_ignore
, EILSEQ
);
254 test_iconv ("ASCII//ignore", "UTF-8", jutf8
, strlen (jutf8
), (size_t) -1,
255 ju2a_ignore
, EILSEQ
);
257 /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
258 characters and transliterates valid non-ASCII UTF-8 characters. We test
259 several combinations. */
261 test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8
, strlen (jutf8
), 2,
263 /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
264 test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8
, strlen (jutf8
), 2,
266 test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8
, strlen (jutf8
), 2,
268 /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
269 test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8
, strlen (jutf8
), 2,
271 test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8
, strlen (jutf8
), 2,
273 /* Trailing whitespace and separators should be ignored. */
274 test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8
, strlen (jutf8
), 2,
276 test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8
, strlen (jutf8
), 2,
278 test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8
, strlen (jutf8
), 2,
280 test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8
, strlen (jutf8
), 2,
282 test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8
, strlen (jutf8
), 2,
284 test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8
, strlen (jutf8
), 2,
287 /* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */
288 test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8
, strlen (ju2a
), (size_t) -1,
290 test_iconv ("ASCII", "UTF-8//IGNORE", jutf8
, strlen (ju2a
), (size_t) -1,
292 test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8
, strlen (ju2a
),
293 (size_t) -1, ju2a
, EILSEQ
);
296 /* 5. Charset names of the form "A/B/": */
298 /* ISO-8859-1 is converted to UTF-8 without needing transliteration. */
299 test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a
,
300 strlen (iso8859_1_a
), 0, utf8_a
, 0);
301 test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a
,
302 strlen (iso8859_1_a
), 0, utf8_a
, 0);
303 test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a
,
304 strlen (iso8859_1_a
), 0, utf8_a
, 0);
305 test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a
,
306 strlen (iso8859_1_a
), 0, utf8_a
, 0);
307 test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a
,
308 strlen (iso8859_1_a
), 0, utf8_a
, 0);
309 test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a
,
310 strlen (iso8859_1_a
), 0, utf8_a
, 0);
311 test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a
,
312 strlen (iso8859_1_a
), 0, utf8_a
, 0);
313 test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a
,
314 strlen (iso8859_1_a
), 0, utf8_a
, 0);
315 test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a
,
316 strlen (iso8859_1_a
), 0, utf8_a
, 0);
318 /* UTF-8 with accented A's is converted to ASCII with transliteration. */
319 test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a
,
320 0, (size_t) -1, empty
, EILSEQ
);
321 test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a
,
322 strlen (utf8_a
), (size_t) -1, empty
, EILSEQ
);
323 test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a
,
324 strlen (utf8_a
), 12, ascii_a
, 0);
326 /* Invalid ASCII is converted to UTF-8 only with IGNORE. */
327 test_iconv ("ISO-10646/UTF-8", "ASCII", iascii
, strlen (empty
), (size_t) -1,
329 test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii
, strlen (empty
),
330 (size_t) -1, empty
, EILSEQ
);
331 test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii
, strlen (iascii
),
332 (size_t) -1, ia2u_ignore
, EILSEQ
);
333 test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii
,
334 strlen (iascii
), (size_t) -1, ia2u_ignore
, EILSEQ
);
335 /* Due to bug 19519, iconv was ignoring IGNORE for the following three
337 test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii
,
338 strlen (iascii
), (size_t) -1, ia2u_ignore
, EILSEQ
);
339 test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii
,
340 strlen (iascii
), (size_t) -1, ia2u_ignore
, EILSEQ
);
341 test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii
,
342 strlen (iascii
), (size_t) -1, ia2u_ignore
, EILSEQ
);
347 #include <support/test-driver.c>