2 * Copyright 2023 The Geany contributors
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 #include "encodingsprivate.h"
23 /* Asserts 2 bytes buffers are identical, trying to provide a somewhat useful
25 static void assert_cmpmem_eq_impl(const char *p1
, const char *p2
, gsize len
,
26 const char *domain
, const char *file
, int line
, const char *func
,
32 for (i
= 0; i
< len
&& p1
[i
] == p2
[i
]; i
++)
37 msg
= g_strdup_printf("assertion failed (%s): bytes %#x and %#x differ at offset %lu (at \"%s\" and \"%s\")",
38 expr
, (guint
) (guchar
) p1
[i
], (guint
) (guchar
) p2
[i
], i
, p1
+ i
, p2
+ i
);
39 g_assertion_message(domain
, file
, line
, func
, msg
);
/* Compares the first `len` bytes of `p1` and `p2`, reporting a failure at the
 * location given by `domain`, `file`, `line` and `func`.  The argument
 * expressions are stringified to build the failure message. */
#define assert_cmpmem_eq_with_caller(p1, p2, len, domain, file, line, func) \
	assert_cmpmem_eq_impl(p1, p2, len, domain, file, line, func, #p1 " == " #p2)

/* Same as above, but reports the failure at the macro's expansion site. */
#define assert_cmpmem_eq(p1, p2, len) \
	assert_cmpmem_eq_impl(p1, p2, len, \
			G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, #p1 " == " #p2)
50 * @brief More convenient test API for encodings_convert_to_utf8_auto()
51 * @param input Input buffer, NUL-terminated (well, at least there should be a
53 * @param input_size Actual size of @p input buffer, without the trailing NUL
54 * @param disk_size Size on disk (as reported by e.g stat -- that may be 0 for
55 * virtual files, otherwise should be input_size)
56 * @param forced_enc Forced encoding, or NULL
57 * @param expected_output Expected output data
58 * @param expected_size Expected output size
59 * @param expected_encoding Expected output encoding
60 * @param expected_has_bom Whether the input contains a BOM
61 * @param expected_partial Whether the output is expected to be truncated
62 * @returns Whether the conversion succeeded and followed the parameters
64 static gboolean
assert_convert_to_utf8_auto_impl(
65 const char *domain
, const char *file
, int line
, const char *func
,
66 const gchar
*input
, gsize input_size
,
67 const gsize disk_size
, const gchar
*forced_enc
,
68 const gchar
*expected_output
, gsize expected_size
, const gchar
*expected_encoding
,
69 gboolean expected_has_bom
, gboolean expected_partial
)
71 gchar
*buf
= g_memdup(input
, input_size
+ 1);
72 gsize size
= disk_size
;
73 gchar
*used_encoding
= NULL
;
74 gboolean has_bom
= FALSE
;
75 gboolean partial
= FALSE
;
79 g_log(domain
, G_LOG_LEVEL_INFO
, "%s:%d:%s: converting %lu bytes", file
, line
, func
, input_size
);
80 ret
= encodings_convert_to_utf8_auto(&buf
, &size
, forced_enc
, &used_encoding
, &has_bom
, &partial
, &err
);
84 g_log(domain
, G_LOG_LEVEL_INFO
, "%s:%d:%s: conversion failed: %s", file
, line
, func
, err
->message
);
89 assert_cmpmem_eq_with_caller(buf
, expected_output
, MIN(size
, expected_size
),
90 domain
, file
, line
, func
);
91 g_assert_cmpuint(size
, ==, expected_size
);
92 if (expected_encoding
)
93 g_assert_cmpstr(expected_encoding
, ==, used_encoding
);
94 g_assert_cmpint(has_bom
, ==, expected_has_bom
);
95 g_assert_cmpint(partial
, ==, expected_partial
);
97 g_free(used_encoding
);
/* Calls assert_convert_to_utf8_auto_impl() with the log domain and the
 * expansion site's file, line and function name filled in, so messages and
 * failures are attributed to the test that used the macro. */
#define assert_convert_to_utf8_auto(input, input_size, disk_size, forced_enc, \
		expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) \
	assert_convert_to_utf8_auto_impl(G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, \
			input, input_size, disk_size, forced_enc, \
			expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial)
113 static void test_encodings_convert_ascii_to_utf8_auto(void)
115 #define TEST_ASCII(success, str, forced_enc) \
116 g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \
117 forced_enc, str, G_N_ELEMENTS(str) - 1, forced_enc, FALSE, \
118 strlen(str) != G_N_ELEMENTS(str) - 1))
120 TEST_ASCII(TRUE
, "This is a very basic ASCII test", NULL
);
121 TEST_ASCII(TRUE
, "This is a very basic ASCII test", "None");
122 TEST_ASCII(TRUE
, "This is a very basic ASCII test", "ASCII");
123 TEST_ASCII(TRUE
, "This is a very basic ASCII test", "UTF-8");
124 TEST_ASCII(TRUE
, "S\till ve\ry \b\asic", NULL
);
125 TEST_ASCII(FALSE
, "With\0some\0NULs\0", NULL
);
126 TEST_ASCII(TRUE
, "With\0some\0NULs\0", "None");
127 TEST_ASCII(FALSE
, "With\0some\0NULs\0", "UTF-8");
133 static void test_encodings_convert_utf8_to_utf8_auto(void)
135 #define UTF8_BOM "\xef\xbb\xbf"
136 #define TEST_UTF8(success, str, forced_enc) \
138 gboolean has_bom = strncmp(str, UTF8_BOM, 3) == 0; \
139 g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \
140 forced_enc, str + (has_bom ? 3 : 0), G_N_ELEMENTS(str) - 1 - (has_bom ? 3 : 0), \
141 forced_enc, has_bom, strlen(str) != G_N_ELEMENTS(str) - 1)); \
144 TEST_UTF8(TRUE
, "Thĩs îs å véry basìč ÅSÇǏÍ test", NULL
);
145 TEST_UTF8(TRUE
, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None");
146 TEST_UTF8(TRUE
, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8");
147 TEST_UTF8(FALSE
, "Wíťh\0søme\0NÙLs\0", NULL
);
148 TEST_UTF8(FALSE
, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */
149 TEST_UTF8(TRUE
, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */
151 /* with the inline hint */
152 TEST_UTF8(TRUE
, "coding:utf-8 bãśïč", NULL
);
153 TEST_UTF8(FALSE
, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL
);
155 TEST_UTF8(TRUE
, UTF8_BOM
"With BOM", NULL
);
156 /* These won't pass the UTF-8 validation despite the BOM, so we fallback to
157 * testing other options, and it will succeed with UTF-16 so there's no real
158 * point in verifying this */
159 /*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/
160 /*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/
163 TEST_UTF8(FALSE
, "Th\xec""s", "UTF-8");
164 TEST_UTF8(FALSE
, "Th\xec""s\0", "UTF-8");
165 TEST_UTF8(FALSE
, "\0Th\xec""s", "UTF-8");
172 static void test_encodings_convert_utf_other_to_utf8_auto(void)
174 #define UTF16_LE_BOM "\xff\xfe"
175 #define UTF16_BE_BOM "\xfe\xff"
176 #define UTF32_LE_BOM "\xff\xfe\x00\x00"
177 #define UTF32_BE_BOM "\x00\x00\xfe\xff"
178 #define TEST_ENC(success, input, output, has_bom, forced_enc, expected_encoding) \
179 g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
180 forced_enc, output, G_N_ELEMENTS(output) - 1, expected_encoding, has_bom, \
181 strlen(output) != G_N_ELEMENTS(output) - 1))
182 #define TEST(success, input, output, has_bom, forced_enc) \
183 TEST_ENC(success, input, output, has_bom, forced_enc, forced_enc)
185 TEST(TRUE
, "N\000o\000 \000B\000O\000M\000", "No BOM", FALSE
, NULL
);
186 TEST(TRUE
, "N\000o\000 \000B\000\330\000M\000", "No BØM", FALSE
, NULL
);
187 /* doesn't accept the NULs */
188 TEST(FALSE
, "N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "No BOM\0and NULs", FALSE
, NULL
);
189 TEST(FALSE
, "N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "No BØM\0añd NÙLs", FALSE
, NULL
);
191 TEST(TRUE
, UTF16_LE_BOM
"W\000i\000t\000h\000 \000B\000O\000M\000", "With BOM", TRUE
, NULL
);
192 TEST(TRUE
, UTF16_LE_BOM
"W\000i\000t\000h\000 \000B\000\330\000M\000", "With BØM", TRUE
, NULL
);
193 /* doesn't accept the NULs */
194 TEST(FALSE
, UTF16_LE_BOM
"W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "With BOM\0and NULs", TRUE
, NULL
);
195 TEST(FALSE
, UTF16_LE_BOM
"W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "Wíth BØM\0añd NÙLs", TRUE
, NULL
);
197 /* We should actually be smarter in our selection of encoding introducing
198 * probability scores, because this loads as UTF-16LE but is "圀椀琀栀 䈀伀䴀"
199 * which doesn't seem to be real Chinese */
200 TEST(TRUE
, "\000N\000o\000 \000B\000O\000M", "No BOM", FALSE
, "UTF-16BE");
201 TEST(TRUE
, "\000N\000o\000 \000B\000\330\000M", "No BØM", FALSE
, NULL
);
202 /* doesn't accept the NULs -- and see above for the encoding choice */
203 TEST(FALSE
, "\000N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "No BOM\0and NULs", FALSE
, "UTF-16BE");
204 TEST(FALSE
, "\000N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "No BØM\0añd NÙLs", FALSE
, NULL
);
206 TEST(TRUE
, UTF16_BE_BOM
"\000W\000i\000t\000h\000 \000B\000O\000M", "With BOM", TRUE
, NULL
);
207 TEST(TRUE
, UTF16_BE_BOM
"\000W\000i\000t\000h\000 \000B\000\330\000M", "With BØM", TRUE
, NULL
);
208 /* doesn't accept the NULs */
209 TEST(FALSE
, UTF16_BE_BOM
"\000W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "With BOM\0and NULs", TRUE
, NULL
);
210 TEST(FALSE
, UTF16_BE_BOM
"\000W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "Wíth BØM\0añd NÙLs", TRUE
, NULL
);
212 TEST(TRUE
, UTF32_LE_BOM
"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000", "With BOM", TRUE
, NULL
);
213 TEST(TRUE
, UTF32_LE_BOM
"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000", "With BØM", TRUE
, NULL
);
214 /* doesn't accept the NULs */
215 TEST(FALSE
, UTF32_LE_BOM
"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s\000\000\000", "With BOM\0and NULs", TRUE
, NULL
);
216 TEST(FALSE
, UTF32_LE_BOM
"W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s\000\000\000", "Wíth BØM\0añd NÙLs", TRUE
, NULL
);
218 TEST(TRUE
, UTF32_BE_BOM
"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M", "With BOM", TRUE
, NULL
);
219 TEST(TRUE
, UTF32_BE_BOM
"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M", "With BØM", TRUE
, NULL
);
220 /* doesn't accept the NULs */
221 TEST(FALSE
, UTF32_BE_BOM
"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s", "With BOM\0and NULs", TRUE
, NULL
);
222 TEST(FALSE
, UTF32_BE_BOM
"\000\000\000W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s", "Wíth BØM\0añd NÙLs", TRUE
, NULL
);
225 TEST(TRUE
, "No B+ANg-M", "No BØM", FALSE
, "UTF-7");
226 TEST(TRUE
, "+/v8-With B+ANg-M", "With BØM", TRUE
, NULL
);
227 TEST(FALSE
, "No B+ANg-M+AAA-but NULs", "No BØM\0but NULs", FALSE
, "UTF-7");
228 /* Fails to load as UTF-7 because of the NUL, and succeeds as UTF-8 but
229 * obviously doesn't match expectations */
230 /*TEST(FALSE, "+/v8-With B+ANg-M+AAA-and NULs", "With BØM\0and NULs", TRUE, NULL);*/
232 /* empty data with BOMs */
233 TEST_ENC(TRUE
, "+/v8-", "", TRUE
, NULL
, "UTF-7"); /* UTF-7 */
234 TEST_ENC(TRUE
, UTF16_BE_BOM
, "", TRUE
, NULL
, "UTF-16BE");
235 TEST_ENC(TRUE
, UTF16_LE_BOM
, "", TRUE
, NULL
, "UTF-16LE");
236 TEST_ENC(TRUE
, UTF32_BE_BOM
, "", TRUE
, NULL
, "UTF-32BE");
237 TEST_ENC(TRUE
, UTF32_LE_BOM
, "", TRUE
, NULL
, "UTF-32LE");
248 static void test_encodings_convert_iso8859_to_utf8_auto(void)
250 #define TEST(success, input, output, forced_enc) \
251 g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
252 forced_enc, output, G_N_ELEMENTS(output) - 1, forced_enc, FALSE, \
253 strlen(output) != G_N_ELEMENTS(output) - 1))
255 TEST(TRUE
, "Th\xec""s", "Thìs", NULL
);
256 TEST(TRUE
, "Th\xec""s", "Thìs", "ISO-8859-1");
257 TEST(TRUE
, "Th\xec""s", "Thìs", "ISO-8859-15");
258 TEST(TRUE
, "\xa4""uro", "¤uro", "ISO-8859-1");
259 TEST(TRUE
, "\xa4""uro", "€uro", "ISO-8859-15");
260 TEST(TRUE
, "\xd8""ed", "Řed", "ISO-8859-2");
261 /* make-believe UTF-8 BOM followed by non-UTF-8 data */
262 TEST(TRUE
, "\xef\xbb\xbf""not B\xd3M", "not BÓM", NULL
);
263 TEST(TRUE
, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL
);
265 TEST(FALSE
, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1");
266 TEST(FALSE
, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15");
267 /* This parses as UTF-16, but that's not really what we'd expect */
268 /*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/
270 /* UTF-8 BOM with non-UTF-8 data, we should fallback */
271 TEST(TRUE
, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "Wìthøut BOM", NULL
);
277 int main(int argc
, char **argv
)
279 g_test_init(&argc
, &argv
, NULL
);
280 gtk_init_check(&argc
, &argv
);
281 main_init_headless();
283 g_test_add_func("/encodings/ascii/convert_to_utf8_auto", test_encodings_convert_ascii_to_utf8_auto
);
284 g_test_add_func("/encodings/utf8/convert_to_utf8_auto", test_encodings_convert_utf8_to_utf8_auto
);
285 g_test_add_func("/encodings/utf_other/convert_to_utf_other_auto", test_encodings_convert_utf_other_to_utf8_auto
);
286 g_test_add_func("/encodings/iso8859/convert_to_utf8_auto", test_encodings_convert_iso8859_to_utf8_auto
);