tests/test_encodings.c

   1 /*
   2  *      Copyright 2023 The Geany contributors
   3  *
   4  *      This program is free software; you can redistribute it and/or modify
   5  *      it under the terms of the GNU General Public License as published by
   6  *      the Free Software Foundation; either version 2 of the License, or
   7  *      (at your option) any later version.
   8  *
   9  *      This program is distributed in the hope that it will be useful,
  10  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *      GNU General Public License for more details.
  13  *
  14  *      You should have received a copy of the GNU General Public License along
  15  *      with this program; if not, write to the Free Software Foundation, Inc.,
  16  *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  17  */
  18
  19 #include "encodingsprivate.h"
  20 #include "main.h"
  21
  22
  23 /* Asserts 2 bytes buffers are identical, trying to provide a somewhat useful
  24  * error if not. */
  25 static void assert_cmpmem_eq_impl(const char *p1, const char *p2, gsize len,
  26                 const char *domain, const char *file, int line, const char *func,
  27                 const char *expr)
  28 {
  29         gchar *msg;
  30         gsize i;
  31
  32         for (i = 0; i < len && p1[i] == p2[i]; i++)
  33                 ;
  34         if (i == len)
  35                 return;
  36
  37         msg = g_strdup_printf("assertion failed (%s): bytes %#x and %#x differ at offset %lu (at \"%s\" and \"%s\")",
  38                         expr, (guint) (guchar) p1[i], (guint) (guchar) p2[i], i, p1 + i, p2 + i);
  39         g_assertion_message(domain, file, line, func, msg);
  40         g_free(msg);
  41 }
  42
  43 #define assert_cmpmem_eq_with_caller(p1, p2, len, domain, file, line, func) \
  44         assert_cmpmem_eq_impl(p1, p2, len, domain, file, line, func, #p1 " == " #p2)
  45
  46 #define assert_cmpmem_eq(p1, p2, len) assert_cmpmem_eq_impl(p1, p2, len, \
  47                 G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, #p1 " == " #p2)
  48
  49 /*
  50  * @brief More convenient test API for encodings_convert_to_utf8_auto()
  51  * @param input Input buffer, NUL-terminated (well, at least there should be a
  52  *        trailing NUL).
  53  * @param input_size Actual size of @p input buffer, without the trailing NUL
  54  * @param disk_size Size on disk (as reported by e.g stat -- that may be 0 for
  55  *                  virtual files, otherwise should be input_size)
  56  * @param forced_enc Forced encoding, or NULL
  57  * @param expected_output Expected output data
  58  * @param expected_size Expected output size
  59  * @param expected_encoding Expected output encoding
  60  * @param expected_has_bom Whether the input contains a BOM
  61  * @param expected_partial Whether the output is expected to be truncated
  62  * @returns Whether the conversion succeeded and followed the parameters
  63  */
  64 static gboolean assert_convert_to_utf8_auto_impl(
  65                 const char *domain, const char *file, int line, const char *func,
  66                 const gchar *input, gsize input_size,
  67                 const gsize disk_size, const gchar *forced_enc,
  68                 const gchar *expected_output, gsize expected_size, const gchar *expected_encoding,
  69                 gboolean expected_has_bom, gboolean expected_partial)
  70 {
  71         gchar *buf = g_memdup(input, input_size + 1);
  72         gsize size = disk_size;
  73         gchar *used_encoding = NULL;
  74         gboolean has_bom = FALSE;
  75         gboolean partial = FALSE;
  76         gboolean ret;
  77         GError *err = NULL;
  78
  79         g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: converting %lu bytes", file, line, func, input_size);
  80         ret = encodings_convert_to_utf8_auto(&buf, &size, forced_enc, &used_encoding, &has_bom, &partial, &err);
  81         fflush(stdout);
  82         if (! ret)
  83         {
  84                 g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: conversion failed: %s", file, line, func, err->message);
  85                 g_error_free(err);
  86         }
  87         else
  88         {
  89                 assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size),
  90                                 domain, file, line, func);
  91                 g_assert_cmpuint(size, ==, expected_size);
  92                 if (expected_encoding)
  93                         g_assert_cmpstr(expected_encoding, ==, used_encoding);
  94                 g_assert_cmpint(has_bom, ==, expected_has_bom);
  95                 g_assert_cmpint(partial, ==, expected_partial);
  96
  97                 g_free(used_encoding);
  98         }
  99
 100         g_free(buf);
 101
 102         return ret;
 103 }
 104
 105
 106 #define assert_convert_to_utf8_auto(input, input_size, disk_size, forced_enc, \
 107                 expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) \
 108         assert_convert_to_utf8_auto_impl(G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, \
 109                         input, input_size, disk_size, forced_enc, \
 110                         expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial)
 111
 112
 113 static void test_encodings_convert_ascii_to_utf8_auto(void)
 114 {
 115 #define TEST_ASCII(success, str, forced_enc) \
 116                 g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \
 117                                 forced_enc, str, G_N_ELEMENTS(str) - 1, forced_enc, FALSE, \
 118                                 strlen(str) != G_N_ELEMENTS(str) - 1))
 119
 120         TEST_ASCII(TRUE, "This is a very basic ASCII test", NULL);
 121         TEST_ASCII(TRUE, "This is a very basic ASCII test", "None");
 122         TEST_ASCII(TRUE, "This is a very basic ASCII test", "ASCII");
 123         TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8");
 124         TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL);
 125         TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL);
 126         TEST_ASCII(TRUE, "With\0some\0NULs\0", "None");
 127         TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");
 128
 129 #undef TEST_ASCII
 130 }
 131
 132
 133 static void test_encodings_convert_utf8_to_utf8_auto(void)
 134 {
 135 #define UTF8_BOM "\xef\xbb\xbf"
 136 #define TEST_UTF8(success, str, forced_enc)                                                                                                                                     \
 137         G_STMT_START {                                                                                                                                                                                  \
 138                 gboolean has_bom = strncmp(str, UTF8_BOM, 3) == 0;                                                                                                      \
 139                 g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1,      \
 140                                 forced_enc, str + (has_bom ? 3 : 0), G_N_ELEMENTS(str) - 1 - (has_bom ? 3 : 0),                         \
 141                                 forced_enc, has_bom, strlen(str) != G_N_ELEMENTS(str) - 1));                                                            \
 142         } G_STMT_END
 143
 144         TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", NULL);
 145         TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None");
 146         TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8");
 147         TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL);
 148         TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */
 149         TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */
 150
 151         /* with the inline hint */
 152         TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL);
 153         TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL);
 154
 155         TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL);
 156         /* These won't pass the UTF-8 validation despite the BOM, so we fallback to
 157          * testing other options, and it will succeed with UTF-16 so there's no real
 158          * point in verifying this */
 159         /*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/
 160         /*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/
 161
 162         /* non-UTF-8 */
 163         TEST_UTF8(FALSE, "Th\xec""s", "UTF-8");
 164         TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8");
 165         TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");
 166
 167 #undef TEST_UTF8
 168 #undef UTF8_BOM
 169 }
 170
 171
 172 static void test_encodings_convert_utf_other_to_utf8_auto(void)
 173 {
 174 #define UTF16_LE_BOM "\xff\xfe"
 175 #define UTF16_BE_BOM "\xfe\xff"
 176 #define UTF32_LE_BOM "\xff\xfe\x00\x00"
 177 #define UTF32_BE_BOM "\x00\x00\xfe\xff"
 178 #define TEST_ENC(success, input, output, has_bom, forced_enc, expected_encoding) \
 179                 g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
 180                                 forced_enc, output, G_N_ELEMENTS(output) - 1, expected_encoding, has_bom, \
 181                                 strlen(output) != G_N_ELEMENTS(output) - 1))
 182 #define TEST(success, input, output, has_bom, forced_enc) \
 183                 TEST_ENC(success, input, output, has_bom, forced_enc, forced_enc)
 184
 185         TEST(TRUE, "N\000o\000 \000B\000O\000M\000", "No BOM", FALSE, NULL);
 186         TEST(TRUE, "N\000o\000 \000B\000\330\000M\000", "No BØM", FALSE, NULL);
 187         /* doesn't accept the NULs */
 188         TEST(FALSE, "N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "No BOM\0and NULs", FALSE, NULL);
 189         TEST(FALSE, "N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "No BØM\0añd NÙLs", FALSE, NULL);
 190
 191         TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000", "With BOM", TRUE, NULL);
 192         TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000\330\000M\000", "With BØM", TRUE, NULL);
 193         /* doesn't accept the NULs */
 194         TEST(FALSE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "With BOM\0and NULs", TRUE, NULL);
 195         TEST(FALSE, UTF16_LE_BOM"W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "Wíth BØM\0añd NÙLs", TRUE, NULL);
 196
 197         /* We should actually be smarter in our selection of encoding introducing
 198          * probability scores, because this loads as UTF-16LE but is "圀椀琀栀 䈀伀䴀"
 199          * which doesn't seem to be real Chinese */
 200         TEST(TRUE, "\000N\000o\000 \000B\000O\000M", "No BOM", FALSE, "UTF-16BE");
 201         TEST(TRUE, "\000N\000o\000 \000B\000\330\000M", "No BØM", FALSE, NULL);
 202         /* doesn't accept the NULs -- and see above for the encoding choice */
 203         TEST(FALSE, "\000N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "No BOM\0and NULs", FALSE, "UTF-16BE");
 204         TEST(FALSE, "\000N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "No BØM\0añd NÙLs", FALSE, NULL);
 205
 206         TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M", "With BOM", TRUE, NULL);
 207         TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000\330\000M", "With BØM", TRUE, NULL);
 208         /* doesn't accept the NULs */
 209         TEST(FALSE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "With BOM\0and NULs", TRUE, NULL);
 210         TEST(FALSE, UTF16_BE_BOM"\000W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL);
 211
 212         TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000", "With BOM", TRUE, NULL);
 213         TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000", "With BØM", TRUE, NULL);
 214         /* doesn't accept the NULs */
 215         TEST(FALSE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s\000\000\000", "With BOM\0and NULs", TRUE, NULL);
 216         TEST(FALSE, UTF32_LE_BOM"W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s\000\000\000", "Wíth BØM\0añd NÙLs", TRUE, NULL);
 217
 218         TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M", "With BOM", TRUE, NULL);
 219         TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M", "With BØM", TRUE, NULL);
 220         /* doesn't accept the NULs */
 221         TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s", "With BOM\0and NULs", TRUE, NULL);
 222         TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL);
 223
 224         /* meh, UTF-7 */
 225         TEST(TRUE, "No B+ANg-M", "No BØM", FALSE, "UTF-7");
 226         TEST(TRUE, "+/v8-With B+ANg-M", "With BØM", TRUE, NULL);
 227         TEST(FALSE, "No B+ANg-M+AAA-but NULs", "No BØM\0but NULs", FALSE, "UTF-7");
 228         /* Fails to load as UTF-7 because of the NUL, and succeeds as UTF-8 but
 229          * obviously doesn't match expectations */
 230         /*TEST(FALSE, "+/v8-With B+ANg-M+AAA-and NULs", "With BØM\0and NULs", TRUE, NULL);*/
 231
 232         /* empty data with BOMs */
 233         TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */
 234         TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");
 235         TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");
 236         TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE");
 237         TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE");
 238
 239 #undef TEST
 240 #undef TEST_ENC
 241 #undef UTF32_BE_BOM
 242 #undef UTF32_LE_BOM
 243 #undef UTF16_BE_BOM
 244 #undef UTF16_LE_BOM
 245 }
 246
 247
 248 static void test_encodings_convert_iso8859_to_utf8_auto(void)
 249 {
 250 #define TEST(success, input, output, forced_enc) \
 251                 g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \
 252                                 forced_enc, output, G_N_ELEMENTS(output) - 1, forced_enc, FALSE, \
 253                                 strlen(output) != G_N_ELEMENTS(output) - 1))
 254
 255         TEST(TRUE, "Th\xec""s", "Thìs", NULL);
 256         TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-1");
 257         TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-15");
 258         TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1");
 259         TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15");
 260         TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2");
 261         /* make-believe UTF-8 BOM followed by non-UTF-8 data */
 262         TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);
 263         TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL);
 264         /* with NULs */
 265         TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1");
 266         TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15");
 267         /* This parses as UTF-16, but that's not really what we'd expect */
 268         /*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/
 269
 270         /* UTF-8 BOM with non-UTF-8 data, we should fallback */
 271         TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "ï»¿Wìthøut BOM", NULL);
 272
 273 #undef TEST
 274 }
 275
 276
 277 int main(int argc, char **argv)
 278 {
 279         g_test_init(&argc, &argv, NULL);
 280         gtk_init_check(&argc, &argv);
 281         main_init_headless();
 282
 283         g_test_add_func("/encodings/ascii/convert_to_utf8_auto", test_encodings_convert_ascii_to_utf8_auto);
 284         g_test_add_func("/encodings/utf8/convert_to_utf8_auto", test_encodings_convert_utf8_to_utf8_auto);
 285         g_test_add_func("/encodings/utf_other/convert_to_utf_other_auto", test_encodings_convert_utf_other_to_utf8_auto);
 286         g_test_add_func("/encodings/iso8859/convert_to_utf8_auto", test_encodings_convert_iso8859_to_utf8_auto);
 287
 288         return g_test_run();
 289 }