From 3d4761788ab284687f46ae4c1285363a6b73ad22 Mon Sep 17 00:00:00 2001 From: Colomban Wendling Date: Fri, 8 Dec 2023 21:57:05 +0100 Subject: [PATCH] Fix silently truncating files with NULs This is slightly brittle, so it might require some refactoring. --- src/encodings.c | 63 +++++++++++++++++++++++++------------------------- src/encodingsprivate.h | 2 +- tests/test_encodings.c | 47 +++++++++++++++---------------------- 3 files changed, 50 insertions(+), 62 deletions(-) diff --git a/src/encodings.c b/src/encodings.c index 47359ba3a..b96ddf4fe 100644 --- a/src/encodings.c +++ b/src/encodings.c @@ -877,10 +877,9 @@ typedef struct { gchar *data; /* null-terminated data */ gsize size; /* actual data size */ - gsize len; /* string length of data */ gchar *enc; gboolean bom; - gboolean partial; + gboolean has_nuls; } BufferData; @@ -892,7 +891,7 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) if (utils_str_equal(forced_enc, "UTF-8")) { - if (! g_utf8_validate(buffer->data, buffer->len, NULL)) + if (! g_utf8_validate(buffer->data, buffer->size, NULL)) { return FALSE; } @@ -908,7 +907,8 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) else { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); + /* we can't succeed with NULs, so this is OK */ + buffer->size = strlen(converted_text); } } enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); @@ -927,8 +927,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) if (buffer->size == 0) { - /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty - * e.g. UTF32 file with a BOM(so size is 4, len is 0) */ + /* we have no data so assume UTF-8 */ buffer->enc = g_strdup("UTF-8"); } else @@ -939,14 +938,24 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) buffer->enc = g_strdup(encodings[enc_idx].charset); buffer->bom = TRUE; - if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */ + if (enc_idx == GEANY_ENCODING_UTF_8) + { + if (! g_utf8_validate(buffer->data, buffer->size, NULL)) + { + /* this is not actually valid UTF-8 */ + SETPTR(buffer->enc, NULL); + buffer->bom = FALSE; + } + } + else /* the BOM indicated something else than UTF-8 */ { gchar *converted_text = encodings_convert_to_utf8_from_charset( buffer->data, buffer->size, buffer->enc, FALSE); if (converted_text != NULL) { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); + /* we can't succeed with NULs, so this is OK */ + buffer->size = strlen(converted_text); } else { @@ -964,7 +973,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) /* try UTF-8 first */ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 && - (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL)) + g_utf8_validate(buffer->data, buffer->size, NULL)) { buffer->enc = g_strdup("UTF-8"); } @@ -980,7 +989,8 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) return FALSE; } SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); + /* we can't succeed with NULs, so this is OK */ + buffer->size = strlen(converted_text); } g_free(regex_charset); } @@ -997,11 +1007,11 @@ handle_bom(BufferData *buffer) encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len); g_return_if_fail(bom_len != 0); - /* use filedata->len here because the contents are already converted into UTF-8 */ - buffer->len -= bom_len; + /* the contents are already converted into UTF-8 here */ + buffer->size -= bom_len; /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */ - memmove(buffer->data, buffer->data + bom_len, buffer->len + 1); - buffer->data = g_realloc(buffer->data, buffer->len + 1); + memmove(buffer->data, buffer->data + bom_len, buffer->size + 1); + buffer->data = g_realloc(buffer->data, buffer->size + 1); } @@ -1014,16 +1024,6 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * if we have a BOM */ tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); - /* check whether the size of the loaded data is equal to the size of the file in the - * filesystem file size may be 0 to allow opening files in /proc/ which have typically a - * file size of 0 bytes */ - if (buffer->len != buffer->size && buffer->size != 0 && ( - tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */ - tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */ - { - buffer->partial = TRUE; - } - /* Determine character encoding and convert to UTF-8 */ if (forced_enc != NULL) { @@ -1032,6 +1032,7 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) { buffer->bom = FALSE; buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset); + buffer->has_nuls = strlen(buffer->data) != buffer->size; } else if (! handle_forced_encoding(buffer, forced_enc)) { @@ -1060,36 +1061,34 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * @param forced_enc forced encoding to use, or @c NULL * @param used_encoding return location for the actually used encoding, or @c NULL * @param has_bom return location to store whether the data had a BOM, or @c NULL - * @param partial return location to store whether the conversion may be partial, or @c NULL + * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL * * @return @C TRUE if the conversion succeeded, @c FALSE otherwise. */ GEANY_EXPORT_SYMBOL gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial) + gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls) { BufferData buffer; buffer.data = *buf; buffer.size = *size; - /* use strlen to check for null chars */ - buffer.len = strlen(buffer.data); buffer.enc = NULL; buffer.bom = FALSE; - buffer.partial = FALSE; + buffer.has_nuls = FALSE; if (! handle_buffer(&buffer, forced_enc)) return FALSE; - *size = buffer.len; + *size = buffer.size; if (used_encoding) *used_encoding = buffer.enc; else g_free(buffer.enc); if (has_bom) *has_bom = buffer.bom; - if (partial) - *partial = buffer.partial; + if (has_nuls) + *has_nuls = buffer.has_nuls; *buf = buffer.data; return TRUE; diff --git a/src/encodingsprivate.h b/src/encodingsprivate.h index 11b9f1b26..dc9abe5c0 100644 --- a/src/encodingsprivate.h +++ b/src/encodingsprivate.h @@ -73,7 +73,7 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell gboolean encodings_is_unicode_charset(const gchar *string); gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial); + gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls); GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len); diff --git a/tests/test_encodings.c b/tests/test_encodings.c index 7db6bd5d1..d3c82d564 100644 --- a/tests/test_encodings.c +++ b/tests/test_encodings.c @@ -80,13 +80,9 @@ static gboolean assert_convert_to_utf8_auto_impl( fflush(stdout); if (ret) { - /* FIXME: that's probably a bug in encodings_convert_to_utf8_auto() */ - if (size != expected_size && expected_partial) - expected_size = strlen(expected_output); - - g_assert_cmpuint(size, ==, expected_size); assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size), domain, file, line, func); + g_assert_cmpuint(size, ==, expected_size); if (expected_encoding) g_assert_cmpstr(expected_encoding, ==, used_encoding); g_assert_cmpint(has_bom, ==, expected_has_bom); @@ -121,9 +117,8 @@ static void test_encodings_convert_ascii_to_utf8_auto(void) TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8"); TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL); TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL); - /* these fails to report partial output! */ - /*TEST_ASCII(FALSE, "With\0some\0NULs\0", "None");*/ - /*TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");*/ + TEST_ASCII(TRUE, "With\0some\0NULs\0", "None"); + TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8"); #undef TEST_ASCII } @@ -144,23 +139,24 @@ static void test_encodings_convert_utf8_to_utf8_auto(void) TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None"); TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8"); TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL); - /* these fails to report partial output! */ - /*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8");*/ - /*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "None");*/ + TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */ + TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */ /* with the inline hint */ TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL); TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL); TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL); - TEST_UTF8(TRUE, UTF8_BOM"With BOM\0and NULs", NULL); - TEST_UTF8(TRUE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL); + /* These won't pass the UTF-8 validation despite the BOM, so we fallback to + * testing other options, and it will succeed with UTF-16 so there's no real + * point in verifying this */ + /*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/ + /*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/ /* non-UTF-8 */ TEST_UTF8(FALSE, "Th\xec""s", "UTF-8"); TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8"); - /* erroneously succeeds and fails to report partial */ - /*TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");*/ + TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8"); #undef TEST_UTF8 #undef UTF8_BOM @@ -229,9 +225,8 @@ static void test_encodings_convert_utf_other_to_utf8_auto(void) /* empty data with BOMs */ TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */ - /* these two actually lead to reading past the buffer's bounds */ - /*TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");*/ - /*TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");*/ + TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE"); + TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE"); TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE"); TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE"); @@ -257,17 +252,8 @@ static void test_encodings_convert_iso8859_to_utf8_auto(void) TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1"); TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15"); TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2"); - /* huh? the UTF-8 BOM takes over, although \xd3 is NOT valid UTF-8!? - * - file(1) says "iso8859 text", OK - * - kate(1) loads as ISO-8859-15 - * - vim(1) loads as "latin1" whatever that means (but looks OK) - * - chardet(1) wrongly reports "UTF-8-SIG with confidence 1.0", which is - * a tad sad for a tool which only purpose IS detecting encoding... - * - pluma(1) doesn't open it and asks for encoding input - * - gedit(1) opens as broken UTF-8, but warns about it and asks - * - gnome-text-editor(1) is just broken, opens as gedit, but says I don't - * have permission to open that file :) looks like a generic error. */ - /*TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "not BÓM", NULL);*/ + /* make-believe UTF-8 BOM followed by non-UTF-8 data */ + TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "not BÓM", NULL); TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL); /* with NULs */ TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1"); @@ -275,6 +261,9 @@ static void test_encodings_convert_iso8859_to_utf8_auto(void) /* This parses as UTF-16, but that's not really what we'd expect */ /*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/ + /* UTF-8 BOM with non-UTF-8 data, we should fallback */ + TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "Wìthøut BOM", NULL); + #undef TEST } -- 2.11.4.GIT