From 3d4761788ab284687f46ae4c1285363a6b73ad22 Mon Sep 17 00:00:00 2001
From: Colomban Wendling <ban@herbesfolles.org>
Date: Fri, 8 Dec 2023 21:57:05 +0100
Subject: [PATCH] Fix silently truncating files with NULs

This is slightly brittle, so it might require some refactoring.
---
 src/encodings.c        | 63 +++++++++++++++++++++++++-------------------------
 src/encodingsprivate.h |  2 +-
 tests/test_encodings.c | 47 +++++++++++++++----------------------
 3 files changed, 50 insertions(+), 62 deletions(-)

diff --git a/src/encodings.c b/src/encodings.c
index 47359ba3a..b96ddf4fe 100644
--- a/src/encodings.c
+++ b/src/encodings.c
@@ -877,10 +877,9 @@ typedef struct
 {
 	gchar		*data;	/* null-terminated data */
 	gsize		 size;	/* actual data size */
-	gsize		 len;	/* string length of data */
 	gchar		*enc;
 	gboolean	 bom;
-	gboolean	 partial;
+	gboolean	 has_nuls;
 } BufferData;
 
 
@@ -892,7 +891,7 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
 
 	if (utils_str_equal(forced_enc, "UTF-8"))
 	{
-		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
+		if (! g_utf8_validate(buffer->data, buffer->size, NULL))
 		{
 			return FALSE;
 		}
@@ -908,7 +907,8 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
 		else
 		{
 			SETPTR(buffer->data, converted_text);
-			buffer->len = strlen(converted_text);
+			/* we can't succeed with NULs, so this is OK */
+			buffer->size = strlen(converted_text);
 		}
 	}
 	enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
@@ -927,8 +927,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 
 	if (buffer->size == 0)
 	{
-		/* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
-		 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
+		/* we have no data so assume UTF-8 */
 		buffer->enc = g_strdup("UTF-8");
 	}
 	else
@@ -939,14 +938,24 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 			buffer->enc = g_strdup(encodings[enc_idx].charset);
 			buffer->bom = TRUE;
 
-			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
+			if (enc_idx == GEANY_ENCODING_UTF_8)
+			{
+				if (! g_utf8_validate(buffer->data, buffer->size, NULL))
+				{
+					/* this is not actually valid UTF-8 */
+					SETPTR(buffer->enc, NULL);
+					buffer->bom = FALSE;
+				}
+			}
+			else /* the BOM indicated something else than UTF-8 */
 			{
 				gchar *converted_text = encodings_convert_to_utf8_from_charset(
 										buffer->data, buffer->size, buffer->enc, FALSE);
 				if (converted_text != NULL)
 				{
 					SETPTR(buffer->data, converted_text);
-					buffer->len = strlen(converted_text);
+					/* we can't succeed with NULs, so this is OK */
+					buffer->size = strlen(converted_text);
 				}
 				else
 				{
@@ -964,7 +973,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 
 			/* try UTF-8 first */
 			if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
-				(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
+				g_utf8_validate(buffer->data, buffer->size, NULL))
 			{
 				buffer->enc = g_strdup("UTF-8");
 			}
@@ -980,7 +989,8 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
 					return FALSE;
 				}
 				SETPTR(buffer->data, converted_text);
-				buffer->len = strlen(converted_text);
+				/* we can't succeed with NULs, so this is OK */
+				buffer->size = strlen(converted_text);
 			}
 			g_free(regex_charset);
 		}
@@ -997,11 +1007,11 @@ handle_bom(BufferData *buffer)
 	encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
 	g_return_if_fail(bom_len != 0);
 
-	/* use filedata->len here because the contents are already converted into UTF-8 */
-	buffer->len -= bom_len;
+	/* the contents are already converted into UTF-8 here */
+	buffer->size -= bom_len;
 	/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
-	memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
-	buffer->data = g_realloc(buffer->data, buffer->len + 1);
+	memmove(buffer->data, buffer->data + bom_len, buffer->size + 1);
+	buffer->data = g_realloc(buffer->data, buffer->size + 1);
 }
 
 
@@ -1014,16 +1024,6 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
 	 * if we have a BOM */
 	tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
 
-	/* check whether the size of the loaded data is equal to the size of the file in the
-	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
-	 * file size of 0 bytes */
-	if (buffer->len != buffer->size && buffer->size != 0 && (
-		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
-		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
-	{
-		buffer->partial = TRUE;
-	}
-
 	/* Determine character encoding and convert to UTF-8 */
 	if (forced_enc != NULL)
 	{
@@ -1032,6 +1032,7 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
 		{
 			buffer->bom = FALSE;
 			buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
+			buffer->has_nuls = strlen(buffer->data) != buffer->size;
 		}
 		else if (! handle_forced_encoding(buffer, forced_enc))
 		{
@@ -1060,36 +1061,34 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
  * @param forced_enc forced encoding to use, or @c NULL
  * @param used_encoding return location for the actually used encoding, or @c NULL
  * @param has_bom return location to store whether the data had a BOM, or @c NULL
- * @param partial return location to store whether the conversion may be partial, or @c NULL
+ * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL
  *
  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
  */
 GEANY_EXPORT_SYMBOL
 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
-		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
+		gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls)
 {
 	BufferData buffer;
 
 	buffer.data = *buf;
 	buffer.size = *size;
-	/* use strlen to check for null chars */
-	buffer.len = strlen(buffer.data);
 	buffer.enc = NULL;
 	buffer.bom = FALSE;
-	buffer.partial = FALSE;
+	buffer.has_nuls = FALSE;
 
 	if (! handle_buffer(&buffer, forced_enc))
 		return FALSE;
 
-	*size = buffer.len;
+	*size = buffer.size;
 	if (used_encoding)
 		*used_encoding = buffer.enc;
 	else
 		g_free(buffer.enc);
 	if (has_bom)
 		*has_bom = buffer.bom;
-	if (partial)
-		*partial = buffer.partial;
+	if (has_nuls)
+		*has_nuls = buffer.has_nuls;
 
 	*buf = buffer.data;
 	return TRUE;
diff --git a/src/encodingsprivate.h b/src/encodingsprivate.h
index 11b9f1b26..dc9abe5c0 100644
--- a/src/encodingsprivate.h
+++ b/src/encodingsprivate.h
@@ -73,7 +73,7 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell
 gboolean encodings_is_unicode_charset(const gchar *string);
 
 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
-                                        gchar **used_encoding, gboolean *has_bom, gboolean *partial);
+                                        gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls);
 
 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len);
 
diff --git a/tests/test_encodings.c b/tests/test_encodings.c
index 7db6bd5d1..d3c82d564 100644
--- a/tests/test_encodings.c
+++ b/tests/test_encodings.c
@@ -80,13 +80,9 @@ static gboolean assert_convert_to_utf8_auto_impl(
 	fflush(stdout);
 	if (ret)
 	{
-		/* FIXME: that's probably a bug in encodings_convert_to_utf8_auto() */
-		if (size != expected_size && expected_partial)
-			expected_size = strlen(expected_output);
-
-		g_assert_cmpuint(size, ==, expected_size);
 		assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size),
 				domain, file, line, func);
+		g_assert_cmpuint(size, ==, expected_size);
 		if (expected_encoding)
 			g_assert_cmpstr(expected_encoding, ==, used_encoding);
 		g_assert_cmpint(has_bom, ==, expected_has_bom);
@@ -121,9 +117,8 @@ static void test_encodings_convert_ascii_to_utf8_auto(void)
 	TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8");
 	TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL);
 	TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL);
-	/* these fails to report partial output! */
-	/*TEST_ASCII(FALSE, "With\0some\0NULs\0", "None");*/
-	/*TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");*/
+	TEST_ASCII(TRUE, "With\0some\0NULs\0", "None");
+	TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8");
 
 #undef TEST_ASCII
 }
@@ -144,23 +139,24 @@ static void test_encodings_convert_utf8_to_utf8_auto(void)
 	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None");
 	TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8");
 	TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL);
-	/* these fails to report partial output! */
-	/*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8");*/
-	/*TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "None");*/
+	TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */
+	TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */
 
 	/* with the inline hint */
 	TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL);
 	TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL);
 
 	TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL);
-	TEST_UTF8(TRUE, UTF8_BOM"With BOM\0and NULs", NULL);
-	TEST_UTF8(TRUE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);
+	/* These won't pass the UTF-8 validation despite the BOM, so we fallback to
+	 * testing other options, and it will succeed with UTF-16 so there's no real
+	 * point in verifying this */
+	/*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/
+	/*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/
 
 	/* non-UTF-8 */
 	TEST_UTF8(FALSE, "Th\xec""s", "UTF-8");
 	TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8");
-	/* erroneously succeeds and fails to report partial */
-	/*TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");*/
+	TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8");
 
 #undef TEST_UTF8
 #undef UTF8_BOM
@@ -229,9 +225,8 @@ static void test_encodings_convert_utf_other_to_utf8_auto(void)
 
 	/* empty data with BOMs */
 	TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */
-	/* these two actually lead to reading past the buffer's bounds */
-	/*TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");*/
-	/*TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");*/
+	TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE");
+	TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE");
 	TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE");
 	TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE");
 
@@ -257,17 +252,8 @@ static void test_encodings_convert_iso8859_to_utf8_auto(void)
 	TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1");
 	TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15");
 	TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2");
-	/* huh?  the UTF-8 BOM takes over, although \xd3 is NOT valid UTF-8!?
-	 * - file(1) says "iso8859 text", OK
-	 * - kate(1) loads as ISO-8859-15
-	 * - vim(1) loads as "latin1" whatever that means (but looks OK)
-	 * - chardet(1) wrongly reports "UTF-8-SIG with confidence 1.0", which is
-	 *   a tad sad for a tool which only purpose IS detecting encoding...
-	 * - pluma(1) doesn't open it and asks for encoding input
-	 * - gedit(1) opens as broken UTF-8, but warns about it and asks
-	 * - gnome-text-editor(1) is just broken, opens as gedit, but says I don't
-	 *   have permission to open that file :)  looks like a generic error. */
-	/*TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);*/
+	/* make-believe UTF-8 BOM followed by non-UTF-8 data */
+	TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "ï»¿not BÓM", NULL);
 	TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL);
 	/* with NULs */
 	TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1");
@@ -275,6 +261,9 @@ static void test_encodings_convert_iso8859_to_utf8_auto(void)
 	/* This parses as UTF-16, but that's not really what we'd expect */
 	/*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/
 
+	/* UTF-8 BOM with non-UTF-8 data, we should fallback */
+	TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "ï»¿Wìthøut BOM", NULL);
+
 #undef TEST
 }
 
-- 
2.11.4.GIT