From 728f8f0a00a22e544789952c76f9eac0c3f8b2c6 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Fri, 2 Sep 2011 22:23:17 -0700
Subject: [PATCH] * fileio.c: Fix bugs with large file offsets.

The previous code assumed that file offsets (off_t values) fit in
EMACS_INT variables, which is not true on typical 32-bit hosts.
The code messed up by falsely reporting buffer overflow in cases
such as (insert-file-contents "big" nil 1 2) into an empty buffer
when "big" contains more than 2**29 bytes, even though this
inserts just one byte and does not overflow the buffer.
(Finsert_file_contents): Store file offsets as off_t
values, not as EMACS_INT values.  Check for overflow when
converting between EMACS_INT and off_t.  When checking for
buffer overflow or for overlap, take the offsets into account.
Don't use EMACS_INT for small values where int suffices.
When checking for overlap, fix a typo: ZV was used where
ZV_BYTE was intended.
(Fwrite_region): Don't assume off_t fits into 'long'.
* buffer.h (struct buffer.modtime_size): Now off_t, not EMACS_INT.
---
 src/ChangeLog | 19 ++++++++++++
 src/buffer.h  |  2 +-
 src/fileio.c  | 92 ++++++++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 82 insertions(+), 31 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 52ec796d6cf..c80a12c1c00 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,22 @@
+2011-09-03  Paul Eggert  <eggert@cs.ucla.edu>
+
+	* fileio.c: Fix bugs with large file offsets.
+	The previous code assumed that file offsets (off_t values) fit in
+	EMACS_INT variables, which is not true on typical 32-bit hosts.
+	The code messed up by falsely reporting buffer overflow in cases
+	such as (insert-file-contents "big" nil 1 2) into an empty buffer
+	when "big" contains more than 2**29 bytes, even though this
+	inserts just one byte and does not overflow the buffer.
+	(Finsert_file_contents): Store file offsets as off_t
+	values, not as EMACS_INT values.  Check for overflow when
+	converting between EMACS_INT and off_t.  When checking for
+	buffer overflow or for overlap, take the offsets into account.
+	Don't use EMACS_INT for small values where int suffices.
+	When checking for overlap, fix a typo: ZV was used where
+	ZV_BYTE was intended.
+	(Fwrite_region): Don't assume off_t fits into 'long'.
+	* buffer.h (struct buffer.modtime_size): Now off_t, not EMACS_INT.
+
 2011-08-30  Chong Yidong  <cyd@stupidchicken.com>
 
 	* syntax.c (find_defun_start): Update all cache variables if
diff --git a/src/buffer.h b/src/buffer.h
index 06864dd5789..c50cfe56c77 100644
--- a/src/buffer.h
+++ b/src/buffer.h
@@ -559,7 +559,7 @@ struct buffer
      is still the same (since it's rounded up to seconds) but we're actually
      not up-to-date.  -1 means the size is unknown.  Only meaningful if
      modtime is actually set.  */
-  EMACS_INT modtime_size;
+  off_t modtime_size;
   /* The value of text->modiff at the last auto-save.  */
   int auto_save_modified;
   /* The value of text->modiff at the last display error.
diff --git a/src/fileio.c b/src/fileio.c
index 60ee35bb399..fe0fb593208 100644
--- a/src/fileio.c
+++ b/src/fileio.c
@@ -3179,6 +3179,7 @@ variable `last-coding-system-used' to the coding system actually used.  */)
   EMACS_INT inserted = 0;
   int nochange = 0;
   register EMACS_INT how_much;
+  off_t beg_offset, end_offset;
   register EMACS_INT unprocessed;
   int count = SPECPDL_INDEX ();
   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
@@ -3284,15 +3285,6 @@ variable `last-coding-system-used' to the coding system actually used.  */)
   record_unwind_protect (close_file_unwind, make_number (fd));
 
 
-  /* Check whether the size is too large or negative, which can happen on a
-     platform that allows file sizes greater than the maximum off_t value.  */
-  if (! not_regular
-      && ! (0 <= st.st_size && st.st_size <= BUF_BYTES_MAX))
-    buffer_overflow ();
-
-  /* Prevent redisplay optimizations.  */
-  current_buffer->clip_changed = 1;
-
   if (!NILP (visit))
     {
       if (!NILP (beg) || !NILP (end))
@@ -3302,26 +3294,64 @@ variable `last-coding-system-used' to the coding system actually used.  */)
     }
 
   if (!NILP (beg))
-    CHECK_NUMBER (beg);
+    {
+      if (! (RANGED_INTEGERP (0, beg, TYPE_MAXIMUM (off_t))))
+	wrong_type_argument (intern ("file-offset"), beg);
+      beg_offset = XFASTINT (beg);
+    }
   else
-    XSETFASTINT (beg, 0);
+    beg_offset = 0;
 
   if (!NILP (end))
-    CHECK_NUMBER (end);
+    {
+      if (! (RANGED_INTEGERP (0, end, TYPE_MAXIMUM (off_t))))
+	wrong_type_argument (intern ("file-offset"), end);
+      end_offset = XFASTINT (end);
+    }
   else
     {
-      if (! not_regular)
+      if (not_regular)
+	end_offset = TYPE_MAXIMUM (off_t);
+      else
 	{
-	  XSETINT (end, st.st_size);
+	  end_offset = st.st_size;
+
+	  /* A negative size can happen on a platform that allows file
+	     sizes greater than the maximum off_t value.  */
+	  if (end_offset < 0)
+	    buffer_overflow ();
 
 	  /* The file size returned from stat may be zero, but data
 	     may be readable nonetheless, for example when this is a
 	     file in the /proc filesystem.  */
-	  if (st.st_size == 0)
-	    XSETINT (end, READ_BUF_SIZE);
+	  if (end_offset == 0)
+	    end_offset = READ_BUF_SIZE;
 	}
     }
 
+  /* Check now whether the buffer will become too large,
+     in the likely case where the file's length is not changing.
+     This saves a lot of needless work before a buffer overflow.  */
+  if (! not_regular)
+    {
+      /* The likely offset where we will stop reading.  We could read
+	 more (or less), if the file grows (or shrinks) as we read it.  */
+      off_t likely_end = min (end_offset, st.st_size);
+
+      if (beg_offset < likely_end)
+	{
+	  ptrdiff_t buf_bytes =
+	    Z_BYTE - (!NILP (replace) ? ZV_BYTE - BEGV_BYTE  : 0);
+	  ptrdiff_t buf_growth_max = BUF_BYTES_MAX - buf_bytes;
+	  off_t likely_growth = likely_end - beg_offset;
+	  if (buf_growth_max < likely_growth)
+	    buffer_overflow ();
+	}
+    }
+
+  /* Prevent redisplay optimizations.  */
+  current_buffer->clip_changed = 1;
+
   if (EQ (Vcoding_system_for_read, Qauto_save_coding))
     {
       coding_system = coding_inherit_eol_type (Qutf_8_emacs, Qunix);
@@ -3465,9 +3495,9 @@ variable `last-coding-system-used' to the coding system actually used.  */)
 	 give up on handling REPLACE in the optimized way.  */
       int giveup_match_end = 0;
 
-      if (XINT (beg) != 0)
+      if (beg_offset != 0)
 	{
-	  if (emacs_lseek (fd, XINT (beg), SEEK_SET) < 0)
+	  if (lseek (fd, beg_offset, SEEK_SET) < 0)
 	    report_file_error ("Setting file position",
 			       Fcons (orig_filename, Qnil));
 	}
@@ -3515,7 +3545,7 @@ variable `last-coding-system-used' to the coding system actually used.  */)
       immediate_quit = 0;
       /* If the file matches the buffer completely,
 	 there's no need to replace anything.  */
-      if (same_at_start - BEGV_BYTE == XINT (end))
+      if (same_at_start - BEGV_BYTE == end_offset)
 	{
 	  emacs_close (fd);
 	  specpdl_ptr--;
@@ -3530,16 +3560,17 @@ variable `last-coding-system-used' to the coding system actually used.  */)
 	 already found that decoding is necessary, don't waste time.  */
       while (!giveup_match_end)
 	{
-	  EMACS_INT total_read, nread, bufpos, curpos, trial;
+	  int total_read, nread, bufpos, trial;
+	  off_t curpos;
 
 	  /* At what file position are we now scanning?  */
-	  curpos = XINT (end) - (ZV_BYTE - same_at_end);
+	  curpos = end_offset - (ZV_BYTE - same_at_end);
 	  /* If the entire file matches the buffer tail, stop the scan.  */
 	  if (curpos == 0)
 	    break;
 	  /* How much can we scan in the next step?  */
 	  trial = min (curpos, sizeof buffer);
-	  if (emacs_lseek (fd, curpos - trial, SEEK_SET) < 0)
+	  if (lseek (fd, curpos - trial, SEEK_SET) < 0)
 	    report_file_error ("Setting file position",
 			       Fcons (orig_filename, Qnil));
 
@@ -3606,13 +3637,14 @@ variable `last-coding-system-used' to the coding system actually used.  */)
 
 	  /* Don't try to reuse the same piece of text twice.  */
 	  overlap = (same_at_start - BEGV_BYTE
-		     - (same_at_end + st.st_size - ZV));
+		     - (same_at_end
+			+ (! NILP (end) ? end_offset : st.st_size) - ZV_BYTE));
 	  if (overlap > 0)
 	    same_at_end += overlap;
 
 	  /* Arrange to read only the nonmatching middle part of the file.  */
-	  XSETFASTINT (beg, XINT (beg) + (same_at_start - BEGV_BYTE));
-	  XSETFASTINT (end, XINT (end) - (ZV_BYTE - same_at_end));
+	  beg_offset += same_at_start - BEGV_BYTE;
+	  end_offset -= ZV_BYTE - same_at_end;
 
 	  del_range_byte (same_at_start, same_at_end, 0);
 	  /* Insert from the file at the proper position.  */
@@ -3657,7 +3689,7 @@ variable `last-coding-system-used' to the coding system actually used.  */)
       /* First read the whole file, performing code conversion into
 	 CONVERSION_BUFFER.  */
 
-      if (emacs_lseek (fd, XINT (beg), SEEK_SET) < 0)
+      if (lseek (fd, beg_offset, SEEK_SET) < 0)
 	report_file_error ("Setting file position",
 			   Fcons (orig_filename, Qnil));
 
@@ -3824,7 +3856,7 @@ variable `last-coding-system-used' to the coding system actually used.  */)
     }
 
   if (! not_regular)
-    total = XINT (end) - XINT (beg);
+    total = end_offset - beg_offset;
   else
     /* For a special file, all we can do is guess.  */
     total = READ_BUF_SIZE;
@@ -3845,9 +3877,9 @@ variable `last-coding-system-used' to the coding system actually used.  */)
   if (GAP_SIZE < total)
     make_gap (total - GAP_SIZE);
 
-  if (XINT (beg) != 0 || !NILP (replace))
+  if (beg_offset != 0 || !NILP (replace))
     {
-      if (emacs_lseek (fd, XINT (beg), SEEK_SET) < 0)
+      if (lseek (fd, beg_offset, SEEK_SET) < 0)
 	report_file_error ("Setting file position",
 			   Fcons (orig_filename, Qnil));
     }
@@ -4576,7 +4608,7 @@ This calls `write-region-annotate-functions' at the start, and
 
   if (!NILP (append) && !NILP (Ffile_regular_p (filename)))
     {
-      long ret;
+      off_t ret;
 
       if (NUMBERP (append))
 	ret = emacs_lseek (desc, XINT (append), SEEK_CUR);
-- 
2.11.4.GIT