1 vfs: fix data corruption when blocksize < pagesize for mmaped data
3 From: Jan Kara <jack@suse.cz>
5 ->page_mkwrite() is used by filesystems to allocate blocks under a page
6 which is becoming writeably mmapped in some process' address space. This
7 allows a filesystem to return a page fault if there is not enough space
8 available, user exceeds quota or similar problem happens, rather than
9 silently discarding data later when writepage is called.
11 However VFS fails to call ->page_mkwrite() in all the cases where
12 filesystems need it when blocksize < pagesize. For example when
13 blocksize = 1024, pagesize = 4096 the following is problematic:
15 pwrite(fd, buf, 1024, 0);
16 map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0);
17 map[0] = 'a'; ----> page_mkwrite() for index 0 is called
18 ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
19 mremap(map, 1024, 10000, 0);
20 map[4095] = 'a'; ----> no page_mkwrite() called
22 At the moment ->page_mkwrite() is called, filesystem can allocate only
23 one block for the page because i_size == 1024. Otherwise it would create
24 blocks beyond i_size which is generally undesirable. But later at
25 ->writepage() time, we also need to store data at offset 4095 but we
26 don't have block allocated for it.
28 This patch introduces a helper function filesystems can use to have
29 ->page_mkwrite() called at all the necessary moments.
31 Signed-off-by: Jan Kara <jack@suse.cz>
32 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
34 fs/buffer.c        |  3 +++
35 include/linux/mm.h |  1 +
36 mm/truncate.c      | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
37 3 files changed, 61 insertions(+)
39 diff --git a/fs/buffer.c b/fs/buffer.c
40 index 9a6029e..6dc1475 100644
43 @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
44 struct page *page, void *fsdata)
46 struct inode *inode = mapping->host;
47 + loff_t old_size = inode->i_size;
48 int i_size_changed = 0;
50 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
51 @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
53 page_cache_release(page);
56 + pagecache_isize_extended(inode, old_size, pos);
58 * Don't mark the inode dirty under page lock. First, it unnecessarily
59 * makes the holding time of page lock longer. Second, it forces lock
60 diff --git a/include/linux/mm.h b/include/linux/mm.h
61 index 8981cc8..5005464 100644
62 --- a/include/linux/mm.h
63 +++ b/include/linux/mm.h
64 @@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
66 extern void truncate_pagecache(struct inode *inode, loff_t new);
67 extern void truncate_setsize(struct inode *inode, loff_t newsize);
68 +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
69 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
70 int truncate_inode_page(struct address_space *mapping, struct page *page);
71 int generic_error_remove_page(struct address_space *mapping, struct page *page);
72 diff --git a/mm/truncate.c b/mm/truncate.c
73 index 96d1673..261eaf6 100644
77 #include <linux/buffer_head.h> /* grr. try_to_release_page,
79 #include <linux/cleancache.h>
80 +#include <linux/rmap.h>
83 static void clear_exceptional_entry(struct address_space *mapping,
84 @@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache);
86 void truncate_setsize(struct inode *inode, loff_t newsize)
88 + loff_t oldsize = inode->i_size;
90 i_size_write(inode, newsize);
91 + if (newsize > oldsize)
92 + pagecache_isize_extended(inode, oldsize, newsize);
93 truncate_pagecache(inode, newsize);
95 EXPORT_SYMBOL(truncate_setsize);
98 + * pagecache_isize_extended - update pagecache after extension of i_size
99 + * @inode: inode for which i_size was extended
100 + * @from: original inode size
101 + * @to: new inode size
103 + * Handle extension of inode size either caused by extending truncate or by
104 + * write starting after current i_size. We mark the page straddling current
105 + * i_size RO so that page_mkwrite() is called on the nearest write access to
106 + * the page. This way the filesystem can be sure that page_mkwrite() is called
107 + * on the page before a user write to the page via mmap after the i_size has
107 + * been extended.
110 + * The function must be called after i_size is updated so that page fault
111 + * coming after we unlock the page will already see the new i_size.
112 + * The function must be called while we still hold i_mutex - this not only
113 + * makes sure i_size is stable but also that userspace cannot observe new
114 + * i_size value before we are prepared to store mmap writes at new inode size.
116 +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
118 + int bsize = 1 << inode->i_blkbits;
119 + loff_t rounded_from;
123 + WARN_ON(!mutex_is_locked(&inode->i_mutex));
124 + WARN_ON(to > inode->i_size);
126 + if (from >= to || bsize == PAGE_CACHE_SIZE)
128 + /* Page straddling @from will not have any hole block created? */
129 + rounded_from = round_up(from, bsize);
130 + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
133 + index = from >> PAGE_CACHE_SHIFT;
134 + page = find_lock_page(inode->i_mapping, index);
135 + /* Page not cached? Nothing to do */
139 + * See clear_page_dirty_for_io() for details why set_page_dirty()
142 + if (page_mkclean(page))
143 + set_page_dirty(page);
145 + page_cache_release(page);
147 +EXPORT_SYMBOL(pagecache_isize_extended);
150 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
152 * @lstart: offset of beginning of hole