1 vfs: fix data corruption when blocksize < pagesize for mmaped data
3 From: Jan Kara <jack@suse.cz>
5 ->page_mkwrite() is used by filesystems to allocate blocks under a page
6 which is becoming writeably mmapped in some process' address space. This
7 allows a filesystem to return a page fault if there is not enough space
8 available, user exceeds quota or similar problem happens, rather than
9 silently discarding data later when writepage is called.
11 However VFS fails to call ->page_mkwrite() in all the cases where
12 filesystems need it when blocksize < pagesize. For example when
13 blocksize = 1024, pagesize = 4096 the following is problematic:
15 pwrite(fd, buf, 1024, 0);
16 map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0);
17 map[0] = 'a'; ----> page_mkwrite() for index 0 is called
18 ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
19 mremap(map, 1024, 10000, 0);
20 map[4095] = 'a'; ----> no page_mkwrite() called
22 At the moment ->page_mkwrite() is called, filesystem can allocate only
23 one block for the page because i_size == 1024. Otherwise it would create
24 blocks beyond i_size which is generally undesirable. But later at
25 ->writepage() time, we also need to store data at offset 4095 but we
26 don't have block allocated for it.
28 This patch introduces a helper function filesystems can use to have
29 ->page_mkwrite() called at all the necessary moments.
31 Signed-off-by: Jan Kara <jack@suse.cz>
32 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
34 fs/buffer.c        |  3 +++
35 include/linux/mm.h |  1 +
36 mm/truncate.c      | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
37 3 files changed, 61 insertions(+)
39 diff --git a/fs/buffer.c b/fs/buffer.c
40 index 9a6029e..6dc1475 100644
43 @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
44 struct page *page, void *fsdata)
46 struct inode *inode = mapping->host;
47 + loff_t old_size = inode->i_size;
48 int i_size_changed = 0;
50 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
51 @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
53 page_cache_release(page);
56 + pagecache_isize_extended(inode, old_size, pos);
58 * Don't mark the inode dirty under page lock. First, it unnecessarily
59 * makes the holding time of page lock longer. Second, it forces lock
60 diff --git a/include/linux/mm.h b/include/linux/mm.h
61 index 8981cc8..5005464 100644
62 --- a/include/linux/mm.h
63 +++ b/include/linux/mm.h
64 @@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
66 extern void truncate_pagecache(struct inode *inode, loff_t new);
67 extern void truncate_setsize(struct inode *inode, loff_t newsize);
68 +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
69 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
70 int truncate_inode_page(struct address_space *mapping, struct page *page);
71 int generic_error_remove_page(struct address_space *mapping, struct page *page);
72 diff --git a/mm/truncate.c b/mm/truncate.c
73 index 96d1673..261eaf6 100644
77 #include <linux/buffer_head.h> /* grr. try_to_release_page,
79 #include <linux/cleancache.h>
80 +#include <linux/rmap.h>
83 static void clear_exceptional_entry(struct address_space *mapping,
84 @@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache);
86 void truncate_setsize(struct inode *inode, loff_t newsize)
88 + loff_t oldsize = inode->i_size;
90 i_size_write(inode, newsize);
91 + if (newsize > oldsize)
92 + pagecache_isize_extended(inode, oldsize, newsize);
93 truncate_pagecache(inode, newsize);
95 EXPORT_SYMBOL(truncate_setsize);
98 + * pagecache_isize_extended - update pagecache after extension of i_size
99 + * @inode: inode for which i_size was extended
100 + * @from: original inode size
101 + * @to: new inode size
103 + * Handle extension of inode size either caused by extending truncate or by
104 + * write starting after current i_size. We mark the page straddling current
105 + * i_size RO so that page_mkwrite() is called on the nearest write access to
106 + * the page. This way the filesystem can be sure that page_mkwrite() is called
107 + * on the page before a user write to the page via mmap after the i_size has
107 + * been extended.
110 + * The function must be called after i_size is updated so that page fault
111 + * coming after we unlock the page will already see the new i_size.
112 + * The function must be called while we still hold i_mutex - this not only
113 + * makes sure i_size is stable but also that userspace cannot observe new
114 + * i_size value before we are prepared to store mmap writes at new inode size.
116 +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
118 + int bsize = 1 << inode->i_blkbits;
119 + loff_t rounded_from;
123 + WARN_ON(!mutex_is_locked(&inode->i_mutex));
124 + WARN_ON(to > inode->i_size);
126 + if (from >= to || bsize == PAGE_CACHE_SIZE)
128 + /* Page straddling @from will not have any hole block created? */
129 + rounded_from = round_up(from, bsize);
130 + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
133 + index = from >> PAGE_CACHE_SHIFT;
134 + page = find_lock_page(inode->i_mapping, index);
135 + /* Page not cached? Nothing to do */
139 + * See clear_page_dirty_for_io() for details why set_page_dirty()
142 + if (page_mkclean(page))
143 + set_page_dirty(page);
145 + page_cache_release(page);
147 +EXPORT_SYMBOL(pagecache_isize_extended);
150 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
152 * @lstart: offset of beginning of hole