1 ext4: fix races between page faults and hole punching
3 From: Jan Kara <jack@suse.com>
5 Currently, page faults and hole punching are completely
6 unsynchronized. This can result in page fault faulting in a page into
7 a range that we are punching after truncate_pagecache_range() has been
8 called and thus we can end up with a page mapped to disk blocks that
9 will be shortly freed. Filesystem corruption will shortly follow. Note
10 that the same race is avoided for truncate by checking page fault
11 offset against i_size but there isn't similar mechanism available for
12 punching holes.
14 Fix the problem by creating new rw semaphore i_mmap_sem in inode and
15 grab it for writing over truncate, hole punching, and other functions
16 removing blocks from extent tree and for read over page faults. We
17 cannot easily use i_data_sem for this since that ranks below transaction
18 start and we need something ranking above it so that it can be held over
19 the whole truncate / hole punching operation. Also remove various
20 workarounds we had in the code to reduce race window when page fault
21 could have created pages with stale mapping information.
23 Signed-off-by: Jan Kara <jack@suse.com>
24 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
26 fs/ext4/ext4.h | 10 +++++++++
27 fs/ext4/extents.c | 54 ++++++++++++++++++++++++--------------------
28 fs/ext4/file.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++--------
29 fs/ext4/inode.c | 36 +++++++++++++++++++++--------
30 fs/ext4/super.c | 1 +
31 fs/ext4/truncate.h | 2 ++
32 6 files changed, 127 insertions(+), 42 deletions(-)
34 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
35 index fd1f28be5296..c19ff61ccbdf 100644
38 @@ -869,6 +869,15 @@ struct ext4_inode_info {
39 * by other means, so we have i_data_sem.
41 struct rw_semaphore i_data_sem;
43 + * i_mmap_sem is for serializing page faults with truncate / punch hole
44 + * operations. We have to make sure that new page cannot be faulted in
45 + * a section of the inode that is being punched. We cannot easily use
46 + * i_data_sem for this since we need protection for the whole punch
47 + * operation and i_data_sem ranks below transaction start so we have
48 + * to occasionally drop it.
50 + struct rw_semaphore i_mmap_sem;
51 struct inode vfs_inode;
52 struct jbd2_inode *jinode;
54 @@ -2316,6 +2325,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
55 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
56 loff_t lstart, loff_t lend);
57 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
58 +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
59 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
60 extern void ext4_da_update_reserve_space(struct inode *inode,
61 int used, int quota_claim);
62 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
63 index 2553aa8b608d..6e0c5d37a232 100644
64 --- a/fs/ext4/extents.c
65 +++ b/fs/ext4/extents.c
66 @@ -4766,7 +4766,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
67 int partial_begin, partial_end;
70 - struct address_space *mapping = inode->i_mapping;
71 unsigned int blkbits = inode->i_blkbits;
73 trace_ext4_zero_range(inode, offset, len, mode);
74 @@ -4782,17 +4781,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
78 - * Write out all dirty pages to avoid race conditions
79 - * Then release them.
81 - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
82 - ret = filemap_write_and_wait_range(mapping, offset,
89 * Round up offset. This is not fallocate, we neet to zero out
90 * blocks, so convert interior block aligned part of the range to
91 * unwritten and possibly manually zero out unaligned parts of the
92 @@ -4852,16 +4840,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
93 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
96 - /* Now release the pages and zero block aligned part of pages*/
97 - truncate_pagecache_range(inode, start, end - 1);
98 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
100 /* Wait all existing dio workers, newcomers will block on i_mutex */
101 ext4_inode_block_unlocked_dio(inode);
102 inode_dio_wait(inode);
105 + * Prevent page faults from reinstantiating pages we have
106 + * released from page cache.
108 + down_write(&EXT4_I(inode)->i_mmap_sem);
109 + /* Now release the pages and zero block aligned part of pages */
110 + truncate_pagecache_range(inode, start, end - 1);
111 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
113 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
115 + up_write(&EXT4_I(inode)->i_mmap_sem);
119 @@ -5520,17 +5514,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
123 - truncate_pagecache(inode, ioffset);
125 /* Wait for existing dio to complete */
126 ext4_inode_block_unlocked_dio(inode);
127 inode_dio_wait(inode);
130 + * Prevent page faults from reinstantiating pages we have released from
133 + down_write(&EXT4_I(inode)->i_mmap_sem);
134 + truncate_pagecache(inode, ioffset);
136 credits = ext4_writepage_trans_blocks(inode);
137 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
138 if (IS_ERR(handle)) {
139 ret = PTR_ERR(handle);
144 down_write(&EXT4_I(inode)->i_data_sem);
145 @@ -5569,7 +5568,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
148 ext4_journal_stop(handle);
151 + up_write(&EXT4_I(inode)->i_mmap_sem);
152 ext4_inode_resume_unlocked_dio(inode);
154 mutex_unlock(&inode->i_mutex);
155 @@ -5656,17 +5656,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
159 - truncate_pagecache(inode, ioffset);
161 /* Wait for existing dio to complete */
162 ext4_inode_block_unlocked_dio(inode);
163 inode_dio_wait(inode);
166 + * Prevent page faults from reinstantiating pages we have released from
169 + down_write(&EXT4_I(inode)->i_mmap_sem);
170 + truncate_pagecache(inode, ioffset);
172 credits = ext4_writepage_trans_blocks(inode);
173 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
174 if (IS_ERR(handle)) {
175 ret = PTR_ERR(handle);
180 /* Expand file to avoid data loss if there is error while shifting */
181 @@ -5737,7 +5742,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
184 ext4_journal_stop(handle);
187 + up_write(&EXT4_I(inode)->i_mmap_sem);
188 ext4_inode_resume_unlocked_dio(inode);
190 mutex_unlock(&inode->i_mutex);
191 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
192 index 113837e7ba98..0d24ebcd7c9e 100644
195 @@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
198 handle_t *handle = NULL;
199 - struct super_block *sb = file_inode(vma->vm_file)->i_sb;
200 + struct inode *inode = file_inode(vma->vm_file);
201 + struct super_block *sb = inode->i_sb;
202 bool write = vmf->flags & FAULT_FLAG_WRITE;
205 sb_start_pagefault(sb);
206 file_update_time(vma->vm_file);
207 + down_read(&EXT4_I(inode)->i_mmap_sem);
208 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
209 EXT4_DATA_TRANS_BLOCKS(sb));
212 + down_read(&EXT4_I(inode)->i_mmap_sem);
215 result = VM_FAULT_SIGBUS;
216 @@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
219 ext4_journal_stop(handle);
220 + up_read(&EXT4_I(inode)->i_mmap_sem);
221 sb_end_pagefault(sb);
224 + up_read(&EXT4_I(inode)->i_mmap_sem);
228 @@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
230 sb_start_pagefault(sb);
231 file_update_time(vma->vm_file);
232 + down_read(&EXT4_I(inode)->i_mmap_sem);
233 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
234 ext4_chunk_trans_blocks(inode,
235 PMD_SIZE / PAGE_SIZE));
238 + down_read(&EXT4_I(inode)->i_mmap_sem);
241 result = VM_FAULT_SIGBUS;
242 @@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
245 ext4_journal_stop(handle);
246 + up_read(&EXT4_I(inode)->i_mmap_sem);
247 sb_end_pagefault(sb);
250 + up_read(&EXT4_I(inode)->i_mmap_sem);
255 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
257 - return dax_mkwrite(vma, vmf, ext4_get_block_dax,
258 - ext4_end_io_unwritten);
260 + struct inode *inode = file_inode(vma->vm_file);
262 + sb_start_pagefault(inode->i_sb);
263 + file_update_time(vma->vm_file);
264 + down_read(&EXT4_I(inode)->i_mmap_sem);
265 + err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
266 + ext4_end_io_unwritten);
267 + up_read(&EXT4_I(inode)->i_mmap_sem);
268 + sb_end_pagefault(inode->i_sb);
274 + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
275 + * handler we check for races against truncate. Note that since we cycle through
276 + * i_mmap_sem, we are sure that also any hole punching that began before we
277 + * were called is finished by now and so if it included part of the file we
278 + * are working on, our pte will get unmapped and the check for pte_same() in
279 + * wp_pfn_shared() fails. Thus fault gets retried and things work out as
282 +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
283 + struct vm_fault *vmf)
285 + struct inode *inode = file_inode(vma->vm_file);
286 + struct super_block *sb = inode->i_sb;
287 + int ret = VM_FAULT_NOPAGE;
290 + sb_start_pagefault(sb);
291 + file_update_time(vma->vm_file);
292 + down_read(&EXT4_I(inode)->i_mmap_sem);
293 + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
294 + if (vmf->pgoff >= size)
295 + ret = VM_FAULT_SIGBUS;
296 + up_read(&EXT4_I(inode)->i_mmap_sem);
297 + sb_end_pagefault(sb);
302 static const struct vm_operations_struct ext4_dax_vm_ops = {
303 .fault = ext4_dax_fault,
304 .pmd_fault = ext4_dax_pmd_fault,
305 .page_mkwrite = ext4_dax_mkwrite,
306 - .pfn_mkwrite = dax_pfn_mkwrite,
307 + .pfn_mkwrite = ext4_dax_pfn_mkwrite,
310 #define ext4_dax_vm_ops ext4_file_vm_ops
313 static const struct vm_operations_struct ext4_file_vm_ops = {
314 - .fault = filemap_fault,
315 + .fault = ext4_filemap_fault,
316 .map_pages = filemap_map_pages,
317 .page_mkwrite = ext4_page_mkwrite,
319 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
320 index 612fbcf76b5c..36ad45906d26 100644
321 --- a/fs/ext4/inode.c
322 +++ b/fs/ext4/inode.c
323 @@ -3581,6 +3581,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
327 + /* Wait all existing dio workers, newcomers will block on i_mutex */
328 + ext4_inode_block_unlocked_dio(inode);
329 + inode_dio_wait(inode);
332 + * Prevent page faults from reinstantiating pages we have released from
335 + down_write(&EXT4_I(inode)->i_mmap_sem);
336 first_block_offset = round_up(offset, sb->s_blocksize);
337 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
339 @@ -3589,10 +3598,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
340 truncate_pagecache_range(inode, first_block_offset,
343 - /* Wait all existing dio workers, newcomers will block on i_mutex */
344 - ext4_inode_block_unlocked_dio(inode);
345 - inode_dio_wait(inode);
347 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
348 credits = ext4_writepage_trans_blocks(inode);
350 @@ -3638,16 +3643,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
352 ext4_handle_sync(handle);
354 - /* Now release the pages again to reduce race window */
355 - if (last_block_offset > first_block_offset)
356 - truncate_pagecache_range(inode, first_block_offset,
357 - last_block_offset);
359 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
360 ext4_mark_inode_dirty(handle, inode);
362 ext4_journal_stop(handle);
364 + up_write(&EXT4_I(inode)->i_mmap_sem);
365 ext4_inode_resume_unlocked_dio(inode);
367 mutex_unlock(&inode->i_mutex);
368 @@ -4784,6 +4785,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
370 ext4_wait_for_tail_page_commit(inode);
372 + down_write(&EXT4_I(inode)->i_mmap_sem);
374 * Truncate pagecache after we've waited for commit
375 * in data=journal mode to make pages freeable.
376 @@ -4791,6 +4793,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
377 truncate_pagecache(inode, inode->i_size);
379 ext4_truncate(inode);
380 + up_write(&EXT4_I(inode)->i_mmap_sem);
384 @@ -5239,6 +5242,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
386 sb_start_pagefault(inode->i_sb);
387 file_update_time(vma->vm_file);
389 + down_read(&EXT4_I(inode)->i_mmap_sem);
390 /* Delalloc case is easy... */
391 if (test_opt(inode->i_sb, DELALLOC) &&
392 !ext4_should_journal_data(inode) &&
393 @@ -5308,6 +5313,19 @@ retry_alloc:
395 ret = block_page_mkwrite_return(ret);
397 + up_read(&EXT4_I(inode)->i_mmap_sem);
398 sb_end_pagefault(inode->i_sb);
402 +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
404 + struct inode *inode = file_inode(vma->vm_file);
407 + down_read(&EXT4_I(inode)->i_mmap_sem);
408 + err = filemap_fault(vma, vmf);
409 + up_read(&EXT4_I(inode)->i_mmap_sem);
413 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
414 index a63c7b0a10cf..61fad9e33bb9 100644
415 --- a/fs/ext4/super.c
416 +++ b/fs/ext4/super.c
417 @@ -955,6 +955,7 @@ static void init_once(void *foo)
418 INIT_LIST_HEAD(&ei->i_orphan);
419 init_rwsem(&ei->xattr_sem);
420 init_rwsem(&ei->i_data_sem);
421 + init_rwsem(&ei->i_mmap_sem);
422 inode_init_once(&ei->vfs_inode);
425 diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
426 index 011ba6670d99..c70d06a383e2 100644
427 --- a/fs/ext4/truncate.h
428 +++ b/fs/ext4/truncate.h
431 static inline void ext4_truncate_failed_write(struct inode *inode)
433 + down_write(&EXT4_I(inode)->i_mmap_sem);
434 truncate_inode_pages(inode->i_mapping, inode->i_size);
435 ext4_truncate(inode);
436 + up_write(&EXT4_I(inode)->i_mmap_sem);