Update make-the-bitmap-read-routines-return-real-errors-code to not
[ext4-patch-queue.git] / fix-races-between-page-faults-and-hole-punching
blob e8114ab368d6dc9344e18ebb379b08cc56d867c6
1 ext4: fix races between page faults and hole punching
3 From: Jan Kara <jack@suse.com>
5 Currently, page faults and hole punching are completely
6 unsynchronized.  This can result in page fault faulting in a page into
7 a range that we are punching after truncate_pagecache_range() has been
8 called and thus we can end up with a page mapped to disk blocks that
9 will be shortly freed. Filesystem corruption will shortly follow. Note
10 that the same race is avoided for truncate by checking page fault
11 offset against i_size but there isn't similar mechanism available for
12 punching holes.
14 Fix the problem by creating new rw semaphore i_mmap_sem in inode and
15 grab it for writing over truncate, hole punching, and other functions
16 removing blocks from extent tree and for read over page faults. We
17 cannot easily use i_data_sem for this since that ranks below transaction
18 start and we need something ranking above it so that it can be held over
19 the whole truncate / hole punching operation. Also remove various
20 workarounds we had in the code to reduce race window when page fault
21 could have created pages with stale mapping information.
23 Signed-off-by: Jan Kara <jack@suse.com>
24 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
25 ---
26  fs/ext4/ext4.h     | 10 +++++++++
27  fs/ext4/extents.c  | 54 ++++++++++++++++++++++++--------------------
28  fs/ext4/file.c     | 66 ++++++++++++++++++++++++++++++++++++++++++++++--------
29  fs/ext4/inode.c    | 36 +++++++++++++++++++++--------
30  fs/ext4/super.c    |  1 +
31  fs/ext4/truncate.h |  2 ++
32  6 files changed, 127 insertions(+), 42 deletions(-)
34 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
35 index fd1f28be5296..c19ff61ccbdf 100644
36 --- a/fs/ext4/ext4.h
37 +++ b/fs/ext4/ext4.h
38 @@ -869,6 +869,15 @@ struct ext4_inode_info {
39          * by other means, so we have i_data_sem.
40          */
41         struct rw_semaphore i_data_sem;
42 +       /*
43 +        * i_mmap_sem is for serializing page faults with truncate / punch hole
44 +        * operations. We have to make sure that new page cannot be faulted in
45 +        * a section of the inode that is being punched. We cannot easily use
46 +        * i_data_sem for this since we need protection for the whole punch
47 +        * operation and i_data_sem ranks below transaction start so we have
48 +        * to occasionally drop it.
49 +        */
50 +       struct rw_semaphore i_mmap_sem;
51         struct inode vfs_inode;
52         struct jbd2_inode *jinode;
54 @@ -2316,6 +2325,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
55  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
56                              loff_t lstart, loff_t lend);
57  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
58 +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
59  extern qsize_t *ext4_get_reserved_space(struct inode *inode);
60  extern void ext4_da_update_reserve_space(struct inode *inode,
61                                         int used, int quota_claim);
62 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
63 index 2553aa8b608d..6e0c5d37a232 100644
64 --- a/fs/ext4/extents.c
65 +++ b/fs/ext4/extents.c
66 @@ -4766,7 +4766,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
67         int partial_begin, partial_end;
68         loff_t start, end;
69         ext4_lblk_t lblk;
70 -       struct address_space *mapping = inode->i_mapping;
71         unsigned int blkbits = inode->i_blkbits;
73         trace_ext4_zero_range(inode, offset, len, mode);
74 @@ -4782,17 +4781,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
75         }
77         /*
78 -        * Write out all dirty pages to avoid race conditions
79 -        * Then release them.
80 -        */
81 -       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
82 -               ret = filemap_write_and_wait_range(mapping, offset,
83 -                                                  offset + len - 1);
84 -               if (ret)
85 -                       return ret;
86 -       }
88 -       /*
89          * Round up offset. This is not fallocate, we neet to zero out
90          * blocks, so convert interior block aligned part of the range to
91          * unwritten and possibly manually zero out unaligned parts of the
92 @@ -4852,16 +4840,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
93                 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
94                           EXT4_EX_NOCACHE);
96 -               /* Now release the pages and zero block aligned part of pages*/
97 -               truncate_pagecache_range(inode, start, end - 1);
98 -               inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
100                 /* Wait all existing dio workers, newcomers will block on i_mutex */
101                 ext4_inode_block_unlocked_dio(inode);
102                 inode_dio_wait(inode);
104 +               /*
105 +                * Prevent page faults from reinstantiating pages we have
106 +                * released from page cache.
107 +                */
108 +               down_write(&EXT4_I(inode)->i_mmap_sem);
109 +               /* Now release the pages and zero block aligned part of pages */
110 +               truncate_pagecache_range(inode, start, end - 1);
111 +               inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
113                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
114                                              flags, mode);
115 +               up_write(&EXT4_I(inode)->i_mmap_sem);
116                 if (ret)
117                         goto out_dio;
118         }
119 @@ -5520,17 +5514,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
120                 goto out_mutex;
121         }
123 -       truncate_pagecache(inode, ioffset);
125         /* Wait for existing dio to complete */
126         ext4_inode_block_unlocked_dio(inode);
127         inode_dio_wait(inode);
129 +       /*
130 +        * Prevent page faults from reinstantiating pages we have released from
131 +        * page cache.
132 +        */
133 +       down_write(&EXT4_I(inode)->i_mmap_sem);
134 +       truncate_pagecache(inode, ioffset);
136         credits = ext4_writepage_trans_blocks(inode);
137         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
138         if (IS_ERR(handle)) {
139                 ret = PTR_ERR(handle);
140 -               goto out_dio;
141 +               goto out_mmap;
142         }
144         down_write(&EXT4_I(inode)->i_data_sem);
145 @@ -5569,7 +5568,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
147  out_stop:
148         ext4_journal_stop(handle);
149 -out_dio:
150 +out_mmap:
151 +       up_write(&EXT4_I(inode)->i_mmap_sem);
152         ext4_inode_resume_unlocked_dio(inode);
153  out_mutex:
154         mutex_unlock(&inode->i_mutex);
155 @@ -5656,17 +5656,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
156                 goto out_mutex;
157         }
159 -       truncate_pagecache(inode, ioffset);
161         /* Wait for existing dio to complete */
162         ext4_inode_block_unlocked_dio(inode);
163         inode_dio_wait(inode);
165 +       /*
166 +        * Prevent page faults from reinstantiating pages we have released from
167 +        * page cache.
168 +        */
169 +       down_write(&EXT4_I(inode)->i_mmap_sem);
170 +       truncate_pagecache(inode, ioffset);
172         credits = ext4_writepage_trans_blocks(inode);
173         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
174         if (IS_ERR(handle)) {
175                 ret = PTR_ERR(handle);
176 -               goto out_dio;
177 +               goto out_mmap;
178         }
180         /* Expand file to avoid data loss if there is error while shifting */
181 @@ -5737,7 +5742,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
183  out_stop:
184         ext4_journal_stop(handle);
185 -out_dio:
186 +out_mmap:
187 +       up_write(&EXT4_I(inode)->i_mmap_sem);
188         ext4_inode_resume_unlocked_dio(inode);
189  out_mutex:
190         mutex_unlock(&inode->i_mutex);
191 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
192 index 113837e7ba98..0d24ebcd7c9e 100644
193 --- a/fs/ext4/file.c
194 +++ b/fs/ext4/file.c
195 @@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
197         int result;
198         handle_t *handle = NULL;
199 -       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
200 +       struct inode *inode = file_inode(vma->vm_file);
201 +       struct super_block *sb = inode->i_sb;
202         bool write = vmf->flags & FAULT_FLAG_WRITE;
204         if (write) {
205                 sb_start_pagefault(sb);
206                 file_update_time(vma->vm_file);
207 +               down_read(&EXT4_I(inode)->i_mmap_sem);
208                 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
209                                                 EXT4_DATA_TRANS_BLOCKS(sb));
210 -       }
211 +       } else
212 +               down_read(&EXT4_I(inode)->i_mmap_sem);
214         if (IS_ERR(handle))
215                 result = VM_FAULT_SIGBUS;
216 @@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
217         if (write) {
218                 if (!IS_ERR(handle))
219                         ext4_journal_stop(handle);
220 +               up_read(&EXT4_I(inode)->i_mmap_sem);
221                 sb_end_pagefault(sb);
222 -       }
223 +       } else
224 +               up_read(&EXT4_I(inode)->i_mmap_sem);
226         return result;
228 @@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
229         if (write) {
230                 sb_start_pagefault(sb);
231                 file_update_time(vma->vm_file);
232 +               down_read(&EXT4_I(inode)->i_mmap_sem);
233                 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
234                                 ext4_chunk_trans_blocks(inode,
235                                                         PMD_SIZE / PAGE_SIZE));
236 -       }
237 +       } else
238 +               down_read(&EXT4_I(inode)->i_mmap_sem);
240         if (IS_ERR(handle))
241                 result = VM_FAULT_SIGBUS;
242 @@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
243         if (write) {
244                 if (!IS_ERR(handle))
245                         ext4_journal_stop(handle);
246 +               up_read(&EXT4_I(inode)->i_mmap_sem);
247                 sb_end_pagefault(sb);
248 -       }
249 +       } else
250 +               up_read(&EXT4_I(inode)->i_mmap_sem);
252         return result;
255  static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
257 -       return dax_mkwrite(vma, vmf, ext4_get_block_dax,
258 -                               ext4_end_io_unwritten);
259 +       int err;
260 +       struct inode *inode = file_inode(vma->vm_file);
262 +       sb_start_pagefault(inode->i_sb);
263 +       file_update_time(vma->vm_file);
264 +       down_read(&EXT4_I(inode)->i_mmap_sem);
265 +       err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
266 +                           ext4_end_io_unwritten);
267 +       up_read(&EXT4_I(inode)->i_mmap_sem);
268 +       sb_end_pagefault(inode->i_sb);
270 +       return err;
274 + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
275 + * handler we check for races against truncate. Note that since we cycle through
276 + * i_mmap_sem, we are sure that also any hole punching that began before we
277 + * were called is finished by now and so if it included part of the file we
278 + * are working on, our pte will get unmapped and the check for pte_same() in
279 + * wp_pfn_shared() fails. Thus fault gets retried and things work out as
280 + * desired.
281 + */
282 +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
283 +                               struct vm_fault *vmf)
285 +       struct inode *inode = file_inode(vma->vm_file);
286 +       struct super_block *sb = inode->i_sb;
287 +       int ret = VM_FAULT_NOPAGE;
288 +       loff_t size;
290 +       sb_start_pagefault(sb);
291 +       file_update_time(vma->vm_file);
292 +       down_read(&EXT4_I(inode)->i_mmap_sem);
293 +       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
294 +       if (vmf->pgoff >= size)
295 +               ret = VM_FAULT_SIGBUS;
296 +       up_read(&EXT4_I(inode)->i_mmap_sem);
297 +       sb_end_pagefault(sb);
299 +       return ret;
302  static const struct vm_operations_struct ext4_dax_vm_ops = {
303         .fault          = ext4_dax_fault,
304         .pmd_fault      = ext4_dax_pmd_fault,
305         .page_mkwrite   = ext4_dax_mkwrite,
306 -       .pfn_mkwrite    = dax_pfn_mkwrite,
307 +       .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
308  };
309  #else
310  #define ext4_dax_vm_ops        ext4_file_vm_ops
311  #endif
313  static const struct vm_operations_struct ext4_file_vm_ops = {
314 -       .fault          = filemap_fault,
315 +       .fault          = ext4_filemap_fault,
316         .map_pages      = filemap_map_pages,
317         .page_mkwrite   = ext4_page_mkwrite,
318  };
319 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
320 index 612fbcf76b5c..36ad45906d26 100644
321 --- a/fs/ext4/inode.c
322 +++ b/fs/ext4/inode.c
323 @@ -3581,6 +3581,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
325         }
327 +       /* Wait all existing dio workers, newcomers will block on i_mutex */
328 +       ext4_inode_block_unlocked_dio(inode);
329 +       inode_dio_wait(inode);
331 +       /*
332 +        * Prevent page faults from reinstantiating pages we have released from
333 +        * page cache.
334 +        */
335 +       down_write(&EXT4_I(inode)->i_mmap_sem);
336         first_block_offset = round_up(offset, sb->s_blocksize);
337         last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
339 @@ -3589,10 +3598,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
340                 truncate_pagecache_range(inode, first_block_offset,
341                                          last_block_offset);
343 -       /* Wait all existing dio workers, newcomers will block on i_mutex */
344 -       ext4_inode_block_unlocked_dio(inode);
345 -       inode_dio_wait(inode);
347         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
348                 credits = ext4_writepage_trans_blocks(inode);
349         else
350 @@ -3638,16 +3643,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
351         if (IS_SYNC(inode))
352                 ext4_handle_sync(handle);
354 -       /* Now release the pages again to reduce race window */
355 -       if (last_block_offset > first_block_offset)
356 -               truncate_pagecache_range(inode, first_block_offset,
357 -                                        last_block_offset);
359         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
360         ext4_mark_inode_dirty(handle, inode);
361  out_stop:
362         ext4_journal_stop(handle);
363  out_dio:
364 +       up_write(&EXT4_I(inode)->i_mmap_sem);
365         ext4_inode_resume_unlocked_dio(inode);
366  out_mutex:
367         mutex_unlock(&inode->i_mutex);
368 @@ -4784,6 +4785,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
369                         } else
370                                 ext4_wait_for_tail_page_commit(inode);
371                 }
372 +               down_write(&EXT4_I(inode)->i_mmap_sem);
373                 /*
374                  * Truncate pagecache after we've waited for commit
375                  * in data=journal mode to make pages freeable.
376 @@ -4791,6 +4793,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
377                 truncate_pagecache(inode, inode->i_size);
378                 if (shrink)
379                         ext4_truncate(inode);
380 +               up_write(&EXT4_I(inode)->i_mmap_sem);
381         }
383         if (!rc) {
384 @@ -5239,6 +5242,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
386         sb_start_pagefault(inode->i_sb);
387         file_update_time(vma->vm_file);
389 +       down_read(&EXT4_I(inode)->i_mmap_sem);
390         /* Delalloc case is easy... */
391         if (test_opt(inode->i_sb, DELALLOC) &&
392             !ext4_should_journal_data(inode) &&
393 @@ -5308,6 +5313,19 @@ retry_alloc:
394  out_ret:
395         ret = block_page_mkwrite_return(ret);
396  out:
397 +       up_read(&EXT4_I(inode)->i_mmap_sem);
398         sb_end_pagefault(inode->i_sb);
399         return ret;
402 +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
404 +       struct inode *inode = file_inode(vma->vm_file);
405 +       int err;
407 +       down_read(&EXT4_I(inode)->i_mmap_sem);
408 +       err = filemap_fault(vma, vmf);
409 +       up_read(&EXT4_I(inode)->i_mmap_sem);
411 +       return err;
413 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
414 index a63c7b0a10cf..61fad9e33bb9 100644
415 --- a/fs/ext4/super.c
416 +++ b/fs/ext4/super.c
417 @@ -955,6 +955,7 @@ static void init_once(void *foo)
418         INIT_LIST_HEAD(&ei->i_orphan);
419         init_rwsem(&ei->xattr_sem);
420         init_rwsem(&ei->i_data_sem);
421 +       init_rwsem(&ei->i_mmap_sem);
422         inode_init_once(&ei->vfs_inode);
425 diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
426 index 011ba6670d99..c70d06a383e2 100644
427 --- a/fs/ext4/truncate.h
428 +++ b/fs/ext4/truncate.h
429 @@ -10,8 +10,10 @@
430   */
431  static inline void ext4_truncate_failed_write(struct inode *inode)
433 +       down_write(&EXT4_I(inode)->i_mmap_sem);
434         truncate_inode_pages(inode->i_mapping, inode->i_size);
435         ext4_truncate(inode);
436 +       up_write(&EXT4_I(inode)->i_mmap_sem);
439  /*
440 -- 
441 2.1.4