add patch add-select-for-CONFIG_FS_IOMAP
[ext4-patch-queue.git] / dax-rip-out-get_block-based-IO-support
blob f1e3c0d952c869f5bdd9755970259656c0beb896
1 dax: rip out get_block based IO support
3 From: Jan Kara <jack@suse.cz>
5 No one uses functions using the get_block callback anymore. Rip them
6 out and update documentation.
8 Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
9 Signed-off-by: Jan Kara <jack@suse.cz>
10 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
11 ---
12  Documentation/filesystems/dax.txt |  22 +--
13  fs/dax.c                          | 315 --------------------------------------
14  include/linux/dax.h               |  12 --
15  3 files changed, 11 insertions(+), 338 deletions(-)
17 diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
18 index 23d18b8a49d5..a7e6e14aeb08 100644
19 --- a/Documentation/filesystems/dax.txt
20 +++ b/Documentation/filesystems/dax.txt
21 @@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
22  Filesystem support consists of
23  - adding support to mark inodes as being DAX by setting the S_DAX flag in
24    i_flags
25 -- implementing the direct_IO address space operation, and calling
26 -  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
27 +- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
28 +  when inode has S_DAX flag set
29  - implementing an mmap file operation for DAX files which sets the
30    VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
31 -  include handlers for fault, pmd_fault and page_mkwrite (which should
32 -  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
33 -  appropriate get_block() callback)
34 -- calling dax_truncate_page() instead of block_truncate_page() for DAX files
35 -- calling dax_zero_page_range() instead of zero_user() for DAX files
36 +  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
37 +  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
38 +  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
39 +  iomap operations.
40 +- calling iomap_zero_range() passing appropriate iomap operations instead of
41 +  block_truncate_page() for DAX files
42  - ensuring that there is sufficient locking between reads, writes,
43    truncates and page faults
45 -The get_block() callback passed to the DAX functions may return
46 -uninitialised extents.  If it does, it must ensure that simultaneous
47 -calls to get_block() (for example by a page-fault racing with a read()
48 -or a write()) work correctly.
49 +The iomap handlers for allocating blocks must make sure that allocated blocks
50 +are zeroed out and converted to written extents before being returned to avoid
51 +exposure of uninitialized data through mmap.
53  These filesystems may be used for inspiration:
54  - ext2: see Documentation/filesystems/ext2.txt
55 diff --git a/fs/dax.c b/fs/dax.c
56 index 28af41b9da3a..ad131cd2605d 100644
57 --- a/fs/dax.c
58 +++ b/fs/dax.c
59 @@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
60         return page;
61  }
63 -static bool buffer_written(struct buffer_head *bh)
65 -       return buffer_mapped(bh) && !buffer_unwritten(bh);
68 -static sector_t to_sector(const struct buffer_head *bh,
69 -               const struct inode *inode)
71 -       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
73 -       return sector;
76 -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
77 -                     loff_t start, loff_t end, get_block_t get_block,
78 -                     struct buffer_head *bh)
80 -       loff_t pos = start, max = start, bh_max = start;
81 -       bool hole = false;
82 -       struct block_device *bdev = NULL;
83 -       int rw = iov_iter_rw(iter), rc;
84 -       long map_len = 0;
85 -       struct blk_dax_ctl dax = {
86 -               .addr = ERR_PTR(-EIO),
87 -       };
88 -       unsigned blkbits = inode->i_blkbits;
89 -       sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
90 -                                                               >> blkbits;
92 -       if (rw == READ)
93 -               end = min(end, i_size_read(inode));
95 -       while (pos < end) {
96 -               size_t len;
97 -               if (pos == max) {
98 -                       long page = pos >> PAGE_SHIFT;
99 -                       sector_t block = page << (PAGE_SHIFT - blkbits);
100 -                       unsigned first = pos - (block << blkbits);
101 -                       long size;
103 -                       if (pos == bh_max) {
104 -                               bh->b_size = PAGE_ALIGN(end - pos);
105 -                               bh->b_state = 0;
106 -                               rc = get_block(inode, block, bh, rw == WRITE);
107 -                               if (rc)
108 -                                       break;
109 -                               bh_max = pos - first + bh->b_size;
110 -                               bdev = bh->b_bdev;
111 -                               /*
112 -                                * We allow uninitialized buffers for writes
113 -                                * beyond EOF as those cannot race with faults
114 -                                */
115 -                               WARN_ON_ONCE(
116 -                                       (buffer_new(bh) && block < file_blks) ||
117 -                                       (rw == WRITE && buffer_unwritten(bh)));
118 -                       } else {
119 -                               unsigned done = bh->b_size -
120 -                                               (bh_max - (pos - first));
121 -                               bh->b_blocknr += done >> blkbits;
122 -                               bh->b_size -= done;
123 -                       }
125 -                       hole = rw == READ && !buffer_written(bh);
126 -                       if (hole) {
127 -                               size = bh->b_size - first;
128 -                       } else {
129 -                               dax_unmap_atomic(bdev, &dax);
130 -                               dax.sector = to_sector(bh, inode);
131 -                               dax.size = bh->b_size;
132 -                               map_len = dax_map_atomic(bdev, &dax);
133 -                               if (map_len < 0) {
134 -                                       rc = map_len;
135 -                                       break;
136 -                               }
137 -                               dax.addr += first;
138 -                               size = map_len - first;
139 -                       }
140 -                       /*
141 -                        * pos + size is one past the last offset for IO,
142 -                        * so pos + size can overflow loff_t at extreme offsets.
143 -                        * Cast to u64 to catch this and get the true minimum.
144 -                        */
145 -                       max = min_t(u64, pos + size, end);
146 -               }
148 -               if (iov_iter_rw(iter) == WRITE) {
149 -                       len = copy_from_iter_pmem(dax.addr, max - pos, iter);
150 -               } else if (!hole)
151 -                       len = copy_to_iter((void __force *) dax.addr, max - pos,
152 -                                       iter);
153 -               else
154 -                       len = iov_iter_zero(max - pos, iter);
156 -               if (!len) {
157 -                       rc = -EFAULT;
158 -                       break;
159 -               }
161 -               pos += len;
162 -               if (!IS_ERR(dax.addr))
163 -                       dax.addr += len;
164 -       }
166 -       dax_unmap_atomic(bdev, &dax);
168 -       return (pos == start) ? rc : pos - start;
171 -/**
172 - * dax_do_io - Perform I/O to a DAX file
173 - * @iocb: The control block for this I/O
174 - * @inode: The file which the I/O is directed at
175 - * @iter: The addresses to do I/O from or to
176 - * @get_block: The filesystem method used to translate file offsets to blocks
177 - * @end_io: A filesystem callback for I/O completion
178 - * @flags: See below
179 - *
180 - * This function uses the same locking scheme as do_blockdev_direct_IO:
181 - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
182 - * caller for writes.  For reads, we take and release the i_mutex ourselves.
183 - * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
184 - * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
185 - * is in progress.
186 - */
187 -ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
188 -                 struct iov_iter *iter, get_block_t get_block,
189 -                 dio_iodone_t end_io, int flags)
191 -       struct buffer_head bh;
192 -       ssize_t retval = -EINVAL;
193 -       loff_t pos = iocb->ki_pos;
194 -       loff_t end = pos + iov_iter_count(iter);
196 -       memset(&bh, 0, sizeof(bh));
197 -       bh.b_bdev = inode->i_sb->s_bdev;
199 -       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
200 -               inode_lock(inode);
202 -       /* Protects against truncate */
203 -       if (!(flags & DIO_SKIP_DIO_COUNT))
204 -               inode_dio_begin(inode);
206 -       retval = dax_io(inode, iter, pos, end, get_block, &bh);
208 -       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
209 -               inode_unlock(inode);
211 -       if (end_io) {
212 -               int err;
214 -               err = end_io(iocb, pos, retval, bh.b_private);
215 -               if (err)
216 -                       retval = err;
217 -       }
219 -       if (!(flags & DIO_SKIP_DIO_COUNT))
220 -               inode_dio_end(inode);
221 -       return retval;
223 -EXPORT_SYMBOL_GPL(dax_do_io);
225  /*
226   * DAX radix tree locking
227   */
228 @@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping,
231  /**
232 - * dax_fault - handle a page fault on a DAX file
233 - * @vma: The virtual memory area where the fault occurred
234 - * @vmf: The description of the fault
235 - * @get_block: The filesystem method used to translate file offsets to blocks
236 - *
237 - * When a page fault occurs, filesystems may call this helper in their
238 - * fault handler for DAX files. dax_fault() assumes the caller has done all
239 - * the necessary locking for the page fault to proceed successfully.
240 - */
241 -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
242 -                       get_block_t get_block)
244 -       struct file *file = vma->vm_file;
245 -       struct address_space *mapping = file->f_mapping;
246 -       struct inode *inode = mapping->host;
247 -       void *entry;
248 -       struct buffer_head bh;
249 -       unsigned long vaddr = (unsigned long)vmf->virtual_address;
250 -       unsigned blkbits = inode->i_blkbits;
251 -       sector_t block;
252 -       pgoff_t size;
253 -       int error;
254 -       int major = 0;
256 -       /*
257 -        * Check whether offset isn't beyond end of file now. Caller is supposed
258 -        * to hold locks serializing us with truncate / punch hole so this is
259 -        * a reliable test.
260 -        */
261 -       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
262 -       if (vmf->pgoff >= size)
263 -               return VM_FAULT_SIGBUS;
265 -       memset(&bh, 0, sizeof(bh));
266 -       block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
267 -       bh.b_bdev = inode->i_sb->s_bdev;
268 -       bh.b_size = PAGE_SIZE;
270 -       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
271 -       if (IS_ERR(entry)) {
272 -               error = PTR_ERR(entry);
273 -               goto out;
274 -       }
276 -       error = get_block(inode, block, &bh, 0);
277 -       if (!error && (bh.b_size < PAGE_SIZE))
278 -               error = -EIO;           /* fs corruption? */
279 -       if (error)
280 -               goto unlock_entry;
282 -       if (vmf->cow_page) {
283 -               struct page *new_page = vmf->cow_page;
284 -               if (buffer_written(&bh))
285 -                       error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
286 -                                       bh.b_size, new_page, vaddr);
287 -               else
288 -                       clear_user_highpage(new_page, vaddr);
289 -               if (error)
290 -                       goto unlock_entry;
291 -               if (!radix_tree_exceptional_entry(entry)) {
292 -                       vmf->page = entry;
293 -                       return VM_FAULT_LOCKED;
294 -               }
295 -               vmf->entry = entry;
296 -               return VM_FAULT_DAX_LOCKED;
297 -       }
299 -       if (!buffer_mapped(&bh)) {
300 -               if (vmf->flags & FAULT_FLAG_WRITE) {
301 -                       error = get_block(inode, block, &bh, 1);
302 -                       count_vm_event(PGMAJFAULT);
303 -                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
304 -                       major = VM_FAULT_MAJOR;
305 -                       if (!error && (bh.b_size < PAGE_SIZE))
306 -                               error = -EIO;
307 -                       if (error)
308 -                               goto unlock_entry;
309 -               } else {
310 -                       return dax_load_hole(mapping, entry, vmf);
311 -               }
312 -       }
314 -       /* Filesystem should not return unwritten buffers to us! */
315 -       WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
316 -       error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
317 -                       bh.b_size, &entry, vma, vmf);
318 - unlock_entry:
319 -       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
320 - out:
321 -       if (error == -ENOMEM)
322 -               return VM_FAULT_OOM | major;
323 -       /* -EBUSY is fine, somebody else faulted on the same PTE */
324 -       if ((error < 0) && (error != -EBUSY))
325 -               return VM_FAULT_SIGBUS | major;
326 -       return VM_FAULT_NOPAGE | major;
328 -EXPORT_SYMBOL_GPL(dax_fault);
330 -/**
331   * dax_pfn_mkwrite - handle first write to DAX page
332   * @vma: The virtual memory area where the fault occurred
333   * @vmf: The description of the fault
334 @@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
336  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
338 -/**
339 - * dax_zero_page_range - zero a range within a page of a DAX file
340 - * @inode: The file being truncated
341 - * @from: The file offset that is being truncated to
342 - * @length: The number of bytes to zero
343 - * @get_block: The filesystem method used to translate file offsets to blocks
344 - *
345 - * This function can be called by a filesystem when it is zeroing part of a
346 - * page in a DAX file.  This is intended for hole-punch operations.  If
347 - * you are truncating a file, the helper function dax_truncate_page() may be
348 - * more convenient.
349 - */
350 -int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
351 -                                                       get_block_t get_block)
353 -       struct buffer_head bh;
354 -       pgoff_t index = from >> PAGE_SHIFT;
355 -       unsigned offset = from & (PAGE_SIZE-1);
356 -       int err;
358 -       /* Block boundary? Nothing to do */
359 -       if (!length)
360 -               return 0;
361 -       if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
362 -               return -EINVAL;
364 -       memset(&bh, 0, sizeof(bh));
365 -       bh.b_bdev = inode->i_sb->s_bdev;
366 -       bh.b_size = PAGE_SIZE;
367 -       err = get_block(inode, index, &bh, 0);
368 -       if (err < 0 || !buffer_written(&bh))
369 -               return err;
371 -       return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
372 -                       offset, length);
374 -EXPORT_SYMBOL_GPL(dax_zero_page_range);
376 -/**
377 - * dax_truncate_page - handle a partial page being truncated in a DAX file
378 - * @inode: The file being truncated
379 - * @from: The file offset that is being truncated to
380 - * @get_block: The filesystem method used to translate file offsets to blocks
381 - *
382 - * Similar to block_truncate_page(), this function can be called by a
383 - * filesystem when it is truncating a DAX file to handle the partial page.
384 - */
385 -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
387 -       unsigned length = PAGE_ALIGN(from) - from;
388 -       return dax_zero_page_range(inode, from, length, get_block);
390 -EXPORT_SYMBOL_GPL(dax_truncate_page);
392  #ifdef CONFIG_FS_IOMAP
393  static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
395 diff --git a/include/linux/dax.h b/include/linux/dax.h
396 index 8d1a5c47945f..0afade8bd3d7 100644
397 --- a/include/linux/dax.h
398 +++ b/include/linux/dax.h
399 @@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
401  ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
402                 struct iomap_ops *ops);
403 -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
404 -                 get_block_t, dio_iodone_t, int flags);
405 -int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
406 -int dax_truncate_page(struct inode *, loff_t from, get_block_t);
407  int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
408                         struct iomap_ops *ops);
409 -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
410  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
411  void dax_wake_mapping_entry_waiter(struct address_space *mapping,
412                 pgoff_t index, void *entry, bool wake_all);
413 @@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
415  #endif
417 -static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
418 -                               pmd_t *pmd, unsigned int flags, get_block_t gb)
420 -       return VM_FAULT_FALLBACK;
423  #ifdef CONFIG_FS_DAX_PMD
424  static inline unsigned int dax_radix_order(void *entry)
426 @@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
428  #endif
429  int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
430 -#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
432  static inline bool vma_is_dax(struct vm_area_struct *vma)
434 -- 
435 2.6.6