1 dax: rip out get_block based IO support
3 From: Jan Kara <jack@suse.cz>
No one uses the DAX functions that take a get_block callback anymore. Rip
them out and update the documentation.
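
As an example of what the conversion looks like on the filesystem side, a
DAX-aware ->read_iter now reduces to a call to dax_iomap_rw(). A sketch
modeled on the in-tree ext2 conversion (ext2_iomap_ops is the iomap ops
table ext2 already provides; other filesystems pass their own):

	static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;
		ssize_t ret;

		if (!iov_iter_count(to))
			return 0; /* skip atime */

		inode_lock_shared(inode);
		/* dax_iomap_rw() replaces the old dax_do_io()/get_block path */
		ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
		inode_unlock_shared(inode);

		file_accessed(iocb->ki_filp);
		return ret;
	}
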
8 Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
9 Signed-off-by: Jan Kara <jack@suse.cz>
10 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
12 Documentation/filesystems/dax.txt | 22 +--
13 fs/dax.c | 315 --------------------------------------
14 include/linux/dax.h | 12 --
15 3 files changed, 11 insertions(+), 338 deletions(-)
17 diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
18 index 23d18b8a49d5..a7e6e14aeb08 100644
19 --- a/Documentation/filesystems/dax.txt
20 +++ b/Documentation/filesystems/dax.txt
21 @@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
22 Filesystem support consists of
- adding support to mark inodes as being DAX by setting the S_DAX flag in
  i_flags
25 -- implementing the direct_IO address space operation, and calling
26 - dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations that use dax_iomap_rw()
+  when the inode has the S_DAX flag set
29 - implementing an mmap file operation for DAX files which sets the
30 VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
31 - include handlers for fault, pmd_fault and page_mkwrite (which should
32 - probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
33 - appropriate get_block() callback)
34 -- calling dax_truncate_page() instead of block_truncate_page() for DAX files
35 -- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite and pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for the fault and
+  page_mkwrite handlers), dax_iomap_pmd_fault() and dax_pfn_mkwrite(),
+  passing the appropriate iomap operations where needed
40 +- calling iomap_zero_range() passing appropriate iomap operations instead of
41 + block_truncate_page() for DAX files
42 - ensuring that there is sufficient locking between reads, writes,
43 truncates and page faults
45 -The get_block() callback passed to the DAX functions may return
46 -uninitialised extents. If it does, it must ensure that simultaneous
47 -calls to get_block() (for example by a page-fault racing with a read()
48 -or a write()) work correctly.
+The iomap handlers used for block allocation must ensure that newly allocated
+blocks are zeroed out and converted to written extents before being returned,
+to avoid exposing uninitialized data through mmap.
53 These filesystems may be used for inspiration:
54 - ext2: see Documentation/filesystems/ext2.txt
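
For the fault-handler side described in the documentation above, the converted
handlers become thin wrappers around dax_iomap_fault(). A sketch modeled on
ext2's conversion (ext2_iomap_ops and the dax_sem fault-vs-truncate lock are
ext2 specifics; other filesystems substitute their own iomap ops and locking):

	static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);
		struct ext2_inode_info *ei = EXT2_I(inode);
		int ret;

		if (vmf->flags & FAULT_FLAG_WRITE) {
			sb_start_pagefault(inode->i_sb);
			file_update_time(vma->vm_file);
		}
		/* serialize page faults against truncate / hole punch */
		down_read(&ei->dax_sem);

		ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);

		up_read(&ei->dax_sem);
		if (vmf->flags & FAULT_FLAG_WRITE)
			sb_end_pagefault(inode->i_sb);
		return ret;
	}
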
55 diff --git a/fs/dax.c b/fs/dax.c
index 28af41b9da3a..ad131cd2605d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
59 @@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-static bool buffer_written(struct buffer_head *bh)
-{
-	return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-static sector_t to_sector(const struct buffer_head *bh,
-		const struct inode *inode)
-{
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-	return sector;
-}
-
76 -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
77 - loff_t start, loff_t end, get_block_t get_block,
78 - struct buffer_head *bh)
80 - loff_t pos = start, max = start, bh_max = start;
82 - struct block_device *bdev = NULL;
83 - int rw = iov_iter_rw(iter), rc;
85 - struct blk_dax_ctl dax = {
86 - .addr = ERR_PTR(-EIO),
88 - unsigned blkbits = inode->i_blkbits;
89 - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
93 - end = min(end, i_size_read(inode));
98 - long page = pos >> PAGE_SHIFT;
99 - sector_t block = page << (PAGE_SHIFT - blkbits);
100 - unsigned first = pos - (block << blkbits);
103 - if (pos == bh_max) {
104 - bh->b_size = PAGE_ALIGN(end - pos);
106 - rc = get_block(inode, block, bh, rw == WRITE);
109 - bh_max = pos - first + bh->b_size;
112 - * We allow uninitialized buffers for writes
113 - * beyond EOF as those cannot race with faults
116 - (buffer_new(bh) && block < file_blks) ||
117 - (rw == WRITE && buffer_unwritten(bh)));
119 - unsigned done = bh->b_size -
120 - (bh_max - (pos - first));
121 - bh->b_blocknr += done >> blkbits;
122 - bh->b_size -= done;
125 - hole = rw == READ && !buffer_written(bh);
127 - size = bh->b_size - first;
129 - dax_unmap_atomic(bdev, &dax);
130 - dax.sector = to_sector(bh, inode);
131 - dax.size = bh->b_size;
132 - map_len = dax_map_atomic(bdev, &dax);
138 - size = map_len - first;
141 - * pos + size is one past the last offset for IO,
142 - * so pos + size can overflow loff_t at extreme offsets.
143 - * Cast to u64 to catch this and get the true minimum.
145 - max = min_t(u64, pos + size, end);
148 - if (iov_iter_rw(iter) == WRITE) {
149 - len = copy_from_iter_pmem(dax.addr, max - pos, iter);
151 - len = copy_to_iter((void __force *) dax.addr, max - pos,
154 - len = iov_iter_zero(max - pos, iter);
162 - if (!IS_ERR(dax.addr))
166 - dax_unmap_atomic(bdev, &dax);
168 - return (pos == start) ? rc : pos - start;
172 - * dax_do_io - Perform I/O to a DAX file
173 - * @iocb: The control block for this I/O
174 - * @inode: The file which the I/O is directed at
175 - * @iter: The addresses to do I/O from or to
176 - * @get_block: The filesystem method used to translate file offsets to blocks
177 - * @end_io: A filesystem callback for I/O completion
178 - * @flags: See below
180 - * This function uses the same locking scheme as do_blockdev_direct_IO:
181 - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
182 - * caller for writes. For reads, we take and release the i_mutex ourselves.
183 - * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		struct iov_iter *iter, get_block_t get_block,
-		dio_iodone_t end_io, int flags)
-{
191 - struct buffer_head bh;
192 - ssize_t retval = -EINVAL;
193 - loff_t pos = iocb->ki_pos;
194 - loff_t end = pos + iov_iter_count(iter);
196 - memset(&bh, 0, sizeof(bh));
197 - bh.b_bdev = inode->i_sb->s_bdev;
199 - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
202 - /* Protects against truncate */
203 - if (!(flags & DIO_SKIP_DIO_COUNT))
204 - inode_dio_begin(inode);
206 - retval = dax_io(inode, iter, pos, end, get_block, &bh);
208 - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
209 - inode_unlock(inode);
214 - err = end_io(iocb, pos, retval, bh.b_private);
219 - if (!(flags & DIO_SKIP_DIO_COUNT))
220 - inode_dio_end(inode);
223 -EXPORT_SYMBOL_GPL(dax_do_io);
226 * DAX radix tree locking
228 @@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping,
232 - * dax_fault - handle a page fault on a DAX file
233 - * @vma: The virtual memory area where the fault occurred
234 - * @vmf: The description of the fault
235 - * @get_block: The filesystem method used to translate file offsets to blocks
237 - * When a page fault occurs, filesystems may call this helper in their
238 - * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		get_block_t get_block)
-{
244 - struct file *file = vma->vm_file;
245 - struct address_space *mapping = file->f_mapping;
246 - struct inode *inode = mapping->host;
248 - struct buffer_head bh;
249 - unsigned long vaddr = (unsigned long)vmf->virtual_address;
250 - unsigned blkbits = inode->i_blkbits;
257 - * Check whether offset isn't beyond end of file now. Caller is supposed
258 - * to hold locks serializing us with truncate / punch hole so this is
261 - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
262 - if (vmf->pgoff >= size)
263 - return VM_FAULT_SIGBUS;
265 - memset(&bh, 0, sizeof(bh));
266 - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
267 - bh.b_bdev = inode->i_sb->s_bdev;
268 - bh.b_size = PAGE_SIZE;
270 - entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
271 - if (IS_ERR(entry)) {
272 - error = PTR_ERR(entry);
276 - error = get_block(inode, block, &bh, 0);
277 - if (!error && (bh.b_size < PAGE_SIZE))
278 - error = -EIO; /* fs corruption? */
282 - if (vmf->cow_page) {
283 - struct page *new_page = vmf->cow_page;
284 - if (buffer_written(&bh))
285 - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
286 - bh.b_size, new_page, vaddr);
288 - clear_user_highpage(new_page, vaddr);
291 - if (!radix_tree_exceptional_entry(entry)) {
293 - return VM_FAULT_LOCKED;
295 - vmf->entry = entry;
296 - return VM_FAULT_DAX_LOCKED;
299 - if (!buffer_mapped(&bh)) {
300 - if (vmf->flags & FAULT_FLAG_WRITE) {
301 - error = get_block(inode, block, &bh, 1);
302 - count_vm_event(PGMAJFAULT);
303 - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
304 - major = VM_FAULT_MAJOR;
305 - if (!error && (bh.b_size < PAGE_SIZE))
310 - return dax_load_hole(mapping, entry, vmf);
314 - /* Filesystem should not return unwritten buffers to us! */
315 - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
316 - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
317 - bh.b_size, &entry, vma, vmf);
319 - put_locked_mapping_entry(mapping, vmf->pgoff, entry);
321 - if (error == -ENOMEM)
322 - return VM_FAULT_OOM | major;
323 - /* -EBUSY is fine, somebody else faulted on the same PTE */
324 - if ((error < 0) && (error != -EBUSY))
325 - return VM_FAULT_SIGBUS | major;
326 - return VM_FAULT_NOPAGE | major;
328 -EXPORT_SYMBOL_GPL(dax_fault);
331 * dax_pfn_mkwrite - handle first write to DAX page
332 * @vma: The virtual memory area where the fault occurred
333 * @vmf: The description of the fault
334 @@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
336 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
339 - * dax_zero_page_range - zero a range within a page of a DAX file
340 - * @inode: The file being truncated
341 - * @from: The file offset that is being truncated to
342 - * @length: The number of bytes to zero
343 - * @get_block: The filesystem method used to translate file offsets to blocks
345 - * This function can be called by a filesystem when it is zeroing part of a
346 - * page in a DAX file. This is intended for hole-punch operations. If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-		get_block_t get_block)
-{
353 - struct buffer_head bh;
354 - pgoff_t index = from >> PAGE_SHIFT;
355 - unsigned offset = from & (PAGE_SIZE-1);
358 - /* Block boundary? Nothing to do */
361 - if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
364 - memset(&bh, 0, sizeof(bh));
365 - bh.b_bdev = inode->i_sb->s_bdev;
366 - bh.b_size = PAGE_SIZE;
367 - err = get_block(inode, index, &bh, 0);
368 - if (err < 0 || !buffer_written(&bh))
371 - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
374 -EXPORT_SYMBOL_GPL(dax_zero_page_range);
377 - * dax_truncate_page - handle a partial page being truncated in a DAX file
378 - * @inode: The file being truncated
379 - * @from: The file offset that is being truncated to
380 - * @get_block: The filesystem method used to translate file offsets to blocks
382 - * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
-	unsigned length = PAGE_ALIGN(from) - from;
-	return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
392 #ifdef CONFIG_FS_IOMAP
393 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
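
With dax_truncate_page() and dax_zero_page_range() gone, filesystems zero the
partial page at truncate time via iomap_zero_range() instead. Roughly what
ext2_setsize() does after its conversion (a sketch; newsize is the new file
size and ext2_iomap_ops is as above):

	if (IS_DAX(inode))
		error = iomap_zero_range(inode, newsize,
					 PAGE_ALIGN(newsize) - newsize,
					 NULL, &ext2_iomap_ops);
	else
		error = block_truncate_page(inode->i_mapping,
					    newsize, ext2_get_block);
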
395 diff --git a/include/linux/dax.h b/include/linux/dax.h
396 index 8d1a5c47945f..0afade8bd3d7 100644
397 --- a/include/linux/dax.h
398 +++ b/include/linux/dax.h
399 @@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
401 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
402 struct iomap_ops *ops);
403 -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
404 - get_block_t, dio_iodone_t, int flags);
405 -int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
406 -int dax_truncate_page(struct inode *, loff_t from, get_block_t);
407 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
408 struct iomap_ops *ops);
409 -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
410 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
411 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
412 pgoff_t index, void *entry, bool wake_all);
413 @@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags, get_block_t gb)
-{
-	return VM_FAULT_FALLBACK;
-}
-
423 #ifdef CONFIG_FS_DAX_PMD
424 static inline unsigned int dax_radix_order(void *entry)
426 @@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
429 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
430 -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
432 static inline bool vma_is_dax(struct vm_area_struct *vma)