From aa550e790b3c492bfc3ef5e4784e19b5a2e3fa90 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 21 Nov 2016 11:51:03 -0500 Subject: [PATCH] Check in Jan Kara's v3 DAX iomap patches --- DAX-iomap-write-support | 228 +++++++++++ avoid-split-extents-for-DAX-writes | 60 +++ convert-DAX-faults-to-iomap-infrastructure | 80 ++++ convert-dax-reads-to-iomap-infrastructure | 161 ++++++++ dax-rip-out-get_block-based-IO-support | 437 +++++++++++++++++++++ ...ro_range-for-zeroing-truncated-page-in-DAX-path | 48 +++ factor-out-checks-from-ext4_file_write_iter | 141 +++++++ let-S_DAX-set-only-if-DAX-is-really-supported | 102 +++++ rip-out-DAX-handling-from-direct-IO-path | 182 +++++++++ series | 11 + timestamps | 16 +- use-iomap-for-zeroing-blocks-in-DAX-mode | 36 ++ 12 files changed, 1499 insertions(+), 3 deletions(-) create mode 100644 DAX-iomap-write-support create mode 100644 avoid-split-extents-for-DAX-writes create mode 100644 convert-DAX-faults-to-iomap-infrastructure create mode 100644 convert-dax-reads-to-iomap-infrastructure create mode 100644 dax-rip-out-get_block-based-IO-support create mode 100644 ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path create mode 100644 factor-out-checks-from-ext4_file_write_iter create mode 100644 let-S_DAX-set-only-if-DAX-is-really-supported create mode 100644 rip-out-DAX-handling-from-direct-IO-path create mode 100644 use-iomap-for-zeroing-blocks-in-DAX-mode diff --git a/DAX-iomap-write-support b/DAX-iomap-write-support new file mode 100644 index 00000000..cdb2404b --- /dev/null +++ b/DAX-iomap-write-support @@ -0,0 +1,228 @@ +ext4: DAX iomap write support + +From: Jan Kara + +Implement DAX writes using the new iomap infrastructure instead of +overloading the direct IO path. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/file.c | 40 ++++++++++++++++++ + fs/ext4/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- + 2 files changed, 160 insertions(+), 6 deletions(-) + +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 1f25c644cb12..1953fe34f9fe 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) + return iov_iter_count(from); + } + ++#ifdef CONFIG_FS_DAX ++static ssize_t ++ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ssize_t ret; ++ bool overwrite = false; ++ ++ inode_lock(inode); ++ ret = ext4_write_checks(iocb, from); ++ if (ret <= 0) ++ goto out; ++ ret = file_remove_privs(iocb->ki_filp); ++ if (ret) ++ goto out; ++ ret = file_update_time(iocb->ki_filp); ++ if (ret) ++ goto out; ++ ++ if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { ++ overwrite = true; ++ downgrade_write(&inode->i_rwsem); ++ } ++ ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); ++out: ++ if (!overwrite) ++ inode_unlock(inode); ++ else ++ inode_unlock_shared(inode); ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ return ret; ++} ++#endif ++ + static ssize_t + ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + { +@@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + int overwrite = 0; + ssize_t ret; + ++#ifdef CONFIG_FS_DAX ++ if (IS_DAX(inode)) ++ return ext4_dax_write_iter(iocb, from); ++#endif ++ + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index df017ce3e52d..a7079cab645a 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3321,18 +3321,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + struct ext4_map_blocks map; + int ret; + +- if (flags & 
IOMAP_WRITE) +- return -EIO; +- + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; + + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; + +- ret = ext4_map_blocks(NULL, inode, &map, 0); +- if (ret < 0) +- return ret; ++ if (!(flags & IOMAP_WRITE)) { ++ ret = ext4_map_blocks(NULL, inode, &map, 0); ++ } else { ++ int dio_credits; ++ handle_t *handle; ++ int retries = 0; ++ ++ /* Trim mapping request to maximum we can map at once for DIO */ ++ if (map.m_len > DIO_MAX_BLOCKS) ++ map.m_len = DIO_MAX_BLOCKS; ++ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); ++retry: ++ /* ++ * Either we allocate blocks and then we don't get unwritten ++ * extent so we have reserved enough credits, or the blocks ++ * are already allocated and unwritten and in that case ++ * extent conversion fits in the credits as well. ++ */ ++ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, ++ dio_credits); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ret = ext4_map_blocks(handle, inode, &map, ++ EXT4_GET_BLOCKS_PRE_IO | ++ EXT4_GET_BLOCKS_CREATE_ZERO); ++ if (ret < 0) { ++ ext4_journal_stop(handle); ++ if (ret == -ENOSPC && ++ ext4_should_retry_alloc(inode->i_sb, &retries)) ++ goto retry; ++ return ret; ++ } ++ /* For DAX writes we need to zero out unwritten extents */ ++ if (map.m_flags & EXT4_MAP_UNWRITTEN) { ++ /* ++ * We are protected by i_mmap_sem or i_rwsem so we know ++ * block cannot go away from under us even though we ++ * dropped i_data_sem. Convert extent to written and ++ * write zeros there. ++ */ ++ ret = ext4_map_blocks(handle, inode, &map, ++ EXT4_GET_BLOCKS_CONVERT | ++ EXT4_GET_BLOCKS_CREATE_ZERO); ++ if (ret < 0) { ++ ext4_journal_stop(handle); ++ return ret; ++ } ++ } ++ ++ /* ++ * If we added blocks beyond i_size we need to make sure they ++ * will get truncated if we crash before updating i_size in ++ * ext4_iomap_end(). 
++ */ ++ if (first_block + map.m_len > ++ (inode->i_size + (1 << blkbits) - 1) >> blkbits) { ++ int err; ++ ++ err = ext4_orphan_add(handle, inode); ++ if (err < 0) { ++ ext4_journal_stop(handle); ++ return err; ++ } ++ } ++ ext4_journal_stop(handle); ++ } + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; +@@ -3360,8 +3421,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + return 0; + } + ++static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ++ ssize_t written, unsigned flags, struct iomap *iomap) ++{ ++ int ret = 0; ++ handle_t *handle; ++ int blkbits = inode->i_blkbits; ++ bool truncate = false; ++ ++ if (!(flags & IOMAP_WRITE)) ++ return 0; ++ ++ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); ++ if (IS_ERR(handle)) { ++ ret = PTR_ERR(handle); ++ goto orphan_del; ++ } ++ if (ext4_update_inode_size(inode, offset + written)) ++ ext4_mark_inode_dirty(handle, inode); ++ /* ++ * We may need to truncate allocated but not written blocks beyond EOF. ++ */ ++ if (iomap->offset + iomap->length > ++ ALIGN(inode->i_size, 1 << blkbits)) { ++ ext4_lblk_t written_blk, end_blk; ++ ++ written_blk = (offset + written) >> blkbits; ++ end_blk = (offset + length) >> blkbits; ++ if (written_blk < end_blk && ext4_can_truncate(inode)) ++ truncate = true; ++ } ++ /* ++ * Remove inode from orphan list if we were extending a inode and ++ * everything went fine. ++ */ ++ if (!truncate && inode->i_nlink && ++ !list_empty(&EXT4_I(inode)->i_orphan)) ++ ext4_orphan_del(handle, inode); ++ ext4_journal_stop(handle); ++ if (truncate) { ++ ext4_truncate_failed_write(inode); ++orphan_del: ++ /* ++ * If truncate failed early the inode might still be on the ++ * orphan list; we need to make sure the inode is removed from ++ * the orphan list in that case. 
++ */ ++ if (inode->i_nlink) ++ ext4_orphan_del(NULL, inode); ++ } ++ return ret; ++} ++ + struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, ++ .iomap_end = ext4_iomap_end, + }; + + #else +-- +2.6.6 + + diff --git a/avoid-split-extents-for-DAX-writes b/avoid-split-extents-for-DAX-writes new file mode 100644 index 00000000..e6c30529 --- /dev/null +++ b/avoid-split-extents-for-DAX-writes @@ -0,0 +1,60 @@ +ext4: avoid split extents for DAX writes + +From: Jan Kara + +Currently mapping of blocks for DAX writes happens with +EXT4_GET_BLOCKS_PRE_IO flag set. That has a result that each +ext4_map_blocks() call creates a separate written extent, although it +could be merged to the neighboring extents in the extent tree. The +reason for using this flag is that in case the extent is unwritten, we +need to convert it to a written one and zero it out. However this "convert +mapped range to written" operation is already implemented by +ext4_map_blocks() for the case of data writes into an unwritten extent. So +just use flags for that mode of operation, simplify the code, and avoid +unnecessary split extents. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/inode.c | 17 ----------------- + 1 file changed, 17 deletions(-) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index a7079cab645a..3192ec0768d4 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3351,7 +3351,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, &map, +- EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); +@@ -3360,22 +3359,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + goto retry; + return ret; + } +- /* For DAX writes we need to zero out unwritten extents */ +- if (map.m_flags & EXT4_MAP_UNWRITTEN) { +- /* +- * We are protected by i_mmap_sem or i_rwsem so we know +- * block cannot go away from under us even though we +- * dropped i_data_sem. Convert extent to written and +- * write zeros there. +- */ +- ret = ext4_map_blocks(handle, inode, &map, +- EXT4_GET_BLOCKS_CONVERT | +- EXT4_GET_BLOCKS_CREATE_ZERO); +- if (ret < 0) { +- ext4_journal_stop(handle); +- return ret; +- } +- } + + /* + * If we added blocks beyond i_size we need to make sure they +-- +2.6.6 + + diff --git a/convert-DAX-faults-to-iomap-infrastructure b/convert-DAX-faults-to-iomap-infrastructure new file mode 100644 index 00000000..211a7501 --- /dev/null +++ b/convert-DAX-faults-to-iomap-infrastructure @@ -0,0 +1,80 @@ +ext4: convert DAX faults to iomap infrastructure + +From: Jan Kara + +Convert DAX faults to use iomap infrastructure. We would not have to start +transaction in ext4_dax_fault() anymore since ext4_iomap_begin takes +care of that but so far we do that to avoid lock inversion of +transaction start with DAX entry lock which gets acquired in +dax_iomap_fault() before calling ->iomap_begin handler. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/file.c | 9 +++++---- + fs/ext4/inode.c | 14 +++++++++----- + 2 files changed, 14 insertions(+), 9 deletions(-) + +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 1953fe34f9fe..b5f184493c57 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -275,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else +- result = dax_fault(vma, vmf, ext4_dax_get_block); ++ result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + + if (write) { + if (!IS_ERR(handle)) +@@ -309,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; +- else +- result = dax_pmd_fault(vma, addr, pmd, flags, +- ext4_dax_get_block); ++ else { ++ result = dax_iomap_pmd_fault(vma, addr, pmd, flags, ++ &ext4_iomap_ops); ++ } + + if (write) { + if (!IS_ERR(handle)) +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 3192ec0768d4..4d71c7bc3524 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3361,12 +3361,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + } + + /* +- * If we added blocks beyond i_size we need to make sure they ++ * If we added blocks beyond i_size, we need to make sure they + * will get truncated if we crash before updating i_size in +- * ext4_iomap_end(). ++ * ext4_iomap_end(). For faults we don't need to do that (and ++ * even cannot because for orphan list operations inode_lock is ++ * required) - if we happen to instantiate block beyond i_size, ++ * it is because we race with truncate which has already added ++ * the inode to the orphan list. 
+ */ +- if (first_block + map.m_len > +- (inode->i_size + (1 << blkbits) - 1) >> blkbits) { ++ if (!(flags & IOMAP_FAULT) && first_block + map.m_len > ++ (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { + int err; + + err = ext4_orphan_add(handle, inode); +@@ -3412,7 +3416,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + int blkbits = inode->i_blkbits; + bool truncate = false; + +- if (!(flags & IOMAP_WRITE)) ++ if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); +-- +2.6.6 diff --git a/convert-dax-reads-to-iomap-infrastructure b/convert-dax-reads-to-iomap-infrastructure new file mode 100644 index 00000000..2ae825a6 --- /dev/null +++ b/convert-dax-reads-to-iomap-infrastructure @@ -0,0 +1,161 @@ +ext4: convert DAX reads to iomap infrastructure + +From: Jan Kara + +Implement basic iomap_begin function that handles reading and use it for +DAX reads. + +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 2 ++ + fs/ext4/file.c | 38 +++++++++++++++++++++++++++++++++++++- + fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 93 insertions(+), 1 deletion(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 282a51b07c57..098b39910001 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3271,6 +3271,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) + return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); + } + ++extern struct iomap_ops ext4_iomap_ops; ++ + #endif /* __KERNEL__ */ + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 9facb4dc5c70..1f25c644cb12 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -31,6 +31,42 @@ + #include "xattr.h" + #include "acl.h" + ++#ifdef CONFIG_FS_DAX ++static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) 
++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ssize_t ret; ++ ++ inode_lock_shared(inode); ++ /* ++ * Recheck under inode lock - at this point we are sure it cannot ++ * change anymore ++ */ ++ if (!IS_DAX(inode)) { ++ inode_unlock_shared(inode); ++ /* Fallback to buffered IO in case we cannot support DAX */ ++ return generic_file_read_iter(iocb, to); ++ } ++ ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); ++ inode_unlock_shared(inode); ++ ++ file_accessed(iocb->ki_filp); ++ return ret; ++} ++#endif ++ ++static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ++{ ++ if (!iov_iter_count(to)) ++ return 0; /* skip atime */ ++ ++#ifdef CONFIG_FS_DAX ++ if (IS_DAX(file_inode(iocb->ki_filp))) ++ return ext4_dax_read_iter(iocb, to); ++#endif ++ return generic_file_read_iter(iocb, to); ++} ++ + /* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release +@@ -690,7 +726,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) + + const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, +- .read_iter = generic_file_read_iter, ++ .read_iter = ext4_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, + #ifdef CONFIG_COMPAT +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 5337828c68a7..83e8411370d3 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include "ext4_jbd2.h" + #include "xattr.h" +@@ -3310,6 +3311,59 @@ int ext4_dax_get_block(struct inode *inode, sector_t iblock, + clear_buffer_new(bh_result); + return 0; + } ++ ++static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ++ unsigned flags, struct iomap *iomap) ++{ ++ unsigned int blkbits = inode->i_blkbits; ++ unsigned long first_block = offset >> blkbits; ++ unsigned long last_block = (offset + length - 1) >> blkbits; ++ struct ext4_map_blocks 
map; ++ int ret; ++ ++ if (flags & IOMAP_WRITE) ++ return -EIO; ++ ++ if (WARN_ON_ONCE(ext4_has_inline_data(inode))) ++ return -ERANGE; ++ ++ map.m_lblk = first_block; ++ map.m_len = last_block - first_block + 1; ++ ++ ret = ext4_map_blocks(NULL, inode, &map, 0); ++ if (ret < 0) ++ return ret; ++ ++ iomap->flags = 0; ++ iomap->bdev = inode->i_sb->s_bdev; ++ iomap->offset = first_block << blkbits; ++ ++ if (ret == 0) { ++ iomap->type = IOMAP_HOLE; ++ iomap->blkno = IOMAP_NULL_BLOCK; ++ iomap->length = (u64)map.m_len << blkbits; ++ } else { ++ if (map.m_flags & EXT4_MAP_MAPPED) { ++ iomap->type = IOMAP_MAPPED; ++ } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { ++ iomap->type = IOMAP_UNWRITTEN; ++ } else { ++ WARN_ON_ONCE(1); ++ return -EIO; ++ } ++ iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); ++ iomap->length = (u64)map.m_len << blkbits; ++ } ++ ++ if (map.m_flags & EXT4_MAP_NEW) ++ iomap->flags |= IOMAP_F_NEW; ++ return 0; ++} ++ ++struct iomap_ops ext4_iomap_ops = { ++ .iomap_begin = ext4_iomap_begin, ++}; ++ + #else + /* Just define empty function, it will never get called. */ + int ext4_dax_get_block(struct inode *inode, sector_t iblock, +-- +2.6.6 + + diff --git a/dax-rip-out-get_block-based-IO-support b/dax-rip-out-get_block-based-IO-support new file mode 100644 index 00000000..f1e3c0d9 --- /dev/null +++ b/dax-rip-out-get_block-based-IO-support @@ -0,0 +1,437 @@ +dax: rip out get_block based IO support + +From: Jan Kara + +No one uses functions using the get_block callback anymore. Rip them +out and update documentation. 
+ +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + Documentation/filesystems/dax.txt | 22 +-- + fs/dax.c | 315 -------------------------------------- + include/linux/dax.h | 12 -- + 3 files changed, 11 insertions(+), 338 deletions(-) + +diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt +index 23d18b8a49d5..a7e6e14aeb08 100644 +--- a/Documentation/filesystems/dax.txt ++++ b/Documentation/filesystems/dax.txt +@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers + Filesystem support consists of + - adding support to mark inodes as being DAX by setting the S_DAX flag in + i_flags +-- implementing the direct_IO address space operation, and calling +- dax_do_io() instead of blockdev_direct_IO() if S_DAX is set ++- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw() ++ when inode has S_DAX flag set + - implementing an mmap file operation for DAX files which sets the + VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to +- include handlers for fault, pmd_fault and page_mkwrite (which should +- probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the +- appropriate get_block() callback) +-- calling dax_truncate_page() instead of block_truncate_page() for DAX files +-- calling dax_zero_page_range() instead of zero_user() for DAX files ++ include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These ++ handlers should probably call dax_iomap_fault() (for fault and page_mkwrite ++ handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate ++ iomap operations. ++- calling iomap_zero_range() passing appropriate iomap operations instead of ++ block_truncate_page() for DAX files + - ensuring that there is sufficient locking between reads, writes, + truncates and page faults + +-The get_block() callback passed to the DAX functions may return +-uninitialised extents. 
If it does, it must ensure that simultaneous +-calls to get_block() (for example by a page-fault racing with a read() +-or a write()) work correctly. ++The iomap handlers for allocating blocks must make sure that allocated blocks ++are zeroed out and converted to written extents before being returned to avoid ++exposure of uninitialized data through mmap. + + These filesystems may be used for inspiration: + - ext2: see Documentation/filesystems/ext2.txt +diff --git a/fs/dax.c b/fs/dax.c +index 28af41b9da3a..ad131cd2605d 100644 +--- a/fs/dax.c ++++ b/fs/dax.c +@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) + return page; + } + +-static bool buffer_written(struct buffer_head *bh) +-{ +- return buffer_mapped(bh) && !buffer_unwritten(bh); +-} +- +-static sector_t to_sector(const struct buffer_head *bh, +- const struct inode *inode) +-{ +- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); +- +- return sector; +-} +- +-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, +- loff_t start, loff_t end, get_block_t get_block, +- struct buffer_head *bh) +-{ +- loff_t pos = start, max = start, bh_max = start; +- bool hole = false; +- struct block_device *bdev = NULL; +- int rw = iov_iter_rw(iter), rc; +- long map_len = 0; +- struct blk_dax_ctl dax = { +- .addr = ERR_PTR(-EIO), +- }; +- unsigned blkbits = inode->i_blkbits; +- sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) +- >> blkbits; +- +- if (rw == READ) +- end = min(end, i_size_read(inode)); +- +- while (pos < end) { +- size_t len; +- if (pos == max) { +- long page = pos >> PAGE_SHIFT; +- sector_t block = page << (PAGE_SHIFT - blkbits); +- unsigned first = pos - (block << blkbits); +- long size; +- +- if (pos == bh_max) { +- bh->b_size = PAGE_ALIGN(end - pos); +- bh->b_state = 0; +- rc = get_block(inode, block, bh, rw == WRITE); +- if (rc) +- break; +- bh_max = pos - first + bh->b_size; +- bdev = bh->b_bdev; +- /* +- * We allow uninitialized 
buffers for writes +- * beyond EOF as those cannot race with faults +- */ +- WARN_ON_ONCE( +- (buffer_new(bh) && block < file_blks) || +- (rw == WRITE && buffer_unwritten(bh))); +- } else { +- unsigned done = bh->b_size - +- (bh_max - (pos - first)); +- bh->b_blocknr += done >> blkbits; +- bh->b_size -= done; +- } +- +- hole = rw == READ && !buffer_written(bh); +- if (hole) { +- size = bh->b_size - first; +- } else { +- dax_unmap_atomic(bdev, &dax); +- dax.sector = to_sector(bh, inode); +- dax.size = bh->b_size; +- map_len = dax_map_atomic(bdev, &dax); +- if (map_len < 0) { +- rc = map_len; +- break; +- } +- dax.addr += first; +- size = map_len - first; +- } +- /* +- * pos + size is one past the last offset for IO, +- * so pos + size can overflow loff_t at extreme offsets. +- * Cast to u64 to catch this and get the true minimum. +- */ +- max = min_t(u64, pos + size, end); +- } +- +- if (iov_iter_rw(iter) == WRITE) { +- len = copy_from_iter_pmem(dax.addr, max - pos, iter); +- } else if (!hole) +- len = copy_to_iter((void __force *) dax.addr, max - pos, +- iter); +- else +- len = iov_iter_zero(max - pos, iter); +- +- if (!len) { +- rc = -EFAULT; +- break; +- } +- +- pos += len; +- if (!IS_ERR(dax.addr)) +- dax.addr += len; +- } +- +- dax_unmap_atomic(bdev, &dax); +- +- return (pos == start) ? rc : pos - start; +-} +- +-/** +- * dax_do_io - Perform I/O to a DAX file +- * @iocb: The control block for this I/O +- * @inode: The file which the I/O is directed at +- * @iter: The addresses to do I/O from or to +- * @get_block: The filesystem method used to translate file offsets to blocks +- * @end_io: A filesystem callback for I/O completion +- * @flags: See below +- * +- * This function uses the same locking scheme as do_blockdev_direct_IO: +- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the +- * caller for writes. For reads, we take and release the i_mutex ourselves. +- * If DIO_LOCKING is not set, the filesystem takes care of its own locking. 
+- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O +- * is in progress. +- */ +-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, +- struct iov_iter *iter, get_block_t get_block, +- dio_iodone_t end_io, int flags) +-{ +- struct buffer_head bh; +- ssize_t retval = -EINVAL; +- loff_t pos = iocb->ki_pos; +- loff_t end = pos + iov_iter_count(iter); +- +- memset(&bh, 0, sizeof(bh)); +- bh.b_bdev = inode->i_sb->s_bdev; +- +- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) +- inode_lock(inode); +- +- /* Protects against truncate */ +- if (!(flags & DIO_SKIP_DIO_COUNT)) +- inode_dio_begin(inode); +- +- retval = dax_io(inode, iter, pos, end, get_block, &bh); +- +- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) +- inode_unlock(inode); +- +- if (end_io) { +- int err; +- +- err = end_io(iocb, pos, retval, bh.b_private); +- if (err) +- retval = err; +- } +- +- if (!(flags & DIO_SKIP_DIO_COUNT)) +- inode_dio_end(inode); +- return retval; +-} +-EXPORT_SYMBOL_GPL(dax_do_io); +- + /* + * DAX radix tree locking + */ +@@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping, + } + + /** +- * dax_fault - handle a page fault on a DAX file +- * @vma: The virtual memory area where the fault occurred +- * @vmf: The description of the fault +- * @get_block: The filesystem method used to translate file offsets to blocks +- * +- * When a page fault occurs, filesystems may call this helper in their +- * fault handler for DAX files. dax_fault() assumes the caller has done all +- * the necessary locking for the page fault to proceed successfully. 
+- */ +-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +- get_block_t get_block) +-{ +- struct file *file = vma->vm_file; +- struct address_space *mapping = file->f_mapping; +- struct inode *inode = mapping->host; +- void *entry; +- struct buffer_head bh; +- unsigned long vaddr = (unsigned long)vmf->virtual_address; +- unsigned blkbits = inode->i_blkbits; +- sector_t block; +- pgoff_t size; +- int error; +- int major = 0; +- +- /* +- * Check whether offset isn't beyond end of file now. Caller is supposed +- * to hold locks serializing us with truncate / punch hole so this is +- * a reliable test. +- */ +- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; +- if (vmf->pgoff >= size) +- return VM_FAULT_SIGBUS; +- +- memset(&bh, 0, sizeof(bh)); +- block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); +- bh.b_bdev = inode->i_sb->s_bdev; +- bh.b_size = PAGE_SIZE; +- +- entry = grab_mapping_entry(mapping, vmf->pgoff, 0); +- if (IS_ERR(entry)) { +- error = PTR_ERR(entry); +- goto out; +- } +- +- error = get_block(inode, block, &bh, 0); +- if (!error && (bh.b_size < PAGE_SIZE)) +- error = -EIO; /* fs corruption? 
*/ +- if (error) +- goto unlock_entry; +- +- if (vmf->cow_page) { +- struct page *new_page = vmf->cow_page; +- if (buffer_written(&bh)) +- error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), +- bh.b_size, new_page, vaddr); +- else +- clear_user_highpage(new_page, vaddr); +- if (error) +- goto unlock_entry; +- if (!radix_tree_exceptional_entry(entry)) { +- vmf->page = entry; +- return VM_FAULT_LOCKED; +- } +- vmf->entry = entry; +- return VM_FAULT_DAX_LOCKED; +- } +- +- if (!buffer_mapped(&bh)) { +- if (vmf->flags & FAULT_FLAG_WRITE) { +- error = get_block(inode, block, &bh, 1); +- count_vm_event(PGMAJFAULT); +- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); +- major = VM_FAULT_MAJOR; +- if (!error && (bh.b_size < PAGE_SIZE)) +- error = -EIO; +- if (error) +- goto unlock_entry; +- } else { +- return dax_load_hole(mapping, entry, vmf); +- } +- } +- +- /* Filesystem should not return unwritten buffers to us! */ +- WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); +- error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), +- bh.b_size, &entry, vma, vmf); +- unlock_entry: +- put_locked_mapping_entry(mapping, vmf->pgoff, entry); +- out: +- if (error == -ENOMEM) +- return VM_FAULT_OOM | major; +- /* -EBUSY is fine, somebody else faulted on the same PTE */ +- if ((error < 0) && (error != -EBUSY)) +- return VM_FAULT_SIGBUS | major; +- return VM_FAULT_NOPAGE | major; +-} +-EXPORT_SYMBOL_GPL(dax_fault); +- +-/** + * dax_pfn_mkwrite - handle first write to DAX page + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault +@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + } + EXPORT_SYMBOL_GPL(__dax_zero_page_range); + +-/** +- * dax_zero_page_range - zero a range within a page of a DAX file +- * @inode: The file being truncated +- * @from: The file offset that is being truncated to +- * @length: The number of bytes to zero +- * @get_block: The filesystem method used 
to translate file offsets to blocks +- * +- * This function can be called by a filesystem when it is zeroing part of a +- * page in a DAX file. This is intended for hole-punch operations. If +- * you are truncating a file, the helper function dax_truncate_page() may be +- * more convenient. +- */ +-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, +- get_block_t get_block) +-{ +- struct buffer_head bh; +- pgoff_t index = from >> PAGE_SHIFT; +- unsigned offset = from & (PAGE_SIZE-1); +- int err; +- +- /* Block boundary? Nothing to do */ +- if (!length) +- return 0; +- if (WARN_ON_ONCE((offset + length) > PAGE_SIZE)) +- return -EINVAL; +- +- memset(&bh, 0, sizeof(bh)); +- bh.b_bdev = inode->i_sb->s_bdev; +- bh.b_size = PAGE_SIZE; +- err = get_block(inode, index, &bh, 0); +- if (err < 0 || !buffer_written(&bh)) +- return err; +- +- return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), +- offset, length); +-} +-EXPORT_SYMBOL_GPL(dax_zero_page_range); +- +-/** +- * dax_truncate_page - handle a partial page being truncated in a DAX file +- * @inode: The file being truncated +- * @from: The file offset that is being truncated to +- * @get_block: The filesystem method used to translate file offsets to blocks +- * +- * Similar to block_truncate_page(), this function can be called by a +- * filesystem when it is truncating a DAX file to handle the partial page. 
+- */ +-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +-{ +- unsigned length = PAGE_ALIGN(from) - from; +- return dax_zero_page_range(inode, from, length, get_block); +-} +-EXPORT_SYMBOL_GPL(dax_truncate_page); +- + #ifdef CONFIG_FS_IOMAP + static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) + { +diff --git a/include/linux/dax.h b/include/linux/dax.h +index 8d1a5c47945f..0afade8bd3d7 100644 +--- a/include/linux/dax.h ++++ b/include/linux/dax.h +@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) + + ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops); +-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, +- get_block_t, dio_iodone_t, int flags); +-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); +-int dax_truncate_page(struct inode *, loff_t from, get_block_t); + int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops); +-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); + int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); + void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, void *entry, bool wake_all); +@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev, + } + #endif + +-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, +- pmd_t *pmd, unsigned int flags, get_block_t gb) +-{ +- return VM_FAULT_FALLBACK; +-} +- + #ifdef CONFIG_FS_DAX_PMD + static inline unsigned int dax_radix_order(void *entry) + { +@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, + } + #endif + int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); +-#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) + + static inline bool vma_is_dax(struct vm_area_struct *vma) + { +-- +2.6.6 + + diff --git 
a/ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path b/ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path new file mode 100644 index 00000000..efbc8aec --- /dev/null +++ b/ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path @@ -0,0 +1,48 @@ +ext2: use iomap_zero_range() for zeroing truncated page in DAX path + +From: Jan Kara + +Currently the last user of ext2_get_blocks() for DAX inodes is +dax_truncate_page(). Convert that to iomap_zero_range() so that all DAX +IO uses the iomap path. + +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext2/inode.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c +index 41b8b44a391c..046b642f3585 100644 +--- a/fs/ext2/inode.c ++++ b/fs/ext2/inode.c +@@ -850,6 +850,9 @@ struct iomap_ops ext2_iomap_ops = { + .iomap_begin = ext2_iomap_begin, + .iomap_end = ext2_iomap_end, + }; ++#else ++/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ ++struct iomap_ops ext2_iomap_ops; + #endif /* CONFIG_FS_DAX */ + + int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +@@ -1293,9 +1296,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) + + inode_dio_wait(inode); + +- if (IS_DAX(inode)) +- error = dax_truncate_page(inode, newsize, ext2_get_block); +- else if (test_opt(inode->i_sb, NOBH)) ++ if (IS_DAX(inode)) { ++ error = iomap_zero_range(inode, newsize, ++ PAGE_ALIGN(newsize) - newsize, NULL, ++ &ext2_iomap_ops); ++ } else if (test_opt(inode->i_sb, NOBH)) + error = nobh_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + else +-- +2.6.6 + + diff --git a/factor-out-checks-from-ext4_file_write_iter b/factor-out-checks-from-ext4_file_write_iter new file mode 100644 index 00000000..f17aaa10 --- /dev/null +++ b/factor-out-checks-from-ext4_file_write_iter @@ -0,0 +1,141 @@ +ext4: factor out checks from ext4_file_write_iter() + +From: Jan 
Kara + +Factor out checks of 'from' and whether we are overwriting out of +ext4_file_write_iter() so that the function is easier to follow. + +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/file.c | 97 ++++++++++++++++++++++++++++++---------------------------- + 1 file changed, 50 insertions(+), 47 deletions(-) + +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 2a822d30e73f..9facb4dc5c70 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -88,6 +88,51 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) + return 0; + } + ++/* Is IO overwriting allocated and initialized blocks? */ ++static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) ++{ ++ struct ext4_map_blocks map; ++ unsigned int blkbits = inode->i_blkbits; ++ int err, blklen; ++ ++ if (pos + len > i_size_read(inode)) ++ return false; ++ ++ map.m_lblk = pos >> blkbits; ++ map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); ++ blklen = map.m_len; ++ ++ err = ext4_map_blocks(NULL, inode, &map, 0); ++ /* ++ * 'err==len' means that all of the blocks have been preallocated, ++ * regardless of whether they have been initialized or not. To exclude ++ * unwritten extents, we need to check m_flags. ++ */ ++ return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); ++} ++ ++static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ssize_t ret; ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ return ret; ++ /* ++ * If we have encountered a bitmap-format file, the size limit ++ * is smaller than s_maxbytes, which is for extent-mapped files. 
++ */ ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ ++ if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) ++ return -EFBIG; ++ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); ++ } ++ return iov_iter_count(from); ++} ++ + static ssize_t + ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + { +@@ -98,7 +143,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + ssize_t ret; + + inode_lock(inode); +- ret = generic_write_checks(iocb, from); ++ ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + +@@ -114,53 +159,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + ext4_unwritten_wait(inode); + } + +- /* +- * If we have encountered a bitmap-format file, the size limit +- * is smaller than s_maxbytes, which is for extent-mapped files. +- */ +- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { +- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- +- if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { +- ret = -EFBIG; +- goto out; +- } +- iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); +- } +- + iocb->private = &overwrite; +- if (o_direct) { +- size_t length = iov_iter_count(from); +- loff_t pos = iocb->ki_pos; +- +- /* check whether we do a DIO overwrite or not */ +- if (ext4_should_dioread_nolock(inode) && !unaligned_aio && +- pos + length <= i_size_read(inode)) { +- struct ext4_map_blocks map; +- unsigned int blkbits = inode->i_blkbits; +- int err, len; +- +- map.m_lblk = pos >> blkbits; +- map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); +- len = map.m_len; +- +- err = ext4_map_blocks(NULL, inode, &map, 0); +- /* +- * 'err==len' means that all of blocks has +- * been preallocated no matter they are +- * initialized or not. For excluding +- * unwritten extents, we need to check +- * m_flags. There are two conditions that +- * indicate for initialized extents. 
1) If we +- * hit extent cache, EXT4_MAP_MAPPED flag is +- * returned; 2) If we do a real lookup, +- * non-flags are returned. So we should check +- * these two conditions. +- */ +- if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) +- overwrite = 1; +- } +- } ++ /* Check whether we do a DIO overwrite or not */ ++ if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && ++ ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) ++ overwrite = 1; + + ret = __generic_file_write_iter(iocb, from); + inode_unlock(inode); +-- +2.6.6 + + diff --git a/let-S_DAX-set-only-if-DAX-is-really-supported b/let-S_DAX-set-only-if-DAX-is-really-supported new file mode 100644 index 00000000..9fd221d6 --- /dev/null +++ b/let-S_DAX-set-only-if-DAX-is-really-supported @@ -0,0 +1,102 @@ +ext4: only set S_DAX if DAX is really supported + +From: Jan Kara + +Currently we have S_DAX set inode->i_flags for a regular file whenever +ext4 is mounted with dax mount option. However in some cases we cannot +really do DAX - e.g. when inode is marked to use data journalling, when +inode data is being encrypted, or when inode is stored inline. Make sure +S_DAX flag is appropriately set/cleared in these cases. + +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/inline.c | 10 ++++++++++ + fs/ext4/inode.c | 9 ++++++++- + fs/ext4/super.c | 6 ++++++ + 3 files changed, 24 insertions(+), 1 deletion(-) + +diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +index f74d5ee2cdec..c29678965c3c 100644 +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle, + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); ++ /* ++ * Propagate changes to inode->i_flags as well - e.g. 
S_DAX may ++ * get cleared ++ */ ++ ext4_set_inode_flags(inode); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +@@ -442,6 +447,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); ++ /* ++ * Propagate changes to inode->i_flags as well - e.g. S_DAX may ++ * get set. ++ */ ++ ext4_set_inode_flags(inode); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 3d58b2b477e8..5337828c68a7 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4355,7 +4355,9 @@ void ext4_set_inode_flags(struct inode *inode) + new_fl |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + new_fl |= S_DIRSYNC; +- if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) ++ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && ++ !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && ++ !ext4_encrypted_inode(inode)) + new_fl |= S_DAX; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); +@@ -5623,6 +5625,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } + ext4_set_aops(inode); ++ /* ++ * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. ++ * E.g. S_DAX may get cleared / set. ++ */ ++ ext4_set_inode_flags(inode); + + jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 20da99da0a34..d5b94cc6a74e 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1126,6 +1126,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); ++ /* ++ * Update inode->i_flags - e.g. 
S_DAX may get disabled ++ */ ++ ext4_set_inode_flags(inode); + } + return res; + } +@@ -1140,6 +1144,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ++ /* Update inode->i_flags - e.g. S_DAX may get disabled */ ++ ext4_set_inode_flags(inode); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); +-- +2.6.6 + + diff --git a/rip-out-DAX-handling-from-direct-IO-path b/rip-out-DAX-handling-from-direct-IO-path new file mode 100644 index 00000000..1f2dda7f --- /dev/null +++ b/rip-out-DAX-handling-from-direct-IO-path @@ -0,0 +1,182 @@ +ext4: rip out DAX handling from direct IO path + +From: Jan Kara + +Reads and writes for DAX inodes should no longer end up in direct IO +code. Rip out the support and add a warning. + +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 2 -- + fs/ext4/inode.c | 97 +++++++++------------------------------------------------ + 2 files changed, 15 insertions(+), 84 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 098b39910001..8b763113a1b8 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2457,8 +2457,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); + struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); + int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +-int ext4_dax_get_block(struct inode *inode, sector_t iblock, +- struct buffer_head *bh_result, int create); + int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); + int ext4_dio_get_block(struct inode *inode, sector_t iblock, +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 4d71c7bc3524..d13f7cb6b1d5 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3272,46 +3272,6 
@@ static int ext4_releasepage(struct page *page, gfp_t wait) + } + + #ifdef CONFIG_FS_DAX +-/* +- * Get block function for DAX IO and mmap faults. It takes care of converting +- * unwritten extents to written ones and initializes new / converted blocks +- * to zeros. +- */ +-int ext4_dax_get_block(struct inode *inode, sector_t iblock, +- struct buffer_head *bh_result, int create) +-{ +- int ret; +- +- ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); +- if (!create) +- return _ext4_get_block(inode, iblock, bh_result, 0); +- +- ret = ext4_get_block_trans(inode, iblock, bh_result, +- EXT4_GET_BLOCKS_PRE_IO | +- EXT4_GET_BLOCKS_CREATE_ZERO); +- if (ret < 0) +- return ret; +- +- if (buffer_unwritten(bh_result)) { +- /* +- * We are protected by i_mmap_sem or i_mutex so we know block +- * cannot go away from under us even though we dropped +- * i_data_sem. Convert extent to written and write zeros there. +- */ +- ret = ext4_get_block_trans(inode, iblock, bh_result, +- EXT4_GET_BLOCKS_CONVERT | +- EXT4_GET_BLOCKS_CREATE_ZERO); +- if (ret < 0) +- return ret; +- } +- /* +- * At least for now we have to clear BH_New so that DAX code +- * doesn't attempt to zero blocks again in a racy way. +- */ +- clear_buffer_new(bh_result); +- return 0; +-} +- + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) + { +@@ -3465,14 +3425,6 @@ struct iomap_ops ext4_iomap_ops = { + .iomap_end = ext4_iomap_end, + }; + +-#else +-/* Just define empty function, it will never get called. 
*/ +-int ext4_dax_get_block(struct inode *inode, sector_t iblock, +- struct buffer_head *bh_result, int create) +-{ +- BUG(); +- return 0; +-} + #endif + + static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, +@@ -3594,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) + iocb->private = NULL; + if (overwrite) + get_block_func = ext4_dio_get_block_overwrite; +- else if (IS_DAX(inode)) { +- /* +- * We can avoid zeroing for aligned DAX writes beyond EOF. Other +- * writes need zeroing either because they can race with page +- * faults or because they use partial blocks. +- */ +- if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && +- ext4_aligned_io(inode, offset, count)) +- get_block_func = ext4_dio_get_block; +- else +- get_block_func = ext4_dax_get_block; +- dio_flags = DIO_LOCKING; +- } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || ++ else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + get_block_func = ext4_dio_get_block; + dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; +@@ -3620,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) + #ifdef CONFIG_EXT4_FS_ENCRYPTION + BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); + #endif +- if (IS_DAX(inode)) { +- ret = dax_do_io(iocb, inode, iter, get_block_func, +- ext4_end_io_dio, dio_flags); +- } else +- ret = __blockdev_direct_IO(iocb, inode, +- inode->i_sb->s_bdev, iter, +- get_block_func, +- ext4_end_io_dio, NULL, dio_flags); ++ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, ++ get_block_func, ext4_end_io_dio, NULL, ++ dio_flags); + + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { +@@ -3696,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) + { + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode =
mapping->host; ++ size_t count = iov_iter_count(iter); + ssize_t ret; + + /* +@@ -3704,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) + * we are protected against page writeback as well. + */ + inode_lock_shared(inode); +- if (IS_DAX(inode)) { +- ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); +- } else { +- size_t count = iov_iter_count(iter); +- +- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, +- iocb->ki_pos + count); +- if (ret) +- goto out_unlock; +- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, +- iter, ext4_dio_get_block, +- NULL, NULL, 0); +- } ++ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, ++ iocb->ki_pos + count); ++ if (ret) ++ goto out_unlock; ++ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, ++ iter, ext4_dio_get_block, NULL, NULL, 0); + out_unlock: + inode_unlock_shared(inode); + return ret; +@@ -3745,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) + if (ext4_has_inline_data(inode)) + return 0; + ++ /* DAX uses iomap path now */ ++ if (WARN_ON_ONCE(IS_DAX(inode))) ++ return 0; ++ + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (iov_iter_rw(iter) == READ) + ret = ext4_direct_IO_read(iocb, iter); +-- +2.6.6 + + diff --git a/series b/series index 706fd4ed..3456fdf9 100644 --- a/series +++ b/series @@ -14,6 +14,17 @@ fix-sb-mount-options-processing verify-inodes_per_group-during-mount add-sanity-checking-in-count_overhead +factor-out-checks-from-ext4_file_write_iter +let-S_DAX-set-only-if-DAX-is-really-supported +convert-dax-reads-to-iomap-infrastructure +use-iomap-for-zeroing-blocks-in-DAX-mode +DAX-iomap-write-support +avoid-split-extents-for-DAX-writes +convert-DAX-faults-to-iomap-infrastructure +rip-out-DAX-handling-from-direct-IO-path +ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path +dax-rip-out-get_block-based-IO-support + 
#################################################### # unstable patches #################################################### diff --git a/timestamps b/timestamps index 0bdfbc45..c13a46e4 100755 --- a/timestamps +++ b/timestamps @@ -55,8 +55,18 @@ touch -d @1479310377 avoid-lockdep-warning-when-inheriting-encryption-context touch -d @1479492024 sanity-check-block-and-cluster-size touch -d @1479493466 fix-sb-mount-options-processing touch -d @1479493710 verify-inodes_per_group-during-mount -touch -d @1479494158 series touch -d @1479494267 add-sanity-checking-in-count_overhead touch -d @1479494327 stable-boundary -touch -d @1479674600 status -touch -d @1479676755 timestamps +touch -d @1479680991 factor-out-checks-from-ext4_file_write_iter +touch -d @1479681179 let-S_DAX-set-only-if-DAX-is-really-supported +touch -d @1479681366 convert-dax-reads-to-iomap-infrastructure +touch -d @1479683285 use-iomap-for-zeroing-blocks-in-DAX-mode +touch -d @1479683351 DAX-iomap-write-support +touch -d @1479683409 avoid-split-extents-for-DAX-writes +touch -d @1479685884 convert-DAX-faults-to-iomap-infrastructure +touch -d @1479686010 rip-out-DAX-handling-from-direct-IO-path +touch -d @1479692827 ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path +touch -d @1479692916 dax-rip-out-get_block-based-IO-support +touch -d @1479695661 series +touch -d @1479695664 status +touch -d @1479697791 timestamps diff --git a/use-iomap-for-zeroing-blocks-in-DAX-mode b/use-iomap-for-zeroing-blocks-in-DAX-mode new file mode 100644 index 00000000..1b23eaf8 --- /dev/null +++ b/use-iomap-for-zeroing-blocks-in-DAX-mode @@ -0,0 +1,36 @@ +ext4: use iomap for zeroing blocks in DAX mode + +From: Jan Kara + +Use iomap infrastructure for zeroing blocks when in DAX mode. +ext4_iomap_begin() handles read requests just fine and that's all that +is needed for iomap_zero_range(). 
+ +Reviewed-by: Ross Zwisler +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/inode.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 83e8411370d3..df017ce3e52d 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3849,8 +3849,10 @@ static int ext4_block_zero_page_range(handle_t *handle, + if (length > max || length < 0) + length = max; + +- if (IS_DAX(inode)) +- return dax_zero_page_range(inode, from, length, ext4_get_block); ++ if (IS_DAX(inode)) { ++ return iomap_zero_range(inode, from, length, NULL, ++ &ext4_iomap_ops); ++ } + return __ext4_block_zero_page_range(handle, mapping, from, length); + } + +-- +2.6.6 + + -- 2.11.4.GIT