From aa550e790b3c492bfc3ef5e4784e19b5a2e3fa90 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 21 Nov 2016 11:51:03 -0500
Subject: [PATCH] Check in Jan Kara's v3 DAX iomap patches

---
 DAX-iomap-write-support                            | 228 +++++++++++
 avoid-split-extents-for-DAX-writes                 |  60 +++
 convert-DAX-faults-to-iomap-infrastructure         |  80 ++++
 convert-dax-reads-to-iomap-infrastructure          | 161 ++++++++
 dax-rip-out-get_block-based-IO-support             | 437 +++++++++++++++++++++
 ...ro_range-for-zeroing-truncated-page-in-DAX-path |  48 +++
 factor-out-checks-from-ext4_file_write_iter        | 141 +++++++
 let-S_DAX-set-only-if-DAX-is-really-supported      | 102 +++++
 rip-out-DAX-handling-from-direct-IO-path           | 182 +++++++++
 series                                             |  11 +
 timestamps                                         |  16 +-
 use-iomap-for-zeroing-blocks-in-DAX-mode           |  36 ++
 12 files changed, 1499 insertions(+), 3 deletions(-)
 create mode 100644 DAX-iomap-write-support
 create mode 100644 avoid-split-extents-for-DAX-writes
 create mode 100644 convert-DAX-faults-to-iomap-infrastructure
 create mode 100644 convert-dax-reads-to-iomap-infrastructure
 create mode 100644 dax-rip-out-get_block-based-IO-support
 create mode 100644 ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path
 create mode 100644 factor-out-checks-from-ext4_file_write_iter
 create mode 100644 let-S_DAX-set-only-if-DAX-is-really-supported
 create mode 100644 rip-out-DAX-handling-from-direct-IO-path
 create mode 100644 use-iomap-for-zeroing-blocks-in-DAX-mode

diff --git a/DAX-iomap-write-support b/DAX-iomap-write-support
new file mode 100644
index 00000000..cdb2404b
--- /dev/null
+++ b/DAX-iomap-write-support
@@ -0,0 +1,228 @@
+ext4: DAX iomap write support
+
+From: Jan Kara <jack@suse.cz>
+
+Implement DAX writes using the new iomap infrastructure instead of
+overloading the direct IO path.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/file.c  |  40 ++++++++++++++++++
+ fs/ext4/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 160 insertions(+), 6 deletions(-)
+
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 1f25c644cb12..1953fe34f9fe 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+ 	return iov_iter_count(from);
+ }
+ 
++#ifdef CONFIG_FS_DAX
++static ssize_t
++ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
++{
++	struct inode *inode = file_inode(iocb->ki_filp);
++	ssize_t ret;
++	bool overwrite = false;
++
++	inode_lock(inode);
++	ret = ext4_write_checks(iocb, from);
++	if (ret <= 0)
++		goto out;
++	ret = file_remove_privs(iocb->ki_filp);
++	if (ret)
++		goto out;
++	ret = file_update_time(iocb->ki_filp);
++	if (ret)
++		goto out;
++
++	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
++		overwrite = true;
++		downgrade_write(&inode->i_rwsem);
++	}
++	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
++out:
++	if (!overwrite)
++		inode_unlock(inode);
++	else
++		inode_unlock_shared(inode);
++	if (ret > 0)
++		ret = generic_write_sync(iocb, ret);
++	return ret;
++}
++#endif
++
+ static ssize_t
+ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+@@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ 	int overwrite = 0;
+ 	ssize_t ret;
+ 
++#ifdef CONFIG_FS_DAX
++	if (IS_DAX(inode))
++		return ext4_dax_write_iter(iocb, from);
++#endif
++
+ 	inode_lock(inode);
+ 	ret = ext4_write_checks(iocb, from);
+ 	if (ret <= 0)
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index df017ce3e52d..a7079cab645a 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3321,18 +3321,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ 	struct ext4_map_blocks map;
+ 	int ret;
+ 
+-	if (flags & IOMAP_WRITE)
+-		return -EIO;
+-
+ 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ 		return -ERANGE;
+ 
+ 	map.m_lblk = first_block;
+ 	map.m_len = last_block - first_block + 1;
+ 
+-	ret = ext4_map_blocks(NULL, inode, &map, 0);
+-	if (ret < 0)
+-		return ret;
++	if (!(flags & IOMAP_WRITE)) {
++		ret = ext4_map_blocks(NULL, inode, &map, 0);
++	} else {
++		int dio_credits;
++		handle_t *handle;
++		int retries = 0;
++
++		/* Trim mapping request to maximum we can map at once for DIO */
++		if (map.m_len > DIO_MAX_BLOCKS)
++			map.m_len = DIO_MAX_BLOCKS;
++		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
++retry:
++		/*
++		 * Either we allocate blocks and then we don't get unwritten
++		 * extent so we have reserved enough credits, or the blocks
++		 * are already allocated and unwritten and in that case
++		 * extent conversion fits in the credits as well.
++		 */
++		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
++					    dio_credits);
++		if (IS_ERR(handle))
++			return PTR_ERR(handle);
++
++		ret = ext4_map_blocks(handle, inode, &map,
++				      EXT4_GET_BLOCKS_PRE_IO |
++				      EXT4_GET_BLOCKS_CREATE_ZERO);
++		if (ret < 0) {
++			ext4_journal_stop(handle);
++			if (ret == -ENOSPC &&
++			    ext4_should_retry_alloc(inode->i_sb, &retries))
++				goto retry;
++			return ret;
++		}
++		/* For DAX writes we need to zero out unwritten extents */
++		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
++			/*
++			 * We are protected by i_mmap_sem or i_rwsem so we know
++			 * block cannot go away from under us even though we
++			 * dropped i_data_sem. Convert extent to written and
++			 * write zeros there.
++			 */
++			ret = ext4_map_blocks(handle, inode, &map,
++					      EXT4_GET_BLOCKS_CONVERT |
++					      EXT4_GET_BLOCKS_CREATE_ZERO);
++			if (ret < 0) {
++				ext4_journal_stop(handle);
++				return ret;
++			}
++		}
++
++		/*
++		 * If we added blocks beyond i_size we need to make sure they
++		 * will get truncated if we crash before updating i_size in
++		 * ext4_iomap_end().
++		 */
++		if (first_block + map.m_len >
++		    (inode->i_size + (1 << blkbits) - 1) >> blkbits) {
++			int err;
++
++			err = ext4_orphan_add(handle, inode);
++			if (err < 0) {
++				ext4_journal_stop(handle);
++				return err;
++			}
++		}
++		ext4_journal_stop(handle);
++	}
+ 
+ 	iomap->flags = 0;
+ 	iomap->bdev = inode->i_sb->s_bdev;
+@@ -3360,8 +3421,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ 	return 0;
+ }
+ 
++static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
++			  ssize_t written, unsigned flags, struct iomap *iomap)
++{
++	int ret = 0;
++	handle_t *handle;
++	int blkbits = inode->i_blkbits;
++	bool truncate = false;
++
++	if (!(flags & IOMAP_WRITE))
++		return 0;
++
++	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
++	if (IS_ERR(handle)) {
++		ret = PTR_ERR(handle);
++		goto orphan_del;
++	}
++	if (ext4_update_inode_size(inode, offset + written))
++		ext4_mark_inode_dirty(handle, inode);
++	/*
++	 * We may need to truncate allocated but not written blocks beyond EOF.
++	 */
++	if (iomap->offset + iomap->length > 
++	    ALIGN(inode->i_size, 1 << blkbits)) {
++		ext4_lblk_t written_blk, end_blk;
++
++		written_blk = (offset + written) >> blkbits;
++		end_blk = (offset + length) >> blkbits;
++		if (written_blk < end_blk && ext4_can_truncate(inode))
++			truncate = true;
++	}
++	/*
++	 * Remove inode from orphan list if we were extending a inode and
++	 * everything went fine.
++	 */
++	if (!truncate && inode->i_nlink &&
++	    !list_empty(&EXT4_I(inode)->i_orphan))
++		ext4_orphan_del(handle, inode);
++	ext4_journal_stop(handle);
++	if (truncate) {
++		ext4_truncate_failed_write(inode);
++orphan_del:
++		/*
++		 * If truncate failed early the inode might still be on the
++		 * orphan list; we need to make sure the inode is removed from
++		 * the orphan list in that case.
++		 */
++		if (inode->i_nlink)
++			ext4_orphan_del(NULL, inode);
++	}
++	return ret;
++}
++
+ struct iomap_ops ext4_iomap_ops = {
+ 	.iomap_begin		= ext4_iomap_begin,
++	.iomap_end		= ext4_iomap_end,
+ };
+ 
+ #else
+-- 
+2.6.6
+
+
diff --git a/avoid-split-extents-for-DAX-writes b/avoid-split-extents-for-DAX-writes
new file mode 100644
index 00000000..e6c30529
--- /dev/null
+++ b/avoid-split-extents-for-DAX-writes
@@ -0,0 +1,60 @@
+ext4: avoid split extents for DAX writes
+
+From: Jan Kara <jack@suse.cz>
+
+Currently mapping of blocks for DAX writes happen with
+EXT4_GET_BLOCKS_PRE_IO flag set. That has a result that each
+ext4_map_blocks() call creates a separate written extent, although it
+could be merged to the neighboring extents in the extent tree.  The
+reason for using this flag is that in case the extent is unwritten, we
+need to convert it to written one and zero it out. However this "convert
+mapped range to written" operation is already implemented by
+ext4_map_blocks() for the case of data writes into unwritten extent. So
+just use flags for that mode of operation, simplify the code, and avoid
+unnecessary split extents.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/inode.c | 17 -----------------
+ 1 file changed, 17 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index a7079cab645a..3192ec0768d4 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3351,7 +3351,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ 			return PTR_ERR(handle);
+ 
+ 		ret = ext4_map_blocks(handle, inode, &map,
+-				      EXT4_GET_BLOCKS_PRE_IO |
+ 				      EXT4_GET_BLOCKS_CREATE_ZERO);
+ 		if (ret < 0) {
+ 			ext4_journal_stop(handle);
+@@ -3360,22 +3359,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ 				goto retry;
+ 			return ret;
+ 		}
+-		/* For DAX writes we need to zero out unwritten extents */
+-		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+-			/*
+-			 * We are protected by i_mmap_sem or i_rwsem so we know
+-			 * block cannot go away from under us even though we
+-			 * dropped i_data_sem. Convert extent to written and
+-			 * write zeros there.
+-			 */
+-			ret = ext4_map_blocks(handle, inode, &map,
+-					      EXT4_GET_BLOCKS_CONVERT |
+-					      EXT4_GET_BLOCKS_CREATE_ZERO);
+-			if (ret < 0) {
+-				ext4_journal_stop(handle);
+-				return ret;
+-			}
+-		}
+ 
+ 		/*
+ 		 * If we added blocks beyond i_size we need to make sure they
+-- 
+2.6.6
+
+
diff --git a/convert-DAX-faults-to-iomap-infrastructure b/convert-DAX-faults-to-iomap-infrastructure
new file mode 100644
index 00000000..211a7501
--- /dev/null
+++ b/convert-DAX-faults-to-iomap-infrastructure
@@ -0,0 +1,80 @@
+ext4: convert DAX faults to iomap infrastructure
+
+From: Jan Kara <jack@suse.cz>
+
+Convert DAX faults to use iomap infrastructure. We would not have to start
+transaction in ext4_dax_fault() anymore since ext4_iomap_begin takes
+care of that but so far we do that to avoid lock inversion of
+transaction start with DAX entry lock which gets acquired in
+dax_iomap_fault() before calling ->iomap_begin handler.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/file.c  |  9 +++++----
+ fs/ext4/inode.c | 14 +++++++++-----
+ 2 files changed, 14 insertions(+), 9 deletions(-)
+
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 1953fe34f9fe..b5f184493c57 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -275,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ 	if (IS_ERR(handle))
+ 		result = VM_FAULT_SIGBUS;
+ 	else
+-		result = dax_fault(vma, vmf, ext4_dax_get_block);
++		result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+ 
+ 	if (write) {
+ 		if (!IS_ERR(handle))
+@@ -309,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ 
+ 	if (IS_ERR(handle))
+ 		result = VM_FAULT_SIGBUS;
+-	else
+-		result = dax_pmd_fault(vma, addr, pmd, flags,
+-					 ext4_dax_get_block);
++	else {
++		result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
++					     &ext4_iomap_ops);
++	}
+ 
+ 	if (write) {
+ 		if (!IS_ERR(handle))
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 3192ec0768d4..4d71c7bc3524 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3361,12 +3361,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ 		}
+ 
+ 		/*
+-		 * If we added blocks beyond i_size we need to make sure they
++		 * If we added blocks beyond i_size, we need to make sure they
+ 		 * will get truncated if we crash before updating i_size in
+-		 * ext4_iomap_end().
++		 * ext4_iomap_end(). For faults we don't need to do that (and
++		 * even cannot because for orphan list operations inode_lock is
++		 * required) - if we happen to instantiate block beyond i_size,
++		 * it is because we race with truncate which has already added
++		 * the inode to the orphan list.
+ 		 */
+-		if (first_block + map.m_len >
+-		    (inode->i_size + (1 << blkbits) - 1) >> blkbits) {
++		if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
++		    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
+ 			int err;
+ 
+ 			err = ext4_orphan_add(handle, inode);
+@@ -3412,7 +3416,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ 	int blkbits = inode->i_blkbits;
+ 	bool truncate = false;
+ 
+-	if (!(flags & IOMAP_WRITE))
++	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+ 		return 0;
+ 
+ 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+-- 
+2.6.6
diff --git a/convert-dax-reads-to-iomap-infrastructure b/convert-dax-reads-to-iomap-infrastructure
new file mode 100644
index 00000000..2ae825a6
--- /dev/null
+++ b/convert-dax-reads-to-iomap-infrastructure
@@ -0,0 +1,161 @@
+ext4: convert DAX reads to iomap infrastructure
+
+From: Jan Kara <jack@suse.cz>
+
+Implement basic iomap_begin function that handles reading and use it for
+DAX reads.
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h  |  2 ++
+ fs/ext4/file.c  | 38 +++++++++++++++++++++++++++++++++++++-
+ fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 93 insertions(+), 1 deletion(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 282a51b07c57..098b39910001 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -3271,6 +3271,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
+ 	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
+ }
+ 
++extern struct iomap_ops ext4_iomap_ops;
++
+ #endif	/* __KERNEL__ */
+ 
+ #define EFSBADCRC	EBADMSG		/* Bad CRC detected */
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 9facb4dc5c70..1f25c644cb12 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -31,6 +31,42 @@
+ #include "xattr.h"
+ #include "acl.h"
+ 
++#ifdef CONFIG_FS_DAX
++static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
++{
++	struct inode *inode = file_inode(iocb->ki_filp);
++	ssize_t ret;
++
++	inode_lock_shared(inode);
++	/*
++	 * Recheck under inode lock - at this point we are sure it cannot
++	 * change anymore
++	 */
++	if (!IS_DAX(inode)) {
++		inode_unlock_shared(inode);
++		/* Fallback to buffered IO in case we cannot support DAX */
++		return generic_file_read_iter(iocb, to);
++	}
++	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
++	inode_unlock_shared(inode);
++
++	file_accessed(iocb->ki_filp);
++	return ret;
++}
++#endif
++
++static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
++{
++	if (!iov_iter_count(to))
++		return 0; /* skip atime */
++
++#ifdef CONFIG_FS_DAX
++	if (IS_DAX(file_inode(iocb->ki_filp)))
++		return ext4_dax_read_iter(iocb, to);
++#endif
++	return generic_file_read_iter(iocb, to);
++}
++
+ /*
+  * Called when an inode is released. Note that this is different
+  * from ext4_file_open: open gets called at every open, but release
+@@ -690,7 +726,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
+ 
+ const struct file_operations ext4_file_operations = {
+ 	.llseek		= ext4_llseek,
+-	.read_iter	= generic_file_read_iter,
++	.read_iter	= ext4_file_read_iter,
+ 	.write_iter	= ext4_file_write_iter,
+ 	.unlocked_ioctl = ext4_ioctl,
+ #ifdef CONFIG_COMPAT
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 5337828c68a7..83e8411370d3 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -37,6 +37,7 @@
+ #include <linux/printk.h>
+ #include <linux/slab.h>
+ #include <linux/bitops.h>
++#include <linux/iomap.h>
+ 
+ #include "ext4_jbd2.h"
+ #include "xattr.h"
+@@ -3310,6 +3311,59 @@ int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+ 	clear_buffer_new(bh_result);
+ 	return 0;
+ }
++
++static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
++			    unsigned flags, struct iomap *iomap)
++{
++	unsigned int blkbits = inode->i_blkbits;
++	unsigned long first_block = offset >> blkbits;
++	unsigned long last_block = (offset + length - 1) >> blkbits;
++	struct ext4_map_blocks map;
++	int ret;
++
++	if (flags & IOMAP_WRITE)
++		return -EIO;
++
++	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
++		return -ERANGE;
++
++	map.m_lblk = first_block;
++	map.m_len = last_block - first_block + 1;
++
++	ret = ext4_map_blocks(NULL, inode, &map, 0);
++	if (ret < 0)
++		return ret;
++
++	iomap->flags = 0;
++	iomap->bdev = inode->i_sb->s_bdev;
++	iomap->offset = first_block << blkbits;
++
++	if (ret == 0) {
++		iomap->type = IOMAP_HOLE;
++		iomap->blkno = IOMAP_NULL_BLOCK;
++		iomap->length = (u64)map.m_len << blkbits;
++	} else {
++		if (map.m_flags & EXT4_MAP_MAPPED) {
++			iomap->type = IOMAP_MAPPED;
++		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
++			iomap->type = IOMAP_UNWRITTEN;
++		} else {
++			WARN_ON_ONCE(1);
++			return -EIO;
++		}
++		iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
++		iomap->length = (u64)map.m_len << blkbits;
++	}
++
++	if (map.m_flags & EXT4_MAP_NEW)
++		iomap->flags |= IOMAP_F_NEW;
++	return 0;
++}
++
++struct iomap_ops ext4_iomap_ops = {
++	.iomap_begin		= ext4_iomap_begin,
++};
++
+ #else
+ /* Just define empty function, it will never get called. */
+ int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+-- 
+2.6.6
+
+
diff --git a/dax-rip-out-get_block-based-IO-support b/dax-rip-out-get_block-based-IO-support
new file mode 100644
index 00000000..f1e3c0d9
--- /dev/null
+++ b/dax-rip-out-get_block-based-IO-support
@@ -0,0 +1,437 @@
+dax: rip out get_block based IO support
+
+From: Jan Kara <jack@suse.cz>
+
+No one uses functions using the get_block callback anymore. Rip them
+out and update documentation.
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ Documentation/filesystems/dax.txt |  22 +--
+ fs/dax.c                          | 315 --------------------------------------
+ include/linux/dax.h               |  12 --
+ 3 files changed, 11 insertions(+), 338 deletions(-)
+
+diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
+index 23d18b8a49d5..a7e6e14aeb08 100644
+--- a/Documentation/filesystems/dax.txt
++++ b/Documentation/filesystems/dax.txt
+@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
+ Filesystem support consists of
+ - adding support to mark inodes as being DAX by setting the S_DAX flag in
+   i_flags
+-- implementing the direct_IO address space operation, and calling
+-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
++- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
++  when inode has S_DAX flag set
+ - implementing an mmap file operation for DAX files which sets the
+   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+-  include handlers for fault, pmd_fault and page_mkwrite (which should
+-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+-  appropriate get_block() callback)
+-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
+-- calling dax_zero_page_range() instead of zero_user() for DAX files
++  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
++  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
++  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
++  iomap operations.
++- calling iomap_zero_range() passing appropriate iomap operations instead of
++  block_truncate_page() for DAX files
+ - ensuring that there is sufficient locking between reads, writes,
+   truncates and page faults
+ 
+-The get_block() callback passed to the DAX functions may return
+-uninitialised extents.  If it does, it must ensure that simultaneous
+-calls to get_block() (for example by a page-fault racing with a read()
+-or a write()) work correctly.
++The iomap handlers for allocating blocks must make sure that allocated blocks
++are zeroed out and converted to written extents before being returned to avoid
++exposure of uninitialized data through mmap.
+ 
+ These filesystems may be used for inspiration:
+ - ext2: see Documentation/filesystems/ext2.txt
+diff --git a/fs/dax.c b/fs/dax.c
+index 28af41b9da3a..ad131cd2605d 100644
+--- a/fs/dax.c
++++ b/fs/dax.c
+@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+ 	return page;
+ }
+ 
+-static bool buffer_written(struct buffer_head *bh)
+-{
+-	return buffer_mapped(bh) && !buffer_unwritten(bh);
+-}
+-
+-static sector_t to_sector(const struct buffer_head *bh,
+-		const struct inode *inode)
+-{
+-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+-
+-	return sector;
+-}
+-
+-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
+-		      loff_t start, loff_t end, get_block_t get_block,
+-		      struct buffer_head *bh)
+-{
+-	loff_t pos = start, max = start, bh_max = start;
+-	bool hole = false;
+-	struct block_device *bdev = NULL;
+-	int rw = iov_iter_rw(iter), rc;
+-	long map_len = 0;
+-	struct blk_dax_ctl dax = {
+-		.addr = ERR_PTR(-EIO),
+-	};
+-	unsigned blkbits = inode->i_blkbits;
+-	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+-								>> blkbits;
+-
+-	if (rw == READ)
+-		end = min(end, i_size_read(inode));
+-
+-	while (pos < end) {
+-		size_t len;
+-		if (pos == max) {
+-			long page = pos >> PAGE_SHIFT;
+-			sector_t block = page << (PAGE_SHIFT - blkbits);
+-			unsigned first = pos - (block << blkbits);
+-			long size;
+-
+-			if (pos == bh_max) {
+-				bh->b_size = PAGE_ALIGN(end - pos);
+-				bh->b_state = 0;
+-				rc = get_block(inode, block, bh, rw == WRITE);
+-				if (rc)
+-					break;
+-				bh_max = pos - first + bh->b_size;
+-				bdev = bh->b_bdev;
+-				/*
+-				 * We allow uninitialized buffers for writes
+-				 * beyond EOF as those cannot race with faults
+-				 */
+-				WARN_ON_ONCE(
+-					(buffer_new(bh) && block < file_blks) ||
+-					(rw == WRITE && buffer_unwritten(bh)));
+-			} else {
+-				unsigned done = bh->b_size -
+-						(bh_max - (pos - first));
+-				bh->b_blocknr += done >> blkbits;
+-				bh->b_size -= done;
+-			}
+-
+-			hole = rw == READ && !buffer_written(bh);
+-			if (hole) {
+-				size = bh->b_size - first;
+-			} else {
+-				dax_unmap_atomic(bdev, &dax);
+-				dax.sector = to_sector(bh, inode);
+-				dax.size = bh->b_size;
+-				map_len = dax_map_atomic(bdev, &dax);
+-				if (map_len < 0) {
+-					rc = map_len;
+-					break;
+-				}
+-				dax.addr += first;
+-				size = map_len - first;
+-			}
+-			/*
+-			 * pos + size is one past the last offset for IO,
+-			 * so pos + size can overflow loff_t at extreme offsets.
+-			 * Cast to u64 to catch this and get the true minimum.
+-			 */
+-			max = min_t(u64, pos + size, end);
+-		}
+-
+-		if (iov_iter_rw(iter) == WRITE) {
+-			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
+-		} else if (!hole)
+-			len = copy_to_iter((void __force *) dax.addr, max - pos,
+-					iter);
+-		else
+-			len = iov_iter_zero(max - pos, iter);
+-
+-		if (!len) {
+-			rc = -EFAULT;
+-			break;
+-		}
+-
+-		pos += len;
+-		if (!IS_ERR(dax.addr))
+-			dax.addr += len;
+-	}
+-
+-	dax_unmap_atomic(bdev, &dax);
+-
+-	return (pos == start) ? rc : pos - start;
+-}
+-
+-/**
+- * dax_do_io - Perform I/O to a DAX file
+- * @iocb: The control block for this I/O
+- * @inode: The file which the I/O is directed at
+- * @iter: The addresses to do I/O from or to
+- * @get_block: The filesystem method used to translate file offsets to blocks
+- * @end_io: A filesystem callback for I/O completion
+- * @flags: See below
+- *
+- * This function uses the same locking scheme as do_blockdev_direct_IO:
+- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+- * caller for writes.  For reads, we take and release the i_mutex ourselves.
+- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+- * is in progress.
+- */
+-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
+-		  struct iov_iter *iter, get_block_t get_block,
+-		  dio_iodone_t end_io, int flags)
+-{
+-	struct buffer_head bh;
+-	ssize_t retval = -EINVAL;
+-	loff_t pos = iocb->ki_pos;
+-	loff_t end = pos + iov_iter_count(iter);
+-
+-	memset(&bh, 0, sizeof(bh));
+-	bh.b_bdev = inode->i_sb->s_bdev;
+-
+-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
+-		inode_lock(inode);
+-
+-	/* Protects against truncate */
+-	if (!(flags & DIO_SKIP_DIO_COUNT))
+-		inode_dio_begin(inode);
+-
+-	retval = dax_io(inode, iter, pos, end, get_block, &bh);
+-
+-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
+-		inode_unlock(inode);
+-
+-	if (end_io) {
+-		int err;
+-
+-		err = end_io(iocb, pos, retval, bh.b_private);
+-		if (err)
+-			retval = err;
+-	}
+-
+-	if (!(flags & DIO_SKIP_DIO_COUNT))
+-		inode_dio_end(inode);
+-	return retval;
+-}
+-EXPORT_SYMBOL_GPL(dax_do_io);
+-
+ /*
+  * DAX radix tree locking
+  */
+@@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping,
+ }
+ 
+ /**
+- * dax_fault - handle a page fault on a DAX file
+- * @vma: The virtual memory area where the fault occurred
+- * @vmf: The description of the fault
+- * @get_block: The filesystem method used to translate file offsets to blocks
+- *
+- * When a page fault occurs, filesystems may call this helper in their
+- * fault handler for DAX files. dax_fault() assumes the caller has done all
+- * the necessary locking for the page fault to proceed successfully.
+- */
+-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+-			get_block_t get_block)
+-{
+-	struct file *file = vma->vm_file;
+-	struct address_space *mapping = file->f_mapping;
+-	struct inode *inode = mapping->host;
+-	void *entry;
+-	struct buffer_head bh;
+-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+-	unsigned blkbits = inode->i_blkbits;
+-	sector_t block;
+-	pgoff_t size;
+-	int error;
+-	int major = 0;
+-
+-	/*
+-	 * Check whether offset isn't beyond end of file now. Caller is supposed
+-	 * to hold locks serializing us with truncate / punch hole so this is
+-	 * a reliable test.
+-	 */
+-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+-	if (vmf->pgoff >= size)
+-		return VM_FAULT_SIGBUS;
+-
+-	memset(&bh, 0, sizeof(bh));
+-	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+-	bh.b_bdev = inode->i_sb->s_bdev;
+-	bh.b_size = PAGE_SIZE;
+-
+-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+-	if (IS_ERR(entry)) {
+-		error = PTR_ERR(entry);
+-		goto out;
+-	}
+-
+-	error = get_block(inode, block, &bh, 0);
+-	if (!error && (bh.b_size < PAGE_SIZE))
+-		error = -EIO;		/* fs corruption? */
+-	if (error)
+-		goto unlock_entry;
+-
+-	if (vmf->cow_page) {
+-		struct page *new_page = vmf->cow_page;
+-		if (buffer_written(&bh))
+-			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+-					bh.b_size, new_page, vaddr);
+-		else
+-			clear_user_highpage(new_page, vaddr);
+-		if (error)
+-			goto unlock_entry;
+-		if (!radix_tree_exceptional_entry(entry)) {
+-			vmf->page = entry;
+-			return VM_FAULT_LOCKED;
+-		}
+-		vmf->entry = entry;
+-		return VM_FAULT_DAX_LOCKED;
+-	}
+-
+-	if (!buffer_mapped(&bh)) {
+-		if (vmf->flags & FAULT_FLAG_WRITE) {
+-			error = get_block(inode, block, &bh, 1);
+-			count_vm_event(PGMAJFAULT);
+-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+-			major = VM_FAULT_MAJOR;
+-			if (!error && (bh.b_size < PAGE_SIZE))
+-				error = -EIO;
+-			if (error)
+-				goto unlock_entry;
+-		} else {
+-			return dax_load_hole(mapping, entry, vmf);
+-		}
+-	}
+-
+-	/* Filesystem should not return unwritten buffers to us! */
+-	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
+-	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+-			bh.b_size, &entry, vma, vmf);
+- unlock_entry:
+-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+- out:
+-	if (error == -ENOMEM)
+-		return VM_FAULT_OOM | major;
+-	/* -EBUSY is fine, somebody else faulted on the same PTE */
+-	if ((error < 0) && (error != -EBUSY))
+-		return VM_FAULT_SIGBUS | major;
+-	return VM_FAULT_NOPAGE | major;
+-}
+-EXPORT_SYMBOL_GPL(dax_fault);
+-
+-/**
+  * dax_pfn_mkwrite - handle first write to DAX page
+  * @vma: The virtual memory area where the fault occurred
+  * @vmf: The description of the fault
+@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ }
+ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+ 
+-/**
+- * dax_zero_page_range - zero a range within a page of a DAX file
+- * @inode: The file being truncated
+- * @from: The file offset that is being truncated to
+- * @length: The number of bytes to zero
+- * @get_block: The filesystem method used to translate file offsets to blocks
+- *
+- * This function can be called by a filesystem when it is zeroing part of a
+- * page in a DAX file.  This is intended for hole-punch operations.  If
+- * you are truncating a file, the helper function dax_truncate_page() may be
+- * more convenient.
+- */
+-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
+-							get_block_t get_block)
+-{
+-	struct buffer_head bh;
+-	pgoff_t index = from >> PAGE_SHIFT;
+-	unsigned offset = from & (PAGE_SIZE-1);
+-	int err;
+-
+-	/* Block boundary? Nothing to do */
+-	if (!length)
+-		return 0;
+-	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
+-		return -EINVAL;
+-
+-	memset(&bh, 0, sizeof(bh));
+-	bh.b_bdev = inode->i_sb->s_bdev;
+-	bh.b_size = PAGE_SIZE;
+-	err = get_block(inode, index, &bh, 0);
+-	if (err < 0 || !buffer_written(&bh))
+-		return err;
+-
+-	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+-			offset, length);
+-}
+-EXPORT_SYMBOL_GPL(dax_zero_page_range);
+-
+-/**
+- * dax_truncate_page - handle a partial page being truncated in a DAX file
+- * @inode: The file being truncated
+- * @from: The file offset that is being truncated to
+- * @get_block: The filesystem method used to translate file offsets to blocks
+- *
+- * Similar to block_truncate_page(), this function can be called by a
+- * filesystem when it is truncating a DAX file to handle the partial page.
+- */
+-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+-{
+-	unsigned length = PAGE_ALIGN(from) - from;
+-	return dax_zero_page_range(inode, from, length, get_block);
+-}
+-EXPORT_SYMBOL_GPL(dax_truncate_page);
+-
+ #ifdef CONFIG_FS_IOMAP
+ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+ {
+diff --git a/include/linux/dax.h b/include/linux/dax.h
+index 8d1a5c47945f..0afade8bd3d7 100644
+--- a/include/linux/dax.h
++++ b/include/linux/dax.h
+@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+ 
+ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
+ 		struct iomap_ops *ops);
+-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
+-		  get_block_t, dio_iodone_t, int flags);
+-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ 			struct iomap_ops *ops);
+-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ 		pgoff_t index, void *entry, bool wake_all);
+@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
+ }
+ #endif
+ 
+-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+-				pmd_t *pmd, unsigned int flags, get_block_t gb)
+-{
+-	return VM_FAULT_FALLBACK;
+-}
+-
+ #ifdef CONFIG_FS_DAX_PMD
+ static inline unsigned int dax_radix_order(void *entry)
+ {
+@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
+ }
+ #endif
+ int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
+ 
+ static inline bool vma_is_dax(struct vm_area_struct *vma)
+ {
+-- 
+2.6.6
+
+
diff --git a/ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path b/ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path
new file mode 100644
index 00000000..efbc8aec
--- /dev/null
+++ b/ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path
@@ -0,0 +1,48 @@
+ext2: use iomap_zero_range() for zeroing truncated page in DAX path
+
+From: Jan Kara <jack@suse.cz>
+
+Currently the last user of ext2_get_blocks() for DAX inodes was
+dax_truncate_page(). Convert that to iomap_zero_range() so that all DAX
+IO uses the iomap path.
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext2/inode.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
+index 41b8b44a391c..046b642f3585 100644
+--- a/fs/ext2/inode.c
++++ b/fs/ext2/inode.c
+@@ -850,6 +850,9 @@ struct iomap_ops ext2_iomap_ops = {
+ 	.iomap_begin		= ext2_iomap_begin,
+ 	.iomap_end		= ext2_iomap_end,
+ };
++#else
++/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
++struct iomap_ops ext2_iomap_ops;
+ #endif /* CONFIG_FS_DAX */
+ 
+ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+@@ -1293,9 +1296,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
+ 
+ 	inode_dio_wait(inode);
+ 
+-	if (IS_DAX(inode))
+-		error = dax_truncate_page(inode, newsize, ext2_get_block);
+-	else if (test_opt(inode->i_sb, NOBH))
++	if (IS_DAX(inode)) {
++		error = iomap_zero_range(inode, newsize,
++					 PAGE_ALIGN(newsize) - newsize, NULL,
++					 &ext2_iomap_ops);
++	} else if (test_opt(inode->i_sb, NOBH))
+ 		error = nobh_truncate_page(inode->i_mapping,
+ 				newsize, ext2_get_block);
+ 	else
+-- 
+2.6.6
+
+
diff --git a/factor-out-checks-from-ext4_file_write_iter b/factor-out-checks-from-ext4_file_write_iter
new file mode 100644
index 00000000..f17aaa10
--- /dev/null
+++ b/factor-out-checks-from-ext4_file_write_iter
@@ -0,0 +1,141 @@
+ext4: factor out checks from ext4_file_write_iter()
+
+From: Jan Kara <jack@suse.cz>
+
+Factor out checks of 'from' and whether we are overwriting out of
+ext4_file_write_iter() so that the function is easier to follow.
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/file.c | 97 ++++++++++++++++++++++++++++++----------------------------
+ 1 file changed, 50 insertions(+), 47 deletions(-)
+
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 2a822d30e73f..9facb4dc5c70 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -88,6 +88,51 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+ 	return 0;
+ }
+ 
++/* Is IO overwriting allocated and initialized blocks? */
++static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
++{
++	struct ext4_map_blocks map;
++	unsigned int blkbits = inode->i_blkbits;
++	int err, blklen;
++
++	if (pos + len > i_size_read(inode))
++		return false;
++
++	map.m_lblk = pos >> blkbits;
++	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
++	blklen = map.m_len;
++
++	err = ext4_map_blocks(NULL, inode, &map, 0);
++	/*
++	 * 'err==len' means that all of the blocks have been preallocated,
++	 * regardless of whether they have been initialized or not. To exclude
++	 * unwritten extents, we need to check m_flags.
++	 */
++	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
++}
++
++static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
++{
++	struct inode *inode = file_inode(iocb->ki_filp);
++	ssize_t ret;
++
++	ret = generic_write_checks(iocb, from);
++	if (ret <= 0)
++		return ret;
++	/*
++	 * If we have encountered a bitmap-format file, the size limit
++	 * is smaller than s_maxbytes, which is for extent-mapped files.
++	 */
++	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
++		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++
++		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
++			return -EFBIG;
++		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
++	}
++	return iov_iter_count(from);
++}
++
+ static ssize_t
+ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+@@ -98,7 +143,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ 	ssize_t ret;
+ 
+ 	inode_lock(inode);
+-	ret = generic_write_checks(iocb, from);
++	ret = ext4_write_checks(iocb, from);
+ 	if (ret <= 0)
+ 		goto out;
+ 
+@@ -114,53 +159,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ 		ext4_unwritten_wait(inode);
+ 	}
+ 
+-	/*
+-	 * If we have encountered a bitmap-format file, the size limit
+-	 * is smaller than s_maxbytes, which is for extent-mapped files.
+-	 */
+-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+-		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-
+-		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
+-			ret = -EFBIG;
+-			goto out;
+-		}
+-		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
+-	}
+-
+ 	iocb->private = &overwrite;
+-	if (o_direct) {
+-		size_t length = iov_iter_count(from);
+-		loff_t pos = iocb->ki_pos;
+-
+-		/* check whether we do a DIO overwrite or not */
+-		if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
+-		    pos + length <= i_size_read(inode)) {
+-			struct ext4_map_blocks map;
+-			unsigned int blkbits = inode->i_blkbits;
+-			int err, len;
+-
+-			map.m_lblk = pos >> blkbits;
+-			map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits);
+-			len = map.m_len;
+-
+-			err = ext4_map_blocks(NULL, inode, &map, 0);
+-			/*
+-			 * 'err==len' means that all of blocks has
+-			 * been preallocated no matter they are
+-			 * initialized or not.  For excluding
+-			 * unwritten extents, we need to check
+-			 * m_flags.  There are two conditions that
+-			 * indicate for initialized extents.  1) If we
+-			 * hit extent cache, EXT4_MAP_MAPPED flag is
+-			 * returned; 2) If we do a real lookup,
+-			 * non-flags are returned.  So we should check
+-			 * these two conditions.
+-			 */
+-			if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+-				overwrite = 1;
+-		}
+-	}
++	/* Check whether we do a DIO overwrite or not */
++	if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
++	    ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
++		overwrite = 1;
+ 
+ 	ret = __generic_file_write_iter(iocb, from);
+ 	inode_unlock(inode);
+-- 
+2.6.6
+
+
diff --git a/let-S_DAX-set-only-if-DAX-is-really-supported b/let-S_DAX-set-only-if-DAX-is-really-supported
new file mode 100644
index 00000000..9fd221d6
--- /dev/null
+++ b/let-S_DAX-set-only-if-DAX-is-really-supported
@@ -0,0 +1,102 @@
+ext4: only set S_DAX if DAX is really supported
+
+From: Jan Kara <jack@suse.cz>
+
+Currently we have S_DAX set inode->i_flags for a regular file whenever
+ext4 is mounted with dax mount option. However in some cases we cannot
+really do DAX - e.g. when inode is marked to use data journalling, when
+inode data is being encrypted, or when inode is stored inline. Make sure
+S_DAX flag is appropriately set/cleared in these cases.
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/inline.c | 10 ++++++++++
+ fs/ext4/inode.c  |  9 ++++++++-
+ fs/ext4/super.c  |  6 ++++++
+ 3 files changed, 24 insertions(+), 1 deletion(-)
+
+diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
+index f74d5ee2cdec..c29678965c3c 100644
+--- a/fs/ext4/inline.c
++++ b/fs/ext4/inline.c
+@@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle,
+ 	EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+ 	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ 	ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
++	/*
++	 * Propagate changes to inode->i_flags as well - e.g. S_DAX may
++	 * get cleared
++	 */
++	ext4_set_inode_flags(inode);
+ 	get_bh(is.iloc.bh);
+ 	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+ 
+@@ -442,6 +447,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
+ 		}
+ 	}
+ 	ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
++	/*
++	 * Propagate changes to inode->i_flags as well - e.g. S_DAX may
++	 * get set.
++	 */
++	ext4_set_inode_flags(inode);
+ 
+ 	get_bh(is.iloc.bh);
+ 	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 3d58b2b477e8..5337828c68a7 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4355,7 +4355,9 @@ void ext4_set_inode_flags(struct inode *inode)
+ 		new_fl |= S_NOATIME;
+ 	if (flags & EXT4_DIRSYNC_FL)
+ 		new_fl |= S_DIRSYNC;
+-	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
++	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) &&
++	    !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
++	    !ext4_encrypted_inode(inode))
+ 		new_fl |= S_DAX;
+ 	inode_set_flags(inode, new_fl,
+ 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
+@@ -5623,6 +5625,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
+ 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ 	}
+ 	ext4_set_aops(inode);
++	/*
++	 * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated.
++	 * E.g. S_DAX may get cleared / set.
++	 */
++	ext4_set_inode_flags(inode);
+ 
+ 	jbd2_journal_unlock_updates(journal);
+ 	percpu_up_write(&sbi->s_journal_flag_rwsem);
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 20da99da0a34..d5b94cc6a74e 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1126,6 +1126,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ 			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ 			ext4_clear_inode_state(inode,
+ 					EXT4_STATE_MAY_INLINE_DATA);
++			/*
++			 * Update inode->i_flags - e.g. S_DAX may get disabled
++			 */
++			ext4_set_inode_flags(inode);
+ 		}
+ 		return res;
+ 	}
+@@ -1140,6 +1144,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ 			len, 0);
+ 	if (!res) {
+ 		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
++		/* Update inode->i_flags - e.g. S_DAX may get disabled */
++		ext4_set_inode_flags(inode);
+ 		res = ext4_mark_inode_dirty(handle, inode);
+ 		if (res)
+ 			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+-- 
+2.6.6
+
+
diff --git a/rip-out-DAX-handling-from-direct-IO-path b/rip-out-DAX-handling-from-direct-IO-path
new file mode 100644
index 00000000..1f2dda7f
--- /dev/null
+++ b/rip-out-DAX-handling-from-direct-IO-path
@@ -0,0 +1,182 @@
+ext4: rip out DAX handling from direct IO path
+
+From: Jan Kara <jack@suse.cz>
+
+Reads and writes for DAX inodes should no longer end up in direct IO
+code. Rip out the support and add a warning.
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h  |  2 --
+ fs/ext4/inode.c | 97 +++++++++------------------------------------------------
+ 2 files changed, 15 insertions(+), 84 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 098b39910001..8b763113a1b8 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -2457,8 +2457,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
+ struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
+ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ 			     struct buffer_head *bh_result, int create);
+-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+-		       struct buffer_head *bh_result, int create);
+ int ext4_get_block(struct inode *inode, sector_t iblock,
+ 		   struct buffer_head *bh_result, int create);
+ int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 4d71c7bc3524..d13f7cb6b1d5 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3272,46 +3272,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
+ }
+ 
+ #ifdef CONFIG_FS_DAX
+-/*
+- * Get block function for DAX IO and mmap faults. It takes care of converting
+- * unwritten extents to written ones and initializes new / converted blocks
+- * to zeros.
+- */
+-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+-		       struct buffer_head *bh_result, int create)
+-{
+-	int ret;
+-
+-	ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
+-	if (!create)
+-		return _ext4_get_block(inode, iblock, bh_result, 0);
+-
+-	ret = ext4_get_block_trans(inode, iblock, bh_result,
+-				   EXT4_GET_BLOCKS_PRE_IO |
+-				   EXT4_GET_BLOCKS_CREATE_ZERO);
+-	if (ret < 0)
+-		return ret;
+-
+-	if (buffer_unwritten(bh_result)) {
+-		/*
+-		 * We are protected by i_mmap_sem or i_mutex so we know block
+-		 * cannot go away from under us even though we dropped
+-		 * i_data_sem. Convert extent to written and write zeros there.
+-		 */
+-		ret = ext4_get_block_trans(inode, iblock, bh_result,
+-					   EXT4_GET_BLOCKS_CONVERT |
+-					   EXT4_GET_BLOCKS_CREATE_ZERO);
+-		if (ret < 0)
+-			return ret;
+-	}
+-	/*
+-	 * At least for now we have to clear BH_New so that DAX code
+-	 * doesn't attempt to zero blocks again in a racy way.
+-	 */
+-	clear_buffer_new(bh_result);
+-	return 0;
+-}
+-
+ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ 			    unsigned flags, struct iomap *iomap)
+ {
+@@ -3465,14 +3425,6 @@ struct iomap_ops ext4_iomap_ops = {
+ 	.iomap_end		= ext4_iomap_end,
+ };
+ 
+-#else
+-/* Just define empty function, it will never get called. */
+-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+-		       struct buffer_head *bh_result, int create)
+-{
+-	BUG();
+-	return 0;
+-}
+ #endif
+ 
+ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+@@ -3594,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+ 	iocb->private = NULL;
+ 	if (overwrite)
+ 		get_block_func = ext4_dio_get_block_overwrite;
+-	else if (IS_DAX(inode)) {
+-		/*
+-		 * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+-		 * writes need zeroing either because they can race with page
+-		 * faults or because they use partial blocks.
+-		 */
+-		if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+-		    ext4_aligned_io(inode, offset, count))
+-			get_block_func = ext4_dio_get_block;
+-		else
+-			get_block_func = ext4_dax_get_block;
+-		dio_flags = DIO_LOCKING;
+-	} else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
++	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ 		   round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+ 		get_block_func = ext4_dio_get_block;
+ 		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+@@ -3620,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+ #ifdef CONFIG_EXT4_FS_ENCRYPTION
+ 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
+ #endif
+-	if (IS_DAX(inode)) {
+-		ret = dax_do_io(iocb, inode, iter, get_block_func,
+-				ext4_end_io_dio, dio_flags);
+-	} else
+-		ret = __blockdev_direct_IO(iocb, inode,
+-					   inode->i_sb->s_bdev, iter,
+-					   get_block_func,
+-					   ext4_end_io_dio, NULL, dio_flags);
++	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
++				   get_block_func, ext4_end_io_dio, NULL,
++				   dio_flags);
+ 
+ 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ 						EXT4_STATE_DIO_UNWRITTEN)) {
+@@ -3696,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+ {
+ 	struct address_space *mapping = iocb->ki_filp->f_mapping;
+ 	struct inode *inode = mapping->host;
++	size_t count = iov_iter_count(iter);
+ 	ssize_t ret;
+ 
+ 	/*
+@@ -3704,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+ 	 * we are protected against page writeback as well.
+ 	 */
+ 	inode_lock_shared(inode);
+-	if (IS_DAX(inode)) {
+-		ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0);
+-	} else {
+-		size_t count = iov_iter_count(iter);
+-
+-		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
+-						   iocb->ki_pos + count);
+-		if (ret)
+-			goto out_unlock;
+-		ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+-					   iter, ext4_dio_get_block,
+-					   NULL, NULL, 0);
+-	}
++	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
++					   iocb->ki_pos + count);
++	if (ret)
++		goto out_unlock;
++	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
++				   iter, ext4_dio_get_block, NULL, NULL, 0);
+ out_unlock:
+ 	inode_unlock_shared(inode);
+ 	return ret;
+@@ -3745,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+ 	if (ext4_has_inline_data(inode))
+ 		return 0;
+ 
++	/* DAX uses iomap path now */
++	if (WARN_ON_ONCE(IS_DAX(inode)))
++		return 0;
++
+ 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+ 	if (iov_iter_rw(iter) == READ)
+ 		ret = ext4_direct_IO_read(iocb, iter);
+-- 
+2.6.6
+
+
diff --git a/series b/series
index 706fd4ed..3456fdf9 100644
--- a/series
+++ b/series
@@ -14,6 +14,17 @@ fix-sb-mount-options-processing
 verify-inodes_per_group-during-mount
 add-sanity-checking-in-count_overhead
 
+factor-out-checks-from-ext4_file_write_iter
+let-S_DAX-set-only-if-DAX-is-really-supported
+convert-dax-reads-to-iomap-infrastructure
+use-iomap-for-zeroing-blocks-in-DAX-mode
+DAX-iomap-write-support
+avoid-split-extents-for-DAX-writes
+convert-DAX-faults-to-iomap-infrastructure
+rip-out-DAX-handling-from-direct-IO-path
+ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path
+dax-rip-out-get_block-based-IO-support
+
 ####################################################
 # unstable patches
 ####################################################
diff --git a/timestamps b/timestamps
index 0bdfbc45..c13a46e4 100755
--- a/timestamps
+++ b/timestamps
@@ -55,8 +55,18 @@ touch -d @1479310377 avoid-lockdep-warning-when-inheriting-encryption-context
 touch -d @1479492024 sanity-check-block-and-cluster-size
 touch -d @1479493466 fix-sb-mount-options-processing
 touch -d @1479493710 verify-inodes_per_group-during-mount
-touch -d @1479494158 series
 touch -d @1479494267 add-sanity-checking-in-count_overhead
 touch -d @1479494327 stable-boundary
-touch -d @1479674600 status
-touch -d @1479676755 timestamps
+touch -d @1479680991 factor-out-checks-from-ext4_file_write_iter
+touch -d @1479681179 let-S_DAX-set-only-if-DAX-is-really-supported
+touch -d @1479681366 convert-dax-reads-to-iomap-infrastructure
+touch -d @1479683285 use-iomap-for-zeroing-blocks-in-DAX-mode
+touch -d @1479683351 DAX-iomap-write-support
+touch -d @1479683409 avoid-split-extents-for-DAX-writes
+touch -d @1479685884 convert-DAX-faults-to-iomap-infrastructure
+touch -d @1479686010 rip-out-DAX-handling-from-direct-IO-path
+touch -d @1479692827 ext2-use-iomap_zero_range-for-zeroing-truncated-page-in-DAX-path
+touch -d @1479692916 dax-rip-out-get_block-based-IO-support
+touch -d @1479695661 series
+touch -d @1479695664 status
+touch -d @1479697791 timestamps
diff --git a/use-iomap-for-zeroing-blocks-in-DAX-mode b/use-iomap-for-zeroing-blocks-in-DAX-mode
new file mode 100644
index 00000000..1b23eaf8
--- /dev/null
+++ b/use-iomap-for-zeroing-blocks-in-DAX-mode
@@ -0,0 +1,36 @@
+ext4: use iomap for zeroing blocks in DAX mode
+
+From: Jan Kara <jack@suse.cz>
+
+Use iomap infrastructure for zeroing blocks when in DAX mode.
+ext4_iomap_begin() handles read requests just fine and that's all that
+is needed for iomap_zero_range().
+
+Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/inode.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 83e8411370d3..df017ce3e52d 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3849,8 +3849,10 @@ static int ext4_block_zero_page_range(handle_t *handle,
+ 	if (length > max || length < 0)
+ 		length = max;
+ 
+-	if (IS_DAX(inode))
+-		return dax_zero_page_range(inode, from, length, ext4_get_block);
++	if (IS_DAX(inode)) {
++		return iomap_zero_range(inode, from, length, NULL,
++					&ext4_iomap_ops);
++	}
+ 	return __ext4_block_zero_page_range(handle, mapping, from, length);
+ }
+ 
+-- 
+2.6.6
+
+
-- 
2.11.4.GIT