1 ext4: use pre-zeroed blocks for DAX page faults
3 From: Jan Kara <jack@suse.com>
5 Make the DAX fault path use pre-zeroed blocks to avoid races with extent
6 conversion and zeroing when two page faults on the same block happen concurrently.
8 Signed-off-by: Jan Kara <jack@suse.com>
9 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
11 fs/ext4/ext4.h | 4 +--
12 fs/ext4/file.c | 20 +++-----------
13 fs/ext4/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++++------------
14 3 files changed, 69 insertions(+), 36 deletions(-)
16 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
17 index 0a11a34d54c9..9b6e5813968b 100644
20 @@ -2284,8 +2284,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
21 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
22 int ext4_get_block_write(struct inode *inode, sector_t iblock,
23 struct buffer_head *bh_result, int create);
24 -int ext4_get_block_dax(struct inode *inode, sector_t iblock,
25 - struct buffer_head *bh_result, int create);
26 +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
27 + struct buffer_head *bh_result, int create);
28 int ext4_get_block(struct inode *inode, sector_t iblock,
29 struct buffer_head *bh_result, int create);
30 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
31 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
32 index 0d24ebcd7c9e..749b222e6498 100644
35 @@ -193,18 +193,6 @@ out:
39 -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
41 - struct inode *inode = bh->b_assoc_map->host;
42 - /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
43 - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
47 - WARN_ON(!buffer_unwritten(bh));
48 - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
51 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
54 @@ -225,8 +213,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
56 result = VM_FAULT_SIGBUS;
58 - result = __dax_fault(vma, vmf, ext4_get_block_dax,
59 - ext4_end_io_unwritten);
60 + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
64 @@ -262,7 +249,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
65 result = VM_FAULT_SIGBUS;
67 result = __dax_pmd_fault(vma, addr, pmd, flags,
68 - ext4_get_block_dax, ext4_end_io_unwritten);
69 + ext4_dax_mmap_get_block, NULL);
73 @@ -283,8 +270,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
74 sb_start_pagefault(inode->i_sb);
75 file_update_time(vma->vm_file);
76 down_read(&EXT4_I(inode)->i_mmap_sem);
77 - err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
78 - ext4_end_io_unwritten);
79 + err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
80 up_read(&EXT4_I(inode)->i_mmap_sem);
81 sb_end_pagefault(inode->i_sb);
83 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
84 index 73c6214084d4..c2a476a35464 100644
87 @@ -718,16 +718,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
89 map_bh(bh, inode->i_sb, map.m_pblk);
90 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
91 - if (IS_DAX(inode) && buffer_unwritten(bh)) {
93 - * dgc: I suspect unwritten conversion on ext4+DAX is
94 - * fundamentally broken here when there are concurrent
95 - * read/write in progress on this inode.
97 - WARN_ON_ONCE(io_end);
98 - bh->b_assoc_map = inode->i_mapping;
99 - bh->b_private = (void *)(unsigned long)iblock;
101 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
102 set_buffer_defer_completion(bh);
103 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
104 @@ -3050,17 +3040,74 @@ static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
108 -int ext4_get_block_dax(struct inode *inode, sector_t iblock,
109 - struct buffer_head *bh_result, int create)
110 +#ifdef CONFIG_FS_DAX
111 +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
112 + struct buffer_head *bh_result, int create)
114 - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
117 + struct ext4_map_blocks map;
118 + handle_t *handle = NULL;
122 - flags |= EXT4_GET_BLOCKS_CREATE;
123 - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
124 + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
125 inode->i_ino, create);
126 - return _ext4_get_block(inode, iblock, bh_result, flags);
127 + map.m_lblk = iblock;
128 + map.m_len = bh_result->b_size >> inode->i_blkbits;
129 + credits = ext4_chunk_trans_blocks(inode, map.m_len);
131 + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
132 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
133 + if (IS_ERR(handle)) {
134 + ret = PTR_ERR(handle);
139 + ret = ext4_map_blocks(handle, inode, &map, flags);
141 + err = ext4_journal_stop(handle);
142 + if (ret >= 0 && err < 0)
147 + if (map.m_flags & EXT4_MAP_UNWRITTEN) {
151 + * We are protected by i_mmap_sem so we know block cannot go
152 + * away from under us even though we dropped i_data_sem.
153 + * Convert extent to written and write zeros there.
155 + * Note: We may get here even when create == 0.
157 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
158 + if (IS_ERR(handle)) {
159 + ret = PTR_ERR(handle);
163 + err = ext4_map_blocks(handle, inode, &map,
164 + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
167 + err2 = ext4_journal_stop(handle);
168 + if (err2 < 0 && ret > 0)
172 + WARN_ON_ONCE(ret == 0 && create);
174 + map_bh(bh_result, inode->i_sb, map.m_pblk);
175 + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
177 + bh_result->b_size = map.m_len << inode->i_blkbits;
184 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
185 ssize_t size, void *private)