Update make-the-bitmap-read-routines-return-real-errors-code to not
[ext4-patch-queue.git] / use-pre-zeroed-blocks-for-DAX-page-faults
blobf27b8dd943b18ccb8c7a270e8ffd589bde90c87b
1 ext4: use pre-zeroed blocks for DAX page faults
3 From: Jan Kara <jack@suse.com>
5 Make the DAX fault path use pre-zeroed blocks to avoid races with extent
6 conversion and zeroing when two page faults hit the same block.
8 Signed-off-by: Jan Kara <jack@suse.com>
9 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
10 ---
11  fs/ext4/ext4.h  |  4 +--
12  fs/ext4/file.c  | 20 +++-----------
13  fs/ext4/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++++------------
14  3 files changed, 69 insertions(+), 36 deletions(-)
16 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
17 index 0a11a34d54c9..9b6e5813968b 100644
18 --- a/fs/ext4/ext4.h
19 +++ b/fs/ext4/ext4.h
20 @@ -2284,8 +2284,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
21  struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
22  int ext4_get_block_write(struct inode *inode, sector_t iblock,
23                          struct buffer_head *bh_result, int create);
24 -int ext4_get_block_dax(struct inode *inode, sector_t iblock,
25 -                        struct buffer_head *bh_result, int create);
26 +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
27 +                           struct buffer_head *bh_result, int create);
28  int ext4_get_block(struct inode *inode, sector_t iblock,
29                                 struct buffer_head *bh_result, int create);
30  int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
31 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
32 index 0d24ebcd7c9e..749b222e6498 100644
33 --- a/fs/ext4/file.c
34 +++ b/fs/ext4/file.c
35 @@ -193,18 +193,6 @@ out:
36  }
38  #ifdef CONFIG_FS_DAX
39 -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
41 -       struct inode *inode = bh->b_assoc_map->host;
42 -       /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
43 -       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
44 -       int err;
45 -       if (!uptodate)
46 -               return;
47 -       WARN_ON(!buffer_unwritten(bh));
48 -       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
51  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
52  {
53         int result;
54 @@ -225,8 +213,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
55         if (IS_ERR(handle))
56                 result = VM_FAULT_SIGBUS;
57         else
58 -               result = __dax_fault(vma, vmf, ext4_get_block_dax,
59 -                                               ext4_end_io_unwritten);
60 +               result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
62         if (write) {
63                 if (!IS_ERR(handle))
64 @@ -262,7 +249,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
65                 result = VM_FAULT_SIGBUS;
66         else
67                 result = __dax_pmd_fault(vma, addr, pmd, flags,
68 -                               ext4_get_block_dax, ext4_end_io_unwritten);
69 +                               ext4_dax_mmap_get_block, NULL);
71         if (write) {
72                 if (!IS_ERR(handle))
73 @@ -283,8 +270,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
74         sb_start_pagefault(inode->i_sb);
75         file_update_time(vma->vm_file);
76         down_read(&EXT4_I(inode)->i_mmap_sem);
77 -       err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
78 -                           ext4_end_io_unwritten);
79 +       err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
80         up_read(&EXT4_I(inode)->i_mmap_sem);
81         sb_end_pagefault(inode->i_sb);
83 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
84 index 73c6214084d4..c2a476a35464 100644
85 --- a/fs/ext4/inode.c
86 +++ b/fs/ext4/inode.c
87 @@ -718,16 +718,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
89                 map_bh(bh, inode->i_sb, map.m_pblk);
90                 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
91 -               if (IS_DAX(inode) && buffer_unwritten(bh)) {
92 -                       /*
93 -                        * dgc: I suspect unwritten conversion on ext4+DAX is
94 -                        * fundamentally broken here when there are concurrent
95 -                        * read/write in progress on this inode.
96 -                        */
97 -                       WARN_ON_ONCE(io_end);
98 -                       bh->b_assoc_map = inode->i_mapping;
99 -                       bh->b_private = (void *)(unsigned long)iblock;
100 -               }
101                 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
102                         set_buffer_defer_completion(bh);
103                 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
104 @@ -3050,17 +3040,74 @@ static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
105         return ret;
108 -int ext4_get_block_dax(struct inode *inode, sector_t iblock,
109 -                  struct buffer_head *bh_result, int create)
110 +#ifdef CONFIG_FS_DAX
111 +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
112 +                           struct buffer_head *bh_result, int create)
114 -       int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
115 +       int ret, err;
116 +       int credits;
117 +       struct ext4_map_blocks map;
118 +       handle_t *handle = NULL;
119 +       int flags = 0;
121 -       if (create)
122 -               flags |= EXT4_GET_BLOCKS_CREATE;
123 -       ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
124 +       ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
125                    inode->i_ino, create);
126 -       return _ext4_get_block(inode, iblock, bh_result, flags);
127 +       map.m_lblk = iblock;
128 +       map.m_len = bh_result->b_size >> inode->i_blkbits;
129 +       credits = ext4_chunk_trans_blocks(inode, map.m_len);
130 +       if (create) {
131 +               flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
132 +               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
133 +               if (IS_ERR(handle)) {
134 +                       ret = PTR_ERR(handle);
135 +                       return ret;
136 +               }
137 +       }
139 +       ret = ext4_map_blocks(handle, inode, &map, flags);
140 +       if (create) {
141 +               err = ext4_journal_stop(handle);
142 +               if (ret >= 0 && err < 0)
143 +                       ret = err;
144 +       }
145 +       if (ret <= 0)
146 +               goto out;
147 +       if (map.m_flags & EXT4_MAP_UNWRITTEN) {
148 +               int err2;
150 +               /*
151 +                * We are protected by i_mmap_sem so we know block cannot go
152 +                * away from under us even though we dropped i_data_sem.
153 +                * Convert extent to written and write zeros there.
154 +                *
155 +                * Note: We may get here even when create == 0.
156 +                */
157 +               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
158 +               if (IS_ERR(handle)) {
159 +                       ret = PTR_ERR(handle);
160 +                       goto out;
161 +               }
163 +               err = ext4_map_blocks(handle, inode, &map,
164 +                     EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
165 +               if (err < 0)
166 +                       ret = err;
167 +               err2 = ext4_journal_stop(handle);
168 +               if (err2 < 0 && ret > 0)
169 +                       ret = err2;
170 +       }
171 +out:
172 +       WARN_ON_ONCE(ret == 0 && create);
173 +       if (ret > 0) {
174 +               map_bh(bh_result, inode->i_sb, map.m_pblk);
175 +               bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
176 +                                       map.m_flags;
177 +               bh_result->b_size = map.m_len << inode->i_blkbits;
178 +               ret = 0;
179 +       }
180 +       return ret;
182 +#endif
184  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
185                             ssize_t size, void *private)
186 -- 
187 2.1.4