Fix up tracepoints and add signed off by for btrfs patches
[ext4-patch-queue.git] / fix-suboptimal-seek_datahole_extents_traversal
blob51aa96179920f353cb281c941b6b140a9169a4f5
1 ext4: fix suboptimal seek_{data,hole} extents traversal
3 From: Dmitry Monakhov <dmonakhov@openvz.org>
5 It is ridiculous practice to scan an inode block by block; this technique
6 is applicable only to old indirect files.  It takes a significant amount
7 of time for really large files. Let's reuse ext4_fiemap, which already
8 traverses the inode tree in the most optimal manner.
10 TESTCASE:
11 ftruncate64(fd, 0);
12 ftruncate64(fd, 1ULL << 40);
13 /* lseek will spin for a very long time */
14 lseek64(fd, 0, SEEK_DATA);
15 lseek64(fd, 0, SEEK_HOLE);
17 Original report: https://lkml.org/lkml/2014/10/16/620
19 Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
20 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
21 ---
22  fs/ext4/extents.c |    4 +-
23  fs/ext4/file.c    |  220 +++++++++++++++++++++++++---------------------------
24  2 files changed, 108 insertions(+), 116 deletions(-)
26 ### Do we need i_mutex here?
28 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
29 index 37043d0..11cee53 100644
30 --- a/fs/ext4/extents.c
31 +++ b/fs/ext4/extents.c
32 @@ -5155,8 +5155,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
34         /* fallback to generic here if not in extents fmt */
35         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
36 -               return generic_block_fiemap(inode, fieinfo, start, len,
37 -                       ext4_get_block);
38 +               return __generic_block_fiemap(inode, fieinfo, start, len,
39 +                                             ext4_get_block);
41         if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
42                 return -EBADR;
43 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
44 index aca7b24..12cbffd 100644
45 --- a/fs/ext4/file.c
46 +++ b/fs/ext4/file.c
47 @@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
48   * we determine this extent as a data or a hole according to whether the
49   * page cache has data or not.
50   */
51 -static int ext4_find_unwritten_pgoff(struct inode *inode,
52 -                                    int whence,
53 -                                    struct ext4_map_blocks *map,
54 -                                    loff_t *offset)
55 +static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
56 +                                    loff_t endoff, loff_t *offset)
57  {
58         struct pagevec pvec;
59 -       unsigned int blkbits;
60         pgoff_t index;
61         pgoff_t end;
62 -       loff_t endoff;
63         loff_t startoff;
64         loff_t lastoff;
65         int found = 0;
67 -       blkbits = inode->i_sb->s_blocksize_bits;
68         startoff = *offset;
69         lastoff = startoff;
70 -       endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
73         index = startoff >> PAGE_CACHE_SHIFT;
74         end = endoff >> PAGE_CACHE_SHIFT;
75 @@ -408,147 +403,144 @@ out:
76  static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
77  {
78         struct inode *inode = file->f_mapping->host;
79 -       struct ext4_map_blocks map;
80 -       struct extent_status es;
81 -       ext4_lblk_t start, last, end;
82 -       loff_t dataoff, isize;
83 -       int blkbits;
84 -       int ret = 0;
85 +       struct fiemap_extent_info fie;
86 +       struct fiemap_extent ext[2];
87 +       loff_t next;
88 +       int i, ret = 0;
90         mutex_lock(&inode->i_mutex);
92 -       isize = i_size_read(inode);
93 -       if (offset >= isize) {
94 +       if (offset >= inode->i_size) {
95                 mutex_unlock(&inode->i_mutex);
96                 return -ENXIO;
97         }
99 -       blkbits = inode->i_sb->s_blocksize_bits;
100 -       start = offset >> blkbits;
101 -       last = start;
102 -       end = isize >> blkbits;
103 -       dataoff = offset;
105 -       do {
106 -               map.m_lblk = last;
107 -               map.m_len = end - last + 1;
108 -               ret = ext4_map_blocks(NULL, inode, &map, 0);
109 -               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
110 -                       if (last != start)
111 -                               dataoff = (loff_t)last << blkbits;
112 +       fie.fi_flags = 0;
113 +       fie.fi_extents_max = 2;
114 +       fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
115 +       while (1) {
116 +               mm_segment_t old_fs = get_fs();
118 +               fie.fi_extents_mapped = 0;
119 +               memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
121 +               set_fs(get_ds());
122 +               ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
123 +               set_fs(old_fs);
124 +               if (ret)
125                         break;
126 -               }
128 -               /*
129 -                * If there is a delay extent at this offset,
130 -                * it will be as a data.
131 -                */
132 -               ext4_es_find_delayed_extent_range(inode, last, last, &es);
133 -               if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
134 -                       if (last != start)
135 -                               dataoff = (loff_t)last << blkbits;
136 +               /* No extents found, EOF */
137 +               if (!fie.fi_extents_mapped) {
138 +                       ret = -ENXIO;
139                         break;
140                 }
141 +               for (i = 0; i < fie.fi_extents_mapped; i++) {
142 +                       next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
144 -               /*
145 -                * If there is a unwritten extent at this offset,
146 -                * it will be as a data or a hole according to page
147 -                * cache that has data or not.
148 -                */
149 -               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
150 -                       int unwritten;
151 -                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
152 -                                                             &map, &dataoff);
153 -                       if (unwritten)
154 -                               break;
155 -               }
156 +                       if (offset < (loff_t)ext[i].fe_logical)
157 +                               offset = (loff_t)ext[i].fe_logical;
158 +                       /*
159 +                        * If extent is not unwritten, then it contains valid
160 +                        * data, mapped or delayed.
161 +                        */
162 +                       if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
163 +                               goto out;
165 -               last++;
166 -               dataoff = (loff_t)last << blkbits;
167 -       } while (last <= end);
168 +                       /*
169 +                        * If there is a unwritten extent at this offset,
170 +                        * it will be as a data or a hole according to page
171 +                        * cache that has data or not.
172 +                        */
173 +                       if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
174 +                                                     next, &offset))
175 +                               goto out;
177 +                       if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
178 +                               ret = -ENXIO;
179 +                               goto out;
180 +                       }
181 +                       offset = next;
182 +               }
183 +       }
184 +       if (offset > inode->i_size)
185 +               offset = inode->i_size;
186 +out:
187         mutex_unlock(&inode->i_mutex);
188 +       if (ret)
189 +               return ret;
191 -       if (dataoff > isize)
192 -               return -ENXIO;
194 -       return vfs_setpos(file, dataoff, maxsize);
195 +       return vfs_setpos(file, offset, maxsize);
198  /*
199 - * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
200 + * ext4_seek_hole() retrieves the offset for SEEK_HOLE
201   */
202  static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
204         struct inode *inode = file->f_mapping->host;
205 -       struct ext4_map_blocks map;
206 -       struct extent_status es;
207 -       ext4_lblk_t start, last, end;
208 -       loff_t holeoff, isize;
209 -       int blkbits;
210 -       int ret = 0;
211 +       struct fiemap_extent_info fie;
212 +       struct fiemap_extent ext[2];
213 +       loff_t next;
214 +       int i, ret = 0;
216         mutex_lock(&inode->i_mutex);
218 -       isize = i_size_read(inode);
219 -       if (offset >= isize) {
220 +       if (offset >= inode->i_size) {
221                 mutex_unlock(&inode->i_mutex);
222                 return -ENXIO;
223         }
225 -       blkbits = inode->i_sb->s_blocksize_bits;
226 -       start = offset >> blkbits;
227 -       last = start;
228 -       end = isize >> blkbits;
229 -       holeoff = offset;
230 +       fie.fi_flags = 0;
231 +       fie.fi_extents_max = 2;
232 +       fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
233 +       while (1) {
234 +               mm_segment_t old_fs = get_fs();
236 -       do {
237 -               map.m_lblk = last;
238 -               map.m_len = end - last + 1;
239 -               ret = ext4_map_blocks(NULL, inode, &map, 0);
240 -               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
241 -                       last += ret;
242 -                       holeoff = (loff_t)last << blkbits;
243 -                       continue;
244 -               }
245 +               fie.fi_extents_mapped = 0;
246 +               memset(ext, 0, sizeof(*ext));
248 -               /*
249 -                * If there is a delay extent at this offset,
250 -                * we will skip this extent.
251 -                */
252 -               ext4_es_find_delayed_extent_range(inode, last, last, &es);
253 -               if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
254 -                       last = es.es_lblk + es.es_len;
255 -                       holeoff = (loff_t)last << blkbits;
256 -                       continue;
257 -               }
258 +               set_fs(get_ds());
259 +               ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
260 +               set_fs(old_fs);
261 +               if (ret)
262 +                       break;
264 -               /*
265 -                * If there is a unwritten extent at this offset,
266 -                * it will be as a data or a hole according to page
267 -                * cache that has data or not.
268 -                */
269 -               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
270 -                       int unwritten;
271 -                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
272 -                                                             &map, &holeoff);
273 -                       if (!unwritten) {
274 -                               last += ret;
275 -                               holeoff = (loff_t)last << blkbits;
276 +               /* No extents found */
277 +               if (!fie.fi_extents_mapped)
278 +                       break;
280 +               for (i = 0; i < fie.fi_extents_mapped; i++) {
281 +                       next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
282 +                       /*
283 +                        * If extent is not unwritten, then it contains valid
284 +                        * data, mapped or delayed.
285 +                        */
286 +                       if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
287 +                               if (offset < (loff_t)ext[i].fe_logical)
288 +                                       goto out;
289 +                               offset = next;
290                                 continue;
291                         }
292 -               }
294 -               /* find a hole */
295 -               break;
296 -       } while (last <= end);
297 +                       /*
298 +                        * If there is a unwritten extent at this offset,
299 +                        * it will be as a data or a hole according to page
300 +                        * cache that has data or not.
301 +                        */
302 +                       if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
303 +                                                     next, &offset))
304 +                               goto out;
306 +                       offset = next;
307 +                       if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
308 +                               goto out;
309 +               }
310 +       }
311 +       if (offset > inode->i_size)
312 +               offset = inode->i_size;
313 +out:
314         mutex_unlock(&inode->i_mutex);
315 +       if (ret)
316 +               return ret;
318 -       if (holeoff > isize)
319 -               holeoff = isize;
321 -       return vfs_setpos(file, holeoff, maxsize);
322 +       return vfs_setpos(file, offset, maxsize);
325  /*
326 -- 
327 1.7.1
330 To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
331 the body of a message to majordomo@vger.kernel.org
332 More majordomo info at  http://vger.kernel.org/majordomo-info.html
333 Please read the FAQ at  http://www.tux.org/lkml/