1 ext4: fix suboptimal seek_{data,hole} extents traversal
3 From: Dmitry Monakhov <dmonakhov@openvz.org>
5 It is ridiculous practice to scan an inode block by block; this technique
6 is applicable only to old indirect files. It takes a significant amount
7 of time for really large files. Let's reuse ext4_fiemap, which already
8 traverses the inode tree in the most optimal manner.
12 ftruncate64(fd, 1ULL << 40);
13 /* lseek will spin for a very long time */
14 lseek64(fd, 0, SEEK_DATA);
15 lseek64(fd, 0, SEEK_HOLE);
17 Original report: https://lkml.org/lkml/2014/10/16/620
19 Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
20 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
22 fs/ext4/extents.c | 4 +-
23 fs/ext4/file.c | 220 +++++++++++++++++++++++++---------------------------
24 2 files changed, 108 insertions(+), 116 deletions(-)
26 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
27 index bed4308..e5d3ead 100644
28 --- a/fs/ext4/extents.c
29 +++ b/fs/ext4/extents.c
30 @@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
32 /* fallback to generic here if not in extents fmt */
33 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
34 - return generic_block_fiemap(inode, fieinfo, start, len,
36 + return __generic_block_fiemap(inode, fieinfo, start, len,
39 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
41 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
42 index 8131be8..513c12c 100644
45 @@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
46 * we determine this extent as a data or a hole according to whether the
47 * page cache has data or not.
49 -static int ext4_find_unwritten_pgoff(struct inode *inode,
51 - struct ext4_map_blocks *map,
53 +static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
54 + loff_t endoff, loff_t *offset)
57 - unsigned int blkbits;
65 - blkbits = inode->i_sb->s_blocksize_bits;
68 - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
71 index = startoff >> PAGE_CACHE_SHIFT;
72 end = endoff >> PAGE_CACHE_SHIFT;
73 @@ -408,147 +403,144 @@ out:
74 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
76 struct inode *inode = file->f_mapping->host;
77 - struct ext4_map_blocks map;
78 - struct extent_status es;
79 - ext4_lblk_t start, last, end;
80 - loff_t dataoff, isize;
83 + struct fiemap_extent_info fie;
84 + struct fiemap_extent ext[2];
88 mutex_lock(&inode->i_mutex);
90 - isize = i_size_read(inode);
91 - if (offset >= isize) {
92 + if (offset >= inode->i_size) {
93 mutex_unlock(&inode->i_mutex);
97 - blkbits = inode->i_sb->s_blocksize_bits;
98 - start = offset >> blkbits;
100 - end = isize >> blkbits;
105 - map.m_len = end - last + 1;
106 - ret = ext4_map_blocks(NULL, inode, &map, 0);
107 - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
109 - dataoff = (loff_t)last << blkbits;
111 + fie.fi_extents_max = 2;
112 + fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
114 + mm_segment_t old_fs = get_fs();
116 + fie.fi_extents_mapped = 0;
117 + memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
120 + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
127 - * If there is a delay extent at this offset,
128 - * it will be as a data.
130 - ext4_es_find_delayed_extent_range(inode, last, last, &es);
131 - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
133 - dataoff = (loff_t)last << blkbits;
134 + /* No extents found, EOF */
135 + if (!fie.fi_extents_mapped) {
139 + for (i = 0; i < fie.fi_extents_mapped; i++) {
140 + next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
143 - * If there is a unwritten extent at this offset,
144 - * it will be as a data or a hole according to page
145 - * cache that has data or not.
147 - if (map.m_flags & EXT4_MAP_UNWRITTEN) {
149 - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
154 + if (offset < (loff_t)ext[i].fe_logical)
155 + offset = (loff_t)ext[i].fe_logical;
157 + * If extent is not unwritten, then it contains valid
158 + * data, mapped or delayed.
160 + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
164 - dataoff = (loff_t)last << blkbits;
165 - } while (last <= end);
167 + * If there is a unwritten extent at this offset,
168 + * it will be as a data or a hole according to page
169 + * cache that has data or not.
171 + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
175 + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
182 + if (offset > inode->i_size)
183 + offset = inode->i_size;
185 mutex_unlock(&inode->i_mutex);
189 - if (dataoff > isize)
192 - return vfs_setpos(file, dataoff, maxsize);
193 + return vfs_setpos(file, offset, maxsize);
197 - * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
198 + * ext4_seek_hole() retrieves the offset for SEEK_HOLE
200 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
202 struct inode *inode = file->f_mapping->host;
203 - struct ext4_map_blocks map;
204 - struct extent_status es;
205 - ext4_lblk_t start, last, end;
206 - loff_t holeoff, isize;
209 + struct fiemap_extent_info fie;
210 + struct fiemap_extent ext[2];
214 mutex_lock(&inode->i_mutex);
216 - isize = i_size_read(inode);
217 - if (offset >= isize) {
218 + if (offset >= inode->i_size) {
219 mutex_unlock(&inode->i_mutex);
223 - blkbits = inode->i_sb->s_blocksize_bits;
224 - start = offset >> blkbits;
226 - end = isize >> blkbits;
229 + fie.fi_extents_max = 2;
230 + fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
232 + mm_segment_t old_fs = get_fs();
236 - map.m_len = end - last + 1;
237 - ret = ext4_map_blocks(NULL, inode, &map, 0);
238 - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
240 - holeoff = (loff_t)last << blkbits;
243 + fie.fi_extents_mapped = 0;
244 + memset(ext, 0, sizeof(*ext));
247 - * If there is a delay extent at this offset,
248 - * we will skip this extent.
250 - ext4_es_find_delayed_extent_range(inode, last, last, &es);
251 - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
252 - last = es.es_lblk + es.es_len;
253 - holeoff = (loff_t)last << blkbits;
257 + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
263 - * If there is a unwritten extent at this offset,
264 - * it will be as a data or a hole according to page
265 - * cache that has data or not.
267 - if (map.m_flags & EXT4_MAP_UNWRITTEN) {
269 - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
273 - holeoff = (loff_t)last << blkbits;
274 + /* No extents found */
275 + if (!fie.fi_extents_mapped)
278 + for (i = 0; i < fie.fi_extents_mapped; i++) {
279 + next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
281 + * If extent is not unwritten, then it contains valid
282 + * data, mapped or delayed.
284 + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
285 + if (offset < (loff_t)ext[i].fe_logical)
294 - } while (last <= end);
296 + * If there is a unwritten extent at this offset,
297 + * it will be as a data or a hole according to page
298 + * cache that has data or not.
300 + if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
305 + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
309 + if (offset > inode->i_size)
310 + offset = inode->i_size;
312 mutex_unlock(&inode->i_mutex);
316 - if (holeoff > isize)
319 - return vfs_setpos(file, holeoff, maxsize);
320 + return vfs_setpos(file, offset, maxsize);