1 ext4: fix suboptimal seek_{data,hole} extents traversal
3 From: Dmitry Monakhov <dmonakhov@openvz.org>
5 It is ridiculous practice to scan an inode block by block; this technique
6 is applicable only to old indirect files. It takes a significant amount
7 of time for really large files. Let's reuse ext4_fiemap(), which already
8 traverses the inode tree in the most optimal manner.
12 ftruncate64(fd, 1ULL << 40);
13 /* lseek will spin very long time */
14 lseek64(fd, 0, SEEK_DATA);
15 lseek64(fd, 0, SEEK_HOLE);
17 Original report: https://lkml.org/lkml/2014/10/16/620
19 Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
20 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
22 fs/ext4/extents.c | 4 +-
23 fs/ext4/file.c | 220 +++++++++++++++++++++++++---------------------------
24 2 files changed, 108 insertions(+), 116 deletions(-)
26 ### Do we need i_mutex here?
28 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
29 index 37043d0..11cee53 100644
30 --- a/fs/ext4/extents.c
31 +++ b/fs/ext4/extents.c
32 @@ -5155,8 +5155,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
34 /* fallback to generic here if not in extents fmt */
35 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
36 - return generic_block_fiemap(inode, fieinfo, start, len,
38 + return __generic_block_fiemap(inode, fieinfo, start, len,
41 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
43 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
44 index aca7b24..12cbffd 100644
47 @@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
48 * we determine this extent as a data or a hole according to whether the
49 * page cache has data or not.
51 -static int ext4_find_unwritten_pgoff(struct inode *inode,
53 - struct ext4_map_blocks *map,
55 +static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
56 + loff_t endoff, loff_t *offset)
59 - unsigned int blkbits;
67 - blkbits = inode->i_sb->s_blocksize_bits;
70 - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
73 index = startoff >> PAGE_CACHE_SHIFT;
74 end = endoff >> PAGE_CACHE_SHIFT;
75 @@ -408,147 +403,144 @@ out:
76 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
78 struct inode *inode = file->f_mapping->host;
79 - struct ext4_map_blocks map;
80 - struct extent_status es;
81 - ext4_lblk_t start, last, end;
82 - loff_t dataoff, isize;
85 + struct fiemap_extent_info fie;
86 + struct fiemap_extent ext[2];
90 mutex_lock(&inode->i_mutex);
92 - isize = i_size_read(inode);
93 - if (offset >= isize) {
94 + if (offset >= inode->i_size) {
95 mutex_unlock(&inode->i_mutex);
99 - blkbits = inode->i_sb->s_blocksize_bits;
100 - start = offset >> blkbits;
102 - end = isize >> blkbits;
107 - map.m_len = end - last + 1;
108 - ret = ext4_map_blocks(NULL, inode, &map, 0);
109 - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
111 - dataoff = (loff_t)last << blkbits;
113 + fie.fi_extents_max = 2;
114 + fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
116 + mm_segment_t old_fs = get_fs();
118 + fie.fi_extents_mapped = 0;
119 + memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
122 + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
129 - * If there is a delay extent at this offset,
130 - * it will be as a data.
132 - ext4_es_find_delayed_extent_range(inode, last, last, &es);
133 - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
135 - dataoff = (loff_t)last << blkbits;
136 + /* No extents found, EOF */
137 + if (!fie.fi_extents_mapped) {
141 + for (i = 0; i < fie.fi_extents_mapped; i++) {
142 + next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
145 - * If there is a unwritten extent at this offset,
146 - * it will be as a data or a hole according to page
147 - * cache that has data or not.
149 - if (map.m_flags & EXT4_MAP_UNWRITTEN) {
151 - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
156 + if (offset < (loff_t)ext[i].fe_logical)
157 + offset = (loff_t)ext[i].fe_logical;
159 + * If extent is not unwritten, then it contains valid
160 + * data, mapped or delayed.
162 + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
166 - dataoff = (loff_t)last << blkbits;
167 - } while (last <= end);
169 + * If there is a unwritten extent at this offset,
170 + * it will be as a data or a hole according to page
171 + * cache that has data or not.
173 + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
177 + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
184 + if (offset > inode->i_size)
185 + offset = inode->i_size;
187 mutex_unlock(&inode->i_mutex);
191 - if (dataoff > isize)
194 - return vfs_setpos(file, dataoff, maxsize);
195 + return vfs_setpos(file, offset, maxsize);
199 - * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
200 + * ext4_seek_hole() retrieves the offset for SEEK_HOLE
202 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
204 struct inode *inode = file->f_mapping->host;
205 - struct ext4_map_blocks map;
206 - struct extent_status es;
207 - ext4_lblk_t start, last, end;
208 - loff_t holeoff, isize;
211 + struct fiemap_extent_info fie;
212 + struct fiemap_extent ext[2];
216 mutex_lock(&inode->i_mutex);
218 - isize = i_size_read(inode);
219 - if (offset >= isize) {
220 + if (offset >= inode->i_size) {
221 mutex_unlock(&inode->i_mutex);
225 - blkbits = inode->i_sb->s_blocksize_bits;
226 - start = offset >> blkbits;
228 - end = isize >> blkbits;
231 + fie.fi_extents_max = 2;
232 + fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
234 + mm_segment_t old_fs = get_fs();
238 - map.m_len = end - last + 1;
239 - ret = ext4_map_blocks(NULL, inode, &map, 0);
240 - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
242 - holeoff = (loff_t)last << blkbits;
245 + fie.fi_extents_mapped = 0;
246 + memset(ext, 0, sizeof(*ext));
249 - * If there is a delay extent at this offset,
250 - * we will skip this extent.
252 - ext4_es_find_delayed_extent_range(inode, last, last, &es);
253 - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
254 - last = es.es_lblk + es.es_len;
255 - holeoff = (loff_t)last << blkbits;
259 + ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
265 - * If there is a unwritten extent at this offset,
266 - * it will be as a data or a hole according to page
267 - * cache that has data or not.
269 - if (map.m_flags & EXT4_MAP_UNWRITTEN) {
271 - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
275 - holeoff = (loff_t)last << blkbits;
276 + /* No extents found */
277 + if (!fie.fi_extents_mapped)
280 + for (i = 0; i < fie.fi_extents_mapped; i++) {
281 + next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
283 + * If extent is not unwritten, then it contains valid
284 + * data, mapped or delayed.
286 + if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
287 + if (offset < (loff_t)ext[i].fe_logical)
296 - } while (last <= end);
298 + * If there is a unwritten extent at this offset,
299 + * it will be as a data or a hole according to page
300 + * cache that has data or not.
302 + if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
307 + if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
311 + if (offset > inode->i_size)
312 + offset = inode->i_size;
314 mutex_unlock(&inode->i_mutex);
318 - if (holeoff > isize)
321 - return vfs_setpos(file, holeoff, maxsize);
322 + return vfs_setpos(file, offset, maxsize);
330 To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
331 the body of a message to majordomo@vger.kernel.org
332 More majordomo info at http://vger.kernel.org/majordomo-info.html
333 Please read the FAQ at http://www.tux.org/lkml/