From 37204bbebf578bf7b5cb23792c7a73a25cbc7912 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 25 Nov 2014 16:16:46 -0500 Subject: [PATCH] Add commented out patch fix-suboptimal-seek_datahole_extents_traversal --- fix-suboptimal-seek_datahole_extents_traversal | 334 +++++++++++++++++++++++++ series | 3 + 2 files changed, 337 insertions(+) create mode 100644 fix-suboptimal-seek_datahole_extents_traversal diff --git a/fix-suboptimal-seek_datahole_extents_traversal b/fix-suboptimal-seek_datahole_extents_traversal new file mode 100644 index 00000000..51aa9617 --- /dev/null +++ b/fix-suboptimal-seek_datahole_extents_traversal @@ -0,0 +1,334 @@ +ext4: fix suboptimal seek_{data,hole} extents traversal + +From: Dmitry Monakhov + +It is a ridiculous practice to scan an inode block by block; this technique +is applicable only for old indirect files. This takes a significant amount +of time for really large files. Let's reuse ext4_fiemap which already +traverses the inode-tree in the most optimal manner. + +TESTCASE: +ftruncate64(fd, 0); +ftruncate64(fd, 1ULL << 40); +/* lseek will spin very long time */ +lseek64(fd, 0, SEEK_DATA); +lseek64(fd, 0, SEEK_HOLE); + +Original report: https://lkml.org/lkml/2014/10/16/620 + +Signed-off-by: Dmitry Monakhov +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents.c | 4 +- + fs/ext4/file.c | 220 +++++++++++++++++++++++++--------------------------- + 2 files changed, 108 insertions(+), 116 deletions(-) + +### Do we need i_mutex here? 
+ +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 37043d0..11cee53 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -5155,8 +5155,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) +- return generic_block_fiemap(inode, fieinfo, start, len, +- ext4_get_block); ++ return __generic_block_fiemap(inode, fieinfo, start, len, ++ ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index aca7b24..12cbffd 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp) + * we determine this extent as a data or a hole according to whether the + * page cache has data or not. + */ +-static int ext4_find_unwritten_pgoff(struct inode *inode, +- int whence, +- struct ext4_map_blocks *map, +- loff_t *offset) ++static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, ++ loff_t endoff, loff_t *offset) + { + struct pagevec pvec; +- unsigned int blkbits; + pgoff_t index; + pgoff_t end; +- loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + +- blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; +- endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; ++ + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; +@@ -408,147 +403,144 @@ out: + static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) + { + struct inode *inode = file->f_mapping->host; +- struct ext4_map_blocks map; +- struct extent_status es; +- ext4_lblk_t start, last, end; +- loff_t dataoff, isize; +- int blkbits; +- int ret = 0; ++ struct fiemap_extent_info fie; ++ struct fiemap_extent ext[2]; ++ loff_t next; ++ int i, ret = 0; + + mutex_lock(&inode->i_mutex); +- +- isize = i_size_read(inode); 
+- if (offset >= isize) { ++ if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } +- +- blkbits = inode->i_sb->s_blocksize_bits; +- start = offset >> blkbits; +- last = start; +- end = isize >> blkbits; +- dataoff = offset; +- +- do { +- map.m_lblk = last; +- map.m_len = end - last + 1; +- ret = ext4_map_blocks(NULL, inode, &map, 0); +- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { +- if (last != start) +- dataoff = (loff_t)last << blkbits; ++ fie.fi_flags = 0; ++ fie.fi_extents_max = 2; ++ fie.fi_extents_start = (struct fiemap_extent __user *) &ext; ++ while (1) { ++ mm_segment_t old_fs = get_fs(); ++ ++ fie.fi_extents_mapped = 0; ++ memset(ext, 0, sizeof(*ext) * fie.fi_extents_max); ++ ++ set_fs(get_ds()); ++ ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); ++ set_fs(old_fs); ++ if (ret) + break; +- } + +- /* +- * If there is a delay extent at this offset, +- * it will be as a data. +- */ +- ext4_es_find_delayed_extent_range(inode, last, last, &es); +- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { +- if (last != start) +- dataoff = (loff_t)last << blkbits; ++ /* No extents found, EOF */ ++ if (!fie.fi_extents_mapped) { ++ ret = -ENXIO; + break; + } ++ for (i = 0; i < fie.fi_extents_mapped; i++) { ++ next = (loff_t)(ext[i].fe_length + ext[i].fe_logical); + +- /* +- * If there is a unwritten extent at this offset, +- * it will be as a data or a hole according to page +- * cache that has data or not. +- */ +- if (map.m_flags & EXT4_MAP_UNWRITTEN) { +- int unwritten; +- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, +- &map, &dataoff); +- if (unwritten) +- break; +- } ++ if (offset < (loff_t)ext[i].fe_logical) ++ offset = (loff_t)ext[i].fe_logical; ++ /* ++ * If extent is not unwritten, then it contains valid ++ * data, mapped or delayed. 
++ */ ++ if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) ++ goto out; + +- last++; +- dataoff = (loff_t)last << blkbits; +- } while (last <= end); ++ /* ++ * If there is a unwritten extent at this offset, ++ * it will be as a data or a hole according to page ++ * cache that has data or not. ++ */ ++ if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, ++ next, &offset)) ++ goto out; + ++ if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) { ++ ret = -ENXIO; ++ goto out; ++ } ++ offset = next; ++ } ++ } ++ if (offset > inode->i_size) ++ offset = inode->i_size; ++out: + mutex_unlock(&inode->i_mutex); ++ if (ret) ++ return ret; + +- if (dataoff > isize) +- return -ENXIO; +- +- return vfs_setpos(file, dataoff, maxsize); ++ return vfs_setpos(file, offset, maxsize); + } + + /* +- * ext4_seek_hole() retrieves the offset for SEEK_HOLE. ++ * ext4_seek_hole() retrieves the offset for SEEK_HOLE + */ + static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) + { + struct inode *inode = file->f_mapping->host; +- struct ext4_map_blocks map; +- struct extent_status es; +- ext4_lblk_t start, last, end; +- loff_t holeoff, isize; +- int blkbits; +- int ret = 0; ++ struct fiemap_extent_info fie; ++ struct fiemap_extent ext[2]; ++ loff_t next; ++ int i, ret = 0; + + mutex_lock(&inode->i_mutex); +- +- isize = i_size_read(inode); +- if (offset >= isize) { ++ if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + +- blkbits = inode->i_sb->s_blocksize_bits; +- start = offset >> blkbits; +- last = start; +- end = isize >> blkbits; +- holeoff = offset; ++ fie.fi_flags = 0; ++ fie.fi_extents_max = 2; ++ fie.fi_extents_start = (struct fiemap_extent __user *)&ext; ++ while (1) { ++ mm_segment_t old_fs = get_fs(); + +- do { +- map.m_lblk = last; +- map.m_len = end - last + 1; +- ret = ext4_map_blocks(NULL, inode, &map, 0); +- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { +- last += ret; +- holeoff = (loff_t)last << blkbits; +- continue; +- } ++ 
fie.fi_extents_mapped = 0; ++ memset(ext, 0, sizeof(*ext)); + +- /* +- * If there is a delay extent at this offset, +- * we will skip this extent. +- */ +- ext4_es_find_delayed_extent_range(inode, last, last, &es); +- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { +- last = es.es_lblk + es.es_len; +- holeoff = (loff_t)last << blkbits; +- continue; +- } ++ set_fs(get_ds()); ++ ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); ++ set_fs(old_fs); ++ if (ret) ++ break; + +- /* +- * If there is a unwritten extent at this offset, +- * it will be as a data or a hole according to page +- * cache that has data or not. +- */ +- if (map.m_flags & EXT4_MAP_UNWRITTEN) { +- int unwritten; +- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, +- &map, &holeoff); +- if (!unwritten) { +- last += ret; +- holeoff = (loff_t)last << blkbits; ++ /* No extents found */ ++ if (!fie.fi_extents_mapped) ++ break; ++ ++ for (i = 0; i < fie.fi_extents_mapped; i++) { ++ next = (loff_t)(ext[i].fe_logical + ext[i].fe_length); ++ /* ++ * If extent is not unwritten, then it contains valid ++ * data, mapped or delayed. ++ */ ++ if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) { ++ if (offset < (loff_t)ext[i].fe_logical) ++ goto out; ++ offset = next; + continue; + } +- } +- +- /* find a hole */ +- break; +- } while (last <= end); ++ /* ++ * If there is a unwritten extent at this offset, ++ * it will be as a data or a hole according to page ++ * cache that has data or not. 
++ */ ++ if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE, ++ next, &offset)) ++ goto out; + ++ offset = next; ++ if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) ++ goto out; ++ } ++ } ++ if (offset > inode->i_size) ++ offset = inode->i_size; ++out: + mutex_unlock(&inode->i_mutex); ++ if (ret) ++ return ret; + +- if (holeoff > isize) +- holeoff = isize; +- +- return vfs_setpos(file, holeoff, maxsize); ++ return vfs_setpos(file, offset, maxsize); + } + + /* +-- +1.7.1 + +-- +To unsubscribe from this list: send the line "unsubscribe linux-kernel" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html +Please read the FAQ at http://www.tux.org/lkml/ + diff --git a/series b/series index c2392838..2bc6c8fe 100644 --- a/series +++ b/series @@ -26,6 +26,9 @@ introduce-aging-to-extent-status-tree cleanup-gfp-flags-inside-resize-path fix-potential-use-after-free-during-resize +# note: causes generic/285 to loop forever in inline data mode +# fix-suboptimal-seek_datahole_extents_traversal + ########################################## # unstable patches #################################################### -- 2.11.4.GIT