From f1c7b1a908a0a4097a731a566cc221d379569ef4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Thu, 4 Sep 2014 21:33:18 -0400
Subject: [PATCH] save the extent_status patches in the unstable portion of the tree

---
 ...le-in-extent-status-tree-for-ext4_da_map_blocks | 142 +++++
 change-lru-to-round-rubin-in-extent-status-tree | 623 +++++++++++++++++++++
 prepare-to-drop-state_dellaloc_reserved-flag | 5 +-
 series | 5 +
 timestamps | 23 +-
 ...a-garbage-collection-algorithm-to-manage-object | 201 +++++++
 ...-all-reclaimable-objects-for-extent-status-tree | 214 +++++++
 7 files changed, 1202 insertions(+), 11 deletions(-)
 create mode 100644 cache-extent-hole-in-extent-status-tree-for-ext4_da_map_blocks
 create mode 100644 change-lru-to-round-rubin-in-extent-status-tree
 create mode 100644 use-a-garbage-collection-algorithm-to-manage-object
 create mode 100644 use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree

diff --git a/cache-extent-hole-in-extent-status-tree-for-ext4_da_map_blocks b/cache-extent-hole-in-extent-status-tree-for-ext4_da_map_blocks
new file mode 100644
index 00000000..7f14cc94
--- /dev/null
+++ b/cache-extent-hole-in-extent-status-tree-for-ext4_da_map_blocks
@@ -0,0 +1,142 @@
+ext4: cache extent hole in extent status tree for ext4_da_map_blocks()
+
+From: Zheng Liu
+
+Currently the extent status tree doesn't cache an extent hole when a
+write looks up the extent tree to check whether a block has been
+allocated or not.  We don't put the extent hole in the extent cache
+in that case because later this extent might be removed and a new
+delayed extent might be added back.  But this causes a defect when we
+do a lot of writes: if we don't put extent holes in the extent cache,
+every following write also needs to access the extent tree to check
+whether or not a block has been allocated, which amounts to a cache
+miss each time.  This commit fixes this defect.  Meanwhile, if an
+inode has no extents at all, the resulting whole-file hole will also
+be cached.
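+
+To make the effect concrete, here is a toy userspace model of the
+lookup pattern this patch optimizes -- only an illustration with
+made-up names, not ext4 code:
+
+    #include <stdio.h>
+
+    /* The extent status tree, reduced to one cached-hole bit. */
+    static int hole_cached;
+    static int tree_lookups;
+
+    static void da_write(void)
+    {
+            if (hole_cached)
+                    return;         /* cache hit: no extent tree access */
+            tree_lookups++;         /* cache miss: consult the extent tree */
+            hole_cached = 1;        /* new behavior: cache the hole too */
+    }
+
+    int main(void)
+    {
+            int i;
+
+            for (i = 0; i < 1000; i++)
+                    da_write();
+            /* With hole caching only the first write hits the extent
+             * tree; previously all 1000 writes did. */
+            printf("extent tree lookups: %d\n", tree_lookups);
+            return 0;
+    }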
+ +Cc: Andreas Dilger +Cc: Jan Kara +Signed-off-by: Zheng Liu +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 4 +--- + fs/ext4/extents.c | 23 +++++++++-------------- + fs/ext4/inode.c | 6 ++---- + include/trace/events/ext4.h | 3 +-- + 4 files changed, 13 insertions(+), 23 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 6f294d3..7df9220 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -565,10 +565,8 @@ enum { + #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Do not take i_data_sem locking in ext4_map_blocks */ + #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 +- /* Do not put hole in extent cache */ +-#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 ++#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 + + /* + * The bit position of these flags must not overlap with any of the +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 76c2df3..6463d34 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2284,16 +2284,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t block) + { + int depth = ext_depth(inode); +- unsigned long len = 0; +- ext4_lblk_t lblock = 0; ++ unsigned long len; ++ ext4_lblk_t lblock; + struct ext4_extent *ex; + + ex = path[depth].p_ext; + if (ex == NULL) { +- /* +- * there is no extent yet, so gap is [0;-] and we +- * don't cache it +- */ ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCKS; + ext_debug("cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; +@@ -2302,9 +2301,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + block, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); +- if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) +- ext4_es_insert_extent(inode, lblock, len, ~0, +- EXTENT_STATUS_HOLE); + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; +@@ -2318,14 +2314,14 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + block); + BUG_ON(next == lblock); + len = next - lblock; +- if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) +- ext4_es_insert_extent(inode, lblock, len, ~0, +- EXTENT_STATUS_HOLE); + } else { + BUG(); + } + + ext_debug(" -> %u:%lu\n", lblock, len); ++ if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) ++ ext4_es_insert_extent(inode, lblock, len, ~0, ++ EXTENT_STATUS_HOLE); + } + + /* +@@ -4362,8 +4358,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + * put just found gap into cache to speed up + * subsequent requests + */ +- if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) +- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); ++ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + goto out2; + } + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 367a60c..d1ad9f9 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1485,11 +1485,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + retval = 0; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) +- retval = ext4_ext_map_blocks(NULL, inode, map, +- EXT4_GET_BLOCKS_NO_PUT_HOLE); ++ retval = ext4_ext_map_blocks(NULL, inode, map, 0); + else +- retval = ext4_ind_map_blocks(NULL, inode, map, +- EXT4_GET_BLOCKS_NO_PUT_HOLE); ++ retval = ext4_ind_map_blocks(NULL, inode, map, 0); + + add_delayed: + if (retval 
== 0) {
+diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
+index ff4bd1b..9337d36 100644
+--- a/include/trace/events/ext4.h
++++ b/include/trace/events/ext4.h
+@@ -43,8 +43,7 @@ struct extent_status;
+ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \
+ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \
+ { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \
+- { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }, \
+- { EXT4_GET_BLOCKS_NO_PUT_HOLE, "NO_PUT_HOLE" })
++ { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" })
+
+ #define show_mflags(flags) __print_flags(flags, "", \
+ { EXT4_MAP_NEW, "N" }, \
+-- 
+1.7.9.7
+
+
diff --git a/change-lru-to-round-rubin-in-extent-status-tree b/change-lru-to-round-rubin-in-extent-status-tree
new file mode 100644
index 00000000..274d1761
--- /dev/null
+++ b/change-lru-to-round-rubin-in-extent-status-tree
@@ -0,0 +1,623 @@
+ext4: change lru to round-robin in extent status tree shrinker
+
+From: Zheng Liu
+
+In this commit we discard the lru algorithm, because maintaining an
+lru list in the extent status tree shrinker can take a long time, and
+the shrinker can likewise take a long time scanning this lru list in
+order to reclaim some objects.
+
+To reduce the latency, this commit does two things.  The first is to
+replace the lru list with round-robin.  After that we no longer need
+to maintain lru ordering; the list does not need to be sorted when
+the shrinker cannot reclaim any objects in the first round.  The
+second is to shrink the length of the list.  With the round-robin
+algorithm, the shrinker takes the first inode in the list and handles
+it.  If this inode is skipped, it is moved to the tail of the list.
+Otherwise it will be added back when it is touched again.
+
+[ Changed the locking in __es_shrink to avoid the inode potentially
+  disappearing out from under us; this was suggested by Jan -- TYT ]
+
+Cc: Andreas Dilger
+Cc: Jan Kara
+Signed-off-by: Zheng Liu
+Signed-off-by: Theodore Ts'o
+---
+ fs/ext4/ext4.h | 10 +--
+ fs/ext4/extents.c | 4 +-
+ fs/ext4/extents_status.c | 217 ++++++++++++++++++++--------------------------
+ fs/ext4/extents_status.h | 7 +-
+ fs/ext4/inode.c | 4 +-
+ fs/ext4/ioctl.c | 4 +-
+ fs/ext4/super.c | 7 +-
+ include/trace/events/ext4.h | 11 +--
+ 8 files changed, 113 insertions(+), 151 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index f70c3fc..cff567d 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -889,10 +889,9 @@ struct ext4_inode_info {
+ /* extents status tree */
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+- struct list_head i_es_lru;
++ struct list_head i_es_list;
+ unsigned int i_es_all_nr; /* protected by i_es_lock */
+- unsigned int i_es_lru_nr; /* protected by i_es_lock */
+- unsigned long i_touch_when; /* jiffies of last accessing */
++ unsigned int i_es_shk_nr; /* protected by i_es_lock */
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+@@ -1330,10 +1329,11 @@ struct ext4_sb_info {
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+- struct list_head s_es_lru;
++ struct list_head s_es_list;
++ long s_es_nr_inode;
+ struct ext4_es_stats s_es_stats;
+ struct mb_cache *s_mb_cache;
+- spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
++ spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. 
*/ + struct ratelimit_state s_err_ratelimit_state; +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 8170b32..02d871a 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4615,7 +4615,7 @@ out2: + + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + return err ? err : allocated; + } + +@@ -5174,7 +5174,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); + } +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + return error; + } + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 09fd576..4768f7f 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -149,8 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); +-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, +- struct ext4_inode_info *locked_ei); ++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ++ struct ext4_inode_info *locked_ei); + + int __init ext4_init_es(void) + { +@@ -314,9 +314,9 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { +- EXT4_I(inode)->i_es_lru_nr++; ++ EXT4_I(inode)->i_es_shk_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> +- s_es_stats.es_stats_lru_cnt); ++ s_es_stats.es_stats_shk_cnt); + } + + EXT4_I(inode)->i_es_all_nr++; +@@ -330,12 +330,12 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + +- /* Decrease the lru counter when this es is not delayed */ ++ /* Decrease the shrink counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { +- BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); +- EXT4_I(inode)->i_es_lru_nr--; ++ BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); ++ EXT4_I(inode)->i_es_shk_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> +- s_es_stats.es_stats_lru_cnt); ++ s_es_stats.es_stats_shk_cnt); + } + + kmem_cache_free(ext4_es_cachep, es); +@@ -683,8 +683,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + goto error; + retry: + err = __es_insert_extent(inode, &newes); +- if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, +- EXT4_I(inode))) ++ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), ++ 1, EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; +@@ -841,8 +841,8 @@ retry: + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && +- __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, +- EXT4_I(inode))) ++ __es_shrink(EXT4_SB(inode->i_sb), ++ 1, EXT4_I(inode))) + goto retry; + goto out; + } +@@ -921,114 +921,112 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + return err; + } + +-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, +- struct list_head *b) ++static inline void __ext4_es_list_add(struct ext4_sb_info *sbi, ++ struct ext4_inode_info *ei) + { +- struct ext4_inode_info *eia, *eib; +- eia = list_entry(a, struct ext4_inode_info, i_es_lru); +- eib = list_entry(b, struct ext4_inode_info, i_es_lru); ++ if (list_empty(&ei->i_es_list)) { ++ list_add_tail(&ei->i_es_list, 
&sbi->s_es_list); ++ sbi->s_es_nr_inode++; ++ } ++} + +- if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && +- !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) +- return 1; +- if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && +- ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) +- return -1; +- if (eia->i_touch_when == eib->i_touch_when) +- return 0; +- if (time_after(eia->i_touch_when, eib->i_touch_when)) +- return 1; +- else +- return -1; ++void ext4_es_list_add(struct inode *inode) ++{ ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ ++ if (!list_empty(&ei->i_es_list)) ++ return; ++ ++ spin_lock(&sbi->s_es_lock); ++ __ext4_es_list_add(sbi, ei); ++ spin_unlock(&sbi->s_es_lock); + } + +-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, +- struct ext4_inode_info *locked_ei) ++void ext4_es_list_del(struct inode *inode) ++{ ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ ++ spin_lock(&sbi->s_es_lock); ++ if (!list_empty(&ei->i_es_list)) { ++ list_del_init(&ei->i_es_list); ++ WARN_ON_ONCE(sbi->s_es_nr_inode-- < 0); ++ } ++ spin_unlock(&sbi->s_es_lock); ++} ++ ++static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ++ struct ext4_inode_info *locked_ei) + { + struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; +- struct list_head *cur, *tmp; +- LIST_HEAD(skipped); + ktime_t start_time; + u64 scan_time; ++ int nr_to_walk; + int nr_shrunk = 0; +- int retried = 0, skip_precached = 1, nr_skipped = 0; ++ int retried = 0, nr_skipped = 0; + + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); +- spin_lock(&sbi->s_es_lru_lock); + + retry: +- list_for_each_safe(cur, tmp, &sbi->s_es_lru) { ++ spin_lock(&sbi->s_es_lock); ++ nr_to_walk = sbi->s_es_nr_inode; ++ while (!list_empty(&sbi->s_es_list) && nr_to_walk-- > 0) { + int shrunk; + +- /* +- * If we have already reclaimed all extents from extent +- * status tree, just stop the loop immediately. +- */ +- if (percpu_counter_read_positive( +- &es_stats->es_stats_lru_cnt) == 0) +- break; ++ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, ++ i_es_list); + +- ei = list_entry(cur, struct ext4_inode_info, i_es_lru); ++ list_del_init(&ei->i_es_list); ++ sbi->s_es_nr_inode--; ++ if (ei->i_es_shk_nr == 0) ++ continue; + + /* +- * Skip the inode that is newer than the last_sorted +- * time. Normally we try hard to avoid shrinking +- * precached inodes, but we will as a last resort. ++ * Normally we try hard to avoid shrinking precached inodes, ++ * but we will as a last resort. 
+ */ +- if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || +- (skip_precached && ext4_test_inode_state(&ei->vfs_inode, +- EXT4_STATE_EXT_PRECACHED))) { ++ if ((!retried && ext4_test_inode_state(&ei->vfs_inode, ++ EXT4_STATE_EXT_PRECACHED)) || ++ ei == locked_ei || ++ !write_trylock(&ei->i_es_lock)) { + nr_skipped++; +- list_move_tail(cur, &skipped); ++ __ext4_es_list_add(sbi, ei); ++ if (spin_is_contended(&sbi->s_es_lock)) { ++ spin_unlock(&sbi->s_es_lock); ++ spin_lock(&sbi->s_es_lock); ++ } + continue; + } +- +- if (ei->i_es_lru_nr == 0 || ei == locked_ei || +- !write_trylock(&ei->i_es_lock)) +- continue; +- ++ /* we only release s_es_lock once we have i_es_lock */ ++ spin_unlock(&sbi->s_es_lock); + shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); +- if (ei->i_es_lru_nr == 0) +- list_del_init(&ei->i_es_lru); + write_unlock(&ei->i_es_lock); + + nr_shrunk += shrunk; + nr_to_scan -= shrunk; ++ + if (nr_to_scan == 0) +- break; ++ goto out; ++ spin_lock(&sbi->s_es_lock); + } +- +- /* Move the newer inodes into the tail of the LRU list. */ +- list_splice_tail(&skipped, &sbi->s_es_lru); +- INIT_LIST_HEAD(&skipped); ++ spin_unlock(&sbi->s_es_lock); + + /* + * If we skipped any inodes, and we weren't able to make any +- * forward progress, sort the list and try again. ++ * forward progress, try again to scan precached inodes. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; +- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); +- es_stats->es_stats_last_sorted = jiffies; +- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, +- i_es_lru); +- /* +- * If there are no non-precached inodes left on the +- * list, start releasing precached extents. +- */ +- if (ext4_test_inode_state(&ei->vfs_inode, +- EXT4_STATE_EXT_PRECACHED)) +- skip_precached = 0; + goto retry; + } + +- spin_unlock(&sbi->s_es_lru_lock); +- + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + ++out: + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + +@@ -1043,7 +1041,7 @@ retry: + else + es_stats->es_stats_shrunk = nr_shrunk; + +- trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, ++ trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, + nr_skipped, retried); + return nr_shrunk; + } +@@ -1055,7 +1053,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); +- nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); ++ nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); + return nr; + } +@@ -1068,13 +1066,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + +- ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); ++ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + +- nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); ++ nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); + + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); + return nr_shrunk; +@@ -1102,28 +1100,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) + return 0; + + /* here we just find an inode that has the max nr. 
of objects */ +- spin_lock(&sbi->s_es_lru_lock); +- list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { ++ spin_lock(&sbi->s_es_lock); ++ list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } +- spin_unlock(&sbi->s_es_lru_lock); ++ spin_unlock(&sbi->s_es_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), +- percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); ++ percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); +- if (es_stats->es_stats_last_sorted != 0) +- seq_printf(seq, " %u ms last sorted interval\n", +- jiffies_to_msecs(jiffies - +- es_stats->es_stats_last_sorted)); + if (inode_cnt) +- seq_printf(seq, " %d inodes on lru list\n", inode_cnt); ++ seq_printf(seq, " %d inodes on list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); +@@ -1132,7 +1126,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", +- max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, ++ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +@@ -1181,9 +1175,9 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) + { + int err; + +- INIT_LIST_HEAD(&sbi->s_es_lru); +- spin_lock_init(&sbi->s_es_lru_lock); +- sbi->s_es_stats.es_stats_last_sorted = 0; ++ INIT_LIST_HEAD(&sbi->s_es_list); ++ sbi->s_es_nr_inode = 0; ++ spin_lock_init(&sbi->s_es_lock); + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; +@@ -1192,7 +1186,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0); + if (err) + return err; +- err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0); ++ err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0); + if (err) + goto err1; + +@@ -1210,7 +1204,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) + return 0; + + err2: +- percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); ++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); + err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + return err; +@@ -1221,37 +1215,10 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); +- percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); ++ percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); + unregister_shrinker(&sbi->s_es_shrinker); + } + +-void ext4_es_lru_add(struct inode *inode) +-{ +- struct ext4_inode_info *ei = EXT4_I(inode); +- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- +- ei->i_touch_when = jiffies; +- +- if (!list_empty(&ei->i_es_lru)) +- return; +- +- spin_lock(&sbi->s_es_lru_lock); +- if (list_empty(&ei->i_es_lru)) +- list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); +- spin_unlock(&sbi->s_es_lru_lock); +-} +- +-void ext4_es_lru_del(struct inode *inode) +-{ +- struct ext4_inode_info *ei = EXT4_I(inode); +- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- +- 
spin_lock(&sbi->s_es_lru_lock); +- if (!list_empty(&ei->i_es_lru)) +- list_del_init(&ei->i_es_lru); +- spin_unlock(&sbi->s_es_lru_lock); +-} +- + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan) + { +@@ -1263,7 +1230,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +- if (ei->i_es_lru_nr == 0) ++ if (ei->i_es_shk_nr == 0) + return 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index efd5f97..0e6a33e 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -65,14 +65,13 @@ struct ext4_es_tree { + }; + + struct ext4_es_stats { +- unsigned long es_stats_last_sorted; + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; +- struct percpu_counter es_stats_lru_cnt; ++ struct percpu_counter es_stats_shk_cnt; + }; + + extern int __init ext4_init_es(void); +@@ -151,7 +150,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, + + extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); + extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +-extern void ext4_es_lru_add(struct inode *inode); +-extern void ext4_es_lru_del(struct inode *inode); ++extern void ext4_es_list_add(struct inode *inode); ++extern void ext4_es_list_del(struct inode *inode); + + #endif /* _EXT4_EXTENTS_STATUS_H */ +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index d5dd7d4..d2e12b9 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -494,7 +494,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; +@@ -1396,7 +1396,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { +- ext4_es_lru_add(inode); ++ ext4_es_list_add(inode); + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read(&EXT4_I(inode)->i_data_sem); +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 0f2252e..25c9ef0 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -78,8 +78,8 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); +- ext4_es_lru_del(inode1); +- ext4_es_lru_del(inode2); ++ ext4_es_list_del(inode1); ++ ext4_es_list_del(inode2); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 487c65b..3fa98a9 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -883,10 +883,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); +- INIT_LIST_HEAD(&ei->i_es_lru); ++ INIT_LIST_HEAD(&ei->i_es_list); + ei->i_es_all_nr = 0; +- ei->i_es_lru_nr = 0; +- ei->i_touch_when = 0; ++ ei->i_es_shk_nr = 0; + 
ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; +@@ -975,7 +974,7 @@ void ext4_clear_inode(struct inode *inode) + dquot_drop(inode); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); +- ext4_es_lru_del(inode); ++ ext4_es_list_del(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); +diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h +index ff4bd1b..12312ea 100644 +--- a/include/trace/events/ext4.h ++++ b/include/trace/events/ext4.h +@@ -2452,15 +2452,14 @@ TRACE_EVENT(ext4_collapse_range, + + TRACE_EVENT(ext4_es_shrink, + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, +- int skip_precached, int nr_skipped, int retried), ++ int nr_skipped, int retried), + +- TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), ++ TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nr_shrunk ) + __field( unsigned long long, scan_time ) +- __field( int, skip_precached ) + __field( int, nr_skipped ) + __field( int, retried ) + ), +@@ -2469,16 +2468,14 @@ TRACE_EVENT(ext4_es_shrink, + __entry->dev = sb->s_dev; + __entry->nr_shrunk = nr_shrunk; + __entry->scan_time = div_u64(scan_time, 1000); +- __entry->skip_precached = skip_precached; + __entry->nr_skipped = nr_skipped; + __entry->retried = retried; + ), + +- TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " ++ TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu " + "nr_skipped %d retried %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, +- __entry->scan_time, __entry->skip_precached, +- __entry->nr_skipped, __entry->retried) ++ __entry->scan_time, __entry->nr_skipped, __entry->retried) + ); + + #endif /* _TRACE_EXT4_H */ diff --git a/prepare-to-drop-state_dellaloc_reserved-flag b/prepare-to-drop-state_dellaloc_reserved-flag index 217e73f4..d098d724 100644 --- a/prepare-to-drop-state_dellaloc_reserved-flag +++ b/prepare-to-drop-state_dellaloc_reserved-flag @@ -1,8 +1,8 @@ ext4: prepare to drop EXT4_STATE_DELALLOC_RESERVED The EXT4_STATE_DELALLOC_RESERVED flag was originally implemented -because it was too hard to make sure mballoc and get_block flags could -be reliably passed down through all of the codepaths that end up +because it was too hard to make sure the mballoc and get_block flags +could be reliably passed down through all of the codepaths that end up calling ext4_mb_new_blocks(). Since then, we have mb_flags passed down through most of the code @@ -15,6 +15,7 @@ a full regression test run, we can then drop EXT4_STATE_DELALLOC_RESERVED. 
 Signed-off-by: Theodore Ts'o
+Reviewed-by: Jan Kara
 ---
  fs/ext4/balloc.c | 3 +--
  fs/ext4/extents.c | 6 +++++-
diff --git a/series b/series
index d5a11a30..5b846e45 100644
--- a/series
+++ b/series
@@ -45,6 +45,11 @@ renumber-ext4_ex-flags
 stable-boundary
 stable-boundary-undo.patch
 
+cache-extent-hole-in-extent-status-tree-for-ext4_da_map_blocks
+change-lru-to-round-rubin-in-extent-status-tree
+use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree
+use-a-garbage-collection-algorithm-to-manage-object
+
 add-fallocate-mode-blocking-for-debugging
 use-discard-if-possible-in-blkdev_issue_zeroout
 
diff --git a/timestamps b/timestamps
index 7de59b38..e461b94c 100755
--- a/timestamps
+++ b/timestamps
@@ -44,12 +44,17 @@ touch -d @1409621649 enable-block-validity-by-default
 touch -d @1409624130 fix-comments-about-get_blocks
 touch -d @1409624533 improve-extents-status-tree-trace-point
 touch -d @1409624809 track-extent-status-tree-shrinker-delay-stats
-touch -d @1409683916 pass-allocation_request-struct-to-ext4_alloc_branch
-touch -d @1409692836 prepare-to-drop-state_dellaloc_reserved-flag
-touch -d @1409692896 drop-ext4_state-delalloc-reserved
-touch -d @1409696362 series
-touch -d @1409696615 dont-elevate-b_count-before-calling-__jbd2_journal_remove_checkpoint
-touch -d @1409696624 optimize-jbd2_log_do_checkpoint-a-bit
-touch -d @1409696625 stable-boundary
-touch -d @1409698984 status
-touch -d @1409699006 timestamps
+touch -d @1409707592 cache-extent-hole-in-extent-status-tree-for-ext4_da_map_blocks
+touch -d @1409868385 pass-allocation_request-struct-to-ext4_alloc_branch
+touch -d @1409868445 prepare-to-drop-state_dellaloc_reserved-flag
+touch -d @1409868502 drop-ext4_state-delalloc-reserved
+touch -d @1409868562 dont-elevate-b_count-before-calling-__jbd2_journal_remove_checkpoint
+touch -d @1409868569 change-lru-to-round-rubin-in-extent-status-tree
+touch -d @1409868569 optimize-jbd2_log_do_checkpoint-a-bit
+touch -d @1409868569 renumber-ext4_ex-flags
+touch -d @1409868569 use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree
+touch -d @1409868570 stable-boundary
+touch -d @1409868570 use-a-garbage-collection-algorithm-to-manage-object
+touch -d @1409868584 status
+touch -d @1409880735 series
+touch -d @1409880760 timestamps
diff --git a/use-a-garbage-collection-algorithm-to-manage-object b/use-a-garbage-collection-algorithm-to-manage-object
new file mode 100644
index 00000000..f05155db
--- /dev/null
+++ b/use-a-garbage-collection-algorithm-to-manage-object
@@ -0,0 +1,201 @@
+ext4: use a garbage collection algorithm to manage objects
+
+From: Zheng Liu
+
+For keeping useful extent caches in the tree, this commit uses a
+gc-like algorithm to manage objects.  A new flag called '_ACCESSED'
+is defined to track whether an extent cache has been touched or not.
+When the shrinker tries to reclaim some objects, an extent cache is
+moved from the inactive list to the tail of the active list if this
+flag is set.  Objects on the active list are reclaimed only when we
+are under high memory pressure.  As a result, aged extent caches are
+kept around as long as possible. 
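+
+Roughly speaking the reclaim pass is a second-chance scheme.  The
+standalone model below illustrates it -- invented names, userspace
+only, not the patch itself; the real code walks kernel lists under
+i_es_lock:
+
+    #include <stdio.h>
+
+    struct obj {
+            int accessed;   /* models the _ACCESSED flag */
+            int active;     /* 0 = inactive list, 1 = active list */
+            int reclaimed;
+    };
+
+    /* Scan inactive objects first: accessed ones get a second chance
+     * and are promoted to the active list, the others are reclaimed.
+     * Active objects are reclaimed only under high memory pressure
+     * (force != 0). */
+    static int shrink(struct obj *o, int nr, int *nr_to_scan, int force)
+    {
+            int i, nr_shrunk = 0;
+
+            for (i = 0; i < nr && *nr_to_scan; i++) {
+                    if (o[i].active || o[i].reclaimed)
+                            continue;
+                    --*nr_to_scan;
+                    if (o[i].accessed) {
+                            o[i].active = 1;        /* promote */
+                    } else {
+                            o[i].reclaimed = 1;
+                            nr_shrunk++;
+                    }
+            }
+            if (!force)
+                    return nr_shrunk;
+            for (i = 0; i < nr && *nr_to_scan; i++) {
+                    if (!o[i].active || o[i].reclaimed)
+                            continue;
+                    --*nr_to_scan;
+                    o[i].reclaimed = 1;
+                    nr_shrunk++;
+            }
+            return nr_shrunk;
+    }
+
+    int main(void)
+    {
+            struct obj o[4] = { {1, 0, 0}, {0, 0, 0}, {1, 0, 0}, {0, 0, 0} };
+            int budget = 8;
+
+            /* Reclaims the two untouched objects and promotes the two
+             * accessed ones to the active list. */
+            printf("reclaimed %d\n", shrink(o, 4, &budget, 0));
+            return 0;
+    }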
+ +Cc: Andreas Dilger +Cc: Jan Kara +Signed-off-by: Zheng Liu +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents_status.c | 42 +++++++++++++++++++++++++++++++++--------- + fs/ext4/extents_status.h | 31 ++++++++++++++++++++++++------- + 2 files changed, 57 insertions(+), 16 deletions(-) + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 2f81d1e..2f6bb538 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -149,7 +149,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + struct list_head *freeable, +- int *nr_to_scan); ++ int *nr_to_scan, int force); + static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); + +@@ -172,7 +172,8 @@ void ext4_exit_es(void) + void ext4_es_init_tree(struct ext4_es_tree *tree) + { + tree->root = RB_ROOT; +- INIT_LIST_HEAD(&tree->list); ++ INIT_LIST_HEAD(&tree->active_list); ++ INIT_LIST_HEAD(&tree->inactive_list); + tree->cache_es = NULL; + } + +@@ -317,7 +318,7 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { +- list_add_tail(&es->list, &ei->i_es_tree.list); ++ list_add_tail(&es->list, &ei->i_es_tree.inactive_list); + ei->i_es_shk_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); +@@ -787,6 +788,7 @@ out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; + if (found) { + BUG_ON(!es1); ++ ext4_es_mark_accessed(es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; +@@ -1027,7 +1029,7 @@ retry: + } + + nr_shrunk += __es_try_to_reclaim_extents(ei, &freeable, +- &nr_to_scan); ++ &nr_to_scan, retried); + write_unlock(&ei->i_es_lock); + dispose_list(&ei->vfs_inode, &freeable); + +@@ -1048,7 +1050,7 @@ retry: + + if (locked_ei && nr_shrunk == 0) { + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, &freeable, +- &nr_to_scan); ++ &nr_to_scan, 1); + dispose_list(&locked_ei->vfs_inode, &freeable); + } + +@@ -1247,7 +1249,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) + + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + struct list_head *freeable, +- int *nr_to_scan) ++ int *nr_to_scan, int force) + { + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; +@@ -1263,13 +1265,35 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + +- list_for_each_entry_safe(es, tmp, &tree->list, list) { ++ list_for_each_entry_safe(es, tmp, &tree->inactive_list, list) { ++ if (!*nr_to_scan) ++ goto done; ++ --*nr_to_scan; ++ ++ if (ext4_es_is_accessed(es)) { ++ list_move_tail(&es->list, &tree->active_list); ++ continue; ++ } else { ++ rb_erase(&es->rb_node, &tree->root); ++ list_move_tail(&es->list, freeable); ++ nr_shrunk++; ++ } ++ } ++ ++ if (!force) ++ goto done; ++ ++ list_for_each_entry_safe(es, tmp, &tree->active_list, list) { ++ if (!*nr_to_scan) ++ goto done; ++ --*nr_to_scan; ++ + rb_erase(&es->rb_node, &tree->root); + list_move_tail(&es->list, freeable); + nr_shrunk++; +- if (--*nr_to_scan == 0) +- break; + } ++ ++done: + tree->cache_es = NULL; + return nr_shrunk; + } +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index a45c7fe..213e056 100644 +--- a/fs/ext4/extents_status.h ++++ 
b/fs/ext4/extents_status.h
+@@ -29,12 +29,12 @@
+ /*
+  * These flags live in the high bits of extent_status.es_pblk
+  */
+-#define ES_SHIFT 60
++#define ES_SHIFT 59
+
+-#define EXTENT_STATUS_WRITTEN (1 << 3)
+-#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+-#define EXTENT_STATUS_DELAYED (1 << 1)
+-#define EXTENT_STATUS_HOLE (1 << 0)
++#define EXTENT_STATUS_WRITTEN (1 << 4)
++#define EXTENT_STATUS_UNWRITTEN (1 << 3)
++#define EXTENT_STATUS_DELAYED (1 << 2)
++#define EXTENT_STATUS_HOLE (1 << 1)
+
+ #define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
+ EXTENT_STATUS_UNWRITTEN | \
+@@ -45,9 +45,10 @@
+ #define ES_UNWRITTEN (1ULL << 62)
+ #define ES_DELAYED (1ULL << 61)
+ #define ES_HOLE (1ULL << 60)
++#define ES_ACCESSED (1ULL << 59)
+
+ #define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+- ES_DELAYED | ES_HOLE)
++ ES_DELAYED | ES_HOLE | ES_ACCESSED)
+
+ struct ext4_sb_info;
+ struct ext4_extent;
+@@ -62,7 +63,8 @@ struct extent_status {
+
+ struct ext4_es_tree {
+ struct rb_root root;
+- struct list_head list;
++ struct list_head active_list;
++ struct list_head inactive_list;
+ struct extent_status *cache_es; /* recently accessed extent */
+ };
+
+@@ -114,6 +116,21 @@ static inline int ext4_es_is_hole(struct extent_status *es)
+ return (es->es_pblk & ES_HOLE) != 0;
+ }
+
++static inline int ext4_es_is_accessed(struct extent_status *es)
++{
++ return (es->es_pblk & ES_ACCESSED) != 0;
++}
++
++static inline void ext4_es_mark_accessed(struct extent_status *es)
++{
++ es->es_pblk |= ES_ACCESSED;
++}
++
++static inline void ext4_es_clear_accessed(struct extent_status *es)
++{
++ es->es_pblk &= ~ES_ACCESSED;
++}
++
+ static inline unsigned int ext4_es_status(struct extent_status *es)
+ {
+ return es->es_pblk >> ES_SHIFT;
+-- 
+1.7.9.7
+
diff --git a/use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree b/use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree
new file mode 100644
index 00000000..62530ea9
--- /dev/null
+++ b/use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree
@@ -0,0 +1,214 @@
+ext4: use a list to track all reclaimable objects for extent status tree
+
+From: Zheng Liu
+
+Currently the shrinker can take a long time to reclaim objects from an
+extent status tree because many of its objects are delayed.  These
+objects cannot be reclaimed because ext4 uses them for seeking
+data/hole, finding delayed ranges, and other work.  If an rb-tree has
+a large number of delayed objects, the shrinker has to scan more
+objects and the latency becomes high.  This commit uses a list to
+track all reclaimable objects in order to reduce the latency when the
+shrinker tries to reclaim objects from an extent status tree. 
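+
+The payoff can be seen in a toy model like the one below -- a
+userspace sketch with invented names, not ext4 code.  The shrinker's
+work becomes bounded by the number of reclaimable entries instead of
+by the size of the whole tree:
+
+    #include <stdio.h>
+
+    #define NR_EXTENTS 1000000
+    #define NR_DELAYED  999000      /* almost everything is delayed */
+
+    struct es {
+            int delayed;
+            struct es *next;        /* reclaim list: reclaimable only */
+    };
+
+    static struct es tree[NR_EXTENTS];
+
+    int main(void)
+    {
+            struct es *list = NULL, *e;
+            long visited = 0;
+            int i;
+
+            for (i = 0; i < NR_EXTENTS; i++) {
+                    tree[i].delayed = (i < NR_DELAYED);
+                    if (!tree[i].delayed) {         /* link reclaimable */
+                            tree[i].next = list;
+                            list = &tree[i];
+                    }
+            }
+
+            /* Old scheme: the shrinker visits every tree entry, most
+             * of which are delayed and unreclaimable. */
+            for (i = 0; i < NR_EXTENTS; i++)
+                    visited++;
+            printf("tree walk visits: %ld\n", visited);
+
+            /* New scheme: visit only the reclaim list. */
+            visited = 0;
+            for (e = list; e; e = e->next)
+                    visited++;
+            printf("list walk visits: %ld\n", visited);
+            return 0;
+    }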
+ +Cc: Andreas Dilger +Cc: Jan Kara +Signed-off-by: Zheng Liu +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents_status.c | 74 ++++++++++++++++++++++++++++---------------------- + fs/ext4/extents_status.h | 2 ++ + 2 files changed, 44 insertions(+), 32 deletions(-) + +diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c +index 4768f7f..e845c4f 100644 +--- a/fs/ext4/extents_status.c ++++ b/fs/ext4/extents_status.c +@@ -148,7 +148,8 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes); + static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, +- int nr_to_scan); ++ struct list_head *freeable, ++ int *nr_to_scan); + static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); + +@@ -171,6 +172,7 @@ void ext4_exit_es(void) + void ext4_es_init_tree(struct ext4_es_tree *tree) + { + tree->root = RB_ROOT; ++ INIT_LIST_HEAD(&tree->list); + tree->cache_es = NULL; + } + +@@ -302,6 +304,7 @@ static struct extent_status * + ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) + { ++ struct ext4_inode_info *ei = EXT4_I(inode); + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) +@@ -314,12 +317,13 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { +- EXT4_I(inode)->i_es_shk_nr++; ++ list_add_tail(&es->list, &ei->i_es_tree.list); ++ ei->i_es_shk_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } + +- EXT4_I(inode)->i_es_all_nr++; ++ ei->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + + return es; +@@ -327,13 +331,15 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + + static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) + { +- EXT4_I(inode)->i_es_all_nr--; ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ ++ ei->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + + /* Decrease the shrink counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { +- BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); +- EXT4_I(inode)->i_es_shk_nr--; ++ list_del(&es->list); ++ BUG_ON(ei->i_es_shk_nr-- == 0); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_shk_cnt); + } +@@ -956,11 +962,24 @@ void ext4_es_list_del(struct inode *inode) + spin_unlock(&sbi->s_es_lock); + } + ++static void dispose_list(struct inode *inode, struct list_head *head) ++{ ++ while (!list_empty(head)) { ++ struct extent_status *es; ++ ++ es = list_first_entry(head, struct extent_status, list); ++ list_del_init(&es->list); ++ ++ ext4_es_free_extent(inode, es); ++ } ++} ++ + static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) + { + struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; ++ LIST_HEAD(freeable); + ktime_t start_time; + u64 scan_time; + int nr_to_walk; +@@ -974,8 +993,6 @@ retry: + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (!list_empty(&sbi->s_es_list) && nr_to_walk-- > 0) { +- int shrunk; +- + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + +@@ -1002,11 +1019,10 @@ retry: + } + /* we only release s_es_lock once we have 
i_es_lock */ + spin_unlock(&sbi->s_es_lock); +- shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); ++ nr_shrunk += __es_try_to_reclaim_extents(ei, &freeable, ++ &nr_to_scan); + write_unlock(&ei->i_es_lock); +- +- nr_shrunk += shrunk; +- nr_to_scan -= shrunk; ++ dispose_list(&ei->vfs_inode, &freeable); + + if (nr_to_scan == 0) + goto out; +@@ -1023,8 +1039,11 @@ retry: + goto retry; + } + +- if (locked_ei && nr_shrunk == 0) +- nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); ++ if (locked_ei && nr_shrunk == 0) { ++ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, &freeable, ++ &nr_to_scan); ++ dispose_list(&locked_ei->vfs_inode, &freeable); ++ } + + out: + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); +@@ -1220,12 +1239,12 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) + } + + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, +- int nr_to_scan) ++ struct list_head *freeable, ++ int *nr_to_scan) + { + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; +- struct rb_node *node; +- struct extent_status *es; ++ struct extent_status *es, *tmp; + unsigned long nr_shrunk = 0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); +@@ -1237,21 +1256,12 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + +- node = rb_first(&tree->root); +- while (node != NULL) { +- es = rb_entry(node, struct extent_status, rb_node); +- node = rb_next(&es->rb_node); +- /* +- * We can't reclaim delayed extent from status tree because +- * fiemap, bigallic, and seek_data/hole need to use it. +- */ +- if (!ext4_es_is_delayed(es)) { +- rb_erase(&es->rb_node, &tree->root); +- ext4_es_free_extent(inode, es); +- nr_shrunk++; +- if (--nr_to_scan == 0) +- break; +- } ++ list_for_each_entry_safe(es, tmp, &tree->list, list) { ++ rb_erase(&es->rb_node, &tree->root); ++ list_move_tail(&es->list, freeable); ++ nr_shrunk++; ++ if (--*nr_to_scan == 0) ++ break; + } + tree->cache_es = NULL; + return nr_shrunk; +diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h +index 0e6a33e..a45c7fe 100644 +--- a/fs/ext4/extents_status.h ++++ b/fs/ext4/extents_status.h +@@ -54,6 +54,7 @@ struct ext4_extent; + + struct extent_status { + struct rb_node rb_node; ++ struct list_head list; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +@@ -61,6 +62,7 @@ struct extent_status { + + struct ext4_es_tree { + struct rb_root root; ++ struct list_head list; + struct extent_status *cache_es; /* recently accessed extent */ + }; + -- 2.11.4.GIT