1 ext4: limit number of scanned extents in status tree shrinker
3 From: Jan Kara <jack@suse.cz>
5 Currently we scan extent status trees of inodes until we reclaim nr_to_scan
6 extents. This can however require a lot of scanning when there are lots
7 of delayed extents (as those cannot be reclaimed).
9 Change shrinker to work as shrinkers are supposed to and *scan* only
10 nr_to_scan extents regardless of how many extents we actually
11 reclaim. We however need to be careful and avoid scanning each status
12 tree from the beginning - that could lead to a situation where we would
13 not be able to reclaim anything at all when first nr_to_scan extents in
14 the tree are always unreclaimable. For each inode we remember the offset
15 where we stopped scanning and continue from there when we next come across the inode.
18 Note that we also need to update places calling __es_shrink() manually
19 to pass a reasonable nr_to_scan to have a chance of reclaiming anything and not just 1.
22 Signed-off-by: Jan Kara <jack@suse.cz>
23 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
25 fs/ext4/ext4.h | 5 ++-
26 fs/ext4/extents_status.c | 91 +++++++++++++++++++++++++++++++-----------------
28 3 files changed, 64 insertions(+), 33 deletions(-)
30 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
31 index ede6dd43fe13..f75e04a5d613 100644
34 @@ -881,6 +881,9 @@ struct ext4_inode_info {
35 struct list_head i_es_list;
36 unsigned int i_es_all_nr; /* protected by i_es_lock */
37 unsigned int i_es_shk_nr; /* protected by i_es_lock */
38 + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
39 + extents to shrink. Protected by
43 ext4_group_t i_last_alloc_group;
44 @@ -1321,7 +1324,7 @@ struct ext4_sb_info {
46 /* Reclaim extents from extent status tree */
47 struct shrinker s_es_shrinker;
48 - struct list_head s_es_list;
49 + struct list_head s_es_list; /* List of inodes with reclaimable extents */
51 struct ext4_es_stats s_es_stats;
52 struct mb_cache *s_mb_cache;
53 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
54 index de2d9d8bf22f..8f2aac4006d2 100644
55 --- a/fs/ext4/extents_status.c
56 +++ b/fs/ext4/extents_status.c
57 @@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep;
58 static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
59 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
61 -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
63 +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
64 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
65 struct ext4_inode_info *locked_ei);
67 @@ -716,7 +715,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
69 err = __es_insert_extent(inode, &newes);
70 if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
72 + 128, EXT4_I(inode)))
74 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
76 @@ -874,7 +873,7 @@ retry:
77 es->es_len = orig_es.es_len;
78 if ((err == -ENOMEM) &&
79 __es_shrink(EXT4_SB(inode->i_sb),
81 + 128, EXT4_I(inode)))
85 @@ -976,8 +975,6 @@ retry:
86 spin_lock(&sbi->s_es_lock);
87 nr_to_walk = sbi->s_es_nr_inode;
88 while (nr_to_walk-- > 0) {
91 if (list_empty(&sbi->s_es_list)) {
92 spin_unlock(&sbi->s_es_lock);
94 @@ -985,7 +982,7 @@ retry:
95 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
97 /* Move the inode to the tail */
98 - list_move(&ei->i_es_list, sbi->s_es_list.prev);
99 + list_move_tail(&ei->i_es_list, &sbi->s_es_list);
102 * Normally we try hard to avoid shrinking precached inodes,
103 @@ -1007,13 +1004,10 @@ retry:
105 spin_unlock(&sbi->s_es_lock);
107 - shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
108 + nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
109 write_unlock(&ei->i_es_lock);
111 - nr_shrunk += shrunk;
112 - nr_to_scan -= shrunk;
114 - if (nr_to_scan == 0)
115 + if (nr_to_scan <= 0)
117 spin_lock(&sbi->s_es_lock);
119 @@ -1029,7 +1023,7 @@ retry:
122 if (locked_ei && nr_shrunk == 0)
123 - nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
124 + nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
127 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
128 @@ -1224,27 +1218,33 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
129 unregister_shrinker(&sbi->s_es_shrinker);
132 -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
135 + * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
136 + * most *nr_to_scan extents, update *nr_to_scan accordingly.
138 + * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
139 + * Increment *nr_shrunk by the number of reclaimed extents. Also update
140 + * ei->i_es_shrink_lblk to where we should continue scanning.
142 +static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
143 + int *nr_to_scan, int *nr_shrunk)
145 struct inode *inode = &ei->vfs_inode;
146 struct ext4_es_tree *tree = &ei->i_es_tree;
147 - struct rb_node *node;
148 struct extent_status *es;
149 - unsigned long nr_shrunk = 0;
150 - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
151 - DEFAULT_RATELIMIT_BURST);
153 - if (ei->i_es_shk_nr == 0)
155 + struct rb_node *node;
157 - if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
159 - ext4_warning(inode->i_sb, "forced shrink of precached extents");
160 + es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
163 + node = &es->rb_node;
164 + while (*nr_to_scan > 0) {
165 + if (es->es_lblk > end) {
166 + ei->i_es_shrink_lblk = end + 1;
170 - node = rb_first(&tree->root);
171 - while (node != NULL) {
172 - es = rb_entry(node, struct extent_status, rb_node);
174 node = rb_next(&es->rb_node);
176 * We can't reclaim delayed extent from status tree because
177 @@ -1253,11 +1253,38 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
178 if (!ext4_es_is_delayed(es)) {
179 rb_erase(&es->rb_node, &tree->root);
180 ext4_es_free_extent(inode, es);
182 - if (--nr_to_scan == 0)
188 + es = rb_entry(node, struct extent_status, rb_node);
190 - tree->cache_es = NULL;
191 + ei->i_es_shrink_lblk = es->es_lblk;
194 + ei->i_es_shrink_lblk = 0;
198 +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
200 + struct inode *inode = &ei->vfs_inode;
202 + ext4_lblk_t start = ei->i_es_shrink_lblk;
203 + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
204 + DEFAULT_RATELIMIT_BURST);
206 + if (ei->i_es_shk_nr == 0)
209 + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
211 + ext4_warning(inode->i_sb, "forced shrink of precached extents");
213 + if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
215 + es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
217 + ei->i_es_tree.cache_es = NULL;
220 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
221 index f108d84e7da2..b53c243a142b 100644
222 --- a/fs/ext4/super.c
223 +++ b/fs/ext4/super.c
224 @@ -883,6 +883,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
225 INIT_LIST_HEAD(&ei->i_es_list);
228 + ei->i_es_shrink_lblk = 0;
229 ei->i_reserved_data_blocks = 0;
230 ei->i_reserved_meta_blocks = 0;
231 ei->i_allocated_meta_blocks = 0;
236 To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
237 the body of a message to majordomo@vger.kernel.org
238 More majordomo info at http://vger.kernel.org/majordomo-info.html