1 ext4: track extent status tree shrinker delay statistics
3 From: Zheng Liu <wenqing.lz@taobao.com>
5 This commit adds some statistics in extent status tree shrinker. The
6 purpose of adding these is that we want to collect more details when we
7 encounter a stall caused by extent status tree shrinker. Here we count
8 the following statistics:
10 the number of all objects on all extent status trees
11 the number of reclaimable objects on lru list
13 the last sorted interval
14 the number of inodes on lru list
16 scan time for shrinking some objects
17 the number of shrunk objects
19 the inode that has max nr. of objects on lru list
20 the maximum scan time for shrinking some objects
22 The output looks like below:
23 $ cat /proc/fs/ext4/sda1/es_shrinker_info
26 6341 reclaimable objects
27 5281/631 cache hits/misses
28 586 ms last sorted interval
29 250 inodes on lru list
34 255 inode (255 objects, 198 reclaimable)
35 125723 us max scan time
37 If the lru list has never been sorted, the following line will not be
38 printed:
39 586 ms last sorted interval
40 If there is an empty lru list, the following lines also will not be
41 printed:
42 250 inodes on lru list
45 255 inode (255 objects, 198 reclaimable)
48 Meanwhile in this commit a new trace point is defined to print some
49 details in __ext4_es_shrink().
51 Cc: Andreas Dilger <adilger.kernel@dilger.ca>
52 Cc: Jan Kara <jack@suse.cz>
53 Reviewed-by: Jan Kara <jack@suse.cz>
54 Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
55 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
58 fs/ext4/extents_status.c | 186 ++++++++++++++++++++++++++++++++++++++++---
59 fs/ext4/extents_status.h | 13 ++-
60 fs/ext4/super.c | 11 +--
61 include/trace/events/ext4.h | 31 ++++++++
62 5 files changed, 224 insertions(+), 21 deletions(-)
64 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
65 index 5b19760..6f294d3 100644
68 @@ -890,6 +890,7 @@ struct ext4_inode_info {
69 struct ext4_es_tree i_es_tree;
71 struct list_head i_es_lru;
72 + unsigned int i_es_all_nr; /* protected by i_es_lock */
73 unsigned int i_es_lru_nr; /* protected by i_es_lock */
74 unsigned long i_touch_when; /* jiffies of last accessing */
76 @@ -1330,8 +1331,7 @@ struct ext4_sb_info {
77 /* Reclaim extents from extent status tree */
78 struct shrinker s_es_shrinker;
79 struct list_head s_es_lru;
80 - unsigned long s_es_last_sorted;
81 - struct percpu_counter s_extent_cache_cnt;
82 + struct ext4_es_stats s_es_stats;
83 struct mb_cache *s_mb_cache;
84 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
86 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
87 index 47d60dc..b6c3366 100644
88 --- a/fs/ext4/extents_status.c
89 +++ b/fs/ext4/extents_status.c
92 #include <linux/rbtree.h>
93 #include <linux/list_sort.h>
94 +#include <linux/proc_fs.h>
95 +#include <linux/seq_file.h>
97 #include "extents_status.h"
99 @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
101 if (!ext4_es_is_delayed(es)) {
102 EXT4_I(inode)->i_es_lru_nr++;
103 - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
104 + percpu_counter_inc(&EXT4_SB(inode->i_sb)->
105 + s_es_stats.es_stats_lru_cnt);
108 + EXT4_I(inode)->i_es_all_nr++;
109 + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
114 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
116 + EXT4_I(inode)->i_es_all_nr--;
117 + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
119 /* Decrease the lru counter when this es is not delayed */
120 if (!ext4_es_is_delayed(es)) {
121 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
122 EXT4_I(inode)->i_es_lru_nr--;
123 - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
124 + percpu_counter_dec(&EXT4_SB(inode->i_sb)->
125 + s_es_stats.es_stats_lru_cnt);
128 kmem_cache_free(ext4_es_cachep, es);
129 @@ -731,6 +741,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
130 struct extent_status *es)
132 struct ext4_es_tree *tree;
133 + struct ext4_es_stats *stats;
134 struct extent_status *es1 = NULL;
135 struct rb_node *node;
137 @@ -767,11 +778,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
141 + stats = &EXT4_SB(inode->i_sb)->s_es_stats;
144 es->es_lblk = es1->es_lblk;
145 es->es_len = es1->es_len;
146 es->es_pblk = es1->es_pblk;
147 + stats->es_stats_cache_hits++;
149 + stats->es_stats_cache_misses++;
152 read_unlock(&EXT4_I(inode)->i_es_lock);
153 @@ -933,11 +948,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
154 struct ext4_inode_info *locked_ei)
156 struct ext4_inode_info *ei;
157 + struct ext4_es_stats *es_stats;
158 struct list_head *cur, *tmp;
160 + ktime_t start_time;
163 int retried = 0, skip_precached = 1, nr_skipped = 0;
165 + es_stats = &sbi->s_es_stats;
166 + start_time = ktime_get();
167 spin_lock(&sbi->s_es_lru_lock);
170 @@ -948,7 +968,8 @@ retry:
171 * If we have already reclaimed all extents from extent
172 * status tree, just stop the loop immediately.
174 - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
175 + if (percpu_counter_read_positive(
176 + &es_stats->es_stats_lru_cnt) == 0)
179 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
180 @@ -958,7 +979,7 @@ retry:
181 * time. Normally we try hard to avoid shrinking
182 * precached inodes, but we will as a last resort.
184 - if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
185 + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
186 (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
187 EXT4_STATE_EXT_PRECACHED))) {
189 @@ -992,7 +1013,7 @@ retry:
190 if ((nr_shrunk == 0) && nr_skipped && !retried) {
192 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
193 - sbi->s_es_last_sorted = jiffies;
194 + es_stats->es_stats_last_sorted = jiffies;
195 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
198 @@ -1010,6 +1031,22 @@ retry:
199 if (locked_ei && nr_shrunk == 0)
200 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
202 + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
203 + if (likely(es_stats->es_stats_scan_time))
204 + es_stats->es_stats_scan_time = (scan_time +
205 + es_stats->es_stats_scan_time*3) / 4;
207 + es_stats->es_stats_scan_time = scan_time;
208 + if (scan_time > es_stats->es_stats_max_scan_time)
209 + es_stats->es_stats_max_scan_time = scan_time;
210 + if (likely(es_stats->es_stats_shrunk))
211 + es_stats->es_stats_shrunk = (nr_shrunk +
212 + es_stats->es_stats_shrunk*3) / 4;
214 + es_stats->es_stats_shrunk = nr_shrunk;
216 + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
217 + nr_skipped, retried);
221 @@ -1020,7 +1057,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
222 struct ext4_sb_info *sbi;
224 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
225 - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
226 + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
227 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
230 @@ -1033,7 +1070,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
231 int nr_to_scan = sc->nr_to_scan;
234 - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
235 + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
236 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
239 @@ -1045,19 +1082,148 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
243 -void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
244 +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
246 + return *pos ? NULL : SEQ_START_TOKEN;
250 +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
255 +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
257 + struct ext4_sb_info *sbi = seq->private;
258 + struct ext4_es_stats *es_stats = &sbi->s_es_stats;
259 + struct ext4_inode_info *ei, *max = NULL;
260 + unsigned int inode_cnt = 0;
262 + if (v != SEQ_START_TOKEN)
265 + /* here we just find an inode that has the max nr. of objects */
266 + spin_lock(&sbi->s_es_lru_lock);
267 + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
269 + if (max && max->i_es_all_nr < ei->i_es_all_nr)
274 + spin_unlock(&sbi->s_es_lru_lock);
276 + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
277 + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
278 + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
279 + seq_printf(seq, " %lu/%lu cache hits/misses\n",
280 + es_stats->es_stats_cache_hits,
281 + es_stats->es_stats_cache_misses);
282 + if (es_stats->es_stats_last_sorted != 0)
283 + seq_printf(seq, " %u ms last sorted interval\n",
284 + jiffies_to_msecs(jiffies -
285 + es_stats->es_stats_last_sorted));
287 + seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
289 + seq_printf(seq, "average:\n %llu us scan time\n",
290 + div_u64(es_stats->es_stats_scan_time, 1000));
291 + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
294 + "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
295 + " %llu us max scan time\n",
296 + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
297 + div_u64(es_stats->es_stats_max_scan_time, 1000));
302 +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
306 +static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
307 + .start = ext4_es_seq_shrinker_info_start,
308 + .next = ext4_es_seq_shrinker_info_next,
309 + .stop = ext4_es_seq_shrinker_info_stop,
310 + .show = ext4_es_seq_shrinker_info_show,
314 +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
318 + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
320 + struct seq_file *m = file->private_data;
321 + m->private = PDE_DATA(inode);
328 +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
330 + return seq_release(inode, file);
333 +static const struct file_operations ext4_es_seq_shrinker_info_fops = {
334 + .owner = THIS_MODULE,
335 + .open = ext4_es_seq_shrinker_info_open,
337 + .llseek = seq_lseek,
338 + .release = ext4_es_seq_shrinker_info_release,
341 +int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
345 INIT_LIST_HEAD(&sbi->s_es_lru);
346 spin_lock_init(&sbi->s_es_lru_lock);
347 - sbi->s_es_last_sorted = 0;
348 + sbi->s_es_stats.es_stats_last_sorted = 0;
349 + sbi->s_es_stats.es_stats_shrunk = 0;
350 + sbi->s_es_stats.es_stats_cache_hits = 0;
351 + sbi->s_es_stats.es_stats_cache_misses = 0;
352 + sbi->s_es_stats.es_stats_scan_time = 0;
353 + sbi->s_es_stats.es_stats_max_scan_time = 0;
354 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0);
357 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0);
361 sbi->s_es_shrinker.scan_objects = ext4_es_scan;
362 sbi->s_es_shrinker.count_objects = ext4_es_count;
363 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
364 - register_shrinker(&sbi->s_es_shrinker);
365 + err = register_shrinker(&sbi->s_es_shrinker);
370 + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
371 + &ext4_es_seq_shrinker_info_fops, sbi);
376 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
378 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
382 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
385 + remove_proc_entry("es_shrinker_info", sbi->s_proc);
386 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
387 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
388 unregister_shrinker(&sbi->s_es_shrinker);
391 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
392 index f1b62a4..efd5f97 100644
393 --- a/fs/ext4/extents_status.h
394 +++ b/fs/ext4/extents_status.h
395 @@ -64,6 +64,17 @@ struct ext4_es_tree {
396 struct extent_status *cache_es; /* recently accessed extent */
399 +struct ext4_es_stats {
400 + unsigned long es_stats_last_sorted;
401 + unsigned long es_stats_shrunk;
402 + unsigned long es_stats_cache_hits;
403 + unsigned long es_stats_cache_misses;
404 + u64 es_stats_scan_time;
405 + u64 es_stats_max_scan_time;
406 + struct percpu_counter es_stats_all_cnt;
407 + struct percpu_counter es_stats_lru_cnt;
410 extern int __init ext4_init_es(void);
411 extern void ext4_exit_es(void);
412 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
413 @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
417 -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
418 +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
419 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
420 extern void ext4_es_lru_add(struct inode *inode);
421 extern void ext4_es_lru_del(struct inode *inode);
422 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
423 index 32b43ad..7a06eb4 100644
424 --- a/fs/ext4/super.c
425 +++ b/fs/ext4/super.c
426 @@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb)
427 percpu_counter_destroy(&sbi->s_freeinodes_counter);
428 percpu_counter_destroy(&sbi->s_dirs_counter);
429 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
430 - percpu_counter_destroy(&sbi->s_extent_cache_cnt);
433 for (i = 0; i < MAXQUOTAS; i++)
434 @@ -885,6 +884,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
435 ext4_es_init_tree(&ei->i_es_tree);
436 rwlock_init(&ei->i_es_lock);
437 INIT_LIST_HEAD(&ei->i_es_lru);
438 + ei->i_es_all_nr = 0;
440 ei->i_touch_when = 0;
441 ei->i_reserved_data_blocks = 0;
442 @@ -3889,12 +3889,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
443 sbi->s_err_report.data = (unsigned long) sb;
445 /* Register extent status tree shrinker */
446 - ext4_es_register_shrinker(sbi);
448 - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
449 - ext4_msg(sb, KERN_ERR, "insufficient memory");
450 + if (ext4_es_register_shrinker(sbi))
454 sbi->s_stripe = ext4_get_stripe_size(sbi);
455 sbi->s_extent_max_zeroout_kb = 32;
456 @@ -4224,10 +4220,9 @@ failed_mount_wq:
457 jbd2_journal_destroy(sbi->s_journal);
458 sbi->s_journal = NULL;
461 ext4_es_unregister_shrinker(sbi);
463 del_timer_sync(&sbi->s_err_report);
464 - percpu_counter_destroy(&sbi->s_extent_cache_cnt);
466 kthread_stop(sbi->s_mmp_tsk);
468 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
469 index 849aaba..ff4bd1b 100644
470 --- a/include/trace/events/ext4.h
471 +++ b/include/trace/events/ext4.h
472 @@ -2450,6 +2450,37 @@ TRACE_EVENT(ext4_collapse_range,
473 __entry->offset, __entry->len)
476 +TRACE_EVENT(ext4_es_shrink,
477 + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
478 + int skip_precached, int nr_skipped, int retried),
480 + TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried),
483 + __field( dev_t, dev )
484 + __field( int, nr_shrunk )
485 + __field( unsigned long long, scan_time )
486 + __field( int, skip_precached )
487 + __field( int, nr_skipped )
488 + __field( int, retried )
492 + __entry->dev = sb->s_dev;
493 + __entry->nr_shrunk = nr_shrunk;
494 + __entry->scan_time = div_u64(scan_time, 1000);
495 + __entry->skip_precached = skip_precached;
496 + __entry->nr_skipped = nr_skipped;
497 + __entry->retried = retried;
500 + TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d "
501 + "nr_skipped %d retried %d",
502 + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
503 + __entry->scan_time, __entry->skip_precached,
504 + __entry->nr_skipped, __entry->retried)
507 #endif /* _TRACE_EXT4_H */
509 /* This part must be outside protection */