Move get_bh outside of the spinlock critical region
[ext4-patch-queue.git] / track-extent-status-tree-shrinker-delay-stats
blob42c47c43ebfa525d9309b84c6c9b46d7251b140d
1 ext4: track extent status tree shrinker delay statictics
3 From: Zheng Liu <wenqing.lz@taobao.com>
5 This commit adds some statictics in extent status tree shrinker.  The
6 purpose to add these is that we want to collect more details when we
7 encounter a stall caused by extent status tree shrinker.  Here we count
8 the following statictics:
9   stats:
10     the number of all objects on all extent status trees
11     the number of reclaimable objects on lru list
12     cache hits/misses
13     the last sorted interval
14     the number of inodes on lru list
15   average:
16     scan time for shrinking some objects
17     the number of shrunk objects
18   maximum:
19     the inode that has max nr. of objects on lru list
20     the maximum scan time for shrinking some objects
22 The output looks like below:
23   $ cat /proc/fs/ext4/sda1/es_shrinker_info
24   stats:
25     28228 objects
26     6341 reclaimable objects
27     5281/631 cache hits/misses
28     586 ms last sorted interval
29     250 inodes on lru list
30   average:
31     153 us scan time
32     128 shrunk objects
33   maximum:
34     255 inode (255 objects, 198 reclaimable)
35     125723 us max scan time
37 If the lru list has never been sorted, the following line will not be
38 printed:
39     586ms last sorted interval
40 If there is an empty lru list, the following lines also will not be
41 printed:
42     250 inodes on lru list
43   ...
44   maximum:
45     255 inode (255 objects, 198 reclaimable)
46     0 us max scan time
48 Meanwhile in this commit a new trace point is defined to print some
49 details in __ext4_es_shrink().
51 Cc: Andreas Dilger <adilger.kernel@dilger.ca>
52 Cc: Jan Kara <jack@suse.cz>
53 Reviewed-by: Jan Kara <jack@suse.cz>
54 Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
55 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
56 ---
57  fs/ext4/ext4.h              |    4 +-
58  fs/ext4/extents_status.c    |  186 ++++++++++++++++++++++++++++++++++++++++---
59  fs/ext4/extents_status.h    |   13 ++-
60  fs/ext4/super.c             |   11 +--
61  include/trace/events/ext4.h |   31 ++++++++
62  5 files changed, 224 insertions(+), 21 deletions(-)
64 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
65 index 5b19760..6f294d3 100644
66 --- a/fs/ext4/ext4.h
67 +++ b/fs/ext4/ext4.h
68 @@ -890,6 +890,7 @@ struct ext4_inode_info {
69         struct ext4_es_tree i_es_tree;
70         rwlock_t i_es_lock;
71         struct list_head i_es_lru;
72 +       unsigned int i_es_all_nr;       /* protected by i_es_lock */
73         unsigned int i_es_lru_nr;       /* protected by i_es_lock */
74         unsigned long i_touch_when;     /* jiffies of last accessing */
76 @@ -1330,8 +1331,7 @@ struct ext4_sb_info {
77         /* Reclaim extents from extent status tree */
78         struct shrinker s_es_shrinker;
79         struct list_head s_es_lru;
80 -       unsigned long s_es_last_sorted;
81 -       struct percpu_counter s_extent_cache_cnt;
82 +       struct ext4_es_stats s_es_stats;
83         struct mb_cache *s_mb_cache;
84         spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
86 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
87 index 47d60dc..b6c3366 100644
88 --- a/fs/ext4/extents_status.c
89 +++ b/fs/ext4/extents_status.c
90 @@ -11,6 +11,8 @@
91   */
92  #include <linux/rbtree.h>
93  #include <linux/list_sort.h>
94 +#include <linux/proc_fs.h>
95 +#include <linux/seq_file.h>
96  #include "ext4.h"
97  #include "extents_status.h"
99 @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
100          */
101         if (!ext4_es_is_delayed(es)) {
102                 EXT4_I(inode)->i_es_lru_nr++;
103 -               percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
104 +               percpu_counter_inc(&EXT4_SB(inode->i_sb)->
105 +                                       s_es_stats.es_stats_lru_cnt);
106         }
108 +       EXT4_I(inode)->i_es_all_nr++;
109 +       percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
111         return es;
114  static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
116 +       EXT4_I(inode)->i_es_all_nr--;
117 +       percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
119         /* Decrease the lru counter when this es is not delayed */
120         if (!ext4_es_is_delayed(es)) {
121                 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
122                 EXT4_I(inode)->i_es_lru_nr--;
123 -               percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
124 +               percpu_counter_dec(&EXT4_SB(inode->i_sb)->
125 +                                       s_es_stats.es_stats_lru_cnt);
126         }
128         kmem_cache_free(ext4_es_cachep, es);
129 @@ -731,6 +741,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
130                           struct extent_status *es)
132         struct ext4_es_tree *tree;
133 +       struct ext4_es_stats *stats;
134         struct extent_status *es1 = NULL;
135         struct rb_node *node;
136         int found = 0;
137 @@ -767,11 +778,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
138         }
140  out:
141 +       stats = &EXT4_SB(inode->i_sb)->s_es_stats;
142         if (found) {
143                 BUG_ON(!es1);
144                 es->es_lblk = es1->es_lblk;
145                 es->es_len = es1->es_len;
146                 es->es_pblk = es1->es_pblk;
147 +               stats->es_stats_cache_hits++;
148 +       } else {
149 +               stats->es_stats_cache_misses++;
150         }
152         read_unlock(&EXT4_I(inode)->i_es_lock);
153 @@ -933,11 +948,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
154                             struct ext4_inode_info *locked_ei)
156         struct ext4_inode_info *ei;
157 +       struct ext4_es_stats *es_stats;
158         struct list_head *cur, *tmp;
159         LIST_HEAD(skipped);
160 +       ktime_t start_time;
161 +       u64 scan_time;
162         int nr_shrunk = 0;
163         int retried = 0, skip_precached = 1, nr_skipped = 0;
165 +       es_stats = &sbi->s_es_stats;
166 +       start_time = ktime_get();
167         spin_lock(&sbi->s_es_lru_lock);
169  retry:
170 @@ -948,7 +968,8 @@ retry:
171                  * If we have already reclaimed all extents from extent
172                  * status tree, just stop the loop immediately.
173                  */
174 -               if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
175 +               if (percpu_counter_read_positive(
176 +                               &es_stats->es_stats_lru_cnt) == 0)
177                         break;
179                 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
180 @@ -958,7 +979,7 @@ retry:
181                  * time.  Normally we try hard to avoid shrinking
182                  * precached inodes, but we will as a last resort.
183                  */
184 -               if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
185 +               if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
186                     (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
187                                                 EXT4_STATE_EXT_PRECACHED))) {
188                         nr_skipped++;
189 @@ -992,7 +1013,7 @@ retry:
190         if ((nr_shrunk == 0) && nr_skipped && !retried) {
191                 retried++;
192                 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
193 -               sbi->s_es_last_sorted = jiffies;
194 +               es_stats->es_stats_last_sorted = jiffies;
195                 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
196                                       i_es_lru);
197                 /*
198 @@ -1010,6 +1031,22 @@ retry:
199         if (locked_ei && nr_shrunk == 0)
200                 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
202 +       scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
203 +       if (likely(es_stats->es_stats_scan_time))
204 +               es_stats->es_stats_scan_time = (scan_time +
205 +                               es_stats->es_stats_scan_time*3) / 4;
206 +       else
207 +               es_stats->es_stats_scan_time = scan_time;
208 +       if (scan_time > es_stats->es_stats_max_scan_time)
209 +               es_stats->es_stats_max_scan_time = scan_time;
210 +       if (likely(es_stats->es_stats_shrunk))
211 +               es_stats->es_stats_shrunk = (nr_shrunk +
212 +                               es_stats->es_stats_shrunk*3) / 4;
213 +       else
214 +               es_stats->es_stats_shrunk = nr_shrunk;
216 +       trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
217 +                            nr_skipped, retried);
218         return nr_shrunk;
221 @@ -1020,7 +1057,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
222         struct ext4_sb_info *sbi;
224         sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
225 -       nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
226 +       nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
227         trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
228         return nr;
230 @@ -1033,7 +1070,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
231         int nr_to_scan = sc->nr_to_scan;
232         int ret, nr_shrunk;
234 -       ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
235 +       ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
236         trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
238         if (!nr_to_scan)
239 @@ -1045,19 +1082,148 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
240         return nr_shrunk;
243 -void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
244 +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
246 +       return *pos ? NULL : SEQ_START_TOKEN;
249 +static void *
250 +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
252 +       return NULL;
255 +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
257 +       struct ext4_sb_info *sbi = seq->private;
258 +       struct ext4_es_stats *es_stats = &sbi->s_es_stats;
259 +       struct ext4_inode_info *ei, *max = NULL;
260 +       unsigned int inode_cnt = 0;
262 +       if (v != SEQ_START_TOKEN)
263 +               return 0;
265 +       /* here we just find an inode that has the max nr. of objects */
266 +       spin_lock(&sbi->s_es_lru_lock);
267 +       list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
268 +               inode_cnt++;
269 +               if (max && max->i_es_all_nr < ei->i_es_all_nr)
270 +                       max = ei;
271 +               else if (!max)
272 +                       max = ei;
273 +       }
274 +       spin_unlock(&sbi->s_es_lru_lock);
276 +       seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
277 +                  percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
278 +                  percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
279 +       seq_printf(seq, "  %lu/%lu cache hits/misses\n",
280 +                  es_stats->es_stats_cache_hits,
281 +                  es_stats->es_stats_cache_misses);
282 +       if (es_stats->es_stats_last_sorted != 0)
283 +               seq_printf(seq, "  %u ms last sorted interval\n",
284 +                          jiffies_to_msecs(jiffies -
285 +                                           es_stats->es_stats_last_sorted));
286 +       if (inode_cnt)
287 +               seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
289 +       seq_printf(seq, "average:\n  %llu us scan time\n",
290 +           div_u64(es_stats->es_stats_scan_time, 1000));
291 +       seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
292 +       if (inode_cnt)
293 +               seq_printf(seq,
294 +                   "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
295 +                   "  %llu us max scan time\n",
296 +                   max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
297 +                   div_u64(es_stats->es_stats_max_scan_time, 1000));
299 +       return 0;
302 +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
306 +static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
307 +       .start = ext4_es_seq_shrinker_info_start,
308 +       .next  = ext4_es_seq_shrinker_info_next,
309 +       .stop  = ext4_es_seq_shrinker_info_stop,
310 +       .show  = ext4_es_seq_shrinker_info_show,
313 +static int
314 +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
316 +       int ret;
318 +       ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
319 +       if (!ret) {
320 +               struct seq_file *m = file->private_data;
321 +               m->private = PDE_DATA(inode);
322 +       }
324 +       return ret;
327 +static int
328 +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
330 +       return seq_release(inode, file);
333 +static const struct file_operations ext4_es_seq_shrinker_info_fops = {
334 +       .owner          = THIS_MODULE,
335 +       .open           = ext4_es_seq_shrinker_info_open,
336 +       .read           = seq_read,
337 +       .llseek         = seq_lseek,
338 +       .release        = ext4_es_seq_shrinker_info_release,
341 +int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
343 +       int err;
345         INIT_LIST_HEAD(&sbi->s_es_lru);
346         spin_lock_init(&sbi->s_es_lru_lock);
347 -       sbi->s_es_last_sorted = 0;
348 +       sbi->s_es_stats.es_stats_last_sorted = 0;
349 +       sbi->s_es_stats.es_stats_shrunk = 0;
350 +       sbi->s_es_stats.es_stats_cache_hits = 0;
351 +       sbi->s_es_stats.es_stats_cache_misses = 0;
352 +       sbi->s_es_stats.es_stats_scan_time = 0;
353 +       sbi->s_es_stats.es_stats_max_scan_time = 0;
354 +       err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0);
355 +       if (err)
356 +               return err;
357 +       err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0);
358 +       if (err)
359 +               goto err1;
361         sbi->s_es_shrinker.scan_objects = ext4_es_scan;
362         sbi->s_es_shrinker.count_objects = ext4_es_count;
363         sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
364 -       register_shrinker(&sbi->s_es_shrinker);
365 +       err = register_shrinker(&sbi->s_es_shrinker);
366 +       if (err)
367 +               goto err2;
369 +       if (sbi->s_proc)
370 +               proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
371 +                                &ext4_es_seq_shrinker_info_fops, sbi);
373 +       return 0;
375 +err2:
376 +       percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
377 +err1:
378 +       percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
379 +       return err;
382  void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
384 +       if (sbi->s_proc)
385 +               remove_proc_entry("es_shrinker_info", sbi->s_proc);
386 +       percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
387 +       percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
388         unregister_shrinker(&sbi->s_es_shrinker);
391 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
392 index f1b62a4..efd5f97 100644
393 --- a/fs/ext4/extents_status.h
394 +++ b/fs/ext4/extents_status.h
395 @@ -64,6 +64,17 @@ struct ext4_es_tree {
396         struct extent_status *cache_es; /* recently accessed extent */
397  };
399 +struct ext4_es_stats {
400 +       unsigned long es_stats_last_sorted;
401 +       unsigned long es_stats_shrunk;
402 +       unsigned long es_stats_cache_hits;
403 +       unsigned long es_stats_cache_misses;
404 +       u64 es_stats_scan_time;
405 +       u64 es_stats_max_scan_time;
406 +       struct percpu_counter es_stats_all_cnt;
407 +       struct percpu_counter es_stats_lru_cnt;
410  extern int __init ext4_init_es(void);
411  extern void ext4_exit_es(void);
412  extern void ext4_es_init_tree(struct ext4_es_tree *tree);
413 @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
414                        (pb & ~ES_MASK));
417 -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
418 +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
419  extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
420  extern void ext4_es_lru_add(struct inode *inode);
421  extern void ext4_es_lru_del(struct inode *inode);
422 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
423 index 32b43ad..7a06eb4 100644
424 --- a/fs/ext4/super.c
425 +++ b/fs/ext4/super.c
426 @@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb)
427         percpu_counter_destroy(&sbi->s_freeinodes_counter);
428         percpu_counter_destroy(&sbi->s_dirs_counter);
429         percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
430 -       percpu_counter_destroy(&sbi->s_extent_cache_cnt);
431         brelse(sbi->s_sbh);
432  #ifdef CONFIG_QUOTA
433         for (i = 0; i < MAXQUOTAS; i++)
434 @@ -885,6 +884,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
435         ext4_es_init_tree(&ei->i_es_tree);
436         rwlock_init(&ei->i_es_lock);
437         INIT_LIST_HEAD(&ei->i_es_lru);
438 +       ei->i_es_all_nr = 0;
439         ei->i_es_lru_nr = 0;
440         ei->i_touch_when = 0;
441         ei->i_reserved_data_blocks = 0;
442 @@ -3889,12 +3889,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
443         sbi->s_err_report.data = (unsigned long) sb;
445         /* Register extent status tree shrinker */
446 -       ext4_es_register_shrinker(sbi);
448 -       if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
449 -               ext4_msg(sb, KERN_ERR, "insufficient memory");
450 +       if (ext4_es_register_shrinker(sbi))
451                 goto failed_mount3;
452 -       }
454         sbi->s_stripe = ext4_get_stripe_size(sbi);
455         sbi->s_extent_max_zeroout_kb = 32;
456 @@ -4224,10 +4220,9 @@ failed_mount_wq:
457                 jbd2_journal_destroy(sbi->s_journal);
458                 sbi->s_journal = NULL;
459         }
460 -failed_mount3:
461         ext4_es_unregister_shrinker(sbi);
462 +failed_mount3:
463         del_timer_sync(&sbi->s_err_report);
464 -       percpu_counter_destroy(&sbi->s_extent_cache_cnt);
465         if (sbi->s_mmp_tsk)
466                 kthread_stop(sbi->s_mmp_tsk);
467  failed_mount2:
468 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
469 index 849aaba..ff4bd1b 100644
470 --- a/include/trace/events/ext4.h
471 +++ b/include/trace/events/ext4.h
472 @@ -2450,6 +2450,37 @@ TRACE_EVENT(ext4_collapse_range,
473                   __entry->offset, __entry->len)
474  );
476 +TRACE_EVENT(ext4_es_shrink,
477 +       TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
478 +                int skip_precached, int nr_skipped, int retried),
480 +       TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried),
482 +       TP_STRUCT__entry(
483 +               __field(        dev_t,          dev             )
484 +               __field(        int,            nr_shrunk       )
485 +               __field(        unsigned long long, scan_time   )
486 +               __field(        int,            skip_precached  )
487 +               __field(        int,            nr_skipped      )
488 +               __field(        int,            retried         )
489 +       ),
491 +       TP_fast_assign(
492 +               __entry->dev            = sb->s_dev;
493 +               __entry->nr_shrunk      = nr_shrunk;
494 +               __entry->scan_time      = div_u64(scan_time, 1000);
495 +               __entry->skip_precached = skip_precached;
496 +               __entry->nr_skipped     = nr_skipped;
497 +               __entry->retried        = retried;
498 +       ),
500 +       TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d "
501 +                 "nr_skipped %d retried %d",
502 +                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
503 +                 __entry->scan_time, __entry->skip_precached,
504 +                 __entry->nr_skipped, __entry->retried)
507  #endif /* _TRACE_EXT4_H */
509  /* This part must be outside protection */
510 -- 
511 1.7.9.7