add patch create-function-to-read-journal-inode
[ext4-patch-queue.git] / improve-ext4lazyinit-scalability
blob290255462cb36ca5adf8e15643dbbd248e06fbdb
1 ext4: improve ext4lazyinit scalability
3 From: Dmitry Monakhov <dmonakhov@openvz.org>
5 ext4lazyinit is a global thread. This thread performs itable
6 initialization under li_list_mtx mutex.
8 It basically does the following:
9 ext4_lazyinit_thread
10   ->mutex_lock(&eli->li_list_mtx);
11   ->ext4_run_li_request(elr)
12     ->ext4_init_inode_table-> Do a lot of IO if the list is large
14 When a new mount/umount arrives, it has to block on ->li_list_mtx
15 because lazy_thread holds it during the full walk procedure.
16 ext4_fill_super
17  ->ext4_register_li_request
18    ->mutex_lock(&ext4_li_info->li_list_mtx);
19    ->list_add(&elr->lr_request, &ext4_li_info->li_request_list);
20 In my case mount takes 40 minutes on a server with 36 * 4TB HDDs.
21 A common user may face this in case of a very slow device (/dev/mmcblkXXX).
22 Even worse: if one of the filesystems is frozen, lazyinit_thread will simply
23 block on sb_start_write(), so other mounts/umounts will be stuck forever.
25 This patch changes the logic as follows:
26 - grab ->s_umount read sem before processing new li_request.
27   After that it is safe to drop li_list_mtx because all callers of
28   li_remove_request are holding ->s_umount for write.
29 - li_thread skips frozen SB's
31 Locking order:
32 Order is asserted by the umount path as follows: s_umount -> li_list_mtx, so
33 the only way to grab ->s_umount inside li_thread is via down_read_trylock
35 xfstests:ext4/023
36 #PSBM-49658
38 Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
39 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
40 ---
41  fs/ext4/super.c | 41 +++++++++++++++++++++++++++++++----------
42  1 file changed, 31 insertions(+), 10 deletions(-)
44 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
45 index 5819b0e..50912cc 100644
46 --- a/fs/ext4/super.c
47 +++ b/fs/ext4/super.c
48 @@ -2749,7 +2749,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
49         sb = elr->lr_super;
50         ngroups = EXT4_SB(sb)->s_groups_count;
52 -       sb_start_write(sb);
53         for (group = elr->lr_next_group; group < ngroups; group++) {
54                 gdp = ext4_get_group_desc(sb, group, NULL);
55                 if (!gdp) {
56 @@ -2776,8 +2775,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
57                 elr->lr_next_sched = jiffies + elr->lr_timeout;
58                 elr->lr_next_group = group + 1;
59         }
60 -       sb_end_write(sb);
62         return ret;
63  }
65 @@ -2842,19 +2839,43 @@ cont_thread:
66                         mutex_unlock(&eli->li_list_mtx);
67                         goto exit_thread;
68                 }
70                 list_for_each_safe(pos, n, &eli->li_request_list) {
71 +                       int err = 0;
72 +                       int progress = 0;
73                         elr = list_entry(pos, struct ext4_li_request,
74                                          lr_request);
76 -                       if (time_after_eq(jiffies, elr->lr_next_sched)) {
77 -                               if (ext4_run_li_request(elr) != 0) {
78 -                                       /* error, remove the lazy_init job */
79 -                                       ext4_remove_li_request(elr);
80 -                                       continue;
81 +                       if (time_before(jiffies, elr->lr_next_sched)) {
82 +                               if (time_before(elr->lr_next_sched, next_wakeup))
83 +                                       next_wakeup = elr->lr_next_sched;
84 +                               continue;
85 +                       }
86 +                       if (down_read_trylock(&elr->lr_super->s_umount)) {
87 +                               if (sb_start_write_trylock(elr->lr_super)) {
88 +                                       progress = 1;
89 +                                       /*
90 +                                        * We hold sb->s_umount, sb can not
91 +                                        * be removed from the list, it is
92 +                                        * now safe to drop li_list_mtx
93 +                                        */
94 +                                       mutex_unlock(&eli->li_list_mtx);
95 +                                       err = ext4_run_li_request(elr);
96 +                                       sb_end_write(elr->lr_super);
97 +                                       mutex_lock(&eli->li_list_mtx);
98 +                                       n = pos->next;
99                                 }
100 +                               up_read((&elr->lr_super->s_umount));
101 +                       }
102 +                       /* error, remove the lazy_init job */
103 +                       if (err) {
104 +                               ext4_remove_li_request(elr);
105 +                               continue;
106 +                       }
107 +                       if (!progress) {
108 +                               elr->lr_next_sched = jiffies +
109 +                                       (prandom_u32()
110 +                                        % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
111                         }
113                         if (time_before(elr->lr_next_sched, next_wakeup))
114                                 next_wakeup = elr->lr_next_sched;
115                 }