add patch move-error-report-out-of-atomic-context
[ext4-patch-queue.git] / use-a-list-to-track-all-reclaimable-objects-for-extent-status-tree
blob62530ea9d79bca4ca4ea5d0d8e9e2f2d83ef3ff7
1 ext4: use a list to track all reclaimable objects for extent status tree
3 From: Zheng Liu <wenqing.lz@taobao.com>
5 Currently the shrinker can take a long time to reclaim some objects
6 from an extent status tree because there are a lot of objects that are
7 delayed.  These objects could not be reclaimed because ext4 uses them to
8 implement seek data/hole, find delayed ranges, and other operations.  If a
9 rb-tree has a large number of delayed objects, the shrinker must scan more
10 objects and the latency will be high.  This commit uses a list to track
11 all reclaimable objects in order to reduce the latency when the shrinker
12 tries to reclaim some objects from an extent status tree.
14 Cc: Andreas Dilger <adilger.kernel@dilger.ca>
15 Cc: Jan Kara <jack@suse.cz>
16 Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
17 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
18 ---
19  fs/ext4/extents_status.c | 74 ++++++++++++++++++++++++++++----------------------
20  fs/ext4/extents_status.h |  2 ++
21  2 files changed, 44 insertions(+), 32 deletions(-)
23 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
24 index 4768f7f..e845c4f 100644
25 --- a/fs/ext4/extents_status.c
26 +++ b/fs/ext4/extents_status.c
27 @@ -148,7 +148,8 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
28  static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
29                               ext4_lblk_t end);
30  static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
31 -                                      int nr_to_scan);
32 +                                      struct list_head *freeable,
33 +                                      int *nr_to_scan);
34  static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
35                        struct ext4_inode_info *locked_ei);
37 @@ -171,6 +172,7 @@ void ext4_exit_es(void)
38  void ext4_es_init_tree(struct ext4_es_tree *tree)
39  {
40         tree->root = RB_ROOT;
41 +       INIT_LIST_HEAD(&tree->list);
42         tree->cache_es = NULL;
43  }
45 @@ -302,6 +304,7 @@ static struct extent_status *
46  ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
47                      ext4_fsblk_t pblk)
48  {
49 +       struct ext4_inode_info *ei = EXT4_I(inode);
50         struct extent_status *es;
51         es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
52         if (es == NULL)
53 @@ -314,12 +317,13 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
54          * We don't count delayed extent because we never try to reclaim them
55          */
56         if (!ext4_es_is_delayed(es)) {
57 -               EXT4_I(inode)->i_es_shk_nr++;
58 +               list_add_tail(&es->list, &ei->i_es_tree.list);
59 +               ei->i_es_shk_nr++;
60                 percpu_counter_inc(&EXT4_SB(inode->i_sb)->
61                                         s_es_stats.es_stats_shk_cnt);
62         }
64 -       EXT4_I(inode)->i_es_all_nr++;
65 +       ei->i_es_all_nr++;
66         percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
68         return es;
69 @@ -327,13 +331,15 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
71  static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
72  {
73 -       EXT4_I(inode)->i_es_all_nr--;
74 +       struct ext4_inode_info *ei = EXT4_I(inode);
76 +       ei->i_es_all_nr--;
77         percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
79         /* Decrease the shrink counter when this es is not delayed */
80         if (!ext4_es_is_delayed(es)) {
81 -               BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
82 -               EXT4_I(inode)->i_es_shk_nr--;
83 +               list_del(&es->list);
84 +               BUG_ON(ei->i_es_shk_nr-- == 0);
85                 percpu_counter_dec(&EXT4_SB(inode->i_sb)->
86                                         s_es_stats.es_stats_shk_cnt);
87         }
88 @@ -956,11 +962,24 @@ void ext4_es_list_del(struct inode *inode)
89         spin_unlock(&sbi->s_es_lock);
90  }
92 +static void dispose_list(struct inode *inode, struct list_head *head)
94 +       while (!list_empty(head)) {
95 +               struct extent_status *es;
97 +               es = list_first_entry(head, struct extent_status, list);
98 +               list_del_init(&es->list);
100 +               ext4_es_free_extent(inode, es);
101 +       }
104  static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
105                        struct ext4_inode_info *locked_ei)
107         struct ext4_inode_info *ei;
108         struct ext4_es_stats *es_stats;
109 +       LIST_HEAD(freeable);
110         ktime_t start_time;
111         u64 scan_time;
112         int nr_to_walk;
113 @@ -974,8 +993,6 @@ retry:
114         spin_lock(&sbi->s_es_lock);
115         nr_to_walk = sbi->s_es_nr_inode;
116         while (!list_empty(&sbi->s_es_list) && nr_to_walk-- > 0) {
117 -               int shrunk;
119                 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
120                                       i_es_list);
122 @@ -1002,11 +1019,10 @@ retry:
123                 }
124                 /* we only release s_es_lock once we have i_es_lock */
125                 spin_unlock(&sbi->s_es_lock);
126 -               shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
127 +               nr_shrunk += __es_try_to_reclaim_extents(ei, &freeable,
128 +                                                        &nr_to_scan);
129                 write_unlock(&ei->i_es_lock);
131 -               nr_shrunk += shrunk;
132 -               nr_to_scan -= shrunk;
133 +               dispose_list(&ei->vfs_inode, &freeable);
135                 if (nr_to_scan == 0)
136                         goto out;
137 @@ -1023,8 +1039,11 @@ retry:
138                 goto retry;
139         }
141 -       if (locked_ei && nr_shrunk == 0)
142 -               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
143 +       if (locked_ei && nr_shrunk == 0) {
144 +               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, &freeable,
145 +                                                       &nr_to_scan);
146 +               dispose_list(&locked_ei->vfs_inode, &freeable);
147 +       }
149  out:
150         scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
151 @@ -1220,12 +1239,12 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
154  static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
155 -                                      int nr_to_scan)
156 +                                      struct list_head *freeable,
157 +                                      int *nr_to_scan)
159         struct inode *inode = &ei->vfs_inode;
160         struct ext4_es_tree *tree = &ei->i_es_tree;
161 -       struct rb_node *node;
162 -       struct extent_status *es;
163 +       struct extent_status *es, *tmp;
164         unsigned long nr_shrunk = 0;
165         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
166                                       DEFAULT_RATELIMIT_BURST);
167 @@ -1237,21 +1256,12 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
168             __ratelimit(&_rs))
169                 ext4_warning(inode->i_sb, "forced shrink of precached extents");
171 -       node = rb_first(&tree->root);
172 -       while (node != NULL) {
173 -               es = rb_entry(node, struct extent_status, rb_node);
174 -               node = rb_next(&es->rb_node);
175 -               /*
176 -                * We can't reclaim delayed extent from status tree because
177 -                * fiemap, bigallic, and seek_data/hole need to use it.
178 -                */
179 -               if (!ext4_es_is_delayed(es)) {
180 -                       rb_erase(&es->rb_node, &tree->root);
181 -                       ext4_es_free_extent(inode, es);
182 -                       nr_shrunk++;
183 -                       if (--nr_to_scan == 0)
184 -                               break;
185 -               }
186 +       list_for_each_entry_safe(es, tmp, &tree->list, list) {
187 +               rb_erase(&es->rb_node, &tree->root);
188 +               list_move_tail(&es->list, freeable);
189 +               nr_shrunk++;
190 +               if (--*nr_to_scan == 0)
191 +                       break;
192         }
193         tree->cache_es = NULL;
194         return nr_shrunk;
195 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
196 index 0e6a33e..a45c7fe 100644
197 --- a/fs/ext4/extents_status.h
198 +++ b/fs/ext4/extents_status.h
199 @@ -54,6 +54,7 @@ struct ext4_extent;
201  struct extent_status {
202         struct rb_node rb_node;
203 +       struct list_head list;
204         ext4_lblk_t es_lblk;    /* first logical block extent covers */
205         ext4_lblk_t es_len;     /* length of extent in block */
206         ext4_fsblk_t es_pblk;   /* first physical block */
207 @@ -61,6 +62,7 @@ struct extent_status {
209  struct ext4_es_tree {
210         struct rb_root root;
211 +       struct list_head list;
212         struct extent_status *cache_es; /* recently accessed extent */
213  };