1 ext4: add new pending reservation mechanism
3 From: Eric Whitney <enwlinux@gmail.com>
5 Add new pending reservation mechanism to help manage reserved cluster
6 accounting. Its primary function is to avoid the need to read extents
7 from the disk when invalidating pages as a result of a truncate, punch
8 hole, or collapse range operation.
10 Signed-off-by: Eric Whitney <enwlinux@gmail.com>
11 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
14 fs/ext4/extents_status.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++
15 fs/ext4/extents_status.h | 51 +++++++++++++
16 fs/ext4/super.c | 8 ++
17 4 files changed, 249 insertions(+)
19 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
20 index ad2c215720be..fc0f41dbf90b 100644
23 @@ -1030,6 +1030,9 @@ struct ext4_inode_info {
24 ext4_lblk_t i_da_metadata_calc_last_lblock;
25 int i_da_metadata_calc_len;
27 + /* pending cluster reservations for bigalloc file systems */
28 + struct ext4_pending_tree i_pending_tree;
30 /* on-disk additional length */
33 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
34 index 8530fbd3012d..194785ce890a 100644
35 --- a/fs/ext4/extents_status.c
36 +++ b/fs/ext4/extents_status.c
40 static struct kmem_cache *ext4_es_cachep;
41 +static struct kmem_cache *ext4_pending_cachep;
43 static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
44 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
45 @@ -1365,3 +1366,189 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
46 ei->i_es_tree.cache_es = NULL;
51 +static void ext4_print_pending_tree(struct inode *inode)
53 + struct ext4_pending_tree *tree;
54 + struct rb_node *node;
55 + struct pending_reservation *pr;
57 + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
58 + tree = &EXT4_I(inode)->i_pending_tree;
59 + node = rb_first(&tree->root);
61 + pr = rb_entry(node, struct pending_reservation, rb_node);
62 + printk(KERN_DEBUG " %u", pr->lclu);
63 + node = rb_next(node);
65 + printk(KERN_DEBUG "\n");
68 +#define ext4_print_pending_tree(inode)
71 +int __init ext4_init_pending(void)
73 + ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
74 + sizeof(struct pending_reservation),
75 + 0, (SLAB_RECLAIM_ACCOUNT), NULL);
76 + if (ext4_pending_cachep == NULL)
81 +void ext4_exit_pending(void)
83 + kmem_cache_destroy(ext4_pending_cachep);
86 +void ext4_init_pending_tree(struct ext4_pending_tree *tree)
88 + tree->root = RB_ROOT;
92 + * __get_pending - retrieve a pointer to a pending reservation
94 + * @inode - file containing the pending cluster reservation
95 + * @lclu - logical cluster of interest
97 + * Returns a pointer to a pending reservation if it's a member of
98 + * the set, and NULL if not. Must be called holding i_es_lock.
100 +static struct pending_reservation *__get_pending(struct inode *inode,
103 + struct ext4_pending_tree *tree;
104 + struct rb_node *node;
105 + struct pending_reservation *pr = NULL;
107 + tree = &EXT4_I(inode)->i_pending_tree;
108 + node = (&tree->root)->rb_node;
111 + pr = rb_entry(node, struct pending_reservation, rb_node);
112 + if (lclu < pr->lclu)
113 + node = node->rb_left;
114 + else if (lclu > pr->lclu)
115 + node = node->rb_right;
116 + else if (lclu == pr->lclu)
123 + * __insert_pending - adds a pending cluster reservation to the set of
124 + * pending reservations
126 + * @inode - file containing the cluster
127 + * @lblk - logical block in the cluster to be added
129 + * Returns 0 on successful insertion and -ENOMEM on failure. If the
130 + * pending reservation is already in the set, returns successfully.
132 +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
134 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
135 + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
136 + struct rb_node **p = &tree->root.rb_node;
137 + struct rb_node *parent = NULL;
138 + struct pending_reservation *pr;
142 + lclu = EXT4_B2C(sbi, lblk);
143 + /* search to find parent for insertion */
146 + pr = rb_entry(parent, struct pending_reservation, rb_node);
148 + if (lclu < pr->lclu) {
149 + p = &(*p)->rb_left;
150 + } else if (lclu > pr->lclu) {
151 + p = &(*p)->rb_right;
153 + /* pending reservation already inserted */
158 + pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
165 + rb_link_node(&pr->rb_node, parent, p);
166 + rb_insert_color(&pr->rb_node, &tree->root);
173 + * __remove_pending - removes a pending cluster reservation from the set
174 + * of pending reservations
176 + * @inode - file containing the cluster
177 + * @lblk - logical block in the pending cluster reservation to be removed
179 + * Does nothing if the pending reservation is not a member of the set.
181 +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
183 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
184 + struct pending_reservation *pr;
185 + struct ext4_pending_tree *tree;
187 + pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
189 + tree = &EXT4_I(inode)->i_pending_tree;
190 + rb_erase(&pr->rb_node, &tree->root);
191 + kmem_cache_free(ext4_pending_cachep, pr);
196 + * ext4_remove_pending - removes a pending cluster reservation from the set
197 + * of pending reservations
199 + * @inode - file containing the cluster
200 + * @lblk - logical block in the pending cluster reservation to be removed
202 + * Takes i_es_lock for writing around __remove_pending for external callers.
204 +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
206 + struct ext4_inode_info *ei = EXT4_I(inode);
208 + write_lock(&ei->i_es_lock);
209 + __remove_pending(inode, lblk);
210 + write_unlock(&ei->i_es_lock);
214 + * ext4_is_pending - determine whether a cluster has a pending reservation
217 + * @inode - file containing the cluster
218 + * @lblk - logical block in the cluster
220 + * Returns true if there's a pending reservation for the cluster in the
221 + * set of pending reservations, and false if not.
223 +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
225 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
226 + struct ext4_inode_info *ei = EXT4_I(inode);
229 + read_lock(&ei->i_es_lock);
230 + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
231 + read_unlock(&ei->i_es_lock);
235 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
236 index df9628c3ec3b..379b7171c67c 100644
237 --- a/fs/ext4/extents_status.h
238 +++ b/fs/ext4/extents_status.h
239 @@ -78,6 +78,51 @@ struct ext4_es_stats {
240 struct percpu_counter es_stats_shk_cnt;
244 + * Pending cluster reservations for bigalloc file systems
246 + * A cluster with a pending reservation is a logical cluster shared by at
247 + * least one extent in the extents status tree with delayed and unwritten
248 + * status and at least one other written or unwritten extent. The
249 + * reservation is said to be pending because a cluster reservation would
250 + * have to be taken in the event all blocks in the cluster shared with
251 + * written or unwritten extents were deleted while the delayed and
252 + * unwritten blocks remained.
254 + * The set of pending cluster reservations is an auxiliary data structure
255 + * used with the extents status tree to implement reserved cluster/block
256 + * accounting for bigalloc file systems. The set is kept in memory and
257 + * records all pending cluster reservations.
259 + * Its primary function is to avoid the need to read extents from the
260 + * disk when invalidating pages as a result of a truncate, punch hole, or
261 + * collapse range operation. Page invalidation requires a decrease in the
262 + * reserved cluster count if it results in the removal of all delayed
263 + * and unwritten extents (blocks) from a cluster that is not shared with a
264 + * written or unwritten extent, and no decrease otherwise. Determining
265 + * whether the cluster is shared can be done by searching for a pending
266 + * reservation on it.
268 + * Secondarily, it provides a potentially faster method for determining
269 + * whether the reserved cluster count should be increased when a physical
270 + * cluster is deallocated as a result of a truncate, punch hole, or
271 + * collapse range operation. The necessary information is also present
272 + * in the extents status tree, but might be more rapidly accessed in
273 + * the pending reservation set in many cases due to smaller size.
275 + * The pending cluster reservation set is implemented as a red-black tree
276 + * with the goal of minimizing per page search time overhead.
279 +struct pending_reservation {
280 + struct rb_node rb_node;
284 +struct ext4_pending_tree {
285 + struct rb_root root;
288 extern int __init ext4_init_es(void);
289 extern void ext4_exit_es(void);
290 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
291 @@ -182,4 +227,10 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
293 extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
295 +extern int __init ext4_init_pending(void);
296 +extern void ext4_exit_pending(void);
297 +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
298 +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
299 +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
301 #endif /* _EXT4_EXTENTS_STATUS_H */
302 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
303 index 1145109968ef..faf293ed8060 100644
304 --- a/fs/ext4/super.c
305 +++ b/fs/ext4/super.c
306 @@ -1040,6 +1040,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
307 ei->i_da_metadata_calc_len = 0;
308 ei->i_da_metadata_calc_last_lblock = 0;
309 spin_lock_init(&(ei->i_block_reservation_lock));
310 + ext4_init_pending_tree(&ei->i_pending_tree);
312 ei->i_reserved_quota = 0;
313 memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
314 @@ -5954,6 +5955,10 @@ static int __init ext4_init_fs(void)
318 + err = ext4_init_pending();
322 err = ext4_init_pageio();
325 @@ -5992,6 +5997,8 @@ static int __init ext4_init_fs(void)
329 + ext4_exit_pending();
334 @@ -6009,6 +6016,7 @@ static void __exit ext4_exit_fs(void)
335 ext4_exit_system_zone();
338 + ext4_exit_pending();
341 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");