ext4: reduce reserved cluster count by number of allocated clusters

From: Eric Whitney <enwlinux@gmail.com>

Ext4 does not always reduce the reserved cluster count by the number
of clusters allocated when mapping a delayed extent.  It sometimes
adds back one or more clusters after allocation if delalloc blocks
adjacent to the range allocated by ext4_ext_map_blocks() share the
clusters newly allocated for that range.  However, this overcounts
the number of clusters needed to satisfy future mapping requests
(holding one or more reservations for clusters that have already
been allocated), leading to premature ENOSPC failures, quota
failures, etc.

Ext4 also does not reduce the reserved cluster count when allocating
clusters for non-delayed allocated writes if those clusters were
previously reserved for delayed writes.  This also results in
overcounting.

To make it possible to handle reserved cluster accounting for
fallocated regions in the same manner as used for other non-delayed
writes, do the reserved cluster accounting for them at the time of
allocation.  In the current code, this is only done later when a
delayed extent sharing the fallocated region is finally mapped.

Also address a comment from Jan Kara's review of the RFC version of
this patch by correcting the handling of an unsigned long long
constant.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c        | 188 +++++----------------------------------
 fs/ext4/extents_status.c | 175 +++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/extents_status.h |   4 +
 3 files changed, 207 insertions(+), 160 deletions(-)
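
To make the shared-cluster arithmetic above concrete, here is a
minimal, compilable userspace sketch (not kernel code; CLUSTER_BITS
and B2C() are illustrative stand-ins for s_cluster_bits and the
EXT4_B2C() macro, assuming a bigalloc cluster of 4 blocks):

#include <stdio.h>

/* stand-in for EXT4_B2C() with a 4-block cluster (s_cluster_bits == 2) */
#define CLUSTER_BITS	2
#define B2C(lblk)	((lblk) >> CLUSTER_BITS)

int main(void)
{
	unsigned int start = 3, end = 8;	/* second write: blocks 3-8 */
	unsigned int delayed = 10;		/* delayed block from first write */

	/* blocks [3-8] span clusters 0-2 ... */
	printf("blocks [%u-%u] span %u clusters\n",
	       start, end, B2C(end) - B2C(start) + 1);

	/* ... and block 8 falls in the same cluster as delayed block 10,
	 * so one reservation covers both writes; restoring the reservation
	 * after the cluster has been allocated produces the overcount
	 * described above. */
	printf("blocks %u and %u share cluster %u\n", end, delayed, B2C(end));
	return 0;
}
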
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 26481e543312..b52ac813ca20 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3819,83 +3819,6 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	return ext4_mark_inode_dirty(handle, inode);
 }
 
-/*
- * Determines how many complete clusters (out of those specified by the 'map')
- * are under delalloc and were reserved quota for.
- * This function is called when we are writing out the blocks that were
- * originally written with their allocation delayed, but then the space was
- * allocated using fallocate() before the delayed allocation could be resolved.
- * The cases to look for are:
- * ('=' indicated delayed allocated blocks
- *  '-' indicates non-delayed allocated blocks)
- * (a) partial clusters towards beginning and/or end outside of allocated range
- *     are not delalloc'ed.
- *	Ex:
- *	|----c---=|====c====|====c====|===-c----|
- *	         |++++++ allocated ++++++|
- *	==> 4 complete clusters in above example
- *
- * (b) partial cluster (outside of allocated range) towards either end is
- *     marked for delayed allocation. In this case, we will exclude that
- *     cluster.
- *	Ex:
- *	|----====c========|========c========|
- *	     |++++++ allocated ++++++|
- *	==> 1 complete clusters in above example
- *
- *	Ex:
- *	|================c================|
- *	       |++++++ allocated ++++++|
- *	==> 0 complete clusters in above example
- *
- * The ext4_da_update_reserve_space will be called only if we
- * determine here that there were some "entire" clusters that span
- * this 'allocated' range.
- * In the non-bigalloc case, this function will just end up returning num_blks
- * without ever calling ext4_find_delalloc_range.
- */
-static unsigned int
-get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
-			   unsigned int num_blks)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
-	ext4_lblk_t lblk_from, lblk_to, c_offset;
-	unsigned int allocated_clusters = 0;
-
-	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
-	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
-
-	/* max possible clusters for this allocation */
-	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
-
-	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
-
-	/* Check towards left side */
-	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
-	if (c_offset) {
-		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
-		lblk_to = lblk_from + c_offset - 1;
-
-		if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from,
-				       lblk_to))
-			allocated_clusters--;
-	}
-
-	/* Now check towards right. */
-	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
-	if (allocated_clusters && c_offset) {
-		lblk_from = lblk_start + num_blks;
-		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
-
-		if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from,
-				       lblk_to))
-			allocated_clusters--;
-	}
-
-	return allocated_clusters;
-}
-
 static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map,
@@ -4077,23 +4000,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 		allocated = map->m_len;
 	map->m_len = allocated;
 
-	/*
-	 * If we have done fallocate with the offset that is already
-	 * delayed allocated, we would have block reservation
-	 * and quota reservation done in the delayed write path.
-	 * But fallocate would have already updated quota and block
-	 * count for this offset. So cancel these reservation
-	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-		unsigned int reserved_clusters;
-		reserved_clusters = get_reserved_cluster_alloc(inode,
-				map->m_lblk, map->m_len);
-		if (reserved_clusters)
-			ext4_da_update_reserve_space(inode,
-						     reserved_clusters,
-						     0);
-	}
-
 map_out:
 	map->m_flags |= EXT4_MAP_MAPPED;
 	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4482,77 +4388,39 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	map->m_flags |= EXT4_MAP_NEW;
 
 	/*
-	 * Update reserved blocks/metadata blocks after successful
-	 * block allocation which had been deferred till now.
+	 * Reduce the reserved cluster count to reflect successful deferred
+	 * allocation of delayed allocated clusters or direct allocation of
+	 * clusters discovered to be delayed allocated.  Once allocated, a
+	 * cluster is not included in the reserved count.
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-		unsigned int reserved_clusters;
-		/*
-		 * Check how many clusters we had reserved this allocated range
-		 */
-		reserved_clusters = get_reserved_cluster_alloc(inode,
-						map->m_lblk, allocated);
-		if (!map_from_cluster) {
-			BUG_ON(allocated_clusters < reserved_clusters);
-			if (reserved_clusters < allocated_clusters) {
-				struct ext4_inode_info *ei = EXT4_I(inode);
-				int reservation = allocated_clusters -
-						  reserved_clusters;
-				/*
-				 * It seems we claimed few clusters outside of
-				 * the range of this allocation. We should give
-				 * it back to the reservation pool. This can
-				 * happen in the following case:
-				 *
-				 * * Suppose s_cluster_ratio is 4 (i.e., each
-				 *   cluster has 4 blocks. Thus, the clusters
-				 *   are [0-3],[4-7],[8-11]...
-				 * * First comes delayed allocation write for
-				 *   logical blocks 10 & 11. Since there were no
-				 *   previous delayed allocated blocks in the
-				 *   range [8-11], we would reserve 1 cluster
-				 *   for it.
-				 * * Next comes write for logical blocks 3 to 8.
-				 *   In this case, we will reserve 2 clusters
-				 *   (for [0-3] and [4-7]; and not for [8-11] as
-				 *   that range has a delayed allocated blocks.
-				 *   Thus total reserved clusters now becomes 3.
-				 * * Now, during the delayed allocation writeout
-				 *   time, we will first write blocks [3-8] and
-				 *   allocate 3 clusters for writing these
-				 *   blocks. Also, we would claim all these
-				 *   three clusters above.
-				 * * Now when we come here to writeout the
-				 *   blocks [10-11], we would expect to claim
-				 *   the reservation of 1 cluster we had made
-				 *   (and we would claim it since there are no
-				 *   more delayed allocated blocks in the range
-				 *   [8-11]. But our reserved cluster count had
-				 *   already gone to 0.
-				 *
-				 * Thus, at the step 4 above when we determine
-				 * that there are still some unwritten delayed
-				 * allocated blocks outside of our current
-				 * block range, we should increment the
-				 * reserved clusters count so that when the
-				 * remaining blocks finally gets written, we
-				 * could claim them.
-				 */
-				dquot_reserve_block(inode,
-					EXT4_C2B(sbi, reservation));
-				spin_lock(&ei->i_block_reservation_lock);
-				ei->i_reserved_data_blocks += reservation;
-				spin_unlock(&ei->i_block_reservation_lock);
-			}
+	if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 			/*
-			 * We will claim quota for all newly allocated blocks.
-			 * We're updating the reserved space *after* the
-			 * correction above so we do not accidentally free
-			 * all the metadata reservation because we might
-			 * actually need it later on.
+			 * When allocating delayed allocated clusters, simply
+			 * reduce the reserved cluster count and claim quota
 			 */
 			ext4_da_update_reserve_space(inode, allocated_clusters,
 							1);
+		} else {
+			ext4_lblk_t lblk, len;
+			unsigned int n;
+
+			/*
+			 * When allocating non-delayed allocated clusters
+			 * (from fallocate, filemap, DIO, or clusters
+			 * allocated when delalloc has been disabled by
+			 * ext4_nonda_switch), reduce the reserved cluster
+			 * count by the number of allocated clusters that
+			 * have previously been delayed allocated.  Quota
+			 * has been claimed by ext4_mb_new_blocks() above,
+			 * so release the quota reservations made for any
+			 * previously delayed allocated clusters.
+			 */
+			lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+			len = allocated_clusters << sbi->s_cluster_bits;
+			n = ext4_es_delayed_clu(inode, lblk, len);
+			if (n > 0)
+				ext4_da_update_reserve_space(inode, (int) n, 0);
+		}
+	}
 
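
The decision made by the hunk above can be read in isolation in this
compilable userspace sketch; struct map_ctx, count_delonly_clusters(),
and release_reservations() are hypothetical stand-ins for the mount
option and mapping flags, ext4_es_delayed_clu(), and
ext4_da_update_reserve_space():

#include <stdbool.h>
#include <stdio.h>

struct map_ctx {
	bool delalloc_enabled;	/* test_opt(sb, DELALLOC) */
	bool map_from_cluster;	/* mapped from an already allocated cluster */
	bool delalloc_reserve;	/* EXT4_GET_BLOCKS_DELALLOC_RESERVE set */
};

/* stub for ext4_es_delayed_clu(): pretend two of the allocated
 * clusters were previously delayed allocated */
static unsigned int count_delonly_clusters(void)
{
	return 2;
}

/* stub for ext4_da_update_reserve_space() */
static void release_reservations(unsigned int clusters, bool claim_quota)
{
	printf("release %u cluster reservation(s)%s\n", clusters,
	       claim_quota ? ", claiming quota" : "");
}

static void account_after_allocation(const struct map_ctx *ctx,
				     unsigned int allocated_clusters)
{
	/* blocks carved from a cluster allocated earlier carry no
	 * reservation of their own, so there is nothing to release */
	if (!ctx->delalloc_enabled || ctx->map_from_cluster)
		return;

	if (ctx->delalloc_reserve) {
		/* delayed allocation writeback: every allocated cluster
		 * held a reservation; release them all and claim quota */
		release_reservations(allocated_clusters, true);
	} else {
		/* non-delayed write (fallocate, DIO, nonda fallback):
		 * release only the reservations made for clusters that
		 * were previously delayed allocated; quota was already
		 * claimed by the block allocator */
		unsigned int n = count_delonly_clusters();

		if (n > 0)
			release_reservations(n, false);
	}
}

int main(void)
{
	struct map_ctx direct_write = { true, false, false };

	account_after_allocation(&direct_write, 4);
	return 0;
}
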
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c5d456e12062..c92fbf444d08 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -150,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 		       struct ext4_inode_info *locked_ei);
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+			     ext4_lblk_t len);
 
 int __init ext4_init_es(void)
 {
@@ -808,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	struct extent_status newes;
 	ext4_lblk_t end = lblk + len - 1;
 	int err = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
 		 lblk, len, pblk, status, inode->i_ino);
@@ -844,6 +847,11 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
 		err = 0;
 
+	if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
+	    (status & EXTENT_STATUS_WRITTEN ||
+	     status & EXTENT_STATUS_UNWRITTEN))
+		__revise_pending(inode, lblk, len);
+
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
@@ -1605,3 +1613,170 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 
 	return err;
 }
+
+/*
+ * __es_delayed_clu - count number of clusters containing blocks that
+ *                    are delayed only
+ *
+ * @inode - file containing block range
+ * @start - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns the number of clusters containing only delayed (not delayed
+ * and unwritten) blocks in the range specified by @start and @end.  Any
+ * cluster or part of a cluster within the range and containing a delayed
+ * and not unwritten block within the range is counted as a whole cluster.
+ */
+static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
+				     ext4_lblk_t end)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct extent_status *es;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct rb_node *node;
+	ext4_lblk_t first_lclu, last_lclu;
+	unsigned long long last_counted_lclu;
+	unsigned int n = 0;
+
+	/* guaranteed to be unequal to any ext4_lblk_t value */
+	last_counted_lclu = ~0ULL;
+
+	es = __es_tree_search(&tree->root, start);
+
+	while (es && (es->es_lblk <= end)) {
+		if (ext4_es_is_delonly(es)) {
+			if (es->es_lblk <= start)
+				first_lclu = EXT4_B2C(sbi, start);
+			else
+				first_lclu = EXT4_B2C(sbi, es->es_lblk);
+
+			if (ext4_es_end(es) >= end)
+				last_lclu = EXT4_B2C(sbi, end);
+			else
+				last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
+
+			if (first_lclu == last_counted_lclu)
+				n += last_lclu - first_lclu;
+			else
+				n += last_lclu - first_lclu + 1;
+			last_counted_lclu = last_lclu;
+		}
+		node = rb_next(&es->rb_node);
+		if (!node)
+			break;
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	return n;
+}
+
+/*
+ * ext4_es_delayed_clu - count number of clusters containing blocks that
+ *                       are delayed only
+ *
+ * @inode - file containing block range
+ * @lblk - logical block defining start of range
+ * @len - number of blocks in range
+ *
+ * Locking for external use of __es_delayed_clu().
+ */
+unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_lblk_t end;
+	unsigned int n;
+
+	if (len == 0)
+		return 0;
+
+	end = lblk + len - 1;
+	WARN_ON(end < lblk);
+
+	read_lock(&ei->i_es_lock);
+
+	n = __es_delayed_clu(inode, lblk, end);
+
+	read_unlock(&ei->i_es_lock);
+
+	return n;
+}
+
+/*
+ * __revise_pending - makes, cancels, or leaves unchanged pending cluster
+ *                    reservations for a specified block range depending
+ *                    upon the presence or absence of delayed blocks
+ *                    outside the range within clusters at the ends of the
+ *                    range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len  - length of range in blocks
+ *
+ * Used after a newly allocated extent is added to the extents status tree.
+ * Requires that the extents in the range have either written or unwritten
+ * status.  Must be called while holding i_es_lock.
+ */
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+			     ext4_lblk_t len)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t end = lblk + len - 1;
+	ext4_lblk_t first, last;
+	bool f_del = false, l_del = false;
+
+	if (len == 0)
+		return;
+
+	/*
+	 * Two cases - block range within single cluster and block range
+	 * spanning two or more clusters.  Note that a cluster belonging
+	 * to a range starting and/or ending on a cluster boundary is treated
+	 * as if it does not contain a delayed extent.  The new range may
+	 * have allocated space for previously delayed blocks out to the
+	 * cluster boundary, requiring that any pre-existing pending
+	 * reservation be canceled.  Because this code only looks at blocks
+	 * outside the range, it should revise pending reservations
+	 * correctly even if the extent represented by the range can't be
+	 * inserted in the extents status tree due to ENOSPC.
+	 */
+
+	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
+		first = EXT4_LBLK_CMASK(sbi, lblk);
+		if (first != lblk)
+			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+						first, lblk - 1);
+		if (f_del) {
+			__insert_pending(inode, first);
+		} else {
+			last = EXT4_LBLK_CMASK(sbi, end) +
+			       sbi->s_cluster_ratio - 1;
+			if (last != end)
+				l_del = __es_scan_range(inode,
+							&ext4_es_is_delonly,
+							end + 1, last);
+			if (l_del)
+				__insert_pending(inode, last);
+			else
+				__remove_pending(inode, last);
+		}
+	} else {
+		first = EXT4_LBLK_CMASK(sbi, lblk);
+		if (first != lblk)
+			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+						first, lblk - 1);
+		if (f_del)
+			__insert_pending(inode, first);
+		else
+			__remove_pending(inode, first);
+
+		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
+		if (last != end)
+			l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+						end + 1, last);
+		if (l_del)
+			__insert_pending(inode, last);
+		else
+			__remove_pending(inode, last);
+	}
+}
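
The counting loop in __es_delayed_clu() above may be easier to follow
in this self-contained userspace sketch; the sorted array stands in
for the walk over delayed-only extents in the status tree, and B2C()
for EXT4_B2C() with a 4-block cluster:

#include <stdio.h>

#define CLUSTER_BITS	2
#define B2C(lblk)	((lblk) >> CLUSTER_BITS)

struct delonly_extent {
	unsigned int lblk;	/* first logical block */
	unsigned int end;	/* last logical block */
};

/*
 * Counts the clusters touched by delayed-only extents within
 * [start, end].  The sentinel can't equal any 32-bit cluster number,
 * so a cluster shared by adjacent extents is never counted twice --
 * the same trick last_counted_lclu plays above.
 */
static unsigned int delayed_clusters(const struct delonly_extent *es,
				     int nr, unsigned int start,
				     unsigned int end)
{
	unsigned long long last_counted = ~0ULL;
	unsigned int n = 0;

	for (int i = 0; i < nr; i++) {
		/* clamp each extent to the range, as above */
		unsigned int first_clu =
			B2C(es[i].lblk <= start ? start : es[i].lblk);
		unsigned int last_clu =
			B2C(es[i].end >= end ? end : es[i].end);

		if (first_clu == last_counted)
			n += last_clu - first_clu;	/* skip shared cluster */
		else
			n += last_clu - first_clu + 1;
		last_counted = last_clu;
	}
	return n;
}

int main(void)
{
	/* two delayed-only extents sharing cluster 1 ([4-7]) */
	struct delonly_extent es[] = { { 2, 5 }, { 6, 10 } };

	printf("%u\n", delayed_clusters(es, 2, 0, 15));	/* prints 3 */
	return 0;
}
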
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 9d3c676ec623..131a8b7df265 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -244,5 +244,9 @@ extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
 extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 					bool allocated);
+extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+					ext4_lblk_t len);
+extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
+				ext4_lblk_t len);
 
 #endif	/* _EXT4_EXTENTS_STATUS_H */
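
Lastly, the boundary test that __revise_pending() applies to the end
clusters of a multi-cluster range can be sketched as compilable
userspace code; has_delonly(), insert_pending(), and remove_pending()
are hypothetical stubs for __es_scan_range(..., &ext4_es_is_delonly,
...), __insert_pending(), and __remove_pending():

#include <stdbool.h>
#include <stdio.h>

#define CLUSTER_BITS	2
#define CLUSTER_SIZE	(1U << CLUSTER_BITS)
#define B2C(lblk)	((lblk) >> CLUSTER_BITS)
#define CMASK(lblk)	((lblk) & ~(CLUSTER_SIZE - 1))

/* stub for __es_scan_range(): pretend blocks 4-5 stay delayed only */
static bool has_delonly(unsigned int from, unsigned int to)
{
	return from <= 5 && to >= 4;
}

static void insert_pending(unsigned int block)
{
	printf("keep pending reservation on cluster %u\n", B2C(block));
}

static void remove_pending(unsigned int block)
{
	printf("cancel pending reservation on cluster %u\n", B2C(block));
}

/*
 * Sketch of the multi-cluster branch above: once [lblk, end] has
 * written/unwritten status, a pending reservation survives on an end
 * cluster only if delayed-only blocks remain outside the range; a
 * range that starts or ends on a cluster boundary leaves no such
 * blocks, so any pending reservation there is canceled.
 */
static void revise_ends(unsigned int lblk, unsigned int end)
{
	unsigned int first = CMASK(lblk);
	unsigned int last = CMASK(end) + CLUSTER_SIZE - 1;

	/* partial leading cluster: delayed-only blocks before the range? */
	if (first != lblk && has_delonly(first, lblk - 1))
		insert_pending(first);
	else
		remove_pending(first);

	/* partial trailing cluster: delayed-only blocks after the range? */
	if (last != end && has_delonly(end + 1, last))
		insert_pending(last);
	else
		remove_pending(last);
}

int main(void)
{
	/* range [6-13]: blocks 4-5 remain delayed, blocks 14-15 do not */
	revise_ends(6, 13);
	return 0;
}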