1 ext4: fix reserved cluster accounting at delayed write time
3 From: Eric Whitney <enwlinux@gmail.com>
5 The code in ext4_da_map_blocks sometimes reserves space for more
6 delayed allocated clusters than it should, resulting in premature
7 ENOSPC, exceeded quota, and inaccurate free space reporting.
9 Fix this by checking for written and unwritten blocks shared in the
10 same cluster with the newly delayed allocated block. A cluster
11 reservation should not be made for a cluster for which physical space
12 has already been allocated.
14 Signed-off-by: Eric Whitney <enwlinux@gmail.com>
15 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
17 fs/ext4/ext4.h | 1 +
18 fs/ext4/extents.c | 79 +++++++++++++++++++++++++++++++++++++++++++++
19 fs/ext4/extents_status.c | 53 ++++++++++++++++++++++++++++++
20 fs/ext4/extents_status.h | 12 +++++++
21 fs/ext4/inode.c | 79 ++++++++++++++++++++++++++++++++++-----------
22 include/trace/events/ext4.h | 35 ++++++++++++++++++++
23 6 files changed, 241 insertions(+), 18 deletions(-)
25 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
26 index fc0f41dbf90b..d85fd5c8a2c4 100644
29 @@ -3155,6 +3155,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
30 struct inode *inode2, ext4_lblk_t lblk1,
31 ext4_lblk_t lblk2, ext4_lblk_t count,
32 int mark_unwritten,int *err);
33 +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
36 extern void ext4_double_down_write_data_sem(struct inode *first,
37 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
38 index 95796f00e4e6..26481e543312 100644
39 --- a/fs/ext4/extents.c
40 +++ b/fs/ext4/extents.c
41 @@ -5930,3 +5930,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
43 return replaced_count;
47 + * ext4_clu_mapped - determine whether any block in a logical cluster has
48 + * been mapped to a physical cluster
50 + * @inode - file containing the logical cluster
51 + * @lclu - logical cluster of interest
53 + * Returns 1 if any block in the logical cluster is mapped, signifying
54 + * that a physical cluster has been allocated for it. Otherwise,
55 + * returns 0. Can also return negative error codes. Derived from
56 + * ext4_ext_map_blocks().
58 +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
60 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
61 + struct ext4_ext_path *path;
62 + int depth, mapped = 0, err = 0;
63 + struct ext4_extent *extent;
64 + ext4_lblk_t first_lblk, first_lclu, last_lclu;
66 + /* search for the extent closest to the first block in the cluster */
67 + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
69 + err = PTR_ERR(path);
74 + depth = ext_depth(inode);
77 + * A consistent leaf must not be empty. This situation is possible,
78 + * though, _during_ tree modification, and it's why an assert can't
79 + * be put in ext4_find_extent().
81 + if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
82 + EXT4_ERROR_INODE(inode,
83 + "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
84 + (unsigned long) EXT4_C2B(sbi, lclu),
85 + depth, path[depth].p_block);
86 + err = -EFSCORRUPTED;
90 + extent = path[depth].p_ext;
92 + /* can't be mapped if the extent tree is empty */
96 + first_lblk = le32_to_cpu(extent->ee_block);
97 + first_lclu = EXT4_B2C(sbi, first_lblk);
100 + * Three possible outcomes at this point - found extent spanning
101 + * the target cluster, to the left of the target cluster, or to the
102 + * right of the target cluster. The first two cases are handled here.
103 + * The last case indicates the target cluster is not mapped.
105 + if (lclu >= first_lclu) {
106 + last_lclu = EXT4_B2C(sbi, first_lblk +
107 + ext4_ext_get_actual_len(extent) - 1);
108 + if (lclu <= last_lclu) {
111 + first_lblk = ext4_ext_next_allocated_block(path);
112 + first_lclu = EXT4_B2C(sbi, first_lblk);
113 + if (lclu == first_lclu)
119 + ext4_ext_drop_refs(path);
122 + return err ? err : mapped;
124 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
125 index 194785ce890a..c5d456e12062 100644
126 --- a/fs/ext4/extents_status.c
127 +++ b/fs/ext4/extents_status.c
128 @@ -1552,3 +1552,56 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
134 + * ext4_es_insert_delayed_block - adds a delayed block to the extents status
135 + * tree, adding a pending reservation where needed
138 + * @inode - file containing the newly added block
139 + * @lblk - logical block to be added
140 + * @allocated - indicates whether a physical cluster has been allocated for
141 + * the logical cluster that contains the block
143 + * Returns 0 on success, negative error code on failure.
145 +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
148 + struct extent_status newes;
151 + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
152 + lblk, inode->i_ino);
154 + newes.es_lblk = lblk;
156 + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
157 + trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
159 + ext4_es_insert_extent_check(inode, &newes);
161 + write_lock(&EXT4_I(inode)->i_es_lock);
163 + err = __es_remove_extent(inode, lblk, lblk);
167 + err = __es_insert_extent(inode, &newes);
168 + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
169 + 128, EXT4_I(inode)))
175 + __insert_pending(inode, lblk);
178 + write_unlock(&EXT4_I(inode)->i_es_lock);
180 + ext4_es_print_tree(inode);
181 + ext4_print_pending_tree(inode);
185 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
186 index 379b7171c67c..9d3c676ec623 100644
187 --- a/fs/ext4/extents_status.h
188 +++ b/fs/ext4/extents_status.h
189 @@ -178,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es)
190 return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
193 +static inline int ext4_es_is_mapped(struct extent_status *es)
195 + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
198 +static inline int ext4_es_is_delonly(struct extent_status *es)
200 + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
203 static inline void ext4_es_set_referenced(struct extent_status *es)
205 es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
206 @@ -232,5 +242,7 @@ extern void ext4_exit_pending(void);
207 extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
208 extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
209 extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
210 +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
213 #endif /* _EXT4_EXTENTS_STATUS_H */
214 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
215 index b83bf3308b5e..57c6dd38f071 100644
216 --- a/fs/ext4/inode.c
217 +++ b/fs/ext4/inode.c
218 @@ -1781,6 +1781,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
222 + * ext4_insert_delayed_block - adds a delayed block to the extents status
223 + * tree, incrementing the reserved cluster/block
224 + * count or making a pending reservation
227 + * @inode - file containing the newly added block
228 + * @lblk - logical block to be added
230 + * Returns 0 on success, negative error code on failure.
232 +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
234 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
236 + bool allocated = false;
239 + * If the cluster containing lblk is shared with a delayed,
240 + * written, or unwritten extent in a bigalloc file system, it's
241 + * already been accounted for and does not need to be reserved.
242 + * A pending reservation must be made for the cluster if it's
243 + * shared with a written or unwritten extent and doesn't already
244 + * have one. Written and unwritten extents can be purged from the
245 + * extents status tree if the system is under memory pressure, so
246 + * it's necessary to examine the extent tree if a search of the
247 + * extents status tree doesn't get a match.
249 + if (sbi->s_cluster_ratio == 1) {
250 + ret = ext4_da_reserve_space(inode);
251 + if (ret != 0) /* ENOSPC */
253 + } else { /* bigalloc */
254 + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
255 + if (!ext4_es_scan_clu(inode,
256 + &ext4_es_is_mapped, lblk)) {
257 + ret = ext4_clu_mapped(inode,
258 + EXT4_B2C(sbi, lblk));
262 + ret = ext4_da_reserve_space(inode);
263 + if (ret != 0) /* ENOSPC */
274 + ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
281 * This function is grabs code from the very beginning of
282 * ext4_map_blocks, but assumes that the caller is from delayed write
283 * time. This function looks up the requested blocks and sets the
284 @@ -1864,25 +1923,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
285 * XXX: __block_prepare_write() unmaps passed block,
289 - * If the block was allocated from previously allocated cluster,
290 - * then we don't need to reserve it again. However we still need
291 - * to reserve metadata for every block we're going to write.
293 - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
294 - !ext4_es_scan_clu(inode,
295 - &ext4_es_is_delayed, map->m_lblk)) {
296 - ret = ext4_da_reserve_space(inode);
298 - /* not enough space to reserve */
304 - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
305 - ~0, EXTENT_STATUS_DELAYED);
307 + ret = ext4_insert_delayed_block(inode, map->m_lblk);
312 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
313 index 7849b7f8fd9d..6d7a943f849c 100644
314 --- a/include/trace/events/ext4.h
315 +++ b/include/trace/events/ext4.h
316 @@ -2512,6 +2512,41 @@ TRACE_EVENT(ext4_es_shrink,
317 __entry->scan_time, __entry->nr_skipped, __entry->retried)
320 +TRACE_EVENT(ext4_es_insert_delayed_block,
321 + TP_PROTO(struct inode *inode, struct extent_status *es,
324 + TP_ARGS(inode, es, allocated),
327 + __field( dev_t, dev )
328 + __field( ino_t, ino )
329 + __field( ext4_lblk_t, lblk )
330 + __field( ext4_lblk_t, len )
331 + __field( ext4_fsblk_t, pblk )
332 + __field( char, status )
333 + __field( bool, allocated )
337 + __entry->dev = inode->i_sb->s_dev;
338 + __entry->ino = inode->i_ino;
339 + __entry->lblk = es->es_lblk;
340 + __entry->len = es->es_len;
341 + __entry->pblk = ext4_es_pblock(es);
342 + __entry->status = ext4_es_status(es);
343 + __entry->allocated = allocated;
346 + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
348 + MAJOR(__entry->dev), MINOR(__entry->dev),
349 + (unsigned long) __entry->ino,
350 + __entry->lblk, __entry->len,
351 + __entry->pblk, show_extent_status(__entry->status),
352 + __entry->allocated)
356 DECLARE_EVENT_CLASS(ext4_fsmap_class,
357 TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,