Check in Darrick's documentation fixes
[ext4-patch-queue.git] / fix-reserved-cluster-accounting-at-delayed-write-time
blobf5c88db7bba7ba59d4d86d68d088ca6855510bdb
1 ext4: fix reserved cluster accounting at delayed write time
3 From: Eric Whitney <enwlinux@gmail.com>
5 The code in ext4_da_map_blocks() sometimes reserves space for more
6 delayed-allocated clusters than it should, resulting in premature
7 ENOSPC, exceeded quota, and inaccurate free space reporting.
9 Fix this by checking for written and unwritten blocks that share a
10 cluster with the newly delayed-allocated block.  A cluster
11 reservation should not be made for a cluster for which physical space
12 has already been allocated.
14 Signed-off-by: Eric Whitney <enwlinux@gmail.com>
15 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
16 ---
17  fs/ext4/ext4.h              |  1 +
18  fs/ext4/extents.c           | 79 +++++++++++++++++++++++++++++++++++++++++++++
19  fs/ext4/extents_status.c    | 53 ++++++++++++++++++++++++++++++
20  fs/ext4/extents_status.h    | 12 +++++++
21  fs/ext4/inode.c             | 79 ++++++++++++++++++++++++++++++++++-----------
22  include/trace/events/ext4.h | 35 ++++++++++++++++++++
23  6 files changed, 241 insertions(+), 18 deletions(-)
25 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
26 index fc0f41dbf90b..d85fd5c8a2c4 100644
27 --- a/fs/ext4/ext4.h
28 +++ b/fs/ext4/ext4.h
29 @@ -3155,6 +3155,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
30                                 struct inode *inode2, ext4_lblk_t lblk1,
31                              ext4_lblk_t lblk2,  ext4_lblk_t count,
32                              int mark_unwritten,int *err);
33 +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
35  /* move_extent.c */
36  extern void ext4_double_down_write_data_sem(struct inode *first,
37 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
38 index 95796f00e4e6..26481e543312 100644
39 --- a/fs/ext4/extents.c
40 +++ b/fs/ext4/extents.c
41 @@ -5930,3 +5930,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
42         }
43         return replaced_count;
44  }
46 +/*
47 + * ext4_clu_mapped - determine whether any block in a logical cluster has
48 + *                   been mapped to a physical cluster
49 + *
50 + * @inode - file containing the logical cluster
51 + * @lclu - logical cluster of interest
52 + *
53 + * Returns 1 if any block in the logical cluster is mapped, signifying
54 + * that a physical cluster has been allocated for it.  Otherwise,
55 + * returns 0.  Can also return negative error codes.  Derived from
56 + * ext4_ext_map_blocks().
57 + */
58 +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
60 +       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
61 +       struct ext4_ext_path *path;
62 +       int depth, mapped = 0, err = 0;
63 +       struct ext4_extent *extent;
64 +       ext4_lblk_t first_lblk, first_lclu, last_lclu;
66 +       /* search for the extent closest to the first block in the cluster */
67 +       path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
68 +       if (IS_ERR(path)) {
69 +               err = PTR_ERR(path);
70 +               path = NULL;
71 +               goto out;
72 +       }
74 +       depth = ext_depth(inode);
76 +       /*
77 +        * A consistent leaf must not be empty.  This situation is possible,
78 +        * though, _during_ tree modification, and it's why an assert can't
79 +        * be put in ext4_find_extent().
80 +        */
81 +       if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
82 +               EXT4_ERROR_INODE(inode,
83 +                   "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
84 +                                (unsigned long) EXT4_C2B(sbi, lclu),
85 +                                depth, path[depth].p_block);
86 +               err = -EFSCORRUPTED;
87 +               goto out;
88 +       }
90 +       extent = path[depth].p_ext;
92 +       /* can't be mapped if the extent tree is empty */
93 +       if (extent == NULL)
94 +               goto out;
96 +       first_lblk = le32_to_cpu(extent->ee_block);
97 +       first_lclu = EXT4_B2C(sbi, first_lblk);
99 +       /*
100 +        * Three possible outcomes at this point - found extent spanning
101 +        * the target cluster, to the left of the target cluster, or to the
102 +        * right of the target cluster.  The first two cases are handled here.
103 +        * The last case indicates the target cluster is not mapped.
104 +        */
105 +       if (lclu >= first_lclu) {
106 +               last_lclu = EXT4_B2C(sbi, first_lblk +
107 +                                    ext4_ext_get_actual_len(extent) - 1);
108 +               if (lclu <= last_lclu) {
109 +                       mapped = 1;
110 +               } else {
111 +                       first_lblk = ext4_ext_next_allocated_block(path);
112 +                       first_lclu = EXT4_B2C(sbi, first_lblk);
113 +                       if (lclu == first_lclu)
114 +                               mapped = 1;
115 +               }
116 +       }
118 +out:
119 +       ext4_ext_drop_refs(path);
120 +       kfree(path);
122 +       return err ? err : mapped;
124 diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
125 index 194785ce890a..c5d456e12062 100644
126 --- a/fs/ext4/extents_status.c
127 +++ b/fs/ext4/extents_status.c
128 @@ -1552,3 +1552,56 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
130         return ret;
134 + * ext4_es_insert_delayed_block - adds a delayed block to the extents status
135 + *                                tree, adding a pending reservation where
136 + *                                needed
137 + *
138 + * @inode - file containing the newly added block
139 + * @lblk - logical block to be added
140 + * @allocated - indicates whether a physical cluster has been allocated for
141 + *              the logical cluster that contains the block
142 + *
143 + * Returns 0 on success, negative error code on failure.
144 + */
145 +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
146 +                                bool allocated)
148 +       struct extent_status newes;
149 +       int err = 0;
151 +       es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
152 +                lblk, inode->i_ino);
154 +       newes.es_lblk = lblk;
155 +       newes.es_len = 1;
156 +       ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
157 +       trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
159 +       ext4_es_insert_extent_check(inode, &newes);
161 +       write_lock(&EXT4_I(inode)->i_es_lock);
163 +       err = __es_remove_extent(inode, lblk, lblk);
164 +       if (err != 0)
165 +               goto error;
166 +retry:
167 +       err = __es_insert_extent(inode, &newes);
168 +       if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
169 +                                         128, EXT4_I(inode)))
170 +               goto retry;
171 +       if (err != 0)
172 +               goto error;
174 +       if (allocated)
175 +               __insert_pending(inode, lblk);
177 +error:
178 +       write_unlock(&EXT4_I(inode)->i_es_lock);
180 +       ext4_es_print_tree(inode);
181 +       ext4_print_pending_tree(inode);
183 +       return err;
185 diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
186 index 379b7171c67c..9d3c676ec623 100644
187 --- a/fs/ext4/extents_status.h
188 +++ b/fs/ext4/extents_status.h
189 @@ -178,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es)
190         return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
193 +static inline int ext4_es_is_mapped(struct extent_status *es)
195 +       return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
198 +static inline int ext4_es_is_delonly(struct extent_status *es)
200 +       return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
203  static inline void ext4_es_set_referenced(struct extent_status *es)
205         es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
206 @@ -232,5 +242,7 @@ extern void ext4_exit_pending(void);
207  extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
208  extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
209  extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
210 +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
211 +                                       bool allocated);
213  #endif /* _EXT4_EXTENTS_STATUS_H */
214 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
215 index b83bf3308b5e..57c6dd38f071 100644
216 --- a/fs/ext4/inode.c
217 +++ b/fs/ext4/inode.c
218 @@ -1781,6 +1781,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
221  /*
222 + * ext4_insert_delayed_block - adds a delayed block to the extents status
223 + *                             tree, incrementing the reserved cluster/block
224 + *                             count or making a pending reservation
225 + *                             where needed
226 + *
227 + * @inode - file containing the newly added block
228 + * @lblk - logical block to be added
229 + *
230 + * Returns 0 on success, negative error code on failure.
231 + */
232 +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
234 +       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
235 +       int ret;
236 +       bool allocated = false;
238 +       /*
239 +        * If the cluster containing lblk is shared with a delayed,
240 +        * written, or unwritten extent in a bigalloc file system, it's
241 +        * already been accounted for and does not need to be reserved.
242 +        * A pending reservation must be made for the cluster if it's
243 +        * shared with a written or unwritten extent and doesn't already
244 +        * have one.  Written and unwritten extents can be purged from the
245 +        * extents status tree if the system is under memory pressure, so
246 +        * it's necessary to examine the extent tree if a search of the
247 +        * extents status tree doesn't get a match.
248 +        */
249 +       if (sbi->s_cluster_ratio == 1) {
250 +               ret = ext4_da_reserve_space(inode);
251 +               if (ret != 0)   /* ENOSPC */
252 +                       goto errout;
253 +       } else {   /* bigalloc */
254 +               if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
255 +                       if (!ext4_es_scan_clu(inode,
256 +                                             &ext4_es_is_mapped, lblk)) {
257 +                               ret = ext4_clu_mapped(inode,
258 +                                                     EXT4_B2C(sbi, lblk));
259 +                               if (ret < 0)
260 +                                       goto errout;
261 +                               if (ret == 0) {
262 +                                       ret = ext4_da_reserve_space(inode);
263 +                                       if (ret != 0)   /* ENOSPC */
264 +                                               goto errout;
265 +                               } else {
266 +                                       allocated = true;
267 +                               }
268 +                       } else {
269 +                               allocated = true;
270 +                       }
271 +               }
272 +       }
274 +       ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
276 +errout:
277 +       return ret;
281   * This function is grabs code from the very beginning of
282   * ext4_map_blocks, but assumes that the caller is from delayed write
283   * time. This function looks up the requested blocks and sets the
284 @@ -1864,25 +1923,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
285                  * XXX: __block_prepare_write() unmaps passed block,
286                  * is it OK?
287                  */
288 -               /*
289 -                * If the block was allocated from previously allocated cluster,
290 -                * then we don't need to reserve it again. However we still need
291 -                * to reserve metadata for every block we're going to write.
292 -                */
293 -               if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
294 -                   !ext4_es_scan_clu(inode,
295 -                                     &ext4_es_is_delayed, map->m_lblk)) {
296 -                       ret = ext4_da_reserve_space(inode);
297 -                       if (ret) {
298 -                               /* not enough space to reserve */
299 -                               retval = ret;
300 -                               goto out_unlock;
301 -                       }
302 -               }
304 -               ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
305 -                                           ~0, EXTENT_STATUS_DELAYED);
306 -               if (ret) {
307 +               ret = ext4_insert_delayed_block(inode, map->m_lblk);
308 +               if (ret != 0) {
309                         retval = ret;
310                         goto out_unlock;
311                 }
312 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
313 index 7849b7f8fd9d..6d7a943f849c 100644
314 --- a/include/trace/events/ext4.h
315 +++ b/include/trace/events/ext4.h
316 @@ -2512,6 +2512,41 @@ TRACE_EVENT(ext4_es_shrink,
317                   __entry->scan_time, __entry->nr_skipped, __entry->retried)
318  );
320 +TRACE_EVENT(ext4_es_insert_delayed_block,
321 +       TP_PROTO(struct inode *inode, struct extent_status *es,
322 +                bool allocated),
324 +       TP_ARGS(inode, es, allocated),
326 +       TP_STRUCT__entry(
327 +               __field(        dev_t,          dev             )
328 +               __field(        ino_t,          ino             )
329 +               __field(        ext4_lblk_t,    lblk            )
330 +               __field(        ext4_lblk_t,    len             )
331 +               __field(        ext4_fsblk_t,   pblk            )
332 +               __field(        char,           status          )
333 +               __field(        bool,           allocated       )
334 +       ),
336 +       TP_fast_assign(
337 +               __entry->dev            = inode->i_sb->s_dev;
338 +               __entry->ino            = inode->i_ino;
339 +               __entry->lblk           = es->es_lblk;
340 +               __entry->len            = es->es_len;
341 +               __entry->pblk           = ext4_es_pblock(es);
342 +               __entry->status         = ext4_es_status(es);
343 +               __entry->allocated      = allocated;
344 +       ),
346 +       TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
347 +                 "allocated %d",
348 +                 MAJOR(__entry->dev), MINOR(__entry->dev),
349 +                 (unsigned long) __entry->ino,
350 +                 __entry->lblk, __entry->len,
351 +                 __entry->pblk, show_extent_status(__entry->status),
352 +                 __entry->allocated)
355  /* fsmap traces */
356  DECLARE_EVENT_CLASS(ext4_fsmap_class,
357         TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
358 -- 
359 2.11.0