Updated to rc6; verified to pass fsx and dbench.
[ext4-patch-queue.git] / ext4-fallocate-6-uninit_write_support
blob01a04c3d606560f91e8e4b171e142d671931e626
1 write support for preallocated blocks
3 This patch adds write support to the uninitialized extents that get
4 created when a preallocation is done using fallocate(). It takes care of
5 splitting the extents into multiple (up to three) extents and merging the
6 new split extents with neighbouring ones, if possible.
8 Signed-off-by: Amit Arora <aarora@in.ibm.com>
10 Index: linux-2.6.22-rc4/fs/ext4/extents.c
11 ===================================================================
12 --- linux-2.6.22-rc4.orig/fs/ext4/extents.c
13 +++ linux-2.6.22-rc4/fs/ext4/extents.c
14 @@ -1167,6 +1167,53 @@ ext4_can_extents_be_merged(struct inode 
15  }
17  /*
18 + * This function tries to merge the "ex" extent to the next extent in the tree.
19 + * It always tries to merge towards right. If you want to merge towards
20 + * left, pass "ex - 1" as argument instead of "ex".
21 + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
22 + * 1 if they got merged.
23 + */
24 +int ext4_ext_try_to_merge(struct inode *inode,
25 +                         struct ext4_ext_path *path,
26 +                         struct ext4_extent *ex)
28 +       struct ext4_extent_header *eh;
29 +       unsigned int depth, len;
30 +       int merge_done = 0;
31 +       int uninitialized = 0;
33 +       depth = ext_depth(inode);
34 +       BUG_ON(path[depth].p_hdr == NULL);
35 +       eh = path[depth].p_hdr;
37 +       while (ex < EXT_LAST_EXTENT(eh)) {
38 +               if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
39 +                       break;
40 +               /* merge with next extent! */
41 +               if (ext4_ext_is_uninitialized(ex))
42 +                       uninitialized = 1;
43 +               ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
44 +                               + ext4_ext_get_actual_len(ex + 1));
45 +               if (uninitialized)
46 +                       ext4_ext_mark_uninitialized(ex);
48 +               if (ex + 1 < EXT_LAST_EXTENT(eh)) {
49 +                       len = (EXT_LAST_EXTENT(eh) - ex - 1)
50 +                               * sizeof(struct ext4_extent);
51 +                       memmove(ex + 1, ex + 2, len);
52 +               }
53 +               eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1);
54 +               merge_done = 1;
55 +               WARN_ON(eh->eh_entries == 0);
56 +               if (!eh->eh_entries)
57 +                       ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
58 +                          "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
59 +       }
61 +       return merge_done;
64 +/*
65   * check if a portion of the "newext" extent overlaps with an
66   * existing extent.
67   *
68 @@ -1354,25 +1401,7 @@ has_space:
70  merge:
71         /* try to merge extents to the right */
72 -       while (nearex < EXT_LAST_EXTENT(eh)) {
73 -               if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
74 -                       break;
75 -               /* merge with next extent! */
76 -               if (ext4_ext_is_uninitialized(nearex))
77 -                       uninitialized = 1;
78 -               nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
79 -                                       + ext4_ext_get_actual_len(nearex + 1));
80 -               if (uninitialized)
81 -                       ext4_ext_mark_uninitialized(nearex);
83 -               if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
84 -                       len = (EXT_LAST_EXTENT(eh) - nearex - 1)
85 -                                       * sizeof(struct ext4_extent);
86 -                       memmove(nearex + 1, nearex + 2, len);
87 -               }
88 -               eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
89 -               BUG_ON(eh->eh_entries == 0);
90 -       }
91 +       ext4_ext_try_to_merge(inode, path, nearex);
93         /* try to merge extents to the left */
95 @@ -2035,15 +2064,158 @@ void ext4_ext_release(struct super_block
96  #endif
97  }
99 +/*
100 + * This function is called by ext4_ext_get_blocks() if someone tries to write
101 + * to an uninitialized extent. It may result in splitting the uninitialized
102 + * extent into multiple extents (up to three - one initialized and two
103 + * uninitialized).
104 + * There are three possibilities:
105 + *   a> There is no split required: Entire extent should be initialized
106 + *   b> Splits in two extents: Write is happening at either end of the extent
107 + *   c> Splits in three extents: Someone is writing in the middle of the extent
108 + */
109 +int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
110 +                                       struct ext4_ext_path *path,
111 +                                       ext4_fsblk_t iblock,
112 +                                       unsigned long max_blocks)
114 +       struct ext4_extent *ex, newex;
115 +       struct ext4_extent *ex1 = NULL;
116 +       struct ext4_extent *ex2 = NULL;
117 +       struct ext4_extent *ex3 = NULL;
118 +       struct ext4_extent_header *eh;
119 +       unsigned int allocated, ee_block, ee_len, depth;
120 +       ext4_fsblk_t newblock;
121 +       int err = 0;
122 +       int ret = 0;
124 +       depth = ext_depth(inode);
125 +       eh = path[depth].p_hdr;
126 +       ex = path[depth].p_ext;
127 +       ee_block = le32_to_cpu(ex->ee_block);
128 +       ee_len = ext4_ext_get_actual_len(ex);
129 +       allocated = ee_len - (iblock - ee_block);
130 +       newblock = iblock - ee_block + ext_pblock(ex);
131 +       ex2 = ex;
133 +       /* ex1: ee_block to iblock - 1 : uninitialized */
134 +       if (iblock > ee_block) {
135 +               ex1 = ex;
136 +               ex1->ee_len = cpu_to_le16(iblock - ee_block);
137 +               ext4_ext_mark_uninitialized(ex1);
138 +               ex2 = &newex;
139 +       }
140 +       /*
141 +        * for sanity, update the length of the ex2 extent before
142 +        * we insert ex3, if ex1 is NULL. This is to avoid temporary
143 +        * overlap of blocks.
144 +        */
145 +       if (!ex1 && allocated > max_blocks)
146 +               ex2->ee_len = cpu_to_le16(max_blocks);
147 +       /* ex3: to ee_block + ee_len : uninitialized */
148 +       if (allocated > max_blocks) {
149 +               unsigned int newdepth;
150 +               ex3 = &newex;
151 +               ex3->ee_block = cpu_to_le32(iblock + max_blocks);
152 +               ext4_ext_store_pblock(ex3, newblock + max_blocks);
153 +               ex3->ee_len = cpu_to_le16(allocated - max_blocks);
154 +               ext4_ext_mark_uninitialized(ex3);
155 +               err = ext4_ext_insert_extent(handle, inode, path, ex3);
156 +               if (err)
157 +                       goto out;
158 +               /*
159 +                * The depth, and hence eh & ex might change
160 +                * as part of the insert above.
161 +                */
162 +               newdepth = ext_depth(inode);
163 +               if (newdepth != depth) {
164 +                       depth = newdepth;
165 +                       path = ext4_ext_find_extent(inode, iblock, NULL);
166 +                       if (IS_ERR(path)) {
167 +                               err = PTR_ERR(path);
168 +                               path = NULL;
169 +                               goto out;
170 +                       }
171 +                       eh = path[depth].p_hdr;
172 +                       ex = path[depth].p_ext;
173 +                       if (ex2 != &newex)
174 +                               ex2 = ex;
175 +               }
176 +               allocated = max_blocks;
177 +       }
178 +       /*
179 +        * If there was a change of depth as part of the
180 +        * insertion of ex3 above, we need to update the length
181 +        * of the ex1 extent again here
182 +        */
183 +       if (ex1 && ex1 != ex) {
184 +               ex1 = ex;
185 +               ex1->ee_len = cpu_to_le16(iblock - ee_block);
186 +               ext4_ext_mark_uninitialized(ex1);
187 +               ex2 = &newex;
188 +       }
189 +       /* ex2: iblock to iblock + max_blocks-1 : initialized */
190 +       ex2->ee_block = cpu_to_le32(iblock);
191 +       ex2->ee_start = cpu_to_le32(newblock);
192 +       ext4_ext_store_pblock(ex2, newblock);
193 +       ex2->ee_len = cpu_to_le16(allocated);
194 +       if (ex2 != ex)
195 +               goto insert;
196 +       err = ext4_ext_get_access(handle, inode, path + depth);
197 +       if (err)
198 +               goto out;
199 +       /*
200 +        * New (initialized) extent starts from the first block
201 +        * in the current extent. i.e., ex2 == ex
202 +        * We have to see if it can be merged with the extent
203 +        * on the left.
204 +        */
205 +       if (ex2 > EXT_FIRST_EXTENT(eh)) {
206 +               /*
207 +                * To merge left, pass "ex2 - 1" to try_to_merge(),
208 +                * since it merges towards right _only_.
209 +                */
210 +               ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
211 +               if (ret) {
212 +                       err = ext4_ext_correct_indexes(handle, inode, path);
213 +                       if (err)
214 +                               goto out;
215 +                       depth = ext_depth(inode);
216 +                       ex2--;
217 +               }
218 +       }
219 +       /*
220 +        * Try to Merge towards right. This might be required
221 +        * only when the whole extent is being written to.
222 +        * i.e. ex2 == ex and ex3 == NULL.
223 +        */
224 +       if (!ex3) {
225 +               ret = ext4_ext_try_to_merge(inode, path, ex2);
226 +               if (ret) {
227 +                       err = ext4_ext_correct_indexes(handle, inode, path);
228 +                       if (err)
229 +                               goto out;
230 +               }
231 +       }
232 +       /* Mark modified extent as dirty */
233 +       err = ext4_ext_dirty(handle, inode, path + depth);
234 +       goto out;
235 +insert:
236 +       err = ext4_ext_insert_extent(handle, inode, path, &newex);
237 +out:
238 +       return err ? err : allocated;
241  int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
242                         ext4_fsblk_t iblock,
243                         unsigned long max_blocks, struct buffer_head *bh_result,
244                         int create, int extend_disksize)
246         struct ext4_ext_path *path = NULL;
247 +       struct ext4_extent_header *eh;
248         struct ext4_extent newex, *ex;
249         ext4_fsblk_t goal, newblock;
250 -       int err = 0, depth;
251 +       int err = 0, depth, ret;
252         unsigned long allocated = 0;
254         __clear_bit(BH_New, &bh_result->b_state);
255 @@ -2056,8 +2228,10 @@ int ext4_ext_get_blocks(handle_t *handle
256         if (goal) {
257                 if (goal == EXT4_EXT_CACHE_GAP) {
258                         if (!create) {
259 -                               /* block isn't allocated yet and
260 -                                * user doesn't want to allocate it */
261 +                               /*
262 +                                * block isn't allocated yet and
263 +                                * user doesn't want to allocate it
264 +                                */
265                                 goto out2;
266                         }
267                         /* we should allocate requested block */
268 @@ -2091,6 +2265,7 @@ int ext4_ext_get_blocks(handle_t *handle
269          * this is why assert can't be put in ext4_ext_find_extent()
270          */
271         BUG_ON(path[depth].p_ext == NULL && depth != 0);
272 +       eh = path[depth].p_hdr;
274         ex = path[depth].p_ext;
275         if (ex) {
276 @@ -2099,13 +2274,9 @@ int ext4_ext_get_blocks(handle_t *handle
277                 unsigned short ee_len;
279                 /*
280 -                * Allow future support for preallocated extents to be added
281 -                * as an RO_COMPAT feature:
282                  * Uninitialized extents are treated as holes, except that
283 -                * we avoid (fail) allocating new blocks during a write.
284 +                * we split out initialized portions during a write.
285                  */
286 -               if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN)
287 -                       goto out2;
288                 ee_len = ext4_ext_get_actual_len(ex);
289                 /* if found extent covers block, simply return it */
290                 if (iblock >= ee_block && iblock < ee_block + ee_len) {
291 @@ -2114,12 +2285,27 @@ int ext4_ext_get_blocks(handle_t *handle
292                         allocated = ee_len - (iblock - ee_block);
293                         ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
294                                         ee_block, ee_len, newblock);
296                         /* Do not put uninitialized extent in the cache */
297 -                       if (!ext4_ext_is_uninitialized(ex))
298 +                       if (!ext4_ext_is_uninitialized(ex)) {
299                                 ext4_ext_put_in_cache(inode, ee_block,
300                                                         ee_len, ee_start,
301                                                         EXT4_EXT_CACHE_EXTENT);
302 -                       goto out;
303 +                               goto out;
304 +                       }
305 +                       if (create == EXT4_CREATE_UNINITIALIZED_EXT)
306 +                               goto out;
307 +                       if (!create)
308 +                               goto out2;
310 +                       ret = ext4_ext_convert_to_initialized(handle, inode,
311 +                                                               path, iblock,
312 +                                                               max_blocks);
313 +                       if (ret <= 0)
314 +                               goto out2;
315 +                       else
316 +                               allocated = ret;
317 +                       goto outnew;
318                 }
319         }
321 @@ -2128,8 +2314,10 @@ int ext4_ext_get_blocks(handle_t *handle
322          * we couldn't try to create block if create flag is zero
323          */
324         if (!create) {
325 -               /* put just found gap into cache to speed up
326 -                * subsequent requests */
327 +               /*
328 +                * put just found gap into cache to speed up
329 +                * subsequent requests
330 +                */
331                 ext4_ext_put_gap_in_cache(inode, path, iblock);
332                 goto out2;
333         }
334 @@ -2175,6 +2363,7 @@ int ext4_ext_get_blocks(handle_t *handle
336         /* previous routine could use block we allocated */
337         newblock = ext_pblock(&newex);
338 +outnew:
339         __set_bit(BH_New, &bh_result->b_state);
341         /* Cache only when it is _not_ an uninitialized extent */
342 @@ -2244,7 +2433,8 @@ void ext4_ext_truncate(struct inode * in
343         err = ext4_ext_remove_space(inode, last_block);
345         /* In a multi-transaction truncate, we only make the final
346 -        * transaction synchronous. */
347 +        * transaction synchronous.
348 +        */
349         if (IS_SYNC(inode))
350                 handle->h_sync = 1;
352 Index: linux-2.6.22-rc4/include/linux/ext4_fs_extents.h
353 ===================================================================
354 --- linux-2.6.22-rc4.orig/include/linux/ext4_fs_extents.h
355 +++ linux-2.6.22-rc4/include/linux/ext4_fs_extents.h
356 @@ -205,6 +205,9 @@ static inline int ext4_ext_get_actual_le
358  extern int ext4_extent_tree_init(handle_t *, struct inode *);
359  extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
360 +extern int ext4_ext_try_to_merge(struct inode *inode,
361 +                                struct ext4_ext_path *path,
362 +                                struct ext4_extent *);
363  extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
364  extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
365  extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);