Fix grammar in comment.
[ext4-patch-queue.git] / ext4-fix-online-resize-with-mballoc.patch
blob7212475075dead0d141946610802fd016c7170c6
1 ext4: fix online resize with mballoc
3 From: Frederic Bohe <frederic.bohe@bull.net>
5 Update group infos when updating a group's descriptor.
6 Add group infos when adding a group's descriptor.
7 Refresh cache pages used by mb_alloc when changes occur.
8 This will probably need modifications when META_BG resizing is allowed.
10 Signed-off-by: Frederic Bohe <frederic.bohe@bull.net>
11 Signed-off-by: Mingming Cao <cmm@us.ibm.com>
12 ---
13 This patch applies on top of 2.6.26-rc6 + ext4-patch-queue-37b3e39765d8521ba2252bfec81bb91504fa35c8
15 It fixes oopses that occur when a filesystem is resized online while mounted with mballoc.
16 Two oopses have been identified:
17 The first one occurs when unmounting the resized filesystem.
18 The second one happens when trying to write to a group added during the online resize.
20 This patch has been tested with:
21 - small (100MB) and large (5TB) filesystems.
22 - filesystems with 1K blocks and with 4K blocks.
23 - inode size=256 up to 1024.
25 Tests consist of:
26 - online resizing, filling all blocks of the fs, unmounting, fs check
27 - filling all blocks of the fs, online resizing, filling newly added groups, unmounting, fs check
28 - Concurrent file copy/online resize, unmounting, fs check.
30 Non-regression tests:
31 - offline resizing.
32 - online resizing without mballoc.
34 fs/ext4/ext4.h | 4
35 fs/ext4/mballoc.c | 234 +++++++++++++++++++++++++++++++++++++++++-------------
36 fs/ext4/resize.c | 52 +++++++++++-
37 3 files changed, 234 insertions(+), 56 deletions(-)
39 Index: linux-2.6.26-rc8/fs/ext4/ext4.h
40 ===================================================================
41 --- linux-2.6.26-rc8.orig/fs/ext4/ext4.h 2008-07-01 11:32:13.000000000 -0700
42 +++ linux-2.6.26-rc8/fs/ext4/ext4.h 2008-07-01 11:32:15.000000000 -0700
43 @@ -1033,6 +1033,10 @@ extern int __init init_ext4_mballoc(void
44 extern void exit_ext4_mballoc(void);
45 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
46 unsigned long, unsigned long, int, unsigned long *);
47 +extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
48 + ext4_group_t i, struct ext4_group_desc *desc);
49 +extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
50 + ext4_grpblk_t add);
53 /* inode.c */
54 Index: linux-2.6.26-rc8/fs/ext4/mballoc.c
55 ===================================================================
56 --- linux-2.6.26-rc8.orig/fs/ext4/mballoc.c 2008-07-01 11:32:09.000000000 -0700
57 +++ linux-2.6.26-rc8/fs/ext4/mballoc.c 2008-07-01 11:32:15.000000000 -0700
58 @@ -2231,21 +2231,192 @@ ext4_mb_store_history(struct ext4_alloca
59 #define ext4_mb_history_init(sb)
60 #endif
63 +/* Create and initialize ext4_group_info data for the given group. */
64 +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
65 + struct ext4_group_desc *desc)
67 + int i, len;
68 + int metalen = 0;
69 + struct ext4_sb_info *sbi = EXT4_SB(sb);
70 + struct ext4_group_info **meta_group_info;
72 + /*
73 + * First check if this group is the first of a reserved block.
74 + * If it's true, we have to allocate a new table of pointers
75 + * to ext4_group_info structures
76 + */
77 + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
78 + metalen = sizeof(*meta_group_info) <<
79 + EXT4_DESC_PER_BLOCK_BITS(sb);
80 + meta_group_info = kmalloc(metalen, GFP_KERNEL);
81 + if (meta_group_info == NULL) {
82 + printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
83 + "buddy group\n");
84 + goto exit_meta_group_info;
85 + }
86 + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
87 + meta_group_info;
88 + }
90 + /*
91 + * calculate needed size. if change bb_counters size,
92 + * don't forget about ext4_mb_generate_buddy()
93 + */
94 + len = offsetof(typeof(**meta_group_info),
95 + bb_counters[sb->s_blocksize_bits + 2]);
97 + meta_group_info =
98 + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
99 + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
101 + meta_group_info[i] = kzalloc(len, GFP_KERNEL);
102 + if (meta_group_info[i] == NULL) {
103 + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
104 + goto exit_group_info;
106 + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
107 + &(meta_group_info[i]->bb_state));
109 + /*
110 + * initialize bb_free to be able to skip
111 + * empty groups without initialization
112 + */
113 + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
114 + meta_group_info[i]->bb_free =
115 + ext4_free_blocks_after_init(sb, group, desc);
116 + } else {
117 + meta_group_info[i]->bb_free =
118 + le16_to_cpu(desc->bg_free_blocks_count);
121 + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
123 +#ifdef DOUBLE_CHECK
125 + struct buffer_head *bh;
126 + meta_group_info[i]->bb_bitmap =
127 + kmalloc(sb->s_blocksize, GFP_KERNEL);
128 + BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
129 + bh = ext4_read_block_bitmap(sb, group);
130 + BUG_ON(bh == NULL);
131 + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
132 + sb->s_blocksize);
133 + put_bh(bh);
135 +#endif
137 + return 0;
139 +exit_group_info:
140 + /* If a meta_group_info table has been allocated, release it now */
141 + if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
142 + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
143 +exit_meta_group_info:
144 + return -ENOMEM;
145 +} /* ext4_mb_add_groupinfo */
148 + * Add a group to the existing groups.
149 + * This function is used for online resize
150 + */
151 +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
152 + struct ext4_group_desc *desc)
154 + struct ext4_sb_info *sbi = EXT4_SB(sb);
155 + struct inode *inode = sbi->s_buddy_cache;
156 + int blocks_per_page;
157 + int block;
158 + int pnum;
159 + struct page *page;
160 + int err;
162 + /* Add group based on group descriptor*/
163 + err = ext4_mb_add_groupinfo(sb, group, desc);
164 + if (err)
165 + return err;
167 + /*
168 + * Cache pages containing dynamic mb_alloc data (buddy and bitmap
169 + * data) are marked not up to date so that they will be re-initialized
170 + * during the next call to ext4_mb_load_buddy
171 + */
173 + /* Set buddy page as not up to date */
174 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
175 + block = group * 2;
176 + pnum = block / blocks_per_page;
177 + page = find_get_page(inode->i_mapping, pnum);
178 + if (page != NULL) {
179 + ClearPageUptodate(page);
180 + page_cache_release(page);
183 + /* Set bitmap page as not up to date */
184 + block++;
185 + pnum = block / blocks_per_page;
186 + page = find_get_page(inode->i_mapping, pnum);
187 + if (page != NULL) {
188 + ClearPageUptodate(page);
189 + page_cache_release(page);
192 + return 0;
196 + * Update an existing group.
197 + * This function is used for online resize
198 + */
199 +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
201 + grp->bb_free += add;
204 static int ext4_mb_init_backend(struct super_block *sb)
206 ext4_group_t i;
207 - int j, len, metalen;
208 + int metalen;
209 struct ext4_sb_info *sbi = EXT4_SB(sb);
210 - int num_meta_group_infos =
211 - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
212 - EXT4_DESC_PER_BLOCK_BITS(sb);
213 + struct ext4_super_block *es = sbi->s_es;
214 + int num_meta_group_infos;
215 + int num_meta_group_infos_max;
216 + int array_size;
217 struct ext4_group_info **meta_group_info;
218 + struct ext4_group_desc *desc;
220 + /* This is the number of blocks used by GDT */
221 + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
222 + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
224 + /*
225 + * This is the total number of blocks used by GDT including
226 + * the number of reserved blocks for GDT.
227 + * The s_group_info array is allocated with this value
228 + * to allow a clean online resize without a complex
229 + * manipulation of pointers.
230 + * The drawback is the unused memory when no resize
231 + * occurs but it's very low in terms of pages
232 + * (see comments below)
233 + * Need to handle this properly when META_BG resizing is allowed
234 + */
235 + num_meta_group_infos_max = num_meta_group_infos +
236 + le16_to_cpu(es->s_reserved_gdt_blocks);
238 + /*
239 + * array_size is the size of s_group_info array. We round it
240 + * to the next power of two because this approximation is done
241 + * internally by kmalloc so we can have some more memory
242 + * for free here (e.g. may be used for META_BG resize).
243 + */
244 + array_size = 1;
245 + while (array_size < sizeof(*sbi->s_group_info) *
246 + num_meta_group_infos_max)
247 + array_size = array_size << 1;
248 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
249 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
250 * So a two level scheme suffices for now. */
251 - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
252 - num_meta_group_infos, GFP_KERNEL);
253 + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
254 if (sbi->s_group_info == NULL) {
255 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
256 return -ENOMEM;
257 @@ -2272,62 +2443,15 @@ static int ext4_mb_init_backend(struct s
258 sbi->s_group_info[i] = meta_group_info;
261 - /*
262 - * calculate needed size. if change bb_counters size,
263 - * don't forget about ext4_mb_generate_buddy()
264 - */
265 - len = sizeof(struct ext4_group_info);
266 - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
267 for (i = 0; i < sbi->s_groups_count; i++) {
268 - struct ext4_group_desc *desc;
270 - meta_group_info =
271 - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
272 - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
274 - meta_group_info[j] = kzalloc(len, GFP_KERNEL);
275 - if (meta_group_info[j] == NULL) {
276 - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
277 - goto err_freebuddy;
279 desc = ext4_get_group_desc(sb, i, NULL);
280 if (desc == NULL) {
281 printk(KERN_ERR
282 "EXT4-fs: can't read descriptor %lu\n", i);
283 - i++;
284 goto err_freebuddy;
286 - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
287 - &(meta_group_info[j]->bb_state));
289 - /*
290 - * initialize bb_free to be able to skip
291 - * empty groups without initialization
292 - */
293 - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
294 - meta_group_info[j]->bb_free =
295 - ext4_free_blocks_after_init(sb, i, desc);
296 - } else {
297 - meta_group_info[j]->bb_free =
298 - le16_to_cpu(desc->bg_free_blocks_count);
301 - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
303 -#ifdef DOUBLE_CHECK
305 - struct buffer_head *bh;
306 - meta_group_info[j]->bb_bitmap =
307 - kmalloc(sb->s_blocksize, GFP_KERNEL);
308 - BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
309 - bh = ext4_read_block_bitmap(sb, i);
310 - BUG_ON(bh == NULL);
311 - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
312 - sb->s_blocksize);
313 - put_bh(bh);
315 -#endif
317 + if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
318 + goto err_freebuddy;
321 return 0;
322 Index: linux-2.6.26-rc8/fs/ext4/resize.c
323 ===================================================================
324 --- linux-2.6.26-rc8.orig/fs/ext4/resize.c 2008-06-24 18:58:20.000000000 -0700
325 +++ linux-2.6.26-rc8/fs/ext4/resize.c 2008-07-01 11:32:15.000000000 -0700
326 @@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *s
327 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
330 + * We can allocate memory for mb_alloc based on the new group
331 + * descriptor
332 + */
333 + if (test_opt(sb, MBALLOC)) {
334 + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
335 + if (err)
336 + goto exit_journal;
338 + /*
339 * Make the new blocks and inodes valid next. We do this before
340 * increasing the group count so that once the group is enabled,
341 * all of its blocks and inodes are already valid.
342 @@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block
343 handle_t *handle;
344 int err;
345 unsigned long freed_blocks;
346 + ext4_group_t group;
347 + struct ext4_group_info *grp;
349 /* We don't need to worry about locking wrt other resizers just
350 * yet: we're going to revalidate es->s_blocks_count after
351 @@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block
354 /* Handle the remaining blocks in the last group only. */
355 - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
356 + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
358 if (last == 0) {
359 ext4_warning(sb, __func__,
360 @@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block
361 o_blocks_count + add);
362 if ((err = ext4_journal_stop(handle)))
363 goto exit_put;
365 + /*
366 + * Mark mballoc pages as not up to date so that they will be updated
367 + * next time they are loaded by ext4_mb_load_buddy.
368 + */
369 + if (test_opt(sb, MBALLOC)) {
370 + struct ext4_sb_info *sbi = EXT4_SB(sb);
371 + struct inode *inode = sbi->s_buddy_cache;
372 + int blocks_per_page;
373 + int block;
374 + int pnum;
375 + struct page *page;
377 + /* Set buddy page as not up to date */
378 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
379 + block = group * 2;
380 + pnum = block / blocks_per_page;
381 + page = find_get_page(inode->i_mapping, pnum);
382 + if (page != NULL) {
383 + ClearPageUptodate(page);
384 + page_cache_release(page);
387 + /* Set bitmap page as not up to date */
388 + block++;
389 + pnum = block / blocks_per_page;
390 + page = find_get_page(inode->i_mapping, pnum);
391 + if (page != NULL) {
392 + ClearPageUptodate(page);
393 + page_cache_release(page);
396 + /* Get the info on the last group */
397 + grp = ext4_get_group_info(sb, group);
399 + /* Update free blocks in group info */
400 + ext4_mb_update_group_info(grp, add);
403 if (test_opt(sb, DEBUG))
404 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
405 ext4_blocks_count(es));