1 ext4: Switch to non delalloc mode when we are low on free blocks count.
3 From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 The delayed allocation code allocates blocks during writepages(), which
6 can not handle block allocation failures. To deal with this, we switch
7 away from delayed allocation mode when we are running low on free blocks.
8 This also allows us to avoid needing to reserve a large number of
9 meta-data blocks in case all of the requested blocks are discontiguous.
11 Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
12 Signed-off-by: Mingming Cao <cmm@us.ibm.com>
13 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
16 fs/ext4/inode.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
17 1 file changed, 50 insertions(+), 2 deletions(-)
19 Index: linux-2.6.27-rc3/fs/ext4/inode.c
20 ===================================================================
21 --- linux-2.6.27-rc3.orig/fs/ext4/inode.c 2008-08-27 13:59:34.000000000 -0700
22 +++ linux-2.6.27-rc3/fs/ext4/inode.c 2008-08-27 14:02:17.000000000 -0700
23 @@ -2458,6 +2458,33 @@
27 +#define FALL_BACK_TO_NONDELALLOC 1
28 +static int ext4_nonda_switch(struct super_block *sb)
30 + s64 free_blocks, dirty_blocks;
31 + struct ext4_sb_info *sbi = EXT4_SB(sb);
34 + * Switch to non-delalloc mode if we are running low
35 + * on free blocks. The free block accounting via percpu
36 + * counters can get slightly wrong with FBC_BATCH getting
37 + * accumulated on each CPU without updating global counters.
38 + * Delalloc needs an accurate free block accounting. So switch
39 + * to non-delalloc when we are near the error range.
41 + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
42 + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
43 + if (2 * free_blocks < 3 * dirty_blocks ||
44 + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
46 + * free block count is less than 150% of dirty blocks
47 + * or free blocks are less than the watermark
54 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
55 loff_t pos, unsigned len, unsigned flags,
56 struct page **pagep, void **fsdata)
57 @@ -2472,6 +2499,13 @@
58 index = pos >> PAGE_CACHE_SHIFT;
59 from = pos & (PAGE_CACHE_SIZE - 1);
62 + if (ext4_nonda_switch(inode->i_sb)) {
63 + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
64 + return ext4_write_begin(file, mapping, pos,
65 + len, flags, pagep, fsdata);
67 + *fsdata = (void *)0;
70 * With delayed allocation, we don't log the i_disksize update
71 @@ -2540,6 +2574,19 @@
72 handle_t *handle = ext4_journal_current_handle();
74 unsigned long start, end;
75 + int write_mode = (int)fsdata;
77 + if (write_mode == FALL_BACK_TO_NONDELALLOC) {
78 + if (ext4_should_order_data(inode)) {
79 + return ext4_ordered_write_end(file, mapping, pos,
80 + len, copied, page, fsdata);
81 + } else if (ext4_should_writeback_data(inode)) {
82 + return ext4_writeback_write_end(file, mapping, pos,
83 + len, copied, page, fsdata);
89 start = pos & (PAGE_CACHE_SIZE - 1);
90 end = start + copied -1;
96 struct file *file = vma->vm_file;
97 struct inode *inode = file->f_path.dentry->d_inode;
98 struct address_space *mapping = inode->i_mapping;
99 @@ -4915,11 +4963,11 @@
100 * on the same page though
102 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
103 - len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
104 + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
107 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
108 - len, len, page, NULL);
109 + len, len, page, fsdata);