Further updates of Documentation/filesystem/ext4.txt
[ext4-patch-queue.git] / ext4-handle-page-writhout-buffers-in-da-writepage.patch
blobec32db35b4ff4ebbb8458b991e3bcf711e751a1c
1 ext4: Handle page without buffers in ext4_*_writepage()
3 From: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
5 It can happen that buffers are removed from the page before it gets
6 marked dirty and is then passed to writepage(). In writepage() we just
7 initialize the buffers and check whether they are mapped and
8 non-delay. If they are mapped and non-delay we write the page. Otherwise we
9 redirty the page. With this change we don't do block allocation at all
10 in ext4_*_writepage().
12 writepage() can get called under many conditions and, with a locking order
13 of journal_start -> lock_page, we should not try to allocate blocks in
14 writepage(), which gets called after taking the page lock. writepage() can
15 get called via shrink_page_list even with a journal handle which was
16 created for doing an inode update. For example, when doing
17 ext4_da_write_begin we create a journal handle with credit 1, expecting an
18 i_disksize update for the inode. But ext4_da_write_begin can cause
19 shrink_page_list via _grab_page_cache. So having a valid handle via
20 ext4_journal_current_handle is not a guarantee that we can use the
21 handle for block allocation in writepage, since we shouldn't be using
22 credits that had been reserved for other updates. Doing so could result
23 in us running out of credits when we update inodes.
25 Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
26 Signed-off-by: Mingming Cao <cmm@us.ibm.com>
27 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
28 ---
29 fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++---------------
30 1 file changed, 124 insertions(+), 45 deletions(-)
32 Index: linux-2.6.26-rc8/fs/ext4/inode.c
33 ===================================================================
34 --- linux-2.6.26-rc8.orig/fs/ext4/inode.c 2008-06-30 11:49:49.000000000 -0700
35 +++ linux-2.6.26-rc8/fs/ext4/inode.c 2008-06-30 11:49:56.000000000 -0700
36 @@ -1592,11 +1592,15 @@ static int ext4_da_get_block_write(struc
37 handle_t *handle = NULL;
39 handle = ext4_journal_current_handle();
40 - BUG_ON(handle == NULL);
41 - BUG_ON(create == 0);
43 - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
44 + if (!handle) {
45 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
46 + bh_result, 0, 0, 0);
47 + BUG_ON(!ret);
48 + } else {
49 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
50 bh_result, create, 0, EXT4_DELALLOC_RSVED);
51 + }
53 if (ret > 0) {
54 bh_result->b_size = (ret << inode->i_blkbits);
56 @@ -1634,15 +1638,37 @@ static int ext4_da_get_block_write(struc
58 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
60 - return !buffer_mapped(bh) || buffer_delay(bh);
61 + /*
62 + * unmapped buffer is possible for holes.
63 + * delay buffer is possible with delayed allocation
64 + */
65 + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
68 +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
69 + struct buffer_head *bh_result, int create)
71 + int ret = 0;
72 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
74 + /*
75 + * we don't want to do block allocation in writepage
76 + * so call get_block_wrap with create = 0
77 + */
78 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
79 + bh_result, 0, 0, 0);
80 + if (ret > 0) {
81 + bh_result->b_size = (ret << inode->i_blkbits);
82 + ret = 0;
83 + }
84 + return ret;
88 - * get called vi ext4_da_writepages after taking page lock
89 - * We may end up doing block allocation here in case
90 - * mpage_da_map_blocks failed to allocate blocks.
91 - *
92 - * We also get called via journal_submit_inode_data_buffers
93 + * get called via ext4_da_writepages after taking page lock (have journal handle)
94 + * get called via journal_submit_inode_data_buffers (no journal handle)
95 + * get called via shrink_page_list via pdflush (no journal handle)
96 + * or grab_page_cache when doing write_begin (have journal handle)
98 static int ext4_da_writepage(struct page *page,
99 struct writeback_control *wbc)
100 @@ -1650,37 +1676,61 @@ static int ext4_da_writepage(struct page
101 int ret = 0;
102 loff_t size;
103 unsigned long len;
104 - handle_t *handle = NULL;
105 struct buffer_head *page_bufs;
106 struct inode *inode = page->mapping->host;
108 - handle = ext4_journal_current_handle();
109 - if (!handle) {
110 - /*
111 - * This can happen when we aren't called via
112 - * ext4_da_writepages() but directly (shrink_page_list).
113 - * We cannot easily start a transaction here so we just skip
114 - * writing the page in case we would have to do so.
115 - * We reach here also via journal_submit_inode_data_buffers
116 - */
117 - size = i_size_read(inode);
118 + size = i_size_read(inode);
119 + if (page->index == size >> PAGE_CACHE_SHIFT)
120 + len = size & ~PAGE_CACHE_MASK;
121 + else
122 + len = PAGE_CACHE_SIZE;
124 + if (page_has_buffers(page)) {
125 page_bufs = page_buffers(page);
126 - if (page->index == size >> PAGE_CACHE_SHIFT)
127 - len = size & ~PAGE_CACHE_MASK;
128 - else
129 - len = PAGE_CACHE_SIZE;
131 - if (walk_page_buffers(NULL, page_bufs, 0,
132 - len, NULL, ext4_bh_unmapped_or_delay)) {
133 + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
134 + ext4_bh_unmapped_or_delay)) {
136 - * We can't do block allocation under
137 - * page lock without a handle . So redirty
138 - * the page and return
139 + * We don't want to do block allocation
140 + * So redirty the page and return
141 * We may reach here when we do a journal commit
142 * via journal_submit_inode_data_buffers.
143 * If we don't have mapping block we just ignore
144 - * them
145 + * them. We can also reach here via shrink_page_list
146 + */
147 + redirty_page_for_writepage(wbc, page);
148 + unlock_page(page);
149 + return 0;
151 + } else {
152 + /*
153 + * The test for page_has_buffers() is subtle:
154 + * We know the page is dirty but it lost buffers. That means
155 + * that at some moment in time after write_begin()/write_end()
156 + * has been called all buffers have been clean and thus they
157 + * must have been written at least once. So they are all
158 + * mapped and we can happily proceed with mapping them
159 + * and writing the page.
161 + * Try to initialize the buffer_heads and check whether
162 + * all are mapped and non delay. We don't want to
163 + * do block allocation here.
164 + */
165 + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
166 + ext4_normal_get_block_write);
167 + if (!ret) {
168 + page_bufs = page_buffers(page);
169 + /* check whether all are mapped and non delay */
170 + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
171 + ext4_bh_unmapped_or_delay)) {
172 + redirty_page_for_writepage(wbc, page);
173 + unlock_page(page);
174 + return 0;
176 + } else {
177 + /*
178 + * We can't do block allocation here
179 + * so just redirty the page and unlock
180 + * and return
182 redirty_page_for_writepage(wbc, page);
183 unlock_page(page);
184 @@ -1689,9 +1739,11 @@ static int ext4_da_writepage(struct page
187 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
188 - ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
189 + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
190 else
191 - ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
192 + ret = block_write_full_page(page,
193 + ext4_normal_get_block_write,
194 + wbc);
196 return ret;
198 @@ -2032,12 +2084,14 @@ static int __ext4_normal_writepage(struc
199 struct inode *inode = page->mapping->host;
201 if (test_opt(inode->i_sb, NOBH))
202 - return nobh_writepage(page, ext4_get_block, wbc);
203 + return nobh_writepage(page,
204 + ext4_normal_get_block_write, wbc);
205 else
206 - return block_write_full_page(page, ext4_get_block, wbc);
207 + return block_write_full_page(page,
208 + ext4_normal_get_block_write,
209 + wbc);
213 static int ext4_normal_writepage(struct page *page,
214 struct writeback_control *wbc)
216 @@ -2046,13 +2100,24 @@ static int ext4_normal_writepage(struct
217 loff_t len;
219 J_ASSERT(PageLocked(page));
220 - J_ASSERT(page_has_buffers(page));
221 if (page->index == size >> PAGE_CACHE_SHIFT)
222 len = size & ~PAGE_CACHE_MASK;
223 else
224 len = PAGE_CACHE_SIZE;
225 - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
226 - ext4_bh_unmapped_or_delay));
228 + if (page_has_buffers(page)) {
229 + /* if page has buffers they should all be mapped
230 + * and allocated. If there are no buffers attached
231 + * to the page we know the page is dirty but it lost
232 + * buffers. That means that at some moment in time
233 + * after write_begin() / write_end() has been called
234 + * all buffers have been clean and thus they must have been
235 + * written at least once. So they are all mapped and we can
236 + * happily proceed with mapping them and writing the page.
237 + */
238 + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
239 + ext4_bh_unmapped_or_delay));
242 if (!ext4_journal_current_handle())
243 return __ext4_normal_writepage(page, wbc);
244 @@ -2072,7 +2137,8 @@ static int __ext4_journalled_writepage(s
245 int ret = 0;
246 int err;
248 - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
249 + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
250 + ext4_normal_get_block_write);
251 if (ret != 0)
252 goto out_unlock;
254 @@ -2119,13 +2185,24 @@ static int ext4_journalled_writepage(str
255 loff_t len;
257 J_ASSERT(PageLocked(page));
258 - J_ASSERT(page_has_buffers(page));
259 if (page->index == size >> PAGE_CACHE_SHIFT)
260 len = size & ~PAGE_CACHE_MASK;
261 else
262 len = PAGE_CACHE_SIZE;
263 - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
264 - ext4_bh_unmapped_or_delay));
266 + if (page_has_buffers(page)) {
267 + /* if page has buffers they should all be mapped
268 + * and allocated. If there are no buffers attached
269 + * to the page we know the page is dirty but it lost
270 + * buffers. That means that at some moment in time
271 + * after write_begin() / write_end() has been called
272 + * all buffers have been clean and thus they must have been
273 + * written at least once. So they are all mapped and we can
274 + * happily proceed with mapping them and writing the page.
275 + */
276 + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
277 + ext4_bh_unmapped_or_delay));
280 if (ext4_journal_current_handle())
281 goto no_write;
282 @@ -2143,7 +2220,9 @@ static int ext4_journalled_writepage(str
283 * really know unless we go poke around in the buffer_heads.
284 * But block_write_full_page will do the right thing.
286 - return block_write_full_page(page, ext4_get_block, wbc);
287 + return block_write_full_page(page,
288 + ext4_normal_get_block_write,
289 + wbc);
291 no_write:
292 redirty_page_for_writepage(wbc, page);