More patch description fixups. Standardize case.
[ext4-patch-queue.git] / delalloc-vfs.patch
blobc764491180719573a27536f0fb25a3a87714731c
1 vfs: add basic delayed allocation support
3 From: Alex Tomas <alex@clusterfs.com>
5 * block_prepare_write() can be passed special ->get_block() which
6 doesn't allocate blocks, but reserves them and marks bh delayed
7 * a filesystem can use mpage_da_writepages() with other ->get_block()
8 which doesn't defer allocation. mpage_da_writepages() finds all
9 non-allocated blocks and tries to allocate them with minimal calls
10 to ->get_block(), then submit IO using __mpage_writepage()
13 Signed-off-by: Alex Tomas <alex@clusterfs.com>
14 Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
15 ---
17 fs/buffer.c | 3
18 fs/mpage.c | 406 +++++++++++++++++++++++++++++++++++++++++++++++++
19 include/linux/mpage.h | 2
20 3 files changed, 410 insertions(+), 1 deletions(-)
23 diff --git a/fs/buffer.c b/fs/buffer.c
24 index 7593ff3..982cf1a 100644
25 --- a/fs/buffer.c
26 +++ b/fs/buffer.c
27 @@ -1660,7 +1660,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
29 clear_buffer_dirty(bh);
30 set_buffer_uptodate(bh);
31 - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
32 + } else if ((!buffer_mapped(bh) || buffer_delay(bh))
33 + && buffer_dirty(bh)) {
34 WARN_ON(bh->b_size != blocksize);
35 err = get_block(inode, block, bh, 1);
36 if (err)
37 diff --git a/fs/mpage.c b/fs/mpage.c
38 index d54f8f8..6ebb4ac 100644
39 --- a/fs/mpage.c
40 +++ b/fs/mpage.c
41 @@ -10,6 +10,8 @@
42 * Initial version
43 * 27Jun2002 axboe@suse.de
44 * use bio_add_page() to build bio's just the right size
45 + * 26Jul2007 alex@clusterfs.com AKA bzzz
46 + * basic delayed allocation support
49 #include <linux/kernel.h>
50 @@ -718,3 +720,407 @@ int mpage_writepage(struct page *page, get_block_t get_block,
51 return ret;
53 EXPORT_SYMBOL(mpage_writepage);
55 +/*
56 + * Delayed allocation stuff
57 + */
59 +struct mpage_da_data {
60 + struct inode *inode;
61 + struct buffer_head lbh; /* extent of blocks */
62 + unsigned long first_page, next_page; /* extent of pages */
63 + get_block_t *get_block;
64 + struct writeback_control *wbc;
65 +};
68 +/*
69 + * mpage_da_submit_io - walks through the extent of pages and tries to write
70 + * them with __mpage_writepage()
71 + *
72 + * @mpd->inode: inode
73 + * @mpd->first_page: first page of the extent
74 + * @mpd->next_page: page after the last page of the extent
75 + * @mpd->get_block: the filesystem's block mapper function
76 + *
77 + * By the time mpage_da_submit_io() is called we expect all blocks
78 + * to be allocated. This may be wrong if allocation failed.
79 + *
80 + * As pages are already locked by write_cache_pages(), we can't use it
81 + */
82 +static int mpage_da_submit_io(struct mpage_da_data *mpd)
84 + struct address_space *mapping = mpd->inode->i_mapping;
85 + struct mpage_data mpd_pp = {
86 + .bio = NULL,
87 + .last_block_in_bio = 0,
88 + .get_block = mpd->get_block,
89 + .use_writepage = 1,
90 + };
91 + int ret = 0, err, nr_pages, i;
92 + unsigned long index, end;
93 + struct pagevec pvec;
95 + BUG_ON(mpd->next_page <= mpd->first_page);
97 + pagevec_init(&pvec, 0);
98 + index = mpd->first_page;
99 + end = mpd->next_page - 1;
101 + while (index <= end) {
102 + /* XXX: optimize tail */
103 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
104 + if (nr_pages == 0)
105 + break;
106 + for (i = 0; i < nr_pages; i++) {
107 + struct page *page = pvec.pages[i];
109 + index = page->index;
110 + if (index > end)
111 + break;
112 + index++;
114 + err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
116 + /*
117 + * In error case, we have to continue because
118 + * remaining pages are still locked
119 + * XXX: unlock and re-dirty them?
120 + */
121 + if (ret == 0)
122 + ret = err;
124 + pagevec_release(&pvec);
126 + if (mpd_pp.bio)
127 + mpage_bio_submit(WRITE, mpd_pp.bio);
129 + return ret;
133 + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
135 + * @mpd->inode - inode to walk through
136 + * @exbh->b_blocknr - first block on a disk
137 + * @exbh->b_size - amount of space in bytes
138 + * @logical - first logical block to start assignment with
140 + * The function goes through all passed space and puts actual disk
141 + * block numbers into buffer heads, dropping BH_Delay
142 + */
143 +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
144 + struct buffer_head *exbh)
146 + struct inode *inode = mpd->inode;
147 + struct address_space *mapping = inode->i_mapping;
148 + int blocks = exbh->b_size >> inode->i_blkbits;
149 + sector_t pblock = exbh->b_blocknr, cur_logical;
150 + struct buffer_head *head, *bh;
151 + unsigned long index, end;
152 + struct pagevec pvec;
153 + int nr_pages, i;
155 + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
156 + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
157 + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
159 + pagevec_init(&pvec, 0);
161 + while (index <= end) {
162 + /* XXX: optimize tail */
163 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
164 + if (nr_pages == 0)
165 + break;
166 + for (i = 0; i < nr_pages; i++) {
167 + struct page *page = pvec.pages[i];
169 + index = page->index;
170 + if (index > end)
171 + break;
172 + index++;
174 + BUG_ON(!PageLocked(page));
175 + BUG_ON(PageWriteback(page));
176 + BUG_ON(!page_has_buffers(page));
178 + bh = page_buffers(page);
179 + head = bh;
181 + /* skip blocks out of the range */
182 + do {
183 + if (cur_logical >= logical)
184 + break;
185 + cur_logical++;
186 + pblock++;
187 + } while ((bh = bh->b_this_page) != head);
189 + do {
190 + if (cur_logical >= logical + blocks)
191 + break;
193 + if (buffer_delay(bh)) {
194 + bh->b_blocknr = pblock;
195 + clear_buffer_delay(bh);
196 + } else if (buffer_mapped(bh)) {
197 + BUG_ON(bh->b_blocknr != pblock);
200 + cur_logical++;
201 + pblock++;
202 + } while ((bh = bh->b_this_page) != head);
204 + pagevec_release(&pvec);
210 + * __unmap_underlying_blocks - just a helper function to unmap
211 + * set of blocks described by @bh
212 + */
213 +static inline void __unmap_underlying_blocks(struct inode *inode,
214 + struct buffer_head *bh)
216 + struct block_device *bdev = inode->i_sb->s_bdev;
217 + int blocks, i;
219 + blocks = bh->b_size >> inode->i_blkbits;
220 + for (i = 0; i < blocks; i++)
221 + unmap_underlying_metadata(bdev, bh->b_blocknr + i);
225 + * mpage_da_map_blocks - go through given space
227 + * @mpd->lbh - bh describing space
228 + * @mpd->get_block - the filesystem's block mapper function
230 + * The function skips space we know is already mapped to disk blocks.
232 + * The function ignores errors ->get_block() returns, thus real
233 + * error handling is postponed to __mpage_writepage()
234 + */
235 +static void mpage_da_map_blocks(struct mpage_da_data *mpd)
237 + struct buffer_head *lbh = &mpd->lbh;
238 + int err = 0, remain = lbh->b_size;
239 + sector_t next = lbh->b_blocknr;
240 + struct buffer_head new;
242 + /*
243 + * We consider only non-mapped and non-allocated blocks
244 + */
245 + if (buffer_mapped(lbh) && !buffer_delay(lbh))
246 + return;
248 + while (remain) {
249 + new.b_state = lbh->b_state;
250 + new.b_blocknr = 0;
251 + new.b_size = remain;
252 + err = mpd->get_block(mpd->inode, next, &new, 1);
253 + if (err) {
254 + /*
255 + * Rather than implement own error handling
256 + * here, we just leave remaining blocks
257 + * unallocated and try again with ->writepage()
258 + */
259 + break;
261 + BUG_ON(new.b_size == 0);
263 + if (buffer_new(&new))
264 + __unmap_underlying_blocks(mpd->inode, &new);
266 + /*
267 + * If blocks are delayed marked, we need to
268 + * put actual blocknr and drop delayed bit
269 + */
270 + if (buffer_delay(lbh))
271 + mpage_put_bnr_to_bhs(mpd, next, &new);
273 + /* go for the remaining blocks */
274 + next += new.b_size >> mpd->inode->i_blkbits;
275 + remain -= new.b_size;
279 +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
282 + * mpage_add_bh_to_extent - try to add one more block to extent of blocks
284 + * @mpd->lbh - extent of blocks
285 + * @logical - logical number of the block in the file
286 + * @bh - bh of the block (used to access block's state)
288 + * The function is used to collect contiguous blocks in the same state
289 + */
290 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
291 + sector_t logical, struct buffer_head *bh)
293 + struct buffer_head *lbh = &mpd->lbh;
294 + sector_t next;
296 + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
298 + /*
299 + * First block in the extent
300 + */
301 + if (lbh->b_size == 0) {
302 + lbh->b_blocknr = logical;
303 + lbh->b_size = bh->b_size;
304 + lbh->b_state = bh->b_state & BH_FLAGS;
305 + return;
308 + /*
309 + * Can we merge the block to our big extent?
310 + */
311 + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
312 + lbh->b_size += bh->b_size;
313 + return;
316 + /*
317 + * We couldn't merge the block to our extent, so we
318 + * need to flush current extent and start new one
319 + */
320 + mpage_da_map_blocks(mpd);
322 + /*
323 + * Now start a new extent
324 + */
325 + lbh->b_size = bh->b_size;
326 + lbh->b_state = bh->b_state & BH_FLAGS;
327 + lbh->b_blocknr = logical;
331 + * __mpage_da_writepage - finds extent of pages and blocks
333 + * @page: page to consider
334 + * @wbc: not used, we just follow rules
335 + * @data: context
337 + * The function finds extents of pages and scans them for all blocks.
338 + */
339 +static int __mpage_da_writepage(struct page *page,
340 + struct writeback_control *wbc, void *data)
342 + struct mpage_da_data *mpd = data;
343 + struct inode *inode = mpd->inode;
344 + struct buffer_head *bh, *head, fake;
345 + sector_t logical;
347 + /*
348 + * Can we merge this page to current extent?
349 + */
350 + if (mpd->next_page != page->index) {
351 + /*
352 + * Nope, we can't. So, we map non-allocated blocks
353 + * and start IO on them using __mpage_writepage()
354 + */
355 + if (mpd->next_page != mpd->first_page) {
356 + mpage_da_map_blocks(mpd);
357 + mpage_da_submit_io(mpd);
360 + /*
361 + * Start next extent of pages ...
362 + */
363 + mpd->first_page = page->index;
365 + /*
366 + * ... and blocks
367 + */
368 + mpd->lbh.b_size = 0;
369 + mpd->lbh.b_state = 0;
370 + mpd->lbh.b_blocknr = 0;
373 + mpd->next_page = page->index + 1;
374 + logical = (sector_t) page->index <<
375 + (PAGE_CACHE_SHIFT - inode->i_blkbits);
377 + if (!page_has_buffers(page)) {
378 + /*
379 + * There are no buffer heads attached yet (mmap?),
380 + * so we treat the page as full of dirty blocks
381 + */
382 + bh = &fake;
383 + bh->b_size = PAGE_CACHE_SIZE;
384 + bh->b_state = 0;
385 + set_buffer_dirty(bh);
386 + set_buffer_uptodate(bh);
387 + mpage_add_bh_to_extent(mpd, logical, bh);
388 + } else {
389 + /*
390 + * Page with regular buffer heads, just add all dirty ones
391 + */
392 + head = page_buffers(page);
393 + bh = head;
394 + do {
395 + BUG_ON(buffer_locked(bh));
396 + if (buffer_dirty(bh))
397 + mpage_add_bh_to_extent(mpd, logical, bh);
398 + logical++;
399 + } while ((bh = bh->b_this_page) != head);
402 + return 0;
406 + * mpage_da_writepages - walk the list of dirty pages of the given
407 + * address space, allocates non-allocated blocks, maps newly-allocated
408 + * blocks to existing bhs and issues IO on them
410 + * @mapping: address space structure to write
411 + * @wbc: subtract the number of written pages from *@wbc->nr_to_write
412 + * @get_block: the filesystem's block mapper function.
414 + * This is a library function, which implements the writepages()
415 + * address_space_operation.
417 + * In order to avoid duplication of logic that deals with partial pages,
418 + * multiple bio per page, etc, we find non-allocated blocks, allocate
419 + * them with minimal calls to ->get_block() and re-use __mpage_writepage()
421 + * It's important that we call __mpage_writepage() only once for each
422 + * involved page, otherwise we'd have to implement more complicated logic
423 + * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
425 + * See comments to mpage_writepages()
426 + */
427 +int mpage_da_writepages(struct address_space *mapping,
428 + struct writeback_control *wbc, get_block_t get_block)
430 + struct mpage_da_data mpd;
431 + int ret;
433 + if (!get_block)
434 + return generic_writepages(mapping, wbc);
436 + mpd.wbc = wbc;
437 + mpd.inode = mapping->host;
438 + mpd.lbh.b_size = 0;
439 + mpd.lbh.b_state = 0;
440 + mpd.lbh.b_blocknr = 0;
441 + mpd.first_page = 0;
442 + mpd.next_page = 0;
443 + mpd.get_block = get_block;
445 + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
447 + /*
448 + * Handle last extent of pages
449 + */
450 + if (mpd.next_page != mpd.first_page) {
451 + mpage_da_map_blocks(&mpd);
452 + mpage_da_submit_io(&mpd);
455 + return ret;
457 +EXPORT_SYMBOL(mpage_da_writepages);
458 diff --git a/include/linux/mpage.h b/include/linux/mpage.h
459 index 068a0c9..1f67d34 100644
460 --- a/include/linux/mpage.h
461 +++ b/include/linux/mpage.h
462 @@ -18,6 +18,8 @@ int mpage_readpages(struct address_space *mapping, struct list_head *pages,
463 int mpage_readpage(struct page *page, get_block_t get_block);
464 int mpage_writepages(struct address_space *mapping,
465 struct writeback_control *wbc, get_block_t get_block);
466 +int mpage_da_writepages(struct address_space *mapping,
467 + struct writeback_control *wbc, get_block_t get_block);
468 int mpage_writepage(struct page *page, get_block_t *get_block,
469 struct writeback_control *wbc);