1 vfs: add basic delayed allocation support
3 From: Alex Tomas <alex@clusterfs.com>
5 * block_prepare_write() can be passed a special ->get_block() which
6 doesn't allocate blocks, but reserves them and marks the bh delayed
7 * a filesystem can use mpage_da_writepages() with another ->get_block()
8 which doesn't defer allocation. mpage_da_writepages() finds all
9 non-allocated blocks and tries to allocate them with minimal calls
10 to ->get_block(), then submits IO using __mpage_writepage()
13 Signed-off-by: Alex Tomas <alex@clusterfs.com>
14 Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
15 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
20 fs/mpage.c | 406 ++++++++++++++++++++++++++++++++++++++++++++++++++
21 include/linux/mpage.h | 2
22 3 files changed, 410 insertions(+), 1 deletion(-)
25 Index: linux-2.6.26-rc5/fs/buffer.c
26 ===================================================================
27 --- linux-2.6.26-rc5.orig/fs/buffer.c 2008-06-06 17:03:51.000000000 -0700
28 +++ linux-2.6.26-rc5/fs/buffer.c 2008-06-06 17:03:54.000000000 -0700
29 @@ -1691,7 +1691,8 @@ static int __block_write_full_page(struc
31 clear_buffer_dirty(bh);
32 set_buffer_uptodate(bh);
33 - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
34 + } else if ((!buffer_mapped(bh) || buffer_delay(bh))
35 + && buffer_dirty(bh)) {
36 WARN_ON(bh->b_size != blocksize);
37 err = get_block(inode, block, bh, 1);
39 Index: linux-2.6.26-rc5/fs/mpage.c
40 ===================================================================
41 --- linux-2.6.26-rc5.orig/fs/mpage.c 2008-06-06 17:03:35.000000000 -0700
42 +++ linux-2.6.26-rc5/fs/mpage.c 2008-06-06 17:03:54.000000000 -0700
45 * 27Jun2002 axboe@suse.de
46 * use bio_add_page() to build bio's just the right size
47 + * 26Jul2007 alex@clusterfs.com AKA bzzz
48 + * basic delayed allocation support
51 #include <linux/kernel.h>
52 @@ -710,3 +712,407 @@ int mpage_writepage(struct page *page, g
55 EXPORT_SYMBOL(mpage_writepage);
58 + * Delayed allocation stuff
61 +struct mpage_da_data {
62 + struct inode *inode;
63 + struct buffer_head lbh; /* extent of blocks */
64 + unsigned long first_page, next_page; /* extent of pages */
65 + get_block_t *get_block;
66 + struct writeback_control *wbc;
71 + * mpage_da_submit_io - walks through extent of pages and tries to write
72 + * them with __mpage_writepage()
74 + * @mpd->inode: inode
75 + * @mpd->first_page: first page of the extent
76 + * @mpd->next_page: page after the last page of the extent
77 + * @mpd->get_block: the filesystem's block mapper function
79 + * By the time mpage_da_submit_io() is called we expect all blocks
80 + * to be allocated. this may be wrong if allocation failed.
82 + * As pages are already locked by write_cache_pages(), we can't use it
84 +static int mpage_da_submit_io(struct mpage_da_data *mpd)
86 + struct address_space *mapping = mpd->inode->i_mapping;
87 + struct mpage_data mpd_pp = {
89 + .last_block_in_bio = 0,
90 + .get_block = mpd->get_block,
93 + int ret = 0, err, nr_pages, i;
94 + unsigned long index, end;
95 + struct pagevec pvec;
97 + BUG_ON(mpd->next_page <= mpd->first_page);
99 + pagevec_init(&pvec, 0);
100 + index = mpd->first_page;
101 + end = mpd->next_page - 1;
103 + while (index <= end) {
104 + /* XXX: optimize tail */
105 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
108 + for (i = 0; i < nr_pages; i++) {
109 + struct page *page = pvec.pages[i];
111 + index = page->index;
116 + err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
119 + * In error case, we have to continue because
120 + * remaining pages are still locked
121 + * XXX: unlock and re-dirty them?
126 + pagevec_release(&pvec);
129 + mpage_bio_submit(WRITE, mpd_pp.bio);
135 + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
137 + * @mpd->inode - inode to walk through
138 + * @exbh->b_blocknr - first block on a disk
139 + * @exbh->b_size - amount of space in bytes
140 + * @logical - first logical block to start assignment with
142 + * the function goes through all passed space and puts actual disk
143 + * block numbers into buffer heads, dropping BH_Delay
145 +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
146 + struct buffer_head *exbh)
148 + struct inode *inode = mpd->inode;
149 + struct address_space *mapping = inode->i_mapping;
150 + int blocks = exbh->b_size >> inode->i_blkbits;
151 + sector_t pblock = exbh->b_blocknr, cur_logical;
152 + struct buffer_head *head, *bh;
153 + unsigned long index, end;
154 + struct pagevec pvec;
157 + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
158 + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
159 + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
161 + pagevec_init(&pvec, 0);
163 + while (index <= end) {
164 + /* XXX: optimize tail */
165 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
168 + for (i = 0; i < nr_pages; i++) {
169 + struct page *page = pvec.pages[i];
171 + index = page->index;
176 + BUG_ON(!PageLocked(page));
177 + BUG_ON(PageWriteback(page));
178 + BUG_ON(!page_has_buffers(page));
180 + bh = page_buffers(page);
183 + /* skip blocks out of the range */
185 + if (cur_logical >= logical)
189 + } while ((bh = bh->b_this_page) != head);
192 + if (cur_logical >= logical + blocks)
195 + if (buffer_delay(bh)) {
196 + bh->b_blocknr = pblock;
197 + clear_buffer_delay(bh);
198 + } else if (buffer_mapped(bh)) {
199 + BUG_ON(bh->b_blocknr != pblock);
204 + } while ((bh = bh->b_this_page) != head);
206 + pagevec_release(&pvec);
212 + * __unmap_underlying_blocks - just a helper function to unmap
213 + * set of blocks described by @bh
215 +static inline void __unmap_underlying_blocks(struct inode *inode,
216 + struct buffer_head *bh)
218 + struct block_device *bdev = inode->i_sb->s_bdev;
221 + blocks = bh->b_size >> inode->i_blkbits;
222 + for (i = 0; i < blocks; i++)
223 + unmap_underlying_metadata(bdev, bh->b_blocknr + i);
227 + * mpage_da_map_blocks - go through given space
229 + * @mpd->lbh - bh describing space
230 + * @mpd->get_block - the filesystem's block mapper function
232 + * The function skips space we know is already mapped to disk blocks.
234 + * The function ignores errors ->get_block() returns, thus real
235 + * error handling is postponed to __mpage_writepage()
237 +static void mpage_da_map_blocks(struct mpage_da_data *mpd)
239 + struct buffer_head *lbh = &mpd->lbh;
240 + int err = 0, remain = lbh->b_size;
241 + sector_t next = lbh->b_blocknr;
242 + struct buffer_head new;
245 + * We consider only non-mapped and non-allocated blocks
247 + if (buffer_mapped(lbh) && !buffer_delay(lbh))
251 + new.b_state = lbh->b_state;
253 + new.b_size = remain;
254 + err = mpd->get_block(mpd->inode, next, &new, 1);
257 + * Rather than implement own error handling
258 + * here, we just leave remaining blocks
259 + * unallocated and try again with ->writepage()
263 + BUG_ON(new.b_size == 0);
265 + if (buffer_new(&new))
266 + __unmap_underlying_blocks(mpd->inode, &new);
269 + * If blocks are marked delayed, we need to
270 + * put actual blocknr and drop delayed bit
272 + if (buffer_delay(lbh))
273 + mpage_put_bnr_to_bhs(mpd, next, &new);
275 + /* go for the remaining blocks */
276 + next += new.b_size >> mpd->inode->i_blkbits;
277 + remain -= new.b_size;
281 +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
284 + * mpage_add_bh_to_extent - try to add one more block to extent of blocks
286 + * @mpd->lbh - extent of blocks
287 + * @logical - logical number of the block in the file
288 + * @bh - bh of the block (used to access block's state)
290 + * the function is used to collect contig. blocks in same state
292 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
293 + sector_t logical, struct buffer_head *bh)
295 + struct buffer_head *lbh = &mpd->lbh;
298 + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
301 + * First block in the extent
303 + if (lbh->b_size == 0) {
304 + lbh->b_blocknr = logical;
305 + lbh->b_size = bh->b_size;
306 + lbh->b_state = bh->b_state & BH_FLAGS;
311 + * Can we merge the block to our big extent?
313 + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
314 + lbh->b_size += bh->b_size;
319 + * We couldn't merge the block to our extent, so we
320 + * need to flush current extent and start new one
322 + mpage_da_map_blocks(mpd);
325 + * Now start a new extent
327 + lbh->b_size = bh->b_size;
328 + lbh->b_state = bh->b_state & BH_FLAGS;
329 + lbh->b_blocknr = logical;
333 + * __mpage_da_writepage - finds extent of pages and blocks
335 + * @page: page to consider
336 + * @wbc: not used, we just follow rules
339 + * The function finds extents of pages and scans them for all blocks.
341 +static int __mpage_da_writepage(struct page *page,
342 + struct writeback_control *wbc, void *data)
344 + struct mpage_da_data *mpd = data;
345 + struct inode *inode = mpd->inode;
346 + struct buffer_head *bh, *head, fake;
350 + * Can we merge this page to current extent?
352 + if (mpd->next_page != page->index) {
354 + * Nope, we can't. So, we map non-allocated blocks
355 + * and start IO on them using __mpage_writepage()
357 + if (mpd->next_page != mpd->first_page) {
358 + mpage_da_map_blocks(mpd);
359 + mpage_da_submit_io(mpd);
363 + * Start next extent of pages ...
365 + mpd->first_page = page->index;
370 + mpd->lbh.b_size = 0;
371 + mpd->lbh.b_state = 0;
372 + mpd->lbh.b_blocknr = 0;
375 + mpd->next_page = page->index + 1;
376 + logical = (sector_t) page->index <<
377 + (PAGE_CACHE_SHIFT - inode->i_blkbits);
379 + if (!page_has_buffers(page)) {
381 + * There are no attached buffer heads yet (mmap?)
382 + * we treat the page as full of dirty blocks
385 + bh->b_size = PAGE_CACHE_SIZE;
387 + set_buffer_dirty(bh);
388 + set_buffer_uptodate(bh);
389 + mpage_add_bh_to_extent(mpd, logical, bh);
392 + * Page with regular buffer heads, just add all dirty ones
394 + head = page_buffers(page);
397 + BUG_ON(buffer_locked(bh));
398 + if (buffer_dirty(bh))
399 + mpage_add_bh_to_extent(mpd, logical, bh);
401 + } while ((bh = bh->b_this_page) != head);
408 + * mpage_da_writepages - walk the list of dirty pages of the given
409 + * address space, allocates non-allocated blocks, maps newly-allocated
410 + * blocks to existing bhs and issues IO for them
412 + * @mapping: address space structure to write
413 + * @wbc: subtract the number of written pages from *@wbc->nr_to_write
414 + * @get_block: the filesystem's block mapper function.
416 + * This is a library function, which implements the writepages()
417 + * address_space_operation.
419 + * In order to avoid duplication of logic that deals with partial pages,
420 + * multiple bio per page, etc, we find non-allocated blocks, allocate
421 + * them with minimal calls to ->get_block() and re-use __mpage_writepage()
423 + * It's important that we call __mpage_writepage() only once for each
424 + * involved page, otherwise we'd have to implement more complicated logic
425 + * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
427 + * See comments to mpage_writepages()
429 +int mpage_da_writepages(struct address_space *mapping,
430 + struct writeback_control *wbc, get_block_t get_block)
432 + struct mpage_da_data mpd;
436 + return generic_writepages(mapping, wbc);
439 + mpd.inode = mapping->host;
440 + mpd.lbh.b_size = 0;
441 + mpd.lbh.b_state = 0;
442 + mpd.lbh.b_blocknr = 0;
443 + mpd.first_page = 0;
445 + mpd.get_block = get_block;
447 + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
450 + * Handle last extent of pages
452 + if (mpd.next_page != mpd.first_page) {
453 + mpage_da_map_blocks(&mpd);
454 + mpage_da_submit_io(&mpd);
459 +EXPORT_SYMBOL(mpage_da_writepages);
460 Index: linux-2.6.26-rc5/include/linux/mpage.h
461 ===================================================================
462 --- linux-2.6.26-rc5.orig/include/linux/mpage.h 2008-06-06 17:03:35.000000000 -0700
463 +++ linux-2.6.26-rc5/include/linux/mpage.h 2008-06-06 17:03:54.000000000 -0700
464 @@ -18,6 +18,8 @@ int mpage_readpages(struct address_space
465 int mpage_readpage(struct page *page, get_block_t get_block);
466 int mpage_writepages(struct address_space *mapping,
467 struct writeback_control *wbc, get_block_t get_block);
468 +int mpage_da_writepages(struct address_space *mapping,
469 + struct writeback_control *wbc, get_block_t get_block);
470 int mpage_writepage(struct page *page, get_block_t *get_block,
471 struct writeback_control *wbc);