1 basic delayed allocation in VFS:
3 From: Alex Tomas <alex@clusterfs.com>
5 * block_prepare_write() can be passed special ->get_block() which
6 doesn't allocate blocks, but reserves them and marks bh delayed
7 * a filesystem can use mpage_da_writepages() with other ->get_block()
8 which doesn't defer allocation. mpage_da_writepages() finds all
9 non-allocated blocks and tries to allocate them with minimal calls
10 to ->get_block(), then submit IO using __mpage_writepage()
13 Signed-off-by: Alex Tomas <alex@clusterfs.com>
14 Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
18 fs/mpage.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++++++
19 include/linux/mpage.h | 2
20 3 files changed, 411 insertions(+), 1 deletion(-)
23 Index: linux-2.6.23-rc5/fs/buffer.c
24 ===================================================================
25 --- linux-2.6.23-rc5.orig/fs/buffer.c 2007-09-11 18:15:51.000000000 -0700
26 +++ linux-2.6.23-rc5/fs/buffer.c 2007-09-12 16:14:02.000000000 -0700
27 @@ -1645,7 +1645,8 @@ static int __block_write_full_page(struc
29 clear_buffer_dirty(bh);
30 set_buffer_uptodate(bh);
31 - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
32 + } else if ((!buffer_mapped(bh) || buffer_delay(bh))
33 + && buffer_dirty(bh)) {
34 WARN_ON(bh->b_size != blocksize);
35 err = get_block(inode, block, bh, 1);
37 Index: linux-2.6.23-rc5/fs/mpage.c
38 ===================================================================
39 --- linux-2.6.23-rc5.orig/fs/mpage.c 2007-09-11 18:15:51.000000000 -0700
40 +++ linux-2.6.23-rc5/fs/mpage.c 2007-09-12 16:14:02.000000000 -0700
43 * 27Jun2002 axboe@suse.de
44 * use bio_add_page() to build bio's just the right size
45 + * 26Jul2007 alex@clusterfs.com AKA bzzz
46 + * basic delayed allocation support
49 #include <linux/kernel.h>
50 @@ -732,3 +734,408 @@ int mpage_writepage(struct page *page, g
53 EXPORT_SYMBOL(mpage_writepage);
56 + * Delayed allocation stuff
59 +struct mpage_da_data {
60 + struct inode *inode;
61 + struct buffer_head lbh; /* extent of blocks */
62 + unsigned long first_page, next_page; /* extent of pages */
63 + get_block_t *get_block;
64 + struct writeback_control *wbc;
69 + * mpage_da_submit_io - walks through extent of pages and tries to write
70 + * them with __mpage_writepage()
72 + * @mpd->inode: inode
73 + * @mpd->first_page: first page of the extent
74 + * @mpd->next_page: page after the last page of the extent
75 + * @mpd->get_block: the filesystem's block mapper function
77 + * By the time mpage_da_submit_io() is called we expect all blocks
78 + * to be allocated. This may be wrong if allocation failed.
80 + * As pages are already locked by write_cache_pages(), we can't use it
82 +static int mpage_da_submit_io(struct mpage_da_data *mpd)
84 + struct address_space *mapping = mpd->inode->i_mapping;
85 + struct mpage_data mpd_pp = {
87 + .last_block_in_bio = 0,
88 + .get_block = mpd->get_block,
91 + int ret = 0, err, nr_pages, i;
92 + unsigned long index, end;
93 + struct pagevec pvec;
95 + BUG_ON(mpd->next_page <= mpd->first_page);
97 + pagevec_init(&pvec, 0);
98 + index = mpd->first_page;
99 + end = mpd->next_page - 1;
101 + while (index <= end) {
102 + /* XXX: optimize tail */
103 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
106 + for (i = 0; i < nr_pages; i++) {
107 + struct page *page = pvec.pages[i];
109 + index = page->index;
114 + err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
117 + * In error case, we have to continue because
118 + * remaining pages are still locked
119 + * XXX: unlock and re-dirty them?
124 + pagevec_release(&pvec);
127 + mpage_bio_submit(WRITE, mpd_pp.bio);
133 + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
135 + * @mpd->inode - inode to walk through
136 + * @exbh->b_blocknr - first block on a disk
137 + * @exbh->b_size - amount of space in bytes
138 + * @logical - first logical block to start assignment with
140 + * the function goes through all passed space and puts actual disk
141 + * block numbers into buffer heads, dropping BH_Delay
143 +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
144 + struct buffer_head *exbh)
146 + struct inode *inode = mpd->inode;
147 + struct address_space *mapping = inode->i_mapping;
148 + int blocks = exbh->b_size >> inode->i_blkbits;
149 + sector_t pblock = exbh->b_blocknr, cur_logical;
150 + struct buffer_head *head, *bh;
151 + unsigned long index, end;
152 + struct pagevec pvec;
155 + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
156 + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
157 + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
159 + pagevec_init(&pvec, 0);
161 + while (index <= end) {
162 + /* XXX: optimize tail */
163 + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
166 + for (i = 0; i < nr_pages; i++) {
167 + struct page *page = pvec.pages[i];
169 + index = page->index;
174 + BUG_ON(!PageLocked(page));
175 + BUG_ON(PageWriteback(page));
176 + BUG_ON(!page_has_buffers(page));
178 + bh = page_buffers(page);
181 + /* skip blocks out of the range */
183 + if (cur_logical >= logical)
187 + } while ((bh = bh->b_this_page) != head);
190 + if (cur_logical >= logical + blocks)
193 + if (buffer_delay(bh)) {
194 + bh->b_blocknr = pblock;
195 + clear_buffer_delay(bh);
196 + } else if (buffer_mapped(bh)) {
197 + BUG_ON(bh->b_blocknr != pblock);
202 + } while ((bh = bh->b_this_page) != head);
204 + pagevec_release(&pvec);
210 + * __unmap_underlying_blocks - just a helper function to unmap
211 + * set of blocks described by @bh
213 +static inline void __unmap_underlying_blocks(struct inode *inode,
214 + struct buffer_head *bh)
216 + struct block_device *bdev = inode->i_sb->s_bdev;
219 + blocks = bh->b_size >> inode->i_blkbits;
220 + for (i = 0; i < blocks; i++)
221 + unmap_underlying_metadata(bdev, bh->b_blocknr + i);
225 + * mpage_da_map_blocks - go through given space
227 + * @mpd->lbh - bh describing space
228 + * @mpd->get_block - the filesystem's block mapper function
230 + * The function skips space we know is already mapped to disk blocks.
232 + * The function ignores errors ->get_block() returns, thus real
233 + * error handling is postponed to __mpage_writepage()
235 +static void mpage_da_map_blocks(struct mpage_da_data *mpd)
237 + struct buffer_head *lbh = &mpd->lbh;
238 + int err = 0, remain = lbh->b_size;
239 + sector_t next = lbh->b_blocknr;
240 + struct buffer_head new;
243 + * We consider only non-mapped and non-allocated blocks
245 + if (buffer_mapped(lbh) && !buffer_delay(lbh))
249 + new.b_state = lbh->b_state;
251 + new.b_size = remain;
252 + err = mpd->get_block(mpd->inode, next, &new, 1);
255 + * Rather than implement own error handling
256 + * here, we just leave remaining blocks
257 + * unallocated and try again with ->writepage()
261 + BUG_ON(new.b_size == 0);
263 + if (buffer_new(&new))
264 + __unmap_underlying_blocks(mpd->inode, &new);
267 + * If blocks are delayed marked, we need to
268 + * put actual blocknr and drop delayed bit
270 + if (buffer_delay(lbh))
271 + mpage_put_bnr_to_bhs(mpd, next, &new);
273 + /* go for the remaining blocks */
274 + next += new.b_size >> mpd->inode->i_blkbits;
275 + remain -= new.b_size;
279 +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
282 + * mpage_add_bh_to_extent - try to add one more block to extent of blocks
284 + * @mpd->lbh - extent of blocks
285 + * @logical - logical number of the block in the file
286 + * @bh - bh of the block (used to access block's state)
288 + * the function is used to collect contiguous blocks in the same state
290 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
291 + sector_t logical, struct buffer_head *bh)
293 + struct buffer_head *lbh = &mpd->lbh;
296 + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
299 + * First block in the extent
301 + if (lbh->b_size == 0) {
302 + lbh->b_blocknr = logical;
303 + lbh->b_size = bh->b_size;
304 + lbh->b_state = bh->b_state & BH_FLAGS;
309 + * Can we merge the block to our big extent?
311 + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
312 + lbh->b_size += bh->b_size;
317 + * We couldn't merge the block to our extent, so we
318 + * need to flush current extent and start new one
320 + mpage_da_map_blocks(mpd);
323 + * Now start a new extent
325 + lbh->b_size = bh->b_size;
326 + lbh->b_state = bh->b_state & BH_FLAGS;
327 + lbh->b_blocknr = logical;
331 + * __mpage_da_writepage - finds extent of pages and blocks
333 + * @page: page to consider
334 + * @wbc: not used, we just follow rules
337 + * The function finds extents of pages and scans them for all blocks.
339 +static int __mpage_da_writepage(struct page *page,
340 + struct writeback_control *wbc, void *data)
342 + struct mpage_da_data *mpd = data;
343 + struct inode *inode = mpd->inode;
344 + struct buffer_head *bh, *head, fake;
348 + * Can we merge this page to current extent?
350 + if (mpd->next_page != page->index) {
352 + * Nope, we can't. So, we map non-allocated blocks
353 + * and start IO on them using __mpage_writepage()
355 + if (mpd->next_page != mpd->first_page) {
356 + mpage_da_map_blocks(mpd);
357 + mpage_da_submit_io(mpd);
361 + * Start next extent of pages ...
363 + mpd->first_page = page->index;
368 + mpd->lbh.b_size = 0;
369 + mpd->lbh.b_state = 0;
370 + mpd->lbh.b_blocknr = 0;
373 + mpd->next_page = page->index + 1;
374 + logical = (sector_t) page->index <<
375 + (PAGE_CACHE_SHIFT - inode->i_blkbits);
377 + if (!page_has_buffers(page)) {
379 + * There are no attached buffer heads yet (mmap?)
380 + * we treat the page as full of dirty blocks
383 + bh->b_size = PAGE_CACHE_SIZE;
385 + set_buffer_dirty(bh);
386 + set_buffer_uptodate(bh);
387 + mpage_add_bh_to_extent(mpd, logical, bh);
390 + * Page with regular buffer heads, just add all dirty ones
392 + head = page_buffers(page);
395 + BUG_ON(buffer_locked(bh));
396 + if (buffer_dirty(bh))
397 + mpage_add_bh_to_extent(mpd, logical, bh);
399 + } while ((bh = bh->b_this_page) != head);
406 + * mpage_da_writepages - walks the list of dirty pages of the given
407 + * address space, allocates non-allocated blocks, maps newly-allocated
408 + * blocks to existing bhs and issues IO on them
410 + * @mapping: address space structure to write
411 + * @wbc: subtract the number of written pages from *@wbc->nr_to_write
412 + * @get_block: the filesystem's block mapper function.
414 + * This is a library function, which implements the writepages()
415 + * address_space_operation.
417 + * In order to avoid duplication of logic that deals with partial pages,
418 + * multiple bio per page, etc, we find non-allocated blocks, allocate
419 + * them with minimal calls to ->get_block() and re-use __mpage_writepage()
421 + * It's important that we call __mpage_writepage() only once for each
422 + * involved page, otherwise we'd have to implement more complicated logic
423 + * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
425 + * See comments to mpage_writepages()
427 +int mpage_da_writepages(struct address_space *mapping,
428 + struct writeback_control *wbc, get_block_t get_block)
430 + struct mpage_da_data mpd;
434 + return generic_writepages(mapping, wbc);
437 + mpd.inode = mapping->host;
438 + mpd.lbh.b_size = 0;
439 + mpd.lbh.b_state = 0;
440 + mpd.lbh.b_blocknr = 0;
441 + mpd.first_page = 0;
443 + mpd.get_block = get_block;
445 + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
448 + * Handle last extent of pages
450 + if (mpd.next_page != mpd.first_page) {
451 + mpage_da_map_blocks(&mpd);
452 + mpage_da_submit_io(&mpd);
457 +EXPORT_SYMBOL(mpage_da_writepages);
459 Index: linux-2.6.23-rc5/include/linux/mpage.h
460 ===================================================================
461 --- linux-2.6.23-rc5.orig/include/linux/mpage.h 2007-09-11 18:15:51.000000000 -0700
462 +++ linux-2.6.23-rc5/include/linux/mpage.h 2007-09-12 16:14:02.000000000 -0700
463 @@ -18,6 +18,8 @@ int mpage_readpages(struct address_space
464 int mpage_readpage(struct page *page, get_block_t get_block);
465 int mpage_writepages(struct address_space *mapping,
466 struct writeback_control *wbc, get_block_t get_block);
467 +int mpage_da_writepages(struct address_space *mapping,
468 + struct writeback_control *wbc, get_block_t get_block);
469 int mpage_writepage(struct page *page, get_block_t *get_block,
470 struct writeback_control *wbc);