fs/btrfs/extent_io.c (linux-2.6/mini2440.git)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
19 /* temporary define until extent_map moves out of btrfs */
20 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
21 unsigned long extra_flags,
22 void (*ctor)(void *, struct kmem_cache *,
23 unsigned long));
25 static struct kmem_cache *extent_state_cache;
26 static struct kmem_cache *extent_buffer_cache;
28 static LIST_HEAD(buffers);
29 static LIST_HEAD(states);
30 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
32 #define BUFFER_LRU_MAX 64
34 struct tree_entry {
35 u64 start;
36 u64 end;
37 struct rb_node rb_node;
40 struct extent_page_data {
41 struct bio *bio;
42 struct extent_io_tree *tree;
43 get_extent_t *get_extent;
46 int __init extent_io_init(void)
48 extent_state_cache = btrfs_cache_create("extent_state",
49 sizeof(struct extent_state), 0,
50 NULL);
51 if (!extent_state_cache)
52 return -ENOMEM;
54 extent_buffer_cache = btrfs_cache_create("extent_buffers",
55 sizeof(struct extent_buffer), 0,
56 NULL);
57 if (!extent_buffer_cache)
58 goto free_state_cache;
59 return 0;
61 free_state_cache:
62 kmem_cache_destroy(extent_state_cache);
63 return -ENOMEM;
66 void extent_io_exit(void)
68 struct extent_state *state;
69 struct extent_buffer *eb;
71 while (!list_empty(&states)) {
72 state = list_entry(states.next, struct extent_state, leak_list);
73 printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
74 list_del(&state->leak_list);
75 kmem_cache_free(extent_state_cache, state);
79 while (!list_empty(&buffers)) {
80 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
81 printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
82 list_del(&eb->leak_list);
83 kmem_cache_free(extent_buffer_cache, eb);
85 if (extent_state_cache)
86 kmem_cache_destroy(extent_state_cache);
87 if (extent_buffer_cache)
88 kmem_cache_destroy(extent_buffer_cache);
91 void extent_io_tree_init(struct extent_io_tree *tree,
92 struct address_space *mapping, gfp_t mask)
94 tree->state.rb_node = NULL;
95 tree->buffer.rb_node = NULL;
96 tree->ops = NULL;
97 tree->dirty_bytes = 0;
98 spin_lock_init(&tree->lock);
99 spin_lock_init(&tree->buffer_lock);
100 tree->mapping = mapping;
102 EXPORT_SYMBOL(extent_io_tree_init);
104 struct extent_state *alloc_extent_state(gfp_t mask)
106 struct extent_state *state;
107 unsigned long flags;
109 state = kmem_cache_alloc(extent_state_cache, mask);
110 if (!state)
111 return state;
112 state->state = 0;
113 state->private = 0;
114 state->tree = NULL;
115 spin_lock_irqsave(&leak_lock, flags);
116 list_add(&state->leak_list, &states);
117 spin_unlock_irqrestore(&leak_lock, flags);
119 atomic_set(&state->refs, 1);
120 init_waitqueue_head(&state->wq);
121 return state;
123 EXPORT_SYMBOL(alloc_extent_state);
125 void free_extent_state(struct extent_state *state)
127 if (!state)
128 return;
129 if (atomic_dec_and_test(&state->refs)) {
130 unsigned long flags;
131 WARN_ON(state->tree);
132 spin_lock_irqsave(&leak_lock, flags);
133 list_del(&state->leak_list);
134 spin_unlock_irqrestore(&leak_lock, flags);
135 kmem_cache_free(extent_state_cache, state);
138 EXPORT_SYMBOL(free_extent_state);
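/*
 * insert a node into an rb tree of tree_entry structs, keyed by the
 * 'offset' passed in (callers pass the end of the range).  If an existing
 * entry already contains the offset, that entry's node is returned and
 * nothing is inserted; otherwise the new node is linked in and NULL is
 * returned.
 */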
140 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
141 struct rb_node *node)
143 struct rb_node ** p = &root->rb_node;
144 struct rb_node * parent = NULL;
145 struct tree_entry *entry;
147 while(*p) {
148 parent = *p;
149 entry = rb_entry(parent, struct tree_entry, rb_node);
151 if (offset < entry->start)
152 p = &(*p)->rb_left;
153 else if (offset > entry->end)
154 p = &(*p)->rb_right;
155 else
156 return parent;
159 entry = rb_entry(node, struct tree_entry, rb_node);
160 rb_link_node(node, parent, p);
161 rb_insert_color(node, root);
162 return NULL;
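/*
 * search the tree for an entry that contains 'offset'.  If none is found,
 * NULL is returned; when the caller passes non-NULL pointers, *prev_ret is
 * set to the first entry that ends at or after the offset and *next_ret to
 * the last entry that starts at or before it.
 */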
165 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
166 struct rb_node **prev_ret,
167 struct rb_node **next_ret)
169 struct rb_root *root = &tree->state;
170 struct rb_node * n = root->rb_node;
171 struct rb_node *prev = NULL;
172 struct rb_node *orig_prev = NULL;
173 struct tree_entry *entry;
174 struct tree_entry *prev_entry = NULL;
176 while(n) {
177 entry = rb_entry(n, struct tree_entry, rb_node);
178 prev = n;
179 prev_entry = entry;
181 if (offset < entry->start)
182 n = n->rb_left;
183 else if (offset > entry->end)
184 n = n->rb_right;
185 else {
186 return n;
190 if (prev_ret) {
191 orig_prev = prev;
192 while(prev && offset > prev_entry->end) {
193 prev = rb_next(prev);
194 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
196 *prev_ret = prev;
197 prev = orig_prev;
200 if (next_ret) {
201 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
202 while(prev && offset < prev_entry->start) {
203 prev = rb_prev(prev);
204 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
206 *next_ret = prev;
208 return NULL;
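/*
 * like __etree_search, but when no entry contains the offset the next
 * entry in the tree is returned instead of NULL.
 */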
211 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
212 u64 offset)
214 struct rb_node *prev = NULL;
215 struct rb_node *ret;
217 ret = __etree_search(tree, offset, &prev, NULL);
218 if (!ret) {
219 return prev;
221 return ret;
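/*
 * insert an extent_buffer into the per-tree buffer rb tree, keyed by the
 * buffer's start offset.  If a buffer already exists at that offset it is
 * returned instead of inserting a duplicate.
 */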
224 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
225 u64 offset, struct rb_node *node)
227 struct rb_root *root = &tree->buffer;
228 struct rb_node ** p = &root->rb_node;
229 struct rb_node * parent = NULL;
230 struct extent_buffer *eb;
232 while(*p) {
233 parent = *p;
234 eb = rb_entry(parent, struct extent_buffer, rb_node);
236 if (offset < eb->start)
237 p = &(*p)->rb_left;
238 else if (offset > eb->start)
239 p = &(*p)->rb_right;
240 else
241 return eb;
244 rb_link_node(node, parent, p);
245 rb_insert_color(node, root);
246 return NULL;
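/*
 * find the extent_buffer whose start offset is exactly 'offset', or NULL
 * if none is in the tree.
 */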
249 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
250 u64 offset)
252 struct rb_root *root = &tree->buffer;
253 struct rb_node * n = root->rb_node;
254 struct extent_buffer *eb;
256 while(n) {
257 eb = rb_entry(n, struct extent_buffer, rb_node);
258 if (offset < eb->start)
259 n = n->rb_left;
260 else if (offset > eb->start)
261 n = n->rb_right;
262 else
263 return eb;
265 return NULL;
269 * utility function to look for merge candidates inside a given range.
270 * Any extents with matching state are merged together into a single
271 * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
272 * are not merged because the end_io handlers need to be able to do
273 * operations on them without sleeping (or doing allocations/splits).
275 * This should be called with the tree lock held.
277 static int merge_state(struct extent_io_tree *tree,
278 struct extent_state *state)
280 struct extent_state *other;
281 struct rb_node *other_node;
283 if (state->state & EXTENT_IOBITS)
284 return 0;
286 other_node = rb_prev(&state->rb_node);
287 if (other_node) {
288 other = rb_entry(other_node, struct extent_state, rb_node);
289 if (other->end == state->start - 1 &&
290 other->state == state->state) {
291 state->start = other->start;
292 other->tree = NULL;
293 rb_erase(&other->rb_node, &tree->state);
294 free_extent_state(other);
297 other_node = rb_next(&state->rb_node);
298 if (other_node) {
299 other = rb_entry(other_node, struct extent_state, rb_node);
300 if (other->start == state->end + 1 &&
301 other->state == state->state) {
302 other->start = state->start;
303 state->tree = NULL;
304 rb_erase(&state->rb_node, &tree->state);
305 free_extent_state(state);
308 return 0;
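/*
 * notify the owner of the tree that bits are about to be set on this
 * extent state, via the optional set_bit_hook in the tree operations.
 */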
311 static void set_state_cb(struct extent_io_tree *tree,
312 struct extent_state *state,
313 unsigned long bits)
315 if (tree->ops && tree->ops->set_bit_hook) {
316 tree->ops->set_bit_hook(tree->mapping->host, state->start,
317 state->end, state->state, bits);
321 static void clear_state_cb(struct extent_io_tree *tree,
322 struct extent_state *state,
323 unsigned long bits)
325 if (tree->ops && tree->ops->clear_bit_hook) {
326 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
327 state->end, state->state, bits);
332 * insert an extent_state struct into the tree. 'bits' are set on the
333 * struct before it is inserted.
335 * This may return -EEXIST if the extent is already there, in which case the
336 * state struct is freed.
338 * The tree lock is not taken internally. This is a utility function and
339 * probably isn't what you want to call (see set/clear_extent_bit).
341 static int insert_state(struct extent_io_tree *tree,
342 struct extent_state *state, u64 start, u64 end,
343 int bits)
345 struct rb_node *node;
347 if (end < start) {
348 printk("end < start %Lu %Lu\n", end, start);
349 WARN_ON(1);
351 if (bits & EXTENT_DIRTY)
352 tree->dirty_bytes += end - start + 1;
353 set_state_cb(tree, state, bits);
354 state->state |= bits;
355 state->start = start;
356 state->end = end;
357 node = tree_insert(&tree->state, end, &state->rb_node);
358 if (node) {
359 struct extent_state *found;
360 found = rb_entry(node, struct extent_state, rb_node);
361 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
362 free_extent_state(state);
363 return -EEXIST;
365 state->tree = tree;
366 merge_state(tree, state);
367 return 0;
371 * split a given extent state struct in two, inserting the preallocated
372 * struct 'prealloc' as the newly created second half. 'split' indicates an
373 * offset inside 'orig' where it should be split.
375 * Before calling,
376 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
377 * are two extent state structs in the tree:
378 * prealloc: [orig->start, split - 1]
379 * orig: [ split, orig->end ]
381 * The tree locks are not taken by this function. They need to be held
382 * by the caller.
384 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
385 struct extent_state *prealloc, u64 split)
387 struct rb_node *node;
388 prealloc->start = orig->start;
389 prealloc->end = split - 1;
390 prealloc->state = orig->state;
391 orig->start = split;
393 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
394 if (node) {
395 struct extent_state *found;
396 found = rb_entry(node, struct extent_state, rb_node);
397 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
398 free_extent_state(prealloc);
399 return -EEXIST;
401 prealloc->tree = tree;
402 return 0;
406 * utility function to clear some bits in an extent state struct.
407 * it will optionally wake up any one waiting on this state (wake == 1), or
408 * forcibly remove the state from the tree (delete == 1).
410 * If no bits are set on the state struct after clearing things, the
411 * struct is freed and removed from the tree
413 static int clear_state_bit(struct extent_io_tree *tree,
414 struct extent_state *state, int bits, int wake,
415 int delete)
417 int ret = state->state & bits;
419 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
420 u64 range = state->end - state->start + 1;
421 WARN_ON(range > tree->dirty_bytes);
422 tree->dirty_bytes -= range;
424 clear_state_cb(tree, state, bits);
425 state->state &= ~bits;
426 if (wake)
427 wake_up(&state->wq);
428 if (delete || state->state == 0) {
429 if (state->tree) {
430 clear_state_cb(tree, state, state->state);
431 rb_erase(&state->rb_node, &tree->state);
432 state->tree = NULL;
433 free_extent_state(state);
434 } else {
435 WARN_ON(1);
437 } else {
438 merge_state(tree, state);
440 return ret;
444 * clear some bits on a range in the tree. This may require splitting
445 * or inserting elements in the tree, so the gfp mask is used to
446 * indicate which allocations or sleeping are allowed.
448 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
449 * the given range from the tree regardless of state (ie for truncate).
451 * the range [start, end] is inclusive.
453 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
454 * bits were already set, or zero if none of the bits were already set.
456 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
457 int bits, int wake, int delete, gfp_t mask)
459 struct extent_state *state;
460 struct extent_state *prealloc = NULL;
461 struct rb_node *node;
462 unsigned long flags;
463 int err;
464 int set = 0;
466 again:
467 if (!prealloc && (mask & __GFP_WAIT)) {
468 prealloc = alloc_extent_state(mask);
469 if (!prealloc)
470 return -ENOMEM;
473 spin_lock_irqsave(&tree->lock, flags);
475 * this search will find the extents that end after
476 * our range starts
478 node = tree_search(tree, start);
479 if (!node)
480 goto out;
481 state = rb_entry(node, struct extent_state, rb_node);
482 if (state->start > end)
483 goto out;
484 WARN_ON(state->end < start);
487 * | ---- desired range ---- |
488 * | state | or
489 * | ------------- state -------------- |
491 * We need to split the extent we found, and may flip
492 * bits on second half.
494 * If the extent we found extends past our range, we
495 * just split and search again. It'll get split again
496 * the next time though.
498 * If the extent we found is inside our range, we clear
499 * the desired bit on it.
502 if (state->start < start) {
503 if (!prealloc)
504 prealloc = alloc_extent_state(GFP_ATOMIC);
505 err = split_state(tree, state, prealloc, start);
506 BUG_ON(err == -EEXIST);
507 prealloc = NULL;
508 if (err)
509 goto out;
510 if (state->end <= end) {
511 start = state->end + 1;
512 set |= clear_state_bit(tree, state, bits,
513 wake, delete);
514 } else {
515 start = state->start;
517 goto search_again;
520 * | ---- desired range ---- |
521 * | state |
522 * We need to split the extent, and clear the bit
523 * on the first half
525 if (state->start <= end && state->end > end) {
526 if (!prealloc)
527 prealloc = alloc_extent_state(GFP_ATOMIC);
528 err = split_state(tree, state, prealloc, end + 1);
529 BUG_ON(err == -EEXIST);
531 if (wake)
532 wake_up(&state->wq);
533 set |= clear_state_bit(tree, prealloc, bits,
534 wake, delete);
535 prealloc = NULL;
536 goto out;
539 start = state->end + 1;
540 set |= clear_state_bit(tree, state, bits, wake, delete);
541 goto search_again;
543 out:
544 spin_unlock_irqrestore(&tree->lock, flags);
545 if (prealloc)
546 free_extent_state(prealloc);
548 return set;
550 search_again:
551 if (start > end)
552 goto out;
553 spin_unlock_irqrestore(&tree->lock, flags);
554 if (mask & __GFP_WAIT)
555 cond_resched();
556 goto again;
558 EXPORT_SYMBOL(clear_extent_bit);
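/*
 * drop the tree lock and sleep until someone clearing bits on this
 * extent state wakes us up.  The tree lock is retaken before returning.
 */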
560 static int wait_on_state(struct extent_io_tree *tree,
561 struct extent_state *state)
563 DEFINE_WAIT(wait);
564 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
565 spin_unlock_irq(&tree->lock);
566 schedule();
567 spin_lock_irq(&tree->lock);
568 finish_wait(&state->wq, &wait);
569 return 0;
573 * waits for one or more bits to clear on a range in the state tree.
574 * The range [start, end] is inclusive.
575 * The tree lock is taken by this function
577 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
579 struct extent_state *state;
580 struct rb_node *node;
582 spin_lock_irq(&tree->lock);
583 again:
584 while (1) {
586 * this search will find all the extents that end after
587 * our range starts
589 node = tree_search(tree, start);
590 if (!node)
591 break;
593 state = rb_entry(node, struct extent_state, rb_node);
595 if (state->start > end)
596 goto out;
598 if (state->state & bits) {
599 start = state->start;
600 atomic_inc(&state->refs);
601 wait_on_state(tree, state);
602 free_extent_state(state);
603 goto again;
605 start = state->end + 1;
607 if (start > end)
608 break;
610 if (need_resched()) {
611 spin_unlock_irq(&tree->lock);
612 cond_resched();
613 spin_lock_irq(&tree->lock);
616 out:
617 spin_unlock_irq(&tree->lock);
618 return 0;
620 EXPORT_SYMBOL(wait_extent_bit);
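/*
 * set bits on a single extent state, updating the dirty byte accounting
 * and calling the set_bit hook.  The caller must hold the tree lock.
 */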
622 static void set_state_bits(struct extent_io_tree *tree,
623 struct extent_state *state,
624 int bits)
626 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
627 u64 range = state->end - state->start + 1;
628 tree->dirty_bytes += range;
630 set_state_cb(tree, state, bits);
631 state->state |= bits;
635 * set some bits on a range in the tree. This may require allocations
636 * or sleeping, so the gfp mask is used to indicate what is allowed.
638 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
639 * range already has the desired bits set. The start of the existing
640 * range is returned in failed_start in this case.
642 * [start, end] is inclusive
643 * This takes the tree lock.
645 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
646 int exclusive, u64 *failed_start, gfp_t mask)
648 struct extent_state *state;
649 struct extent_state *prealloc = NULL;
650 struct rb_node *node;
651 unsigned long flags;
652 int err = 0;
653 int set;
654 u64 last_start;
655 u64 last_end;
656 again:
657 if (!prealloc && (mask & __GFP_WAIT)) {
658 prealloc = alloc_extent_state(mask);
659 if (!prealloc)
660 return -ENOMEM;
663 spin_lock_irqsave(&tree->lock, flags);
665 * this search will find all the extents that end after
666 * our range starts.
668 node = tree_search(tree, start);
669 if (!node) {
670 err = insert_state(tree, prealloc, start, end, bits);
671 prealloc = NULL;
672 BUG_ON(err == -EEXIST);
673 goto out;
676 state = rb_entry(node, struct extent_state, rb_node);
677 last_start = state->start;
678 last_end = state->end;
681 * | ---- desired range ---- |
682 * | state |
684 * Just lock what we found and keep going
686 if (state->start == start && state->end <= end) {
687 set = state->state & bits;
688 if (set && exclusive) {
689 *failed_start = state->start;
690 err = -EEXIST;
691 goto out;
693 set_state_bits(tree, state, bits);
694 start = state->end + 1;
695 merge_state(tree, state);
696 goto search_again;
700 * | ---- desired range ---- |
701 * | state |
702 * or
703 * | ------------- state -------------- |
705 * We need to split the extent we found, and may flip bits on
706 * second half.
708 * If the extent we found extends past our
709 * range, we just split and search again. It'll get split
710 * again the next time though.
712 * If the extent we found is inside our range, we set the
713 * desired bit on it.
715 if (state->start < start) {
716 set = state->state & bits;
717 if (exclusive && set) {
718 *failed_start = start;
719 err = -EEXIST;
720 goto out;
722 err = split_state(tree, state, prealloc, start);
723 BUG_ON(err == -EEXIST);
724 prealloc = NULL;
725 if (err)
726 goto out;
727 if (state->end <= end) {
728 set_state_bits(tree, state, bits);
729 start = state->end + 1;
730 merge_state(tree, state);
731 } else {
732 start = state->start;
734 goto search_again;
737 * | ---- desired range ---- |
738 * | state | or | state |
740 * There's a hole, we need to insert something in it and
741 * ignore the extent we found.
743 if (state->start > start) {
744 u64 this_end;
745 if (end < last_start)
746 this_end = end;
747 else
748 this_end = last_start -1;
749 err = insert_state(tree, prealloc, start, this_end,
750 bits);
751 prealloc = NULL;
752 BUG_ON(err == -EEXIST);
753 if (err)
754 goto out;
755 start = this_end + 1;
756 goto search_again;
759 * | ---- desired range ---- |
760 * | state |
761 * We need to split the extent, and set the bit
762 * on the first half
764 if (state->start <= end && state->end > end) {
765 set = state->state & bits;
766 if (exclusive && set) {
767 *failed_start = start;
768 err = -EEXIST;
769 goto out;
771 err = split_state(tree, state, prealloc, end + 1);
772 BUG_ON(err == -EEXIST);
774 set_state_bits(tree, prealloc, bits);
775 merge_state(tree, prealloc);
776 prealloc = NULL;
777 goto out;
780 goto search_again;
782 out:
783 spin_unlock_irqrestore(&tree->lock, flags);
784 if (prealloc)
785 free_extent_state(prealloc);
787 return err;
789 search_again:
790 if (start > end)
791 goto out;
792 spin_unlock_irqrestore(&tree->lock, flags);
793 if (mask & __GFP_WAIT)
794 cond_resched();
795 goto again;
797 EXPORT_SYMBOL(set_extent_bit);
799 /* wrappers around set/clear extent bit */
800 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
801 gfp_t mask)
803 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
804 mask);
806 EXPORT_SYMBOL(set_extent_dirty);
808 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
809 gfp_t mask)
811 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
813 EXPORT_SYMBOL(set_extent_ordered);
815 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
816 int bits, gfp_t mask)
818 return set_extent_bit(tree, start, end, bits, 0, NULL,
819 mask);
821 EXPORT_SYMBOL(set_extent_bits);
823 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
824 int bits, gfp_t mask)
826 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
828 EXPORT_SYMBOL(clear_extent_bits);
830 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
831 gfp_t mask)
833 return set_extent_bit(tree, start, end,
834 EXTENT_DELALLOC | EXTENT_DIRTY,
835 0, NULL, mask);
837 EXPORT_SYMBOL(set_extent_delalloc);
839 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
840 gfp_t mask)
842 return clear_extent_bit(tree, start, end,
843 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
845 EXPORT_SYMBOL(clear_extent_dirty);
847 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
850 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
852 EXPORT_SYMBOL(clear_extent_ordered);
854 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
855 gfp_t mask)
857 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
858 mask);
860 EXPORT_SYMBOL(set_extent_new);
862 int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
865 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
867 EXPORT_SYMBOL(clear_extent_new);
869 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
870 gfp_t mask)
872 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
873 mask);
875 EXPORT_SYMBOL(set_extent_uptodate);
877 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
878 gfp_t mask)
880 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
882 EXPORT_SYMBOL(clear_extent_uptodate);
884 int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
885 gfp_t mask)
887 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
888 0, NULL, mask);
890 EXPORT_SYMBOL(set_extent_writeback);
892 int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
893 gfp_t mask)
895 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
897 EXPORT_SYMBOL(clear_extent_writeback);
899 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
901 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
903 EXPORT_SYMBOL(wait_on_extent_writeback);
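/*
 * lock a range in the tree by setting EXTENT_LOCKED exclusively.  If part
 * of the range is already locked, wait for it to be unlocked (when the
 * mask allows sleeping) and retry from the start of the conflict.
 */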
905 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
907 int err;
908 u64 failed_start;
909 while (1) {
910 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
911 &failed_start, mask);
912 if (err == -EEXIST && (mask & __GFP_WAIT)) {
913 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
914 start = failed_start;
915 } else {
916 break;
918 WARN_ON(start > end);
920 return err;
922 EXPORT_SYMBOL(lock_extent);
924 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
925 gfp_t mask)
927 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
929 EXPORT_SYMBOL(unlock_extent);
932 * helper function to set pages and extents in the tree dirty
934 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
936 unsigned long index = start >> PAGE_CACHE_SHIFT;
937 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
938 struct page *page;
940 while (index <= end_index) {
941 page = find_get_page(tree->mapping, index);
942 BUG_ON(!page);
943 __set_page_dirty_nobuffers(page);
944 page_cache_release(page);
945 index++;
947 set_extent_dirty(tree, start, end, GFP_NOFS);
948 return 0;
950 EXPORT_SYMBOL(set_range_dirty);
953 * helper function to set both pages and extents in the tree writeback
955 int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
957 unsigned long index = start >> PAGE_CACHE_SHIFT;
958 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
959 struct page *page;
961 while (index <= end_index) {
962 page = find_get_page(tree->mapping, index);
963 BUG_ON(!page);
964 set_page_writeback(page);
965 page_cache_release(page);
966 index++;
968 set_extent_writeback(tree, start, end, GFP_NOFS);
969 return 0;
971 EXPORT_SYMBOL(set_range_writeback);
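/*
 * find the first extent that ends at or after 'start' and has any of
 * 'bits' set.  The range found is returned in start_ret/end_ret.
 * Returns 1 if nothing was found, 0 otherwise.
 */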
973 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
974 u64 *start_ret, u64 *end_ret, int bits)
976 struct rb_node *node;
977 struct extent_state *state;
978 int ret = 1;
980 spin_lock_irq(&tree->lock);
982 * this search will find all the extents that end after
983 * our range starts.
985 node = tree_search(tree, start);
986 if (!node) {
987 goto out;
990 while(1) {
991 state = rb_entry(node, struct extent_state, rb_node);
992 if (state->end >= start && (state->state & bits)) {
993 *start_ret = state->start;
994 *end_ret = state->end;
995 ret = 0;
996 break;
998 node = rb_next(node);
999 if (!node)
1000 break;
1002 out:
1003 spin_unlock_irq(&tree->lock);
1004 return ret;
1006 EXPORT_SYMBOL(find_first_extent_bit);
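/*
 * like find_first_extent_bit, but the matching extent_state itself is
 * returned and locking is left to the caller.
 */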
1008 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1009 u64 start, int bits)
1011 struct rb_node *node;
1012 struct extent_state *state;
1015 * this search will find all the extents that end after
1016 * our range starts.
1018 node = tree_search(tree, start);
1019 if (!node) {
1020 goto out;
1023 while(1) {
1024 state = rb_entry(node, struct extent_state, rb_node);
1025 if (state->end >= start && (state->state & bits)) {
1026 return state;
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1032 out:
1033 return NULL;
1035 EXPORT_SYMBOL(find_first_extent_bit_state);
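/*
 * find a contiguous run of EXTENT_DELALLOC extents that includes *start,
 * walk back to the beginning of the run, lock it and return the number of
 * extents locked (zero if none were found).  *start and *end are set to
 * the locked range, and no more than max_bytes are included.
 */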
1037 u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1038 u64 *start, u64 *end, u64 max_bytes)
1040 struct rb_node *node;
1041 struct extent_state *state;
1042 u64 cur_start = *start;
1043 u64 found = 0;
1044 u64 total_bytes = 0;
1046 spin_lock_irq(&tree->lock);
1048 * this search will find all the extents that end after
1049 * our range starts.
1051 search_again:
1052 node = tree_search(tree, cur_start);
1053 if (!node) {
1054 if (!found)
1055 *end = (u64)-1;
1056 goto out;
1059 while(1) {
1060 state = rb_entry(node, struct extent_state, rb_node);
1061 if (found && state->start != cur_start) {
1062 goto out;
1064 if (!(state->state & EXTENT_DELALLOC)) {
1065 if (!found)
1066 *end = state->end;
1067 goto out;
1069 if (!found) {
1070 struct extent_state *prev_state;
1071 struct rb_node *prev_node = node;
1072 while(1) {
1073 prev_node = rb_prev(prev_node);
1074 if (!prev_node)
1075 break;
1076 prev_state = rb_entry(prev_node,
1077 struct extent_state,
1078 rb_node);
1079 if (!(prev_state->state & EXTENT_DELALLOC))
1080 break;
1081 state = prev_state;
1082 node = prev_node;
1085 if (state->state & EXTENT_LOCKED) {
1086 DEFINE_WAIT(wait);
1087 atomic_inc(&state->refs);
1088 prepare_to_wait(&state->wq, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090 spin_unlock_irq(&tree->lock);
1091 schedule();
1092 spin_lock_irq(&tree->lock);
1093 finish_wait(&state->wq, &wait);
1094 free_extent_state(state);
1095 goto search_again;
1097 set_state_cb(tree, state, EXTENT_LOCKED);
1098 state->state |= EXTENT_LOCKED;
1099 if (!found)
1100 *start = state->start;
1101 found++;
1102 *end = state->end;
1103 cur_start = state->end + 1;
1104 node = rb_next(node);
1105 if (!node)
1106 break;
1107 total_bytes += state->end - state->start + 1;
1108 if (total_bytes >= max_bytes)
1109 break;
1111 out:
1112 spin_unlock_irq(&tree->lock);
1113 return found;
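/*
 * count the number of bytes in [*start, search_end] covered by extents
 * that have any of 'bits' set, stopping once max_bytes have been counted.
 * *start is moved to the start of the first matching extent.
 */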
1116 u64 count_range_bits(struct extent_io_tree *tree,
1117 u64 *start, u64 search_end, u64 max_bytes,
1118 unsigned long bits)
1120 struct rb_node *node;
1121 struct extent_state *state;
1122 u64 cur_start = *start;
1123 u64 total_bytes = 0;
1124 int found = 0;
1126 if (search_end <= cur_start) {
1127 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1128 WARN_ON(1);
1129 return 0;
1132 spin_lock_irq(&tree->lock);
1133 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1134 total_bytes = tree->dirty_bytes;
1135 goto out;
1138 * this search will find all the extents that end after
1139 * our range starts.
1141 node = tree_search(tree, cur_start);
1142 if (!node) {
1143 goto out;
1146 while(1) {
1147 state = rb_entry(node, struct extent_state, rb_node);
1148 if (state->start > search_end)
1149 break;
1150 if (state->end >= cur_start && (state->state & bits)) {
1151 total_bytes += min(search_end, state->end) + 1 -
1152 max(cur_start, state->start);
1153 if (total_bytes >= max_bytes)
1154 break;
1155 if (!found) {
1156 *start = state->start;
1157 found = 1;
1160 node = rb_next(node);
1161 if (!node)
1162 break;
1164 out:
1165 spin_unlock_irq(&tree->lock);
1166 return total_bytes;
1169 * helper function to lock both pages and extents in the tree.
1170 * pages must be locked first.
1172 int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1174 unsigned long index = start >> PAGE_CACHE_SHIFT;
1175 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1176 struct page *page;
1177 int err;
1179 while (index <= end_index) {
1180 page = grab_cache_page(tree->mapping, index);
1181 if (!page) {
1182 err = -ENOMEM;
1183 goto failed;
1185 if (IS_ERR(page)) {
1186 err = PTR_ERR(page);
1187 goto failed;
1189 index++;
1191 lock_extent(tree, start, end, GFP_NOFS);
1192 return 0;
1194 failed:
1196 * we failed above in getting the page at 'index', so we undo here
1197 * up to but not including the page at 'index'
1199 end_index = index;
1200 index = start >> PAGE_CACHE_SHIFT;
1201 while (index < end_index) {
1202 page = find_get_page(tree->mapping, index);
1203 unlock_page(page);
1204 page_cache_release(page);
1205 index++;
1207 return err;
1209 EXPORT_SYMBOL(lock_range);
1212 * helper function to unlock both pages and extents in the tree.
1214 int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1216 unsigned long index = start >> PAGE_CACHE_SHIFT;
1217 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1218 struct page *page;
1220 while (index <= end_index) {
1221 page = find_get_page(tree->mapping, index);
1222 unlock_page(page);
1223 page_cache_release(page);
1224 index++;
1226 unlock_extent(tree, start, end, GFP_NOFS);
1227 return 0;
1229 EXPORT_SYMBOL(unlock_range);
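/*
 * attach private data to the extent state that starts exactly at 'start'.
 * Returns -ENOENT if there is no such extent.
 */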
1231 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1233 struct rb_node *node;
1234 struct extent_state *state;
1235 int ret = 0;
1237 spin_lock_irq(&tree->lock);
1239 * this search will find all the extents that end after
1240 * our range starts.
1242 node = tree_search(tree, start);
1243 if (!node) {
1244 ret = -ENOENT;
1245 goto out;
1247 state = rb_entry(node, struct extent_state, rb_node);
1248 if (state->start != start) {
1249 ret = -ENOENT;
1250 goto out;
1252 state->private = private;
1253 out:
1254 spin_unlock_irq(&tree->lock);
1255 return ret;
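/*
 * read back the private data stored on the extent state that starts
 * exactly at 'start'.  Returns -ENOENT if there is no such extent.
 */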
1258 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1260 struct rb_node *node;
1261 struct extent_state *state;
1262 int ret = 0;
1264 spin_lock_irq(&tree->lock);
1266 * this search will find all the extents that end after
1267 * our range starts.
1269 node = tree_search(tree, start);
1270 if (!node) {
1271 ret = -ENOENT;
1272 goto out;
1274 state = rb_entry(node, struct extent_state, rb_node);
1275 if (state->start != start) {
1276 ret = -ENOENT;
1277 goto out;
1279 *private = state->private;
1280 out:
1281 spin_unlock_irq(&tree->lock);
1282 return ret;
1286 * searches a range in the state tree for a given mask.
1287 * If 'filled' == 1, this returns 1 only if every extent in the tree
1288 * has the bits set. Otherwise, 1 is returned if any bit in the
1289 * range is found set.
1291 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1292 int bits, int filled)
1294 struct extent_state *state = NULL;
1295 struct rb_node *node;
1296 int bitset = 0;
1297 unsigned long flags;
1299 spin_lock_irqsave(&tree->lock, flags);
1300 node = tree_search(tree, start);
1301 while (node && start <= end) {
1302 state = rb_entry(node, struct extent_state, rb_node);
1304 if (filled && state->start > start) {
1305 bitset = 0;
1306 break;
1309 if (state->start > end)
1310 break;
1312 if (state->state & bits) {
1313 bitset = 1;
1314 if (!filled)
1315 break;
1316 } else if (filled) {
1317 bitset = 0;
1318 break;
1320 start = state->end + 1;
1321 if (start > end)
1322 break;
1323 node = rb_next(node);
1324 if (!node) {
1325 if (filled)
1326 bitset = 0;
1327 break;
1330 spin_unlock_irqrestore(&tree->lock, flags);
1331 return bitset;
1333 EXPORT_SYMBOL(test_range_bit);
1336 * helper function to set a given page up to date if all the
1337 * extents in the tree for that page are up to date
1339 static int check_page_uptodate(struct extent_io_tree *tree,
1340 struct page *page)
1342 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1343 u64 end = start + PAGE_CACHE_SIZE - 1;
1344 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1345 SetPageUptodate(page);
1346 return 0;
1350 * helper function to unlock a page if all the extents in the tree
1351 * for that page are unlocked
1353 static int check_page_locked(struct extent_io_tree *tree,
1354 struct page *page)
1356 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1357 u64 end = start + PAGE_CACHE_SIZE - 1;
1358 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1359 unlock_page(page);
1360 return 0;
1364 * helper function to end page writeback if all the extents
1365 * in the tree for that page are done with writeback
1367 static int check_page_writeback(struct extent_io_tree *tree,
1368 struct page *page)
1370 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1371 u64 end = start + PAGE_CACHE_SIZE - 1;
1372 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1373 end_page_writeback(page);
1374 return 0;
1377 /* lots and lots of room for performance fixes in the end_bio funcs */
1380 * after a writepage IO is done, we need to:
1381 * clear the uptodate bits on error
1382 * clear the writeback bits in the extent tree for this IO
1383 * end_page_writeback if the page has no more pending IO
1385 * Scheduling is not allowed, so the extent state tree is expected
1386 * to have one and only one object corresponding to this IO.
1388 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1389 static void end_bio_extent_writepage(struct bio *bio, int err)
1390 #else
1391 static int end_bio_extent_writepage(struct bio *bio,
1392 unsigned int bytes_done, int err)
1393 #endif
1395 int uptodate = err == 0;
1396 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1397 struct extent_state *state = bio->bi_private;
1398 struct extent_io_tree *tree = state->tree;
1399 struct rb_node *node;
1400 u64 start;
1401 u64 end;
1402 u64 cur;
1403 int whole_page;
1404 int ret;
1405 unsigned long flags;
1407 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1408 if (bio->bi_size)
1409 return 1;
1410 #endif
1411 do {
1412 struct page *page = bvec->bv_page;
1413 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1414 bvec->bv_offset;
1415 end = start + bvec->bv_len - 1;
1417 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1418 whole_page = 1;
1419 else
1420 whole_page = 0;
1422 if (--bvec >= bio->bi_io_vec)
1423 prefetchw(&bvec->bv_page->flags);
1424 if (tree->ops && tree->ops->writepage_end_io_hook) {
1425 ret = tree->ops->writepage_end_io_hook(page, start,
1426 end, state, uptodate);
1427 if (ret)
1428 uptodate = 0;
1431 if (!uptodate && tree->ops &&
1432 tree->ops->writepage_io_failed_hook) {
1433 ret = tree->ops->writepage_io_failed_hook(bio, page,
1434 start, end, state);
1435 if (ret == 0) {
1436 state = NULL;
1437 uptodate = (err == 0);
1438 continue;
1442 if (!uptodate) {
1443 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1444 ClearPageUptodate(page);
1445 SetPageError(page);
1449 * bios can get merged in funny ways, and so we need to
1450 * be careful with the state variable. We know the
1451 * state won't be merged with others because it has
1452 * WRITEBACK set, but we can't be sure each biovec is
1453 * sequential in the file. So, if our cached state
1454 * doesn't match the expected end, search the tree
1455 * for the correct one.
1458 spin_lock_irqsave(&tree->lock, flags);
1459 if (!state || state->end != end) {
1460 state = NULL;
1461 node = __etree_search(tree, start, NULL, NULL);
1462 if (node) {
1463 state = rb_entry(node, struct extent_state,
1464 rb_node);
1465 if (state->end != end ||
1466 !(state->state & EXTENT_WRITEBACK))
1467 state = NULL;
1469 if (!state) {
1470 spin_unlock_irqrestore(&tree->lock, flags);
1471 clear_extent_writeback(tree, start,
1472 end, GFP_ATOMIC);
1473 goto next_io;
1476 cur = end;
1477 while(1) {
1478 struct extent_state *clear = state;
1479 cur = state->start;
1480 node = rb_prev(&state->rb_node);
1481 if (node) {
1482 state = rb_entry(node,
1483 struct extent_state,
1484 rb_node);
1485 } else {
1486 state = NULL;
1489 clear_state_bit(tree, clear, EXTENT_WRITEBACK,
1490 1, 0);
1491 if (cur == start)
1492 break;
1493 if (cur < start) {
1494 WARN_ON(1);
1495 break;
1497 if (!node)
1498 break;
1500 /* before releasing the lock, make sure the next state
1501 * variable has the expected bits set and corresponds
1502 * to the correct offsets in the file
1504 if (state && (state->end + 1 != start ||
1505 !(state->state & EXTENT_WRITEBACK))) {
1506 state = NULL;
1508 spin_unlock_irqrestore(&tree->lock, flags);
1509 next_io:
1511 if (whole_page)
1512 end_page_writeback(page);
1513 else
1514 check_page_writeback(tree, page);
1515 } while (bvec >= bio->bi_io_vec);
1516 bio_put(bio);
1517 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1518 return 0;
1519 #endif
1523 * after a readpage IO is done, we need to:
1524 * clear the uptodate bits on error
1525 * set the uptodate bits if things worked
1526 * set the page up to date if all extents in the tree are uptodate
1527 * clear the lock bit in the extent tree
1528 * unlock the page if there are no other extents locked for it
1530 * Scheduling is not allowed, so the extent state tree is expected
1531 * to have one and only one object corresponding to this IO.
1533 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1534 static void end_bio_extent_readpage(struct bio *bio, int err)
1535 #else
1536 static int end_bio_extent_readpage(struct bio *bio,
1537 unsigned int bytes_done, int err)
1538 #endif
1540 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1541 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1542 struct extent_state *state = bio->bi_private;
1543 struct extent_io_tree *tree = state->tree;
1544 struct rb_node *node;
1545 u64 start;
1546 u64 end;
1547 u64 cur;
1548 unsigned long flags;
1549 int whole_page;
1550 int ret;
1552 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1553 if (bio->bi_size)
1554 return 1;
1555 #endif
1557 do {
1558 struct page *page = bvec->bv_page;
1559 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1560 bvec->bv_offset;
1561 end = start + bvec->bv_len - 1;
1563 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1564 whole_page = 1;
1565 else
1566 whole_page = 0;
1568 if (--bvec >= bio->bi_io_vec)
1569 prefetchw(&bvec->bv_page->flags);
1571 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1572 ret = tree->ops->readpage_end_io_hook(page, start, end,
1573 state);
1574 if (ret)
1575 uptodate = 0;
1577 if (!uptodate && tree->ops &&
1578 tree->ops->readpage_io_failed_hook) {
1579 ret = tree->ops->readpage_io_failed_hook(bio, page,
1580 start, end, state);
1581 if (ret == 0) {
1582 state = NULL;
1583 uptodate =
1584 test_bit(BIO_UPTODATE, &bio->bi_flags);
1585 continue;
1589 spin_lock_irqsave(&tree->lock, flags);
1590 if (!state || state->end != end) {
1591 state = NULL;
1592 node = __etree_search(tree, start, NULL, NULL);
1593 if (node) {
1594 state = rb_entry(node, struct extent_state,
1595 rb_node);
1596 if (state->end != end ||
1597 !(state->state & EXTENT_LOCKED))
1598 state = NULL;
1600 if (!state) {
1601 spin_unlock_irqrestore(&tree->lock, flags);
1602 if (uptodate)
1603 set_extent_uptodate(tree, start, end,
1604 GFP_ATOMIC);
1605 unlock_extent(tree, start, end, GFP_ATOMIC);
1606 goto next_io;
1610 cur = end;
1611 while(1) {
1612 struct extent_state *clear = state;
1613 cur = state->start;
1614 node = rb_prev(&state->rb_node);
1615 if (node) {
1616 state = rb_entry(node,
1617 struct extent_state,
1618 rb_node);
1619 } else {
1620 state = NULL;
1622 if (uptodate) {
1623 set_state_cb(tree, clear, EXTENT_UPTODATE);
1624 clear->state |= EXTENT_UPTODATE;
1626 clear_state_bit(tree, clear, EXTENT_LOCKED,
1627 1, 0);
1628 if (cur == start)
1629 break;
1630 if (cur < start) {
1631 WARN_ON(1);
1632 break;
1634 if (!node)
1635 break;
1637 /* before releasing the lock, make sure the next state
1638 * variable has the expected bits set and corresponds
1639 * to the correct offsets in the file
1641 if (state && (state->end + 1 != start ||
1642 !(state->state & EXTENT_LOCKED))) {
1643 state = NULL;
1645 spin_unlock_irqrestore(&tree->lock, flags);
1646 next_io:
1647 if (whole_page) {
1648 if (uptodate) {
1649 SetPageUptodate(page);
1650 } else {
1651 ClearPageUptodate(page);
1652 SetPageError(page);
1654 unlock_page(page);
1655 } else {
1656 if (uptodate) {
1657 check_page_uptodate(tree, page);
1658 } else {
1659 ClearPageUptodate(page);
1660 SetPageError(page);
1662 check_page_locked(tree, page);
1664 } while (bvec >= bio->bi_io_vec);
1666 bio_put(bio);
1667 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1668 return 0;
1669 #endif
1673 * IO done from prepare_write is pretty simple, we just unlock
1674 * the structs in the extent tree when done, and set the uptodate bits
1675 * as appropriate.
1677 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1678 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1679 #else
1680 static int end_bio_extent_preparewrite(struct bio *bio,
1681 unsigned int bytes_done, int err)
1682 #endif
1684 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1685 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1686 struct extent_state *state = bio->bi_private;
1687 struct extent_io_tree *tree = state->tree;
1688 u64 start;
1689 u64 end;
1691 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1692 if (bio->bi_size)
1693 return 1;
1694 #endif
1696 do {
1697 struct page *page = bvec->bv_page;
1698 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1699 bvec->bv_offset;
1700 end = start + bvec->bv_len - 1;
1702 if (--bvec >= bio->bi_io_vec)
1703 prefetchw(&bvec->bv_page->flags);
1705 if (uptodate) {
1706 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1707 } else {
1708 ClearPageUptodate(page);
1709 SetPageError(page);
1712 unlock_extent(tree, start, end, GFP_ATOMIC);
1714 } while (bvec >= bio->bi_io_vec);
1716 bio_put(bio);
1717 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1718 return 0;
1719 #endif
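/*
 * allocate a bio for nr_vecs pages.  If the allocation fails while we are
 * in memory reclaim (PF_MEMALLOC), retry with progressively fewer vecs so
 * writeback can still make progress.
 */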
1722 static struct bio *
1723 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1724 gfp_t gfp_flags)
1726 struct bio *bio;
1728 bio = bio_alloc(gfp_flags, nr_vecs);
1730 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1731 while (!bio && (nr_vecs /= 2))
1732 bio = bio_alloc(gfp_flags, nr_vecs);
1735 if (bio) {
1736 bio->bi_size = 0;
1737 bio->bi_bdev = bdev;
1738 bio->bi_sector = first_sector;
1740 return bio;
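/*
 * send a fully built bio down to the block layer (or to the tree's
 * submit_bio_hook).  bi_private is switched from the tree to the extent
 * state covering the last page, so the end_io handlers can find their
 * state without searching.
 */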
1743 static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
1745 int ret = 0;
1746 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1747 struct page *page = bvec->bv_page;
1748 struct extent_io_tree *tree = bio->bi_private;
1749 struct rb_node *node;
1750 struct extent_state *state;
1751 u64 start;
1752 u64 end;
1754 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1755 end = start + bvec->bv_len - 1;
1757 spin_lock_irq(&tree->lock);
1758 node = __etree_search(tree, start, NULL, NULL);
1759 BUG_ON(!node);
1760 state = rb_entry(node, struct extent_state, rb_node);
1761 while(state->end < end) {
1762 node = rb_next(node);
1763 state = rb_entry(node, struct extent_state, rb_node);
1765 BUG_ON(state->end != end);
1766 spin_unlock_irq(&tree->lock);
1768 bio->bi_private = state;
1770 bio_get(bio);
1772 if (tree->ops && tree->ops->submit_bio_hook)
1773 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1774 mirror_num);
1775 else
1776 submit_bio(rw, bio);
1777 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1778 ret = -EOPNOTSUPP;
1779 bio_put(bio);
1780 return ret;
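/*
 * add a page to the bio being built for this tree, submitting the old bio
 * first when the new page is not contiguous, when the merge hook refuses
 * it, or when the bio is full.  The bio in progress is stored in *bio_ret
 * so more pages can be added by later calls.
 */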
1783 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1784 struct page *page, sector_t sector,
1785 size_t size, unsigned long offset,
1786 struct block_device *bdev,
1787 struct bio **bio_ret,
1788 unsigned long max_pages,
1789 bio_end_io_t end_io_func,
1790 int mirror_num)
1792 int ret = 0;
1793 struct bio *bio;
1794 int nr;
1796 if (bio_ret && *bio_ret) {
1797 bio = *bio_ret;
1798 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1799 (tree->ops && tree->ops->merge_bio_hook &&
1800 tree->ops->merge_bio_hook(page, offset, size, bio)) ||
1801 bio_add_page(bio, page, size, offset) < size) {
1802 ret = submit_one_bio(rw, bio, mirror_num);
1803 bio = NULL;
1804 } else {
1805 return 0;
1808 nr = bio_get_nr_vecs(bdev);
1809 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1810 if (!bio) {
1811 printk("failed to allocate bio nr %d\n", nr);
1815 bio_add_page(bio, page, size, offset);
1816 bio->bi_end_io = end_io_func;
1817 bio->bi_private = tree;
1819 if (bio_ret) {
1820 *bio_ret = bio;
1821 } else {
1822 ret = submit_one_bio(rw, bio, mirror_num);
1825 return ret;
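/*
 * mark a page as belonging to the extent io code by setting page->private,
 * taking an extra page reference to go with it.
 */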
1828 void set_page_extent_mapped(struct page *page)
1830 if (!PagePrivate(page)) {
1831 SetPagePrivate(page);
1832 page_cache_get(page);
1833 set_page_private(page, EXTENT_PAGE_PRIVATE);
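/*
 * record the extent_buffer length in page->private of its first page,
 * tagging it as the head page.
 */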
1837 void set_page_extent_head(struct page *page, unsigned long len)
1839 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1843 * basic readpage implementation. Locked extent state structs are inserted
1844 * into the tree that are removed when the IO is done (by the end_io
1845 * handlers)
1847 static int __extent_read_full_page(struct extent_io_tree *tree,
1848 struct page *page,
1849 get_extent_t *get_extent,
1850 struct bio **bio, int mirror_num)
1852 struct inode *inode = page->mapping->host;
1853 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1854 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1855 u64 end;
1856 u64 cur = start;
1857 u64 extent_offset;
1858 u64 last_byte = i_size_read(inode);
1859 u64 block_start;
1860 u64 cur_end;
1861 sector_t sector;
1862 struct extent_map *em;
1863 struct block_device *bdev;
1864 int ret;
1865 int nr = 0;
1866 size_t page_offset = 0;
1867 size_t iosize;
1868 size_t blocksize = inode->i_sb->s_blocksize;
1870 set_page_extent_mapped(page);
1872 end = page_end;
1873 lock_extent(tree, start, end, GFP_NOFS);
1875 while (cur <= end) {
1876 if (cur >= last_byte) {
1877 char *userpage;
1878 iosize = PAGE_CACHE_SIZE - page_offset;
1879 userpage = kmap_atomic(page, KM_USER0);
1880 memset(userpage + page_offset, 0, iosize);
1881 flush_dcache_page(page);
1882 kunmap_atomic(userpage, KM_USER0);
1883 set_extent_uptodate(tree, cur, cur + iosize - 1,
1884 GFP_NOFS);
1885 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1886 break;
1888 em = get_extent(inode, page, page_offset, cur,
1889 end - cur + 1, 0);
1890 if (IS_ERR(em) || !em) {
1891 SetPageError(page);
1892 unlock_extent(tree, cur, end, GFP_NOFS);
1893 break;
1895 extent_offset = cur - em->start;
1896 if (extent_map_end(em) <= cur) {
1897 printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
1899 BUG_ON(extent_map_end(em) <= cur);
1900 if (end < cur) {
1901 printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1903 BUG_ON(end < cur);
1905 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1906 cur_end = min(extent_map_end(em) - 1, end);
1907 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1908 sector = (em->block_start + extent_offset) >> 9;
1909 bdev = em->bdev;
1910 block_start = em->block_start;
1911 free_extent_map(em);
1912 em = NULL;
1914 /* we've found a hole, just zero and go on */
1915 if (block_start == EXTENT_MAP_HOLE) {
1916 char *userpage;
1917 userpage = kmap_atomic(page, KM_USER0);
1918 memset(userpage + page_offset, 0, iosize);
1919 flush_dcache_page(page);
1920 kunmap_atomic(userpage, KM_USER0);
1922 set_extent_uptodate(tree, cur, cur + iosize - 1,
1923 GFP_NOFS);
1924 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1925 cur = cur + iosize;
1926 page_offset += iosize;
1927 continue;
1929 /* the get_extent function already copied into the page */
1930 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1931 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1932 cur = cur + iosize;
1933 page_offset += iosize;
1934 continue;
1936 /* we have an inline extent but it didn't get marked up
1937 * to date. Error out
1939 if (block_start == EXTENT_MAP_INLINE) {
1940 SetPageError(page);
1941 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1942 cur = cur + iosize;
1943 page_offset += iosize;
1944 continue;
1947 ret = 0;
1948 if (tree->ops && tree->ops->readpage_io_hook) {
1949 ret = tree->ops->readpage_io_hook(page, cur,
1950 cur + iosize - 1);
1952 if (!ret) {
1953 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1954 pnr -= page->index;
1955 ret = submit_extent_page(READ, tree, page,
1956 sector, iosize, page_offset,
1957 bdev, bio, pnr,
1958 end_bio_extent_readpage, mirror_num);
1959 nr++;
1961 if (ret)
1962 SetPageError(page);
1963 cur = cur + iosize;
1964 page_offset += iosize;
1966 if (!nr) {
1967 if (!PageError(page))
1968 SetPageUptodate(page);
1969 unlock_page(page);
1971 return 0;
1974 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1975 get_extent_t *get_extent)
1977 struct bio *bio = NULL;
1978 int ret;
1980 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
1981 if (bio)
1982 submit_one_bio(READ, bio, 0);
1983 return ret;
1985 EXPORT_SYMBOL(extent_read_full_page);
1988 * the writepage semantics are similar to regular writepage. extent
1989 * records are inserted to lock ranges in the tree, and as dirty areas
1990 * are found, they are marked writeback. Then the lock bits are removed
1991 * and the end_io handler clears the writeback ranges
1993 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1994 void *data)
1996 struct inode *inode = page->mapping->host;
1997 struct extent_page_data *epd = data;
1998 struct extent_io_tree *tree = epd->tree;
1999 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2000 u64 delalloc_start;
2001 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2002 u64 end;
2003 u64 cur = start;
2004 u64 extent_offset;
2005 u64 last_byte = i_size_read(inode);
2006 u64 block_start;
2007 u64 iosize;
2008 u64 unlock_start;
2009 sector_t sector;
2010 struct extent_map *em;
2011 struct block_device *bdev;
2012 int ret;
2013 int nr = 0;
2014 size_t pg_offset = 0;
2015 size_t blocksize;
2016 loff_t i_size = i_size_read(inode);
2017 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2018 u64 nr_delalloc;
2019 u64 delalloc_end;
2021 WARN_ON(!PageLocked(page));
2022 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2023 if (page->index > end_index ||
2024 (page->index == end_index && !pg_offset)) {
2025 page->mapping->a_ops->invalidatepage(page, 0);
2026 unlock_page(page);
2027 return 0;
2030 if (page->index == end_index) {
2031 char *userpage;
2033 userpage = kmap_atomic(page, KM_USER0);
2034 memset(userpage + pg_offset, 0,
2035 PAGE_CACHE_SIZE - pg_offset);
2036 kunmap_atomic(userpage, KM_USER0);
2037 flush_dcache_page(page);
2039 pg_offset = 0;
2041 set_page_extent_mapped(page);
2043 delalloc_start = start;
2044 delalloc_end = 0;
2045 while(delalloc_end < page_end) {
2046 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
2047 &delalloc_end,
2048 128 * 1024 * 1024);
2049 if (nr_delalloc == 0) {
2050 delalloc_start = delalloc_end + 1;
2051 continue;
2053 tree->ops->fill_delalloc(inode, delalloc_start,
2054 delalloc_end);
2055 clear_extent_bit(tree, delalloc_start,
2056 delalloc_end,
2057 EXTENT_LOCKED | EXTENT_DELALLOC,
2058 1, 0, GFP_NOFS);
2059 delalloc_start = delalloc_end + 1;
2061 lock_extent(tree, start, page_end, GFP_NOFS);
2062 unlock_start = start;
2064 if (tree->ops && tree->ops->writepage_start_hook) {
2065 ret = tree->ops->writepage_start_hook(page, start, page_end);
2066 if (ret == -EAGAIN) {
2067 unlock_extent(tree, start, page_end, GFP_NOFS);
2068 redirty_page_for_writepage(wbc, page);
2069 unlock_page(page);
2070 return 0;
2074 end = page_end;
2075 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
2076 printk("found delalloc bits after lock_extent\n");
2079 if (last_byte <= start) {
2080 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2081 unlock_extent(tree, start, page_end, GFP_NOFS);
2082 if (tree->ops && tree->ops->writepage_end_io_hook)
2083 tree->ops->writepage_end_io_hook(page, start,
2084 page_end, NULL, 1);
2085 unlock_start = page_end + 1;
2086 goto done;
2089 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2090 blocksize = inode->i_sb->s_blocksize;
2092 while (cur <= end) {
2093 if (cur >= last_byte) {
2094 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2095 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2096 if (tree->ops && tree->ops->writepage_end_io_hook)
2097 tree->ops->writepage_end_io_hook(page, cur,
2098 page_end, NULL, 1);
2099 unlock_start = page_end + 1;
2100 break;
2102 em = epd->get_extent(inode, page, pg_offset, cur,
2103 end - cur + 1, 1);
2104 if (IS_ERR(em) || !em) {
2105 SetPageError(page);
2106 break;
2109 extent_offset = cur - em->start;
2110 BUG_ON(extent_map_end(em) <= cur);
2111 BUG_ON(end < cur);
2112 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2113 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2114 sector = (em->block_start + extent_offset) >> 9;
2115 bdev = em->bdev;
2116 block_start = em->block_start;
2117 free_extent_map(em);
2118 em = NULL;
2120 if (block_start == EXTENT_MAP_HOLE ||
2121 block_start == EXTENT_MAP_INLINE) {
2122 clear_extent_dirty(tree, cur,
2123 cur + iosize - 1, GFP_NOFS);
2125 unlock_extent(tree, unlock_start, cur + iosize -1,
2126 GFP_NOFS);
2128 if (tree->ops && tree->ops->writepage_end_io_hook)
2129 tree->ops->writepage_end_io_hook(page, cur,
2130 cur + iosize - 1,
2131 NULL, 1);
2132 cur = cur + iosize;
2133 pg_offset += iosize;
2134 unlock_start = cur;
2135 continue;
2138 /* leave this out until we have a page_mkwrite call */
2139 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2140 EXTENT_DIRTY, 0)) {
2141 cur = cur + iosize;
2142 pg_offset += iosize;
2143 continue;
2145 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2146 if (tree->ops && tree->ops->writepage_io_hook) {
2147 ret = tree->ops->writepage_io_hook(page, cur,
2148 cur + iosize - 1);
2149 } else {
2150 ret = 0;
2152 if (ret) {
2153 SetPageError(page);
2154 } else {
2155 unsigned long max_nr = end_index + 1;
2157 set_range_writeback(tree, cur, cur + iosize - 1);
2158 if (!PageWriteback(page)) {
2159 printk("warning page %lu not writeback, "
2160 "cur %llu end %llu\n", page->index,
2161 (unsigned long long)cur,
2162 (unsigned long long)end);
2165 ret = submit_extent_page(WRITE, tree, page, sector,
2166 iosize, pg_offset, bdev,
2167 &epd->bio, max_nr,
2168 end_bio_extent_writepage, 0);
2169 if (ret)
2170 SetPageError(page);
2172 cur = cur + iosize;
2173 pg_offset += iosize;
2174 nr++;
2176 done:
2177 if (nr == 0) {
2178 /* make sure the mapping tag for page dirty gets cleared */
2179 set_page_writeback(page);
2180 end_page_writeback(page);
2182 if (unlock_start <= page_end)
2183 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2184 unlock_page(page);
2185 return 0;
2188 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
2189 /* Taken directly from 2.6.23 for 2.6.18 back port */
2190 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2191 void *data);
2194 * write_cache_pages - walk the list of dirty pages of the given address space
2195 * and write all of them.
2196 * @mapping: address space structure to write
2197 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2198 * @writepage: function called for each page
2199 * @data: data passed to writepage function
2201 * If a page is already under I/O, write_cache_pages() skips it, even
2202 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2203 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2204 * and msync() need to guarantee that all the data which was dirty at the time
2205 * the call was made get new I/O started against them. If wbc->sync_mode is
2206 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2207 * existing IO to complete.
2209 static int write_cache_pages(struct address_space *mapping,
2210 struct writeback_control *wbc, writepage_t writepage,
2211 void *data)
2213 struct backing_dev_info *bdi = mapping->backing_dev_info;
2214 int ret = 0;
2215 int done = 0;
2216 struct pagevec pvec;
2217 int nr_pages;
2218 pgoff_t index;
2219 pgoff_t end; /* Inclusive */
2220 int scanned = 0;
2221 int range_whole = 0;
2223 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2224 wbc->encountered_congestion = 1;
2225 return 0;
2228 pagevec_init(&pvec, 0);
2229 if (wbc->range_cyclic) {
2230 index = mapping->writeback_index; /* Start from prev offset */
2231 end = -1;
2232 } else {
2233 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2234 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2235 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2236 range_whole = 1;
2237 scanned = 1;
2239 retry:
2240 while (!done && (index <= end) &&
2241 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2242 PAGECACHE_TAG_DIRTY,
2243 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2244 unsigned i;
2246 scanned = 1;
2247 for (i = 0; i < nr_pages; i++) {
2248 struct page *page = pvec.pages[i];
2251 * At this point we hold neither mapping->tree_lock nor
2252 * lock on the page itself: the page may be truncated or
2253 * invalidated (changing page->mapping to NULL), or even
2254 * swizzled back from swapper_space to tmpfs file
2255 * mapping
2257 lock_page(page);
2259 if (unlikely(page->mapping != mapping)) {
2260 unlock_page(page);
2261 continue;
2264 if (!wbc->range_cyclic && page->index > end) {
2265 done = 1;
2266 unlock_page(page);
2267 continue;
2270 if (wbc->sync_mode != WB_SYNC_NONE)
2271 wait_on_page_writeback(page);
2273 if (PageWriteback(page) ||
2274 !clear_page_dirty_for_io(page)) {
2275 unlock_page(page);
2276 continue;
2279 ret = (*writepage)(page, wbc, data);
2281 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2282 unlock_page(page);
2283 ret = 0;
2285 if (ret || (--(wbc->nr_to_write) <= 0))
2286 done = 1;
2287 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2288 wbc->encountered_congestion = 1;
2289 done = 1;
2292 pagevec_release(&pvec);
2293 cond_resched();
2295 if (!scanned && !done) {
2297 * We hit the last page and there is more work to be done: wrap
2298 * back to the start of the file
2300 scanned = 1;
2301 index = 0;
2302 goto retry;
2304 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2305 mapping->writeback_index = index;
2306 return ret;
2308 #endif
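/*
 * write one page through __extent_writepage and then opportunistically
 * push up to 64 of the pages that follow it in the file through the same
 * struct extent_page_data, so the pending bio can grow into a larger
 * sequential write before it is submitted.
 *
 * A caller would typically wire this into its address_space_operations
 * ->writepage hook; a rough sketch (illustrative only, the real hook
 * lives in the btrfs inode code):
 *
 *	static int btrfs_writepage(struct page *page,
 *				   struct writeback_control *wbc)
 *	{
 *		struct extent_io_tree *tree;
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_write_full_page(tree, page,
 *					      btrfs_get_extent, wbc);
 *	}
 */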
2310 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2311 get_extent_t *get_extent,
2312 struct writeback_control *wbc)
2314 int ret;
2315 struct address_space *mapping = page->mapping;
2316 struct extent_page_data epd = {
2317 .bio = NULL,
2318 .tree = tree,
2319 .get_extent = get_extent,
2321 struct writeback_control wbc_writepages = {
2322 .bdi = wbc->bdi,
2323 .sync_mode = WB_SYNC_NONE,
2324 .older_than_this = NULL,
2325 .nr_to_write = 64,
2326 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2327 .range_end = (loff_t)-1,
2331 ret = __extent_writepage(page, wbc, &epd);
2333 write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
2334 if (epd.bio) {
2335 submit_one_bio(WRITE, epd.bio, 0);
2337 return ret;
2339 EXPORT_SYMBOL(extent_write_full_page);
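/*
 * the ->writepages path: walk every dirty page in the mapping via
 * write_cache_pages(), feeding each one to __extent_writepage with a
 * shared extent_page_data so contiguous pages are merged into a single
 * bio that is submitted once the walk finishes.
 */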
2342 int extent_writepages(struct extent_io_tree *tree,
2343 struct address_space *mapping,
2344 get_extent_t *get_extent,
2345 struct writeback_control *wbc)
2347 int ret = 0;
2348 struct extent_page_data epd = {
2349 .bio = NULL,
2350 .tree = tree,
2351 .get_extent = get_extent,
2354 ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2355 if (epd.bio) {
2356 submit_one_bio(WRITE, epd.bio, 0);
2358 return ret;
2360 EXPORT_SYMBOL(extent_writepages);
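/*
 * the ->readpages path: add each page on the readahead list to the page
 * cache (open coding add_to_page_cache_lru, which isn't exported here),
 * then read it through __extent_read_full_page so adjacent pages share
 * one bio, submitted at the end.
 */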
2362 int extent_readpages(struct extent_io_tree *tree,
2363 struct address_space *mapping,
2364 struct list_head *pages, unsigned nr_pages,
2365 get_extent_t get_extent)
2367 struct bio *bio = NULL;
2368 unsigned page_idx;
2369 struct pagevec pvec;
2371 pagevec_init(&pvec, 0);
2372 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2373 struct page *page = list_entry(pages->prev, struct page, lru);
2375 prefetchw(&page->flags);
2376 list_del(&page->lru);
2378 * what we want to do here is call add_to_page_cache_lru,
2379 * but that isn't exported, so we reproduce it here
2381 if (!add_to_page_cache(page, mapping,
2382 page->index, GFP_KERNEL)) {
2384 /* open coding of lru_cache_add, also not exported */
2385 page_cache_get(page);
2386 if (!pagevec_add(&pvec, page))
2387 __pagevec_lru_add(&pvec);
2388 __extent_read_full_page(tree, page, get_extent,
2389 &bio, 0);
2391 page_cache_release(page);
2393 if (pagevec_count(&pvec))
2394 __pagevec_lru_add(&pvec);
2395 BUG_ON(!list_empty(pages));
2396 if (bio)
2397 submit_one_bio(READ, bio, 0);
2398 return 0;
2400 EXPORT_SYMBOL(extent_readpages);
2403 * basic invalidatepage code, this waits on any locked or writeback
2404 * ranges corresponding to the page, and then deletes any extent state
2405 * records from the tree
2407 int extent_invalidatepage(struct extent_io_tree *tree,
2408 struct page *page, unsigned long offset)
2410 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2411 u64 end = start + PAGE_CACHE_SIZE - 1;
2412 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2414 start += (offset + blocksize - 1) & ~(blocksize - 1);
2415 if (start > end)
2416 return 0;
2418 lock_extent(tree, start, end, GFP_NOFS);
2419 wait_on_extent_writeback(tree, start, end);
2420 clear_extent_bit(tree, start, end,
2421 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2422 1, 1, GFP_NOFS);
2423 return 0;
2425 EXPORT_SYMBOL(extent_invalidatepage);
2428 * simple commit_write call, set_range_dirty is used to mark both
2429 * the pages and the extent records as dirty
2431 int extent_commit_write(struct extent_io_tree *tree,
2432 struct inode *inode, struct page *page,
2433 unsigned from, unsigned to)
2435 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2437 set_page_extent_mapped(page);
2438 set_page_dirty(page);
2440 if (pos > inode->i_size) {
2441 i_size_write(inode, pos);
2442 mark_inode_dirty(inode);
2444 return 0;
2446 EXPORT_SYMBOL(extent_commit_write);
2448 int extent_prepare_write(struct extent_io_tree *tree,
2449 struct inode *inode, struct page *page,
2450 unsigned from, unsigned to, get_extent_t *get_extent)
2452 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2453 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2454 u64 block_start;
2455 u64 orig_block_start;
2456 u64 block_end;
2457 u64 cur_end;
2458 struct extent_map *em;
2459 unsigned blocksize = 1 << inode->i_blkbits;
2460 size_t page_offset = 0;
2461 size_t block_off_start;
2462 size_t block_off_end;
2463 int err = 0;
2464 int iocount = 0;
2465 int ret = 0;
2466 int isnew;
2468 set_page_extent_mapped(page);
2470 block_start = (page_start + from) & ~((u64)blocksize - 1);
2471 block_end = (page_start + to - 1) | (blocksize - 1);
2472 orig_block_start = block_start;
2474 lock_extent(tree, page_start, page_end, GFP_NOFS);
2475 while(block_start <= block_end) {
2476 em = get_extent(inode, page, page_offset, block_start,
2477 block_end - block_start + 1, 1);
2478 if (IS_ERR(em) || !em) {
2479 goto err;
2481 cur_end = min(block_end, extent_map_end(em) - 1);
2482 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2483 block_off_end = block_off_start + blocksize;
2484 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2486 if (!PageUptodate(page) && isnew &&
2487 (block_off_end > to || block_off_start < from)) {
2488 void *kaddr;
2490 kaddr = kmap_atomic(page, KM_USER0);
2491 if (block_off_end > to)
2492 memset(kaddr + to, 0, block_off_end - to);
2493 if (block_off_start < from)
2494 memset(kaddr + block_off_start, 0,
2495 from - block_off_start);
2496 flush_dcache_page(page);
2497 kunmap_atomic(kaddr, KM_USER0);
2499 if ((em->block_start != EXTENT_MAP_HOLE &&
2500 em->block_start != EXTENT_MAP_INLINE) &&
2501 !isnew && !PageUptodate(page) &&
2502 (block_off_end > to || block_off_start < from) &&
2503 !test_range_bit(tree, block_start, cur_end,
2504 EXTENT_UPTODATE, 1)) {
2505 u64 sector;
2506 u64 extent_offset = block_start - em->start;
2507 size_t iosize;
2508 sector = (em->block_start + extent_offset) >> 9;
2509 iosize = (cur_end - block_start + blocksize) &
2510 ~((u64)blocksize - 1);
2512 * we've already got the extent locked, but we
2513 * need to split the state such that our end_bio
2514 * handler can clear the lock.
2516 set_extent_bit(tree, block_start,
2517 block_start + iosize - 1,
2518 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2519 ret = submit_extent_page(READ, tree, page,
2520 sector, iosize, page_offset, em->bdev,
2521 NULL, 1,
2522 end_bio_extent_preparewrite, 0);
2523 iocount++;
2524 block_start = block_start + iosize;
2525 } else {
2526 set_extent_uptodate(tree, block_start, cur_end,
2527 GFP_NOFS);
2528 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2529 block_start = cur_end + 1;
2531 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2532 free_extent_map(em);
2534 if (iocount) {
2535 wait_extent_bit(tree, orig_block_start,
2536 block_end, EXTENT_LOCKED);
2538 check_page_uptodate(tree, page);
2539 err:
2540 /* FIXME, zero out newly allocated blocks on error */
2541 return err;
2543 EXPORT_SYMBOL(extent_prepare_write);
2546 * a helper for releasepage, this tests for areas of the page that
2547 * are locked or under IO and drops the related state bits if it is safe
2548 * to drop the page.
2550 int try_release_extent_state(struct extent_map_tree *map,
2551 struct extent_io_tree *tree, struct page *page,
2552 gfp_t mask)
2554 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2555 u64 end = start + PAGE_CACHE_SIZE - 1;
2556 int ret = 1;
2558 if (test_range_bit(tree, start, end,
2559 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2560 ret = 0;
2561 else {
2562 if ((mask & GFP_NOFS) == GFP_NOFS)
2563 mask = GFP_NOFS;
2564 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2565 1, 1, mask);
2567 return ret;
2569 EXPORT_SYMBOL(try_release_extent_state);
2572 * a helper for releasepage. As long as there are no locked extents
2573 * in the range corresponding to the page, both state records and extent
2574 * map records are removed
2576 int try_release_extent_mapping(struct extent_map_tree *map,
2577 struct extent_io_tree *tree, struct page *page,
2578 gfp_t mask)
2580 struct extent_map *em;
2581 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2582 u64 end = start + PAGE_CACHE_SIZE - 1;
2584 if ((mask & __GFP_WAIT) &&
2585 page->mapping->host->i_size > 16 * 1024 * 1024) {
2586 u64 len;
2587 while (start <= end) {
2588 len = end - start + 1;
2589 spin_lock(&map->lock);
2590 em = lookup_extent_mapping(map, start, len);
2591 if (!em || IS_ERR(em)) {
2592 spin_unlock(&map->lock);
2593 break;
2595 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2596 em->start != start) {
2597 spin_unlock(&map->lock);
2598 free_extent_map(em);
2599 break;
2601 if (!test_range_bit(tree, em->start,
2602 extent_map_end(em) - 1,
2603 EXTENT_LOCKED, 0)) {
2604 remove_extent_mapping(map, em);
2605 /* once for the rb tree */
2606 free_extent_map(em);
2608 start = extent_map_end(em);
2609 spin_unlock(&map->lock);
2611 /* once for us */
2612 free_extent_map(em);
2615 return try_release_extent_state(map, tree, page, mask);
2617 EXPORT_SYMBOL(try_release_extent_mapping);
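/*
 * ->bmap support: translate a logical file block into a disk sector by
 * asking get_extent for the mapping.  Holes and inline extents have no
 * single on-disk sector, so they report 0.  A rough sketch of how a
 * filesystem might hook it up (illustrative only):
 *
 *	static sector_t btrfs_bmap(struct address_space *mapping,
 *				   sector_t block)
 *	{
 *		return extent_bmap(mapping, block, btrfs_get_extent);
 *	}
 */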
2619 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2620 get_extent_t *get_extent)
2622 struct inode *inode = mapping->host;
2623 u64 start = iblock << inode->i_blkbits;
2624 sector_t sector = 0;
2625 struct extent_map *em;
2627 em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2628 if (!em || IS_ERR(em))
2629 return 0;
2631 if (em->block_start == EXTENT_MAP_INLINE ||
2632 em->block_start == EXTENT_MAP_HOLE)
2633 goto out;
2635 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2636 out:
2637 free_extent_map(em);
2638 return sector;
2641 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2642 unsigned long i)
2644 struct page *p;
2645 struct address_space *mapping;
2647 if (i == 0)
2648 return eb->first_page;
2649 i += eb->start >> PAGE_CACHE_SHIFT;
2650 mapping = eb->first_page->mapping;
2651 if (!mapping)
2652 return NULL;
2655 * extent_buffer_page is only called after pinning the page
2656 * by increasing the reference count. So we know the page must
2657 * be in the radix tree.
2659 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2660 rcu_read_lock();
2661 #else
2662 read_lock_irq(&mapping->tree_lock);
2663 #endif
2664 p = radix_tree_lookup(&mapping->page_tree, i);
2666 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2667 rcu_read_unlock();
2668 #else
2669 read_unlock_irq(&mapping->tree_lock);
2670 #endif
2671 return p;
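/*
 * number of pages an extent buffer of length 'len' starting at 'start'
 * touches.  For example, with 4k pages a 4k buffer at offset 4096 spans
 * one page, while the same buffer at offset 6144 straddles two.
 */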
2674 static inline unsigned long num_extent_pages(u64 start, u64 len)
2676 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2677 (start >> PAGE_CACHE_SHIFT);
2680 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2681 u64 start,
2682 unsigned long len,
2683 gfp_t mask)
2685 struct extent_buffer *eb = NULL;
2686 unsigned long flags;
2688 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
if (!eb)
	return NULL;
2689 eb->start = start;
2690 eb->len = len;
2691 mutex_init(&eb->mutex);
2692 spin_lock_irqsave(&leak_lock, flags);
2693 list_add(&eb->leak_list, &buffers);
2694 spin_unlock_irqrestore(&leak_lock, flags);
2695 atomic_set(&eb->refs, 1);
2697 return eb;
2700 static void __free_extent_buffer(struct extent_buffer *eb)
2702 unsigned long flags;
2703 spin_lock_irqsave(&leak_lock, flags);
2704 list_del(&eb->leak_list);
2705 spin_unlock_irqrestore(&leak_lock, flags);
2706 kmem_cache_free(extent_buffer_cache, eb);
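/*
 * find or create the extent buffer covering [start, start + len).  The
 * fast path looks the buffer up in tree->buffer under buffer_lock and
 * just takes a reference.  Otherwise a new buffer is allocated, its
 * pages are pinned and flagged, and the buffer is inserted into the rb
 * tree; if another thread raced us and inserted one first, ours is torn
 * down and the existing buffer is returned with an extra reference.
 */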
2709 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2710 u64 start, unsigned long len,
2711 struct page *page0,
2712 gfp_t mask)
2714 unsigned long num_pages = num_extent_pages(start, len);
2715 unsigned long i;
2716 unsigned long index = start >> PAGE_CACHE_SHIFT;
2717 struct extent_buffer *eb;
2718 struct extent_buffer *exists = NULL;
2719 struct page *p;
2720 struct address_space *mapping = tree->mapping;
2721 int uptodate = 1;
2723 spin_lock(&tree->buffer_lock);
2724 eb = buffer_search(tree, start);
2725 if (eb) {
2726 atomic_inc(&eb->refs);
2727 spin_unlock(&tree->buffer_lock);
2728 return eb;
2730 spin_unlock(&tree->buffer_lock);
2732 eb = __alloc_extent_buffer(tree, start, len, mask);
2733 if (!eb)
2734 return NULL;
2736 if (page0) {
2737 eb->first_page = page0;
2738 i = 1;
2739 index++;
2740 page_cache_get(page0);
2741 mark_page_accessed(page0);
2742 set_page_extent_mapped(page0);
2743 set_page_extent_head(page0, len);
2744 uptodate = PageUptodate(page0);
2745 } else {
2746 i = 0;
2748 for (; i < num_pages; i++, index++) {
2749 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2750 if (!p) {
2751 WARN_ON(1);
2752 goto free_eb;
2754 set_page_extent_mapped(p);
2755 mark_page_accessed(p);
2756 if (i == 0) {
2757 eb->first_page = p;
2758 set_page_extent_head(p, len);
2759 } else {
2760 set_page_private(p, EXTENT_PAGE_PRIVATE);
2762 if (!PageUptodate(p))
2763 uptodate = 0;
2764 unlock_page(p);
2766 if (uptodate)
2767 eb->flags |= EXTENT_UPTODATE;
2768 eb->flags |= EXTENT_BUFFER_FILLED;
2770 spin_lock(&tree->buffer_lock);
2771 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2772 if (exists) {
2773 /* add one reference for the caller */
2774 atomic_inc(&exists->refs);
2775 spin_unlock(&tree->buffer_lock);
2776 goto free_eb;
2778 spin_unlock(&tree->buffer_lock);
2780 /* add one reference for the tree */
2781 atomic_inc(&eb->refs);
2782 return eb;
2784 free_eb:
2785 if (!atomic_dec_and_test(&eb->refs))
2786 return exists;
2787 for (index = 1; index < i; index++)
2788 page_cache_release(extent_buffer_page(eb, index));
2789 page_cache_release(extent_buffer_page(eb, 0));
2790 __free_extent_buffer(eb);
2791 return exists;
2793 EXPORT_SYMBOL(alloc_extent_buffer);
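/*
 * lookup-only variant of alloc_extent_buffer: return the buffer for
 * 'start' with an extra reference if it is already in the tree, NULL
 * otherwise.  Never allocates.
 */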
2795 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2796 u64 start, unsigned long len,
2797 gfp_t mask)
2799 struct extent_buffer *eb;
2801 spin_lock(&tree->buffer_lock);
2802 eb = buffer_search(tree, start);
2803 if (eb)
2804 atomic_inc(&eb->refs);
2805 spin_unlock(&tree->buffer_lock);
2807 return eb;
2809 EXPORT_SYMBOL(find_extent_buffer);
2811 void free_extent_buffer(struct extent_buffer *eb)
2813 if (!eb)
2814 return;
2816 if (!atomic_dec_and_test(&eb->refs))
2817 return;
2819 WARN_ON(1);
2821 EXPORT_SYMBOL(free_extent_buffer);
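/*
 * clear the dirty state for an extent buffer: drop EXTENT_DIRTY from the
 * io tree, then clear each page's dirty bit and its PAGECACHE_TAG_DIRTY
 * radix tree tag, taking care not to clean pages that are only partially
 * covered by this buffer.
 */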
2823 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2824 struct extent_buffer *eb)
2826 int set;
2827 unsigned long i;
2828 unsigned long num_pages;
2829 struct page *page;
2831 u64 start = eb->start;
2832 u64 end = start + eb->len - 1;
2834 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2835 num_pages = num_extent_pages(eb->start, eb->len);
2837 for (i = 0; i < num_pages; i++) {
2838 page = extent_buffer_page(eb, i);
2839 lock_page(page);
2840 if (i == 0)
2841 set_page_extent_head(page, eb->len);
2842 else
2843 set_page_private(page, EXTENT_PAGE_PRIVATE);
2846 * if we're on the last page or the first page and the
2847 * block isn't aligned on a page boundary, do extra checks
2848 * to make sure we don't clean page that is partially dirty
2850 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2851 ((i == num_pages - 1) &&
2852 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2853 start = (u64)page->index << PAGE_CACHE_SHIFT;
2854 end = start + PAGE_CACHE_SIZE - 1;
2855 if (test_range_bit(tree, start, end,
2856 EXTENT_DIRTY, 0)) {
2857 unlock_page(page);
2858 continue;
2861 clear_page_dirty_for_io(page);
2862 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2863 spin_lock_irq(&page->mapping->tree_lock);
2864 #else
2865 read_lock_irq(&page->mapping->tree_lock);
2866 #endif
2867 if (!PageDirty(page)) {
2868 radix_tree_tag_clear(&page->mapping->page_tree,
2869 page_index(page),
2870 PAGECACHE_TAG_DIRTY);
2872 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2873 spin_unlock_irq(&page->mapping->tree_lock);
2874 #else
2875 read_unlock_irq(&page->mapping->tree_lock);
2876 #endif
2877 unlock_page(page);
2879 return 0;
2881 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2883 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2884 struct extent_buffer *eb)
2886 return wait_on_extent_writeback(tree, eb->start,
2887 eb->start + eb->len - 1);
2889 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2891 int set_extent_buffer_dirty(struct extent_io_tree *tree,
2892 struct extent_buffer *eb)
2894 unsigned long i;
2895 unsigned long num_pages;
2897 num_pages = num_extent_pages(eb->start, eb->len);
2898 for (i = 0; i < num_pages; i++) {
2899 struct page *page = extent_buffer_page(eb, i);
2900 /* writepage may need to do something special for the
2901 * first page, we have to make sure page->private is
2902 * properly set. releasepage may drop page->private
2903 * on us if the page isn't already dirty.
2905 if (i == 0) {
2906 lock_page(page);
2907 set_page_extent_head(page, eb->len);
2908 } else if (PagePrivate(page) &&
2909 page->private != EXTENT_PAGE_PRIVATE) {
2910 lock_page(page);
2911 set_page_extent_mapped(page);
2912 unlock_page(page);
2914 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2915 if (i == 0)
2916 unlock_page(page);
2918 return set_extent_dirty(tree, eb->start,
2919 eb->start + eb->len - 1, GFP_NOFS);
2921 EXPORT_SYMBOL(set_extent_buffer_dirty);
2923 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
2924 struct extent_buffer *eb)
2926 unsigned long i;
2927 struct page *page;
2928 unsigned long num_pages;
2930 num_pages = num_extent_pages(eb->start, eb->len);
2931 eb->flags &= ~EXTENT_UPTODATE;
2933 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2934 GFP_NOFS);
2935 for (i = 0; i < num_pages; i++) {
2936 page = extent_buffer_page(eb, i);
2937 if (page)
2938 ClearPageUptodate(page);
2940 return 0;
2943 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2944 struct extent_buffer *eb)
2946 unsigned long i;
2947 struct page *page;
2948 unsigned long num_pages;
2950 num_pages = num_extent_pages(eb->start, eb->len);
2952 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2953 GFP_NOFS);
2954 for (i = 0; i < num_pages; i++) {
2955 page = extent_buffer_page(eb, i);
2956 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2957 ((i == num_pages - 1) &&
2958 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2959 check_page_uptodate(tree, page);
2960 continue;
2962 SetPageUptodate(page);
2964 return 0;
2966 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2968 int extent_range_uptodate(struct extent_io_tree *tree,
2969 u64 start, u64 end)
2971 struct page *page;
2972 int ret;
2973 int pg_uptodate = 1;
2974 int uptodate;
2975 unsigned long index;
2977 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
2978 if (ret)
2979 return 1;
2980 while(start <= end) {
2981 index = start >> PAGE_CACHE_SHIFT;
2982 page = find_get_page(tree->mapping, index);
if (!page) {
	pg_uptodate = 0;
	break;
}
2983 uptodate = PageUptodate(page);
2984 page_cache_release(page);
2985 if (!uptodate) {
2986 pg_uptodate = 0;
2987 break;
2989 start += PAGE_CACHE_SIZE;
2991 return pg_uptodate;
2994 int extent_buffer_uptodate(struct extent_io_tree *tree,
2995 struct extent_buffer *eb)
2997 int ret = 0;
2998 unsigned long num_pages;
2999 unsigned long i;
3000 struct page *page;
3001 int pg_uptodate = 1;
3003 if (eb->flags & EXTENT_UPTODATE)
3004 return 1;
3006 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3007 EXTENT_UPTODATE, 1);
3008 if (ret)
3009 return ret;
3011 num_pages = num_extent_pages(eb->start, eb->len);
3012 for (i = 0; i < num_pages; i++) {
3013 page = extent_buffer_page(eb, i);
3014 if (!PageUptodate(page)) {
3015 pg_uptodate = 0;
3016 break;
3019 return pg_uptodate;
3021 EXPORT_SYMBOL(extent_buffer_uptodate);
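/*
 * read the pages backing an extent buffer.  All pages are locked first
 * (trylock when 'wait' is zero); if every page is already uptodate the
 * buffer is just flagged EXTENT_UPTODATE.  Otherwise reads are issued
 * through a shared bio and, when 'wait' is set, the caller sleeps until
 * each page unlocks and reports uptodate, returning -EIO on failure.
 */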
3023 int read_extent_buffer_pages(struct extent_io_tree *tree,
3024 struct extent_buffer *eb,
3025 u64 start, int wait,
3026 get_extent_t *get_extent, int mirror_num)
3028 unsigned long i;
3029 unsigned long start_i;
3030 struct page *page;
3031 int err;
3032 int ret = 0;
3033 int locked_pages = 0;
3034 int all_uptodate = 1;
3035 int inc_all_pages = 0;
3036 unsigned long num_pages;
3037 struct bio *bio = NULL;
3039 if (eb->flags & EXTENT_UPTODATE)
3040 return 0;
3042 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3043 EXTENT_UPTODATE, 1)) {
3044 return 0;
3047 if (start) {
3048 WARN_ON(start < eb->start);
3049 start_i = (start >> PAGE_CACHE_SHIFT) -
3050 (eb->start >> PAGE_CACHE_SHIFT);
3051 } else {
3052 start_i = 0;
3055 num_pages = num_extent_pages(eb->start, eb->len);
3056 for (i = start_i; i < num_pages; i++) {
3057 page = extent_buffer_page(eb, i);
3058 if (!wait) {
3059 if (!trylock_page(page))
3060 goto unlock_exit;
3061 } else {
3062 lock_page(page);
3064 locked_pages++;
3065 if (!PageUptodate(page)) {
3066 all_uptodate = 0;
3069 if (all_uptodate) {
3070 if (start_i == 0)
3071 eb->flags |= EXTENT_UPTODATE;
3072 goto unlock_exit;
3075 for (i = start_i; i < num_pages; i++) {
3076 page = extent_buffer_page(eb, i);
3077 if (inc_all_pages)
3078 page_cache_get(page);
3079 if (!PageUptodate(page)) {
3080 if (start_i == 0)
3081 inc_all_pages = 1;
3082 ClearPageError(page);
3083 err = __extent_read_full_page(tree, page,
3084 get_extent, &bio,
3085 mirror_num);
3086 if (err) {
3087 ret = err;
3089 } else {
3090 unlock_page(page);
3094 if (bio)
3095 submit_one_bio(READ, bio, mirror_num);
3097 if (ret || !wait) {
3098 return ret;
3100 for (i = start_i; i < num_pages; i++) {
3101 page = extent_buffer_page(eb, i);
3102 wait_on_page_locked(page);
3103 if (!PageUptodate(page)) {
3104 ret = -EIO;
3107 if (!ret)
3108 eb->flags |= EXTENT_UPTODATE;
3109 return ret;
3111 unlock_exit:
3112 i = start_i;
3113 while(locked_pages > 0) {
3114 page = extent_buffer_page(eb, i);
3115 i++;
3116 unlock_page(page);
3117 locked_pages--;
3119 return ret;
3121 EXPORT_SYMBOL(read_extent_buffer_pages);
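/*
 * copy 'len' bytes starting at 'start' (an offset into the buffer, not a
 * disk offset) out of the buffer into 'dstv', one page at a time through
 * kmap_atomic.  A caller reading a small header field might do something
 * like (illustrative only):
 *
 *	u8 level;
 *	read_extent_buffer(eb, &level,
 *			   offsetof(struct btrfs_header, level),
 *			   sizeof(level));
 */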
3123 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3124 unsigned long start,
3125 unsigned long len)
3127 size_t cur;
3128 size_t offset;
3129 struct page *page;
3130 char *kaddr;
3131 char *dst = (char *)dstv;
3132 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3133 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3135 WARN_ON(start > eb->len);
3136 WARN_ON(start + len > eb->start + eb->len);
3138 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3140 while(len > 0) {
3141 page = extent_buffer_page(eb, i);
3143 cur = min(len, (PAGE_CACHE_SIZE - offset));
3144 kaddr = kmap_atomic(page, KM_USER1);
3145 memcpy(dst, kaddr + offset, cur);
3146 kunmap_atomic(kaddr, KM_USER1);
3148 dst += cur;
3149 len -= cur;
3150 offset = 0;
3151 i++;
3154 EXPORT_SYMBOL(read_extent_buffer);
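/*
 * map a chunk of the buffer for direct access.  The requested range must
 * fit inside a single page; if it straddles a page boundary -EINVAL is
 * returned and the caller has to fall back to read/write_extent_buffer.
 * On success *token holds the kmap_atomic cookie to pass to
 * unmap_extent_buffer, *map points at the requested offset and *map_len
 * says how many bytes are addressable from there.
 */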
3156 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3157 unsigned long min_len, char **token, char **map,
3158 unsigned long *map_start,
3159 unsigned long *map_len, int km)
3161 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3162 char *kaddr;
3163 struct page *p;
3164 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3165 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3166 unsigned long end_i = (start_offset + start + min_len - 1) >>
3167 PAGE_CACHE_SHIFT;
3169 if (i != end_i)
3170 return -EINVAL;
3172 if (i == 0) {
3173 offset = start_offset;
3174 *map_start = 0;
3175 } else {
3176 offset = 0;
3177 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3179 if (start + min_len > eb->len) {
2180 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
       eb->start, eb->len, start, min_len);
2181 WARN_ON(1);
return -EINVAL;
3184 p = extent_buffer_page(eb, i);
3185 kaddr = kmap_atomic(p, km);
3186 *token = kaddr;
3187 *map = kaddr + offset;
3188 *map_len = PAGE_CACHE_SIZE - offset;
3189 return 0;
3191 EXPORT_SYMBOL(map_private_extent_buffer);
3193 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3194 unsigned long min_len,
3195 char **token, char **map,
3196 unsigned long *map_start,
3197 unsigned long *map_len, int km)
3199 int err;
3200 int save = 0;
3201 if (eb->map_token) {
3202 unmap_extent_buffer(eb, eb->map_token, km);
3203 eb->map_token = NULL;
3204 save = 1;
3206 err = map_private_extent_buffer(eb, start, min_len, token, map,
3207 map_start, map_len, km);
3208 if (!err && save) {
3209 eb->map_token = *token;
3210 eb->kaddr = *map;
3211 eb->map_start = *map_start;
3212 eb->map_len = *map_len;
3214 return err;
3216 EXPORT_SYMBOL(map_extent_buffer);
3218 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3220 kunmap_atomic(token, km);
3222 EXPORT_SYMBOL(unmap_extent_buffer);
3224 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3225 unsigned long start,
3226 unsigned long len)
3228 size_t cur;
3229 size_t offset;
3230 struct page *page;
3231 char *kaddr;
3232 char *ptr = (char *)ptrv;
3233 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3234 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3235 int ret = 0;
3237 WARN_ON(start > eb->len);
3238 WARN_ON(start + len > eb->start + eb->len);
3240 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3242 while(len > 0) {
3243 page = extent_buffer_page(eb, i);
3245 cur = min(len, (PAGE_CACHE_SIZE - offset));
3247 kaddr = kmap_atomic(page, KM_USER0);
3248 ret = memcmp(ptr, kaddr + offset, cur);
3249 kunmap_atomic(kaddr, KM_USER0);
3250 if (ret)
3251 break;
3253 ptr += cur;
3254 len -= cur;
3255 offset = 0;
3256 i++;
3258 return ret;
3260 EXPORT_SYMBOL(memcmp_extent_buffer);
3262 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3263 unsigned long start, unsigned long len)
3265 size_t cur;
3266 size_t offset;
3267 struct page *page;
3268 char *kaddr;
3269 char *src = (char *)srcv;
3270 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3271 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3273 WARN_ON(start > eb->len);
3274 WARN_ON(start + len > eb->start + eb->len);
3276 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3278 while(len > 0) {
3279 page = extent_buffer_page(eb, i);
3280 WARN_ON(!PageUptodate(page));
3282 cur = min(len, PAGE_CACHE_SIZE - offset);
3283 kaddr = kmap_atomic(page, KM_USER1);
3284 memcpy(kaddr + offset, src, cur);
3285 kunmap_atomic(kaddr, KM_USER1);
3287 src += cur;
3288 len -= cur;
3289 offset = 0;
3290 i++;
3293 EXPORT_SYMBOL(write_extent_buffer);
3295 void memset_extent_buffer(struct extent_buffer *eb, char c,
3296 unsigned long start, unsigned long len)
3298 size_t cur;
3299 size_t offset;
3300 struct page *page;
3301 char *kaddr;
3302 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3303 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3305 WARN_ON(start > eb->len);
3306 WARN_ON(start + len > eb->start + eb->len);
3308 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3310 while(len > 0) {
3311 page = extent_buffer_page(eb, i);
3312 WARN_ON(!PageUptodate(page));
3314 cur = min(len, PAGE_CACHE_SIZE - offset);
3315 kaddr = kmap_atomic(page, KM_USER0);
3316 memset(kaddr + offset, c, cur);
3317 kunmap_atomic(kaddr, KM_USER0);
3319 len -= cur;
3320 offset = 0;
3321 i++;
3324 EXPORT_SYMBOL(memset_extent_buffer);
3326 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3327 unsigned long dst_offset, unsigned long src_offset,
3328 unsigned long len)
3330 u64 dst_len = dst->len;
3331 size_t cur;
3332 size_t offset;
3333 struct page *page;
3334 char *kaddr;
3335 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3338 WARN_ON(src->len != dst_len);
3340 offset = (start_offset + dst_offset) &
3341 ((unsigned long)PAGE_CACHE_SIZE - 1);
3343 while(len > 0) {
3344 page = extent_buffer_page(dst, i);
3345 WARN_ON(!PageUptodate(page));
3347 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3349 kaddr = kmap_atomic(page, KM_USER0);
3350 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3351 kunmap_atomic(kaddr, KM_USER0);
3353 src_offset += cur;
3354 len -= cur;
3355 offset = 0;
3356 i++;
3359 EXPORT_SYMBOL(copy_extent_buffer);
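/*
 * page-level helpers for the extent buffer memmove/memcpy routines.
 * move_pages copies byte by byte from the end toward the start so that
 * overlapping ranges in different pages are handled like memmove (a
 * plain memmove is used when source and destination share a page);
 * copy_pages assumes the ranges do not overlap and uses memcpy.
 */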
3361 static void move_pages(struct page *dst_page, struct page *src_page,
3362 unsigned long dst_off, unsigned long src_off,
3363 unsigned long len)
3365 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3366 if (dst_page == src_page) {
3367 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3368 } else {
3369 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3370 char *p = dst_kaddr + dst_off + len;
3371 char *s = src_kaddr + src_off + len;
3373 while (len--)
3374 *--p = *--s;
3376 kunmap_atomic(src_kaddr, KM_USER1);
3378 kunmap_atomic(dst_kaddr, KM_USER0);
3381 static void copy_pages(struct page *dst_page, struct page *src_page,
3382 unsigned long dst_off, unsigned long src_off,
3383 unsigned long len)
3385 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3386 char *src_kaddr;
3388 if (dst_page != src_page)
3389 src_kaddr = kmap_atomic(src_page, KM_USER1);
3390 else
3391 src_kaddr = dst_kaddr;
3393 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3394 kunmap_atomic(dst_kaddr, KM_USER0);
3395 if (dst_page != src_page)
3396 kunmap_atomic(src_kaddr, KM_USER1);
3399 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3400 unsigned long src_offset, unsigned long len)
3402 size_t cur;
3403 size_t dst_off_in_page;
3404 size_t src_off_in_page;
3405 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3406 unsigned long dst_i;
3407 unsigned long src_i;
3409 if (src_offset + len > dst->len) {
3410 printk("memcpy bogus src_offset %lu move len %lu dst len %lu\n",
3411 src_offset, len, dst->len);
3412 BUG_ON(1);
3414 if (dst_offset + len > dst->len) {
3415 printk("memcpy bogus dst_offset %lu move len %lu dst len %lu\n",
3416 dst_offset, len, dst->len);
3417 BUG_ON(1);
3420 while(len > 0) {
3421 dst_off_in_page = (start_offset + dst_offset) &
3422 ((unsigned long)PAGE_CACHE_SIZE - 1);
3423 src_off_in_page = (start_offset + src_offset) &
3424 ((unsigned long)PAGE_CACHE_SIZE - 1);
3426 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3427 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3429 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3430 src_off_in_page));
3431 cur = min_t(unsigned long, cur,
3432 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3434 copy_pages(extent_buffer_page(dst, dst_i),
3435 extent_buffer_page(dst, src_i),
3436 dst_off_in_page, src_off_in_page, cur);
3438 src_offset += cur;
3439 dst_offset += cur;
3440 len -= cur;
3443 EXPORT_SYMBOL(memcpy_extent_buffer);
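/*
 * memmove within a single extent buffer.  Non-overlapping moves (dst
 * before src) are handed to memcpy_extent_buffer; otherwise the copy
 * walks backwards from the end of both ranges, page chunk by page
 * chunk, so a forward-overlapping move never reads bytes it has
 * already overwritten.
 */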
3445 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3446 unsigned long src_offset, unsigned long len)
3448 size_t cur;
3449 size_t dst_off_in_page;
3450 size_t src_off_in_page;
3451 unsigned long dst_end = dst_offset + len - 1;
3452 unsigned long src_end = src_offset + len - 1;
3453 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3454 unsigned long dst_i;
3455 unsigned long src_i;
3457 if (src_offset + len > dst->len) {
3458 printk("memmove bogus src_offset %lu move len %lu dst len %lu\n",
3459 src_offset, len, dst->len);
3460 BUG_ON(1);
3462 if (dst_offset + len > dst->len) {
3463 printk("memmove bogus dst_offset %lu move len %lu dst len %lu\n",
3464 dst_offset, len, dst->len);
3465 BUG_ON(1);
3467 if (dst_offset < src_offset) {
3468 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3469 return;
3471 while(len > 0) {
3472 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3473 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3475 dst_off_in_page = (start_offset + dst_end) &
3476 ((unsigned long)PAGE_CACHE_SIZE - 1);
3477 src_off_in_page = (start_offset + src_end) &
3478 ((unsigned long)PAGE_CACHE_SIZE - 1);
3480 cur = min_t(unsigned long, len, src_off_in_page + 1);
3481 cur = min(cur, dst_off_in_page + 1);
3482 move_pages(extent_buffer_page(dst, dst_i),
3483 extent_buffer_page(dst, src_i),
3484 dst_off_in_page - cur + 1,
3485 src_off_in_page - cur + 1, cur);
3487 dst_end -= cur;
3488 src_end -= cur;
3489 len -= cur;
3492 EXPORT_SYMBOL(memmove_extent_buffer);
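/*
 * releasepage helper for extent buffer pages: if the buffer that starts
 * at this page has no references beyond the tree's own, drop every page
 * reference it holds, remove it from tree->buffer and free it.  Returns
 * 0 (page busy) when the buffer is still in use.
 */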
3494 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3496 u64 start = page_offset(page);
3497 struct extent_buffer *eb;
3498 int ret = 1;
3499 unsigned long i;
3500 unsigned long num_pages;
3502 spin_lock(&tree->buffer_lock);
3503 eb = buffer_search(tree, start);
3504 if (!eb)
3505 goto out;
3507 if (atomic_read(&eb->refs) > 1) {
3508 ret = 0;
3509 goto out;
3511 /* at this point we can safely release the extent buffer */
3512 num_pages = num_extent_pages(eb->start, eb->len);
3513 for (i = 0; i < num_pages; i++) {
3514 struct page *page = extent_buffer_page(eb, i);
3515 page_cache_release(page);
3517 rb_erase(&eb->rb_node, &tree->buffer);
3518 __free_extent_buffer(eb);
3519 out:
3520 spin_unlock(&tree->buffer_lock);
3521 return ret;
3523 EXPORT_SYMBOL(try_release_extent_buffer);