Merge branch 'nfs-for-2.6.37' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
[linux-2.6.git] / fs / btrfs / free-space-cache.c
blobf488fac04d99ea45eea93607bbf17c021b5b2207
1 /*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/slab.h>
22 #include <linux/math64.h>
23 #include "ctree.h"
24 #include "free-space-cache.h"
25 #include "transaction.h"
27 #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
28 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
30 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
31 u64 offset)
33 BUG_ON(offset < bitmap_start);
34 offset -= bitmap_start;
35 return (unsigned long)(div64_u64(offset, sectorsize));
38 static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
40 return (unsigned long)(div64_u64(bytes, sectorsize));
43 static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
44 u64 offset)
46 u64 bitmap_start;
47 u64 bytes_per_bitmap;
49 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
50 bitmap_start = offset - block_group->key.objectid;
51 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
52 bitmap_start *= bytes_per_bitmap;
53 bitmap_start += block_group->key.objectid;
55 return bitmap_start;
58 static int tree_insert_offset(struct rb_root *root, u64 offset,
59 struct rb_node *node, int bitmap)
61 struct rb_node **p = &root->rb_node;
62 struct rb_node *parent = NULL;
63 struct btrfs_free_space *info;
65 while (*p) {
66 parent = *p;
67 info = rb_entry(parent, struct btrfs_free_space, offset_index);
69 if (offset < info->offset) {
70 p = &(*p)->rb_left;
71 } else if (offset > info->offset) {
72 p = &(*p)->rb_right;
73 } else {
75 * we could have a bitmap entry and an extent entry
76 * share the same offset. If this is the case, we want
77 * the extent entry to always be found first if we do a
78 * linear search through the tree, since we want to have
79 * the quickest allocation time, and allocating from an
80 * extent is faster than allocating from a bitmap. So
81 * if we're inserting a bitmap and we find an entry at
82 * this offset, we want to go right, or after this entry
83 * logically. If we are inserting an extent and we've
84 * found a bitmap, we want to go left, or before
85 * logically.
87 if (bitmap) {
88 WARN_ON(info->bitmap);
89 p = &(*p)->rb_right;
90 } else {
91 WARN_ON(!info->bitmap);
92 p = &(*p)->rb_left;
97 rb_link_node(node, parent, p);
98 rb_insert_color(node, root);
100 return 0;
104 * searches the tree for the given offset.
106 * fuzzy - If this is set, then we are trying to make an allocation, and we just
107 * want a section that has at least bytes size and comes at or after the given
108 * offset.
110 static struct btrfs_free_space *
111 tree_search_offset(struct btrfs_block_group_cache *block_group,
112 u64 offset, int bitmap_only, int fuzzy)
114 struct rb_node *n = block_group->free_space_offset.rb_node;
115 struct btrfs_free_space *entry, *prev = NULL;
117 /* find entry that is closest to the 'offset' */
118 while (1) {
119 if (!n) {
120 entry = NULL;
121 break;
124 entry = rb_entry(n, struct btrfs_free_space, offset_index);
125 prev = entry;
127 if (offset < entry->offset)
128 n = n->rb_left;
129 else if (offset > entry->offset)
130 n = n->rb_right;
131 else
132 break;
135 if (bitmap_only) {
136 if (!entry)
137 return NULL;
138 if (entry->bitmap)
139 return entry;
142 * bitmap entry and extent entry may share same offset,
143 * in that case, bitmap entry comes after extent entry.
145 n = rb_next(n);
146 if (!n)
147 return NULL;
148 entry = rb_entry(n, struct btrfs_free_space, offset_index);
149 if (entry->offset != offset)
150 return NULL;
152 WARN_ON(!entry->bitmap);
153 return entry;
154 } else if (entry) {
155 if (entry->bitmap) {
157 * if previous extent entry covers the offset,
158 * we should return it instead of the bitmap entry
160 n = &entry->offset_index;
161 while (1) {
162 n = rb_prev(n);
163 if (!n)
164 break;
165 prev = rb_entry(n, struct btrfs_free_space,
166 offset_index);
167 if (!prev->bitmap) {
168 if (prev->offset + prev->bytes > offset)
169 entry = prev;
170 break;
174 return entry;
177 if (!prev)
178 return NULL;
180 /* find last entry before the 'offset' */
181 entry = prev;
182 if (entry->offset > offset) {
183 n = rb_prev(&entry->offset_index);
184 if (n) {
185 entry = rb_entry(n, struct btrfs_free_space,
186 offset_index);
187 BUG_ON(entry->offset > offset);
188 } else {
189 if (fuzzy)
190 return entry;
191 else
192 return NULL;
196 if (entry->bitmap) {
197 n = &entry->offset_index;
198 while (1) {
199 n = rb_prev(n);
200 if (!n)
201 break;
202 prev = rb_entry(n, struct btrfs_free_space,
203 offset_index);
204 if (!prev->bitmap) {
205 if (prev->offset + prev->bytes > offset)
206 return prev;
207 break;
210 if (entry->offset + BITS_PER_BITMAP *
211 block_group->sectorsize > offset)
212 return entry;
213 } else if (entry->offset + entry->bytes > offset)
214 return entry;
216 if (!fuzzy)
217 return NULL;
219 while (1) {
220 if (entry->bitmap) {
221 if (entry->offset + BITS_PER_BITMAP *
222 block_group->sectorsize > offset)
223 break;
224 } else {
225 if (entry->offset + entry->bytes > offset)
226 break;
229 n = rb_next(&entry->offset_index);
230 if (!n)
231 return NULL;
232 entry = rb_entry(n, struct btrfs_free_space, offset_index);
234 return entry;
237 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
238 struct btrfs_free_space *info)
240 rb_erase(&info->offset_index, &block_group->free_space_offset);
241 block_group->free_extents--;
242 block_group->free_space -= info->bytes;
245 static int link_free_space(struct btrfs_block_group_cache *block_group,
246 struct btrfs_free_space *info)
248 int ret = 0;
250 BUG_ON(!info->bitmap && !info->bytes);
251 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
252 &info->offset_index, (info->bitmap != NULL));
253 if (ret)
254 return ret;
256 block_group->free_space += info->bytes;
257 block_group->free_extents++;
258 return ret;
261 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
263 u64 max_bytes;
264 u64 bitmap_bytes;
265 u64 extent_bytes;
268 * The goal is to keep the total amount of memory used per 1gb of space
269 * at or below 32k, so we need to adjust how much memory we allow to be
270 * used by extent based free space tracking
272 max_bytes = MAX_CACHE_BYTES_PER_GIG *
273 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
276 * we want to account for 1 more bitmap than what we have so we can make
277 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
278 * we add more bitmaps.
280 bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
282 if (bitmap_bytes >= max_bytes) {
283 block_group->extents_thresh = 0;
284 return;
288 * we want the extent entry threshold to always be at most 1/2 the maxw
289 * bytes we can have, or whatever is less than that.
291 extent_bytes = max_bytes - bitmap_bytes;
292 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
294 block_group->extents_thresh =
295 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
298 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
299 struct btrfs_free_space *info, u64 offset,
300 u64 bytes)
302 unsigned long start, end;
303 unsigned long i;
305 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
306 end = start + bytes_to_bits(bytes, block_group->sectorsize);
307 BUG_ON(end > BITS_PER_BITMAP);
309 for (i = start; i < end; i++)
310 clear_bit(i, info->bitmap);
312 info->bytes -= bytes;
313 block_group->free_space -= bytes;
316 static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
317 struct btrfs_free_space *info, u64 offset,
318 u64 bytes)
320 unsigned long start, end;
321 unsigned long i;
323 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
324 end = start + bytes_to_bits(bytes, block_group->sectorsize);
325 BUG_ON(end > BITS_PER_BITMAP);
327 for (i = start; i < end; i++)
328 set_bit(i, info->bitmap);
330 info->bytes += bytes;
331 block_group->free_space += bytes;
334 static int search_bitmap(struct btrfs_block_group_cache *block_group,
335 struct btrfs_free_space *bitmap_info, u64 *offset,
336 u64 *bytes)
338 unsigned long found_bits = 0;
339 unsigned long bits, i;
340 unsigned long next_zero;
342 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
343 max_t(u64, *offset, bitmap_info->offset));
344 bits = bytes_to_bits(*bytes, block_group->sectorsize);
346 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
347 i < BITS_PER_BITMAP;
348 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
349 next_zero = find_next_zero_bit(bitmap_info->bitmap,
350 BITS_PER_BITMAP, i);
351 if ((next_zero - i) >= bits) {
352 found_bits = next_zero - i;
353 break;
355 i = next_zero;
358 if (found_bits) {
359 *offset = (u64)(i * block_group->sectorsize) +
360 bitmap_info->offset;
361 *bytes = (u64)(found_bits) * block_group->sectorsize;
362 return 0;
365 return -1;
368 static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
369 *block_group, u64 *offset,
370 u64 *bytes, int debug)
372 struct btrfs_free_space *entry;
373 struct rb_node *node;
374 int ret;
376 if (!block_group->free_space_offset.rb_node)
377 return NULL;
379 entry = tree_search_offset(block_group,
380 offset_to_bitmap(block_group, *offset),
381 0, 1);
382 if (!entry)
383 return NULL;
385 for (node = &entry->offset_index; node; node = rb_next(node)) {
386 entry = rb_entry(node, struct btrfs_free_space, offset_index);
387 if (entry->bytes < *bytes)
388 continue;
390 if (entry->bitmap) {
391 ret = search_bitmap(block_group, entry, offset, bytes);
392 if (!ret)
393 return entry;
394 continue;
397 *offset = entry->offset;
398 *bytes = entry->bytes;
399 return entry;
402 return NULL;
405 static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
406 struct btrfs_free_space *info, u64 offset)
408 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
409 int max_bitmaps = (int)div64_u64(block_group->key.offset +
410 bytes_per_bg - 1, bytes_per_bg);
411 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
413 info->offset = offset_to_bitmap(block_group, offset);
414 info->bytes = 0;
415 link_free_space(block_group, info);
416 block_group->total_bitmaps++;
418 recalculate_thresholds(block_group);
421 static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
422 struct btrfs_free_space *bitmap_info,
423 u64 *offset, u64 *bytes)
425 u64 end;
426 u64 search_start, search_bytes;
427 int ret;
429 again:
430 end = bitmap_info->offset +
431 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
434 * XXX - this can go away after a few releases.
436 * since the only user of btrfs_remove_free_space is the tree logging
437 * stuff, and the only way to test that is under crash conditions, we
438 * want to have this debug stuff here just in case somethings not
439 * working. Search the bitmap for the space we are trying to use to
440 * make sure its actually there. If its not there then we need to stop
441 * because something has gone wrong.
443 search_start = *offset;
444 search_bytes = *bytes;
445 ret = search_bitmap(block_group, bitmap_info, &search_start,
446 &search_bytes);
447 BUG_ON(ret < 0 || search_start != *offset);
449 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
450 bitmap_clear_bits(block_group, bitmap_info, *offset,
451 end - *offset + 1);
452 *bytes -= end - *offset + 1;
453 *offset = end + 1;
454 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
455 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
456 *bytes = 0;
459 if (*bytes) {
460 struct rb_node *next = rb_next(&bitmap_info->offset_index);
461 if (!bitmap_info->bytes) {
462 unlink_free_space(block_group, bitmap_info);
463 kfree(bitmap_info->bitmap);
464 kfree(bitmap_info);
465 block_group->total_bitmaps--;
466 recalculate_thresholds(block_group);
470 * no entry after this bitmap, but we still have bytes to
471 * remove, so something has gone wrong.
473 if (!next)
474 return -EINVAL;
476 bitmap_info = rb_entry(next, struct btrfs_free_space,
477 offset_index);
480 * if the next entry isn't a bitmap we need to return to let the
481 * extent stuff do its work.
483 if (!bitmap_info->bitmap)
484 return -EAGAIN;
487 * Ok the next item is a bitmap, but it may not actually hold
488 * the information for the rest of this free space stuff, so
489 * look for it, and if we don't find it return so we can try
490 * everything over again.
492 search_start = *offset;
493 search_bytes = *bytes;
494 ret = search_bitmap(block_group, bitmap_info, &search_start,
495 &search_bytes);
496 if (ret < 0 || search_start != *offset)
497 return -EAGAIN;
499 goto again;
500 } else if (!bitmap_info->bytes) {
501 unlink_free_space(block_group, bitmap_info);
502 kfree(bitmap_info->bitmap);
503 kfree(bitmap_info);
504 block_group->total_bitmaps--;
505 recalculate_thresholds(block_group);
508 return 0;
511 static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
512 struct btrfs_free_space *info)
514 struct btrfs_free_space *bitmap_info;
515 int added = 0;
516 u64 bytes, offset, end;
517 int ret;
520 * If we are below the extents threshold then we can add this as an
521 * extent, and don't have to deal with the bitmap
523 if (block_group->free_extents < block_group->extents_thresh &&
524 info->bytes > block_group->sectorsize * 4)
525 return 0;
528 * some block groups are so tiny they can't be enveloped by a bitmap, so
529 * don't even bother to create a bitmap for this
531 if (BITS_PER_BITMAP * block_group->sectorsize >
532 block_group->key.offset)
533 return 0;
535 bytes = info->bytes;
536 offset = info->offset;
538 again:
539 bitmap_info = tree_search_offset(block_group,
540 offset_to_bitmap(block_group, offset),
541 1, 0);
542 if (!bitmap_info) {
543 BUG_ON(added);
544 goto new_bitmap;
547 end = bitmap_info->offset +
548 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
550 if (offset >= bitmap_info->offset && offset + bytes > end) {
551 bitmap_set_bits(block_group, bitmap_info, offset,
552 end - offset);
553 bytes -= end - offset;
554 offset = end;
555 added = 0;
556 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
557 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
558 bytes = 0;
559 } else {
560 BUG();
563 if (!bytes) {
564 ret = 1;
565 goto out;
566 } else
567 goto again;
569 new_bitmap:
570 if (info && info->bitmap) {
571 add_new_bitmap(block_group, info, offset);
572 added = 1;
573 info = NULL;
574 goto again;
575 } else {
576 spin_unlock(&block_group->tree_lock);
578 /* no pre-allocated info, allocate a new one */
579 if (!info) {
580 info = kzalloc(sizeof(struct btrfs_free_space),
581 GFP_NOFS);
582 if (!info) {
583 spin_lock(&block_group->tree_lock);
584 ret = -ENOMEM;
585 goto out;
589 /* allocate the bitmap */
590 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
591 spin_lock(&block_group->tree_lock);
592 if (!info->bitmap) {
593 ret = -ENOMEM;
594 goto out;
596 goto again;
599 out:
600 if (info) {
601 if (info->bitmap)
602 kfree(info->bitmap);
603 kfree(info);
606 return ret;
609 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
610 u64 offset, u64 bytes)
612 struct btrfs_free_space *right_info = NULL;
613 struct btrfs_free_space *left_info = NULL;
614 struct btrfs_free_space *info = NULL;
615 int ret = 0;
617 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
618 if (!info)
619 return -ENOMEM;
621 info->offset = offset;
622 info->bytes = bytes;
624 spin_lock(&block_group->tree_lock);
627 * first we want to see if there is free space adjacent to the range we
628 * are adding, if there is remove that struct and add a new one to
629 * cover the entire range
631 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
632 if (right_info && rb_prev(&right_info->offset_index))
633 left_info = rb_entry(rb_prev(&right_info->offset_index),
634 struct btrfs_free_space, offset_index);
635 else
636 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
639 * If there was no extent directly to the left or right of this new
640 * extent then we know we're going to have to allocate a new extent, so
641 * before we do that see if we need to drop this into a bitmap
643 if ((!left_info || left_info->bitmap) &&
644 (!right_info || right_info->bitmap)) {
645 ret = insert_into_bitmap(block_group, info);
647 if (ret < 0) {
648 goto out;
649 } else if (ret) {
650 ret = 0;
651 goto out;
655 if (right_info && !right_info->bitmap) {
656 unlink_free_space(block_group, right_info);
657 info->bytes += right_info->bytes;
658 kfree(right_info);
661 if (left_info && !left_info->bitmap &&
662 left_info->offset + left_info->bytes == offset) {
663 unlink_free_space(block_group, left_info);
664 info->offset = left_info->offset;
665 info->bytes += left_info->bytes;
666 kfree(left_info);
669 ret = link_free_space(block_group, info);
670 if (ret)
671 kfree(info);
672 out:
673 spin_unlock(&block_group->tree_lock);
675 if (ret) {
676 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
677 BUG_ON(ret == -EEXIST);
680 return ret;
683 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
684 u64 offset, u64 bytes)
686 struct btrfs_free_space *info;
687 struct btrfs_free_space *next_info = NULL;
688 int ret = 0;
690 spin_lock(&block_group->tree_lock);
692 again:
693 info = tree_search_offset(block_group, offset, 0, 0);
694 if (!info) {
696 * oops didn't find an extent that matched the space we wanted
697 * to remove, look for a bitmap instead
699 info = tree_search_offset(block_group,
700 offset_to_bitmap(block_group, offset),
701 1, 0);
702 if (!info) {
703 WARN_ON(1);
704 goto out_lock;
708 if (info->bytes < bytes && rb_next(&info->offset_index)) {
709 u64 end;
710 next_info = rb_entry(rb_next(&info->offset_index),
711 struct btrfs_free_space,
712 offset_index);
714 if (next_info->bitmap)
715 end = next_info->offset + BITS_PER_BITMAP *
716 block_group->sectorsize - 1;
717 else
718 end = next_info->offset + next_info->bytes;
720 if (next_info->bytes < bytes ||
721 next_info->offset > offset || offset > end) {
722 printk(KERN_CRIT "Found free space at %llu, size %llu,"
723 " trying to use %llu\n",
724 (unsigned long long)info->offset,
725 (unsigned long long)info->bytes,
726 (unsigned long long)bytes);
727 WARN_ON(1);
728 ret = -EINVAL;
729 goto out_lock;
732 info = next_info;
735 if (info->bytes == bytes) {
736 unlink_free_space(block_group, info);
737 if (info->bitmap) {
738 kfree(info->bitmap);
739 block_group->total_bitmaps--;
741 kfree(info);
742 goto out_lock;
745 if (!info->bitmap && info->offset == offset) {
746 unlink_free_space(block_group, info);
747 info->offset += bytes;
748 info->bytes -= bytes;
749 link_free_space(block_group, info);
750 goto out_lock;
753 if (!info->bitmap && info->offset <= offset &&
754 info->offset + info->bytes >= offset + bytes) {
755 u64 old_start = info->offset;
757 * we're freeing space in the middle of the info,
758 * this can happen during tree log replay
760 * first unlink the old info and then
761 * insert it again after the hole we're creating
763 unlink_free_space(block_group, info);
764 if (offset + bytes < info->offset + info->bytes) {
765 u64 old_end = info->offset + info->bytes;
767 info->offset = offset + bytes;
768 info->bytes = old_end - info->offset;
769 ret = link_free_space(block_group, info);
770 WARN_ON(ret);
771 if (ret)
772 goto out_lock;
773 } else {
774 /* the hole we're creating ends at the end
775 * of the info struct, just free the info
777 kfree(info);
779 spin_unlock(&block_group->tree_lock);
781 /* step two, insert a new info struct to cover
782 * anything before the hole
784 ret = btrfs_add_free_space(block_group, old_start,
785 offset - old_start);
786 WARN_ON(ret);
787 goto out;
790 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
791 if (ret == -EAGAIN)
792 goto again;
793 BUG_ON(ret);
794 out_lock:
795 spin_unlock(&block_group->tree_lock);
796 out:
797 return ret;
800 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
801 u64 bytes)
803 struct btrfs_free_space *info;
804 struct rb_node *n;
805 int count = 0;
807 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
808 info = rb_entry(n, struct btrfs_free_space, offset_index);
809 if (info->bytes >= bytes)
810 count++;
811 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
812 (unsigned long long)info->offset,
813 (unsigned long long)info->bytes,
814 (info->bitmap) ? "yes" : "no");
816 printk(KERN_INFO "block group has cluster?: %s\n",
817 list_empty(&block_group->cluster_list) ? "no" : "yes");
818 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
819 "\n", count);
822 u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
824 struct btrfs_free_space *info;
825 struct rb_node *n;
826 u64 ret = 0;
828 for (n = rb_first(&block_group->free_space_offset); n;
829 n = rb_next(n)) {
830 info = rb_entry(n, struct btrfs_free_space, offset_index);
831 ret += info->bytes;
834 return ret;
838 * for a given cluster, put all of its extents back into the free
839 * space cache. If the block group passed doesn't match the block group
840 * pointed to by the cluster, someone else raced in and freed the
841 * cluster already. In that case, we just return without changing anything
843 static int
844 __btrfs_return_cluster_to_free_space(
845 struct btrfs_block_group_cache *block_group,
846 struct btrfs_free_cluster *cluster)
848 struct btrfs_free_space *entry;
849 struct rb_node *node;
850 bool bitmap;
852 spin_lock(&cluster->lock);
853 if (cluster->block_group != block_group)
854 goto out;
856 bitmap = cluster->points_to_bitmap;
857 cluster->block_group = NULL;
858 cluster->window_start = 0;
859 list_del_init(&cluster->block_group_list);
860 cluster->points_to_bitmap = false;
862 if (bitmap)
863 goto out;
865 node = rb_first(&cluster->root);
866 while (node) {
867 entry = rb_entry(node, struct btrfs_free_space, offset_index);
868 node = rb_next(&entry->offset_index);
869 rb_erase(&entry->offset_index, &cluster->root);
870 BUG_ON(entry->bitmap);
871 tree_insert_offset(&block_group->free_space_offset,
872 entry->offset, &entry->offset_index, 0);
874 cluster->root = RB_ROOT;
876 out:
877 spin_unlock(&cluster->lock);
878 btrfs_put_block_group(block_group);
879 return 0;
882 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
884 struct btrfs_free_space *info;
885 struct rb_node *node;
886 struct btrfs_free_cluster *cluster;
887 struct list_head *head;
889 spin_lock(&block_group->tree_lock);
890 while ((head = block_group->cluster_list.next) !=
891 &block_group->cluster_list) {
892 cluster = list_entry(head, struct btrfs_free_cluster,
893 block_group_list);
895 WARN_ON(cluster->block_group != block_group);
896 __btrfs_return_cluster_to_free_space(block_group, cluster);
897 if (need_resched()) {
898 spin_unlock(&block_group->tree_lock);
899 cond_resched();
900 spin_lock(&block_group->tree_lock);
904 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
905 info = rb_entry(node, struct btrfs_free_space, offset_index);
906 unlink_free_space(block_group, info);
907 if (info->bitmap)
908 kfree(info->bitmap);
909 kfree(info);
910 if (need_resched()) {
911 spin_unlock(&block_group->tree_lock);
912 cond_resched();
913 spin_lock(&block_group->tree_lock);
917 spin_unlock(&block_group->tree_lock);
920 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
921 u64 offset, u64 bytes, u64 empty_size)
923 struct btrfs_free_space *entry = NULL;
924 u64 bytes_search = bytes + empty_size;
925 u64 ret = 0;
927 spin_lock(&block_group->tree_lock);
928 entry = find_free_space(block_group, &offset, &bytes_search, 0);
929 if (!entry)
930 goto out;
932 ret = offset;
933 if (entry->bitmap) {
934 bitmap_clear_bits(block_group, entry, offset, bytes);
935 if (!entry->bytes) {
936 unlink_free_space(block_group, entry);
937 kfree(entry->bitmap);
938 kfree(entry);
939 block_group->total_bitmaps--;
940 recalculate_thresholds(block_group);
942 } else {
943 unlink_free_space(block_group, entry);
944 entry->offset += bytes;
945 entry->bytes -= bytes;
946 if (!entry->bytes)
947 kfree(entry);
948 else
949 link_free_space(block_group, entry);
952 out:
953 spin_unlock(&block_group->tree_lock);
955 return ret;
959 * given a cluster, put all of its extents back into the free space
960 * cache. If a block group is passed, this function will only free
961 * a cluster that belongs to the passed block group.
963 * Otherwise, it'll get a reference on the block group pointed to by the
964 * cluster and remove the cluster from it.
966 int btrfs_return_cluster_to_free_space(
967 struct btrfs_block_group_cache *block_group,
968 struct btrfs_free_cluster *cluster)
970 int ret;
972 /* first, get a safe pointer to the block group */
973 spin_lock(&cluster->lock);
974 if (!block_group) {
975 block_group = cluster->block_group;
976 if (!block_group) {
977 spin_unlock(&cluster->lock);
978 return 0;
980 } else if (cluster->block_group != block_group) {
981 /* someone else has already freed it don't redo their work */
982 spin_unlock(&cluster->lock);
983 return 0;
985 atomic_inc(&block_group->count);
986 spin_unlock(&cluster->lock);
988 /* now return any extents the cluster had on it */
989 spin_lock(&block_group->tree_lock);
990 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
991 spin_unlock(&block_group->tree_lock);
993 /* finally drop our ref */
994 btrfs_put_block_group(block_group);
995 return ret;
998 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
999 struct btrfs_free_cluster *cluster,
1000 u64 bytes, u64 min_start)
1002 struct btrfs_free_space *entry;
1003 int err;
1004 u64 search_start = cluster->window_start;
1005 u64 search_bytes = bytes;
1006 u64 ret = 0;
1008 spin_lock(&block_group->tree_lock);
1009 spin_lock(&cluster->lock);
1011 if (!cluster->points_to_bitmap)
1012 goto out;
1014 if (cluster->block_group != block_group)
1015 goto out;
1018 * search_start is the beginning of the bitmap, but at some point it may
1019 * be a good idea to point to the actual start of the free area in the
1020 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1021 * to 1 to make sure we get the bitmap entry
1023 entry = tree_search_offset(block_group,
1024 offset_to_bitmap(block_group, search_start),
1025 1, 0);
1026 if (!entry || !entry->bitmap)
1027 goto out;
1029 search_start = min_start;
1030 search_bytes = bytes;
1032 err = search_bitmap(block_group, entry, &search_start,
1033 &search_bytes);
1034 if (err)
1035 goto out;
1037 ret = search_start;
1038 bitmap_clear_bits(block_group, entry, ret, bytes);
1039 out:
1040 spin_unlock(&cluster->lock);
1041 spin_unlock(&block_group->tree_lock);
1043 return ret;
1047 * given a cluster, try to allocate 'bytes' from it, returns 0
1048 * if it couldn't find anything suitably large, or a logical disk offset
1049 * if things worked out
1051 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1052 struct btrfs_free_cluster *cluster, u64 bytes,
1053 u64 min_start)
1055 struct btrfs_free_space *entry = NULL;
1056 struct rb_node *node;
1057 u64 ret = 0;
1059 if (cluster->points_to_bitmap)
1060 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1061 min_start);
1063 spin_lock(&cluster->lock);
1064 if (bytes > cluster->max_size)
1065 goto out;
1067 if (cluster->block_group != block_group)
1068 goto out;
1070 node = rb_first(&cluster->root);
1071 if (!node)
1072 goto out;
1074 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1076 while(1) {
1077 if (entry->bytes < bytes || entry->offset < min_start) {
1078 struct rb_node *node;
1080 node = rb_next(&entry->offset_index);
1081 if (!node)
1082 break;
1083 entry = rb_entry(node, struct btrfs_free_space,
1084 offset_index);
1085 continue;
1087 ret = entry->offset;
1089 entry->offset += bytes;
1090 entry->bytes -= bytes;
1092 if (entry->bytes == 0) {
1093 rb_erase(&entry->offset_index, &cluster->root);
1094 kfree(entry);
1096 break;
1098 out:
1099 spin_unlock(&cluster->lock);
1101 return ret;
1104 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1105 struct btrfs_free_space *entry,
1106 struct btrfs_free_cluster *cluster,
1107 u64 offset, u64 bytes, u64 min_bytes)
1109 unsigned long next_zero;
1110 unsigned long i;
1111 unsigned long search_bits;
1112 unsigned long total_bits;
1113 unsigned long found_bits;
1114 unsigned long start = 0;
1115 unsigned long total_found = 0;
1116 bool found = false;
1118 i = offset_to_bit(entry->offset, block_group->sectorsize,
1119 max_t(u64, offset, entry->offset));
1120 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1121 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1123 again:
1124 found_bits = 0;
1125 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1126 i < BITS_PER_BITMAP;
1127 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1128 next_zero = find_next_zero_bit(entry->bitmap,
1129 BITS_PER_BITMAP, i);
1130 if (next_zero - i >= search_bits) {
1131 found_bits = next_zero - i;
1132 break;
1134 i = next_zero;
1137 if (!found_bits)
1138 return -1;
1140 if (!found) {
1141 start = i;
1142 found = true;
1145 total_found += found_bits;
1147 if (cluster->max_size < found_bits * block_group->sectorsize)
1148 cluster->max_size = found_bits * block_group->sectorsize;
1150 if (total_found < total_bits) {
1151 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1152 if (i - start > total_bits * 2) {
1153 total_found = 0;
1154 cluster->max_size = 0;
1155 found = false;
1157 goto again;
1160 cluster->window_start = start * block_group->sectorsize +
1161 entry->offset;
1162 cluster->points_to_bitmap = true;
1164 return 0;
1168 * here we try to find a cluster of blocks in a block group. The goal
1169 * is to find at least bytes free and up to empty_size + bytes free.
1170 * We might not find them all in one contiguous area.
1172 * returns zero and sets up cluster if things worked out, otherwise
1173 * it returns -enospc
1175 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root,
1177 struct btrfs_block_group_cache *block_group,
1178 struct btrfs_free_cluster *cluster,
1179 u64 offset, u64 bytes, u64 empty_size)
1181 struct btrfs_free_space *entry = NULL;
1182 struct rb_node *node;
1183 struct btrfs_free_space *next;
1184 struct btrfs_free_space *last = NULL;
1185 u64 min_bytes;
1186 u64 window_start;
1187 u64 window_free;
1188 u64 max_extent = 0;
1189 bool found_bitmap = false;
1190 int ret;
1192 /* for metadata, allow allocates with more holes */
1193 if (btrfs_test_opt(root, SSD_SPREAD)) {
1194 min_bytes = bytes + empty_size;
1195 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1197 * we want to do larger allocations when we are
1198 * flushing out the delayed refs, it helps prevent
1199 * making more work as we go along.
1201 if (trans->transaction->delayed_refs.flushing)
1202 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1203 else
1204 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1205 } else
1206 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1208 spin_lock(&block_group->tree_lock);
1209 spin_lock(&cluster->lock);
1211 /* someone already found a cluster, hooray */
1212 if (cluster->block_group) {
1213 ret = 0;
1214 goto out;
1216 again:
1217 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1218 if (!entry) {
1219 ret = -ENOSPC;
1220 goto out;
1224 * If found_bitmap is true, we exhausted our search for extent entries,
1225 * and we just want to search all of the bitmaps that we can find, and
1226 * ignore any extent entries we find.
1228 while (entry->bitmap || found_bitmap ||
1229 (!entry->bitmap && entry->bytes < min_bytes)) {
1230 struct rb_node *node = rb_next(&entry->offset_index);
1232 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1233 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1234 offset, bytes + empty_size,
1235 min_bytes);
1236 if (!ret)
1237 goto got_it;
1240 if (!node) {
1241 ret = -ENOSPC;
1242 goto out;
1244 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1248 * We already searched all the extent entries from the passed in offset
1249 * to the end and didn't find enough space for the cluster, and we also
1250 * didn't find any bitmaps that met our criteria, just go ahead and exit
1252 if (found_bitmap) {
1253 ret = -ENOSPC;
1254 goto out;
1257 cluster->points_to_bitmap = false;
1258 window_start = entry->offset;
1259 window_free = entry->bytes;
1260 last = entry;
1261 max_extent = entry->bytes;
1263 while (1) {
1264 /* out window is just right, lets fill it */
1265 if (window_free >= bytes + empty_size)
1266 break;
1268 node = rb_next(&last->offset_index);
1269 if (!node) {
1270 if (found_bitmap)
1271 goto again;
1272 ret = -ENOSPC;
1273 goto out;
1275 next = rb_entry(node, struct btrfs_free_space, offset_index);
1278 * we found a bitmap, so if this search doesn't result in a
1279 * cluster, we know to go and search again for the bitmaps and
1280 * start looking for space there
1282 if (next->bitmap) {
1283 if (!found_bitmap)
1284 offset = next->offset;
1285 found_bitmap = true;
1286 last = next;
1287 continue;
1291 * we haven't filled the empty size and the window is
1292 * very large. reset and try again
1294 if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
1295 next->offset - window_start > (bytes + empty_size) * 2) {
1296 entry = next;
1297 window_start = entry->offset;
1298 window_free = entry->bytes;
1299 last = entry;
1300 max_extent = entry->bytes;
1301 } else {
1302 last = next;
1303 window_free += next->bytes;
1304 if (entry->bytes > max_extent)
1305 max_extent = entry->bytes;
1309 cluster->window_start = entry->offset;
1312 * now we've found our entries, pull them out of the free space
1313 * cache and put them into the cluster rbtree
1315 * The cluster includes an rbtree, but only uses the offset index
1316 * of each free space cache entry.
1318 while (1) {
1319 node = rb_next(&entry->offset_index);
1320 if (entry->bitmap && node) {
1321 entry = rb_entry(node, struct btrfs_free_space,
1322 offset_index);
1323 continue;
1324 } else if (entry->bitmap && !node) {
1325 break;
1328 rb_erase(&entry->offset_index, &block_group->free_space_offset);
1329 ret = tree_insert_offset(&cluster->root, entry->offset,
1330 &entry->offset_index, 0);
1331 BUG_ON(ret);
1333 if (!node || entry == last)
1334 break;
1336 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1339 cluster->max_size = max_extent;
1340 got_it:
1341 ret = 0;
1342 atomic_inc(&block_group->count);
1343 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
1344 cluster->block_group = block_group;
1345 out:
1346 spin_unlock(&cluster->lock);
1347 spin_unlock(&block_group->tree_lock);
1349 return ret;
1353 * simple code to zero out a cluster
1355 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1357 spin_lock_init(&cluster->lock);
1358 spin_lock_init(&cluster->refill_lock);
1359 cluster->root = RB_ROOT;
1360 cluster->max_size = 0;
1361 cluster->points_to_bitmap = false;
1362 INIT_LIST_HEAD(&cluster->block_group_list);
1363 cluster->block_group = NULL;