fs/btrfs/extent-tree.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/writeback.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/sort.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include "compat.h"
  27 #include "hash.h"
  28 #include "ctree.h"
  29 #include "disk-io.h"
  30 #include "print-tree.h"
  31 #include "transaction.h"
  32 #include "volumes.h"
  33 #include "locking.h"
  34 #include "free-space-cache.h"
  35
  36 /* control flags for do_chunk_alloc's force field
  37  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  38  * if we really need one.
  39  *
  40  * CHUNK_ALLOC_FORCE means it must try to allocate one
  41  *
  42  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  43  * if we have very few chunks already allocated.  This is
  44  * used as part of the clustering code to help make sure
  45  * we have a good pool of storage to cluster in, without
  46  * filling the FS with empty chunks
  47  *
  48  */
  49 enum {
  50         CHUNK_ALLOC_NO_FORCE = 0,
  51         CHUNK_ALLOC_FORCE = 1,
  52         CHUNK_ALLOC_LIMITED = 2,
  53 };
  54
  55 static int update_block_group(struct btrfs_trans_handle *trans,
  56                               struct btrfs_root *root,
  57                               u64 bytenr, u64 num_bytes, int alloc);
  58 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  59                                 struct btrfs_root *root,
  60                                 u64 bytenr, u64 num_bytes, u64 parent,
  61                                 u64 root_objectid, u64 owner_objectid,
  62                                 u64 owner_offset, int refs_to_drop,
  63                                 struct btrfs_delayed_extent_op *extra_op);
  64 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  65                                     struct extent_buffer *leaf,
  66                                     struct btrfs_extent_item *ei);
  67 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  68                                       struct btrfs_root *root,
  69                                       u64 parent, u64 root_objectid,
  70                                       u64 flags, u64 owner, u64 offset,
  71                                       struct btrfs_key *ins, int ref_mod);
  72 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  73                                      struct btrfs_root *root,
  74                                      u64 parent, u64 root_objectid,
  75                                      u64 flags, struct btrfs_disk_key *key,
  76                                      int level, struct btrfs_key *ins);
  77 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  78                           struct btrfs_root *extent_root, u64 alloc_bytes,
  79                           u64 flags, int force);
  80 static int find_next_key(struct btrfs_path *path, int level,
  81                          struct btrfs_key *key);
  82 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
  83                             int dump_block_groups);
  84
  85 static noinline int
  86 block_group_cache_done(struct btrfs_block_group_cache *cache)
  87 {
  88         smp_mb();
  89         return cache->cached == BTRFS_CACHE_FINISHED;
  90 }
  91
  92 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  93 {
  94         return (cache->flags & bits) == bits;
  95 }
  96
  97 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  98 {
  99         atomic_inc(&cache->count);
 100 }
 101
 102 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 103 {
 104         if (atomic_dec_and_test(&cache->count)) {
 105                 WARN_ON(cache->pinned > 0);
 106                 WARN_ON(cache->reserved > 0);
 107                 WARN_ON(cache->reserved_pinned > 0);
 108                 kfree(cache->free_space_ctl);
 109                 kfree(cache);
 110         }
 111 }
 112
 113 /*
 114  * this adds the block group to the fs_info rb tree for the block group
 115  * cache
 116  */
 117 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 118                                 struct btrfs_block_group_cache *block_group)
 119 {
 120         struct rb_node **p;
 121         struct rb_node *parent = NULL;
 122         struct btrfs_block_group_cache *cache;
 123
 124         spin_lock(&info->block_group_cache_lock);
 125         p = &info->block_group_cache_tree.rb_node;
 126
 127         while (*p) {
 128                 parent = *p;
 129                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 130                                  cache_node);
 131                 if (block_group->key.objectid < cache->key.objectid) {
 132                         p = &(*p)->rb_left;
 133                 } else if (block_group->key.objectid > cache->key.objectid) {
 134                         p = &(*p)->rb_right;
 135                 } else {
 136                         spin_unlock(&info->block_group_cache_lock);
 137                         return -EEXIST;
 138                 }
 139         }
 140
 141         rb_link_node(&block_group->cache_node, parent, p);
 142         rb_insert_color(&block_group->cache_node,
 143                         &info->block_group_cache_tree);
 144         spin_unlock(&info->block_group_cache_lock);
 145
 146         return 0;
 147 }
 148
 149 /*
 150  * This will return the block group at or after bytenr if contains is 0, else
 151  * it will return the block group that contains the bytenr
 152  */
 153 static struct btrfs_block_group_cache *
 154 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 155                               int contains)
 156 {
 157         struct btrfs_block_group_cache *cache, *ret = NULL;
 158         struct rb_node *n;
 159         u64 end, start;
 160
 161         spin_lock(&info->block_group_cache_lock);
 162         n = info->block_group_cache_tree.rb_node;
 163
 164         while (n) {
 165                 cache = rb_entry(n, struct btrfs_block_group_cache,
 166                                  cache_node);
 167                 end = cache->key.objectid + cache->key.offset - 1;
 168                 start = cache->key.objectid;
 169
 170                 if (bytenr < start) {
 171                         if (!contains && (!ret || start < ret->key.objectid))
 172                                 ret = cache;
 173                         n = n->rb_left;
 174                 } else if (bytenr > start) {
 175                         if (contains && bytenr <= end) {
 176                                 ret = cache;
 177                                 break;
 178                         }
 179                         n = n->rb_right;
 180                 } else {
 181                         ret = cache;
 182                         break;
 183                 }
 184         }
 185         if (ret)
 186                 btrfs_get_block_group(ret);
 187         spin_unlock(&info->block_group_cache_lock);
 188
 189         return ret;
 190 }
 191
 192 static int add_excluded_extent(struct btrfs_root *root,
 193                                u64 start, u64 num_bytes)
 194 {
 195         u64 end = start + num_bytes - 1;
 196         set_extent_bits(&root->fs_info->freed_extents[0],
 197                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 198         set_extent_bits(&root->fs_info->freed_extents[1],
 199                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 200         return 0;
 201 }
 202
 203 static void free_excluded_extents(struct btrfs_root *root,
 204                                   struct btrfs_block_group_cache *cache)
 205 {
 206         u64 start, end;
 207
 208         start = cache->key.objectid;
 209         end = start + cache->key.offset - 1;
 210
 211         clear_extent_bits(&root->fs_info->freed_extents[0],
 212                           start, end, EXTENT_UPTODATE, GFP_NOFS);
 213         clear_extent_bits(&root->fs_info->freed_extents[1],
 214                           start, end, EXTENT_UPTODATE, GFP_NOFS);
 215 }
 216
 217 static int exclude_super_stripes(struct btrfs_root *root,
 218                                  struct btrfs_block_group_cache *cache)
 219 {
 220         u64 bytenr;
 221         u64 *logical;
 222         int stripe_len;
 223         int i, nr, ret;
 224
 225         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 226                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 227                 cache->bytes_super += stripe_len;
 228                 ret = add_excluded_extent(root, cache->key.objectid,
 229                                           stripe_len);
 230                 BUG_ON(ret);
 231         }
 232
 233         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 234                 bytenr = btrfs_sb_offset(i);
 235                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 236                                        cache->key.objectid, bytenr,
 237                                        0, &logical, &nr, &stripe_len);
 238                 BUG_ON(ret);
 239
 240                 while (nr--) {
 241                         cache->bytes_super += stripe_len;
 242                         ret = add_excluded_extent(root, logical[nr],
 243                                                   stripe_len);
 244                         BUG_ON(ret);
 245                 }
 246
 247                 kfree(logical);
 248         }
 249         return 0;
 250 }
 251
 252 static struct btrfs_caching_control *
 253 get_caching_control(struct btrfs_block_group_cache *cache)
 254 {
 255         struct btrfs_caching_control *ctl;
 256
 257         spin_lock(&cache->lock);
 258         if (cache->cached != BTRFS_CACHE_STARTED) {
 259                 spin_unlock(&cache->lock);
 260                 return NULL;
 261         }
 262
 263         /* We're loading it the fast way, so we don't have a caching_ctl. */
 264         if (!cache->caching_ctl) {
 265                 spin_unlock(&cache->lock);
 266                 return NULL;
 267         }
 268
 269         ctl = cache->caching_ctl;
 270         atomic_inc(&ctl->count);
 271         spin_unlock(&cache->lock);
 272         return ctl;
 273 }
 274
 275 static void put_caching_control(struct btrfs_caching_control *ctl)
 276 {
 277         if (atomic_dec_and_test(&ctl->count))
 278                 kfree(ctl);
 279 }
 280
 281 /*
 282  * this is only called by cache_block_group, since we could have freed extents
 283  * we need to check the pinned_extents for any extents that can't be used yet
 284  * since their free space will be released as soon as the transaction commits.
 285  */
 286 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 287                               struct btrfs_fs_info *info, u64 start, u64 end)
 288 {
 289         u64 extent_start, extent_end, size, total_added = 0;
 290         int ret;
 291
 292         while (start < end) {
 293                 ret = find_first_extent_bit(info->pinned_extents, start,
 294                                             &extent_start, &extent_end,
 295                                             EXTENT_DIRTY | EXTENT_UPTODATE);
 296                 if (ret)
 297                         break;
 298
 299                 if (extent_start <= start) {
 300                         start = extent_end + 1;
 301                 } else if (extent_start > start && extent_start < end) {
 302                         size = extent_start - start;
 303                         total_added += size;
 304                         ret = btrfs_add_free_space(block_group, start,
 305                                                    size);
 306                         BUG_ON(ret);
 307                         start = extent_end + 1;
 308                 } else {
 309                         break;
 310                 }
 311         }
 312
 313         if (start < end) {
 314                 size = end - start;
 315                 total_added += size;
 316                 ret = btrfs_add_free_space(block_group, start, size);
 317                 BUG_ON(ret);
 318         }
 319
 320         return total_added;
 321 }
 322
 323 static int caching_kthread(void *data)
 324 {
 325         struct btrfs_block_group_cache *block_group = data;
 326         struct btrfs_fs_info *fs_info = block_group->fs_info;
 327         struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
 328         struct btrfs_root *extent_root = fs_info->extent_root;
 329         struct btrfs_path *path;
 330         struct extent_buffer *leaf;
 331         struct btrfs_key key;
 332         u64 total_found = 0;
 333         u64 last = 0;
 334         u32 nritems;
 335         int ret = 0;
 336
 337         path = btrfs_alloc_path();
 338         if (!path)
 339                 return -ENOMEM;
 340
 341         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 342
 343         /*
 344          * We don't want to deadlock with somebody trying to allocate a new
 345          * extent for the extent root while also trying to search the extent
 346          * root to add free space.  So we skip locking and search the commit
 347          * root, since its read-only
 348          */
 349         path->skip_locking = 1;
 350         path->search_commit_root = 1;
 351         path->reada = 1;
 352
 353         key.objectid = last;
 354         key.offset = 0;
 355         key.type = BTRFS_EXTENT_ITEM_KEY;
 356 again:
 357         mutex_lock(&caching_ctl->mutex);
 358         /* need to make sure the commit_root doesn't disappear */
 359         down_read(&fs_info->extent_commit_sem);
 360
 361         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 362         if (ret < 0)
 363                 goto err;
 364
 365         leaf = path->nodes[0];
 366         nritems = btrfs_header_nritems(leaf);
 367
 368         while (1) {
 369                 if (btrfs_fs_closing(fs_info) > 1) {
 370                         last = (u64)-1;
 371                         break;
 372                 }
 373
 374                 if (path->slots[0] < nritems) {
 375                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 376                 } else {
 377                         ret = find_next_key(path, 0, &key);
 378                         if (ret)
 379                                 break;
 380
 381                         if (need_resched() ||
 382                             btrfs_next_leaf(extent_root, path)) {
 383                                 caching_ctl->progress = last;
 384                                 btrfs_release_path(path);
 385                                 up_read(&fs_info->extent_commit_sem);
 386                                 mutex_unlock(&caching_ctl->mutex);
 387                                 cond_resched();
 388                                 goto again;
 389                         }
 390                         leaf = path->nodes[0];
 391                         nritems = btrfs_header_nritems(leaf);
 392                         continue;
 393                 }
 394
 395                 if (key.objectid < block_group->key.objectid) {
 396                         path->slots[0]++;
 397                         continue;
 398                 }
 399
 400                 if (key.objectid >= block_group->key.objectid +
 401                     block_group->key.offset)
 402                         break;
 403
 404                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 405                         total_found += add_new_free_space(block_group,
 406                                                           fs_info, last,
 407                                                           key.objectid);
 408                         last = key.objectid + key.offset;
 409
 410                         if (total_found > (1024 * 1024 * 2)) {
 411                                 total_found = 0;
 412                                 wake_up(&caching_ctl->wait);
 413                         }
 414                 }
 415                 path->slots[0]++;
 416         }
 417         ret = 0;
 418
 419         total_found += add_new_free_space(block_group, fs_info, last,
 420                                           block_group->key.objectid +
 421                                           block_group->key.offset);
 422         caching_ctl->progress = (u64)-1;
 423
 424         spin_lock(&block_group->lock);
 425         block_group->caching_ctl = NULL;
 426         block_group->cached = BTRFS_CACHE_FINISHED;
 427         spin_unlock(&block_group->lock);
 428
 429 err:
 430         btrfs_free_path(path);
 431         up_read(&fs_info->extent_commit_sem);
 432
 433         free_excluded_extents(extent_root, block_group);
 434
 435         mutex_unlock(&caching_ctl->mutex);
 436         wake_up(&caching_ctl->wait);
 437
 438         put_caching_control(caching_ctl);
 439         atomic_dec(&block_group->space_info->caching_threads);
 440         btrfs_put_block_group(block_group);
 441
 442         return 0;
 443 }
 444
 445 static int cache_block_group(struct btrfs_block_group_cache *cache,
 446                              struct btrfs_trans_handle *trans,
 447                              struct btrfs_root *root,
 448                              int load_cache_only)
 449 {
 450         struct btrfs_fs_info *fs_info = cache->fs_info;
 451         struct btrfs_caching_control *caching_ctl;
 452         struct task_struct *tsk;
 453         int ret = 0;
 454
 455         smp_mb();
 456         if (cache->cached != BTRFS_CACHE_NO)
 457                 return 0;
 458
 459         /*
 460          * We can't do the read from on-disk cache during a commit since we need
 461          * to have the normal tree locking.  Also if we are currently trying to
 462          * allocate blocks for the tree root we can't do the fast caching since
 463          * we likely hold important locks.
 464          */
 465         if (trans && (!trans->transaction->in_commit) &&
 466             (root && root != root->fs_info->tree_root)) {
 467                 spin_lock(&cache->lock);
 468                 if (cache->cached != BTRFS_CACHE_NO) {
 469                         spin_unlock(&cache->lock);
 470                         return 0;
 471                 }
 472                 cache->cached = BTRFS_CACHE_STARTED;
 473                 spin_unlock(&cache->lock);
 474
 475                 ret = load_free_space_cache(fs_info, cache);
 476
 477                 spin_lock(&cache->lock);
 478                 if (ret == 1) {
 479                         cache->cached = BTRFS_CACHE_FINISHED;
 480                         cache->last_byte_to_unpin = (u64)-1;
 481                 } else {
 482                         cache->cached = BTRFS_CACHE_NO;
 483                 }
 484                 spin_unlock(&cache->lock);
 485                 if (ret == 1) {
 486                         free_excluded_extents(fs_info->extent_root, cache);
 487                         return 0;
 488                 }
 489         }
 490
 491         if (load_cache_only)
 492                 return 0;
 493
 494         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 495         BUG_ON(!caching_ctl);
 496
 497         INIT_LIST_HEAD(&caching_ctl->list);
 498         mutex_init(&caching_ctl->mutex);
 499         init_waitqueue_head(&caching_ctl->wait);
 500         caching_ctl->block_group = cache;
 501         caching_ctl->progress = cache->key.objectid;
 502         /* one for caching kthread, one for caching block group list */
 503         atomic_set(&caching_ctl->count, 2);
 504
 505         spin_lock(&cache->lock);
 506         if (cache->cached != BTRFS_CACHE_NO) {
 507                 spin_unlock(&cache->lock);
 508                 kfree(caching_ctl);
 509                 return 0;
 510         }
 511         cache->caching_ctl = caching_ctl;
 512         cache->cached = BTRFS_CACHE_STARTED;
 513         spin_unlock(&cache->lock);
 514
 515         down_write(&fs_info->extent_commit_sem);
 516         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 517         up_write(&fs_info->extent_commit_sem);
 518
 519         atomic_inc(&cache->space_info->caching_threads);
 520         btrfs_get_block_group(cache);
 521
 522         tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
 523                           cache->key.objectid);
 524         if (IS_ERR(tsk)) {
 525                 ret = PTR_ERR(tsk);
 526                 printk(KERN_ERR "error running thread %d\n", ret);
 527                 BUG();
 528         }
 529
 530         return ret;
 531 }
 532
 533 /*
 534  * return the block group that starts at or after bytenr
 535  */
 536 static struct btrfs_block_group_cache *
 537 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 538 {
 539         struct btrfs_block_group_cache *cache;
 540
 541         cache = block_group_cache_tree_search(info, bytenr, 0);
 542
 543         return cache;
 544 }
 545
 546 /*
 547  * return the block group that contains the given bytenr
 548  */
 549 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 550                                                  struct btrfs_fs_info *info,
 551                                                  u64 bytenr)
 552 {
 553         struct btrfs_block_group_cache *cache;
 554
 555         cache = block_group_cache_tree_search(info, bytenr, 1);
 556
 557         return cache;
 558 }
 559
 560 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 561                                                   u64 flags)
 562 {
 563         struct list_head *head = &info->space_info;
 564         struct btrfs_space_info *found;
 565
 566         flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
 567                  BTRFS_BLOCK_GROUP_METADATA;
 568
 569         rcu_read_lock();
 570         list_for_each_entry_rcu(found, head, list) {
 571                 if (found->flags & flags) {
 572                         rcu_read_unlock();
 573                         return found;
 574                 }
 575         }
 576         rcu_read_unlock();
 577         return NULL;
 578 }
 579
 580 /*
 581  * after adding space to the filesystem, we need to clear the full flags
 582  * on all the space infos.
 583  */
 584 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 585 {
 586         struct list_head *head = &info->space_info;
 587         struct btrfs_space_info *found;
 588
 589         rcu_read_lock();
 590         list_for_each_entry_rcu(found, head, list)
 591                 found->full = 0;
 592         rcu_read_unlock();
 593 }
 594
 595 static u64 div_factor(u64 num, int factor)
 596 {
 597         if (factor == 10)
 598                 return num;
 599         num *= factor;
 600         do_div(num, 10);
 601         return num;
 602 }
 603
 604 static u64 div_factor_fine(u64 num, int factor)
 605 {
 606         if (factor == 100)
 607                 return num;
 608         num *= factor;
 609         do_div(num, 100);
 610         return num;
 611 }
 612
 613 u64 btrfs_find_block_group(struct btrfs_root *root,
 614                            u64 search_start, u64 search_hint, int owner)
 615 {
 616         struct btrfs_block_group_cache *cache;
 617         u64 used;
 618         u64 last = max(search_hint, search_start);
 619         u64 group_start = 0;
 620         int full_search = 0;
 621         int factor = 9;
 622         int wrapped = 0;
 623 again:
 624         while (1) {
 625                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
 626                 if (!cache)
 627                         break;
 628
 629                 spin_lock(&cache->lock);
 630                 last = cache->key.objectid + cache->key.offset;
 631                 used = btrfs_block_group_used(&cache->item);
 632
 633                 if ((full_search || !cache->ro) &&
 634                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 635                         if (used + cache->pinned + cache->reserved <
 636                             div_factor(cache->key.offset, factor)) {
 637                                 group_start = cache->key.objectid;
 638                                 spin_unlock(&cache->lock);
 639                                 btrfs_put_block_group(cache);
 640                                 goto found;
 641                         }
 642                 }
 643                 spin_unlock(&cache->lock);
 644                 btrfs_put_block_group(cache);
 645                 cond_resched();
 646         }
 647         if (!wrapped) {
 648                 last = search_start;
 649                 wrapped = 1;
 650                 goto again;
 651         }
 652         if (!full_search && factor < 10) {
 653                 last = search_start;
 654                 full_search = 1;
 655                 factor = 10;
 656                 goto again;
 657         }
 658 found:
 659         return group_start;
 660 }
 661
 662 /* simple helper to search for an existing extent at a given offset */
 663 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 664 {
 665         int ret;
 666         struct btrfs_key key;
 667         struct btrfs_path *path;
 668
 669         path = btrfs_alloc_path();
 670         BUG_ON(!path);
 671         key.objectid = start;
 672         key.offset = len;
 673         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 674         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 675                                 0, 0);
 676         btrfs_free_path(path);
 677         return ret;
 678 }
 679
 680 /*
 681  * helper function to lookup reference count and flags of extent.
 682  *
 683  * the head node for delayed ref is used to store the sum of all the
 684  * reference count modifications queued up in the rbtree. the head
 685  * node may also store the extent flags to set. This way you can check
 686  * to see what the reference count and extent flags would be if all of
 687  * the delayed refs are not processed.
 688  */
 689 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 690                              struct btrfs_root *root, u64 bytenr,
 691                              u64 num_bytes, u64 *refs, u64 *flags)
 692 {
 693         struct btrfs_delayed_ref_head *head;
 694         struct btrfs_delayed_ref_root *delayed_refs;
 695         struct btrfs_path *path;
 696         struct btrfs_extent_item *ei;
 697         struct extent_buffer *leaf;
 698         struct btrfs_key key;
 699         u32 item_size;
 700         u64 num_refs;
 701         u64 extent_flags;
 702         int ret;
 703
 704         path = btrfs_alloc_path();
 705         if (!path)
 706                 return -ENOMEM;
 707
 708         key.objectid = bytenr;
 709         key.type = BTRFS_EXTENT_ITEM_KEY;
 710         key.offset = num_bytes;
 711         if (!trans) {
 712                 path->skip_locking = 1;
 713                 path->search_commit_root = 1;
 714         }
 715 again:
 716         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 717                                 &key, path, 0, 0);
 718         if (ret < 0)
 719                 goto out_free;
 720
 721         if (ret == 0) {
 722                 leaf = path->nodes[0];
 723                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 724                 if (item_size >= sizeof(*ei)) {
 725                         ei = btrfs_item_ptr(leaf, path->slots[0],
 726                                             struct btrfs_extent_item);
 727                         num_refs = btrfs_extent_refs(leaf, ei);
 728                         extent_flags = btrfs_extent_flags(leaf, ei);
 729                 } else {
 730 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 731                         struct btrfs_extent_item_v0 *ei0;
 732                         BUG_ON(item_size != sizeof(*ei0));
 733                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 734                                              struct btrfs_extent_item_v0);
 735                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
 736                         /* FIXME: this isn't correct for data */
 737                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 738 #else
 739                         BUG();
 740 #endif
 741                 }
 742                 BUG_ON(num_refs == 0);
 743         } else {
 744                 num_refs = 0;
 745                 extent_flags = 0;
 746                 ret = 0;
 747         }
 748
 749         if (!trans)
 750                 goto out;
 751
 752         delayed_refs = &trans->transaction->delayed_refs;
 753         spin_lock(&delayed_refs->lock);
 754         head = btrfs_find_delayed_ref_head(trans, bytenr);
 755         if (head) {
 756                 if (!mutex_trylock(&head->mutex)) {
 757                         atomic_inc(&head->node.refs);
 758                         spin_unlock(&delayed_refs->lock);
 759
 760                         btrfs_release_path(path);
 761
 762                         /*
 763                          * Mutex was contended, block until it's released and try
 764                          * again
 765                          */
 766                         mutex_lock(&head->mutex);
 767                         mutex_unlock(&head->mutex);
 768                         btrfs_put_delayed_ref(&head->node);
 769                         goto again;
 770                 }
 771                 if (head->extent_op && head->extent_op->update_flags)
 772                         extent_flags |= head->extent_op->flags_to_set;
 773                 else
 774                         BUG_ON(num_refs == 0);
 775
 776                 num_refs += head->node.ref_mod;
 777                 mutex_unlock(&head->mutex);
 778         }
 779         spin_unlock(&delayed_refs->lock);
 780 out:
 781         WARN_ON(num_refs == 0);
 782         if (refs)
 783                 *refs = num_refs;
 784         if (flags)
 785                 *flags = extent_flags;
 786 out_free:
 787         btrfs_free_path(path);
 788         return ret;
 789 }
 790
 791 /*
 792  * Back reference rules.  Back refs have three main goals:
 793  *
 794  * 1) differentiate between all holders of references to an extent so that
 795  *    when a reference is dropped we can make sure it was a valid reference
 796  *    before freeing the extent.
 797  *
 798  * 2) Provide enough information to quickly find the holders of an extent
 799  *    if we notice a given block is corrupted or bad.
 800  *
 801  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 802  *    maintenance.  This is actually the same as #2, but with a slightly
 803  *    different use case.
 804  *
 805  * There are two kinds of back refs. The implicit back refs is optimized
 806  * for pointers in non-shared tree blocks. For a given pointer in a block,
 807  * back refs of this kind provide information about the block's owner tree
 808  * and the pointer's key. These information allow us to find the block by
 809  * b-tree searching. The full back refs is for pointers in tree blocks not
 810  * referenced by their owner trees. The location of tree block is recorded
 811  * in the back refs. Actually the full back refs is generic, and can be
 812  * used in all cases the implicit back refs is used. The major shortcoming
 813  * of the full back refs is its overhead. Every time a tree block gets
 814  * COWed, we have to update back refs entry for all pointers in it.
 815  *
 816  * For a newly allocated tree block, we use implicit back refs for
 817  * pointers in it. This means most tree related operations only involve
 818  * implicit back refs. For a tree block created in old transaction, the
 819  * only way to drop a reference to it is COW it. So we can detect the
 820  * event that tree block loses its owner tree's reference and do the
 821  * back refs conversion.
 822  *
 823  * When a tree block is COW'd through a tree, there are four cases:
 824  *
 825  * The reference count of the block is one and the tree is the block's
 826  * owner tree. Nothing to do in this case.
 827  *
 828  * The reference count of the block is one and the tree is not the
 829  * block's owner tree. In this case, full back refs is used for pointers
 830  * in the block. Remove these full back refs, add implicit back refs for
 831  * every pointers in the new block.
 832  *
 833  * The reference count of the block is greater than one and the tree is
 834  * the block's owner tree. In this case, implicit back refs is used for
 835  * pointers in the block. Add full back refs for every pointers in the
 836  * block, increase lower level extents' reference counts. The original
 837  * implicit back refs are entailed to the new block.
 838  *
 839  * The reference count of the block is greater than one and the tree is
 840  * not the block's owner tree. Add implicit back refs for every pointer in
 841  * the new block, increase lower level extents' reference count.
 842  *
 843  * Back Reference Key composing:
 844  *
 845  * The key objectid corresponds to the first byte in the extent,
 846  * The key type is used to differentiate between types of back refs.
 847  * There are different meanings of the key offset for different types
 848  * of back refs.
 849  *
 850  * File extents can be referenced by:
 851  *
 852  * - multiple snapshots, subvolumes, or different generations in one subvol
 853  * - different files inside a single subvolume
 854  * - different offsets inside a file (bookend extents in file.c)
 855  *
 856  * The extent ref structure for the implicit back refs has fields for:
 857  *
 858  * - Objectid of the subvolume root
 859  * - objectid of the file holding the reference
 860  * - original offset in the file
 861  * - how many bookend extents
 862  *
 863  * The key offset for the implicit back refs is hash of the first
 864  * three fields.
 865  *
 866  * The extent ref structure for the full back refs has field for:
 867  *
 868  * - number of pointers in the tree leaf
 869  *
 870  * The key offset for the implicit back refs is the first byte of
 871  * the tree leaf
 872  *
 873  * When a file extent is allocated, The implicit back refs is used.
 874  * the fields are filled in:
 875  *
 876  *     (root_key.objectid, inode objectid, offset in file, 1)
 877  *
 878  * When a file extent is removed file truncation, we find the
 879  * corresponding implicit back refs and check the following fields:
 880  *
 881  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 882  *
 883  * Btree extents can be referenced by:
 884  *
 885  * - Different subvolumes
 886  *
 887  * Both the implicit back refs and the full back refs for tree blocks
 888  * only consist of key. The key offset for the implicit back refs is
 889  * objectid of block's owner tree. The key offset for the full back refs
 890  * is the first byte of parent block.
 891  *
 892  * When implicit back refs is used, information about the lowest key and
 893  * level of the tree block are required. These information are stored in
 894  * tree block info structure.
 895  */
 896
 897 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 898 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 899                                   struct btrfs_root *root,
 900                                   struct btrfs_path *path,
 901                                   u64 owner, u32 extra_size)
 902 {
 903         struct btrfs_extent_item *item;
 904         struct btrfs_extent_item_v0 *ei0;
 905         struct btrfs_extent_ref_v0 *ref0;
 906         struct btrfs_tree_block_info *bi;
 907         struct extent_buffer *leaf;
 908         struct btrfs_key key;
 909         struct btrfs_key found_key;
 910         u32 new_size = sizeof(*item);
 911         u64 refs;
 912         int ret;
 913
 914         leaf = path->nodes[0];
 915         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
 916
 917         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 918         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 919                              struct btrfs_extent_item_v0);
 920         refs = btrfs_extent_refs_v0(leaf, ei0);
 921
 922         if (owner == (u64)-1) {
 923                 while (1) {
 924                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 925                                 ret = btrfs_next_leaf(root, path);
 926                                 if (ret < 0)
 927                                         return ret;
 928                                 BUG_ON(ret > 0);
 929                                 leaf = path->nodes[0];
 930                         }
 931                         btrfs_item_key_to_cpu(leaf, &found_key,
 932                                               path->slots[0]);
 933                         BUG_ON(key.objectid != found_key.objectid);
 934                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
 935                                 path->slots[0]++;
 936                                 continue;
 937                         }
 938                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
 939                                               struct btrfs_extent_ref_v0);
 940                         owner = btrfs_ref_objectid_v0(leaf, ref0);
 941                         break;
 942                 }
 943         }
 944         btrfs_release_path(path);
 945
 946         if (owner < BTRFS_FIRST_FREE_OBJECTID)
 947                 new_size += sizeof(*bi);
 948
 949         new_size -= sizeof(*ei0);
 950         ret = btrfs_search_slot(trans, root, &key, path,
 951                                 new_size + extra_size, 1);
 952         if (ret < 0)
 953                 return ret;
 954         BUG_ON(ret);
 955
 956         ret = btrfs_extend_item(trans, root, path, new_size);
 957
 958         leaf = path->nodes[0];
 959         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 960         btrfs_set_extent_refs(leaf, item, refs);
 961         /* FIXME: get real generation */
 962         btrfs_set_extent_generation(leaf, item, 0);
 963         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 964                 btrfs_set_extent_flags(leaf, item,
 965                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
 966                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
 967                 bi = (struct btrfs_tree_block_info *)(item + 1);
 968                 /* FIXME: get first key of the block */
 969                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
 970                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
 971         } else {
 972                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
 973         }
 974         btrfs_mark_buffer_dirty(leaf);
 975         return 0;
 976 }
 977 #endif
 978
 979 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 980 {
 981         u32 high_crc = ~(u32)0;
 982         u32 low_crc = ~(u32)0;
 983         __le64 lenum;
 984
 985         lenum = cpu_to_le64(root_objectid);
 986         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 987         lenum = cpu_to_le64(owner);
 988         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 989         lenum = cpu_to_le64(offset);
 990         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 991
 992         return ((u64)high_crc << 31) ^ (u64)low_crc;
 993 }
 994
 995 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
 996                                      struct btrfs_extent_data_ref *ref)
 997 {
 998         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
 999                                     btrfs_extent_data_ref_objectid(leaf, ref),
1000                                     btrfs_extent_data_ref_offset(leaf, ref));
1001 }
1002
1003 static int match_extent_data_ref(struct extent_buffer *leaf,
1004                                  struct btrfs_extent_data_ref *ref,
1005                                  u64 root_objectid, u64 owner, u64 offset)
1006 {
1007         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1008             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1009             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1010                 return 0;
1011         return 1;
1012 }
1013
1014 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1015                                            struct btrfs_root *root,
1016                                            struct btrfs_path *path,
1017                                            u64 bytenr, u64 parent,
1018                                            u64 root_objectid,
1019                                            u64 owner, u64 offset)
1020 {
1021         struct btrfs_key key;
1022         struct btrfs_extent_data_ref *ref;
1023         struct extent_buffer *leaf;
1024         u32 nritems;
1025         int ret;
1026         int recow;
1027         int err = -ENOENT;
1028
1029         key.objectid = bytenr;
1030         if (parent) {
1031                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1032                 key.offset = parent;
1033         } else {
1034                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1035                 key.offset = hash_extent_data_ref(root_objectid,
1036                                                   owner, offset);
1037         }
1038 again:
1039         recow = 0;
1040         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1041         if (ret < 0) {
1042                 err = ret;
1043                 goto fail;
1044         }
1045
1046         if (parent) {
1047                 if (!ret)
1048                         return 0;
1049 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1050                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1051                 btrfs_release_path(path);
1052                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1053                 if (ret < 0) {
1054                         err = ret;
1055                         goto fail;
1056                 }
1057                 if (!ret)
1058                         return 0;
1059 #endif
1060                 goto fail;
1061         }
1062
1063         leaf = path->nodes[0];
1064         nritems = btrfs_header_nritems(leaf);
1065         while (1) {
1066                 if (path->slots[0] >= nritems) {
1067                         ret = btrfs_next_leaf(root, path);
1068                         if (ret < 0)
1069                                 err = ret;
1070                         if (ret)
1071                                 goto fail;
1072
1073                         leaf = path->nodes[0];
1074                         nritems = btrfs_header_nritems(leaf);
1075                         recow = 1;
1076                 }
1077
1078                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1079                 if (key.objectid != bytenr ||
1080                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1081                         goto fail;
1082
1083                 ref = btrfs_item_ptr(leaf, path->slots[0],
1084                                      struct btrfs_extent_data_ref);
1085
1086                 if (match_extent_data_ref(leaf, ref, root_objectid,
1087                                           owner, offset)) {
1088                         if (recow) {
1089                                 btrfs_release_path(path);
1090                                 goto again;
1091                         }
1092                         err = 0;
1093                         break;
1094                 }
1095                 path->slots[0]++;
1096         }
1097 fail:
1098         return err;
1099 }
1100
1101 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1102                                            struct btrfs_root *root,
1103                                            struct btrfs_path *path,
1104                                            u64 bytenr, u64 parent,
1105                                            u64 root_objectid, u64 owner,
1106                                            u64 offset, int refs_to_add)
1107 {
1108         struct btrfs_key key;
1109         struct extent_buffer *leaf;
1110         u32 size;
1111         u32 num_refs;
1112         int ret;
1113
1114         key.objectid = bytenr;
1115         if (parent) {
1116                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1117                 key.offset = parent;
1118                 size = sizeof(struct btrfs_shared_data_ref);
1119         } else {
1120                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1121                 key.offset = hash_extent_data_ref(root_objectid,
1122                                                   owner, offset);
1123                 size = sizeof(struct btrfs_extent_data_ref);
1124         }
1125
1126         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1127         if (ret && ret != -EEXIST)
1128                 goto fail;
1129
1130         leaf = path->nodes[0];
1131         if (parent) {
1132                 struct btrfs_shared_data_ref *ref;
1133                 ref = btrfs_item_ptr(leaf, path->slots[0],
1134                                      struct btrfs_shared_data_ref);
1135                 if (ret == 0) {
1136                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1137                 } else {
1138                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1139                         num_refs += refs_to_add;
1140                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1141                 }
1142         } else {
1143                 struct btrfs_extent_data_ref *ref;
1144                 while (ret == -EEXIST) {
1145                         ref = btrfs_item_ptr(leaf, path->slots[0],
1146                                              struct btrfs_extent_data_ref);
1147                         if (match_extent_data_ref(leaf, ref, root_objectid,
1148                                                   owner, offset))
1149                                 break;
1150                         btrfs_release_path(path);
1151                         key.offset++;
1152                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1153                                                       size);
1154                         if (ret && ret != -EEXIST)
1155                                 goto fail;
1156
1157                         leaf = path->nodes[0];
1158                 }
1159                 ref = btrfs_item_ptr(leaf, path->slots[0],
1160                                      struct btrfs_extent_data_ref);
1161                 if (ret == 0) {
1162                         btrfs_set_extent_data_ref_root(leaf, ref,
1163                                                        root_objectid);
1164                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1165                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1166                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1167                 } else {
1168                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1169                         num_refs += refs_to_add;
1170                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1171                 }
1172         }
1173         btrfs_mark_buffer_dirty(leaf);
1174         ret = 0;
1175 fail:
1176         btrfs_release_path(path);
1177         return ret;
1178 }
1179
1180 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1181                                            struct btrfs_root *root,
1182                                            struct btrfs_path *path,
1183                                            int refs_to_drop)
1184 {
1185         struct btrfs_key key;
1186         struct btrfs_extent_data_ref *ref1 = NULL;
1187         struct btrfs_shared_data_ref *ref2 = NULL;
1188         struct extent_buffer *leaf;
1189         u32 num_refs = 0;
1190         int ret = 0;
1191
1192         leaf = path->nodes[0];
1193         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1194
1195         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1196                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1197                                       struct btrfs_extent_data_ref);
1198                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1199         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1200                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1201                                       struct btrfs_shared_data_ref);
1202                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1203 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1204         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1205                 struct btrfs_extent_ref_v0 *ref0;
1206                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1207                                       struct btrfs_extent_ref_v0);
1208                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1209 #endif
1210         } else {
1211                 BUG();
1212         }
1213
1214         BUG_ON(num_refs < refs_to_drop);
1215         num_refs -= refs_to_drop;
1216
1217         if (num_refs == 0) {
1218                 ret = btrfs_del_item(trans, root, path);
1219         } else {
1220                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1221                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1222                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1223                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1224 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1225                 else {
1226                         struct btrfs_extent_ref_v0 *ref0;
1227                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1228                                         struct btrfs_extent_ref_v0);
1229                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1230                 }
1231 #endif
1232                 btrfs_mark_buffer_dirty(leaf);
1233         }
1234         return ret;
1235 }
1236
1237 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1238                                           struct btrfs_path *path,
1239                                           struct btrfs_extent_inline_ref *iref)
1240 {
1241         struct btrfs_key key;
1242         struct extent_buffer *leaf;
1243         struct btrfs_extent_data_ref *ref1;
1244         struct btrfs_shared_data_ref *ref2;
1245         u32 num_refs = 0;
1246
1247         leaf = path->nodes[0];
1248         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1249         if (iref) {
1250                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1251                     BTRFS_EXTENT_DATA_REF_KEY) {
1252                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1253                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1254                 } else {
1255                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1256                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1257                 }
1258         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1259                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1260                                       struct btrfs_extent_data_ref);
1261                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1262         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1263                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1264                                       struct btrfs_shared_data_ref);
1265                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1266 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1267         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1268                 struct btrfs_extent_ref_v0 *ref0;
1269                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1270                                       struct btrfs_extent_ref_v0);
1271                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1272 #endif
1273         } else {
1274                 WARN_ON(1);
1275         }
1276         return num_refs;
1277 }
1278
1279 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1280                                           struct btrfs_root *root,
1281                                           struct btrfs_path *path,
1282                                           u64 bytenr, u64 parent,
1283                                           u64 root_objectid)
1284 {
1285         struct btrfs_key key;
1286         int ret;
1287
1288         key.objectid = bytenr;
1289         if (parent) {
1290                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1291                 key.offset = parent;
1292         } else {
1293                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1294                 key.offset = root_objectid;
1295         }
1296
1297         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1298         if (ret > 0)
1299                 ret = -ENOENT;
1300 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1301         if (ret == -ENOENT && parent) {
1302                 btrfs_release_path(path);
1303                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1304                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1305                 if (ret > 0)
1306                         ret = -ENOENT;
1307         }
1308 #endif
1309         return ret;
1310 }
1311
1312 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1313                                           struct btrfs_root *root,
1314                                           struct btrfs_path *path,
1315                                           u64 bytenr, u64 parent,
1316                                           u64 root_objectid)
1317 {
1318         struct btrfs_key key;
1319         int ret;
1320
1321         key.objectid = bytenr;
1322         if (parent) {
1323                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1324                 key.offset = parent;
1325         } else {
1326                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1327                 key.offset = root_objectid;
1328         }
1329
1330         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1331         btrfs_release_path(path);
1332         return ret;
1333 }
1334
1335 static inline int extent_ref_type(u64 parent, u64 owner)
1336 {
1337         int type;
1338         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1339                 if (parent > 0)
1340                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1341                 else
1342                         type = BTRFS_TREE_BLOCK_REF_KEY;
1343         } else {
1344                 if (parent > 0)
1345                         type = BTRFS_SHARED_DATA_REF_KEY;
1346                 else
1347                         type = BTRFS_EXTENT_DATA_REF_KEY;
1348         }
1349         return type;
1350 }
1351
1352 static int find_next_key(struct btrfs_path *path, int level,
1353                          struct btrfs_key *key)
1354
1355 {
1356         for (; level < BTRFS_MAX_LEVEL; level++) {
1357                 if (!path->nodes[level])
1358                         break;
1359                 if (path->slots[level] + 1 >=
1360                     btrfs_header_nritems(path->nodes[level]))
1361                         continue;
1362                 if (level == 0)
1363                         btrfs_item_key_to_cpu(path->nodes[level], key,
1364                                               path->slots[level] + 1);
1365                 else
1366                         btrfs_node_key_to_cpu(path->nodes[level], key,
1367                                               path->slots[level] + 1);
1368                 return 0;
1369         }
1370         return 1;
1371 }
1372
1373 /*
1374  * look for inline back ref. if back ref is found, *ref_ret is set
1375  * to the address of inline back ref, and 0 is returned.
1376  *
1377  * if back ref isn't found, *ref_ret is set to the address where it
1378  * should be inserted, and -ENOENT is returned.
1379  *
1380  * if insert is true and there are too many inline back refs, the path
1381  * points to the extent item, and -EAGAIN is returned.
1382  *
1383  * NOTE: inline back refs are ordered in the same way that back ref
1384  *       items in the tree are ordered.
1385  */
1386 static noinline_for_stack
1387 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1388                                  struct btrfs_root *root,
1389                                  struct btrfs_path *path,
1390                                  struct btrfs_extent_inline_ref **ref_ret,
1391                                  u64 bytenr, u64 num_bytes,
1392                                  u64 parent, u64 root_objectid,
1393                                  u64 owner, u64 offset, int insert)
1394 {
1395         struct btrfs_key key;
1396         struct extent_buffer *leaf;
1397         struct btrfs_extent_item *ei;
1398         struct btrfs_extent_inline_ref *iref;
1399         u64 flags;
1400         u64 item_size;
1401         unsigned long ptr;
1402         unsigned long end;
1403         int extra_size;
1404         int type;
1405         int want;
1406         int ret;
1407         int err = 0;
1408
1409         key.objectid = bytenr;
1410         key.type = BTRFS_EXTENT_ITEM_KEY;
1411         key.offset = num_bytes;
1412
1413         want = extent_ref_type(parent, owner);
1414         if (insert) {
1415                 extra_size = btrfs_extent_inline_ref_size(want);
1416                 path->keep_locks = 1;
1417         } else
1418                 extra_size = -1;
1419         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1420         if (ret < 0) {
1421                 err = ret;
1422                 goto out;
1423         }
1424         BUG_ON(ret);
1425
1426         leaf = path->nodes[0];
1427         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1428 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1429         if (item_size < sizeof(*ei)) {
1430                 if (!insert) {
1431                         err = -ENOENT;
1432                         goto out;
1433                 }
1434                 ret = convert_extent_item_v0(trans, root, path, owner,
1435                                              extra_size);
1436                 if (ret < 0) {
1437                         err = ret;
1438                         goto out;
1439                 }
1440                 leaf = path->nodes[0];
1441                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1442         }
1443 #endif
1444         BUG_ON(item_size < sizeof(*ei));
1445
1446         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1447         flags = btrfs_extent_flags(leaf, ei);
1448
1449         ptr = (unsigned long)(ei + 1);
1450         end = (unsigned long)ei + item_size;
1451
1452         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1453                 ptr += sizeof(struct btrfs_tree_block_info);
1454                 BUG_ON(ptr > end);
1455         } else {
1456                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1457         }
1458
1459         err = -ENOENT;
1460         while (1) {
1461                 if (ptr >= end) {
1462                         WARN_ON(ptr > end);
1463                         break;
1464                 }
1465                 iref = (struct btrfs_extent_inline_ref *)ptr;
1466                 type = btrfs_extent_inline_ref_type(leaf, iref);
1467                 if (want < type)
1468                         break;
1469                 if (want > type) {
1470                         ptr += btrfs_extent_inline_ref_size(type);
1471                         continue;
1472                 }
1473
1474                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1475                         struct btrfs_extent_data_ref *dref;
1476                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1477                         if (match_extent_data_ref(leaf, dref, root_objectid,
1478                                                   owner, offset)) {
1479                                 err = 0;
1480                                 break;
1481                         }
1482                         if (hash_extent_data_ref_item(leaf, dref) <
1483                             hash_extent_data_ref(root_objectid, owner, offset))
1484                                 break;
1485                 } else {
1486                         u64 ref_offset;
1487                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1488                         if (parent > 0) {
1489                                 if (parent == ref_offset) {
1490                                         err = 0;
1491                                         break;
1492                                 }
1493                                 if (ref_offset < parent)
1494                                         break;
1495                         } else {
1496                                 if (root_objectid == ref_offset) {
1497                                         err = 0;
1498                                         break;
1499                                 }
1500                                 if (ref_offset < root_objectid)
1501                                         break;
1502                         }
1503                 }
1504                 ptr += btrfs_extent_inline_ref_size(type);
1505         }
1506         if (err == -ENOENT && insert) {
1507                 if (item_size + extra_size >=
1508                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1509                         err = -EAGAIN;
1510                         goto out;
1511                 }
1512                 /*
1513                  * To add new inline back ref, we have to make sure
1514                  * there is no corresponding back ref item.
1515                  * For simplicity, we just do not add new inline back
1516                  * ref if there is any kind of item for this block
1517                  */
1518                 if (find_next_key(path, 0, &key) == 0 &&
1519                     key.objectid == bytenr &&
1520                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1521                         err = -EAGAIN;
1522                         goto out;
1523                 }
1524         }
1525         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1526 out:
1527         if (insert) {
1528                 path->keep_locks = 0;
1529                 btrfs_unlock_up_safe(path, 1);
1530         }
1531         return err;
1532 }
1533
1534 /*
1535  * helper to add new inline back ref
1536  */
1537 static noinline_for_stack
1538 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1539                                 struct btrfs_root *root,
1540                                 struct btrfs_path *path,
1541                                 struct btrfs_extent_inline_ref *iref,
1542                                 u64 parent, u64 root_objectid,
1543                                 u64 owner, u64 offset, int refs_to_add,
1544                                 struct btrfs_delayed_extent_op *extent_op)
1545 {
1546         struct extent_buffer *leaf;
1547         struct btrfs_extent_item *ei;
1548         unsigned long ptr;
1549         unsigned long end;
1550         unsigned long item_offset;
1551         u64 refs;
1552         int size;
1553         int type;
1554         int ret;
1555
1556         leaf = path->nodes[0];
1557         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1558         item_offset = (unsigned long)iref - (unsigned long)ei;
1559
1560         type = extent_ref_type(parent, owner);
1561         size = btrfs_extent_inline_ref_size(type);
1562
1563         ret = btrfs_extend_item(trans, root, path, size);
1564
1565         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1566         refs = btrfs_extent_refs(leaf, ei);
1567         refs += refs_to_add;
1568         btrfs_set_extent_refs(leaf, ei, refs);
1569         if (extent_op)
1570                 __run_delayed_extent_op(extent_op, leaf, ei);
1571
1572         ptr = (unsigned long)ei + item_offset;
1573         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1574         if (ptr < end - size)
1575                 memmove_extent_buffer(leaf, ptr + size, ptr,
1576                                       end - size - ptr);
1577
1578         iref = (struct btrfs_extent_inline_ref *)ptr;
1579         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1580         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1581                 struct btrfs_extent_data_ref *dref;
1582                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1583                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1584                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1585                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1586                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1587         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1588                 struct btrfs_shared_data_ref *sref;
1589                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1590                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1591                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1592         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1593                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1594         } else {
1595                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1596         }
1597         btrfs_mark_buffer_dirty(leaf);
1598         return 0;
1599 }
1600
1601 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1602                                  struct btrfs_root *root,
1603                                  struct btrfs_path *path,
1604                                  struct btrfs_extent_inline_ref **ref_ret,
1605                                  u64 bytenr, u64 num_bytes, u64 parent,
1606                                  u64 root_objectid, u64 owner, u64 offset)
1607 {
1608         int ret;
1609
1610         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1611                                            bytenr, num_bytes, parent,
1612                                            root_objectid, owner, offset, 0);
1613         if (ret != -ENOENT)
1614                 return ret;
1615
1616         btrfs_release_path(path);
1617         *ref_ret = NULL;
1618
1619         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1620                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1621                                             root_objectid);
1622         } else {
1623                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1624                                              root_objectid, owner, offset);
1625         }
1626         return ret;
1627 }
1628
1629 /*
1630  * helper to update/remove inline back ref
1631  */
1632 static noinline_for_stack
1633 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1634                                  struct btrfs_root *root,
1635                                  struct btrfs_path *path,
1636                                  struct btrfs_extent_inline_ref *iref,
1637                                  int refs_to_mod,
1638                                  struct btrfs_delayed_extent_op *extent_op)
1639 {
1640         struct extent_buffer *leaf;
1641         struct btrfs_extent_item *ei;
1642         struct btrfs_extent_data_ref *dref = NULL;
1643         struct btrfs_shared_data_ref *sref = NULL;
1644         unsigned long ptr;
1645         unsigned long end;
1646         u32 item_size;
1647         int size;
1648         int type;
1649         int ret;
1650         u64 refs;
1651
1652         leaf = path->nodes[0];
1653         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1654         refs = btrfs_extent_refs(leaf, ei);
1655         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1656         refs += refs_to_mod;
1657         btrfs_set_extent_refs(leaf, ei, refs);
1658         if (extent_op)
1659                 __run_delayed_extent_op(extent_op, leaf, ei);
1660
1661         type = btrfs_extent_inline_ref_type(leaf, iref);
1662
1663         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1664                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1665                 refs = btrfs_extent_data_ref_count(leaf, dref);
1666         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1667                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1668                 refs = btrfs_shared_data_ref_count(leaf, sref);
1669         } else {
1670                 refs = 1;
1671                 BUG_ON(refs_to_mod != -1);
1672         }
1673
1674         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1675         refs += refs_to_mod;
1676
1677         if (refs > 0) {
1678                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1679                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1680                 else
1681                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1682         } else {
1683                 size =  btrfs_extent_inline_ref_size(type);
1684                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1685                 ptr = (unsigned long)iref;
1686                 end = (unsigned long)ei + item_size;
1687                 if (ptr + size < end)
1688                         memmove_extent_buffer(leaf, ptr, ptr + size,
1689                                               end - ptr - size);
1690                 item_size -= size;
1691                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1692         }
1693         btrfs_mark_buffer_dirty(leaf);
1694         return 0;
1695 }
1696
1697 static noinline_for_stack
1698 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1699                                  struct btrfs_root *root,
1700                                  struct btrfs_path *path,
1701                                  u64 bytenr, u64 num_bytes, u64 parent,
1702                                  u64 root_objectid, u64 owner,
1703                                  u64 offset, int refs_to_add,
1704                                  struct btrfs_delayed_extent_op *extent_op)
1705 {
1706         struct btrfs_extent_inline_ref *iref;
1707         int ret;
1708
1709         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1710                                            bytenr, num_bytes, parent,
1711                                            root_objectid, owner, offset, 1);
1712         if (ret == 0) {
1713                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1714                 ret = update_inline_extent_backref(trans, root, path, iref,
1715                                                    refs_to_add, extent_op);
1716         } else if (ret == -ENOENT) {
1717                 ret = setup_inline_extent_backref(trans, root, path, iref,
1718                                                   parent, root_objectid,
1719                                                   owner, offset, refs_to_add,
1720                                                   extent_op);
1721         }
1722         return ret;
1723 }
1724
1725 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1726                                  struct btrfs_root *root,
1727                                  struct btrfs_path *path,
1728                                  u64 bytenr, u64 parent, u64 root_objectid,
1729                                  u64 owner, u64 offset, int refs_to_add)
1730 {
1731         int ret;
1732         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1733                 BUG_ON(refs_to_add != 1);
1734                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1735                                             parent, root_objectid);
1736         } else {
1737                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1738                                              parent, root_objectid,
1739                                              owner, offset, refs_to_add);
1740         }
1741         return ret;
1742 }
1743
1744 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1745                                  struct btrfs_root *root,
1746                                  struct btrfs_path *path,
1747                                  struct btrfs_extent_inline_ref *iref,
1748                                  int refs_to_drop, int is_data)
1749 {
1750         int ret;
1751
1752         BUG_ON(!is_data && refs_to_drop != 1);
1753         if (iref) {
1754                 ret = update_inline_extent_backref(trans, root, path, iref,
1755                                                    -refs_to_drop, NULL);
1756         } else if (is_data) {
1757                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1758         } else {
1759                 ret = btrfs_del_item(trans, root, path);
1760         }
1761         return ret;
1762 }
1763
1764 static int btrfs_issue_discard(struct block_device *bdev,
1765                                 u64 start, u64 len)
1766 {
1767         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1768 }
1769
1770 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1771                                 u64 num_bytes, u64 *actual_bytes)
1772 {
1773         int ret;
1774         u64 discarded_bytes = 0;
1775         struct btrfs_multi_bio *multi = NULL;
1776
1777
1778         /* Tell the block device(s) that the sectors can be discarded */
1779         ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1780                               bytenr, &num_bytes, &multi, 0);
1781         if (!ret) {
1782                 struct btrfs_bio_stripe *stripe = multi->stripes;
1783                 int i;
1784
1785
1786                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1787                         ret = btrfs_issue_discard(stripe->dev->bdev,
1788                                                   stripe->physical,
1789                                                   stripe->length);
1790                         if (!ret)
1791                                 discarded_bytes += stripe->length;
1792                         else if (ret != -EOPNOTSUPP)
1793                                 break;
1794                 }
1795                 kfree(multi);
1796         }
1797         if (discarded_bytes && ret == -EOPNOTSUPP)
1798                 ret = 0;
1799
1800         if (actual_bytes)
1801                 *actual_bytes = discarded_bytes;
1802
1803
1804         return ret;
1805 }
1806
1807 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1808                          struct btrfs_root *root,
1809                          u64 bytenr, u64 num_bytes, u64 parent,
1810                          u64 root_objectid, u64 owner, u64 offset)
1811 {
1812         int ret;
1813         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1814                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1815
1816         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1817                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1818                                         parent, root_objectid, (int)owner,
1819                                         BTRFS_ADD_DELAYED_REF, NULL);
1820         } else {
1821                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1822                                         parent, root_objectid, owner, offset,
1823                                         BTRFS_ADD_DELAYED_REF, NULL);
1824         }
1825         return ret;
1826 }
1827
1828 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1829                                   struct btrfs_root *root,
1830                                   u64 bytenr, u64 num_bytes,
1831                                   u64 parent, u64 root_objectid,
1832                                   u64 owner, u64 offset, int refs_to_add,
1833                                   struct btrfs_delayed_extent_op *extent_op)
1834 {
1835         struct btrfs_path *path;
1836         struct extent_buffer *leaf;
1837         struct btrfs_extent_item *item;
1838         u64 refs;
1839         int ret;
1840         int err = 0;
1841
1842         path = btrfs_alloc_path();
1843         if (!path)
1844                 return -ENOMEM;
1845
1846         path->reada = 1;
1847         path->leave_spinning = 1;
1848         /* this will setup the path even if it fails to insert the back ref */
1849         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1850                                            path, bytenr, num_bytes, parent,
1851                                            root_objectid, owner, offset,
1852                                            refs_to_add, extent_op);
1853         if (ret == 0)
1854                 goto out;
1855
1856         if (ret != -EAGAIN) {
1857                 err = ret;
1858                 goto out;
1859         }
1860
1861         leaf = path->nodes[0];
1862         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1863         refs = btrfs_extent_refs(leaf, item);
1864         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1865         if (extent_op)
1866                 __run_delayed_extent_op(extent_op, leaf, item);
1867
1868         btrfs_mark_buffer_dirty(leaf);
1869         btrfs_release_path(path);
1870
1871         path->reada = 1;
1872         path->leave_spinning = 1;
1873
1874         /* now insert the actual backref */
1875         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1876                                     path, bytenr, parent, root_objectid,
1877                                     owner, offset, refs_to_add);
1878         BUG_ON(ret);
1879 out:
1880         btrfs_free_path(path);
1881         return err;
1882 }
1883
1884 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1885                                 struct btrfs_root *root,
1886                                 struct btrfs_delayed_ref_node *node,
1887                                 struct btrfs_delayed_extent_op *extent_op,
1888                                 int insert_reserved)
1889 {
1890         int ret = 0;
1891         struct btrfs_delayed_data_ref *ref;
1892         struct btrfs_key ins;
1893         u64 parent = 0;
1894         u64 ref_root = 0;
1895         u64 flags = 0;
1896
1897         ins.objectid = node->bytenr;
1898         ins.offset = node->num_bytes;
1899         ins.type = BTRFS_EXTENT_ITEM_KEY;
1900
1901         ref = btrfs_delayed_node_to_data_ref(node);
1902         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1903                 parent = ref->parent;
1904         else
1905                 ref_root = ref->root;
1906
1907         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1908                 if (extent_op) {
1909                         BUG_ON(extent_op->update_key);
1910                         flags |= extent_op->flags_to_set;
1911                 }
1912                 ret = alloc_reserved_file_extent(trans, root,
1913                                                  parent, ref_root, flags,
1914                                                  ref->objectid, ref->offset,
1915                                                  &ins, node->ref_mod);
1916         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1917                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1918                                              node->num_bytes, parent,
1919                                              ref_root, ref->objectid,
1920                                              ref->offset, node->ref_mod,
1921                                              extent_op);
1922         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1923                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1924                                           node->num_bytes, parent,
1925                                           ref_root, ref->objectid,
1926                                           ref->offset, node->ref_mod,
1927                                           extent_op);
1928         } else {
1929                 BUG();
1930         }
1931         return ret;
1932 }
1933
1934 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1935                                     struct extent_buffer *leaf,
1936                                     struct btrfs_extent_item *ei)
1937 {
1938         u64 flags = btrfs_extent_flags(leaf, ei);
1939         if (extent_op->update_flags) {
1940                 flags |= extent_op->flags_to_set;
1941                 btrfs_set_extent_flags(leaf, ei, flags);
1942         }
1943
1944         if (extent_op->update_key) {
1945                 struct btrfs_tree_block_info *bi;
1946                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1947                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1948                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1949         }
1950 }
1951
1952 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1953                                  struct btrfs_root *root,
1954                                  struct btrfs_delayed_ref_node *node,
1955                                  struct btrfs_delayed_extent_op *extent_op)
1956 {
1957         struct btrfs_key key;
1958         struct btrfs_path *path;
1959         struct btrfs_extent_item *ei;
1960         struct extent_buffer *leaf;
1961         u32 item_size;
1962         int ret;
1963         int err = 0;
1964
1965         path = btrfs_alloc_path();
1966         if (!path)
1967                 return -ENOMEM;
1968
1969         key.objectid = node->bytenr;
1970         key.type = BTRFS_EXTENT_ITEM_KEY;
1971         key.offset = node->num_bytes;
1972
1973         path->reada = 1;
1974         path->leave_spinning = 1;
1975         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1976                                 path, 0, 1);
1977         if (ret < 0) {
1978                 err = ret;
1979                 goto out;
1980         }
1981         if (ret > 0) {
1982                 err = -EIO;
1983                 goto out;
1984         }
1985
1986         leaf = path->nodes[0];
1987         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1988 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1989         if (item_size < sizeof(*ei)) {
1990                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1991                                              path, (u64)-1, 0);
1992                 if (ret < 0) {
1993                         err = ret;
1994                         goto out;
1995                 }
1996                 leaf = path->nodes[0];
1997                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1998         }
1999 #endif
2000         BUG_ON(item_size < sizeof(*ei));
2001         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2002         __run_delayed_extent_op(extent_op, leaf, ei);
2003
2004         btrfs_mark_buffer_dirty(leaf);
2005 out:
2006         btrfs_free_path(path);
2007         return err;
2008 }
2009
2010 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2011                                 struct btrfs_root *root,
2012                                 struct btrfs_delayed_ref_node *node,
2013                                 struct btrfs_delayed_extent_op *extent_op,
2014                                 int insert_reserved)
2015 {
2016         int ret = 0;
2017         struct btrfs_delayed_tree_ref *ref;
2018         struct btrfs_key ins;
2019         u64 parent = 0;
2020         u64 ref_root = 0;
2021
2022         ins.objectid = node->bytenr;
2023         ins.offset = node->num_bytes;
2024         ins.type = BTRFS_EXTENT_ITEM_KEY;
2025
2026         ref = btrfs_delayed_node_to_tree_ref(node);
2027         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2028                 parent = ref->parent;
2029         else
2030                 ref_root = ref->root;
2031
2032         BUG_ON(node->ref_mod != 1);
2033         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2034                 BUG_ON(!extent_op || !extent_op->update_flags ||
2035                        !extent_op->update_key);
2036                 ret = alloc_reserved_tree_block(trans, root,
2037                                                 parent, ref_root,
2038                                                 extent_op->flags_to_set,
2039                                                 &extent_op->key,
2040                                                 ref->level, &ins);
2041         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2042                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2043                                              node->num_bytes, parent, ref_root,
2044                                              ref->level, 0, 1, extent_op);
2045         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2046                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2047                                           node->num_bytes, parent, ref_root,
2048                                           ref->level, 0, 1, extent_op);
2049         } else {
2050                 BUG();
2051         }
2052         return ret;
2053 }
2054
2055 /* helper function to actually process a single delayed ref entry */
2056 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2057                                struct btrfs_root *root,
2058                                struct btrfs_delayed_ref_node *node,
2059                                struct btrfs_delayed_extent_op *extent_op,
2060                                int insert_reserved)
2061 {
2062         int ret;
2063         if (btrfs_delayed_ref_is_head(node)) {
2064                 struct btrfs_delayed_ref_head *head;
2065                 /*
2066                  * we've hit the end of the chain and we were supposed
2067                  * to insert this extent into the tree.  But, it got
2068                  * deleted before we ever needed to insert it, so all
2069                  * we have to do is clean up the accounting
2070                  */
2071                 BUG_ON(extent_op);
2072                 head = btrfs_delayed_node_to_head(node);
2073                 if (insert_reserved) {
2074                         btrfs_pin_extent(root, node->bytenr,
2075                                          node->num_bytes, 1);
2076                         if (head->is_data) {
2077                                 ret = btrfs_del_csums(trans, root,
2078                                                       node->bytenr,
2079                                                       node->num_bytes);
2080                                 BUG_ON(ret);
2081                         }
2082                 }
2083                 mutex_unlock(&head->mutex);
2084                 return 0;
2085         }
2086
2087         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2088             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2089                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2090                                            insert_reserved);
2091         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2092                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2093                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2094                                            insert_reserved);
2095         else
2096                 BUG();
2097         return ret;
2098 }
2099
2100 static noinline struct btrfs_delayed_ref_node *
2101 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2102 {
2103         struct rb_node *node;
2104         struct btrfs_delayed_ref_node *ref;
2105         int action = BTRFS_ADD_DELAYED_REF;
2106 again:
2107         /*
2108          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2109          * this prevents ref count from going down to zero when
2110          * there still are pending delayed ref.
2111          */
2112         node = rb_prev(&head->node.rb_node);
2113         while (1) {
2114                 if (!node)
2115                         break;
2116                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2117                                 rb_node);
2118                 if (ref->bytenr != head->node.bytenr)
2119                         break;
2120                 if (ref->action == action)
2121                         return ref;
2122                 node = rb_prev(node);
2123         }
2124         if (action == BTRFS_ADD_DELAYED_REF) {
2125                 action = BTRFS_DROP_DELAYED_REF;
2126                 goto again;
2127         }
2128         return NULL;
2129 }
2130
2131 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2132                                        struct btrfs_root *root,
2133                                        struct list_head *cluster)
2134 {
2135         struct btrfs_delayed_ref_root *delayed_refs;
2136         struct btrfs_delayed_ref_node *ref;
2137         struct btrfs_delayed_ref_head *locked_ref = NULL;
2138         struct btrfs_delayed_extent_op *extent_op;
2139         int ret;
2140         int count = 0;
2141         int must_insert_reserved = 0;
2142
2143         delayed_refs = &trans->transaction->delayed_refs;
2144         while (1) {
2145                 if (!locked_ref) {
2146                         /* pick a new head ref from the cluster list */
2147                         if (list_empty(cluster))
2148                                 break;
2149
2150                         locked_ref = list_entry(cluster->next,
2151                                      struct btrfs_delayed_ref_head, cluster);
2152
2153                         /* grab the lock that says we are going to process
2154                          * all the refs for this head */
2155                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2156
2157                         /*
2158                          * we may have dropped the spin lock to get the head
2159                          * mutex lock, and that might have given someone else
2160                          * time to free the head.  If that's true, it has been
2161                          * removed from our list and we can move on.
2162                          */
2163                         if (ret == -EAGAIN) {
2164                                 locked_ref = NULL;
2165                                 count++;
2166                                 continue;
2167                         }
2168                 }
2169
2170                 /*
2171                  * record the must insert reserved flag before we
2172                  * drop the spin lock.
2173                  */
2174                 must_insert_reserved = locked_ref->must_insert_reserved;
2175                 locked_ref->must_insert_reserved = 0;
2176
2177                 extent_op = locked_ref->extent_op;
2178                 locked_ref->extent_op = NULL;
2179
2180                 /*
2181                  * locked_ref is the head node, so we have to go one
2182                  * node back for any delayed ref updates
2183                  */
2184                 ref = select_delayed_ref(locked_ref);
2185                 if (!ref) {
2186                         /* All delayed refs have been processed, Go ahead
2187                          * and send the head node to run_one_delayed_ref,
2188                          * so that any accounting fixes can happen
2189                          */
2190                         ref = &locked_ref->node;
2191
2192                         if (extent_op && must_insert_reserved) {
2193                                 kfree(extent_op);
2194                                 extent_op = NULL;
2195                         }
2196
2197                         if (extent_op) {
2198                                 spin_unlock(&delayed_refs->lock);
2199
2200                                 ret = run_delayed_extent_op(trans, root,
2201                                                             ref, extent_op);
2202                                 BUG_ON(ret);
2203                                 kfree(extent_op);
2204
2205                                 cond_resched();
2206                                 spin_lock(&delayed_refs->lock);
2207                                 continue;
2208                         }
2209
2210                         list_del_init(&locked_ref->cluster);
2211                         locked_ref = NULL;
2212                 }
2213
2214                 ref->in_tree = 0;
2215                 rb_erase(&ref->rb_node, &delayed_refs->root);
2216                 delayed_refs->num_entries--;
2217
2218                 spin_unlock(&delayed_refs->lock);
2219
2220                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2221                                           must_insert_reserved);
2222                 BUG_ON(ret);
2223
2224                 btrfs_put_delayed_ref(ref);
2225                 kfree(extent_op);
2226                 count++;
2227
2228                 cond_resched();
2229                 spin_lock(&delayed_refs->lock);
2230         }
2231         return count;
2232 }
2233
2234 /*
2235  * this starts processing the delayed reference count updates and
2236  * extent insertions we have queued up so far.  count can be
2237  * 0, which means to process everything in the tree at the start
2238  * of the run (but not newly added entries), or it can be some target
2239  * number you'd like to process.
2240  */
2241 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2242                            struct btrfs_root *root, unsigned long count)
2243 {
2244         struct rb_node *node;
2245         struct btrfs_delayed_ref_root *delayed_refs;
2246         struct btrfs_delayed_ref_node *ref;
2247         struct list_head cluster;
2248         int ret;
2249         int run_all = count == (unsigned long)-1;
2250         int run_most = 0;
2251
2252         if (root == root->fs_info->extent_root)
2253                 root = root->fs_info->tree_root;
2254
2255         delayed_refs = &trans->transaction->delayed_refs;
2256         INIT_LIST_HEAD(&cluster);
2257 again:
2258         spin_lock(&delayed_refs->lock);
2259         if (count == 0) {
2260                 count = delayed_refs->num_entries * 2;
2261                 run_most = 1;
2262         }
2263         while (1) {
2264                 if (!(run_all || run_most) &&
2265                     delayed_refs->num_heads_ready < 64)
2266                         break;
2267
2268                 /*
2269                  * go find something we can process in the rbtree.  We start at
2270                  * the beginning of the tree, and then build a cluster
2271                  * of refs to process starting at the first one we are able to
2272                  * lock
2273                  */
2274                 ret = btrfs_find_ref_cluster(trans, &cluster,
2275                                              delayed_refs->run_delayed_start);
2276                 if (ret)
2277                         break;
2278
2279                 ret = run_clustered_refs(trans, root, &cluster);
2280                 BUG_ON(ret < 0);
2281
2282                 count -= min_t(unsigned long, ret, count);
2283
2284                 if (count == 0)
2285                         break;
2286         }
2287
2288         if (run_all) {
2289                 node = rb_first(&delayed_refs->root);
2290                 if (!node)
2291                         goto out;
2292                 count = (unsigned long)-1;
2293
2294                 while (node) {
2295                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2296                                        rb_node);
2297                         if (btrfs_delayed_ref_is_head(ref)) {
2298                                 struct btrfs_delayed_ref_head *head;
2299
2300                                 head = btrfs_delayed_node_to_head(ref);
2301                                 atomic_inc(&ref->refs);
2302
2303                                 spin_unlock(&delayed_refs->lock);
2304                                 /*
2305                                  * Mutex was contended, block until it's
2306                                  * released and try again
2307                                  */
2308                                 mutex_lock(&head->mutex);
2309                                 mutex_unlock(&head->mutex);
2310
2311                                 btrfs_put_delayed_ref(ref);
2312                                 cond_resched();
2313                                 goto again;
2314                         }
2315                         node = rb_next(node);
2316                 }
2317                 spin_unlock(&delayed_refs->lock);
2318                 schedule_timeout(1);
2319                 goto again;
2320         }
2321 out:
2322         spin_unlock(&delayed_refs->lock);
2323         return 0;
2324 }
2325
2326 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2327                                 struct btrfs_root *root,
2328                                 u64 bytenr, u64 num_bytes, u64 flags,
2329                                 int is_data)
2330 {
2331         struct btrfs_delayed_extent_op *extent_op;
2332         int ret;
2333
2334         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2335         if (!extent_op)
2336                 return -ENOMEM;
2337
2338         extent_op->flags_to_set = flags;
2339         extent_op->update_flags = 1;
2340         extent_op->update_key = 0;
2341         extent_op->is_data = is_data ? 1 : 0;
2342
2343         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2344         if (ret)
2345                 kfree(extent_op);
2346         return ret;
2347 }
2348
2349 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2350                                       struct btrfs_root *root,
2351                                       struct btrfs_path *path,
2352                                       u64 objectid, u64 offset, u64 bytenr)
2353 {
2354         struct btrfs_delayed_ref_head *head;
2355         struct btrfs_delayed_ref_node *ref;
2356         struct btrfs_delayed_data_ref *data_ref;
2357         struct btrfs_delayed_ref_root *delayed_refs;
2358         struct rb_node *node;
2359         int ret = 0;
2360
2361         ret = -ENOENT;
2362         delayed_refs = &trans->transaction->delayed_refs;
2363         spin_lock(&delayed_refs->lock);
2364         head = btrfs_find_delayed_ref_head(trans, bytenr);
2365         if (!head)
2366                 goto out;
2367
2368         if (!mutex_trylock(&head->mutex)) {
2369                 atomic_inc(&head->node.refs);
2370                 spin_unlock(&delayed_refs->lock);
2371
2372                 btrfs_release_path(path);
2373
2374                 /*
2375                  * Mutex was contended, block until it's released and let
2376                  * caller try again
2377                  */
2378                 mutex_lock(&head->mutex);
2379                 mutex_unlock(&head->mutex);
2380                 btrfs_put_delayed_ref(&head->node);
2381                 return -EAGAIN;
2382         }
2383
2384         node = rb_prev(&head->node.rb_node);
2385         if (!node)
2386                 goto out_unlock;
2387
2388         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2389
2390         if (ref->bytenr != bytenr)
2391                 goto out_unlock;
2392
2393         ret = 1;
2394         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2395                 goto out_unlock;
2396
2397         data_ref = btrfs_delayed_node_to_data_ref(ref);
2398
2399         node = rb_prev(node);
2400         if (node) {
2401                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2402                 if (ref->bytenr == bytenr)
2403                         goto out_unlock;
2404         }
2405
2406         if (data_ref->root != root->root_key.objectid ||
2407             data_ref->objectid != objectid || data_ref->offset != offset)
2408                 goto out_unlock;
2409
2410         ret = 0;
2411 out_unlock:
2412         mutex_unlock(&head->mutex);
2413 out:
2414         spin_unlock(&delayed_refs->lock);
2415         return ret;
2416 }
2417
2418 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2419                                         struct btrfs_root *root,
2420                                         struct btrfs_path *path,
2421                                         u64 objectid, u64 offset, u64 bytenr)
2422 {
2423         struct btrfs_root *extent_root = root->fs_info->extent_root;
2424         struct extent_buffer *leaf;
2425         struct btrfs_extent_data_ref *ref;
2426         struct btrfs_extent_inline_ref *iref;
2427         struct btrfs_extent_item *ei;
2428         struct btrfs_key key;
2429         u32 item_size;
2430         int ret;
2431
2432         key.objectid = bytenr;
2433         key.offset = (u64)-1;
2434         key.type = BTRFS_EXTENT_ITEM_KEY;
2435
2436         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2437         if (ret < 0)
2438                 goto out;
2439         BUG_ON(ret == 0);
2440
2441         ret = -ENOENT;
2442         if (path->slots[0] == 0)
2443                 goto out;
2444
2445         path->slots[0]--;
2446         leaf = path->nodes[0];
2447         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2448
2449         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2450                 goto out;
2451
2452         ret = 1;
2453         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2454 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2455         if (item_size < sizeof(*ei)) {
2456                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2457                 goto out;
2458         }
2459 #endif
2460         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2461
2462         if (item_size != sizeof(*ei) +
2463             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2464                 goto out;
2465
2466         if (btrfs_extent_generation(leaf, ei) <=
2467             btrfs_root_last_snapshot(&root->root_item))
2468                 goto out;
2469
2470         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2471         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2472             BTRFS_EXTENT_DATA_REF_KEY)
2473                 goto out;
2474
2475         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2476         if (btrfs_extent_refs(leaf, ei) !=
2477             btrfs_extent_data_ref_count(leaf, ref) ||
2478             btrfs_extent_data_ref_root(leaf, ref) !=
2479             root->root_key.objectid ||
2480             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2481             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2482                 goto out;
2483
2484         ret = 0;
2485 out:
2486         return ret;
2487 }
2488
2489 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2490                           struct btrfs_root *root,
2491                           u64 objectid, u64 offset, u64 bytenr)
2492 {
2493         struct btrfs_path *path;
2494         int ret;
2495         int ret2;
2496
2497         path = btrfs_alloc_path();
2498         if (!path)
2499                 return -ENOENT;
2500
2501         do {
2502                 ret = check_committed_ref(trans, root, path, objectid,
2503                                           offset, bytenr);
2504                 if (ret && ret != -ENOENT)
2505                         goto out;
2506
2507                 ret2 = check_delayed_ref(trans, root, path, objectid,
2508                                          offset, bytenr);
2509         } while (ret2 == -EAGAIN);
2510
2511         if (ret2 && ret2 != -ENOENT) {
2512                 ret = ret2;
2513                 goto out;
2514         }
2515
2516         if (ret != -ENOENT || ret2 != -ENOENT)
2517                 ret = 0;
2518 out:
2519         btrfs_free_path(path);
2520         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2521                 WARN_ON(ret > 0);
2522         return ret;
2523 }
2524
2525 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2526                            struct btrfs_root *root,
2527                            struct extent_buffer *buf,
2528                            int full_backref, int inc)
2529 {
2530         u64 bytenr;
2531         u64 num_bytes;
2532         u64 parent;
2533         u64 ref_root;
2534         u32 nritems;
2535         struct btrfs_key key;
2536         struct btrfs_file_extent_item *fi;
2537         int i;
2538         int level;
2539         int ret = 0;
2540         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2541                             u64, u64, u64, u64, u64, u64);
2542
2543         ref_root = btrfs_header_owner(buf);
2544         nritems = btrfs_header_nritems(buf);
2545         level = btrfs_header_level(buf);
2546
2547         if (!root->ref_cows && level == 0)
2548                 return 0;
2549
2550         if (inc)
2551                 process_func = btrfs_inc_extent_ref;
2552         else
2553                 process_func = btrfs_free_extent;
2554
2555         if (full_backref)
2556                 parent = buf->start;
2557         else
2558                 parent = 0;
2559
2560         for (i = 0; i < nritems; i++) {
2561                 if (level == 0) {
2562                         btrfs_item_key_to_cpu(buf, &key, i);
2563                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2564                                 continue;
2565                         fi = btrfs_item_ptr(buf, i,
2566                                             struct btrfs_file_extent_item);
2567                         if (btrfs_file_extent_type(buf, fi) ==
2568                             BTRFS_FILE_EXTENT_INLINE)
2569                                 continue;
2570                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2571                         if (bytenr == 0)
2572                                 continue;
2573
2574                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2575                         key.offset -= btrfs_file_extent_offset(buf, fi);
2576                         ret = process_func(trans, root, bytenr, num_bytes,
2577                                            parent, ref_root, key.objectid,
2578                                            key.offset);
2579                         if (ret)
2580                                 goto fail;
2581                 } else {
2582                         bytenr = btrfs_node_blockptr(buf, i);
2583                         num_bytes = btrfs_level_size(root, level - 1);
2584                         ret = process_func(trans, root, bytenr, num_bytes,
2585                                            parent, ref_root, level - 1, 0);
2586                         if (ret)
2587                                 goto fail;
2588                 }
2589         }
2590         return 0;
2591 fail:
2592         BUG();
2593         return ret;
2594 }
2595
2596 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2597                   struct extent_buffer *buf, int full_backref)
2598 {
2599         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2600 }
2601
2602 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2603                   struct extent_buffer *buf, int full_backref)
2604 {
2605         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2606 }
2607
2608 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2609                                  struct btrfs_root *root,
2610                                  struct btrfs_path *path,
2611                                  struct btrfs_block_group_cache *cache)
2612 {
2613         int ret;
2614         struct btrfs_root *extent_root = root->fs_info->extent_root;
2615         unsigned long bi;
2616         struct extent_buffer *leaf;
2617
2618         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2619         if (ret < 0)
2620                 goto fail;
2621         BUG_ON(ret);
2622
2623         leaf = path->nodes[0];
2624         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2625         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2626         btrfs_mark_buffer_dirty(leaf);
2627         btrfs_release_path(path);
2628 fail:
2629         if (ret)
2630                 return ret;
2631         return 0;
2632
2633 }
2634
2635 static struct btrfs_block_group_cache *
2636 next_block_group(struct btrfs_root *root,
2637                  struct btrfs_block_group_cache *cache)
2638 {
2639         struct rb_node *node;
2640         spin_lock(&root->fs_info->block_group_cache_lock);
2641         node = rb_next(&cache->cache_node);
2642         btrfs_put_block_group(cache);
2643         if (node) {
2644                 cache = rb_entry(node, struct btrfs_block_group_cache,
2645                                  cache_node);
2646                 btrfs_get_block_group(cache);
2647         } else
2648                 cache = NULL;
2649         spin_unlock(&root->fs_info->block_group_cache_lock);
2650         return cache;
2651 }
2652
2653 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2654                             struct btrfs_trans_handle *trans,
2655                             struct btrfs_path *path)
2656 {
2657         struct btrfs_root *root = block_group->fs_info->tree_root;
2658         struct inode *inode = NULL;
2659         u64 alloc_hint = 0;
2660         int dcs = BTRFS_DC_ERROR;
2661         int num_pages = 0;
2662         int retries = 0;
2663         int ret = 0;
2664
2665         /*
2666          * If this block group is smaller than 100 megs don't bother caching the
2667          * block group.
2668          */
2669         if (block_group->key.offset < (100 * 1024 * 1024)) {
2670                 spin_lock(&block_group->lock);
2671                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2672                 spin_unlock(&block_group->lock);
2673                 return 0;
2674         }
2675
2676 again:
2677         inode = lookup_free_space_inode(root, block_group, path);
2678         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2679                 ret = PTR_ERR(inode);
2680                 btrfs_release_path(path);
2681                 goto out;
2682         }
2683
2684         if (IS_ERR(inode)) {
2685                 BUG_ON(retries);
2686                 retries++;
2687
2688                 if (block_group->ro)
2689                         goto out_free;
2690
2691                 ret = create_free_space_inode(root, trans, block_group, path);
2692                 if (ret)
2693                         goto out_free;
2694                 goto again;
2695         }
2696
2697         /*
2698          * We want to set the generation to 0, that way if anything goes wrong
2699          * from here on out we know not to trust this cache when we load up next
2700          * time.
2701          */
2702         BTRFS_I(inode)->generation = 0;
2703         ret = btrfs_update_inode(trans, root, inode);
2704         WARN_ON(ret);
2705
2706         if (i_size_read(inode) > 0) {
2707                 ret = btrfs_truncate_free_space_cache(root, trans, path,
2708                                                       inode);
2709                 if (ret)
2710                         goto out_put;
2711         }
2712
2713         spin_lock(&block_group->lock);
2714         if (block_group->cached != BTRFS_CACHE_FINISHED) {
2715                 /* We're not cached, don't bother trying to write stuff out */
2716                 dcs = BTRFS_DC_WRITTEN;
2717                 spin_unlock(&block_group->lock);
2718                 goto out_put;
2719         }
2720         spin_unlock(&block_group->lock);
2721
2722         num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2723         if (!num_pages)
2724                 num_pages = 1;
2725
2726         /*
2727          * Just to make absolutely sure we have enough space, we're going to
2728          * preallocate 12 pages worth of space for each block group.  In
2729          * practice we ought to use at most 8, but we need extra space so we can
2730          * add our header and have a terminator between the extents and the
2731          * bitmaps.
2732          */
2733         num_pages *= 16;
2734         num_pages *= PAGE_CACHE_SIZE;
2735
2736         ret = btrfs_check_data_free_space(inode, num_pages);
2737         if (ret)
2738                 goto out_put;
2739
2740         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2741                                               num_pages, num_pages,
2742                                               &alloc_hint);
2743         if (!ret)
2744                 dcs = BTRFS_DC_SETUP;
2745         btrfs_free_reserved_data_space(inode, num_pages);
2746 out_put:
2747         iput(inode);
2748 out_free:
2749         btrfs_release_path(path);
2750 out:
2751         spin_lock(&block_group->lock);
2752         block_group->disk_cache_state = dcs;
2753         spin_unlock(&block_group->lock);
2754
2755         return ret;
2756 }
2757
2758 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2759                                    struct btrfs_root *root)
2760 {
2761         struct btrfs_block_group_cache *cache;
2762         int err = 0;
2763         struct btrfs_path *path;
2764         u64 last = 0;
2765
2766         path = btrfs_alloc_path();
2767         if (!path)
2768                 return -ENOMEM;
2769
2770 again:
2771         while (1) {
2772                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2773                 while (cache) {
2774                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2775                                 break;
2776                         cache = next_block_group(root, cache);
2777                 }
2778                 if (!cache) {
2779                         if (last == 0)
2780                                 break;
2781                         last = 0;
2782                         continue;
2783                 }
2784                 err = cache_save_setup(cache, trans, path);
2785                 last = cache->key.objectid + cache->key.offset;
2786                 btrfs_put_block_group(cache);
2787         }
2788
2789         while (1) {
2790                 if (last == 0) {
2791                         err = btrfs_run_delayed_refs(trans, root,
2792                                                      (unsigned long)-1);
2793                         BUG_ON(err);
2794                 }
2795
2796                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2797                 while (cache) {
2798                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2799                                 btrfs_put_block_group(cache);
2800                                 goto again;
2801                         }
2802
2803                         if (cache->dirty)
2804                                 break;
2805                         cache = next_block_group(root, cache);
2806                 }
2807                 if (!cache) {
2808                         if (last == 0)
2809                                 break;
2810                         last = 0;
2811                         continue;
2812                 }
2813
2814                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2815                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2816                 cache->dirty = 0;
2817                 last = cache->key.objectid + cache->key.offset;
2818
2819                 err = write_one_cache_group(trans, root, path, cache);
2820                 BUG_ON(err);
2821                 btrfs_put_block_group(cache);
2822         }
2823
2824         while (1) {
2825                 /*
2826                  * I don't think this is needed since we're just marking our
2827                  * preallocated extent as written, but just in case it can't
2828                  * hurt.
2829                  */
2830                 if (last == 0) {
2831                         err = btrfs_run_delayed_refs(trans, root,
2832                                                      (unsigned long)-1);
2833                         BUG_ON(err);
2834                 }
2835
2836                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2837                 while (cache) {
2838                         /*
2839                          * Really this shouldn't happen, but it could if we
2840                          * couldn't write the entire preallocated extent and
2841                          * splitting the extent resulted in a new block.
2842                          */
2843                         if (cache->dirty) {
2844                                 btrfs_put_block_group(cache);
2845                                 goto again;
2846                         }
2847                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2848                                 break;
2849                         cache = next_block_group(root, cache);
2850                 }
2851                 if (!cache) {
2852                         if (last == 0)
2853                                 break;
2854                         last = 0;
2855                         continue;
2856                 }
2857
2858                 btrfs_write_out_cache(root, trans, cache, path);
2859
2860                 /*
2861                  * If we didn't have an error then the cache state is still
2862                  * NEED_WRITE, so we can set it to WRITTEN.
2863                  */
2864                 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2865                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
2866                 last = cache->key.objectid + cache->key.offset;
2867                 btrfs_put_block_group(cache);
2868         }
2869
2870         btrfs_free_path(path);
2871         return 0;
2872 }
2873
2874 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2875 {
2876         struct btrfs_block_group_cache *block_group;
2877         int readonly = 0;
2878
2879         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2880         if (!block_group || block_group->ro)
2881                 readonly = 1;
2882         if (block_group)
2883                 btrfs_put_block_group(block_group);
2884         return readonly;
2885 }
2886
2887 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2888                              u64 total_bytes, u64 bytes_used,
2889                              struct btrfs_space_info **space_info)
2890 {
2891         struct btrfs_space_info *found;
2892         int i;
2893         int factor;
2894
2895         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2896                      BTRFS_BLOCK_GROUP_RAID10))
2897                 factor = 2;
2898         else
2899                 factor = 1;
2900
2901         found = __find_space_info(info, flags);
2902         if (found) {
2903                 spin_lock(&found->lock);
2904                 found->total_bytes += total_bytes;
2905                 found->disk_total += total_bytes * factor;
2906                 found->bytes_used += bytes_used;
2907                 found->disk_used += bytes_used * factor;
2908                 found->full = 0;
2909                 spin_unlock(&found->lock);
2910                 *space_info = found;
2911                 return 0;
2912         }
2913         found = kzalloc(sizeof(*found), GFP_NOFS);
2914         if (!found)
2915                 return -ENOMEM;
2916
2917         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2918                 INIT_LIST_HEAD(&found->block_groups[i]);
2919         init_rwsem(&found->groups_sem);
2920         spin_lock_init(&found->lock);
2921         found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2922                                 BTRFS_BLOCK_GROUP_SYSTEM |
2923                                 BTRFS_BLOCK_GROUP_METADATA);
2924         found->total_bytes = total_bytes;
2925         found->disk_total = total_bytes * factor;
2926         found->bytes_used = bytes_used;
2927         found->disk_used = bytes_used * factor;
2928         found->bytes_pinned = 0;
2929         found->bytes_reserved = 0;
2930         found->bytes_readonly = 0;
2931         found->bytes_may_use = 0;
2932         found->full = 0;
2933         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934         found->chunk_alloc = 0;
2935         *space_info = found;
2936         list_add_rcu(&found->list, &info->space_info);
2937         atomic_set(&found->caching_threads, 0);
2938         return 0;
2939 }
2940
2941 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2942 {
2943         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2944                                    BTRFS_BLOCK_GROUP_RAID1 |
2945                                    BTRFS_BLOCK_GROUP_RAID10 |
2946                                    BTRFS_BLOCK_GROUP_DUP);
2947         if (extra_flags) {
2948                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2949                         fs_info->avail_data_alloc_bits |= extra_flags;
2950                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2951                         fs_info->avail_metadata_alloc_bits |= extra_flags;
2952                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2953                         fs_info->avail_system_alloc_bits |= extra_flags;
2954         }
2955 }
2956
2957 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2958 {
2959         /*
2960          * we add in the count of missing devices because we want
2961          * to make sure that any RAID levels on a degraded FS
2962          * continue to be honored.
2963          */
2964         u64 num_devices = root->fs_info->fs_devices->rw_devices +
2965                 root->fs_info->fs_devices->missing_devices;
2966
2967         if (num_devices == 1)
2968                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2969         if (num_devices < 4)
2970                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2971
2972         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2973             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2974                       BTRFS_BLOCK_GROUP_RAID10))) {
2975                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
2976         }
2977
2978         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2979             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2980                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2981         }
2982
2983         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2984             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2985              (flags & BTRFS_BLOCK_GROUP_RAID10) |
2986              (flags & BTRFS_BLOCK_GROUP_DUP)))
2987                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2988         return flags;
2989 }
2990
2991 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2992 {
2993         if (flags & BTRFS_BLOCK_GROUP_DATA)
2994                 flags |= root->fs_info->avail_data_alloc_bits &
2995                          root->fs_info->data_alloc_profile;
2996         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2997                 flags |= root->fs_info->avail_system_alloc_bits &
2998                          root->fs_info->system_alloc_profile;
2999         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3000                 flags |= root->fs_info->avail_metadata_alloc_bits &
3001                          root->fs_info->metadata_alloc_profile;
3002         return btrfs_reduce_alloc_profile(root, flags);
3003 }
3004
3005 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3006 {
3007         u64 flags;
3008
3009         if (data)
3010                 flags = BTRFS_BLOCK_GROUP_DATA;
3011         else if (root == root->fs_info->chunk_root)
3012                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3013         else
3014                 flags = BTRFS_BLOCK_GROUP_METADATA;
3015
3016         return get_alloc_profile(root, flags);
3017 }
3018
3019 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3020 {
3021         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3022                                                        BTRFS_BLOCK_GROUP_DATA);
3023 }
3024
3025 /*
3026  * This will check the space that the inode allocates from to make sure we have
3027  * enough space for bytes.
3028  */
3029 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3030 {
3031         struct btrfs_space_info *data_sinfo;
3032         struct btrfs_root *root = BTRFS_I(inode)->root;
3033         u64 used;
3034         int ret = 0, committed = 0, alloc_chunk = 1;
3035
3036         /* make sure bytes are sectorsize aligned */
3037         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3038
3039         if (root == root->fs_info->tree_root ||
3040             BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3041                 alloc_chunk = 0;
3042                 committed = 1;
3043         }
3044
3045         data_sinfo = BTRFS_I(inode)->space_info;
3046         if (!data_sinfo)
3047                 goto alloc;
3048
3049 again:
3050         /* make sure we have enough space to handle the data first */
3051         spin_lock(&data_sinfo->lock);
3052         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3053                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3054                 data_sinfo->bytes_may_use;
3055
3056         if (used + bytes > data_sinfo->total_bytes) {
3057                 struct btrfs_trans_handle *trans;
3058
3059                 /*
3060                  * if we don't have enough free bytes in this space then we need
3061                  * to alloc a new chunk.
3062                  */
3063                 if (!data_sinfo->full && alloc_chunk) {
3064                         u64 alloc_target;
3065
3066                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3067                         spin_unlock(&data_sinfo->lock);
3068 alloc:
3069                         alloc_target = btrfs_get_alloc_profile(root, 1);
3070                         trans = btrfs_join_transaction(root);
3071                         if (IS_ERR(trans))
3072                                 return PTR_ERR(trans);
3073
3074                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075                                              bytes + 2 * 1024 * 1024,
3076                                              alloc_target,
3077                                              CHUNK_ALLOC_NO_FORCE);
3078                         btrfs_end_transaction(trans, root);
3079                         if (ret < 0) {
3080                                 if (ret != -ENOSPC)
3081                                         return ret;
3082                                 else
3083                                         goto commit_trans;
3084                         }
3085
3086                         if (!data_sinfo) {
3087                                 btrfs_set_inode_space_info(root, inode);
3088                                 data_sinfo = BTRFS_I(inode)->space_info;
3089                         }
3090                         goto again;
3091                 }
3092                 spin_unlock(&data_sinfo->lock);
3093
3094                 /* commit the current transaction and try again */
3095 commit_trans:
3096                 if (!committed &&
3097                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3098                         committed = 1;
3099                         trans = btrfs_join_transaction(root);
3100                         if (IS_ERR(trans))
3101                                 return PTR_ERR(trans);
3102                         ret = btrfs_commit_transaction(trans, root);
3103                         if (ret)
3104                                 return ret;
3105                         goto again;
3106                 }
3107
3108                 return -ENOSPC;
3109         }
3110         data_sinfo->bytes_may_use += bytes;
3111         BTRFS_I(inode)->reserved_bytes += bytes;
3112         spin_unlock(&data_sinfo->lock);
3113
3114         return 0;
3115 }
3116
3117 /*
3118  * called when we are clearing an delalloc extent from the
3119  * inode's io_tree or there was an error for whatever reason
3120  * after calling btrfs_check_data_free_space
3121  */
3122 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3123 {
3124         struct btrfs_root *root = BTRFS_I(inode)->root;
3125         struct btrfs_space_info *data_sinfo;
3126
3127         /* make sure bytes are sectorsize aligned */
3128         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3129
3130         data_sinfo = BTRFS_I(inode)->space_info;
3131         spin_lock(&data_sinfo->lock);
3132         data_sinfo->bytes_may_use -= bytes;
3133         BTRFS_I(inode)->reserved_bytes -= bytes;
3134         spin_unlock(&data_sinfo->lock);
3135 }
3136
3137 static void force_metadata_allocation(struct btrfs_fs_info *info)
3138 {
3139         struct list_head *head = &info->space_info;
3140         struct btrfs_space_info *found;
3141
3142         rcu_read_lock();
3143         list_for_each_entry_rcu(found, head, list) {
3144                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3145                         found->force_alloc = CHUNK_ALLOC_FORCE;
3146         }
3147         rcu_read_unlock();
3148 }
3149
3150 static int should_alloc_chunk(struct btrfs_root *root,
3151                               struct btrfs_space_info *sinfo, u64 alloc_bytes,
3152                               int force)
3153 {
3154         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3155         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3156         u64 thresh;
3157
3158         if (force == CHUNK_ALLOC_FORCE)
3159                 return 1;
3160
3161         /*
3162          * in limited mode, we want to have some free space up to
3163          * about 1% of the FS size.
3164          */
3165         if (force == CHUNK_ALLOC_LIMITED) {
3166                 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3167                 thresh = max_t(u64, 64 * 1024 * 1024,
3168                                div_factor_fine(thresh, 1));
3169
3170                 if (num_bytes - num_allocated < thresh)
3171                         return 1;
3172         }
3173
3174         /*
3175          * we have two similar checks here, one based on percentage
3176          * and once based on a hard number of 256MB.  The idea
3177          * is that if we have a good amount of free
3178          * room, don't allocate a chunk.  A good mount is
3179          * less than 80% utilized of the chunks we have allocated,
3180          * or more than 256MB free
3181          */
3182         if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3183                 return 0;
3184
3185         if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3186                 return 0;
3187
3188         thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3189
3190         /* 256MB or 5% of the FS */
3191         thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3192
3193         if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3194                 return 0;
3195         return 1;
3196 }
3197
3198 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3199                           struct btrfs_root *extent_root, u64 alloc_bytes,
3200                           u64 flags, int force)
3201 {
3202         struct btrfs_space_info *space_info;
3203         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3204         int wait_for_alloc = 0;
3205         int ret = 0;
3206
3207         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3208
3209         space_info = __find_space_info(extent_root->fs_info, flags);
3210         if (!space_info) {
3211                 ret = update_space_info(extent_root->fs_info, flags,
3212                                         0, 0, &space_info);
3213                 BUG_ON(ret);
3214         }
3215         BUG_ON(!space_info);
3216
3217 again:
3218         spin_lock(&space_info->lock);
3219         if (space_info->force_alloc)
3220                 force = space_info->force_alloc;
3221         if (space_info->full) {
3222                 spin_unlock(&space_info->lock);
3223                 return 0;
3224         }
3225
3226         if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3227                 spin_unlock(&space_info->lock);
3228                 return 0;
3229         } else if (space_info->chunk_alloc) {
3230                 wait_for_alloc = 1;
3231         } else {
3232                 space_info->chunk_alloc = 1;
3233         }
3234
3235         spin_unlock(&space_info->lock);
3236
3237         mutex_lock(&fs_info->chunk_mutex);
3238
3239         /*
3240          * The chunk_mutex is held throughout the entirety of a chunk
3241          * allocation, so once we've acquired the chunk_mutex we know that the
3242          * other guy is done and we need to recheck and see if we should
3243          * allocate.
3244          */
3245         if (wait_for_alloc) {
3246                 mutex_unlock(&fs_info->chunk_mutex);
3247                 wait_for_alloc = 0;
3248                 goto again;
3249         }
3250
3251         /*
3252          * If we have mixed data/metadata chunks we want to make sure we keep
3253          * allocating mixed chunks instead of individual chunks.
3254          */
3255         if (btrfs_mixed_space_info(space_info))
3256                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3257
3258         /*
3259          * if we're doing a data chunk, go ahead and make sure that
3260          * we keep a reasonable number of metadata chunks allocated in the
3261          * FS as well.
3262          */
3263         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3264                 fs_info->data_chunk_allocations++;
3265                 if (!(fs_info->data_chunk_allocations %
3266                       fs_info->metadata_ratio))
3267                         force_metadata_allocation(fs_info);
3268         }
3269
3270         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3271         spin_lock(&space_info->lock);
3272         if (ret)
3273                 space_info->full = 1;
3274         else
3275                 ret = 1;
3276
3277         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3278         space_info->chunk_alloc = 0;
3279         spin_unlock(&space_info->lock);
3280         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3281         return ret;
3282 }
3283
3284 /*
3285  * shrink metadata reservation for delalloc
3286  */
3287 static int shrink_delalloc(struct btrfs_trans_handle *trans,
3288                            struct btrfs_root *root, u64 to_reclaim, int sync)
3289 {
3290         struct btrfs_block_rsv *block_rsv;
3291         struct btrfs_space_info *space_info;
3292         u64 reserved;
3293         u64 max_reclaim;
3294         u64 reclaimed = 0;
3295         long time_left;
3296         int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3297         int loops = 0;
3298         unsigned long progress;
3299
3300         block_rsv = &root->fs_info->delalloc_block_rsv;
3301         space_info = block_rsv->space_info;
3302
3303         smp_mb();
3304         reserved = space_info->bytes_reserved;
3305         progress = space_info->reservation_progress;
3306
3307         if (reserved == 0)
3308                 return 0;
3309
3310         /* nothing to shrink - nothing to reclaim */
3311         if (root->fs_info->delalloc_bytes == 0)
3312                 return 0;
3313
3314         max_reclaim = min(reserved, to_reclaim);
3315
3316         while (loops < 1024) {
3317                 /* have the flusher threads jump in and do some IO */
3318                 smp_mb();
3319                 nr_pages = min_t(unsigned long, nr_pages,
3320                        root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3321                 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3322
3323                 spin_lock(&space_info->lock);
3324                 if (reserved > space_info->bytes_reserved)
3325                         reclaimed += reserved - space_info->bytes_reserved;
3326                 reserved = space_info->bytes_reserved;
3327                 spin_unlock(&space_info->lock);
3328
3329                 loops++;
3330
3331                 if (reserved == 0 || reclaimed >= max_reclaim)
3332                         break;
3333
3334                 if (trans && trans->transaction->blocked)
3335                         return -EAGAIN;
3336
3337                 time_left = schedule_timeout_interruptible(1);
3338
3339                 /* We were interrupted, exit */
3340                 if (time_left)
3341                         break;
3342
3343                 /* we've kicked the IO a few times, if anything has been freed,
3344                  * exit.  There is no sense in looping here for a long time
3345                  * when we really need to commit the transaction, or there are
3346                  * just too many writers without enough free space
3347                  */
3348
3349                 if (loops > 3) {
3350                         smp_mb();
3351                         if (progress != space_info->reservation_progress)
3352                                 break;
3353                 }
3354
3355         }
3356         return reclaimed >= to_reclaim;
3357 }
3358
3359 /*
3360  * Retries tells us how many times we've called reserve_metadata_bytes.  The
3361  * idea is if this is the first call (retries == 0) then we will add to our
3362  * reserved count if we can't make the allocation in order to hold our place
3363  * while we go and try and free up space.  That way for retries > 1 we don't try
3364  * and add space, we just check to see if the amount of unused space is >= the
3365  * total space, meaning that our reservation is valid.
3366  *
3367  * However if we don't intend to retry this reservation, pass -1 as retries so
3368  * that it short circuits this logic.
3369  */
3370 static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3371                                   struct btrfs_root *root,
3372                                   struct btrfs_block_rsv *block_rsv,
3373                                   u64 orig_bytes, int flush)
3374 {
3375         struct btrfs_space_info *space_info = block_rsv->space_info;
3376         u64 unused;
3377         u64 num_bytes = orig_bytes;
3378         int retries = 0;
3379         int ret = 0;
3380         bool reserved = false;
3381         bool committed = false;
3382
3383 again:
3384         ret = -ENOSPC;
3385         if (reserved)
3386                 num_bytes = 0;
3387
3388         spin_lock(&space_info->lock);
3389         unused = space_info->bytes_used + space_info->bytes_reserved +
3390                  space_info->bytes_pinned + space_info->bytes_readonly +
3391                  space_info->bytes_may_use;
3392
3393         /*
3394          * The idea here is that we've not already over-reserved the block group
3395          * then we can go ahead and save our reservation first and then start
3396          * flushing if we need to.  Otherwise if we've already overcommitted
3397          * lets start flushing stuff first and then come back and try to make
3398          * our reservation.
3399          */
3400         if (unused <= space_info->total_bytes) {
3401                 unused = space_info->total_bytes - unused;
3402                 if (unused >= num_bytes) {
3403                         if (!reserved)
3404                                 space_info->bytes_reserved += orig_bytes;
3405                         ret = 0;
3406                 } else {
3407                         /*
3408                          * Ok set num_bytes to orig_bytes since we aren't
3409                          * overocmmitted, this way we only try and reclaim what
3410                          * we need.
3411                          */
3412                         num_bytes = orig_bytes;
3413                 }
3414         } else {
3415                 /*
3416                  * Ok we're over committed, set num_bytes to the overcommitted
3417                  * amount plus the amount of bytes that we need for this
3418                  * reservation.
3419                  */
3420                 num_bytes = unused - space_info->total_bytes +
3421                         (orig_bytes * (retries + 1));
3422         }
3423
3424         /*
3425          * Couldn't make our reservation, save our place so while we're trying
3426          * to reclaim space we can actually use it instead of somebody else
3427          * stealing it from us.
3428          */
3429         if (ret && !reserved) {
3430                 space_info->bytes_reserved += orig_bytes;
3431                 reserved = true;
3432         }
3433
3434         spin_unlock(&space_info->lock);
3435
3436         if (!ret)
3437                 return 0;
3438
3439         if (!flush)
3440                 goto out;
3441
3442         /*
3443          * We do synchronous shrinking since we don't actually unreserve
3444          * metadata until after the IO is completed.
3445          */
3446         ret = shrink_delalloc(trans, root, num_bytes, 1);
3447         if (ret > 0)
3448                 return 0;
3449         else if (ret < 0)
3450                 goto out;
3451
3452         /*
3453          * So if we were overcommitted it's possible that somebody else flushed
3454          * out enough space and we simply didn't have enough space to reclaim,
3455          * so go back around and try again.
3456          */
3457         if (retries < 2) {
3458                 retries++;
3459                 goto again;
3460         }
3461
3462         spin_lock(&space_info->lock);
3463         /*
3464          * Not enough space to be reclaimed, don't bother committing the
3465          * transaction.
3466          */
3467         if (space_info->bytes_pinned < orig_bytes)
3468                 ret = -ENOSPC;
3469         spin_unlock(&space_info->lock);
3470         if (ret)
3471                 goto out;
3472
3473         ret = -EAGAIN;
3474         if (trans || committed)
3475                 goto out;
3476
3477         ret = -ENOSPC;
3478         trans = btrfs_join_transaction(root);
3479         if (IS_ERR(trans))
3480                 goto out;
3481         ret = btrfs_commit_transaction(trans, root);
3482         if (!ret) {
3483                 trans = NULL;
3484                 committed = true;
3485                 goto again;
3486         }
3487
3488 out:
3489         if (reserved) {
3490                 spin_lock(&space_info->lock);
3491                 space_info->bytes_reserved -= orig_bytes;
3492                 spin_unlock(&space_info->lock);
3493         }
3494
3495         return ret;
3496 }
3497
3498 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3499                                              struct btrfs_root *root)
3500 {
3501         struct btrfs_block_rsv *block_rsv;
3502         if (root->ref_cows)
3503                 block_rsv = trans->block_rsv;
3504         else
3505                 block_rsv = root->block_rsv;
3506
3507         if (!block_rsv)
3508                 block_rsv = &root->fs_info->empty_block_rsv;
3509
3510         return block_rsv;
3511 }
3512
3513 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3514                                u64 num_bytes)
3515 {
3516         int ret = -ENOSPC;
3517         spin_lock(&block_rsv->lock);
3518         if (block_rsv->reserved >= num_bytes) {
3519                 block_rsv->reserved -= num_bytes;
3520                 if (block_rsv->reserved < block_rsv->size)
3521                         block_rsv->full = 0;
3522                 ret = 0;
3523         }
3524         spin_unlock(&block_rsv->lock);
3525         return ret;
3526 }
3527
3528 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3529                                 u64 num_bytes, int update_size)
3530 {
3531         spin_lock(&block_rsv->lock);
3532         block_rsv->reserved += num_bytes;
3533         if (update_size)
3534                 block_rsv->size += num_bytes;
3535         else if (block_rsv->reserved >= block_rsv->size)
3536                 block_rsv->full = 1;
3537         spin_unlock(&block_rsv->lock);
3538 }
3539
3540 static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3541                                     struct btrfs_block_rsv *dest, u64 num_bytes)
3542 {
3543         struct btrfs_space_info *space_info = block_rsv->space_info;
3544
3545         spin_lock(&block_rsv->lock);
3546         if (num_bytes == (u64)-1)
3547                 num_bytes = block_rsv->size;
3548         block_rsv->size -= num_bytes;
3549         if (block_rsv->reserved >= block_rsv->size) {
3550                 num_bytes = block_rsv->reserved - block_rsv->size;
3551                 block_rsv->reserved = block_rsv->size;
3552                 block_rsv->full = 1;
3553         } else {
3554                 num_bytes = 0;
3555         }
3556         spin_unlock(&block_rsv->lock);
3557
3558         if (num_bytes > 0) {
3559                 if (dest) {
3560                         spin_lock(&dest->lock);
3561                         if (!dest->full) {
3562                                 u64 bytes_to_add;
3563
3564                                 bytes_to_add = dest->size - dest->reserved;
3565                                 bytes_to_add = min(num_bytes, bytes_to_add);
3566                                 dest->reserved += bytes_to_add;
3567                                 if (dest->reserved >= dest->size)
3568                                         dest->full = 1;
3569                                 num_bytes -= bytes_to_add;
3570                         }
3571                         spin_unlock(&dest->lock);
3572                 }
3573                 if (num_bytes) {
3574                         spin_lock(&space_info->lock);
3575                         space_info->bytes_reserved -= num_bytes;
3576                         space_info->reservation_progress++;
3577                         spin_unlock(&space_info->lock);
3578                 }
3579         }
3580 }
3581
3582 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3583                                    struct btrfs_block_rsv *dst, u64 num_bytes)
3584 {
3585         int ret;
3586
3587         ret = block_rsv_use_bytes(src, num_bytes);
3588         if (ret)
3589                 return ret;
3590
3591         block_rsv_add_bytes(dst, num_bytes, 1);
3592         return 0;
3593 }
3594
3595 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3596 {
3597         memset(rsv, 0, sizeof(*rsv));
3598         spin_lock_init(&rsv->lock);
3599         atomic_set(&rsv->usage, 1);
3600         rsv->priority = 6;
3601         INIT_LIST_HEAD(&rsv->list);
3602 }
3603
3604 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3605 {
3606         struct btrfs_block_rsv *block_rsv;
3607         struct btrfs_fs_info *fs_info = root->fs_info;
3608
3609         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3610         if (!block_rsv)
3611                 return NULL;
3612
3613         btrfs_init_block_rsv(block_rsv);
3614         block_rsv->space_info = __find_space_info(fs_info,
3615                                                   BTRFS_BLOCK_GROUP_METADATA);
3616         return block_rsv;
3617 }
3618
3619 void btrfs_free_block_rsv(struct btrfs_root *root,
3620                           struct btrfs_block_rsv *rsv)
3621 {
3622         if (rsv && atomic_dec_and_test(&rsv->usage)) {
3623                 btrfs_block_rsv_release(root, rsv, (u64)-1);
3624                 if (!rsv->durable)
3625                         kfree(rsv);
3626         }
3627 }
3628
3629 /*
3630  * make the block_rsv struct be able to capture freed space.
3631  * the captured space will re-add to the the block_rsv struct
3632  * after transaction commit
3633  */
3634 void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3635                                  struct btrfs_block_rsv *block_rsv)
3636 {
3637         block_rsv->durable = 1;
3638         mutex_lock(&fs_info->durable_block_rsv_mutex);
3639         list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3640         mutex_unlock(&fs_info->durable_block_rsv_mutex);
3641 }
3642
3643 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3644                         struct btrfs_root *root,
3645                         struct btrfs_block_rsv *block_rsv,
3646                         u64 num_bytes)
3647 {
3648         int ret;
3649
3650         if (num_bytes == 0)
3651                 return 0;
3652
3653         ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3654         if (!ret) {
3655                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3656                 return 0;
3657         }
3658
3659         return ret;
3660 }
3661
3662 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3663                           struct btrfs_root *root,
3664                           struct btrfs_block_rsv *block_rsv,
3665                           u64 min_reserved, int min_factor)
3666 {
3667         u64 num_bytes = 0;
3668         int commit_trans = 0;
3669         int ret = -ENOSPC;
3670
3671         if (!block_rsv)
3672                 return 0;
3673
3674         spin_lock(&block_rsv->lock);
3675         if (min_factor > 0)
3676                 num_bytes = div_factor(block_rsv->size, min_factor);
3677         if (min_reserved > num_bytes)
3678                 num_bytes = min_reserved;
3679
3680         if (block_rsv->reserved >= num_bytes) {
3681                 ret = 0;
3682         } else {
3683                 num_bytes -= block_rsv->reserved;
3684                 if (block_rsv->durable &&
3685                     block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3686                         commit_trans = 1;
3687         }
3688         spin_unlock(&block_rsv->lock);
3689         if (!ret)
3690                 return 0;
3691
3692         if (block_rsv->refill_used) {
3693                 ret = reserve_metadata_bytes(trans, root, block_rsv,
3694                                              num_bytes, 0);
3695                 if (!ret) {
3696                         block_rsv_add_bytes(block_rsv, num_bytes, 0);
3697                         return 0;
3698                 }
3699         }
3700
3701         if (commit_trans) {
3702                 if (trans)
3703                         return -EAGAIN;
3704
3705                 trans = btrfs_join_transaction(root);
3706                 BUG_ON(IS_ERR(trans));
3707                 ret = btrfs_commit_transaction(trans, root);
3708                 return 0;
3709         }
3710
3711         return -ENOSPC;
3712 }
3713
3714 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3715                             struct btrfs_block_rsv *dst_rsv,
3716                             u64 num_bytes)
3717 {
3718         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3719 }
3720
3721 void btrfs_block_rsv_release(struct btrfs_root *root,
3722                              struct btrfs_block_rsv *block_rsv,
3723                              u64 num_bytes)
3724 {
3725         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3726         if (global_rsv->full || global_rsv == block_rsv ||
3727             block_rsv->space_info != global_rsv->space_info)
3728                 global_rsv = NULL;
3729         block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3730 }
3731
3732 /*
3733  * helper to calculate size of global block reservation.
3734  * the desired value is sum of space used by extent tree,
3735  * checksum tree and root tree
3736  */
3737 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3738 {
3739         struct btrfs_space_info *sinfo;
3740         u64 num_bytes;
3741         u64 meta_used;
3742         u64 data_used;
3743         int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3744
3745         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3746         spin_lock(&sinfo->lock);
3747         data_used = sinfo->bytes_used;
3748         spin_unlock(&sinfo->lock);
3749
3750         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3751         spin_lock(&sinfo->lock);
3752         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3753                 data_used = 0;
3754         meta_used = sinfo->bytes_used;
3755         spin_unlock(&sinfo->lock);
3756
3757         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3758                     csum_size * 2;
3759         num_bytes += div64_u64(data_used + meta_used, 50);
3760
3761         if (num_bytes * 3 > meta_used)
3762                 num_bytes = div64_u64(meta_used, 3);
3763
3764         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3765 }
3766
3767 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3768 {
3769         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3770         struct btrfs_space_info *sinfo = block_rsv->space_info;
3771         u64 num_bytes;
3772
3773         num_bytes = calc_global_metadata_size(fs_info);
3774
3775         spin_lock(&block_rsv->lock);
3776         spin_lock(&sinfo->lock);
3777
3778         block_rsv->size = num_bytes;
3779
3780         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3781                     sinfo->bytes_reserved + sinfo->bytes_readonly +
3782                     sinfo->bytes_may_use;
3783
3784         if (sinfo->total_bytes > num_bytes) {
3785                 num_bytes = sinfo->total_bytes - num_bytes;
3786                 block_rsv->reserved += num_bytes;
3787                 sinfo->bytes_reserved += num_bytes;
3788         }
3789
3790         if (block_rsv->reserved >= block_rsv->size) {
3791                 num_bytes = block_rsv->reserved - block_rsv->size;
3792                 sinfo->bytes_reserved -= num_bytes;
3793                 sinfo->reservation_progress++;
3794                 block_rsv->reserved = block_rsv->size;
3795                 block_rsv->full = 1;
3796         }
3797
3798         spin_unlock(&sinfo->lock);
3799         spin_unlock(&block_rsv->lock);
3800 }
3801
3802 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3803 {
3804         struct btrfs_space_info *space_info;
3805
3806         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3807         fs_info->chunk_block_rsv.space_info = space_info;
3808         fs_info->chunk_block_rsv.priority = 10;
3809
3810         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3811         fs_info->global_block_rsv.space_info = space_info;
3812         fs_info->global_block_rsv.priority = 10;
3813         fs_info->global_block_rsv.refill_used = 1;
3814         fs_info->delalloc_block_rsv.space_info = space_info;
3815         fs_info->trans_block_rsv.space_info = space_info;
3816         fs_info->empty_block_rsv.space_info = space_info;
3817         fs_info->empty_block_rsv.priority = 10;
3818
3819         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3820         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3821         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3822         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3823         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3824
3825         btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3826
3827         btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3828
3829         update_global_block_rsv(fs_info);
3830 }
3831
3832 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3833 {
3834         block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3835         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3836         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3837         WARN_ON(fs_info->trans_block_rsv.size > 0);
3838         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3839         WARN_ON(fs_info->chunk_block_rsv.size > 0);
3840         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3841 }
3842
3843 int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3844                                     struct btrfs_root *root,
3845                                     struct btrfs_block_rsv *rsv)
3846 {
3847         struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3848         u64 num_bytes;
3849         int ret;
3850
3851         /*
3852          * Truncate should be freeing data, but give us 2 items just in case it
3853          * needs to use some space.  We may want to be smarter about this in the
3854          * future.
3855          */
3856         num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3857
3858         /* We already have enough bytes, just return */
3859         if (rsv->reserved >= num_bytes)
3860                 return 0;
3861
3862         num_bytes -= rsv->reserved;
3863
3864         /*
3865          * You should have reserved enough space before hand to do this, so this
3866          * should not fail.
3867          */
3868         ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3869         BUG_ON(ret);
3870
3871         return 0;
3872 }
3873
3874 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3875                                  struct btrfs_root *root,
3876                                  int num_items)
3877 {
3878         u64 num_bytes;
3879         int ret;
3880
3881         if (num_items == 0 || root->fs_info->chunk_root == root)
3882                 return 0;
3883
3884         num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3885         ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3886                                   num_bytes);
3887         if (!ret) {
3888                 trans->bytes_reserved += num_bytes;
3889                 trans->block_rsv = &root->fs_info->trans_block_rsv;
3890         }
3891         return ret;
3892 }
3893
3894 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3895                                   struct btrfs_root *root)
3896 {
3897         if (!trans->bytes_reserved)
3898                 return;
3899
3900         BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3901         btrfs_block_rsv_release(root, trans->block_rsv,
3902                                 trans->bytes_reserved);
3903         trans->bytes_reserved = 0;
3904 }
3905
3906 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3907                                   struct inode *inode)
3908 {
3909         struct btrfs_root *root = BTRFS_I(inode)->root;
3910         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3911         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3912
3913         /*
3914          * We need to hold space in order to delete our orphan item once we've
3915          * added it, so this takes the reservation so we can release it later
3916          * when we are truly done with the orphan item.
3917          */
3918         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3919         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3920 }
3921
3922 void btrfs_orphan_release_metadata(struct inode *inode)
3923 {
3924         struct btrfs_root *root = BTRFS_I(inode)->root;
3925         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3926         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3927 }
3928
3929 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3930                                 struct btrfs_pending_snapshot *pending)
3931 {
3932         struct btrfs_root *root = pending->root;
3933         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3934         struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3935         /*
3936          * two for root back/forward refs, two for directory entries
3937          * and one for root of the snapshot.
3938          */
3939         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3940         dst_rsv->space_info = src_rsv->space_info;
3941         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3942 }
3943
3944 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3945 {
3946         return num_bytes >>= 3;
3947 }
3948
3949 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3950 {
3951         struct btrfs_root *root = BTRFS_I(inode)->root;
3952         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3953         u64 to_reserve;
3954         int nr_extents;
3955         int reserved_extents;
3956         int ret;
3957
3958         if (btrfs_transaction_in_commit(root->fs_info))
3959                 schedule_timeout(1);
3960
3961         num_bytes = ALIGN(num_bytes, root->sectorsize);
3962
3963         nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3964         reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3965
3966         if (nr_extents > reserved_extents) {
3967                 nr_extents -= reserved_extents;
3968                 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3969         } else {
3970                 nr_extents = 0;
3971                 to_reserve = 0;
3972         }
3973
3974         to_reserve += calc_csum_metadata_size(inode, num_bytes);
3975         ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3976         if (ret)
3977                 return ret;
3978
3979         atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3980         atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3981
3982         block_rsv_add_bytes(block_rsv, to_reserve, 1);
3983
3984         if (block_rsv->size > 512 * 1024 * 1024)
3985                 shrink_delalloc(NULL, root, to_reserve, 0);
3986
3987         return 0;
3988 }
3989
3990 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3991 {
3992         struct btrfs_root *root = BTRFS_I(inode)->root;
3993         u64 to_free;
3994         int nr_extents;
3995         int reserved_extents;
3996
3997         num_bytes = ALIGN(num_bytes, root->sectorsize);
3998         atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3999         WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4000
4001         reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4002         do {
4003                 int old, new;
4004
4005                 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4006                 if (nr_extents >= reserved_extents) {
4007                         nr_extents = 0;
4008                         break;
4009                 }
4010                 old = reserved_extents;
4011                 nr_extents = reserved_extents - nr_extents;
4012                 new = reserved_extents - nr_extents;
4013                 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4014                                      reserved_extents, new);
4015                 if (likely(old == reserved_extents))
4016                         break;
4017                 reserved_extents = old;
4018         } while (1);
4019
4020         to_free = calc_csum_metadata_size(inode, num_bytes);
4021         if (nr_extents > 0)
4022                 to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
4023
4024         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4025                                 to_free);
4026 }
4027
4028 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4029 {
4030         int ret;
4031
4032         ret = btrfs_check_data_free_space(inode, num_bytes);
4033         if (ret)
4034                 return ret;
4035
4036         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4037         if (ret) {
4038                 btrfs_free_reserved_data_space(inode, num_bytes);
4039                 return ret;
4040         }
4041
4042         return 0;
4043 }
4044
4045 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4046 {
4047         btrfs_delalloc_release_metadata(inode, num_bytes);
4048         btrfs_free_reserved_data_space(inode, num_bytes);
4049 }
4050
4051 static int update_block_group(struct btrfs_trans_handle *trans,
4052                               struct btrfs_root *root,
4053                               u64 bytenr, u64 num_bytes, int alloc)
4054 {
4055         struct btrfs_block_group_cache *cache = NULL;
4056         struct btrfs_fs_info *info = root->fs_info;
4057         u64 total = num_bytes;
4058         u64 old_val;
4059         u64 byte_in_group;
4060         int factor;
4061
4062         /* block accounting for super block */
4063         spin_lock(&info->delalloc_lock);
4064         old_val = btrfs_super_bytes_used(&info->super_copy);
4065         if (alloc)
4066                 old_val += num_bytes;
4067         else
4068                 old_val -= num_bytes;
4069         btrfs_set_super_bytes_used(&info->super_copy, old_val);
4070         spin_unlock(&info->delalloc_lock);
4071
4072         while (total) {
4073                 cache = btrfs_lookup_block_group(info, bytenr);
4074                 if (!cache)
4075                         return -1;
4076                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4077                                     BTRFS_BLOCK_GROUP_RAID1 |
4078                                     BTRFS_BLOCK_GROUP_RAID10))
4079                         factor = 2;
4080                 else
4081                         factor = 1;
4082                 /*
4083                  * If this block group has free space cache written out, we
4084                  * need to make sure to load it if we are removing space.  This
4085                  * is because we need the unpinning stage to actually add the
4086                  * space back to the block group, otherwise we will leak space.
4087                  */
4088                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4089                         cache_block_group(cache, trans, NULL, 1);
4090
4091                 byte_in_group = bytenr - cache->key.objectid;
4092                 WARN_ON(byte_in_group > cache->key.offset);
4093
4094                 spin_lock(&cache->space_info->lock);
4095                 spin_lock(&cache->lock);
4096
4097                 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4098                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4099                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4100
4101                 cache->dirty = 1;
4102                 old_val = btrfs_block_group_used(&cache->item);
4103                 num_bytes = min(total, cache->key.offset - byte_in_group);
4104                 if (alloc) {
4105                         old_val += num_bytes;
4106                         btrfs_set_block_group_used(&cache->item, old_val);
4107                         cache->reserved -= num_bytes;
4108                         cache->space_info->bytes_reserved -= num_bytes;
4109                         cache->space_info->reservation_progress++;
4110                         cache->space_info->bytes_used += num_bytes;
4111                         cache->space_info->disk_used += num_bytes * factor;
4112                         spin_unlock(&cache->lock);
4113                         spin_unlock(&cache->space_info->lock);
4114                 } else {
4115                         old_val -= num_bytes;
4116                         btrfs_set_block_group_used(&cache->item, old_val);
4117                         cache->pinned += num_bytes;
4118                         cache->space_info->bytes_pinned += num_bytes;
4119                         cache->space_info->bytes_used -= num_bytes;
4120                         cache->space_info->disk_used -= num_bytes * factor;
4121                         spin_unlock(&cache->lock);
4122                         spin_unlock(&cache->space_info->lock);
4123
4124                         set_extent_dirty(info->pinned_extents,
4125                                          bytenr, bytenr + num_bytes - 1,
4126                                          GFP_NOFS | __GFP_NOFAIL);
4127                 }
4128                 btrfs_put_block_group(cache);
4129                 total -= num_bytes;
4130                 bytenr += num_bytes;
4131         }
4132         return 0;
4133 }
4134
4135 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4136 {
4137         struct btrfs_block_group_cache *cache;
4138         u64 bytenr;
4139
4140         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4141         if (!cache)
4142                 return 0;
4143
4144         bytenr = cache->key.objectid;
4145         btrfs_put_block_group(cache);
4146
4147         return bytenr;
4148 }
4149
4150 static int pin_down_extent(struct btrfs_root *root,
4151                            struct btrfs_block_group_cache *cache,
4152                            u64 bytenr, u64 num_bytes, int reserved)
4153 {
4154         spin_lock(&cache->space_info->lock);
4155         spin_lock(&cache->lock);
4156         cache->pinned += num_bytes;
4157         cache->space_info->bytes_pinned += num_bytes;
4158         if (reserved) {
4159                 cache->reserved -= num_bytes;
4160                 cache->space_info->bytes_reserved -= num_bytes;
4161                 cache->space_info->reservation_progress++;
4162         }
4163         spin_unlock(&cache->lock);
4164         spin_unlock(&cache->space_info->lock);
4165
4166         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4167                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4168         return 0;
4169 }
4170
4171 /*
4172  * this function must be called within transaction
4173  */
4174 int btrfs_pin_extent(struct btrfs_root *root,
4175                      u64 bytenr, u64 num_bytes, int reserved)
4176 {
4177         struct btrfs_block_group_cache *cache;
4178
4179         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4180         BUG_ON(!cache);
4181
4182         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4183
4184         btrfs_put_block_group(cache);
4185         return 0;
4186 }
4187
4188 /*
4189  * update size of reserved extents. this function may return -EAGAIN
4190  * if 'reserve' is true or 'sinfo' is false.
4191  */
4192 int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4193                                 u64 num_bytes, int reserve, int sinfo)
4194 {
4195         int ret = 0;
4196         if (sinfo) {
4197                 struct btrfs_space_info *space_info = cache->space_info;
4198                 spin_lock(&space_info->lock);
4199                 spin_lock(&cache->lock);
4200                 if (reserve) {
4201                         if (cache->ro) {
4202                                 ret = -EAGAIN;
4203                         } else {
4204                                 cache->reserved += num_bytes;
4205                                 space_info->bytes_reserved += num_bytes;
4206                         }
4207                 } else {
4208                         if (cache->ro)
4209                                 space_info->bytes_readonly += num_bytes;
4210                         cache->reserved -= num_bytes;
4211                         space_info->bytes_reserved -= num_bytes;
4212                         space_info->reservation_progress++;
4213                 }
4214                 spin_unlock(&cache->lock);
4215                 spin_unlock(&space_info->lock);
4216         } else {
4217                 spin_lock(&cache->lock);
4218                 if (cache->ro) {
4219                         ret = -EAGAIN;
4220                 } else {
4221                         if (reserve)
4222                                 cache->reserved += num_bytes;
4223                         else
4224                                 cache->reserved -= num_bytes;
4225                 }
4226                 spin_unlock(&cache->lock);
4227         }
4228         return ret;
4229 }
4230
4231 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4232                                 struct btrfs_root *root)
4233 {
4234         struct btrfs_fs_info *fs_info = root->fs_info;
4235         struct btrfs_caching_control *next;
4236         struct btrfs_caching_control *caching_ctl;
4237         struct btrfs_block_group_cache *cache;
4238
4239         down_write(&fs_info->extent_commit_sem);
4240
4241         list_for_each_entry_safe(caching_ctl, next,
4242                                  &fs_info->caching_block_groups, list) {
4243                 cache = caching_ctl->block_group;
4244                 if (block_group_cache_done(cache)) {
4245                         cache->last_byte_to_unpin = (u64)-1;
4246                         list_del_init(&caching_ctl->list);
4247                         put_caching_control(caching_ctl);
4248                 } else {
4249                         cache->last_byte_to_unpin = caching_ctl->progress;
4250                 }
4251         }
4252
4253         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4254                 fs_info->pinned_extents = &fs_info->freed_extents[1];
4255         else
4256                 fs_info->pinned_extents = &fs_info->freed_extents[0];
4257
4258         up_write(&fs_info->extent_commit_sem);
4259
4260         update_global_block_rsv(fs_info);
4261         return 0;
4262 }
4263
4264 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4265 {
4266         struct btrfs_fs_info *fs_info = root->fs_info;
4267         struct btrfs_block_group_cache *cache = NULL;
4268         u64 len;
4269
4270         while (start <= end) {
4271                 if (!cache ||
4272                     start >= cache->key.objectid + cache->key.offset) {
4273                         if (cache)
4274                                 btrfs_put_block_group(cache);
4275                         cache = btrfs_lookup_block_group(fs_info, start);
4276                         BUG_ON(!cache);
4277                 }
4278
4279                 len = cache->key.objectid + cache->key.offset - start;
4280                 len = min(len, end + 1 - start);
4281
4282                 if (start < cache->last_byte_to_unpin) {
4283                         len = min(len, cache->last_byte_to_unpin - start);
4284                         btrfs_add_free_space(cache, start, len);
4285                 }
4286
4287                 start += len;
4288
4289                 spin_lock(&cache->space_info->lock);
4290                 spin_lock(&cache->lock);
4291                 cache->pinned -= len;
4292                 cache->space_info->bytes_pinned -= len;
4293                 if (cache->ro) {
4294                         cache->space_info->bytes_readonly += len;
4295                 } else if (cache->reserved_pinned > 0) {
4296                         len = min(len, cache->reserved_pinned);
4297                         cache->reserved_pinned -= len;
4298                         cache->space_info->bytes_reserved += len;
4299                 }
4300                 spin_unlock(&cache->lock);
4301                 spin_unlock(&cache->space_info->lock);
4302         }
4303
4304         if (cache)
4305                 btrfs_put_block_group(cache);
4306         return 0;
4307 }
4308
4309 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4310                                struct btrfs_root *root)
4311 {
4312         struct btrfs_fs_info *fs_info = root->fs_info;
4313         struct extent_io_tree *unpin;
4314         struct btrfs_block_rsv *block_rsv;
4315         struct btrfs_block_rsv *next_rsv;
4316         u64 start;
4317         u64 end;
4318         int idx;
4319         int ret;
4320
4321         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4322                 unpin = &fs_info->freed_extents[1];
4323         else
4324                 unpin = &fs_info->freed_extents[0];
4325
4326         while (1) {
4327                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4328                                             EXTENT_DIRTY);
4329                 if (ret)
4330                         break;
4331
4332                 if (btrfs_test_opt(root, DISCARD))
4333                         ret = btrfs_discard_extent(root, start,
4334                                                    end + 1 - start, NULL);
4335
4336                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4337                 unpin_extent_range(root, start, end);
4338                 cond_resched();
4339         }
4340
4341         mutex_lock(&fs_info->durable_block_rsv_mutex);
4342         list_for_each_entry_safe(block_rsv, next_rsv,
4343                                  &fs_info->durable_block_rsv_list, list) {
4344
4345                 idx = trans->transid & 0x1;
4346                 if (block_rsv->freed[idx] > 0) {
4347                         block_rsv_add_bytes(block_rsv,
4348                                             block_rsv->freed[idx], 0);
4349                         block_rsv->freed[idx] = 0;
4350                 }
4351                 if (atomic_read(&block_rsv->usage) == 0) {
4352                         btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4353
4354                         if (block_rsv->freed[0] == 0 &&
4355                             block_rsv->freed[1] == 0) {
4356                                 list_del_init(&block_rsv->list);
4357                                 kfree(block_rsv);
4358                         }
4359                 } else {
4360                         btrfs_block_rsv_release(root, block_rsv, 0);
4361                 }
4362         }
4363         mutex_unlock(&fs_info->durable_block_rsv_mutex);
4364
4365         return 0;
4366 }
4367
4368 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4369                                 struct btrfs_root *root,
4370                                 u64 bytenr, u64 num_bytes, u64 parent,
4371                                 u64 root_objectid, u64 owner_objectid,
4372                                 u64 owner_offset, int refs_to_drop,
4373                                 struct btrfs_delayed_extent_op *extent_op)
4374 {
4375         struct btrfs_key key;
4376         struct btrfs_path *path;
4377         struct btrfs_fs_info *info = root->fs_info;
4378         struct btrfs_root *extent_root = info->extent_root;
4379         struct extent_buffer *leaf;
4380         struct btrfs_extent_item *ei;
4381         struct btrfs_extent_inline_ref *iref;
4382         int ret;
4383         int is_data;
4384         int extent_slot = 0;
4385         int found_extent = 0;
4386         int num_to_del = 1;
4387         u32 item_size;
4388         u64 refs;
4389
4390         path = btrfs_alloc_path();
4391         if (!path)
4392                 return -ENOMEM;
4393
4394         path->reada = 1;
4395         path->leave_spinning = 1;
4396
4397         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4398         BUG_ON(!is_data && refs_to_drop != 1);
4399
4400         ret = lookup_extent_backref(trans, extent_root, path, &iref,
4401                                     bytenr, num_bytes, parent,
4402                                     root_objectid, owner_objectid,
4403                                     owner_offset);
4404         if (ret == 0) {
4405                 extent_slot = path->slots[0];
4406                 while (extent_slot >= 0) {
4407                         btrfs_item_key_to_cpu(path->nodes[0], &key,
4408                                               extent_slot);
4409                         if (key.objectid != bytenr)
4410                                 break;
4411                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4412                             key.offset == num_bytes) {
4413                                 found_extent = 1;
4414                                 break;
4415                         }
4416                         if (path->slots[0] - extent_slot > 5)
4417                                 break;
4418                         extent_slot--;
4419                 }
4420 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4421                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4422                 if (found_extent && item_size < sizeof(*ei))
4423                         found_extent = 0;
4424 #endif
4425                 if (!found_extent) {
4426                         BUG_ON(iref);
4427                         ret = remove_extent_backref(trans, extent_root, path,
4428                                                     NULL, refs_to_drop,
4429                                                     is_data);
4430                         BUG_ON(ret);
4431                         btrfs_release_path(path);
4432                         path->leave_spinning = 1;
4433
4434                         key.objectid = bytenr;
4435                         key.type = BTRFS_EXTENT_ITEM_KEY;
4436                         key.offset = num_bytes;
4437
4438                         ret = btrfs_search_slot(trans, extent_root,
4439                                                 &key, path, -1, 1);
4440                         if (ret) {
4441                                 printk(KERN_ERR "umm, got %d back from search"
4442                                        ", was looking for %llu\n", ret,
4443                                        (unsigned long long)bytenr);
4444                                 btrfs_print_leaf(extent_root, path->nodes[0]);
4445                         }
4446                         BUG_ON(ret);
4447                         extent_slot = path->slots[0];
4448                 }
4449         } else {
4450                 btrfs_print_leaf(extent_root, path->nodes[0]);
4451                 WARN_ON(1);
4452                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4453                        "parent %llu root %llu  owner %llu offset %llu\n",
4454                        (unsigned long long)bytenr,
4455                        (unsigned long long)parent,
4456                        (unsigned long long)root_objectid,
4457                        (unsigned long long)owner_objectid,
4458                        (unsigned long long)owner_offset);
4459         }
4460
4461         leaf = path->nodes[0];
4462         item_size = btrfs_item_size_nr(leaf, extent_slot);
4463 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4464         if (item_size < sizeof(*ei)) {
4465                 BUG_ON(found_extent || extent_slot != path->slots[0]);
4466                 ret = convert_extent_item_v0(trans, extent_root, path,
4467                                              owner_objectid, 0);
4468                 BUG_ON(ret < 0);
4469
4470                 btrfs_release_path(path);
4471                 path->leave_spinning = 1;
4472
4473                 key.objectid = bytenr;
4474                 key.type = BTRFS_EXTENT_ITEM_KEY;
4475                 key.offset = num_bytes;
4476
4477                 ret = btrfs_search_slot(trans, extent_root, &key, path,
4478                                         -1, 1);
4479                 if (ret) {
4480                         printk(KERN_ERR "umm, got %d back from search"
4481                                ", was looking for %llu\n", ret,
4482                                (unsigned long long)bytenr);
4483                         btrfs_print_leaf(extent_root, path->nodes[0]);
4484                 }
4485                 BUG_ON(ret);
4486                 extent_slot = path->slots[0];
4487                 leaf = path->nodes[0];
4488                 item_size = btrfs_item_size_nr(leaf, extent_slot);
4489         }
4490 #endif
4491         BUG_ON(item_size < sizeof(*ei));
4492         ei = btrfs_item_ptr(leaf, extent_slot,
4493                             struct btrfs_extent_item);
4494         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4495                 struct btrfs_tree_block_info *bi;
4496                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4497                 bi = (struct btrfs_tree_block_info *)(ei + 1);
4498                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4499         }
4500
4501         refs = btrfs_extent_refs(leaf, ei);
4502         BUG_ON(refs < refs_to_drop);
4503         refs -= refs_to_drop;
4504
4505         if (refs > 0) {
4506                 if (extent_op)
4507                         __run_delayed_extent_op(extent_op, leaf, ei);
4508                 /*
4509                  * In the case of inline back ref, reference count will
4510                  * be updated by remove_extent_backref
4511                  */
4512                 if (iref) {
4513                         BUG_ON(!found_extent);
4514                 } else {
4515                         btrfs_set_extent_refs(leaf, ei, refs);
4516                         btrfs_mark_buffer_dirty(leaf);
4517                 }
4518                 if (found_extent) {
4519                         ret = remove_extent_backref(trans, extent_root, path,
4520                                                     iref, refs_to_drop,
4521                                                     is_data);
4522                         BUG_ON(ret);
4523                 }
4524         } else {
4525                 if (found_extent) {
4526                         BUG_ON(is_data && refs_to_drop !=
4527                                extent_data_ref_count(root, path, iref));
4528                         if (iref) {
4529                                 BUG_ON(path->slots[0] != extent_slot);
4530                         } else {
4531                                 BUG_ON(path->slots[0] != extent_slot + 1);
4532                                 path->slots[0] = extent_slot;
4533                                 num_to_del = 2;
4534                         }
4535                 }
4536
4537                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4538                                       num_to_del);
4539                 BUG_ON(ret);
4540                 btrfs_release_path(path);
4541
4542                 if (is_data) {
4543                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4544                         BUG_ON(ret);
4545                 } else {
4546                         invalidate_mapping_pages(info->btree_inode->i_mapping,
4547                              bytenr >> PAGE_CACHE_SHIFT,
4548                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4549                 }
4550
4551                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4552                 BUG_ON(ret);
4553         }
4554         btrfs_free_path(path);
4555         return ret;
4556 }
4557
4558 /*
4559  * when we free an block, it is possible (and likely) that we free the last
4560  * delayed ref for that extent as well.  This searches the delayed ref tree for
4561  * a given extent, and if there are no other delayed refs to be processed, it
4562  * removes it from the tree.
4563  */
4564 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4565                                       struct btrfs_root *root, u64 bytenr)
4566 {
4567         struct btrfs_delayed_ref_head *head;
4568         struct btrfs_delayed_ref_root *delayed_refs;
4569         struct btrfs_delayed_ref_node *ref;
4570         struct rb_node *node;
4571         int ret = 0;
4572
4573         delayed_refs = &trans->transaction->delayed_refs;
4574         spin_lock(&delayed_refs->lock);
4575         head = btrfs_find_delayed_ref_head(trans, bytenr);
4576         if (!head)
4577                 goto out;
4578
4579         node = rb_prev(&head->node.rb_node);
4580         if (!node)
4581                 goto out;
4582
4583         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4584
4585         /* there are still entries for this ref, we can't drop it */
4586         if (ref->bytenr == bytenr)
4587                 goto out;
4588
4589         if (head->extent_op) {
4590                 if (!head->must_insert_reserved)
4591                         goto out;
4592                 kfree(head->extent_op);
4593                 head->extent_op = NULL;
4594         }
4595
4596         /*
4597          * waiting for the lock here would deadlock.  If someone else has it
4598          * locked they are already in the process of dropping it anyway
4599          */
4600         if (!mutex_trylock(&head->mutex))
4601                 goto out;
4602
4603         /*
4604          * at this point we have a head with no other entries.  Go
4605          * ahead and process it.
4606          */
4607         head->node.in_tree = 0;
4608         rb_erase(&head->node.rb_node, &delayed_refs->root);
4609
4610         delayed_refs->num_entries--;
4611
4612         /*
4613          * we don't take a ref on the node because we're removing it from the
4614          * tree, so we just steal the ref the tree was holding.
4615          */
4616         delayed_refs->num_heads--;
4617         if (list_empty(&head->cluster))
4618                 delayed_refs->num_heads_ready--;
4619
4620         list_del_init(&head->cluster);
4621         spin_unlock(&delayed_refs->lock);
4622
4623         BUG_ON(head->extent_op);
4624         if (head->must_insert_reserved)
4625                 ret = 1;
4626
4627         mutex_unlock(&head->mutex);
4628         btrfs_put_delayed_ref(&head->node);
4629         return ret;
4630 out:
4631         spin_unlock(&delayed_refs->lock);
4632         return 0;
4633 }
4634
4635 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4636                            struct btrfs_root *root,
4637                            struct extent_buffer *buf,
4638                            u64 parent, int last_ref)
4639 {
4640         struct btrfs_block_rsv *block_rsv;
4641         struct btrfs_block_group_cache *cache = NULL;
4642         int ret;
4643
4644         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4645                 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4646                                                 parent, root->root_key.objectid,
4647                                                 btrfs_header_level(buf),
4648                                                 BTRFS_DROP_DELAYED_REF, NULL);
4649                 BUG_ON(ret);
4650         }
4651
4652         if (!last_ref)
4653                 return;
4654
4655         block_rsv = get_block_rsv(trans, root);
4656         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4657         if (block_rsv->space_info != cache->space_info)
4658                 goto out;
4659
4660         if (btrfs_header_generation(buf) == trans->transid) {
4661                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4662                         ret = check_ref_cleanup(trans, root, buf->start);
4663                         if (!ret)
4664                                 goto pin;
4665                 }
4666
4667                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4668                         pin_down_extent(root, cache, buf->start, buf->len, 1);
4669                         goto pin;
4670                 }
4671
4672                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4673
4674                 btrfs_add_free_space(cache, buf->start, buf->len);
4675                 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4676                 if (ret == -EAGAIN) {
4677                         /* block group became read-only */
4678                         btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4679                         goto out;
4680                 }
4681
4682                 ret = 1;
4683                 spin_lock(&block_rsv->lock);
4684                 if (block_rsv->reserved < block_rsv->size) {
4685                         block_rsv->reserved += buf->len;
4686                         ret = 0;
4687                 }
4688                 spin_unlock(&block_rsv->lock);
4689
4690                 if (ret) {
4691                         spin_lock(&cache->space_info->lock);
4692                         cache->space_info->bytes_reserved -= buf->len;
4693                         cache->space_info->reservation_progress++;
4694                         spin_unlock(&cache->space_info->lock);
4695                 }
4696                 goto out;
4697         }
4698 pin:
4699         if (block_rsv->durable && !cache->ro) {
4700                 ret = 0;
4701                 spin_lock(&cache->lock);
4702                 if (!cache->ro) {
4703                         cache->reserved_pinned += buf->len;
4704                         ret = 1;
4705                 }
4706                 spin_unlock(&cache->lock);
4707
4708                 if (ret) {
4709                         spin_lock(&block_rsv->lock);
4710                         block_rsv->freed[trans->transid & 0x1] += buf->len;
4711                         spin_unlock(&block_rsv->lock);
4712                 }
4713         }
4714 out:
4715         /*
4716          * Deleting the buffer, clear the corrupt flag since it doesn't matter
4717          * anymore.
4718          */
4719         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4720         btrfs_put_block_group(cache);
4721 }
4722
4723 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4724                       struct btrfs_root *root,
4725                       u64 bytenr, u64 num_bytes, u64 parent,
4726                       u64 root_objectid, u64 owner, u64 offset)
4727 {
4728         int ret;
4729
4730         /*
4731          * tree log blocks never actually go into the extent allocation
4732          * tree, just update pinning info and exit early.
4733          */
4734         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4735                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4736                 /* unlocks the pinned mutex */
4737                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
4738                 ret = 0;
4739         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4740                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4741                                         parent, root_objectid, (int)owner,
4742                                         BTRFS_DROP_DELAYED_REF, NULL);
4743                 BUG_ON(ret);
4744         } else {
4745                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4746                                         parent, root_objectid, owner,
4747                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
4748                 BUG_ON(ret);
4749         }
4750         return ret;
4751 }
4752
4753 static u64 stripe_align(struct btrfs_root *root, u64 val)
4754 {
4755         u64 mask = ((u64)root->stripesize - 1);
4756         u64 ret = (val + mask) & ~mask;
4757         return ret;
4758 }
4759
4760 /*
4761  * when we wait for progress in the block group caching, its because
4762  * our allocation attempt failed at least once.  So, we must sleep
4763  * and let some progress happen before we try again.
4764  *
4765  * This function will sleep at least once waiting for new free space to
4766  * show up, and then it will check the block group free space numbers
4767  * for our min num_bytes.  Another option is to have it go ahead
4768  * and look in the rbtree for a free extent of a given size, but this
4769  * is a good start.
4770  */
4771 static noinline int
4772 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4773                                 u64 num_bytes)
4774 {
4775         struct btrfs_caching_control *caching_ctl;
4776         DEFINE_WAIT(wait);
4777
4778         caching_ctl = get_caching_control(cache);
4779         if (!caching_ctl)
4780                 return 0;
4781
4782         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4783                    (cache->free_space_ctl->free_space >= num_bytes));
4784
4785         put_caching_control(caching_ctl);
4786         return 0;
4787 }
4788
4789 static noinline int
4790 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4791 {
4792         struct btrfs_caching_control *caching_ctl;
4793         DEFINE_WAIT(wait);
4794
4795         caching_ctl = get_caching_control(cache);
4796         if (!caching_ctl)
4797                 return 0;
4798
4799         wait_event(caching_ctl->wait, block_group_cache_done(cache));
4800
4801         put_caching_control(caching_ctl);
4802         return 0;
4803 }
4804
4805 static int get_block_group_index(struct btrfs_block_group_cache *cache)
4806 {
4807         int index;
4808         if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4809                 index = 0;
4810         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4811                 index = 1;
4812         else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4813                 index = 2;
4814         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4815                 index = 3;
4816         else
4817                 index = 4;
4818         return index;
4819 }
4820
4821 enum btrfs_loop_type {
4822         LOOP_FIND_IDEAL = 0,
4823         LOOP_CACHING_NOWAIT = 1,
4824         LOOP_CACHING_WAIT = 2,
4825         LOOP_ALLOC_CHUNK = 3,
4826         LOOP_NO_EMPTY_SIZE = 4,
4827 };
4828
4829 /*
4830  * walks the btree of allocated extents and find a hole of a given size.
4831  * The key ins is changed to record the hole:
4832  * ins->objectid == block start
4833  * ins->flags = BTRFS_EXTENT_ITEM_KEY
4834  * ins->offset == number of blocks
4835  * Any available blocks before search_start are skipped.
4836  */
4837 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4838                                      struct btrfs_root *orig_root,
4839                                      u64 num_bytes, u64 empty_size,
4840                                      u64 search_start, u64 search_end,
4841                                      u64 hint_byte, struct btrfs_key *ins,
4842                                      int data)
4843 {
4844         int ret = 0;
4845         struct btrfs_root *root = orig_root->fs_info->extent_root;
4846         struct btrfs_free_cluster *last_ptr = NULL;
4847         struct btrfs_block_group_cache *block_group = NULL;
4848         int empty_cluster = 2 * 1024 * 1024;
4849         int allowed_chunk_alloc = 0;
4850         int done_chunk_alloc = 0;
4851         struct btrfs_space_info *space_info;
4852         int last_ptr_loop = 0;
4853         int loop = 0;
4854         int index = 0;
4855         bool found_uncached_bg = false;
4856         bool failed_cluster_refill = false;
4857         bool failed_alloc = false;
4858         bool use_cluster = true;
4859         u64 ideal_cache_percent = 0;
4860         u64 ideal_cache_offset = 0;
4861
4862         WARN_ON(num_bytes < root->sectorsize);
4863         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4864         ins->objectid = 0;
4865         ins->offset = 0;
4866
4867         space_info = __find_space_info(root->fs_info, data);
4868         if (!space_info) {
4869                 printk(KERN_ERR "No space info for %d\n", data);
4870                 return -ENOSPC;
4871         }
4872
4873         /*
4874          * If the space info is for both data and metadata it means we have a
4875          * small filesystem and we can't use the clustering stuff.
4876          */
4877         if (btrfs_mixed_space_info(space_info))
4878                 use_cluster = false;
4879
4880         if (orig_root->ref_cows || empty_size)
4881                 allowed_chunk_alloc = 1;
4882
4883         if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4884                 last_ptr = &root->fs_info->meta_alloc_cluster;
4885                 if (!btrfs_test_opt(root, SSD))
4886                         empty_cluster = 64 * 1024;
4887         }
4888
4889         if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4890             btrfs_test_opt(root, SSD)) {
4891                 last_ptr = &root->fs_info->data_alloc_cluster;
4892         }
4893
4894         if (last_ptr) {
4895                 spin_lock(&last_ptr->lock);
4896                 if (last_ptr->block_group)
4897                         hint_byte = last_ptr->window_start;
4898                 spin_unlock(&last_ptr->lock);
4899         }
4900
4901         search_start = max(search_start, first_logical_byte(root, 0));
4902         search_start = max(search_start, hint_byte);
4903
4904         if (!last_ptr)
4905                 empty_cluster = 0;
4906
4907         if (search_start == hint_byte) {
4908 ideal_cache:
4909                 block_group = btrfs_lookup_block_group(root->fs_info,
4910                                                        search_start);
4911                 /*
4912                  * we don't want to use the block group if it doesn't match our
4913                  * allocation bits, or if its not cached.
4914                  *
4915                  * However if we are re-searching with an ideal block group
4916                  * picked out then we don't care that the block group is cached.
4917                  */
4918                 if (block_group && block_group_bits(block_group, data) &&
4919                     (block_group->cached != BTRFS_CACHE_NO ||
4920                      search_start == ideal_cache_offset)) {
4921                         down_read(&space_info->groups_sem);
4922                         if (list_empty(&block_group->list) ||
4923                             block_group->ro) {
4924                                 /*
4925                                  * someone is removing this block group,
4926                                  * we can't jump into the have_block_group
4927                                  * target because our list pointers are not
4928                                  * valid
4929                                  */
4930                                 btrfs_put_block_group(block_group);
4931                                 up_read(&space_info->groups_sem);
4932                         } else {
4933                                 index = get_block_group_index(block_group);
4934                                 goto have_block_group;
4935                         }
4936                 } else if (block_group) {
4937                         btrfs_put_block_group(block_group);
4938                 }
4939         }
4940 search:
4941         down_read(&space_info->groups_sem);
4942         list_for_each_entry(block_group, &space_info->block_groups[index],
4943                             list) {
4944                 u64 offset;
4945                 int cached;
4946
4947                 btrfs_get_block_group(block_group);
4948                 search_start = block_group->key.objectid;
4949
4950                 /*
4951                  * this can happen if we end up cycling through all the
4952                  * raid types, but we want to make sure we only allocate
4953                  * for the proper type.
4954                  */
4955                 if (!block_group_bits(block_group, data)) {
4956                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
4957                                 BTRFS_BLOCK_GROUP_RAID1 |
4958                                 BTRFS_BLOCK_GROUP_RAID10;
4959
4960                         /*
4961                          * if they asked for extra copies and this block group
4962                          * doesn't provide them, bail.  This does allow us to
4963                          * fill raid0 from raid1.
4964                          */
4965                         if ((data & extra) && !(block_group->flags & extra))
4966                                 goto loop;
4967                 }
4968
4969 have_block_group:
4970                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4971                         u64 free_percent;
4972
4973                         ret = cache_block_group(block_group, trans,
4974                                                 orig_root, 1);
4975                         if (block_group->cached == BTRFS_CACHE_FINISHED)
4976                                 goto have_block_group;
4977
4978                         free_percent = btrfs_block_group_used(&block_group->item);
4979                         free_percent *= 100;
4980                         free_percent = div64_u64(free_percent,
4981                                                  block_group->key.offset);
4982                         free_percent = 100 - free_percent;
4983                         if (free_percent > ideal_cache_percent &&
4984                             likely(!block_group->ro)) {
4985                                 ideal_cache_offset = block_group->key.objectid;
4986                                 ideal_cache_percent = free_percent;
4987                         }
4988
4989                         /*
4990                          * We only want to start kthread caching if we are at
4991                          * the point where we will wait for caching to make
4992                          * progress, or if our ideal search is over and we've
4993                          * found somebody to start caching.
4994                          */
4995                         if (loop > LOOP_CACHING_NOWAIT ||
4996                             (loop > LOOP_FIND_IDEAL &&
4997                              atomic_read(&space_info->caching_threads) < 2)) {
4998                                 ret = cache_block_group(block_group, trans,
4999                                                         orig_root, 0);
5000                                 BUG_ON(ret);
5001                         }
5002                         found_uncached_bg = true;
5003
5004                         /*
5005                          * If loop is set for cached only, try the next block
5006                          * group.
5007                          */
5008                         if (loop == LOOP_FIND_IDEAL)
5009                                 goto loop;
5010                 }
5011
5012                 cached = block_group_cache_done(block_group);
5013                 if (unlikely(!cached))
5014                         found_uncached_bg = true;
5015
5016                 if (unlikely(block_group->ro))
5017                         goto loop;
5018
5019                 spin_lock(&block_group->free_space_ctl->tree_lock);
5020                 if (cached &&
5021                     block_group->free_space_ctl->free_space <
5022                     num_bytes + empty_size) {
5023                         spin_unlock(&block_group->free_space_ctl->tree_lock);
5024                         goto loop;
5025                 }
5026                 spin_unlock(&block_group->free_space_ctl->tree_lock);
5027
5028                 /*
5029                  * Ok we want to try and use the cluster allocator, so lets look
5030                  * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
5031                  * have tried the cluster allocator plenty of times at this
5032                  * point and not have found anything, so we are likely way too
5033                  * fragmented for the clustering stuff to find anything, so lets
5034                  * just skip it and let the allocator find whatever block it can
5035                  * find
5036                  */
5037                 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
5038                         /*
5039                          * the refill lock keeps out other
5040                          * people trying to start a new cluster
5041                          */
5042                         spin_lock(&last_ptr->refill_lock);
5043                         if (last_ptr->block_group &&
5044                             (last_ptr->block_group->ro ||
5045                             !block_group_bits(last_ptr->block_group, data))) {
5046                                 offset = 0;
5047                                 goto refill_cluster;
5048                         }
5049
5050                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
5051                                                  num_bytes, search_start);
5052                         if (offset) {
5053                                 /* we have a block, we're done */
5054                                 spin_unlock(&last_ptr->refill_lock);
5055                                 goto checks;
5056                         }
5057
5058                         spin_lock(&last_ptr->lock);
5059                         /*
5060                          * whoops, this cluster doesn't actually point to
5061                          * this block group.  Get a ref on the block
5062                          * group is does point to and try again
5063                          */
5064                         if (!last_ptr_loop && last_ptr->block_group &&
5065                             last_ptr->block_group != block_group) {
5066
5067                                 btrfs_put_block_group(block_group);
5068                                 block_group = last_ptr->block_group;
5069                                 btrfs_get_block_group(block_group);
5070                                 spin_unlock(&last_ptr->lock);
5071                                 spin_unlock(&last_ptr->refill_lock);
5072
5073                                 last_ptr_loop = 1;
5074                                 search_start = block_group->key.objectid;
5075                                 /*
5076                                  * we know this block group is properly
5077                                  * in the list because
5078                                  * btrfs_remove_block_group, drops the
5079                                  * cluster before it removes the block
5080                                  * group from the list
5081                                  */
5082                                 goto have_block_group;
5083                         }
5084                         spin_unlock(&last_ptr->lock);
5085 refill_cluster:
5086                         /*
5087                          * this cluster didn't work out, free it and
5088                          * start over
5089                          */
5090                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5091
5092                         last_ptr_loop = 0;
5093
5094                         /* allocate a cluster in this block group */
5095                         ret = btrfs_find_space_cluster(trans, root,
5096                                                block_group, last_ptr,
5097                                                offset, num_bytes,
5098                                                empty_cluster + empty_size);
5099                         if (ret == 0) {
5100                                 /*
5101                                  * now pull our allocation out of this
5102                                  * cluster
5103                                  */
5104                                 offset = btrfs_alloc_from_cluster(block_group,
5105                                                   last_ptr, num_bytes,
5106                                                   search_start);
5107                                 if (offset) {
5108                                         /* we found one, proceed */
5109                                         spin_unlock(&last_ptr->refill_lock);
5110                                         goto checks;
5111                                 }
5112                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
5113                                    && !failed_cluster_refill) {
5114                                 spin_unlock(&last_ptr->refill_lock);
5115
5116                                 failed_cluster_refill = true;
5117                                 wait_block_group_cache_progress(block_group,
5118                                        num_bytes + empty_cluster + empty_size);
5119                                 goto have_block_group;
5120                         }
5121
5122                         /*
5123                          * at this point we either didn't find a cluster
5124                          * or we weren't able to allocate a block from our
5125                          * cluster.  Free the cluster we've been trying
5126                          * to use, and go to the next block group
5127                          */
5128                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5129                         spin_unlock(&last_ptr->refill_lock);
5130                         goto loop;
5131                 }
5132
5133                 offset = btrfs_find_space_for_alloc(block_group, search_start,
5134                                                     num_bytes, empty_size);
5135                 /*
5136                  * If we didn't find a chunk, and we haven't failed on this
5137                  * block group before, and this block group is in the middle of
5138                  * caching and we are ok with waiting, then go ahead and wait
5139                  * for progress to be made, and set failed_alloc to true.
5140                  *
5141                  * If failed_alloc is true then we've already waited on this
5142                  * block group once and should move on to the next block group.
5143                  */
5144                 if (!offset && !failed_alloc && !cached &&
5145                     loop > LOOP_CACHING_NOWAIT) {
5146                         wait_block_group_cache_progress(block_group,
5147                                                 num_bytes + empty_size);
5148                         failed_alloc = true;
5149                         goto have_block_group;
5150                 } else if (!offset) {
5151                         goto loop;
5152                 }
5153 checks:
5154                 search_start = stripe_align(root, offset);
5155                 /* move on to the next group */
5156                 if (search_start + num_bytes >= search_end) {
5157                         btrfs_add_free_space(block_group, offset, num_bytes);
5158                         goto loop;
5159                 }
5160
5161                 /* move on to the next group */
5162                 if (search_start + num_bytes >
5163                     block_group->key.objectid + block_group->key.offset) {
5164                         btrfs_add_free_space(block_group, offset, num_bytes);
5165                         goto loop;
5166                 }
5167
5168                 ins->objectid = search_start;
5169                 ins->offset = num_bytes;
5170
5171                 if (offset < search_start)
5172                         btrfs_add_free_space(block_group, offset,
5173                                              search_start - offset);
5174                 BUG_ON(offset > search_start);
5175
5176                 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
5177                                             (data & BTRFS_BLOCK_GROUP_DATA));
5178                 if (ret == -EAGAIN) {
5179                         btrfs_add_free_space(block_group, offset, num_bytes);
5180                         goto loop;
5181                 }
5182
5183                 /* we are all good, lets return */
5184                 ins->objectid = search_start;
5185                 ins->offset = num_bytes;
5186
5187                 if (offset < search_start)
5188                         btrfs_add_free_space(block_group, offset,
5189                                              search_start - offset);
5190                 BUG_ON(offset > search_start);
5191                 btrfs_put_block_group(block_group);
5192                 break;
5193 loop:
5194                 failed_cluster_refill = false;
5195                 failed_alloc = false;
5196                 BUG_ON(index != get_block_group_index(block_group));
5197                 btrfs_put_block_group(block_group);
5198         }
5199         up_read(&space_info->groups_sem);
5200
5201         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5202                 goto search;
5203
5204         /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
5205          *                      for them to make caching progress.  Also
5206          *                      determine the best possible bg to cache
5207          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5208          *                      caching kthreads as we move along
5209          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5210          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5211          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5212          *                      again
5213          */
5214         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
5215             (found_uncached_bg || empty_size || empty_cluster ||
5216              allowed_chunk_alloc)) {
5217                 index = 0;
5218                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5219                         found_uncached_bg = false;
5220                         loop++;
5221                         if (!ideal_cache_percent &&
5222                             atomic_read(&space_info->caching_threads))
5223                                 goto search;
5224
5225                         /*
5226                          * 1 of the following 2 things have happened so far
5227                          *
5228                          * 1) We found an ideal block group for caching that
5229                          * is mostly full and will cache quickly, so we might
5230                          * as well wait for it.
5231                          *
5232                          * 2) We searched for cached only and we didn't find
5233                          * anything, and we didn't start any caching kthreads
5234                          * either, so chances are we will loop through and
5235                          * start a couple caching kthreads, and then come back
5236                          * around and just wait for them.  This will be slower
5237                          * because we will have 2 caching kthreads reading at
5238                          * the same time when we could have just started one
5239                          * and waited for it to get far enough to give us an
5240                          * allocation, so go ahead and go to the wait caching
5241                          * loop.
5242                          */
5243                         loop = LOOP_CACHING_WAIT;
5244                         search_start = ideal_cache_offset;
5245                         ideal_cache_percent = 0;
5246                         goto ideal_cache;
5247                 } else if (loop == LOOP_FIND_IDEAL) {
5248                         /*
5249                          * Didn't find a uncached bg, wait on anything we find
5250                          * next.
5251                          */
5252                         loop = LOOP_CACHING_WAIT;
5253                         goto search;
5254                 }
5255
5256                 if (loop < LOOP_CACHING_WAIT) {
5257                         loop++;
5258                         goto search;
5259                 }
5260
5261                 if (loop == LOOP_ALLOC_CHUNK) {
5262                         empty_size = 0;
5263                         empty_cluster = 0;
5264                 }
5265
5266                 if (allowed_chunk_alloc) {
5267                         ret = do_chunk_alloc(trans, root, num_bytes +
5268                                              2 * 1024 * 1024, data,
5269                                              CHUNK_ALLOC_LIMITED);
5270                         allowed_chunk_alloc = 0;
5271                         done_chunk_alloc = 1;
5272                 } else if (!done_chunk_alloc &&
5273                            space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
5274                         space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5275                 }
5276
5277                 if (loop < LOOP_NO_EMPTY_SIZE) {
5278                         loop++;
5279                         goto search;
5280                 }
5281                 ret = -ENOSPC;
5282         } else if (!ins->objectid) {
5283                 ret = -ENOSPC;
5284         } else if (ins->objectid) {
5285                 ret = 0;
5286         }
5287
5288         return ret;
5289 }
5290
5291 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5292                             int dump_block_groups)
5293 {
5294         struct btrfs_block_group_cache *cache;
5295         int index = 0;
5296
5297         spin_lock(&info->lock);
5298         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
5299                (unsigned long long)(info->total_bytes - info->bytes_used -
5300                                     info->bytes_pinned - info->bytes_reserved -
5301                                     info->bytes_readonly),
5302                (info->full) ? "" : "not ");
5303         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5304                "reserved=%llu, may_use=%llu, readonly=%llu\n",
5305                (unsigned long long)info->total_bytes,
5306                (unsigned long long)info->bytes_used,
5307                (unsigned long long)info->bytes_pinned,
5308                (unsigned long long)info->bytes_reserved,
5309                (unsigned long long)info->bytes_may_use,
5310                (unsigned long long)info->bytes_readonly);
5311         spin_unlock(&info->lock);
5312
5313         if (!dump_block_groups)
5314                 return;
5315
5316         down_read(&info->groups_sem);
5317 again:
5318         list_for_each_entry(cache, &info->block_groups[index], list) {
5319                 spin_lock(&cache->lock);
5320                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5321                        "%llu pinned %llu reserved\n",
5322                        (unsigned long long)cache->key.objectid,
5323                        (unsigned long long)cache->key.offset,
5324                        (unsigned long long)btrfs_block_group_used(&cache->item),
5325                        (unsigned long long)cache->pinned,
5326                        (unsigned long long)cache->reserved);
5327                 btrfs_dump_free_space(cache, bytes);
5328                 spin_unlock(&cache->lock);
5329         }
5330         if (++index < BTRFS_NR_RAID_TYPES)
5331                 goto again;
5332         up_read(&info->groups_sem);
5333 }
5334
5335 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5336                          struct btrfs_root *root,
5337                          u64 num_bytes, u64 min_alloc_size,
5338                          u64 empty_size, u64 hint_byte,
5339                          u64 search_end, struct btrfs_key *ins,
5340                          u64 data)
5341 {
5342         int ret;
5343         u64 search_start = 0;
5344
5345         data = btrfs_get_alloc_profile(root, data);
5346 again:
5347         /*
5348          * the only place that sets empty_size is btrfs_realloc_node, which
5349          * is not called recursively on allocations
5350          */
5351         if (empty_size || root->ref_cows)
5352                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5353                                      num_bytes + 2 * 1024 * 1024, data,
5354                                      CHUNK_ALLOC_NO_FORCE);
5355
5356         WARN_ON(num_bytes < root->sectorsize);
5357         ret = find_free_extent(trans, root, num_bytes, empty_size,
5358                                search_start, search_end, hint_byte,
5359                                ins, data);
5360
5361         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5362                 num_bytes = num_bytes >> 1;
5363                 num_bytes = num_bytes & ~(root->sectorsize - 1);
5364                 num_bytes = max(num_bytes, min_alloc_size);
5365                 do_chunk_alloc(trans, root->fs_info->extent_root,
5366                                num_bytes, data, CHUNK_ALLOC_FORCE);
5367                 goto again;
5368         }
5369         if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5370                 struct btrfs_space_info *sinfo;
5371
5372                 sinfo = __find_space_info(root->fs_info, data);
5373                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
5374                        "wanted %llu\n", (unsigned long long)data,
5375                        (unsigned long long)num_bytes);
5376                 dump_space_info(sinfo, num_bytes, 1);
5377         }
5378
5379         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5380
5381         return ret;
5382 }
5383
5384 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5385 {
5386         struct btrfs_block_group_cache *cache;
5387         int ret = 0;
5388
5389         cache = btrfs_lookup_block_group(root->fs_info, start);
5390         if (!cache) {
5391                 printk(KERN_ERR "Unable to find block group for %llu\n",
5392                        (unsigned long long)start);
5393                 return -ENOSPC;
5394         }
5395
5396         if (btrfs_test_opt(root, DISCARD))
5397                 ret = btrfs_discard_extent(root, start, len, NULL);
5398
5399         btrfs_add_free_space(cache, start, len);
5400         btrfs_update_reserved_bytes(cache, len, 0, 1);
5401         btrfs_put_block_group(cache);
5402
5403         trace_btrfs_reserved_extent_free(root, start, len);
5404
5405         return ret;
5406 }
5407
5408 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5409                                       struct btrfs_root *root,
5410                                       u64 parent, u64 root_objectid,
5411                                       u64 flags, u64 owner, u64 offset,
5412                                       struct btrfs_key *ins, int ref_mod)
5413 {
5414         int ret;
5415         struct btrfs_fs_info *fs_info = root->fs_info;
5416         struct btrfs_extent_item *extent_item;
5417         struct btrfs_extent_inline_ref *iref;
5418         struct btrfs_path *path;
5419         struct extent_buffer *leaf;
5420         int type;
5421         u32 size;
5422
5423         if (parent > 0)
5424                 type = BTRFS_SHARED_DATA_REF_KEY;
5425         else
5426                 type = BTRFS_EXTENT_DATA_REF_KEY;
5427
5428         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5429
5430         path = btrfs_alloc_path();
5431         if (!path)
5432                 return -ENOMEM;
5433
5434         path->leave_spinning = 1;
5435         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5436                                       ins, size);
5437         BUG_ON(ret);
5438
5439         leaf = path->nodes[0];
5440         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5441                                      struct btrfs_extent_item);
5442         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5443         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5444         btrfs_set_extent_flags(leaf, extent_item,
5445                                flags | BTRFS_EXTENT_FLAG_DATA);
5446
5447         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5448         btrfs_set_extent_inline_ref_type(leaf, iref, type);
5449         if (parent > 0) {
5450                 struct btrfs_shared_data_ref *ref;
5451                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
5452                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5453                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5454         } else {
5455                 struct btrfs_extent_data_ref *ref;
5456                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5457                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5458                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5459                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5460                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5461         }
5462
5463         btrfs_mark_buffer_dirty(path->nodes[0]);
5464         btrfs_free_path(path);
5465
5466         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5467         if (ret) {
5468                 printk(KERN_ERR "btrfs update block group failed for %llu "
5469                        "%llu\n", (unsigned long long)ins->objectid,
5470                        (unsigned long long)ins->offset);
5471                 BUG();
5472         }
5473         return ret;
5474 }
5475
5476 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5477                                      struct btrfs_root *root,
5478                                      u64 parent, u64 root_objectid,
5479                                      u64 flags, struct btrfs_disk_key *key,
5480                                      int level, struct btrfs_key *ins)
5481 {
5482         int ret;
5483         struct btrfs_fs_info *fs_info = root->fs_info;
5484         struct btrfs_extent_item *extent_item;
5485         struct btrfs_tree_block_info *block_info;
5486         struct btrfs_extent_inline_ref *iref;
5487         struct btrfs_path *path;
5488         struct extent_buffer *leaf;
5489         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5490
5491         path = btrfs_alloc_path();
5492         BUG_ON(!path);
5493
5494         path->leave_spinning = 1;
5495         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5496                                       ins, size);
5497         BUG_ON(ret);
5498
5499         leaf = path->nodes[0];
5500         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5501                                      struct btrfs_extent_item);
5502         btrfs_set_extent_refs(leaf, extent_item, 1);
5503         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5504         btrfs_set_extent_flags(leaf, extent_item,
5505                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5506         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5507
5508         btrfs_set_tree_block_key(leaf, block_info, key);
5509         btrfs_set_tree_block_level(leaf, block_info, level);
5510
5511         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5512         if (parent > 0) {
5513                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5514                 btrfs_set_extent_inline_ref_type(leaf, iref,
5515                                                  BTRFS_SHARED_BLOCK_REF_KEY);
5516                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5517         } else {
5518                 btrfs_set_extent_inline_ref_type(leaf, iref,
5519                                                  BTRFS_TREE_BLOCK_REF_KEY);
5520                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5521         }
5522
5523         btrfs_mark_buffer_dirty(leaf);
5524         btrfs_free_path(path);
5525
5526         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5527         if (ret) {
5528                 printk(KERN_ERR "btrfs update block group failed for %llu "
5529                        "%llu\n", (unsigned long long)ins->objectid,
5530                        (unsigned long long)ins->offset);
5531                 BUG();
5532         }
5533         return ret;
5534 }
5535
5536 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5537                                      struct btrfs_root *root,
5538                                      u64 root_objectid, u64 owner,
5539                                      u64 offset, struct btrfs_key *ins)
5540 {
5541         int ret;
5542
5543         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5544
5545         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5546                                          0, root_objectid, owner, offset,
5547                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
5548         return ret;
5549 }
5550
5551 /*
5552  * this is used by the tree logging recovery code.  It records that
5553  * an extent has been allocated and makes sure to clear the free
5554  * space cache bits as well
5555  */
5556 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5557                                    struct btrfs_root *root,
5558                                    u64 root_objectid, u64 owner, u64 offset,
5559                                    struct btrfs_key *ins)
5560 {
5561         int ret;
5562         struct btrfs_block_group_cache *block_group;
5563         struct btrfs_caching_control *caching_ctl;
5564         u64 start = ins->objectid;
5565         u64 num_bytes = ins->offset;
5566
5567         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5568         cache_block_group(block_group, trans, NULL, 0);
5569         caching_ctl = get_caching_control(block_group);
5570
5571         if (!caching_ctl) {
5572                 BUG_ON(!block_group_cache_done(block_group));
5573                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5574                 BUG_ON(ret);
5575         } else {
5576                 mutex_lock(&caching_ctl->mutex);
5577
5578                 if (start >= caching_ctl->progress) {
5579                         ret = add_excluded_extent(root, start, num_bytes);
5580                         BUG_ON(ret);
5581                 } else if (start + num_bytes <= caching_ctl->progress) {
5582                         ret = btrfs_remove_free_space(block_group,
5583                                                       start, num_bytes);
5584                         BUG_ON(ret);
5585                 } else {
5586                         num_bytes = caching_ctl->progress - start;
5587                         ret = btrfs_remove_free_space(block_group,
5588                                                       start, num_bytes);
5589                         BUG_ON(ret);
5590
5591                         start = caching_ctl->progress;
5592                         num_bytes = ins->objectid + ins->offset -
5593                                     caching_ctl->progress;
5594                         ret = add_excluded_extent(root, start, num_bytes);
5595                         BUG_ON(ret);
5596                 }
5597
5598                 mutex_unlock(&caching_ctl->mutex);
5599                 put_caching_control(caching_ctl);
5600         }
5601
5602         ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5603         BUG_ON(ret);
5604         btrfs_put_block_group(block_group);
5605         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5606                                          0, owner, offset, ins, 1);
5607         return ret;
5608 }
5609
5610 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5611                                             struct btrfs_root *root,
5612                                             u64 bytenr, u32 blocksize,
5613                                             int level)
5614 {
5615         struct extent_buffer *buf;
5616
5617         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5618         if (!buf)
5619                 return ERR_PTR(-ENOMEM);
5620         btrfs_set_header_generation(buf, trans->transid);
5621         btrfs_set_buffer_lockdep_class(buf, level);
5622         btrfs_tree_lock(buf);
5623         clean_tree_block(trans, root, buf);
5624
5625         btrfs_set_lock_blocking(buf);
5626         btrfs_set_buffer_uptodate(buf);
5627
5628         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5629                 /*
5630                  * we allow two log transactions at a time, use different
5631                  * EXENT bit to differentiate dirty pages.
5632                  */
5633                 if (root->log_transid % 2 == 0)
5634                         set_extent_dirty(&root->dirty_log_pages, buf->start,
5635                                         buf->start + buf->len - 1, GFP_NOFS);
5636                 else
5637                         set_extent_new(&root->dirty_log_pages, buf->start,
5638                                         buf->start + buf->len - 1, GFP_NOFS);
5639         } else {
5640                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5641                          buf->start + buf->len - 1, GFP_NOFS);
5642         }
5643         trans->blocks_used++;
5644         /* this returns a buffer locked for blocking */
5645         return buf;
5646 }
5647
5648 static struct btrfs_block_rsv *
5649 use_block_rsv(struct btrfs_trans_handle *trans,
5650               struct btrfs_root *root, u32 blocksize)
5651 {
5652         struct btrfs_block_rsv *block_rsv;
5653         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5654         int ret;
5655
5656         block_rsv = get_block_rsv(trans, root);
5657
5658         if (block_rsv->size == 0) {
5659                 ret = reserve_metadata_bytes(trans, root, block_rsv,
5660                                              blocksize, 0);
5661                 /*
5662                  * If we couldn't reserve metadata bytes try and use some from
5663                  * the global reserve.
5664                  */
5665                 if (ret && block_rsv != global_rsv) {
5666                         ret = block_rsv_use_bytes(global_rsv, blocksize);
5667                         if (!ret)
5668                                 return global_rsv;
5669                         return ERR_PTR(ret);
5670                 } else if (ret) {
5671                         return ERR_PTR(ret);
5672                 }
5673                 return block_rsv;
5674         }
5675
5676         ret = block_rsv_use_bytes(block_rsv, blocksize);
5677         if (!ret)
5678                 return block_rsv;
5679         if (ret) {
5680                 WARN_ON(1);
5681                 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5682                                              0);
5683                 if (!ret) {
5684                         spin_lock(&block_rsv->lock);
5685                         block_rsv->size += blocksize;
5686                         spin_unlock(&block_rsv->lock);
5687                         return block_rsv;
5688                 } else if (ret && block_rsv != global_rsv) {
5689                         ret = block_rsv_use_bytes(global_rsv, blocksize);
5690                         if (!ret)
5691                                 return global_rsv;
5692                 }
5693         }
5694
5695         return ERR_PTR(-ENOSPC);
5696 }
5697
5698 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5699 {
5700         block_rsv_add_bytes(block_rsv, blocksize, 0);
5701         block_rsv_release_bytes(block_rsv, NULL, 0);
5702 }
5703
5704 /*
5705  * finds a free extent and does all the dirty work required for allocation
5706  * returns the key for the extent through ins, and a tree buffer for
5707  * the first block of the extent through buf.
5708  *
5709  * returns the tree buffer or NULL.
5710  */
5711 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5712                                         struct btrfs_root *root, u32 blocksize,
5713                                         u64 parent, u64 root_objectid,
5714                                         struct btrfs_disk_key *key, int level,
5715                                         u64 hint, u64 empty_size)
5716 {
5717         struct btrfs_key ins;
5718         struct btrfs_block_rsv *block_rsv;
5719         struct extent_buffer *buf;
5720         u64 flags = 0;
5721         int ret;
5722
5723
5724         block_rsv = use_block_rsv(trans, root, blocksize);
5725         if (IS_ERR(block_rsv))
5726                 return ERR_CAST(block_rsv);
5727
5728         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5729                                    empty_size, hint, (u64)-1, &ins, 0);
5730         if (ret) {
5731                 unuse_block_rsv(block_rsv, blocksize);
5732                 return ERR_PTR(ret);
5733         }
5734
5735         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5736                                     blocksize, level);
5737         BUG_ON(IS_ERR(buf));
5738
5739         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5740                 if (parent == 0)
5741                         parent = ins.objectid;
5742                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5743         } else
5744                 BUG_ON(parent > 0);
5745
5746         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5747                 struct btrfs_delayed_extent_op *extent_op;
5748                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5749                 BUG_ON(!extent_op);
5750                 if (key)
5751                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
5752                 else
5753                         memset(&extent_op->key, 0, sizeof(extent_op->key));
5754                 extent_op->flags_to_set = flags;
5755                 extent_op->update_key = 1;
5756                 extent_op->update_flags = 1;
5757                 extent_op->is_data = 0;
5758
5759                 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5760                                         ins.offset, parent, root_objectid,
5761                                         level, BTRFS_ADD_DELAYED_EXTENT,
5762                                         extent_op);
5763                 BUG_ON(ret);
5764         }
5765         return buf;
5766 }
5767
5768 struct walk_control {
5769         u64 refs[BTRFS_MAX_LEVEL];
5770         u64 flags[BTRFS_MAX_LEVEL];
5771         struct btrfs_key update_progress;
5772         int stage;
5773         int level;
5774         int shared_level;
5775         int update_ref;
5776         int keep_locks;
5777         int reada_slot;
5778         int reada_count;
5779 };
5780
5781 #define DROP_REFERENCE  1
5782 #define UPDATE_BACKREF  2
5783
5784 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5785                                      struct btrfs_root *root,
5786                                      struct walk_control *wc,
5787                                      struct btrfs_path *path)
5788 {
5789         u64 bytenr;
5790         u64 generation;
5791         u64 refs;
5792         u64 flags;
5793         u32 nritems;
5794         u32 blocksize;
5795         struct btrfs_key key;
5796         struct extent_buffer *eb;
5797         int ret;
5798         int slot;
5799         int nread = 0;
5800
5801         if (path->slots[wc->level] < wc->reada_slot) {
5802                 wc->reada_count = wc->reada_count * 2 / 3;
5803                 wc->reada_count = max(wc->reada_count, 2);
5804         } else {
5805                 wc->reada_count = wc->reada_count * 3 / 2;
5806                 wc->reada_count = min_t(int, wc->reada_count,
5807                                         BTRFS_NODEPTRS_PER_BLOCK(root));
5808         }
5809
5810         eb = path->nodes[wc->level];
5811         nritems = btrfs_header_nritems(eb);
5812         blocksize = btrfs_level_size(root, wc->level - 1);
5813
5814         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5815                 if (nread >= wc->reada_count)
5816                         break;
5817
5818                 cond_resched();
5819                 bytenr = btrfs_node_blockptr(eb, slot);
5820                 generation = btrfs_node_ptr_generation(eb, slot);
5821
5822                 if (slot == path->slots[wc->level])
5823                         goto reada;
5824
5825                 if (wc->stage == UPDATE_BACKREF &&
5826                     generation <= root->root_key.offset)
5827                         continue;
5828
5829                 /* We don't lock the tree block, it's OK to be racy here */
5830                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5831                                                &refs, &flags);
5832                 BUG_ON(ret);
5833                 BUG_ON(refs == 0);
5834
5835                 if (wc->stage == DROP_REFERENCE) {
5836                         if (refs == 1)
5837                                 goto reada;
5838
5839                         if (wc->level == 1 &&
5840                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5841                                 continue;
5842                         if (!wc->update_ref ||
5843                             generation <= root->root_key.offset)
5844                                 continue;
5845                         btrfs_node_key_to_cpu(eb, &key, slot);
5846                         ret = btrfs_comp_cpu_keys(&key,
5847                                                   &wc->update_progress);
5848                         if (ret < 0)
5849                                 continue;
5850                 } else {
5851                         if (wc->level == 1 &&
5852                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5853                                 continue;
5854                 }
5855 reada:
5856                 ret = readahead_tree_block(root, bytenr, blocksize,
5857                                            generation);
5858                 if (ret)
5859                         break;
5860                 nread++;
5861         }
5862         wc->reada_slot = slot;
5863 }
5864
5865 /*
5866  * hepler to process tree block while walking down the tree.
5867  *
5868  * when wc->stage == UPDATE_BACKREF, this function updates
5869  * back refs for pointers in the block.
5870  *
5871  * NOTE: return value 1 means we should stop walking down.
5872  */
5873 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5874                                    struct btrfs_root *root,
5875                                    struct btrfs_path *path,
5876                                    struct walk_control *wc, int lookup_info)
5877 {
5878         int level = wc->level;
5879         struct extent_buffer *eb = path->nodes[level];
5880         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5881         int ret;
5882
5883         if (wc->stage == UPDATE_BACKREF &&
5884             btrfs_header_owner(eb) != root->root_key.objectid)
5885                 return 1;
5886
5887         /*
5888          * when reference count of tree block is 1, it won't increase
5889          * again. once full backref flag is set, we never clear it.
5890          */
5891         if (lookup_info &&
5892             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5893              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5894                 BUG_ON(!path->locks[level]);
5895                 ret = btrfs_lookup_extent_info(trans, root,
5896                                                eb->start, eb->len,
5897                                                &wc->refs[level],
5898                                                &wc->flags[level]);
5899                 BUG_ON(ret);
5900                 BUG_ON(wc->refs[level] == 0);
5901         }
5902
5903         if (wc->stage == DROP_REFERENCE) {
5904                 if (wc->refs[level] > 1)
5905                         return 1;
5906
5907                 if (path->locks[level] && !wc->keep_locks) {
5908                         btrfs_tree_unlock(eb);
5909                         path->locks[level] = 0;
5910                 }
5911                 return 0;
5912         }
5913
5914         /* wc->stage == UPDATE_BACKREF */
5915         if (!(wc->flags[level] & flag)) {
5916                 BUG_ON(!path->locks[level]);
5917                 ret = btrfs_inc_ref(trans, root, eb, 1);
5918                 BUG_ON(ret);
5919                 ret = btrfs_dec_ref(trans, root, eb, 0);
5920                 BUG_ON(ret);
5921                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5922                                                   eb->len, flag, 0);
5923                 BUG_ON(ret);
5924                 wc->flags[level] |= flag;
5925         }
5926
5927         /*
5928          * the block is shared by multiple trees, so it's not good to
5929          * keep the tree lock
5930          */
5931         if (path->locks[level] && level > 0) {
5932                 btrfs_tree_unlock(eb);
5933                 path->locks[level] = 0;
5934         }
5935         return 0;
5936 }
5937
5938 /*
5939  * hepler to process tree block pointer.
5940  *
5941  * when wc->stage == DROP_REFERENCE, this function checks
5942  * reference count of the block pointed to. if the block
5943  * is shared and we need update back refs for the subtree
5944  * rooted at the block, this function changes wc->stage to
5945  * UPDATE_BACKREF. if the block is shared and there is no
5946  * need to update back, this function drops the reference
5947  * to the block.
5948  *
5949  * NOTE: return value 1 means we should stop walking down.
5950  */
5951 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5952                                  struct btrfs_root *root,
5953                                  struct btrfs_path *path,
5954                                  struct walk_control *wc, int *lookup_info)
5955 {
5956         u64 bytenr;
5957         u64 generation;
5958         u64 parent;
5959         u32 blocksize;
5960         struct btrfs_key key;
5961         struct extent_buffer *next;
5962         int level = wc->level;
5963         int reada = 0;
5964         int ret = 0;
5965
5966         generation = btrfs_node_ptr_generation(path->nodes[level],
5967                                                path->slots[level]);
5968         /*
5969          * if the lower level block was created before the snapshot
5970          * was created, we know there is no need to update back refs
5971          * for the subtree
5972          */
5973         if (wc->stage == UPDATE_BACKREF &&
5974             generation <= root->root_key.offset) {
5975                 *lookup_info = 1;
5976                 return 1;
5977         }
5978
5979         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5980         blocksize = btrfs_level_size(root, level - 1);
5981
5982         next = btrfs_find_tree_block(root, bytenr, blocksize);
5983         if (!next) {
5984                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5985                 if (!next)
5986                         return -ENOMEM;
5987                 reada = 1;
5988         }
5989         btrfs_tree_lock(next);
5990         btrfs_set_lock_blocking(next);
5991
5992         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5993                                        &wc->refs[level - 1],
5994                                        &wc->flags[level - 1]);
5995         BUG_ON(ret);
5996         BUG_ON(wc->refs[level - 1] == 0);
5997         *lookup_info = 0;
5998
5999         if (wc->stage == DROP_REFERENCE) {
6000                 if (wc->refs[level - 1] > 1) {
6001                         if (level == 1 &&
6002                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6003                                 goto skip;
6004
6005                         if (!wc->update_ref ||
6006                             generation <= root->root_key.offset)
6007                                 goto skip;
6008
6009                         btrfs_node_key_to_cpu(path->nodes[level], &key,
6010                                               path->slots[level]);
6011                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6012                         if (ret < 0)
6013                                 goto skip;
6014
6015                         wc->stage = UPDATE_BACKREF;
6016                         wc->shared_level = level - 1;
6017                 }
6018         } else {
6019                 if (level == 1 &&
6020                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6021                         goto skip;
6022         }
6023
6024         if (!btrfs_buffer_uptodate(next, generation)) {
6025                 btrfs_tree_unlock(next);
6026                 free_extent_buffer(next);
6027                 next = NULL;
6028                 *lookup_info = 1;
6029         }
6030
6031         if (!next) {
6032                 if (reada && level == 1)
6033                         reada_walk_down(trans, root, wc, path);
6034                 next = read_tree_block(root, bytenr, blocksize, generation);
6035                 if (!next)
6036                         return -EIO;
6037                 btrfs_tree_lock(next);
6038                 btrfs_set_lock_blocking(next);
6039         }
6040
6041         level--;
6042         BUG_ON(level != btrfs_header_level(next));
6043         path->nodes[level] = next;
6044         path->slots[level] = 0;
6045         path->locks[level] = 1;
6046         wc->level = level;
6047         if (wc->level == 1)
6048                 wc->reada_slot = 0;
6049         return 0;
6050 skip:
6051         wc->refs[level - 1] = 0;
6052         wc->flags[level - 1] = 0;
6053         if (wc->stage == DROP_REFERENCE) {
6054                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6055                         parent = path->nodes[level]->start;
6056                 } else {
6057                         BUG_ON(root->root_key.objectid !=
6058                                btrfs_header_owner(path->nodes[level]));
6059                         parent = 0;
6060                 }
6061
6062                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6063                                         root->root_key.objectid, level - 1, 0);
6064                 BUG_ON(ret);
6065         }
6066         btrfs_tree_unlock(next);
6067         free_extent_buffer(next);
6068         *lookup_info = 1;
6069         return 1;
6070 }
6071
6072 /*
6073  * hepler to process tree block while walking up the tree.
6074  *
6075  * when wc->stage == DROP_REFERENCE, this function drops
6076  * reference count on the block.
6077  *
6078  * when wc->stage == UPDATE_BACKREF, this function changes
6079  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6080  * to UPDATE_BACKREF previously while processing the block.
6081  *
6082  * NOTE: return value 1 means we should stop walking up.
6083  */
6084 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6085                                  struct btrfs_root *root,
6086                                  struct btrfs_path *path,
6087                                  struct walk_control *wc)
6088 {
6089         int ret;
6090         int level = wc->level;
6091         struct extent_buffer *eb = path->nodes[level];
6092         u64 parent = 0;
6093
6094         if (wc->stage == UPDATE_BACKREF) {
6095                 BUG_ON(wc->shared_level < level);
6096                 if (level < wc->shared_level)
6097                         goto out;
6098
6099                 ret = find_next_key(path, level + 1, &wc->update_progress);
6100                 if (ret > 0)
6101                         wc->update_ref = 0;
6102
6103                 wc->stage = DROP_REFERENCE;
6104                 wc->shared_level = -1;
6105                 path->slots[level] = 0;
6106
6107                 /*
6108                  * check reference count again if the block isn't locked.
6109                  * we should start walking down the tree again if reference
6110                  * count is one.
6111                  */
6112                 if (!path->locks[level]) {
6113                         BUG_ON(level == 0);
6114                         btrfs_tree_lock(eb);
6115                         btrfs_set_lock_blocking(eb);
6116                         path->locks[level] = 1;
6117
6118                         ret = btrfs_lookup_extent_info(trans, root,
6119                                                        eb->start, eb->len,
6120                                                        &wc->refs[level],
6121                                                        &wc->flags[level]);
6122                         BUG_ON(ret);
6123                         BUG_ON(wc->refs[level] == 0);
6124                         if (wc->refs[level] == 1) {
6125                                 btrfs_tree_unlock(eb);
6126                                 path->locks[level] = 0;
6127                                 return 1;
6128                         }
6129                 }
6130         }
6131
6132         /* wc->stage == DROP_REFERENCE */
6133         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6134
6135         if (wc->refs[level] == 1) {
6136                 if (level == 0) {
6137                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6138                                 ret = btrfs_dec_ref(trans, root, eb, 1);
6139                         else
6140                                 ret = btrfs_dec_ref(trans, root, eb, 0);
6141                         BUG_ON(ret);
6142                 }
6143                 /* make block locked assertion in clean_tree_block happy */
6144                 if (!path->locks[level] &&
6145                     btrfs_header_generation(eb) == trans->transid) {
6146                         btrfs_tree_lock(eb);
6147                         btrfs_set_lock_blocking(eb);
6148                         path->locks[level] = 1;
6149                 }
6150                 clean_tree_block(trans, root, eb);
6151         }
6152
6153         if (eb == root->node) {
6154                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6155                         parent = eb->start;
6156                 else
6157                         BUG_ON(root->root_key.objectid !=
6158                                btrfs_header_owner(eb));
6159         } else {
6160                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6161                         parent = path->nodes[level + 1]->start;
6162                 else
6163                         BUG_ON(root->root_key.objectid !=
6164                                btrfs_header_owner(path->nodes[level + 1]));
6165         }
6166
6167         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6168 out:
6169         wc->refs[level] = 0;
6170         wc->flags[level] = 0;
6171         return 0;
6172 }
6173
6174 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6175                                    struct btrfs_root *root,
6176                                    struct btrfs_path *path,
6177                                    struct walk_control *wc)
6178 {
6179         int level = wc->level;
6180         int lookup_info = 1;
6181         int ret;
6182
6183         while (level >= 0) {
6184                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
6185                 if (ret > 0)
6186                         break;
6187
6188                 if (level == 0)
6189                         break;
6190
6191                 if (path->slots[level] >=
6192                     btrfs_header_nritems(path->nodes[level]))
6193                         break;
6194
6195                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
6196                 if (ret > 0) {
6197                         path->slots[level]++;
6198                         continue;
6199                 } else if (ret < 0)
6200                         return ret;
6201                 level = wc->level;
6202         }
6203         return 0;
6204 }
6205
6206 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6207                                  struct btrfs_root *root,
6208                                  struct btrfs_path *path,
6209                                  struct walk_control *wc, int max_level)
6210 {
6211         int level = wc->level;
6212         int ret;
6213
6214         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6215         while (level < max_level && path->nodes[level]) {
6216                 wc->level = level;
6217                 if (path->slots[level] + 1 <
6218                     btrfs_header_nritems(path->nodes[level])) {
6219                         path->slots[level]++;
6220                         return 0;
6221                 } else {
6222                         ret = walk_up_proc(trans, root, path, wc);
6223                         if (ret > 0)
6224                                 return 0;
6225
6226                         if (path->locks[level]) {
6227                                 btrfs_tree_unlock(path->nodes[level]);
6228                                 path->locks[level] = 0;
6229                         }
6230                         free_extent_buffer(path->nodes[level]);
6231                         path->nodes[level] = NULL;
6232                         level++;
6233                 }
6234         }
6235         return 1;
6236 }
6237
6238 /*
6239  * drop a subvolume tree.
6240  *
6241  * this function traverses the tree freeing any blocks that only
6242  * referenced by the tree.
6243  *
6244  * when a shared tree block is found. this function decreases its
6245  * reference count by one. if update_ref is true, this function
6246  * also make sure backrefs for the shared block and all lower level
6247  * blocks are properly updated.
6248  */
6249 int btrfs_drop_snapshot(struct btrfs_root *root,
6250                         struct btrfs_block_rsv *block_rsv, int update_ref)
6251 {
6252         struct btrfs_path *path;
6253         struct btrfs_trans_handle *trans;
6254         struct btrfs_root *tree_root = root->fs_info->tree_root;
6255         struct btrfs_root_item *root_item = &root->root_item;
6256         struct walk_control *wc;
6257         struct btrfs_key key;
6258         int err = 0;
6259         int ret;
6260         int level;
6261
6262         path = btrfs_alloc_path();
6263         BUG_ON(!path);
6264
6265         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6266         BUG_ON(!wc);
6267
6268         trans = btrfs_start_transaction(tree_root, 0);
6269         BUG_ON(IS_ERR(trans));
6270
6271         if (block_rsv)
6272                 trans->block_rsv = block_rsv;
6273
6274         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6275                 level = btrfs_header_level(root->node);
6276                 path->nodes[level] = btrfs_lock_root_node(root);
6277                 btrfs_set_lock_blocking(path->nodes[level]);
6278                 path->slots[level] = 0;
6279                 path->locks[level] = 1;
6280                 memset(&wc->update_progress, 0,
6281                        sizeof(wc->update_progress));
6282         } else {
6283                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6284                 memcpy(&wc->update_progress, &key,
6285                        sizeof(wc->update_progress));
6286
6287                 level = root_item->drop_level;
6288                 BUG_ON(level == 0);
6289                 path->lowest_level = level;
6290                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6291                 path->lowest_level = 0;
6292                 if (ret < 0) {
6293                         err = ret;
6294                         goto out;
6295                 }
6296                 WARN_ON(ret > 0);
6297
6298                 /*
6299                  * unlock our path, this is safe because only this
6300                  * function is allowed to delete this snapshot
6301                  */
6302                 btrfs_unlock_up_safe(path, 0);
6303
6304                 level = btrfs_header_level(root->node);
6305                 while (1) {
6306                         btrfs_tree_lock(path->nodes[level]);
6307                         btrfs_set_lock_blocking(path->nodes[level]);
6308
6309                         ret = btrfs_lookup_extent_info(trans, root,
6310                                                 path->nodes[level]->start,
6311                                                 path->nodes[level]->len,
6312                                                 &wc->refs[level],
6313                                                 &wc->flags[level]);
6314                         BUG_ON(ret);
6315                         BUG_ON(wc->refs[level] == 0);
6316
6317                         if (level == root_item->drop_level)
6318                                 break;
6319
6320                         btrfs_tree_unlock(path->nodes[level]);
6321                         WARN_ON(wc->refs[level] != 1);
6322                         level--;
6323                 }
6324         }
6325
6326         wc->level = level;
6327         wc->shared_level = -1;
6328         wc->stage = DROP_REFERENCE;
6329         wc->update_ref = update_ref;
6330         wc->keep_locks = 0;
6331         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6332
6333         while (1) {
6334                 ret = walk_down_tree(trans, root, path, wc);
6335                 if (ret < 0) {
6336                         err = ret;
6337                         break;
6338                 }
6339
6340                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6341                 if (ret < 0) {
6342                         err = ret;
6343                         break;
6344                 }
6345
6346                 if (ret > 0) {
6347                         BUG_ON(wc->stage != DROP_REFERENCE);
6348                         break;
6349                 }
6350
6351                 if (wc->stage == DROP_REFERENCE) {
6352                         level = wc->level;
6353                         btrfs_node_key(path->nodes[level],
6354                                        &root_item->drop_progress,
6355                                        path->slots[level]);
6356                         root_item->drop_level = level;
6357                 }
6358
6359                 BUG_ON(wc->level == 0);
6360                 if (btrfs_should_end_transaction(trans, tree_root)) {
6361                         ret = btrfs_update_root(trans, tree_root,
6362                                                 &root->root_key,
6363                                                 root_item);
6364                         BUG_ON(ret);
6365
6366                         btrfs_end_transaction_throttle(trans, tree_root);
6367                         trans = btrfs_start_transaction(tree_root, 0);
6368                         BUG_ON(IS_ERR(trans));
6369                         if (block_rsv)
6370                                 trans->block_rsv = block_rsv;
6371                 }
6372         }
6373         btrfs_release_path(path);
6374         BUG_ON(err);
6375
6376         ret = btrfs_del_root(trans, tree_root, &root->root_key);
6377         BUG_ON(ret);
6378
6379         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6380                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6381                                            NULL, NULL);
6382                 BUG_ON(ret < 0);
6383                 if (ret > 0) {
6384                         /* if we fail to delete the orphan item this time
6385                          * around, it'll get picked up the next time.
6386                          *
6387                          * The most common failure here is just -ENOENT.
6388                          */
6389                         btrfs_del_orphan_item(trans, tree_root,
6390                                               root->root_key.objectid);
6391                 }
6392         }
6393
6394         if (root->in_radix) {
6395                 btrfs_free_fs_root(tree_root->fs_info, root);
6396         } else {
6397                 free_extent_buffer(root->node);
6398                 free_extent_buffer(root->commit_root);
6399                 kfree(root);
6400         }
6401 out:
6402         btrfs_end_transaction_throttle(trans, tree_root);
6403         kfree(wc);
6404         btrfs_free_path(path);
6405         return err;
6406 }
6407
6408 /*
6409  * drop subtree rooted at tree block 'node'.
6410  *
6411  * NOTE: this function will unlock and release tree block 'node'
6412  */
6413 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6414                         struct btrfs_root *root,
6415                         struct extent_buffer *node,
6416                         struct extent_buffer *parent)
6417 {
6418         struct btrfs_path *path;
6419         struct walk_control *wc;
6420         int level;
6421         int parent_level;
6422         int ret = 0;
6423         int wret;
6424
6425         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6426
6427         path = btrfs_alloc_path();
6428         if (!path)
6429                 return -ENOMEM;
6430
6431         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6432         if (!wc) {
6433                 btrfs_free_path(path);
6434                 return -ENOMEM;
6435         }
6436
6437         btrfs_assert_tree_locked(parent);
6438         parent_level = btrfs_header_level(parent);
6439         extent_buffer_get(parent);
6440         path->nodes[parent_level] = parent;
6441         path->slots[parent_level] = btrfs_header_nritems(parent);
6442
6443         btrfs_assert_tree_locked(node);
6444         level = btrfs_header_level(node);
6445         path->nodes[level] = node;
6446         path->slots[level] = 0;
6447         path->locks[level] = 1;
6448
6449         wc->refs[parent_level] = 1;
6450         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6451         wc->level = level;
6452         wc->shared_level = -1;
6453         wc->stage = DROP_REFERENCE;
6454         wc->update_ref = 0;
6455         wc->keep_locks = 1;
6456         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6457
6458         while (1) {
6459                 wret = walk_down_tree(trans, root, path, wc);
6460                 if (wret < 0) {
6461                         ret = wret;
6462                         break;
6463                 }
6464
6465                 wret = walk_up_tree(trans, root, path, wc, parent_level);
6466                 if (wret < 0)
6467                         ret = wret;
6468                 if (wret != 0)
6469                         break;
6470         }
6471
6472         kfree(wc);
6473         btrfs_free_path(path);
6474         return ret;
6475 }
6476
6477 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6478 {
6479         u64 num_devices;
6480         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6481                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6482
6483         /*
6484          * we add in the count of missing devices because we want
6485          * to make sure that any RAID levels on a degraded FS
6486          * continue to be honored.
6487          */
6488         num_devices = root->fs_info->fs_devices->rw_devices +
6489                 root->fs_info->fs_devices->missing_devices;
6490
6491         if (num_devices == 1) {
6492                 stripped |= BTRFS_BLOCK_GROUP_DUP;
6493                 stripped = flags & ~stripped;
6494
6495                 /* turn raid0 into single device chunks */
6496                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
6497                         return stripped;
6498
6499                 /* turn mirroring into duplication */
6500                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
6501                              BTRFS_BLOCK_GROUP_RAID10))
6502                         return stripped | BTRFS_BLOCK_GROUP_DUP;
6503                 return flags;
6504         } else {
6505                 /* they already had raid on here, just return */
6506                 if (flags & stripped)
6507                         return flags;
6508
6509                 stripped |= BTRFS_BLOCK_GROUP_DUP;
6510                 stripped = flags & ~stripped;
6511
6512                 /* switch duplicated blocks with raid1 */
6513                 if (flags & BTRFS_BLOCK_GROUP_DUP)
6514                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
6515
6516                 /* turn single device chunks into raid0 */
6517                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
6518         }
6519         return flags;
6520 }
6521
6522 static int set_block_group_ro(struct btrfs_block_group_cache *cache)
6523 {
6524         struct btrfs_space_info *sinfo = cache->space_info;
6525         u64 num_bytes;
6526         int ret = -ENOSPC;
6527
6528         if (cache->ro)
6529                 return 0;
6530
6531         spin_lock(&sinfo->lock);
6532         spin_lock(&cache->lock);
6533         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6534                     cache->bytes_super - btrfs_block_group_used(&cache->item);
6535
6536         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6537             sinfo->bytes_may_use + sinfo->bytes_readonly +
6538             cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
6539                 sinfo->bytes_readonly += num_bytes;
6540                 sinfo->bytes_reserved += cache->reserved_pinned;
6541                 cache->reserved_pinned = 0;
6542                 cache->ro = 1;
6543                 ret = 0;
6544         }
6545
6546         spin_unlock(&cache->lock);
6547         spin_unlock(&sinfo->lock);
6548         return ret;
6549 }
6550
6551 int btrfs_set_block_group_ro(struct btrfs_root *root,
6552                              struct btrfs_block_group_cache *cache)
6553
6554 {
6555         struct btrfs_trans_handle *trans;
6556         u64 alloc_flags;
6557         int ret;
6558
6559         BUG_ON(cache->ro);
6560
6561         trans = btrfs_join_transaction(root);
6562         BUG_ON(IS_ERR(trans));
6563
6564         alloc_flags = update_block_group_flags(root, cache->flags);
6565         if (alloc_flags != cache->flags)
6566                 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6567                                CHUNK_ALLOC_FORCE);
6568
6569         ret = set_block_group_ro(cache);
6570         if (!ret)
6571                 goto out;
6572         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
6573         ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6574                              CHUNK_ALLOC_FORCE);
6575         if (ret < 0)
6576                 goto out;
6577         ret = set_block_group_ro(cache);
6578 out:
6579         btrfs_end_transaction(trans, root);
6580         return ret;
6581 }
6582
6583 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
6584                             struct btrfs_root *root, u64 type)
6585 {
6586         u64 alloc_flags = get_alloc_profile(root, type);
6587         return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6588                               CHUNK_ALLOC_FORCE);
6589 }
6590
6591 /*
6592  * helper to account the unused space of all the readonly block group in the
6593  * list. takes mirrors into account.
6594  */
6595 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
6596 {
6597         struct btrfs_block_group_cache *block_group;
6598         u64 free_bytes = 0;
6599         int factor;
6600
6601         list_for_each_entry(block_group, groups_list, list) {
6602                 spin_lock(&block_group->lock);
6603
6604                 if (!block_group->ro) {
6605                         spin_unlock(&block_group->lock);
6606                         continue;
6607                 }
6608
6609                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
6610                                           BTRFS_BLOCK_GROUP_RAID10 |
6611                                           BTRFS_BLOCK_GROUP_DUP))
6612                         factor = 2;
6613                 else
6614                         factor = 1;
6615
6616                 free_bytes += (block_group->key.offset -
6617                                btrfs_block_group_used(&block_group->item)) *
6618                                factor;
6619
6620                 spin_unlock(&block_group->lock);
6621         }
6622
6623         return free_bytes;
6624 }
6625
6626 /*
6627  * helper to account the unused space of all the readonly block group in the
6628  * space_info. takes mirrors into account.
6629  */
6630 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6631 {
6632         int i;
6633         u64 free_bytes = 0;
6634
6635         spin_lock(&sinfo->lock);
6636
6637         for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
6638                 if (!list_empty(&sinfo->block_groups[i]))
6639                         free_bytes += __btrfs_get_ro_block_group_free_space(
6640                                                 &sinfo->block_groups[i]);
6641
6642         spin_unlock(&sinfo->lock);
6643
6644         return free_bytes;
6645 }
6646
6647 int btrfs_set_block_group_rw(struct btrfs_root *root,
6648                               struct btrfs_block_group_cache *cache)
6649 {
6650         struct btrfs_space_info *sinfo = cache->space_info;
6651         u64 num_bytes;
6652
6653         BUG_ON(!cache->ro);
6654
6655         spin_lock(&sinfo->lock);
6656         spin_lock(&cache->lock);
6657         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6658                     cache->bytes_super - btrfs_block_group_used(&cache->item);
6659         sinfo->bytes_readonly -= num_bytes;
6660         cache->ro = 0;
6661         spin_unlock(&cache->lock);
6662         spin_unlock(&sinfo->lock);
6663         return 0;
6664 }
6665
6666 /*
6667  * checks to see if its even possible to relocate this block group.
6668  *
6669  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
6670  * ok to go ahead and try.
6671  */
6672 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6673 {
6674         struct btrfs_block_group_cache *block_group;
6675         struct btrfs_space_info *space_info;
6676         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6677         struct btrfs_device *device;
6678         int full = 0;
6679         int ret = 0;
6680
6681         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
6682
6683         /* odd, couldn't find the block group, leave it alone */
6684         if (!block_group)
6685                 return -1;
6686
6687         /* no bytes used, we're good */
6688         if (!btrfs_block_group_used(&block_group->item))
6689                 goto out;
6690
6691         space_info = block_group->space_info;
6692         spin_lock(&space_info->lock);
6693
6694         full = space_info->full;
6695
6696         /*
6697          * if this is the last block group we have in this space, we can't
6698          * relocate it unless we're able to allocate a new chunk below.
6699          *
6700          * Otherwise, we need to make sure we have room in the space to handle
6701          * all of the extents from this block group.  If we can, we're good
6702          */
6703         if ((space_info->total_bytes != block_group->key.offset) &&
6704            (space_info->bytes_used + space_info->bytes_reserved +
6705             space_info->bytes_pinned + space_info->bytes_readonly +
6706             btrfs_block_group_used(&block_group->item) <
6707             space_info->total_bytes)) {
6708                 spin_unlock(&space_info->lock);
6709                 goto out;
6710         }
6711         spin_unlock(&space_info->lock);
6712
6713         /*
6714          * ok we don't have enough space, but maybe we have free space on our
6715          * devices to allocate new chunks for relocation, so loop through our
6716          * alloc devices and guess if we have enough space.  However, if we
6717          * were marked as full, then we know there aren't enough chunks, and we
6718          * can just return.
6719          */
6720         ret = -1;
6721         if (full)
6722                 goto out;
6723
6724         mutex_lock(&root->fs_info->chunk_mutex);
6725         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
6726                 u64 min_free = btrfs_block_group_used(&block_group->item);
6727                 u64 dev_offset;
6728
6729                 /*
6730                  * check to make sure we can actually find a chunk with enough
6731                  * space to fit our block group in.
6732                  */
6733                 if (device->total_bytes > device->bytes_used + min_free) {
6734                         ret = find_free_dev_extent(NULL, device, min_free,
6735                                                    &dev_offset, NULL);
6736                         if (!ret)
6737                                 break;
6738                         ret = -1;
6739                 }
6740         }
6741         mutex_unlock(&root->fs_info->chunk_mutex);
6742 out:
6743         btrfs_put_block_group(block_group);
6744         return ret;
6745 }
6746
6747 static int find_first_block_group(struct btrfs_root *root,
6748                 struct btrfs_path *path, struct btrfs_key *key)
6749 {
6750         int ret = 0;
6751         struct btrfs_key found_key;
6752         struct extent_buffer *leaf;
6753         int slot;
6754
6755         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
6756         if (ret < 0)
6757                 goto out;
6758
6759         while (1) {
6760                 slot = path->slots[0];
6761                 leaf = path->nodes[0];
6762                 if (slot >= btrfs_header_nritems(leaf)) {
6763                         ret = btrfs_next_leaf(root, path);
6764                         if (ret == 0)
6765                                 continue;
6766                         if (ret < 0)
6767                                 goto out;
6768                         break;
6769                 }
6770                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6771
6772                 if (found_key.objectid >= key->objectid &&
6773                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6774                         ret = 0;
6775                         goto out;
6776                 }
6777                 path->slots[0]++;
6778         }
6779 out:
6780         return ret;
6781 }
6782
6783 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
6784 {
6785         struct btrfs_block_group_cache *block_group;
6786         u64 last = 0;
6787
6788         while (1) {
6789                 struct inode *inode;
6790
6791                 block_group = btrfs_lookup_first_block_group(info, last);
6792                 while (block_group) {
6793                         spin_lock(&block_group->lock);
6794                         if (block_group->iref)
6795                                 break;
6796                         spin_unlock(&block_group->lock);
6797                         block_group = next_block_group(info->tree_root,
6798                                                        block_group);
6799                 }
6800                 if (!block_group) {
6801                         if (last == 0)
6802                                 break;
6803                         last = 0;
6804                         continue;
6805                 }
6806
6807                 inode = block_group->inode;
6808                 block_group->iref = 0;
6809                 block_group->inode = NULL;
6810                 spin_unlock(&block_group->lock);
6811                 iput(inode);
6812                 last = block_group->key.objectid + block_group->key.offset;
6813                 btrfs_put_block_group(block_group);
6814         }
6815 }
6816
6817 int btrfs_free_block_groups(struct btrfs_fs_info *info)
6818 {
6819         struct btrfs_block_group_cache *block_group;
6820         struct btrfs_space_info *space_info;
6821         struct btrfs_caching_control *caching_ctl;
6822         struct rb_node *n;
6823
6824         down_write(&info->extent_commit_sem);
6825         while (!list_empty(&info->caching_block_groups)) {
6826                 caching_ctl = list_entry(info->caching_block_groups.next,
6827                                          struct btrfs_caching_control, list);
6828                 list_del(&caching_ctl->list);
6829                 put_caching_control(caching_ctl);
6830         }
6831         up_write(&info->extent_commit_sem);
6832
6833         spin_lock(&info->block_group_cache_lock);
6834         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
6835                 block_group = rb_entry(n, struct btrfs_block_group_cache,
6836                                        cache_node);
6837                 rb_erase(&block_group->cache_node,
6838                          &info->block_group_cache_tree);
6839                 spin_unlock(&info->block_group_cache_lock);
6840
6841                 down_write(&block_group->space_info->groups_sem);
6842                 list_del(&block_group->list);
6843                 up_write(&block_group->space_info->groups_sem);
6844
6845                 if (block_group->cached == BTRFS_CACHE_STARTED)
6846                         wait_block_group_cache_done(block_group);
6847
6848                 /*
6849                  * We haven't cached this block group, which means we could
6850                  * possibly have excluded extents on this block group.
6851                  */
6852                 if (block_group->cached == BTRFS_CACHE_NO)
6853                         free_excluded_extents(info->extent_root, block_group);
6854
6855                 btrfs_remove_free_space_cache(block_group);
6856                 btrfs_put_block_group(block_group);
6857
6858                 spin_lock(&info->block_group_cache_lock);
6859         }
6860         spin_unlock(&info->block_group_cache_lock);
6861
6862         /* now that all the block groups are freed, go through and
6863          * free all the space_info structs.  This is only called during
6864          * the final stages of unmount, and so we know nobody is
6865          * using them.  We call synchronize_rcu() once before we start,
6866          * just to be on the safe side.
6867          */
6868         synchronize_rcu();
6869
6870         release_global_block_rsv(info);
6871
6872         while(!list_empty(&info->space_info)) {
6873                 space_info = list_entry(info->space_info.next,
6874                                         struct btrfs_space_info,
6875                                         list);
6876                 if (space_info->bytes_pinned > 0 ||
6877                     space_info->bytes_reserved > 0) {
6878                         WARN_ON(1);
6879                         dump_space_info(space_info, 0, 0);
6880                 }
6881                 list_del(&space_info->list);
6882                 kfree(space_info);
6883         }
6884         return 0;
6885 }
6886
6887 static void __link_block_group(struct btrfs_space_info *space_info,
6888                                struct btrfs_block_group_cache *cache)
6889 {
6890         int index = get_block_group_index(cache);
6891
6892         down_write(&space_info->groups_sem);
6893         list_add_tail(&cache->list, &space_info->block_groups[index]);
6894         up_write(&space_info->groups_sem);
6895 }
6896
6897 int btrfs_read_block_groups(struct btrfs_root *root)
6898 {
6899         struct btrfs_path *path;
6900         int ret;
6901         struct btrfs_block_group_cache *cache;
6902         struct btrfs_fs_info *info = root->fs_info;
6903         struct btrfs_space_info *space_info;
6904         struct btrfs_key key;
6905         struct btrfs_key found_key;
6906         struct extent_buffer *leaf;
6907         int need_clear = 0;
6908         u64 cache_gen;
6909
6910         root = info->extent_root;
6911         key.objectid = 0;
6912         key.offset = 0;
6913         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
6914         path = btrfs_alloc_path();
6915         if (!path)
6916                 return -ENOMEM;
6917         path->reada = 1;
6918
6919         cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
6920         if (cache_gen != 0 &&
6921             btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
6922                 need_clear = 1;
6923         if (btrfs_test_opt(root, CLEAR_CACHE))
6924                 need_clear = 1;
6925         if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
6926                 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
6927
6928         while (1) {
6929                 ret = find_first_block_group(root, path, &key);
6930                 if (ret > 0)
6931                         break;
6932                 if (ret != 0)
6933                         goto error;
6934                 leaf = path->nodes[0];
6935                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6936                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6937                 if (!cache) {
6938                         ret = -ENOMEM;
6939                         goto error;
6940                 }
6941                 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
6942                                                 GFP_NOFS);
6943                 if (!cache->free_space_ctl) {
6944                         kfree(cache);
6945                         ret = -ENOMEM;
6946                         goto error;
6947                 }
6948
6949                 atomic_set(&cache->count, 1);
6950                 spin_lock_init(&cache->lock);
6951                 cache->fs_info = info;
6952                 INIT_LIST_HEAD(&cache->list);
6953                 INIT_LIST_HEAD(&cache->cluster_list);
6954
6955                 if (need_clear)
6956                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6957
6958                 read_extent_buffer(leaf, &cache->item,
6959                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
6960                                    sizeof(cache->item));
6961                 memcpy(&cache->key, &found_key, sizeof(found_key));
6962
6963                 key.objectid = found_key.objectid + found_key.offset;
6964                 btrfs_release_path(path);
6965                 cache->flags = btrfs_block_group_flags(&cache->item);
6966                 cache->sectorsize = root->sectorsize;
6967
6968                 btrfs_init_free_space_ctl(cache);
6969
6970                 /*
6971                  * We need to exclude the super stripes now so that the space
6972                  * info has super bytes accounted for, otherwise we'll think
6973                  * we have more space than we actually do.
6974                  */
6975                 exclude_super_stripes(root, cache);
6976
6977                 /*
6978                  * check for two cases, either we are full, and therefore
6979                  * don't need to bother with the caching work since we won't
6980                  * find any space, or we are empty, and we can just add all
6981                  * the space in and be done with it.  This saves us _alot_ of
6982                  * time, particularly in the full case.
6983                  */
6984                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
6985                         cache->last_byte_to_unpin = (u64)-1;
6986                         cache->cached = BTRFS_CACHE_FINISHED;
6987                         free_excluded_extents(root, cache);
6988                 } else if (btrfs_block_group_used(&cache->item) == 0) {
6989                         cache->last_byte_to_unpin = (u64)-1;
6990                         cache->cached = BTRFS_CACHE_FINISHED;
6991                         add_new_free_space(cache, root->fs_info,
6992                                            found_key.objectid,
6993                                            found_key.objectid +
6994                                            found_key.offset);
6995                         free_excluded_extents(root, cache);
6996                 }
6997
6998                 ret = update_space_info(info, cache->flags, found_key.offset,
6999                                         btrfs_block_group_used(&cache->item),
7000                                         &space_info);
7001                 BUG_ON(ret);
7002                 cache->space_info = space_info;
7003                 spin_lock(&cache->space_info->lock);
7004                 cache->space_info->bytes_readonly += cache->bytes_super;
7005                 spin_unlock(&cache->space_info->lock);
7006
7007                 __link_block_group(space_info, cache);
7008
7009                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7010                 BUG_ON(ret);
7011
7012                 set_avail_alloc_bits(root->fs_info, cache->flags);
7013                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7014                         set_block_group_ro(cache);
7015         }
7016
7017         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7018                 if (!(get_alloc_profile(root, space_info->flags) &
7019                       (BTRFS_BLOCK_GROUP_RAID10 |
7020                        BTRFS_BLOCK_GROUP_RAID1 |
7021                        BTRFS_BLOCK_GROUP_DUP)))
7022                         continue;
7023                 /*
7024                  * avoid allocating from un-mirrored block group if there are
7025                  * mirrored block groups.
7026                  */
7027                 list_for_each_entry(cache, &space_info->block_groups[3], list)
7028                         set_block_group_ro(cache);
7029                 list_for_each_entry(cache, &space_info->block_groups[4], list)
7030                         set_block_group_ro(cache);
7031         }
7032
7033         init_global_block_rsv(info);
7034         ret = 0;
7035 error:
7036         btrfs_free_path(path);
7037         return ret;
7038 }
7039
7040 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7041                            struct btrfs_root *root, u64 bytes_used,
7042                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7043                            u64 size)
7044 {
7045         int ret;
7046         struct btrfs_root *extent_root;
7047         struct btrfs_block_group_cache *cache;
7048
7049         extent_root = root->fs_info->extent_root;
7050
7051         root->fs_info->last_trans_log_full_commit = trans->transid;
7052
7053         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7054         if (!cache)
7055                 return -ENOMEM;
7056         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7057                                         GFP_NOFS);
7058         if (!cache->free_space_ctl) {
7059                 kfree(cache);
7060                 return -ENOMEM;
7061         }
7062
7063         cache->key.objectid = chunk_offset;
7064         cache->key.offset = size;
7065         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7066         cache->sectorsize = root->sectorsize;
7067         cache->fs_info = root->fs_info;
7068
7069         atomic_set(&cache->count, 1);
7070         spin_lock_init(&cache->lock);
7071         INIT_LIST_HEAD(&cache->list);
7072         INIT_LIST_HEAD(&cache->cluster_list);
7073
7074         btrfs_init_free_space_ctl(cache);
7075
7076         btrfs_set_block_group_used(&cache->item, bytes_used);
7077         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7078         cache->flags = type;
7079         btrfs_set_block_group_flags(&cache->item, type);
7080
7081         cache->last_byte_to_unpin = (u64)-1;
7082         cache->cached = BTRFS_CACHE_FINISHED;
7083         exclude_super_stripes(root, cache);
7084
7085         add_new_free_space(cache, root->fs_info, chunk_offset,
7086                            chunk_offset + size);
7087
7088         free_excluded_extents(root, cache);
7089
7090         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7091                                 &cache->space_info);
7092         BUG_ON(ret);
7093
7094         spin_lock(&cache->space_info->lock);
7095         cache->space_info->bytes_readonly += cache->bytes_super;
7096         spin_unlock(&cache->space_info->lock);
7097
7098         __link_block_group(cache->space_info, cache);
7099
7100         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7101         BUG_ON(ret);
7102
7103         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7104                                 sizeof(cache->item));
7105         BUG_ON(ret);
7106
7107         set_avail_alloc_bits(extent_root->fs_info, type);
7108
7109         return 0;
7110 }
7111
7112 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7113                              struct btrfs_root *root, u64 group_start)
7114 {
7115         struct btrfs_path *path;
7116         struct btrfs_block_group_cache *block_group;
7117         struct btrfs_free_cluster *cluster;
7118         struct btrfs_root *tree_root = root->fs_info->tree_root;
7119         struct btrfs_key key;
7120         struct inode *inode;
7121         int ret;
7122         int factor;
7123
7124         root = root->fs_info->extent_root;
7125
7126         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7127         BUG_ON(!block_group);
7128         BUG_ON(!block_group->ro);
7129
7130         /*
7131          * Free the reserved super bytes from this block group before
7132          * remove it.
7133          */
7134         free_excluded_extents(root, block_group);
7135
7136         memcpy(&key, &block_group->key, sizeof(key));
7137         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7138                                   BTRFS_BLOCK_GROUP_RAID1 |
7139                                   BTRFS_BLOCK_GROUP_RAID10))
7140                 factor = 2;
7141         else
7142                 factor = 1;
7143
7144         /* make sure this block group isn't part of an allocation cluster */
7145         cluster = &root->fs_info->data_alloc_cluster;
7146         spin_lock(&cluster->refill_lock);
7147         btrfs_return_cluster_to_free_space(block_group, cluster);
7148         spin_unlock(&cluster->refill_lock);
7149
7150         /*
7151          * make sure this block group isn't part of a metadata
7152          * allocation cluster
7153          */
7154         cluster = &root->fs_info->meta_alloc_cluster;
7155         spin_lock(&cluster->refill_lock);
7156         btrfs_return_cluster_to_free_space(block_group, cluster);
7157         spin_unlock(&cluster->refill_lock);
7158
7159         path = btrfs_alloc_path();
7160         BUG_ON(!path);
7161
7162         inode = lookup_free_space_inode(root, block_group, path);
7163         if (!IS_ERR(inode)) {
7164                 btrfs_orphan_add(trans, inode);
7165                 clear_nlink(inode);
7166                 /* One for the block groups ref */
7167                 spin_lock(&block_group->lock);
7168                 if (block_group->iref) {
7169                         block_group->iref = 0;
7170                         block_group->inode = NULL;
7171                         spin_unlock(&block_group->lock);
7172                         iput(inode);
7173                 } else {
7174                         spin_unlock(&block_group->lock);
7175                 }
7176                 /* One for our lookup ref */
7177                 iput(inode);
7178         }
7179
7180         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7181         key.offset = block_group->key.objectid;
7182         key.type = 0;
7183
7184         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7185         if (ret < 0)
7186                 goto out;
7187         if (ret > 0)
7188                 btrfs_release_path(path);
7189         if (ret == 0) {
7190                 ret = btrfs_del_item(trans, tree_root, path);
7191                 if (ret)
7192                         goto out;
7193                 btrfs_release_path(path);
7194         }
7195
7196         spin_lock(&root->fs_info->block_group_cache_lock);
7197         rb_erase(&block_group->cache_node,
7198                  &root->fs_info->block_group_cache_tree);
7199         spin_unlock(&root->fs_info->block_group_cache_lock);
7200
7201         down_write(&block_group->space_info->groups_sem);
7202         /*
7203          * we must use list_del_init so people can check to see if they
7204          * are still on the list after taking the semaphore
7205          */
7206         list_del_init(&block_group->list);
7207         up_write(&block_group->space_info->groups_sem);
7208
7209         if (block_group->cached == BTRFS_CACHE_STARTED)
7210                 wait_block_group_cache_done(block_group);
7211
7212         btrfs_remove_free_space_cache(block_group);
7213
7214         spin_lock(&block_group->space_info->lock);
7215         block_group->space_info->total_bytes -= block_group->key.offset;
7216         block_group->space_info->bytes_readonly -= block_group->key.offset;
7217         block_group->space_info->disk_total -= block_group->key.offset * factor;
7218         spin_unlock(&block_group->space_info->lock);
7219
7220         memcpy(&key, &block_group->key, sizeof(key));
7221
7222         btrfs_clear_space_info_full(root->fs_info);
7223
7224         btrfs_put_block_group(block_group);
7225         btrfs_put_block_group(block_group);
7226
7227         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7228         if (ret > 0)
7229                 ret = -EIO;
7230         if (ret < 0)
7231                 goto out;
7232
7233         ret = btrfs_del_item(trans, root, path);
7234 out:
7235         btrfs_free_path(path);
7236         return ret;
7237 }
7238
7239 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7240 {
7241         struct btrfs_space_info *space_info;
7242         struct btrfs_super_block *disk_super;
7243         u64 features;
7244         u64 flags;
7245         int mixed = 0;
7246         int ret;
7247
7248         disk_super = &fs_info->super_copy;
7249         if (!btrfs_super_root(disk_super))
7250                 return 1;
7251
7252         features = btrfs_super_incompat_flags(disk_super);
7253         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7254                 mixed = 1;
7255
7256         flags = BTRFS_BLOCK_GROUP_SYSTEM;
7257         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7258         if (ret)
7259                 goto out;
7260
7261         if (mixed) {
7262                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7263                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7264         } else {
7265                 flags = BTRFS_BLOCK_GROUP_METADATA;
7266                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7267                 if (ret)
7268                         goto out;
7269
7270                 flags = BTRFS_BLOCK_GROUP_DATA;
7271                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7272         }
7273 out:
7274         return ret;
7275 }
7276
7277 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7278 {
7279         return unpin_extent_range(root, start, end);
7280 }
7281
7282 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7283                                u64 num_bytes, u64 *actual_bytes)
7284 {
7285         return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7286 }
7287
7288 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7289 {
7290         struct btrfs_fs_info *fs_info = root->fs_info;
7291         struct btrfs_block_group_cache *cache = NULL;
7292         u64 group_trimmed;
7293         u64 start;
7294         u64 end;
7295         u64 trimmed = 0;
7296         int ret = 0;
7297
7298         cache = btrfs_lookup_block_group(fs_info, range->start);
7299
7300         while (cache) {
7301                 if (cache->key.objectid >= (range->start + range->len)) {
7302                         btrfs_put_block_group(cache);
7303                         break;
7304                 }
7305
7306                 start = max(range->start, cache->key.objectid);
7307                 end = min(range->start + range->len,
7308                                 cache->key.objectid + cache->key.offset);
7309
7310                 if (end - start >= range->minlen) {
7311                         if (!block_group_cache_done(cache)) {
7312                                 ret = cache_block_group(cache, NULL, root, 0);
7313                                 if (!ret)
7314                                         wait_block_group_cache_done(cache);
7315                         }
7316                         ret = btrfs_trim_block_group(cache,
7317                                                      &group_trimmed,
7318                                                      start,
7319                                                      end,
7320                                                      range->minlen);
7321
7322                         trimmed += group_trimmed;
7323                         if (ret) {
7324                                 btrfs_put_block_group(cache);
7325                                 break;
7326                         }
7327                 }
7328
7329                 cache = next_block_group(fs_info->tree_root, cache);
7330         }
7331
7332         range->len = trimmed;
7333         return ret;
7334 }