/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"

/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

/*
 * compare two delayed tree backrefs with same bytenr and type
 */
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
			  struct btrfs_delayed_tree_ref *ref1)
{
	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
			  struct btrfs_delayed_data_ref *ref1)
{
	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
		if (ref1->objectid < ref2->objectid)
			return -1;
		if (ref1->objectid > ref2->objectid)
			return 1;
		if (ref1->offset < ref2->offset)
			return -1;
		if (ref1->offset > ref2->offset)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * entries in the rb tree are ordered by the byte number of the extent,
 * type of the delayed backrefs and content of delayed backrefs.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref2,
		      struct btrfs_delayed_ref_node *ref1)
{
	if (ref1->bytenr < ref2->bytenr)
		return -1;
	if (ref1->bytenr > ref2->bytenr)
		return 1;
	if (ref1->is_head && ref2->is_head)
		return 0;
	if (ref2->is_head)
		return -1;
	if (ref1->is_head)
		return 1;
	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
				      btrfs_delayed_node_to_tree_ref(ref1));
	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
				      btrfs_delayed_node_to_data_ref(ref1));
	}
	BUG();
	return 0;
}

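/*
 * Note on argument order: tree_insert() below calls comp_entry(existing, new),
 * so ref1 is always the node being inserted.  A negative return sends the new
 * node into the left subtree, a positive return into the right one.
 */
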
/*
 * insert a new ref into the rbtree.  This returns any existing refs
 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
 * inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
						  struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	struct btrfs_delayed_ref_node *ins;
	int cmp;

	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 rb_node);

		cmp = comp_entry(entry, ins);
		if (cmp < 0)
			p = &(*p)->rb_left;
		else if (cmp > 0)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	return NULL;
}

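/*
 * When tree_insert() returns an existing node, the caller is expected to merge
 * the new ref into it (update_existing_ref() or update_existing_head_ref())
 * and then free the ref it tried to insert.
 */
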
/*
 * find a head entry based on bytenr. This returns the delayed ref
 * head if it was able to find one, or NULL if nothing was in that spot
 */
static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
				  u64 bytenr,
				  struct btrfs_delayed_ref_node **last)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);
		if (last)
			*last = entry;

		if (bytenr < entry->bytenr)
			cmp = -1;
		else if (bytenr > entry->bytenr)
			cmp = 1;
		else if (!btrfs_delayed_ref_is_head(entry))
			cmp = 1;
		else
			cmp = 0;

		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

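/*
 * The optional *last argument is filled with the last entry examined during
 * the walk.  btrfs_find_ref_cluster() uses it to pick a starting point near
 * the requested bytenr even when no exact head match exists.
 */
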
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	assert_spin_locked(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	atomic_inc(&head->node.refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (!head->node.in_tree) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(&head->node);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref(&head->node);
	return 0;
}

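/*
 * The dance above exists because the head mutex can only be taken after the
 * delayed_refs spinlock is dropped.  We hold an extra reference across the
 * blocking mutex_lock() so the head cannot be freed, and return -EAGAIN if it
 * was removed from the tree in the meantime.
 */
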
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
			   struct list_head *cluster, u64 start)
{
	int count = 0;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;

	delayed_refs = &trans->transaction->delayed_refs;
	if (start == 0) {
		node = rb_first(&delayed_refs->root);
	} else {
		ref = NULL;
		find_ref_head(&delayed_refs->root, start, &ref);
		if (ref) {
			struct btrfs_delayed_ref_node *tmp;

			node = rb_prev(&ref->rb_node);
			while (node) {
				tmp = rb_entry(node,
					       struct btrfs_delayed_ref_node,
					       rb_node);
				if (tmp->bytenr < start)
					break;
				ref = tmp;
				node = rb_prev(&ref->rb_node);
			}
			node = &ref->rb_node;
		} else
			node = rb_first(&delayed_refs->root);
	}
again:
	while (node && count < 32) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(ref)) {
			head = btrfs_delayed_node_to_head(ref);
			if (list_empty(&head->cluster)) {
				list_add_tail(&head->cluster, cluster);
				delayed_refs->run_delayed_start =
					head->node.bytenr;
				count++;

				WARN_ON(delayed_refs->num_heads_ready == 0);
				delayed_refs->num_heads_ready--;
			} else if (count) {
				/* the goal of the clustering is to find extents
				 * that are likely to end up in the same extent
				 * leaf on disk.  So, we don't want them spread
				 * all over the tree.  Stop now if we've hit
				 * a head that was already in use
				 */
				break;
			}
		}
		node = rb_next(node);
	}
	if (count) {
		return 0;
	} else if (start) {
		/*
		 * we've gone to the end of the rbtree without finding any
		 * clusters.  start from the beginning and try again
		 */
		start = 0;
		node = rb_first(&delayed_refs->root);
		goto again;
	}
	return 1;
}

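/*
 * A cluster is capped at 32 heads per call; run_delayed_start remembers where
 * this batch ended so the next call can resume from roughly the same place in
 * the tree instead of rescanning from the beginning.
 */
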
/*
 * This checks to see if there are any delayed refs in the
 * btree for a given bytenr.  It returns one if it finds any
 * and zero otherwise.
 *
 * If it only finds a head node, it returns 0.
 *
 * The idea is to use this when deciding if you can safely delete an
 * extent from the extent allocation tree.  There may be a pending
 * ref in the rbtree that adds or removes references, so as long as this
 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
 * allocation tree.
 */
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *prev_node;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
	if (ref) {
		prev_node = rb_prev(&ref->rb_node);
		if (!prev_node)
			goto out;
		ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
			       rb_node);
		if (ref->bytenr == bytenr)
			ret = 1;
	}
out:
	spin_unlock(&delayed_refs->lock);
	return ret;
}

/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
		    struct btrfs_delayed_ref_root *delayed_refs,
		    struct btrfs_delayed_ref_node *existing,
		    struct btrfs_delayed_ref_node *update)
{
	if (update->action != existing->action) {
		/*
		 * this is effectively undoing either an add or a
		 * drop.  We decrement the ref_mod, and if it goes
		 * down to zero we just delete the entry without
		 * ever changing the extent allocation tree.
		 */
		existing->ref_mod--;
		if (existing->ref_mod == 0) {
			rb_erase(&existing->rb_node,
				 &delayed_refs->root);
			existing->in_tree = 0;
			btrfs_put_delayed_ref(existing);
			delayed_refs->num_entries--;
			if (trans->delayed_ref_updates)
				trans->delayed_ref_updates--;
		} else {
			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	} else {
		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
		/*
		 * the action on the existing ref matches
		 * the action on the ref we're trying to add.
		 * Bump the ref_mod by one so the backref that
		 * is eventually added/removed has the correct
		 * reference count
		 */
		existing->ref_mod += update->ref_mod;
	}
}

/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
			 struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref_head *existing_ref;
	struct btrfs_delayed_ref_head *ref;

	existing_ref = btrfs_delayed_node_to_head(existing);
	ref = btrfs_delayed_node_to_head(update);
	BUG_ON(existing_ref->is_data != ref->is_data);

	if (ref->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing_ref->must_insert_reserved = ref->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (ref->extent_op) {
		if (!existing_ref->extent_op) {
			existing_ref->extent_op = ref->extent_op;
		} else {
			if (ref->extent_op->update_key) {
				memcpy(&existing_ref->extent_op->key,
				       &ref->extent_op->key,
				       sizeof(ref->extent_op->key));
				existing_ref->extent_op->update_key = 1;
			}
			if (ref->extent_op->update_flags) {
				existing_ref->extent_op->flags_to_set |=
					ref->extent_op->flags_to_set;
				existing_ref->extent_op->update_flags = 1;
			}
			kfree(ref->extent_op);
		}
	}
	/*
	 * update the reference mod on the head to reflect this new operation
	 */
	existing->ref_mod += update->ref_mod;
}

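/*
 * When both heads carry an extent_op, the key and flag updates are merged into
 * the op that is already queued and the duplicate op from the new head is
 * freed, so only one btrfs_delayed_extent_op survives per head.
 */
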
/*
 * helper function to actually insert a head node into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count.
 */
static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes,
					 int action, int is_data)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_ref_head *head_ref = NULL;
	struct btrfs_delayed_ref_root *delayed_refs;
	int count_mod = 1;
	int must_insert_reserved = 0;

	/*
	 * the head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (action == BTRFS_UPDATE_DELAYED_HEAD)
		count_mod = 0;
	else if (action == BTRFS_DROP_DELAYED_REF)
		count_mod = -1;

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
	 * the reserved accounting when the extent is finally added, or
	 * if a later modification deletes the delayed ref without ever
	 * inserting the extent into the extent allocation tree.
	 * ref->must_insert_reserved is the flag used to record
	 * that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		must_insert_reserved = 1;
	else
		must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = count_mod;
	ref->type = 0;
	ref->action = 0;
	ref->is_head = 1;
	ref->in_tree = 1;

	head_ref = btrfs_delayed_node_to_head(ref);
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->is_data = is_data;

	INIT_LIST_HEAD(&head_ref->cluster);
	mutex_init(&head_ref->mutex);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_head_ref(existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * helper to insert a delayed tree ref into the rbtree.
 */
static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes, u64 parent,
					 u64 ref_root, int level, int action)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_tree_ref *full_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	full_ref = btrfs_delayed_node_to_tree_ref(ref);
	if (parent) {
		full_ref->parent = parent;
		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
	} else {
		full_ref->root = ref_root;
		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
	}
	full_ref->level = level;

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_ref(trans, delayed_refs, existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * helper to insert a delayed data ref into the rbtree.
 */
static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes, u64 parent,
					 u64 ref_root, u64 owner, u64 offset,
					 int action)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_data_ref *full_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	full_ref = btrfs_delayed_node_to_data_ref(ref);
	if (parent) {
		full_ref->parent = parent;
		ref->type = BTRFS_SHARED_DATA_REF_KEY;
	} else {
		full_ref->root = ref_root;
		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	full_ref->objectid = owner;
	full_ref->offset = offset;

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_ref(trans, delayed_refs, existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * add a delayed tree ref.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 ref_root, int level, int action,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	BUG_ON(extent_op && extent_op->is_data);
	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
				   action, 0);
	BUG_ON(ret);

	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
				   parent, ref_root, level, action);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}

/*
 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes,
			       u64 parent, u64 ref_root,
			       u64 owner, u64 offset, int action,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	BUG_ON(extent_op && !extent_op->is_data);
	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
				   action, 1);
	BUG_ON(ret);

	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
				   parent, ref_root, owner, offset, action);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}

int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
				   extent_op->is_data);
	BUG_ON(ret);

	spin_unlock(&delayed_refs->lock);
	return 0;
}

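/*
 * Unlike the tree/data variants above, this only queues a head node with
 * BTRFS_UPDATE_DELAYED_HEAD, so the reference count is left alone and just
 * the extent_op (key/flag updates) is recorded against the extent.
 */
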
/*
 * this does a simple search for the head node for a given extent.
 * It must be called with the delayed ref spinlock held, and it returns
 * the head node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
	if (ref)
		return btrfs_delayed_node_to_head(ref);
	return NULL;
}

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 *
 * The main point of this call is to add and remove a backreference in a single
 * shot, taking the lock only once, and only searching for the head node once.
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 orig_parent,
			  u64 parent, u64 orig_ref_root, u64 ref_root,
			  u64 orig_ref_generation, u64 ref_generation,
			  u64 owner_objectid, int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref *old_ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
	if (!old_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet.  It will get updated later via an add/drop
	 * pair.
	 */
	if (parent == 0)
		parent = bytenr;
	if (orig_parent == 0)
		orig_parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		kfree(old_ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0,
				      BTRFS_UPDATE_DELAYED_HEAD, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
				      orig_parent, orig_ref_root,
				      orig_ref_generation, owner_objectid,
				      BTRFS_DROP_DELAYED_REF, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}