/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */
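
/*
 * Rough life cycle of a delayed ref (a sketch, not an exhaustive list of
 * callers): the extent allocation code queues a modification with
 * btrfs_add_delayed_tree_ref(), btrfs_add_delayed_data_ref() or
 * btrfs_add_delayed_extent_op() below, the ref sits in the per-transaction
 * rb tree where duplicate updates for the same extent are merged, and the
 * queued work is finally applied to the extent allocation tree by
 * btrfs_run_delayed_refs() before the transaction commits.
 */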
/*
 * compare two delayed tree backrefs with same bytenr and type
 */
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
			  struct btrfs_delayed_tree_ref *ref1)
{
	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}
/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
			  struct btrfs_delayed_data_ref *ref1)
{
	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
		if (ref1->objectid < ref2->objectid)
			return -1;
		if (ref1->objectid > ref2->objectid)
			return 1;
		if (ref1->offset < ref2->offset)
			return -1;
		if (ref1->offset > ref2->offset)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}
/*
 * entries in the rb tree are ordered by the byte number of the extent,
 * type of the delayed backrefs and content of delayed backrefs.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref2,
		      struct btrfs_delayed_ref_node *ref1)
{
	if (ref1->bytenr < ref2->bytenr)
		return -1;
	if (ref1->bytenr > ref2->bytenr)
		return 1;
	if (ref1->is_head && ref2->is_head)
		return 0;
	if (ref2->is_head)
		return -1;
	if (ref1->is_head)
		return 1;
	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
				      btrfs_delayed_node_to_tree_ref(ref1));
	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
				      btrfs_delayed_node_to_data_ref(ref1));
	}
	BUG();
	return 0;
}
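
/*
 * Note: because bytenr is the primary sort key, all of the delayed refs for
 * a given extent end up adjacent in the rb tree, and any two head nodes for
 * the same bytenr compare equal.  As a result tree_insert() below can only
 * ever keep one head per extent; duplicates are merged through
 * update_existing_head_ref() instead of being inserted.
 */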
/*
 * insert a new ref into the rbtree.  This returns any existing refs
 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
 * inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
						  struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	struct btrfs_delayed_ref_node *ins;
	int cmp;

	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 rb_node);

		cmp = comp_entry(entry, ins);
		if (cmp < 0)
			p = &(*p)->rb_left;
		else if (cmp > 0)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	return NULL;
}
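
/*
 * Callers never leave duplicates in the tree: when tree_insert() hands back
 * an existing node, the new ref is folded into it with update_existing_ref()
 * or update_existing_head_ref() and then freed (see the add_delayed_*
 * helpers below).
 */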
/*
 * find a head entry based on bytenr. This returns the delayed ref
 * head if it was able to find one, or NULL if nothing was in that spot.
 * If last is non-NULL, it is set to the last entry examined during the
 * walk, even when no exact match is found.
 */
static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
				  u64 bytenr,
				  struct btrfs_delayed_ref_node **last)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);
		if (last)
			*last = entry;

		if (bytenr < entry->bytenr)
			cmp = -1;
		else if (bytenr > entry->bytenr)
			cmp = 1;
		else if (!btrfs_delayed_ref_is_head(entry))
			cmp = 1;
		else
			cmp = 0;

		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}
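
/*
 * Take the mutex on a delayed ref head.  The caller must already hold
 * delayed_refs->lock (the assert below checks this); if the mutex is
 * contended, the spinlock is dropped while we sleep on the mutex and is
 * re-taken afterwards.  Returns 0 with the mutex held, or -EAGAIN if the
 * head was processed and removed from the tree while we waited, in which
 * case the caller has to start over.
 */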
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	assert_spin_locked(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	atomic_inc(&head->node.refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (!head->node.in_tree) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(&head->node);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref(&head->node);
	return 0;
}
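
/*
 * Collect a cluster of up to 32 delayed ref heads into 'cluster', starting
 * at the head whose bytenr is closest to 'start'.  Heads picked together
 * here are likely to land in the same leaf of the extent allocation tree,
 * which keeps the eventual btree updates local.  The caller is expected to
 * hold delayed_refs->lock, since the rb tree is walked without taking it.
 * Returns 0 if at least one head was added to the list, 1 if the tree had
 * nothing left to cluster.
 */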
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
			   struct list_head *cluster, u64 start)
{
	int count = 0;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;

	delayed_refs = &trans->transaction->delayed_refs;
	if (start == 0) {
		node = rb_first(&delayed_refs->root);
	} else {
		ref = NULL;
		find_ref_head(&delayed_refs->root, start, &ref);
		if (ref) {
			struct btrfs_delayed_ref_node *tmp;

			node = rb_prev(&ref->rb_node);
			while (node) {
				tmp = rb_entry(node,
					       struct btrfs_delayed_ref_node,
					       rb_node);
				if (tmp->bytenr < start)
					break;
				ref = tmp;
				node = rb_prev(&ref->rb_node);
			}
			node = &ref->rb_node;
		} else
			node = rb_first(&delayed_refs->root);
	}
again:
	while (node && count < 32) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(ref)) {
			head = btrfs_delayed_node_to_head(ref);
			if (list_empty(&head->cluster)) {
				list_add_tail(&head->cluster, cluster);
				delayed_refs->run_delayed_start =
					head->node.bytenr;
				count++;

				WARN_ON(delayed_refs->num_heads_ready == 0);
				delayed_refs->num_heads_ready--;
			} else if (count) {
				/* the goal of the clustering is to find extents
				 * that are likely to end up in the same extent
				 * leaf on disk.  So, we don't want them spread
				 * all over the tree.  Stop now if we've hit
				 * a head that was already in use
				 */
				break;
			}
		}
		node = rb_next(node);
	}
	if (count) {
		return 0;
	} else if (start) {
		/*
		 * we've gone to the end of the rbtree without finding any
		 * clusters.  start from the beginning and try again
		 */
		start = 0;
		node = rb_first(&delayed_refs->root);
		goto again;
	}
	return 1;
}
/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
		    struct btrfs_delayed_ref_root *delayed_refs,
		    struct btrfs_delayed_ref_node *existing,
		    struct btrfs_delayed_ref_node *update)
{
	if (update->action != existing->action) {
		/*
		 * this is effectively undoing either an add or a
		 * drop.  We decrement the ref_mod, and if it goes
		 * down to zero we just delete the entry without
		 * ever changing the extent allocation tree.
		 */
		existing->ref_mod--;
		if (existing->ref_mod == 0) {
			rb_erase(&existing->rb_node,
				 &delayed_refs->root);
			existing->in_tree = 0;
			btrfs_put_delayed_ref(existing);
			delayed_refs->num_entries--;
			if (trans->delayed_ref_updates)
				trans->delayed_ref_updates--;
		} else {
			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	} else {
		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
		/*
		 * the action on the existing ref matches
		 * the action on the ref we're trying to add.
		 * Bump the ref_mod by one so the backref that
		 * is eventually added/removed has the correct
		 * ref_mod on the backref
		 */
		existing->ref_mod += update->ref_mod;
	}
}
/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
			 struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref_head *existing_ref;
	struct btrfs_delayed_ref_head *ref;

	existing_ref = btrfs_delayed_node_to_head(existing);
	ref = btrfs_delayed_node_to_head(update);
	BUG_ON(existing_ref->is_data != ref->is_data);

	if (ref->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing_ref->must_insert_reserved = ref->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;
	}

	if (ref->extent_op) {
		if (!existing_ref->extent_op) {
			existing_ref->extent_op = ref->extent_op;
		} else {
			if (ref->extent_op->update_key) {
				memcpy(&existing_ref->extent_op->key,
				       &ref->extent_op->key,
				       sizeof(ref->extent_op->key));
				existing_ref->extent_op->update_key = 1;
			}
			if (ref->extent_op->update_flags) {
				existing_ref->extent_op->flags_to_set |=
					ref->extent_op->flags_to_set;
				existing_ref->extent_op->update_flags = 1;
			}
			kfree(ref->extent_op);
		}
	}
	/*
	 * update the reference mod on the head to reflect this new operation
	 */
	existing->ref_mod += update->ref_mod;
}
/*
 * helper function to actually insert a head node into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count.
 */
static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes,
					 int action, int is_data)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_ref_head *head_ref = NULL;
	struct btrfs_delayed_ref_root *delayed_refs;
	int count_mod = 1;
	int must_insert_reserved = 0;

	/*
	 * the head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (action == BTRFS_UPDATE_DELAYED_HEAD)
		count_mod = 0;
	else if (action == BTRFS_DROP_DELAYED_REF)
		count_mod = -1;

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
	 * the reserved accounting when the extent is finally added, or
	 * if a later modification deletes the delayed ref without ever
	 * inserting the extent into the extent allocation tree.
	 * ref->must_insert_reserved is the flag used to record
	 * that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		must_insert_reserved = 1;
	else
		must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = count_mod;
	ref->type = 0;
	ref->action = 0;
	ref->is_head = 1;
	ref->in_tree = 1;

	head_ref = btrfs_delayed_node_to_head(ref);
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->is_data = is_data;

	INIT_LIST_HEAD(&head_ref->cluster);
	mutex_init(&head_ref->mutex);

	trace_btrfs_delayed_ref_head(ref, head_ref, action);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_head_ref(existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}
/*
 * helper to insert a delayed tree ref into the rbtree.
 */
static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes, u64 parent,
					 u64 ref_root, int level, int action)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_tree_ref *full_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	full_ref = btrfs_delayed_node_to_tree_ref(ref);
	if (parent) {
		full_ref->parent = parent;
		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
	} else {
		full_ref->root = ref_root;
		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
	}
	full_ref->level = level;

	trace_btrfs_delayed_tree_ref(ref, full_ref, action);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_ref(trans, delayed_refs, existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}
/*
 * helper to insert a delayed data ref into the rbtree.
 */
static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
					 struct btrfs_delayed_ref_node *ref,
					 u64 bytenr, u64 num_bytes, u64 parent,
					 u64 ref_root, u64 owner, u64 offset,
					 int action)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_data_ref *full_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	full_ref = btrfs_delayed_node_to_data_ref(ref);
	if (parent) {
		full_ref->parent = parent;
		ref->type = BTRFS_SHARED_DATA_REF_KEY;
	} else {
		full_ref->root = ref_root;
		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	full_ref->objectid = owner;
	full_ref->offset = offset;

	trace_btrfs_delayed_data_ref(ref, full_ref, action);

	existing = tree_insert(&delayed_refs->root, &ref->rb_node);

	if (existing) {
		update_existing_ref(trans, delayed_refs, existing, ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}
/*
 * add a delayed tree ref.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 ref_root, int level, int action,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	BUG_ON(extent_op && extent_op->is_data);
	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
				   action, 0);
	BUG_ON(ret);

	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
				   parent, ref_root, level, action);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}
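
/*
 * Example (hypothetical caller, for illustration only): dropping one
 * reference to a tree block at 'bytenr' of size 'blocksize', held through
 * the parent block 'parent' by the tree with id 'root_objectid', and with
 * no pending flags/key update, would be queued as
 *
 *	ret = btrfs_add_delayed_tree_ref(trans, bytenr, blocksize, parent,
 *					 root_objectid, level,
 *					 BTRFS_DROP_DELAYED_REF, NULL);
 *
 * and only turned into extent tree modifications when the delayed refs are
 * run later in the transaction.
 */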
/*
 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes,
			       u64 parent, u64 ref_root,
			       u64 owner, u64 offset, int action,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	BUG_ON(extent_op && !extent_op->is_data);
	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
				   action, 1);
	BUG_ON(ret);

	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
				   parent, ref_root, owner, offset, action);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}
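
/*
 * Queue an update to the flags and/or key of an existing extent item,
 * without changing its reference count.  Only a head node is inserted
 * here, and because the action is BTRFS_UPDATE_DELAYED_HEAD the head's
 * ref_mod contribution is zero (see count_mod in add_delayed_ref_head).
 */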
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
				   extent_op->is_data);
	BUG_ON(ret);

	spin_unlock(&delayed_refs->lock);
	return 0;
}
/*
 * this does a simple search for the head node for a given extent.
 * It must be called with the delayed ref spinlock held, and it returns
 * the head node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
	if (ref)
		return btrfs_delayed_node_to_head(ref);
	return NULL;
}