/*
 * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "hammer.h"

static int rebalance_node(struct hammer_ioc_rebalance *rebal,
			hammer_cursor_t cursor, hammer_node_lock_t lcache);
static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
			hammer_btree_elm_t elm);
static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
			hammer_node_lock_t item, hammer_node_lock_t chld_item);
/*
 * Iterate through the specified range of object ids and rebalance B-Tree
 * leaf and internal nodes we encounter.  A forwards iteration is used.
 *
 * All leafs are at the same depth.  We use the b-tree scan code loosely
 * to position ourselves and create degenerate cases to skip indices
 * that we have rebalanced in bulk.
 */
int
hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_rebalance *rebal)
{
	struct hammer_cursor cursor;
	struct hammer_node_lock lcache;
	hammer_btree_leaf_elm_t elm;
	int error;
	int seq;
	uint32_t key_end_localization;
	if ((rebal->key_beg.localization | rebal->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (rebal->key_beg.localization > rebal->key_end.localization)
		return(EINVAL);
	if (rebal->key_beg.localization == rebal->key_end.localization) {
		if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
			return(EINVAL);
		/* key-space limitations - no check needed */
	}
	if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
		rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
	if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
		rebal->saturation = HAMMER_BTREE_INT_ELMS;
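	/*
	 * (Note: the clamp above bounds the requested packing target to
	 * [HAMMER_BTREE_INT_ELMS / 2, HAMMER_BTREE_INT_ELMS], i.e. the
	 * rebalance always packs internal nodes to between half-full and
	 * completely full.)
	 */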
	/*
	 * Ioctl caller has only set localization type to rebalance.
	 * Initialize cursor key localization with ip localization.
	 */
	rebal->key_cur = rebal->key_beg;
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	if (rebal->allpfs == 0)
		rebal->key_cur.localization |= ip->obj_localization;

	key_end_localization = rebal->key_end.localization;
	key_end_localization &= HAMMER_LOCALIZE_MASK;
	if (rebal->allpfs == 0)
		key_end_localization |= ip->obj_localization;
	else
		key_end_localization |= pfs_to_lo(HAMMER_MAX_PFSID);
	hammer_btree_lcache_init(trans->hmp, &lcache, 2);
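	/*
	 * Record the most recently completed flusher sequence.  It is used
	 * below to wait on the flusher when meta-data buffers or the UNDO
	 * FIFO run low during the scan.
	 */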
	seq = trans->hmp->flusher.done;
	/*
	 * Scan forwards.  Retries typically occur if a deadlock is detected.
	 */
retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg = rebal->key_cur;
	cursor.key_end = rebal->key_end;
	cursor.key_end.localization = key_end_localization;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	/*
	 * Cause internal nodes to be returned on the way up.  Internal nodes
	 * are not returned on the way down so we can create a degenerate
	 * case to handle internal nodes as a trailing function of their
	 * sub-trees.
	 *
	 * Note that by not setting INSERTING or PRUNING no boundary
	 * corrections will be made and a sync lock is not needed for the
	 * B-Tree scan itself.
	 */
	cursor.flags |= HAMMER_CURSOR_REBLOCKING;

	error = hammer_btree_first(&cursor);

	while (error == 0) {
		/*
		 * Rebalancing can be hard on the memory allocator, make
		 * sure there is enough free memory before doing it.
		 */
		if (vm_test_nominal()) {
			hammer_unlock_cursor(&cursor);
			vm_wait_nominal();
			hammer_lock_cursor(&cursor);
		}
		/*
		 * Filesystem went read-only during rebalancing
		 */
		if (trans->hmp->ronly) {
			error = EROFS;
			break;
		}
		/*
		 * We only care about internal nodes visited for the last
		 * time on the way up... that is, a trailing scan of the
		 * internal node after all of its children have been recursed
		 * through.
		 */
		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Leave cursor.index alone, we want to recurse
			 * through all children of the internal node before
			 * visiting it.
			 *
			 * Process the internal node on the way up after
			 * the last child's sub-tree has been balanced.
			 */
			if (cursor.index == cursor.node->ondisk->count - 1) {
				hammer_sync_lock_sh(trans);
				error = rebalance_node(rebal, &cursor, &lcache);
				hammer_sync_unlock(trans);
			}
		} else {
			/*
			 * We don't need to iterate through all the leaf
			 * elements, we only care about the parent (internal)
			 * node.
			 */
			cursor.index = cursor.node->ondisk->count - 1;
		}
		if (error)
			break;
		/*
		 * Update returned scan position and do a flush if
		 * necessary.
		 *
		 * WARNING: We extract the base using the leaf element
		 *	    type but this could be an internal node.  The
		 *	    base is the same either way.
		 *
		 *	    However, due to the rebalancing operation the
		 *	    cursor position may have exceeded the right-hand
		 *	    boundary.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		rebal->key_cur = elm->base;
		++rebal->stat_ncount;

		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}
		/*
		 * Before iterating check if the rebalance operation caused
		 * the cursor to index past the right-hand boundary and make
		 * sure to stop if it does.  Otherwise the iteration may
		 * panic e.g. due to the key maxing out its fields and no
		 * longer being within the strict bounds of the root node.
		 */
		if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
			rebal->key_cur = cursor.key_end;
			break;
		}
		/*
		 * Iterate, stop if a signal was received.
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;
		error = hammer_btree_iterate(&cursor);
	}
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	if (error == EDEADLK) {
		++rebal->stat_collisions;
		goto retry;
	}
	if (error == EINTR) {
		rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	hammer_btree_lcache_free(trans->hmp, &lcache);
	return(error);
}
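/*
 * Illustrative userland sketch (kept under #if 0 so this file still
 * compiles): roughly how a caller such as hammer(8) might drive the
 * ioctl above.  HAMMERIOC_REBALANCE and the HAMMER_MIN/MAX_* key-space
 * constants below are quoted from memory and should be verified against
 * hammer_ioctl.h / hammer_disk.h; treat this as a usage sketch, not as
 * the authoritative utility code.
 */
#if 0
	struct hammer_ioc_rebalance rebal;
	int fd;

	bzero(&rebal, sizeof(rebal));
	rebal.key_beg.localization = HAMMER_MIN_LOCALIZATION;
	rebal.key_beg.obj_id = HAMMER_MIN_OBJID;
	rebal.key_end.localization = HAMMER_MAX_LOCALIZATION;
	rebal.key_end.obj_id = HAMMER_MAX_OBJID;
	rebal.saturation = HAMMER_BTREE_INT_ELMS;	/* pack nodes full */
	rebal.allpfs = 0;			/* only the PFS backing fd */

	fd = open("/mnt/hammer", O_RDONLY);	/* any path inside the fs */
	if (fd >= 0 && ioctl(fd, HAMMERIOC_REBALANCE, &rebal) == 0)
		printf("rebalanced %jd nodes\n", (intmax_t)rebal.stat_nrebal);
#endif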
/*
 * Rebalance an internal node, called via a trailing upward recursion.
 * All the children have already been individually rebalanced.
 *
 * To rebalance we scan the elements in the children and pack them,
 * so we actually need to lock the children and the children's children.
 *
 *	INTERNAL_NODE
 *	/ / | | \ \
 *	C C C C C C C	children (first level) (internal or leaf nodes)
 *			children's elements (second level)
 *
 *	<<<----------	pack children's elements, possibly remove excess
 *			children after packing.
 *
 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
 *	 Any live tracked B-Tree nodes must be updated (we worm out of that
 *	 by not allowing any).  And boundary elements must be preserved.
 *
 * NOTE: If the children are leaf nodes we may have a degenerate case
 *	 where there are no elements in the leafs.
 */
static int
rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
	       hammer_node_lock_t lcache)
{
	struct hammer_node_lock lockroot;
	hammer_node_lock_t base_item;
	hammer_node_lock_t chld_item;
	hammer_node_lock_t item;
	hammer_btree_elm_t elm;
	hammer_node_t node;
	hammer_tid_t tid;
	uint8_t type1 __debugvar;
	int base_count;
	int root_count;
	int avg_elms;
	int count;
	int error;
	int i;
	int n;
	/*
	 * Lock the parent node via the cursor, collect and lock our
	 * children and children's children.
	 *
	 * By the way, this is a LOT of locks.
	 */
	hammer_node_lock_init(&lockroot, cursor->node);
	error = hammer_cursor_upgrade(cursor);
	if (error)
		goto done;
	error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
	if (error)
		goto done;
	/*
	 * Make a copy of all the locked on-disk data to simplify the element
	 * shifting we are going to have to do.  We will modify the copy
	 * first.
	 */
	hammer_btree_lock_copy(cursor, &lockroot);

	/*
	 * Look at the first child node.
	 */
	if (TAILQ_FIRST(&lockroot.list) == NULL)
		goto done;
	type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;
	/*
	 * Figure out the total number of children's children and
	 * calculate the average number of elements per child.
	 *
	 * The minimum avg_elms is 1 when count > 0.  avg_elms * root_elms
	 * is always greater or equal to count.
	 *
	 * If count == 0 we hit a degenerate case which will cause
	 * avg_elms to also calculate as 0.
	 */
	if (hammer_debug_general & 0x1000)
		hdkprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
	count = 0;
	TAILQ_FOREACH(item, &lockroot.list, entry) {
		if (hammer_debug_general & 0x1000)
			hdkprintf("add count %d\n", item->count);
		count += item->count;
		KKASSERT(item->node->ondisk->type == type1);
	}
	avg_elms = howmany(count, lockroot.count);
	KKASSERT(avg_elms >= 0);
	/*
	 * If the average number of elements per child is too low then
	 * calculate the desired number of children (n) such that the
	 * average number of elements is reasonable.
	 *
	 * If the desired number of children is 1 then avg_elms will
	 * wind up being count, which may still be smaller than saturation
	 * but that is ok.
	 */
	if (count && avg_elms < rebal->saturation) {
		n = howmany(count, rebal->saturation);
		avg_elms = howmany(count, n);
	}
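	/*
	 * Worked example (illustrative): with count = 100 second-level
	 * elements and saturation = 32, n = howmany(100, 32) = 4 target
	 * children and avg_elms = howmany(100, 4) = 25 elements packed
	 * per child (howmany() is round-up division).
	 */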
	/*
	 * Pack the elements in the children.  Elements from each item are
	 * packed into base_item until avg_elms is reached, then base_item
	 * advances to the next child in the list.
	 *
	 * hammer_cursor_moved_element() is called for each element moved
	 * to update tracked cursors, including the index beyond the last
	 * element (at count).
	 *
	 * Any cursors tracking the internal node itself must also be
	 * updated, potentially repointing them at a leaf and clearing
	 * ATEDISK.
	 */
	base_item = TAILQ_FIRST(&lockroot.list);
	base_count = 0;
	root_count = 0;
	TAILQ_FOREACH(item, &lockroot.list, entry) {
		node = item->node;
		KKASSERT(item->count == node->ondisk->count);
		chld_item = TAILQ_FIRST(&item->list);
		for (i = 0; i < item->count; ++i) {
			/*
			 * Closeout.  If the next element is at index 0
			 * just use the existing separator in the parent.
			 */
			if (base_count == avg_elms) {
				if (i == 0) {
					elm = &lockroot.node->ondisk->elms[
						item->index];
				} else {
					elm = &node->ondisk->elms[i];
				}
				rebalance_closeout(base_item, base_count, elm);
				base_item = TAILQ_NEXT(base_item, entry);
				KKASSERT(base_item);
				base_count = 0;
				++root_count;
			}
			/*
			 * Check degenerate no-work case.  Otherwise pack
			 * the element.
			 *
			 * All changes are made to the copy.
			 */
			if (item == base_item && i == base_count) {
				++base_count;
				if (chld_item)
					chld_item = TAILQ_NEXT(chld_item, entry);
				continue;
			}
			elm = &base_item->copy->elms[base_count];
			*elm = node->ondisk->elms[i];
			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			/*
			 * Adjust the mirror_tid of the target and the
			 * internal element linkage.
			 *
			 * The parent node (lockroot.node) should already
			 * have an aggregate mirror_tid so we do not have
			 * to update that.  However, it is possible for us
			 * to catch a hammer_btree_mirror_propagate() with
			 * its pants down.  Update the parent if necessary.
			 */
			tid = node->ondisk->mirror_tid;

			if (base_item->copy->mirror_tid < tid) {
				base_item->copy->mirror_tid = tid;
				if (lockroot.copy->mirror_tid < tid) {
					lockroot.copy->mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				if (lockroot.copy->elms[root_count].
				    internal.mirror_tid < tid) {
					lockroot.copy->elms[root_count].
						internal.mirror_tid = tid;
					lockroot.flags |=
						HAMMER_NODE_LOCK_UPDATED;
				}
				base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			}
			/*
			 * We moved elm.  The parent pointers for any
			 * children of elm must be repointed.
			 */
			if (item != base_item &&
			    node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
				KKASSERT(chld_item);
				rebalance_parent_ptrs(base_item, base_count,
						      item, chld_item);
			}
			hammer_cursor_moved_element(item->parent->node,
						    item->index,
						    node, i,
						    base_item->node,
						    base_count);
			++base_count;
			if (chld_item)
				chld_item = TAILQ_NEXT(chld_item, entry);
		}
		/*
		 * Always call at the end (i == number of elements) in
		 * case a cursor is sitting indexed there.
		 */
		hammer_cursor_moved_element(item->parent->node,
					    item->index,
					    node, i,
					    base_item->node, base_count);
	}
	/*
	 * Packing complete, close-out base_item using the right-hand
	 * boundary of the original parent.
	 *
	 * If we will be deleting nodes from the root shift the old
	 * right-hand-boundary to the new ending index.
	 */
	elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
	rebalance_closeout(base_item, base_count, elm);
	++root_count;
	if (lockroot.copy->count != root_count) {
		lockroot.copy->count = root_count;
		lockroot.copy->elms[root_count] = *elm;
		lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
	}
	/*
	 * Any extra items beyond base_item are now completely empty and
	 * can be destroyed.  Queue the destruction up in the copy.  Note
	 * that none of the destroyed nodes are part of our cursor.
	 *
	 * The cursor is locked so it isn't on the tracking list.  It
	 * should have been pointing at the boundary element (at root_count).
	 * When deleting elements from the root (which is cursor.node), we
	 * have to update the cursor.index manually to keep it in bounds.
	 */
	while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
		hammer_cursor_removed_node(base_item->node, lockroot.node,
					   base_count);
		hammer_cursor_deleted_element(lockroot.node, base_count);
		base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
		base_item->copy->count = 0;
		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
		if (cursor->index > lockroot.copy->count)
			--cursor->index;
		++rebal->stat_deletions;
	}
	/*
	 * All done, sync the locked child tree to disk.  This will also
	 * flush and delete deleted nodes.
	 */
	rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
done:
	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
	hammer_cursor_downgrade(cursor);
	return (error);
}
/*
 * Close-out the child base_item.  This node contains base_count
 * elements.
 *
 * If the node is an internal node the right-hand boundary must be
 * assigned from elm (the separator element in the parent above us).
 */
static void
rebalance_closeout(hammer_node_lock_t base_item, int base_count,
		   hammer_btree_elm_t elm)
{
	hammer_node_lock_t parent;
	hammer_btree_elm_t base_elm;
	hammer_btree_elm_t rbound_elm;
	uint8_t save;
	/*
	 * Update the count.  NOTE:  base_count can be 0 for the
	 * degenerate leaf case.
	 */
	if (hammer_debug_general & 0x1000) {
		hdkprintf("%016jx:", (intmax_t)base_item->node->node_offset);
	}
	if (base_item->copy->count != base_count) {
		base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
		base_item->copy->count = base_count;
		if (hammer_debug_general & 0x1000)
			kprintf(" (count update)");
	}
	/*
	 * If we are closing out an internal node we must assign
	 * a right-hand boundary.  Use the element contents as the
	 * right-hand boundary.
	 *
	 * Internal nodes are required to have at least one child,
	 * otherwise the left and right boundary would end up being
	 * the same element.  Only leaf nodes can be empty.
	 *
	 * Rebalancing may cut-off an internal node such that the
	 * new right hand boundary is the next element anyway, but
	 * we still have to make sure that subtree_offset, btype,
	 * and mirror_tid are all 0.
	 */
	if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
		KKASSERT(base_count != 0);
		base_elm = &base_item->copy->elms[base_count];

		if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
		    elm->internal.subtree_offset ||
		    elm->internal.mirror_tid ||
		    elm->base.btype) {
			*base_elm = *elm;
			base_elm->internal.subtree_offset = 0;
			base_elm->internal.mirror_tid = 0;
			base_elm->base.btype = 0;
			base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
			if (hammer_debug_general & 0x1000)
				kprintf(" (rhs update)");
		} else {
			if (hammer_debug_general & 0x1000)
				kprintf(" (rhs same)");
		}
	}
	/*
	 * The parent's boundary must be updated.  Be careful to retain
	 * the btype and non-base internal fields as that information is
	 * important.
	 */
	parent = base_item->parent;
	rbound_elm = &parent->copy->elms[base_item->index + 1];
	if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
		save = rbound_elm->base.btype;
		rbound_elm->base = elm->base;
		rbound_elm->base.btype = save;
		parent->flags |= HAMMER_NODE_LOCK_UPDATED;
		if (hammer_debug_general & 0x1000) {
			kprintf(" (parent bound update %d)",
				base_item->index + 1);
		}
	}
	if (hammer_debug_general & 0x1000)
		kprintf("\n");
}
/*
 * An element in item has moved to base_item.  We must update the parent
 * pointer of the node the element points to (which is chld_item).
 */
static void
rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
		      hammer_node_lock_t item, hammer_node_lock_t chld_item)
{
	KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
	chld_item->copy->parent = base_item->node->node_offset;
	chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
	hammer_cursor_parent_changed(chld_item->node,
				     item->node, base_item->node, index);
}