sys/vfs/hammer: Use HAMMER_BUFSIZE_DOALIGN() and variants
[dragonfly.git] / sys / vfs / hammer / hammer_rebalance.c
blob 280671e42f8f923c801663d46bbdf30b45470de3
1 /*
2 * Copyright (c) 2009 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 #include "hammer.h"
37 static int rebalance_node(struct hammer_ioc_rebalance *rebal,
38 hammer_cursor_t cursor, hammer_node_lock_t lcache);
39 static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
40 hammer_btree_elm_t elm);
41 static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
42 hammer_node_lock_t item, hammer_node_lock_t chld_item);
45 * Iterate through the specified range of object ids and rebalance B-Tree
46 * leaf and internal nodes we encounter. A forwards iteration is used.
48 * All leafs are at the same depth. We use the b-tree scan code loosely
49 * to position ourselves and create degenerate cases to skip indices
50 * that we have rebalanced in bulk.
/*
 * hammer_ioc_rebalance() - backend for the HAMMER rebalance ioctl.
 *
 * Validates the caller-supplied key range, clamps the requested
 * saturation into a sane range, then scans the B-Tree forwards
 * calling rebalance_node() on each internal node on the way up.
 * Returns 0 on success or completion-by-signal, else an errno.
 *
 * NOTE(review): this dump is missing some lines (lone '}' closers
 * were stripped by the extraction) — verify against the repository.
 */
53 int
54 hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
55 struct hammer_ioc_rebalance *rebal)
57 struct hammer_cursor cursor;
58 struct hammer_node_lock lcache;
59 hammer_btree_leaf_elm_t elm;
60 int error;
61 int seq;
62 uint32_t key_end_localization;
/*
 * Reject pseudofs bits in either key and reject inverted key ranges.
 */
64 if ((rebal->key_beg.localization | rebal->key_end.localization) &
65 HAMMER_LOCALIZE_PSEUDOFS_MASK) {
66 return(EINVAL);
68 if (rebal->key_beg.localization > rebal->key_end.localization)
69 return(EINVAL);
70 if (rebal->key_beg.localization == rebal->key_end.localization) {
71 if (rebal->key_beg.obj_id > rebal->key_end.obj_id)
72 return(EINVAL);
73 /* key-space limitations - no check needed */
/*
 * Clamp the target fill level to
 * [HAMMER_BTREE_INT_ELMS/2, HAMMER_BTREE_INT_ELMS].
 */
75 if (rebal->saturation < HAMMER_BTREE_INT_ELMS / 2)
76 rebal->saturation = HAMMER_BTREE_INT_ELMS / 2;
77 if (rebal->saturation > HAMMER_BTREE_INT_ELMS)
78 rebal->saturation = HAMMER_BTREE_INT_ELMS;
/*
81 * Ioctl caller has only set localization type to rebalance.
82 * Initialize cursor key localization with ip localization.
*/
84 rebal->key_cur = rebal->key_beg;
85 rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
86 if (rebal->allpfs == 0)
87 rebal->key_cur.localization |= ip->obj_localization;
/*
 * End key covers a single PFS, or with allpfs set, runs out to
 * the maximum PFS id.
 */
89 key_end_localization = rebal->key_end.localization;
90 key_end_localization &= HAMMER_LOCALIZE_MASK;
91 if (rebal->allpfs == 0)
92 key_end_localization |= ip->obj_localization;
93 else
94 key_end_localization |= pfs_to_lo(HAMMER_MAX_PFSID);
/* Pre-allocate a 2-deep node-lock cache, reused across retries. */
96 hammer_btree_lcache_init(trans->hmp, &lcache, 2);
98 seq = trans->hmp->flusher.done;
/*
101 * Scan forwards. Retries typically occur if a deadlock is detected.
*/
103 retry:
104 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
105 if (error) {
106 hammer_done_cursor(&cursor);
107 goto failed;
109 cursor.key_beg = rebal->key_cur;
110 cursor.key_end = rebal->key_end;
111 cursor.key_end.localization = key_end_localization;
112 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
113 cursor.flags |= HAMMER_CURSOR_BACKEND;
/*
116 * Cause internal nodes to be returned on the way up. Internal nodes
117 * are not returned on the way down so we can create a degenerate
118 * case to handle internal nodes as a trailing function of their
119 * sub-trees.
121 * Note that by not setting INSERTING or PRUNING no boundary
122 * corrections will be made and a sync lock is not needed for the
123 * B-Tree scan itself.
*/
125 cursor.flags |= HAMMER_CURSOR_REBLOCKING;
127 error = hammer_btree_first(&cursor);
129 while (error == 0) {
/*
131 * Rebalancing can be hard on the memory allocator, make
132 * sure there is enough free memory before doing it.
*/
134 if (vm_test_nominal()) {
135 hammer_unlock_cursor(&cursor);
136 vm_wait_nominal();
137 hammer_lock_cursor(&cursor);
/*
141 * We only care about internal nodes visited for the last
142 * time on the way up... that is, a trailing scan of the
143 * internal node after all of its children have been recursed
144 * through.
*/
146 if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
/*
148 * Leave cursor.index alone, we want to recurse
149 * through all children of the internal node before
150 * visiting it.
152 * Process the internal node on the way up after
153 * the last child's sub-tree has been balanced.
*/
155 if (cursor.index == cursor.node->ondisk->count - 1) {
156 hammer_sync_lock_sh(trans);
157 error = rebalance_node(rebal, &cursor, &lcache);
158 hammer_sync_unlock(trans);
160 } else {
/*
162 * We don't need to iterate through all the leaf
163 * elements, we only care about the parent (internal)
164 * node.
*/
166 cursor.index = cursor.node->ondisk->count - 1;
168 if (error)
169 break;
/*
172 * Update returned scan position and do a flush if
173 * necessary.
175 * WARNING: We extract the base using the leaf element
176 * type but this could be an internal node. The
177 * base is the same either way.
179 * However, due to the rebalancing operation the
180 * cursor position may have exceeded the right-hand
181 * boundary.
183 * WARNING: See warnings in hammer_unlock_cursor()
184 * function.
*/
186 elm = &cursor.node->ondisk->elms[cursor.index].leaf;
187 rebal->key_cur = elm->base;
188 ++rebal->stat_ncount;
/*
 * Throttle against the flusher so meta-data buffers and UNDO
 * space cannot be exhausted mid-scan.
 */
190 while (hammer_flusher_meta_halflimit(trans->hmp) ||
191 hammer_flusher_undo_exhausted(trans, 2)) {
192 hammer_unlock_cursor(&cursor);
193 hammer_flusher_wait(trans->hmp, seq);
194 hammer_lock_cursor(&cursor);
195 seq = hammer_flusher_async_one(trans->hmp);
/*
199 * Before iterating check if the rebalance operation caused
200 * the cursor to index past the right-hand boundary and make
201 * sure to stop if it does. Otherwise the iteration may
202 * panic e.g. due to the key maxing out its fields and no
203 * longer being within the strict bounds of the root node.
*/
205 if (hammer_btree_cmp(&rebal->key_cur, &cursor.key_end) > 0) {
206 rebal->key_cur = cursor.key_end;
207 break;
/*
211 * Iterate, stop if a signal was received.
*/
213 if ((error = hammer_signal_check(trans->hmp)) != 0)
214 break;
215 error = hammer_btree_iterate(&cursor);
/* Running off the end of the key range is normal termination. */
217 if (error == ENOENT)
218 error = 0;
219 hammer_done_cursor(&cursor);
/* A detected deadlock releases the cursor and restarts the scan. */
220 if (error == EDEADLK) {
221 ++rebal->stat_collisions;
222 goto retry;
/* Signal interruption reports partial completion, not failure. */
224 if (error == EINTR) {
225 rebal->head.flags |= HAMMER_IOC_HEAD_INTR;
226 error = 0;
228 failed:
229 rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
230 hammer_btree_lcache_free(trans->hmp, &lcache);
231 return(error);
235 * Rebalance an internal node, called via a trailing upward recursion.
236 * All the children have already been individually rebalanced.
238 * To rebalance we scan the elements in the children and pack them,
239 * so we actually need to lock the children and the children's children.
241 * INTERNAL_NODE
242 * / / | | | \ \
243 * C C C C C C C children (first level) (internal or leaf nodes)
244 * children's elements (second level)
246 * <<<---------- pack children's elements, possibly remove excess
247 * children after packing.
249 * NOTE: The mirror_tids, parent pointers, and child pointers must be updated.
250 * Any live tracked B-Tree nodes must be updated (we worm out of that
251 * by not allowing any). And boundary elements must be preserved.
253 * NOTE: If the children are leaf nodes we may have a degenerate case
254 * case where there are no elements in the leafs.
256 * XXX live-tracked
/*
 * rebalance_node() - rebalance one internal node, called on the trailing
 * upward visit after all of its children have been recursed through.
 *
 * Packs the children's elements (the grandchildren) into as few children
 * as the requested saturation allows, working entirely on locked copies,
 * then deletes any children that were emptied and syncs the copies back.
 *
 * NOTE(review): lines containing only '}' were stripped by the dump this
 * was extracted from — verify brace structure against the repository.
 */
258 static int
259 rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
260 hammer_node_lock_t lcache)
262 struct hammer_node_lock lockroot;
263 hammer_node_lock_t base_item;
264 hammer_node_lock_t chld_item;
265 hammer_node_lock_t item;
266 hammer_btree_elm_t elm;
267 hammer_node_t node;
268 hammer_tid_t tid;
269 uint8_t type1 __debugvar;
270 int base_count;
271 int root_count;
272 int avg_elms;
273 int count;
274 int error;
275 int i;
276 int n;
/*
279 * Lock the parent node via the cursor, collect and lock our
280 * children and children's children.
282 * By the way, this is a LOT of locks.
*/
284 hammer_node_lock_init(&lockroot, cursor->node);
285 error = hammer_cursor_upgrade(cursor);
286 if (error)
287 goto done;
288 error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
289 if (error)
290 goto done;
/*
293 * Make a copy of all the locked on-disk data to simplify the element
294 * shifting we are going to have to do. We will modify the copy
295 * first.
*/
297 hammer_btree_lock_copy(cursor, &lockroot);
/*
300 * Look at the first child node.
*/
302 if (TAILQ_FIRST(&lockroot.list) == NULL)
303 goto done;
304 type1 = TAILQ_FIRST(&lockroot.list)->node->ondisk->type;
/*
307 * Figure out the total number of children's children and
308 * calculate the average number of elements per child.
310 * The minimum avg_elms is 1 when count > 0. avg_elms * root_elms
311 * is always greater or equal to count.
313 * If count == 0 we hit a degenerate case which will cause
314 * avg_elms to also calculate as 0.
*/
316 if (hammer_debug_general & 0x1000)
317 hdkprintf("lockroot %p count %d\n", &lockroot, lockroot.count);
318 count = 0;
319 TAILQ_FOREACH(item, &lockroot.list, entry) {
320 if (hammer_debug_general & 0x1000)
321 hdkprintf("add count %d\n", item->count);
322 count += item->count;
323 KKASSERT(item->node->ondisk->type == type1);
325 avg_elms = (count + (lockroot.count - 1)) / lockroot.count;
326 KKASSERT(avg_elms >= 0);
/*
329 * If the average number of elements per child is too low then
330 * calculate the desired number of children (n) such that the
331 * average number of elements is reasonable.
333 * If the desired number of children is 1 then avg_elms will
334 * wind up being count, which may still be smaller then saturation
335 * but that is ok.
*/
337 if (count && avg_elms < rebal->saturation) {
338 n = (count + (rebal->saturation - 1)) / rebal->saturation;
339 avg_elms = (count + (n - 1)) / n;
/*
343 * Pack the elements in the children. Elements for each item is
344 * packed into base_item until avg_elms is reached, then base_item
345 * iterates.
347 * hammer_cursor_moved_element() is called for each element moved
348 * to update tracked cursors, including the index beyond the last
349 * element (at count).
351 * Any cursors tracking the internal node itself must also be
352 * updated, potentially repointing them at a leaf and clearing
353 * ATEDISK.
*/
355 base_item = TAILQ_FIRST(&lockroot.list);
356 base_count = 0;
357 root_count = 0;
359 TAILQ_FOREACH(item, &lockroot.list, entry) {
360 node = item->node;
361 KKASSERT(item->count == node->ondisk->count);
362 chld_item = TAILQ_FIRST(&item->list);
363 for (i = 0; i < item->count; ++i) {
/*
365 * Closeout. If the next element is at index 0
366 * just use the existing separator in the parent.
*/
368 if (base_count == avg_elms) {
369 if (i == 0) {
370 elm = &lockroot.node->ondisk->elms[
371 item->index];
372 } else {
373 elm = &node->ondisk->elms[i];
375 rebalance_closeout(base_item, base_count, elm);
376 base_item = TAILQ_NEXT(base_item, entry);
377 KKASSERT(base_item);
378 base_count = 0;
379 ++root_count;
/*
383 * Check degenerate no-work case. Otherwise pack
384 * the element.
386 * All changes are made to the copy.
*/
388 if (item == base_item && i == base_count) {
389 ++base_count;
390 if (chld_item)
391 chld_item = TAILQ_NEXT(chld_item, entry);
392 continue;
/*
396 * Pack element.
*/
398 elm = &base_item->copy->elms[base_count];
399 *elm = node->ondisk->elms[i];
400 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
/*
403 * Adjust the mirror_tid of the target and the
404 * internal element linkage.
406 * The parent node (lockroot.node) should already
407 * have an aggregate mirror_tid so we do not have
408 * to update that. However, it is possible for us
409 * to catch a hammer_btree_mirror_propagate() with
410 * its pants down. Update the parent if necessary.
*/
412 tid = node->ondisk->mirror_tid;
414 if (base_item->copy->mirror_tid < tid) {
415 base_item->copy->mirror_tid = tid;
416 if (lockroot.copy->mirror_tid < tid) {
417 lockroot.copy->mirror_tid = tid;
418 lockroot.flags |=
419 HAMMER_NODE_LOCK_UPDATED;
421 if (lockroot.copy->elms[root_count].
422 internal.mirror_tid < tid) {
423 lockroot.copy->elms[root_count].
424 internal.mirror_tid = tid;
425 lockroot.flags |=
426 HAMMER_NODE_LOCK_UPDATED;
428 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
/*
432 * We moved elm. The parent pointers for any
433 * children of elm must be repointed.
*/
435 if (item != base_item &&
436 node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
437 KKASSERT(chld_item);
438 rebalance_parent_ptrs(base_item, base_count,
439 item, chld_item);
441 hammer_cursor_moved_element(item->parent->node,
442 item->index,
443 node, i,
444 base_item->node,
445 base_count);
446 ++base_count;
447 if (chld_item)
448 chld_item = TAILQ_NEXT(chld_item, entry);
/*
452 * Always call at the end (i == number of elements) in
453 * case a cursor is sitting indexed there.
*/
455 hammer_cursor_moved_element(item->parent->node, item->index,
456 node, i,
457 base_item->node, base_count);
/*
461 * Packing complete, close-out base_item using the right-hand
462 * boundary of the original parent.
464 * If we will be deleting nodes from the root shift the old
465 * right-hand-boundary to the new ending index.
*/
467 elm = &lockroot.node->ondisk->elms[lockroot.node->ondisk->count];
468 rebalance_closeout(base_item, base_count, elm);
469 ++root_count;
470 if (lockroot.copy->count != root_count) {
471 lockroot.copy->count = root_count;
472 lockroot.copy->elms[root_count] = *elm;
473 lockroot.flags |= HAMMER_NODE_LOCK_UPDATED;
/*
477 * Any extra items beyond base_item are now completely empty and
478 * can be destroyed. Queue the destruction up in the copy. Note
479 * that none of the destroyed nodes are part of our cursor.
481 * The cursor is locked so it isn't on the tracking list. It
482 * should have been pointing at the boundary element (at root_count).
483 * When deleting elements from the root (which is cursor.node), we
484 * have to update the cursor.index manually to keep it in bounds.
*/
486 while ((base_item = TAILQ_NEXT(base_item, entry)) != NULL) {
487 hammer_cursor_removed_node(base_item->node, lockroot.node,
488 base_count);
489 hammer_cursor_deleted_element(lockroot.node, base_count);
490 base_item->copy->type = HAMMER_BTREE_TYPE_DELETED;
491 base_item->copy->count = 0;
492 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
493 if (cursor->index > lockroot.copy->count)
494 --cursor->index;
495 ++rebal->stat_deletions;
/*
499 * All done, sync the locked child tree to disk. This will also
500 * flush and delete deleted nodes.
*/
502 rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
503 done:
/* Unlock children and downgrade the cursor on all paths, even error. */
504 hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
505 hammer_cursor_downgrade(cursor);
506 return (error);
510 * Close-out the child base_item. This node contains base_count
511 * elements.
513 * If the node is an internal node the right-hand boundary must be
514 * set to elm.
/*
 * rebalance_closeout() - finalize one packed child (base_item now holds
 * base_count elements): update its element count in the copy, assign the
 * internal right-hand boundary from elm, and propagate the boundary into
 * the parent's separator element.  All changes go to the copies and are
 * flagged with HAMMER_NODE_LOCK_UPDATED.
 */
516 static
517 void
518 rebalance_closeout(hammer_node_lock_t base_item, int base_count,
519 hammer_btree_elm_t elm)
521 hammer_node_lock_t parent;
522 hammer_btree_elm_t base_elm;
523 hammer_btree_elm_t rbound_elm;
524 uint8_t save;
/*
527 * Update the count. NOTE: base_count can be 0 for the
528 * degenerate leaf case.
*/
530 if (hammer_debug_general & 0x1000) {
531 hdkprintf("%016jx:", (intmax_t)base_item->node->node_offset);
533 if (base_item->copy->count != base_count) {
534 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
535 base_item->copy->count = base_count;
536 if (hammer_debug_general & 0x1000)
537 kprintf(" (count update)");
/*
541 * If we are closing out an internal node we must assign
542 * a right-hand boundary. Use the element contents as the
543 * right-hand boundary.
545 * Internal nodes are required to have at least one child,
546 * otherwise the left and right boundary would end up being
547 * the same element. Only leaf nodes can be empty.
549 * Rebalancing may cut-off an internal node such that the
550 * new right hand boundary is the next element anyway, but
551 * we still have to make sure that subtree_offset, btype,
552 * and mirror_tid are all 0.
*/
554 if (base_item->copy->type == HAMMER_BTREE_TYPE_INTERNAL) {
555 KKASSERT(base_count != 0);
556 base_elm = &base_item->copy->elms[base_count];
/* Only dirty the copy when the boundary actually needs changing. */
558 if (bcmp(base_elm, elm, sizeof(*elm)) != 0 ||
559 elm->internal.subtree_offset ||
560 elm->internal.mirror_tid ||
561 elm->base.btype) {
562 *base_elm = *elm;
563 base_elm->internal.subtree_offset = 0;
564 base_elm->internal.mirror_tid = 0;
565 base_elm->base.btype = 0;
566 base_item->flags |= HAMMER_NODE_LOCK_UPDATED;
567 if (hammer_debug_general & 0x1000)
568 kprintf(" (rhs update)");
569 } else {
570 if (hammer_debug_general & 0x1000)
571 kprintf(" (rhs same)");
/*
576 * The parent's boundary must be updated. Be careful to retain
577 * the btype and non-base internal fields as that information is
578 * unrelated.
*/
580 parent = base_item->parent;
581 rbound_elm = &parent->copy->elms[base_item->index + 1];
582 if (bcmp(&rbound_elm->base, &elm->base, sizeof(elm->base)) != 0) {
/* Preserve the parent separator's btype; only the base key changes. */
583 save = rbound_elm->base.btype;
584 rbound_elm->base = elm->base;
585 rbound_elm->base.btype = save;
586 parent->flags |= HAMMER_NODE_LOCK_UPDATED;
587 if (hammer_debug_general & 0x1000) {
588 kprintf(" (parent bound update %d)",
589 base_item->index + 1);
592 if (hammer_debug_general & 0x1000)
593 kprintf("\n");
597 * An element in item has moved to base_item. We must update the parent
598 * pointer of the node the element points to (which is chld_item).
/*
 * rebalance_parent_ptrs() - an element moved from item to base_item;
 * repoint the moved child's (chld_item's) on-disk parent offset in its
 * copy and notify any tracked cursors of the parent change.  index is
 * the element's new index within base_item.
 */
600 static
601 void
602 rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
603 hammer_node_lock_t item, hammer_node_lock_t chld_item)
605 KKASSERT(chld_item->node->ondisk->parent == item->node->node_offset);
606 chld_item->copy->parent = base_item->node->node_offset;
607 chld_item->flags |= HAMMER_NODE_LOCK_UPDATED;
608 hammer_cursor_parent_changed(chld_item->node,
609 item->node, base_item->node, index);