2 * lib/btree.c - Simple In-memory B+Tree
4 * As should be obvious for Linux kernel code, license is GPLv2
6 * Copyright (c) 2007-2008 Joern Engel <joern@purestorage.com>
7 * Bits and pieces stolen from Peter Zijlstra's code, which is
8 * Copyright 2007, Red Hat Inc. Peter Zijlstra
11 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
13 * A relatively simple B+Tree implementation. I have written it as a learning
14 * exercise to understand how B+Trees work. Turned out to be useful as well.
16 * B+Trees can be used similar to Linux radix trees (which don't have anything
17 * in common with textbook radix trees, beware). Prerequisite for them working
18 * well is that access to a random tree node is much faster than a large number
19 * of operations within each node.
21 * Disks have fulfilled the prerequisite for a long time. More recently DRAM
22 * has gained similar properties, as memory access times, when measured in cpu
23 * cycles, have increased. Cacheline sizes have increased as well, which also
26 * Compared to radix trees, B+Trees are more efficient when dealing with a
27 * sparsely populated address space. Between 25% and 50% of the memory is
28 * occupied with valid pointers. When densely populated, radix trees contain
29 * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
32 * This particular implementation stores pointers identified by a long value.
33 * Storing NULL pointers is illegal, lookup will return NULL when no entry
36 * A tricks was used that is not commonly found in textbooks. The lowest
37 * values are to the right, not to the left. All used slots within a node
38 * are on the left, all unused slots contain NUL values. Most operations
39 * simply loop once over all slots and terminate on the first NUL.
42 #include <linux/btree.h>
43 #include <linux/cache.h>
44 #include <linux/kernel.h>
45 #include <linux/slab.h>
46 #include <linux/module.h>
48 #define MAX(a, b) ((a) > (b) ? (a) : (b))
49 #define NODESIZE MAX(L1_CACHE_BYTES, 128)
57 struct btree_geo btree_geo32
= {
59 .no_pairs
= NODESIZE
/ sizeof(long) / 2,
60 .no_longs
= NODESIZE
/ sizeof(long) / 2,
62 EXPORT_SYMBOL_GPL(btree_geo32
);
64 #define LONG_PER_U64 (64 / BITS_PER_LONG)
65 struct btree_geo btree_geo64
= {
66 .keylen
= LONG_PER_U64
,
67 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
),
68 .no_longs
= LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
)),
70 EXPORT_SYMBOL_GPL(btree_geo64
);
72 struct btree_geo btree_geo128
= {
73 .keylen
= 2 * LONG_PER_U64
,
74 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
),
75 .no_longs
= 2 * LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
)),
77 EXPORT_SYMBOL_GPL(btree_geo128
);
79 #define MAX_KEYLEN (2 * LONG_PER_U64)
81 static struct kmem_cache
*btree_cachep
;
83 void *btree_alloc(gfp_t gfp_mask
, void *pool_data
)
85 return kmem_cache_alloc(btree_cachep
, gfp_mask
);
87 EXPORT_SYMBOL_GPL(btree_alloc
);
89 void btree_free(void *element
, void *pool_data
)
91 kmem_cache_free(btree_cachep
, element
);
93 EXPORT_SYMBOL_GPL(btree_free
);
95 static unsigned long *btree_node_alloc(struct btree_head
*head
, gfp_t gfp
)
99 node
= mempool_alloc(head
->mempool
, gfp
);
101 memset(node
, 0, NODESIZE
);
105 static int longcmp(const unsigned long *l1
, const unsigned long *l2
, size_t n
)
109 for (i
= 0; i
< n
; i
++) {
118 static unsigned long *longcpy(unsigned long *dest
, const unsigned long *src
,
123 for (i
= 0; i
< n
; i
++)
128 static unsigned long *longset(unsigned long *s
, unsigned long c
, size_t n
)
132 for (i
= 0; i
< n
; i
++)
137 static void dec_key(struct btree_geo
*geo
, unsigned long *key
)
142 for (i
= geo
->keylen
- 1; i
>= 0; i
--) {
150 static unsigned long *bkey(struct btree_geo
*geo
, unsigned long *node
, int n
)
152 return &node
[n
* geo
->keylen
];
155 static void *bval(struct btree_geo
*geo
, unsigned long *node
, int n
)
157 return (void *)node
[geo
->no_longs
+ n
];
160 static void setkey(struct btree_geo
*geo
, unsigned long *node
, int n
,
163 longcpy(bkey(geo
, node
, n
), key
, geo
->keylen
);
166 static void setval(struct btree_geo
*geo
, unsigned long *node
, int n
,
169 node
[geo
->no_longs
+ n
] = (unsigned long) val
;
172 static void clearpair(struct btree_geo
*geo
, unsigned long *node
, int n
)
174 longset(bkey(geo
, node
, n
), 0, geo
->keylen
);
175 node
[geo
->no_longs
+ n
] = 0;
178 static inline void __btree_init(struct btree_head
*head
)
184 void btree_init_mempool(struct btree_head
*head
, mempool_t
*mempool
)
187 head
->mempool
= mempool
;
189 EXPORT_SYMBOL_GPL(btree_init_mempool
);
191 int btree_init(struct btree_head
*head
)
194 head
->mempool
= mempool_create(0, btree_alloc
, btree_free
, NULL
);
199 EXPORT_SYMBOL_GPL(btree_init
);
201 void btree_destroy(struct btree_head
*head
)
203 mempool_free(head
->node
, head
->mempool
);
204 mempool_destroy(head
->mempool
);
205 head
->mempool
= NULL
;
207 EXPORT_SYMBOL_GPL(btree_destroy
);
209 void *btree_last(struct btree_head
*head
, struct btree_geo
*geo
,
212 int height
= head
->height
;
213 unsigned long *node
= head
->node
;
218 for ( ; height
> 1; height
--)
219 node
= bval(geo
, node
, 0);
221 longcpy(key
, bkey(geo
, node
, 0), geo
->keylen
);
222 return bval(geo
, node
, 0);
224 EXPORT_SYMBOL_GPL(btree_last
);
226 static int keycmp(struct btree_geo
*geo
, unsigned long *node
, int pos
,
229 return longcmp(bkey(geo
, node
, pos
), key
, geo
->keylen
);
232 static int keyzero(struct btree_geo
*geo
, unsigned long *key
)
236 for (i
= 0; i
< geo
->keylen
; i
++)
243 void *btree_lookup(struct btree_head
*head
, struct btree_geo
*geo
,
246 int i
, height
= head
->height
;
247 unsigned long *node
= head
->node
;
252 for ( ; height
> 1; height
--) {
253 for (i
= 0; i
< geo
->no_pairs
; i
++)
254 if (keycmp(geo
, node
, i
, key
) <= 0)
256 if (i
== geo
->no_pairs
)
258 node
= bval(geo
, node
, i
);
266 for (i
= 0; i
< geo
->no_pairs
; i
++)
267 if (keycmp(geo
, node
, i
, key
) == 0)
268 return bval(geo
, node
, i
);
271 EXPORT_SYMBOL_GPL(btree_lookup
);
273 int btree_update(struct btree_head
*head
, struct btree_geo
*geo
,
274 unsigned long *key
, void *val
)
276 int i
, height
= head
->height
;
277 unsigned long *node
= head
->node
;
282 for ( ; height
> 1; height
--) {
283 for (i
= 0; i
< geo
->no_pairs
; i
++)
284 if (keycmp(geo
, node
, i
, key
) <= 0)
286 if (i
== geo
->no_pairs
)
288 node
= bval(geo
, node
, i
);
296 for (i
= 0; i
< geo
->no_pairs
; i
++)
297 if (keycmp(geo
, node
, i
, key
) == 0) {
298 setval(geo
, node
, i
, val
);
303 EXPORT_SYMBOL_GPL(btree_update
);
306 * Usually this function is quite similar to normal lookup. But the key of
307 * a parent node may be smaller than the smallest key of all its siblings.
308 * In such a case we cannot just return NULL, as we have only proven that no
309 * key smaller than __key, but larger than this parent key exists.
310 * So we set __key to the parent key and retry. We have to use the smallest
311 * such parent key, which is the last parent key we encountered.
313 void *btree_get_prev(struct btree_head
*head
, struct btree_geo
*geo
,
314 unsigned long *__key
)
317 unsigned long *node
, *oldnode
;
318 unsigned long *retry_key
= NULL
, key
[MAX_KEYLEN
];
320 if (keyzero(geo
, __key
))
323 if (head
->height
== 0)
325 longcpy(key
, __key
, geo
->keylen
);
330 for (height
= head
->height
; height
> 1; height
--) {
331 for (i
= 0; i
< geo
->no_pairs
; i
++)
332 if (keycmp(geo
, node
, i
, key
) <= 0)
334 if (i
== geo
->no_pairs
)
337 node
= bval(geo
, node
, i
);
340 retry_key
= bkey(geo
, oldnode
, i
);
346 for (i
= 0; i
< geo
->no_pairs
; i
++) {
347 if (keycmp(geo
, node
, i
, key
) <= 0) {
348 if (bval(geo
, node
, i
)) {
349 longcpy(__key
, bkey(geo
, node
, i
), geo
->keylen
);
350 return bval(geo
, node
, i
);
357 longcpy(key
, retry_key
, geo
->keylen
);
363 EXPORT_SYMBOL_GPL(btree_get_prev
);
365 static int getpos(struct btree_geo
*geo
, unsigned long *node
,
370 for (i
= 0; i
< geo
->no_pairs
; i
++) {
371 if (keycmp(geo
, node
, i
, key
) <= 0)
377 static int getfill(struct btree_geo
*geo
, unsigned long *node
, int start
)
381 for (i
= start
; i
< geo
->no_pairs
; i
++)
382 if (!bval(geo
, node
, i
))
388 * locate the correct leaf node in the btree
390 static unsigned long *find_level(struct btree_head
*head
, struct btree_geo
*geo
,
391 unsigned long *key
, int level
)
393 unsigned long *node
= head
->node
;
396 for (height
= head
->height
; height
> level
; height
--) {
397 for (i
= 0; i
< geo
->no_pairs
; i
++)
398 if (keycmp(geo
, node
, i
, key
) <= 0)
401 if ((i
== geo
->no_pairs
) || !bval(geo
, node
, i
)) {
402 /* right-most key is too large, update it */
403 /* FIXME: If the right-most key on higher levels is
404 * always zero, this wouldn't be necessary. */
406 setkey(geo
, node
, i
, key
);
409 node
= bval(geo
, node
, i
);
415 static int btree_grow(struct btree_head
*head
, struct btree_geo
*geo
,
421 node
= btree_node_alloc(head
, gfp
);
425 fill
= getfill(geo
, head
->node
, 0);
426 setkey(geo
, node
, 0, bkey(geo
, head
->node
, fill
- 1));
427 setval(geo
, node
, 0, head
->node
);
434 static void btree_shrink(struct btree_head
*head
, struct btree_geo
*geo
)
439 if (head
->height
<= 1)
443 fill
= getfill(geo
, node
, 0);
445 head
->node
= bval(geo
, node
, 0);
447 mempool_free(node
, head
->mempool
);
450 static int btree_insert_level(struct btree_head
*head
, struct btree_geo
*geo
,
451 unsigned long *key
, void *val
, int level
,
455 int i
, pos
, fill
, err
;
458 if (head
->height
< level
) {
459 err
= btree_grow(head
, geo
, gfp
);
465 node
= find_level(head
, geo
, key
, level
);
466 pos
= getpos(geo
, node
, key
);
467 fill
= getfill(geo
, node
, pos
);
468 /* two identical keys are not allowed */
469 BUG_ON(pos
< fill
&& keycmp(geo
, node
, pos
, key
) == 0);
471 if (fill
== geo
->no_pairs
) {
472 /* need to split node */
475 new = btree_node_alloc(head
, gfp
);
478 err
= btree_insert_level(head
, geo
,
479 bkey(geo
, node
, fill
/ 2 - 1),
480 new, level
+ 1, gfp
);
482 mempool_free(new, head
->mempool
);
485 for (i
= 0; i
< fill
/ 2; i
++) {
486 setkey(geo
, new, i
, bkey(geo
, node
, i
));
487 setval(geo
, new, i
, bval(geo
, node
, i
));
488 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ fill
/ 2));
489 setval(geo
, node
, i
, bval(geo
, node
, i
+ fill
/ 2));
490 clearpair(geo
, node
, i
+ fill
/ 2);
493 setkey(geo
, node
, i
, bkey(geo
, node
, fill
- 1));
494 setval(geo
, node
, i
, bval(geo
, node
, fill
- 1));
495 clearpair(geo
, node
, fill
- 1);
499 BUG_ON(fill
>= geo
->no_pairs
);
501 /* shift and insert */
502 for (i
= fill
; i
> pos
; i
--) {
503 setkey(geo
, node
, i
, bkey(geo
, node
, i
- 1));
504 setval(geo
, node
, i
, bval(geo
, node
, i
- 1));
506 setkey(geo
, node
, pos
, key
);
507 setval(geo
, node
, pos
, val
);
512 int btree_insert(struct btree_head
*head
, struct btree_geo
*geo
,
513 unsigned long *key
, void *val
, gfp_t gfp
)
516 return btree_insert_level(head
, geo
, key
, val
, 1, gfp
);
518 EXPORT_SYMBOL_GPL(btree_insert
);
520 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
521 unsigned long *key
, int level
);
522 static void merge(struct btree_head
*head
, struct btree_geo
*geo
, int level
,
523 unsigned long *left
, int lfill
,
524 unsigned long *right
, int rfill
,
525 unsigned long *parent
, int lpos
)
529 for (i
= 0; i
< rfill
; i
++) {
530 /* Move all keys to the left */
531 setkey(geo
, left
, lfill
+ i
, bkey(geo
, right
, i
));
532 setval(geo
, left
, lfill
+ i
, bval(geo
, right
, i
));
534 /* Exchange left and right child in parent */
535 setval(geo
, parent
, lpos
, right
);
536 setval(geo
, parent
, lpos
+ 1, left
);
537 /* Remove left (formerly right) child from parent */
538 btree_remove_level(head
, geo
, bkey(geo
, parent
, lpos
), level
+ 1);
539 mempool_free(right
, head
->mempool
);
542 static void rebalance(struct btree_head
*head
, struct btree_geo
*geo
,
543 unsigned long *key
, int level
, unsigned long *child
, int fill
)
545 unsigned long *parent
, *left
= NULL
, *right
= NULL
;
546 int i
, no_left
, no_right
;
549 /* Because we don't steal entries from a neighbour, this case
550 * can happen. Parent node contains a single child, this
551 * node, so merging with a sibling never happens.
553 btree_remove_level(head
, geo
, key
, level
+ 1);
554 mempool_free(child
, head
->mempool
);
558 parent
= find_level(head
, geo
, key
, level
+ 1);
559 i
= getpos(geo
, parent
, key
);
560 BUG_ON(bval(geo
, parent
, i
) != child
);
563 left
= bval(geo
, parent
, i
- 1);
564 no_left
= getfill(geo
, left
, 0);
565 if (fill
+ no_left
<= geo
->no_pairs
) {
566 merge(head
, geo
, level
,
573 if (i
+ 1 < getfill(geo
, parent
, i
)) {
574 right
= bval(geo
, parent
, i
+ 1);
575 no_right
= getfill(geo
, right
, 0);
576 if (fill
+ no_right
<= geo
->no_pairs
) {
577 merge(head
, geo
, level
,
585 * We could also try to steal one entry from the left or right
586 * neighbor. By not doing so we changed the invariant from
587 * "all nodes are at least half full" to "no two neighboring
588 * nodes can be merged". Which means that the average fill of
589 * all nodes is still half or better.
593 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
594 unsigned long *key
, int level
)
600 if (level
> head
->height
) {
601 /* we recursed all the way up */
607 node
= find_level(head
, geo
, key
, level
);
608 pos
= getpos(geo
, node
, key
);
609 fill
= getfill(geo
, node
, pos
);
610 if ((level
== 1) && (keycmp(geo
, node
, pos
, key
) != 0))
612 ret
= bval(geo
, node
, pos
);
614 /* remove and shift */
615 for (i
= pos
; i
< fill
- 1; i
++) {
616 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ 1));
617 setval(geo
, node
, i
, bval(geo
, node
, i
+ 1));
619 clearpair(geo
, node
, fill
- 1);
621 if (fill
- 1 < geo
->no_pairs
/ 2) {
622 if (level
< head
->height
)
623 rebalance(head
, geo
, key
, level
, node
, fill
- 1);
624 else if (fill
- 1 == 1)
625 btree_shrink(head
, geo
);
631 void *btree_remove(struct btree_head
*head
, struct btree_geo
*geo
,
634 if (head
->height
== 0)
637 return btree_remove_level(head
, geo
, key
, 1);
639 EXPORT_SYMBOL_GPL(btree_remove
);
641 int btree_merge(struct btree_head
*target
, struct btree_head
*victim
,
642 struct btree_geo
*geo
, gfp_t gfp
)
644 unsigned long key
[MAX_KEYLEN
];
645 unsigned long dup
[MAX_KEYLEN
];
649 BUG_ON(target
== victim
);
651 if (!(target
->node
)) {
652 /* target is empty, just copy fields over */
653 target
->node
= victim
->node
;
654 target
->height
= victim
->height
;
655 __btree_init(victim
);
659 /* TODO: This needs some optimizations. Currently we do three tree
660 * walks to remove a single object from the victim.
663 if (!btree_last(victim
, geo
, key
))
665 val
= btree_lookup(victim
, geo
, key
);
666 err
= btree_insert(target
, geo
, key
, val
, gfp
);
669 /* We must make a copy of the key, as the original will get
670 * mangled inside btree_remove. */
671 longcpy(dup
, key
, geo
->keylen
);
672 btree_remove(victim
, geo
, dup
);
676 EXPORT_SYMBOL_GPL(btree_merge
);
678 static size_t __btree_for_each(struct btree_head
*head
, struct btree_geo
*geo
,
679 unsigned long *node
, unsigned long opaque
,
680 void (*func
)(void *elem
, unsigned long opaque
,
681 unsigned long *key
, size_t index
,
683 void *func2
, int reap
, int height
, size_t count
)
686 unsigned long *child
;
688 for (i
= 0; i
< geo
->no_pairs
; i
++) {
689 child
= bval(geo
, node
, i
);
693 count
= __btree_for_each(head
, geo
, child
, opaque
,
694 func
, func2
, reap
, height
- 1, count
);
696 func(child
, opaque
, bkey(geo
, node
, i
), count
++,
700 mempool_free(node
, head
->mempool
);
704 static void empty(void *elem
, unsigned long opaque
, unsigned long *key
,
705 size_t index
, void *func2
)
709 void visitorl(void *elem
, unsigned long opaque
, unsigned long *key
,
710 size_t index
, void *__func
)
712 visitorl_t func
= __func
;
714 func(elem
, opaque
, *key
, index
);
716 EXPORT_SYMBOL_GPL(visitorl
);
718 void visitor32(void *elem
, unsigned long opaque
, unsigned long *__key
,
719 size_t index
, void *__func
)
721 visitor32_t func
= __func
;
722 u32
*key
= (void *)__key
;
724 func(elem
, opaque
, *key
, index
);
726 EXPORT_SYMBOL_GPL(visitor32
);
728 void visitor64(void *elem
, unsigned long opaque
, unsigned long *__key
,
729 size_t index
, void *__func
)
731 visitor64_t func
= __func
;
732 u64
*key
= (void *)__key
;
734 func(elem
, opaque
, *key
, index
);
736 EXPORT_SYMBOL_GPL(visitor64
);
738 void visitor128(void *elem
, unsigned long opaque
, unsigned long *__key
,
739 size_t index
, void *__func
)
741 visitor128_t func
= __func
;
742 u64
*key
= (void *)__key
;
744 func(elem
, opaque
, key
[0], key
[1], index
);
746 EXPORT_SYMBOL_GPL(visitor128
);
748 size_t btree_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
749 unsigned long opaque
,
750 void (*func
)(void *elem
, unsigned long opaque
,
752 size_t index
, void *func2
),
760 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
761 func2
, 0, head
->height
, 0);
764 EXPORT_SYMBOL_GPL(btree_visitor
);
766 size_t btree_grim_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
767 unsigned long opaque
,
768 void (*func
)(void *elem
, unsigned long opaque
,
770 size_t index
, void *func2
),
778 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
779 func2
, 1, head
->height
, 0);
783 EXPORT_SYMBOL_GPL(btree_grim_visitor
);
785 static int __init
btree_module_init(void)
787 btree_cachep
= kmem_cache_create("btree_node", NODESIZE
, 0,
788 SLAB_HWCACHE_ALIGN
, NULL
);
792 static void __exit
btree_module_exit(void)
794 kmem_cache_destroy(btree_cachep
);
797 /* If core code starts using btree, initialization should happen even earlier */
798 module_init(btree_module_init
);
799 module_exit(btree_module_exit
);
801 MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
802 MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
803 MODULE_LICENSE("GPL");