2 * lib/btree.c - Simple In-memory B+Tree
4 * As should be obvious for Linux kernel code, license is GPLv2
6 * Copyright (c) 2007-2008 Joern Engel <joern@logfs.org>
7 * Bits and pieces stolen from Peter Zijlstra's code, which is
8 * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
11 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
13 * A relatively simple B+Tree implementation. I have written it as a learning
14 * exercise to understand how B+Trees work. Turned out to be useful as well.
16 * B+Trees can be used similar to Linux radix trees (which don't have anything
17 * in common with textbook radix trees, beware). Prerequisite for them working
18 * well is that access to a random tree node is much faster than a large number
19 * of operations within each node.
21 * Disks have fulfilled the prerequisite for a long time. More recently DRAM
22 * has gained similar properties, as memory access times, when measured in cpu
23 * cycles, have increased. Cacheline sizes have increased as well, which also
26 * Compared to radix trees, B+Trees are more efficient when dealing with a
27 * sparsely populated address space. Between 25% and 50% of the memory is
28 * occupied with valid pointers. When densely populated, radix trees contain
29 * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
32 * This particular implementation stores pointers identified by a long value.
33 * Storing NULL pointers is illegal, lookup will return NULL when no entry
36 * A tricks was used that is not commonly found in textbooks. The lowest
37 * values are to the right, not to the left. All used slots within a node
38 * are on the left, all unused slots contain NUL values. Most operations
39 * simply loop once over all slots and terminate on the first NUL.
42 #include <linux/btree.h>
43 #include <linux/cache.h>
44 #include <linux/kernel.h>
45 #include <linux/slab.h>
46 #include <linux/module.h>
48 #define MAX(a, b) ((a) > (b) ? (a) : (b))
49 #define NODESIZE MAX(L1_CACHE_BYTES, 128)
57 struct btree_geo btree_geo32
= {
59 .no_pairs
= NODESIZE
/ sizeof(long) / 2,
60 .no_longs
= NODESIZE
/ sizeof(long) / 2,
62 EXPORT_SYMBOL_GPL(btree_geo32
);
64 #define LONG_PER_U64 (64 / BITS_PER_LONG)
65 struct btree_geo btree_geo64
= {
66 .keylen
= LONG_PER_U64
,
67 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
),
68 .no_longs
= LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
)),
70 EXPORT_SYMBOL_GPL(btree_geo64
);
72 struct btree_geo btree_geo128
= {
73 .keylen
= 2 * LONG_PER_U64
,
74 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
),
75 .no_longs
= 2 * LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
)),
77 EXPORT_SYMBOL_GPL(btree_geo128
);
79 static struct kmem_cache
*btree_cachep
;
81 void *btree_alloc(gfp_t gfp_mask
, void *pool_data
)
83 return kmem_cache_alloc(btree_cachep
, gfp_mask
);
85 EXPORT_SYMBOL_GPL(btree_alloc
);
87 void btree_free(void *element
, void *pool_data
)
89 kmem_cache_free(btree_cachep
, element
);
91 EXPORT_SYMBOL_GPL(btree_free
);
93 static unsigned long *btree_node_alloc(struct btree_head
*head
, gfp_t gfp
)
97 node
= mempool_alloc(head
->mempool
, gfp
);
99 memset(node
, 0, NODESIZE
);
103 static int longcmp(const unsigned long *l1
, const unsigned long *l2
, size_t n
)
107 for (i
= 0; i
< n
; i
++) {
116 static unsigned long *longcpy(unsigned long *dest
, const unsigned long *src
,
121 for (i
= 0; i
< n
; i
++)
126 static unsigned long *longset(unsigned long *s
, unsigned long c
, size_t n
)
130 for (i
= 0; i
< n
; i
++)
135 static void dec_key(struct btree_geo
*geo
, unsigned long *key
)
140 for (i
= geo
->keylen
- 1; i
>= 0; i
--) {
148 static unsigned long *bkey(struct btree_geo
*geo
, unsigned long *node
, int n
)
150 return &node
[n
* geo
->keylen
];
153 static void *bval(struct btree_geo
*geo
, unsigned long *node
, int n
)
155 return (void *)node
[geo
->no_longs
+ n
];
158 static void setkey(struct btree_geo
*geo
, unsigned long *node
, int n
,
161 longcpy(bkey(geo
, node
, n
), key
, geo
->keylen
);
164 static void setval(struct btree_geo
*geo
, unsigned long *node
, int n
,
167 node
[geo
->no_longs
+ n
] = (unsigned long) val
;
170 static void clearpair(struct btree_geo
*geo
, unsigned long *node
, int n
)
172 longset(bkey(geo
, node
, n
), 0, geo
->keylen
);
173 node
[geo
->no_longs
+ n
] = 0;
176 static inline void __btree_init(struct btree_head
*head
)
182 void btree_init_mempool(struct btree_head
*head
, mempool_t
*mempool
)
185 head
->mempool
= mempool
;
187 EXPORT_SYMBOL_GPL(btree_init_mempool
);
189 int btree_init(struct btree_head
*head
)
192 head
->mempool
= mempool_create(0, btree_alloc
, btree_free
, NULL
);
197 EXPORT_SYMBOL_GPL(btree_init
);
199 void btree_destroy(struct btree_head
*head
)
201 mempool_destroy(head
->mempool
);
202 head
->mempool
= NULL
;
204 EXPORT_SYMBOL_GPL(btree_destroy
);
206 void *btree_last(struct btree_head
*head
, struct btree_geo
*geo
,
209 int height
= head
->height
;
210 unsigned long *node
= head
->node
;
215 for ( ; height
> 1; height
--)
216 node
= bval(geo
, node
, 0);
218 longcpy(key
, bkey(geo
, node
, 0), geo
->keylen
);
219 return bval(geo
, node
, 0);
221 EXPORT_SYMBOL_GPL(btree_last
);
223 static int keycmp(struct btree_geo
*geo
, unsigned long *node
, int pos
,
226 return longcmp(bkey(geo
, node
, pos
), key
, geo
->keylen
);
229 static int keyzero(struct btree_geo
*geo
, unsigned long *key
)
233 for (i
= 0; i
< geo
->keylen
; i
++)
240 void *btree_lookup(struct btree_head
*head
, struct btree_geo
*geo
,
243 int i
, height
= head
->height
;
244 unsigned long *node
= head
->node
;
249 for ( ; height
> 1; height
--) {
250 for (i
= 0; i
< geo
->no_pairs
; i
++)
251 if (keycmp(geo
, node
, i
, key
) <= 0)
253 if (i
== geo
->no_pairs
)
255 node
= bval(geo
, node
, i
);
263 for (i
= 0; i
< geo
->no_pairs
; i
++)
264 if (keycmp(geo
, node
, i
, key
) == 0)
265 return bval(geo
, node
, i
);
268 EXPORT_SYMBOL_GPL(btree_lookup
);
270 int btree_update(struct btree_head
*head
, struct btree_geo
*geo
,
271 unsigned long *key
, void *val
)
273 int i
, height
= head
->height
;
274 unsigned long *node
= head
->node
;
279 for ( ; height
> 1; height
--) {
280 for (i
= 0; i
< geo
->no_pairs
; i
++)
281 if (keycmp(geo
, node
, i
, key
) <= 0)
283 if (i
== geo
->no_pairs
)
285 node
= bval(geo
, node
, i
);
293 for (i
= 0; i
< geo
->no_pairs
; i
++)
294 if (keycmp(geo
, node
, i
, key
) == 0) {
295 setval(geo
, node
, i
, val
);
300 EXPORT_SYMBOL_GPL(btree_update
);
303 * Usually this function is quite similar to normal lookup. But the key of
304 * a parent node may be smaller than the smallest key of all its siblings.
305 * In such a case we cannot just return NULL, as we have only proven that no
306 * key smaller than __key, but larger than this parent key exists.
307 * So we set __key to the parent key and retry. We have to use the smallest
308 * such parent key, which is the last parent key we encountered.
310 void *btree_get_prev(struct btree_head
*head
, struct btree_geo
*geo
,
311 unsigned long *__key
)
314 unsigned long *node
, *oldnode
;
315 unsigned long *retry_key
= NULL
, key
[geo
->keylen
];
317 if (keyzero(geo
, __key
))
320 if (head
->height
== 0)
322 longcpy(key
, __key
, geo
->keylen
);
327 for (height
= head
->height
; height
> 1; height
--) {
328 for (i
= 0; i
< geo
->no_pairs
; i
++)
329 if (keycmp(geo
, node
, i
, key
) <= 0)
331 if (i
== geo
->no_pairs
)
334 node
= bval(geo
, node
, i
);
337 retry_key
= bkey(geo
, oldnode
, i
);
343 for (i
= 0; i
< geo
->no_pairs
; i
++) {
344 if (keycmp(geo
, node
, i
, key
) <= 0) {
345 if (bval(geo
, node
, i
)) {
346 longcpy(__key
, bkey(geo
, node
, i
), geo
->keylen
);
347 return bval(geo
, node
, i
);
354 longcpy(key
, retry_key
, geo
->keylen
);
360 EXPORT_SYMBOL_GPL(btree_get_prev
);
362 static int getpos(struct btree_geo
*geo
, unsigned long *node
,
367 for (i
= 0; i
< geo
->no_pairs
; i
++) {
368 if (keycmp(geo
, node
, i
, key
) <= 0)
374 static int getfill(struct btree_geo
*geo
, unsigned long *node
, int start
)
378 for (i
= start
; i
< geo
->no_pairs
; i
++)
379 if (!bval(geo
, node
, i
))
385 * locate the correct leaf node in the btree
387 static unsigned long *find_level(struct btree_head
*head
, struct btree_geo
*geo
,
388 unsigned long *key
, int level
)
390 unsigned long *node
= head
->node
;
393 for (height
= head
->height
; height
> level
; height
--) {
394 for (i
= 0; i
< geo
->no_pairs
; i
++)
395 if (keycmp(geo
, node
, i
, key
) <= 0)
398 if ((i
== geo
->no_pairs
) || !bval(geo
, node
, i
)) {
399 /* right-most key is too large, update it */
400 /* FIXME: If the right-most key on higher levels is
401 * always zero, this wouldn't be necessary. */
403 setkey(geo
, node
, i
, key
);
406 node
= bval(geo
, node
, i
);
412 static int btree_grow(struct btree_head
*head
, struct btree_geo
*geo
,
418 node
= btree_node_alloc(head
, gfp
);
422 fill
= getfill(geo
, head
->node
, 0);
423 setkey(geo
, node
, 0, bkey(geo
, head
->node
, fill
- 1));
424 setval(geo
, node
, 0, head
->node
);
431 static void btree_shrink(struct btree_head
*head
, struct btree_geo
*geo
)
436 if (head
->height
<= 1)
440 fill
= getfill(geo
, node
, 0);
442 head
->node
= bval(geo
, node
, 0);
444 mempool_free(node
, head
->mempool
);
447 static int btree_insert_level(struct btree_head
*head
, struct btree_geo
*geo
,
448 unsigned long *key
, void *val
, int level
,
452 int i
, pos
, fill
, err
;
455 if (head
->height
< level
) {
456 err
= btree_grow(head
, geo
, gfp
);
462 node
= find_level(head
, geo
, key
, level
);
463 pos
= getpos(geo
, node
, key
);
464 fill
= getfill(geo
, node
, pos
);
465 /* two identical keys are not allowed */
466 BUG_ON(pos
< fill
&& keycmp(geo
, node
, pos
, key
) == 0);
468 if (fill
== geo
->no_pairs
) {
469 /* need to split node */
472 new = btree_node_alloc(head
, gfp
);
475 err
= btree_insert_level(head
, geo
,
476 bkey(geo
, node
, fill
/ 2 - 1),
477 new, level
+ 1, gfp
);
479 mempool_free(new, head
->mempool
);
482 for (i
= 0; i
< fill
/ 2; i
++) {
483 setkey(geo
, new, i
, bkey(geo
, node
, i
));
484 setval(geo
, new, i
, bval(geo
, node
, i
));
485 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ fill
/ 2));
486 setval(geo
, node
, i
, bval(geo
, node
, i
+ fill
/ 2));
487 clearpair(geo
, node
, i
+ fill
/ 2);
490 setkey(geo
, node
, i
, bkey(geo
, node
, fill
- 1));
491 setval(geo
, node
, i
, bval(geo
, node
, fill
- 1));
492 clearpair(geo
, node
, fill
- 1);
496 BUG_ON(fill
>= geo
->no_pairs
);
498 /* shift and insert */
499 for (i
= fill
; i
> pos
; i
--) {
500 setkey(geo
, node
, i
, bkey(geo
, node
, i
- 1));
501 setval(geo
, node
, i
, bval(geo
, node
, i
- 1));
503 setkey(geo
, node
, pos
, key
);
504 setval(geo
, node
, pos
, val
);
509 int btree_insert(struct btree_head
*head
, struct btree_geo
*geo
,
510 unsigned long *key
, void *val
, gfp_t gfp
)
513 return btree_insert_level(head
, geo
, key
, val
, 1, gfp
);
515 EXPORT_SYMBOL_GPL(btree_insert
);
517 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
518 unsigned long *key
, int level
);
519 static void merge(struct btree_head
*head
, struct btree_geo
*geo
, int level
,
520 unsigned long *left
, int lfill
,
521 unsigned long *right
, int rfill
,
522 unsigned long *parent
, int lpos
)
526 for (i
= 0; i
< rfill
; i
++) {
527 /* Move all keys to the left */
528 setkey(geo
, left
, lfill
+ i
, bkey(geo
, right
, i
));
529 setval(geo
, left
, lfill
+ i
, bval(geo
, right
, i
));
531 /* Exchange left and right child in parent */
532 setval(geo
, parent
, lpos
, right
);
533 setval(geo
, parent
, lpos
+ 1, left
);
534 /* Remove left (formerly right) child from parent */
535 btree_remove_level(head
, geo
, bkey(geo
, parent
, lpos
), level
+ 1);
536 mempool_free(right
, head
->mempool
);
539 static void rebalance(struct btree_head
*head
, struct btree_geo
*geo
,
540 unsigned long *key
, int level
, unsigned long *child
, int fill
)
542 unsigned long *parent
, *left
= NULL
, *right
= NULL
;
543 int i
, no_left
, no_right
;
546 /* Because we don't steal entries from a neighbour, this case
547 * can happen. Parent node contains a single child, this
548 * node, so merging with a sibling never happens.
550 btree_remove_level(head
, geo
, key
, level
+ 1);
551 mempool_free(child
, head
->mempool
);
555 parent
= find_level(head
, geo
, key
, level
+ 1);
556 i
= getpos(geo
, parent
, key
);
557 BUG_ON(bval(geo
, parent
, i
) != child
);
560 left
= bval(geo
, parent
, i
- 1);
561 no_left
= getfill(geo
, left
, 0);
562 if (fill
+ no_left
<= geo
->no_pairs
) {
563 merge(head
, geo
, level
,
570 if (i
+ 1 < getfill(geo
, parent
, i
)) {
571 right
= bval(geo
, parent
, i
+ 1);
572 no_right
= getfill(geo
, right
, 0);
573 if (fill
+ no_right
<= geo
->no_pairs
) {
574 merge(head
, geo
, level
,
582 * We could also try to steal one entry from the left or right
583 * neighbor. By not doing so we changed the invariant from
584 * "all nodes are at least half full" to "no two neighboring
585 * nodes can be merged". Which means that the average fill of
586 * all nodes is still half or better.
590 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
591 unsigned long *key
, int level
)
597 if (level
> head
->height
) {
598 /* we recursed all the way up */
604 node
= find_level(head
, geo
, key
, level
);
605 pos
= getpos(geo
, node
, key
);
606 fill
= getfill(geo
, node
, pos
);
607 if ((level
== 1) && (keycmp(geo
, node
, pos
, key
) != 0))
609 ret
= bval(geo
, node
, pos
);
611 /* remove and shift */
612 for (i
= pos
; i
< fill
- 1; i
++) {
613 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ 1));
614 setval(geo
, node
, i
, bval(geo
, node
, i
+ 1));
616 clearpair(geo
, node
, fill
- 1);
618 if (fill
- 1 < geo
->no_pairs
/ 2) {
619 if (level
< head
->height
)
620 rebalance(head
, geo
, key
, level
, node
, fill
- 1);
621 else if (fill
- 1 == 1)
622 btree_shrink(head
, geo
);
628 void *btree_remove(struct btree_head
*head
, struct btree_geo
*geo
,
631 if (head
->height
== 0)
634 return btree_remove_level(head
, geo
, key
, 1);
636 EXPORT_SYMBOL_GPL(btree_remove
);
638 int btree_merge(struct btree_head
*target
, struct btree_head
*victim
,
639 struct btree_geo
*geo
, gfp_t gfp
)
641 unsigned long key
[geo
->keylen
];
642 unsigned long dup
[geo
->keylen
];
646 BUG_ON(target
== victim
);
648 if (!(target
->node
)) {
649 /* target is empty, just copy fields over */
650 target
->node
= victim
->node
;
651 target
->height
= victim
->height
;
652 __btree_init(victim
);
656 /* TODO: This needs some optimizations. Currently we do three tree
657 * walks to remove a single object from the victim.
660 if (!btree_last(victim
, geo
, key
))
662 val
= btree_lookup(victim
, geo
, key
);
663 err
= btree_insert(target
, geo
, key
, val
, gfp
);
666 /* We must make a copy of the key, as the original will get
667 * mangled inside btree_remove. */
668 longcpy(dup
, key
, geo
->keylen
);
669 btree_remove(victim
, geo
, dup
);
673 EXPORT_SYMBOL_GPL(btree_merge
);
675 static size_t __btree_for_each(struct btree_head
*head
, struct btree_geo
*geo
,
676 unsigned long *node
, unsigned long opaque
,
677 void (*func
)(void *elem
, unsigned long opaque
,
678 unsigned long *key
, size_t index
,
680 void *func2
, int reap
, int height
, size_t count
)
683 unsigned long *child
;
685 for (i
= 0; i
< geo
->no_pairs
; i
++) {
686 child
= bval(geo
, node
, i
);
690 count
= __btree_for_each(head
, geo
, child
, opaque
,
691 func
, func2
, reap
, height
- 1, count
);
693 func(child
, opaque
, bkey(geo
, node
, i
), count
++,
697 mempool_free(node
, head
->mempool
);
701 static void empty(void *elem
, unsigned long opaque
, unsigned long *key
,
702 size_t index
, void *func2
)
706 void visitorl(void *elem
, unsigned long opaque
, unsigned long *key
,
707 size_t index
, void *__func
)
709 visitorl_t func
= __func
;
711 func(elem
, opaque
, *key
, index
);
713 EXPORT_SYMBOL_GPL(visitorl
);
715 void visitor32(void *elem
, unsigned long opaque
, unsigned long *__key
,
716 size_t index
, void *__func
)
718 visitor32_t func
= __func
;
719 u32
*key
= (void *)__key
;
721 func(elem
, opaque
, *key
, index
);
723 EXPORT_SYMBOL_GPL(visitor32
);
725 void visitor64(void *elem
, unsigned long opaque
, unsigned long *__key
,
726 size_t index
, void *__func
)
728 visitor64_t func
= __func
;
729 u64
*key
= (void *)__key
;
731 func(elem
, opaque
, *key
, index
);
733 EXPORT_SYMBOL_GPL(visitor64
);
735 void visitor128(void *elem
, unsigned long opaque
, unsigned long *__key
,
736 size_t index
, void *__func
)
738 visitor128_t func
= __func
;
739 u64
*key
= (void *)__key
;
741 func(elem
, opaque
, key
[0], key
[1], index
);
743 EXPORT_SYMBOL_GPL(visitor128
);
745 size_t btree_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
746 unsigned long opaque
,
747 void (*func
)(void *elem
, unsigned long opaque
,
749 size_t index
, void *func2
),
757 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
758 func2
, 0, head
->height
, 0);
761 EXPORT_SYMBOL_GPL(btree_visitor
);
763 size_t btree_grim_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
764 unsigned long opaque
,
765 void (*func
)(void *elem
, unsigned long opaque
,
767 size_t index
, void *func2
),
775 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
776 func2
, 1, head
->height
, 0);
780 EXPORT_SYMBOL_GPL(btree_grim_visitor
);
782 static int __init
btree_module_init(void)
784 btree_cachep
= kmem_cache_create("btree_node", NODESIZE
, 0,
785 SLAB_HWCACHE_ALIGN
, NULL
);
789 static void __exit
btree_module_exit(void)
791 kmem_cache_destroy(btree_cachep
);
794 /* If core code starts using btree, initialization should happen even earlier */
795 module_init(btree_module_init
);
796 module_exit(btree_module_exit
);
798 MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
799 MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
800 MODULE_LICENSE("GPL");