2 * lib/btree.c - Simple In-memory B+Tree
4 * As should be obvious for Linux kernel code, license is GPLv2
6 * Copyright (c) 2007-2008 Joern Engel <joern@logfs.org>
7 * Bits and pieces stolen from Peter Zijlstra's code, which is
8 * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
11 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
13 * A relatively simple B+Tree implementation. I have written it as a learning
14 * excercise to understand how B+Trees work. Turned out to be useful as well.
16 * B+Trees can be used similar to Linux radix trees (which don't have anything
17 * in common with textbook radix trees, beware). Prerequisite for them working
18 * well is that access to a random tree node is much faster than a large number
19 * of operations within each node.
21 * Disks have fulfilled the prerequisite for a long time. More recently DRAM
22 * has gained similar properties, as memory access times, when measured in cpu
23 * cycles, have increased. Cacheline sizes have increased as well, which also
26 * Compared to radix trees, B+Trees are more efficient when dealing with a
27 * sparsely populated address space. Between 25% and 50% of the memory is
28 * occupied with valid pointers. When densely populated, radix trees contain
29 * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
32 * This particular implementation stores pointers identified by a long value.
33 * Storing NULL pointers is illegal, lookup will return NULL when no entry
36 * A tricks was used that is not commonly found in textbooks. The lowest
37 * values are to the right, not to the left. All used slots within a node
38 * are on the left, all unused slots contain NUL values. Most operations
39 * simply loop once over all slots and terminate on the first NUL.
42 #include <linux/btree.h>
43 #include <linux/cache.h>
44 #include <linux/kernel.h>
45 #include <linux/slab.h>
46 #include <linux/module.h>
48 #define MAX(a, b) ((a) > (b) ? (a) : (b))
49 #define NODESIZE MAX(L1_CACHE_BYTES, 128)
57 struct btree_geo btree_geo32
= {
59 .no_pairs
= NODESIZE
/ sizeof(long) / 2,
60 .no_longs
= NODESIZE
/ sizeof(long) / 2,
62 EXPORT_SYMBOL_GPL(btree_geo32
);
64 #define LONG_PER_U64 (64 / BITS_PER_LONG)
65 struct btree_geo btree_geo64
= {
66 .keylen
= LONG_PER_U64
,
67 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
),
68 .no_longs
= LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
)),
70 EXPORT_SYMBOL_GPL(btree_geo64
);
72 struct btree_geo btree_geo128
= {
73 .keylen
= 2 * LONG_PER_U64
,
74 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
),
75 .no_longs
= 2 * LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
)),
77 EXPORT_SYMBOL_GPL(btree_geo128
);
79 static struct kmem_cache
*btree_cachep
;
81 void *btree_alloc(gfp_t gfp_mask
, void *pool_data
)
83 return kmem_cache_alloc(btree_cachep
, gfp_mask
);
85 EXPORT_SYMBOL_GPL(btree_alloc
);
87 void btree_free(void *element
, void *pool_data
)
89 kmem_cache_free(btree_cachep
, element
);
91 EXPORT_SYMBOL_GPL(btree_free
);
93 static unsigned long *btree_node_alloc(struct btree_head
*head
, gfp_t gfp
)
97 node
= mempool_alloc(head
->mempool
, gfp
);
98 memset(node
, 0, NODESIZE
);
102 static int longcmp(const unsigned long *l1
, const unsigned long *l2
, size_t n
)
106 for (i
= 0; i
< n
; i
++) {
115 static unsigned long *longcpy(unsigned long *dest
, const unsigned long *src
,
120 for (i
= 0; i
< n
; i
++)
125 static unsigned long *longset(unsigned long *s
, unsigned long c
, size_t n
)
129 for (i
= 0; i
< n
; i
++)
134 static void dec_key(struct btree_geo
*geo
, unsigned long *key
)
139 for (i
= geo
->keylen
- 1; i
>= 0; i
--) {
147 static unsigned long *bkey(struct btree_geo
*geo
, unsigned long *node
, int n
)
149 return &node
[n
* geo
->keylen
];
152 static void *bval(struct btree_geo
*geo
, unsigned long *node
, int n
)
154 return (void *)node
[geo
->no_longs
+ n
];
157 static void setkey(struct btree_geo
*geo
, unsigned long *node
, int n
,
160 longcpy(bkey(geo
, node
, n
), key
, geo
->keylen
);
163 static void setval(struct btree_geo
*geo
, unsigned long *node
, int n
,
166 node
[geo
->no_longs
+ n
] = (unsigned long) val
;
169 static void clearpair(struct btree_geo
*geo
, unsigned long *node
, int n
)
171 longset(bkey(geo
, node
, n
), 0, geo
->keylen
);
172 node
[geo
->no_longs
+ n
] = 0;
175 static inline void __btree_init(struct btree_head
*head
)
181 void btree_init_mempool(struct btree_head
*head
, mempool_t
*mempool
)
184 head
->mempool
= mempool
;
186 EXPORT_SYMBOL_GPL(btree_init_mempool
);
188 int btree_init(struct btree_head
*head
)
191 head
->mempool
= mempool_create(0, btree_alloc
, btree_free
, NULL
);
196 EXPORT_SYMBOL_GPL(btree_init
);
198 void btree_destroy(struct btree_head
*head
)
200 mempool_destroy(head
->mempool
);
201 head
->mempool
= NULL
;
203 EXPORT_SYMBOL_GPL(btree_destroy
);
205 void *btree_last(struct btree_head
*head
, struct btree_geo
*geo
,
208 int height
= head
->height
;
209 unsigned long *node
= head
->node
;
214 for ( ; height
> 1; height
--)
215 node
= bval(geo
, node
, 0);
217 longcpy(key
, bkey(geo
, node
, 0), geo
->keylen
);
218 return bval(geo
, node
, 0);
220 EXPORT_SYMBOL_GPL(btree_last
);
222 static int keycmp(struct btree_geo
*geo
, unsigned long *node
, int pos
,
225 return longcmp(bkey(geo
, node
, pos
), key
, geo
->keylen
);
228 static int keyzero(struct btree_geo
*geo
, unsigned long *key
)
232 for (i
= 0; i
< geo
->keylen
; i
++)
239 void *btree_lookup(struct btree_head
*head
, struct btree_geo
*geo
,
242 int i
, height
= head
->height
;
243 unsigned long *node
= head
->node
;
248 for ( ; height
> 1; height
--) {
249 for (i
= 0; i
< geo
->no_pairs
; i
++)
250 if (keycmp(geo
, node
, i
, key
) <= 0)
252 if (i
== geo
->no_pairs
)
254 node
= bval(geo
, node
, i
);
262 for (i
= 0; i
< geo
->no_pairs
; i
++)
263 if (keycmp(geo
, node
, i
, key
) == 0)
264 return bval(geo
, node
, i
);
267 EXPORT_SYMBOL_GPL(btree_lookup
);
269 int btree_update(struct btree_head
*head
, struct btree_geo
*geo
,
270 unsigned long *key
, void *val
)
272 int i
, height
= head
->height
;
273 unsigned long *node
= head
->node
;
278 for ( ; height
> 1; height
--) {
279 for (i
= 0; i
< geo
->no_pairs
; i
++)
280 if (keycmp(geo
, node
, i
, key
) <= 0)
282 if (i
== geo
->no_pairs
)
284 node
= bval(geo
, node
, i
);
292 for (i
= 0; i
< geo
->no_pairs
; i
++)
293 if (keycmp(geo
, node
, i
, key
) == 0) {
294 setval(geo
, node
, i
, val
);
299 EXPORT_SYMBOL_GPL(btree_update
);
302 * Usually this function is quite similar to normal lookup. But the key of
303 * a parent node may be smaller than the smallest key of all its siblings.
304 * In such a case we cannot just return NULL, as we have only proven that no
305 * key smaller than __key, but larger than this parent key exists.
306 * So we set __key to the parent key and retry. We have to use the smallest
307 * such parent key, which is the last parent key we encountered.
309 void *btree_get_prev(struct btree_head
*head
, struct btree_geo
*geo
,
310 unsigned long *__key
)
313 unsigned long *node
, *oldnode
;
314 unsigned long *retry_key
= NULL
, key
[geo
->keylen
];
316 if (keyzero(geo
, __key
))
319 if (head
->height
== 0)
322 longcpy(key
, __key
, geo
->keylen
);
326 for (height
= head
->height
; height
> 1; height
--) {
327 for (i
= 0; i
< geo
->no_pairs
; i
++)
328 if (keycmp(geo
, node
, i
, key
) <= 0)
330 if (i
== geo
->no_pairs
)
333 node
= bval(geo
, node
, i
);
336 retry_key
= bkey(geo
, oldnode
, i
);
342 for (i
= 0; i
< geo
->no_pairs
; i
++) {
343 if (keycmp(geo
, node
, i
, key
) <= 0) {
344 if (bval(geo
, node
, i
)) {
345 longcpy(__key
, bkey(geo
, node
, i
), geo
->keylen
);
346 return bval(geo
, node
, i
);
360 static int getpos(struct btree_geo
*geo
, unsigned long *node
,
365 for (i
= 0; i
< geo
->no_pairs
; i
++) {
366 if (keycmp(geo
, node
, i
, key
) <= 0)
372 static int getfill(struct btree_geo
*geo
, unsigned long *node
, int start
)
376 for (i
= start
; i
< geo
->no_pairs
; i
++)
377 if (!bval(geo
, node
, i
))
383 * locate the correct leaf node in the btree
385 static unsigned long *find_level(struct btree_head
*head
, struct btree_geo
*geo
,
386 unsigned long *key
, int level
)
388 unsigned long *node
= head
->node
;
391 for (height
= head
->height
; height
> level
; height
--) {
392 for (i
= 0; i
< geo
->no_pairs
; i
++)
393 if (keycmp(geo
, node
, i
, key
) <= 0)
396 if ((i
== geo
->no_pairs
) || !bval(geo
, node
, i
)) {
397 /* right-most key is too large, update it */
398 /* FIXME: If the right-most key on higher levels is
399 * always zero, this wouldn't be necessary. */
401 setkey(geo
, node
, i
, key
);
404 node
= bval(geo
, node
, i
);
410 static int btree_grow(struct btree_head
*head
, struct btree_geo
*geo
,
416 node
= btree_node_alloc(head
, gfp
);
420 fill
= getfill(geo
, head
->node
, 0);
421 setkey(geo
, node
, 0, bkey(geo
, head
->node
, fill
- 1));
422 setval(geo
, node
, 0, head
->node
);
429 static void btree_shrink(struct btree_head
*head
, struct btree_geo
*geo
)
434 if (head
->height
<= 1)
438 fill
= getfill(geo
, node
, 0);
440 head
->node
= bval(geo
, node
, 0);
442 mempool_free(node
, head
->mempool
);
445 static int btree_insert_level(struct btree_head
*head
, struct btree_geo
*geo
,
446 unsigned long *key
, void *val
, int level
,
450 int i
, pos
, fill
, err
;
453 if (head
->height
< level
) {
454 err
= btree_grow(head
, geo
, gfp
);
460 node
= find_level(head
, geo
, key
, level
);
461 pos
= getpos(geo
, node
, key
);
462 fill
= getfill(geo
, node
, pos
);
463 /* two identical keys are not allowed */
464 BUG_ON(pos
< fill
&& keycmp(geo
, node
, pos
, key
) == 0);
466 if (fill
== geo
->no_pairs
) {
467 /* need to split node */
470 new = btree_node_alloc(head
, gfp
);
473 err
= btree_insert_level(head
, geo
,
474 bkey(geo
, node
, fill
/ 2 - 1),
475 new, level
+ 1, gfp
);
477 mempool_free(new, head
->mempool
);
480 for (i
= 0; i
< fill
/ 2; i
++) {
481 setkey(geo
, new, i
, bkey(geo
, node
, i
));
482 setval(geo
, new, i
, bval(geo
, node
, i
));
483 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ fill
/ 2));
484 setval(geo
, node
, i
, bval(geo
, node
, i
+ fill
/ 2));
485 clearpair(geo
, node
, i
+ fill
/ 2);
488 setkey(geo
, node
, i
, bkey(geo
, node
, fill
- 1));
489 setval(geo
, node
, i
, bval(geo
, node
, fill
- 1));
490 clearpair(geo
, node
, fill
- 1);
494 BUG_ON(fill
>= geo
->no_pairs
);
496 /* shift and insert */
497 for (i
= fill
; i
> pos
; i
--) {
498 setkey(geo
, node
, i
, bkey(geo
, node
, i
- 1));
499 setval(geo
, node
, i
, bval(geo
, node
, i
- 1));
501 setkey(geo
, node
, pos
, key
);
502 setval(geo
, node
, pos
, val
);
507 int btree_insert(struct btree_head
*head
, struct btree_geo
*geo
,
508 unsigned long *key
, void *val
, gfp_t gfp
)
510 return btree_insert_level(head
, geo
, key
, val
, 1, gfp
);
512 EXPORT_SYMBOL_GPL(btree_insert
);
514 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
515 unsigned long *key
, int level
);
516 static void merge(struct btree_head
*head
, struct btree_geo
*geo
, int level
,
517 unsigned long *left
, int lfill
,
518 unsigned long *right
, int rfill
,
519 unsigned long *parent
, int lpos
)
523 for (i
= 0; i
< rfill
; i
++) {
524 /* Move all keys to the left */
525 setkey(geo
, left
, lfill
+ i
, bkey(geo
, right
, i
));
526 setval(geo
, left
, lfill
+ i
, bval(geo
, right
, i
));
528 /* Exchange left and right child in parent */
529 setval(geo
, parent
, lpos
, right
);
530 setval(geo
, parent
, lpos
+ 1, left
);
531 /* Remove left (formerly right) child from parent */
532 btree_remove_level(head
, geo
, bkey(geo
, parent
, lpos
), level
+ 1);
533 mempool_free(right
, head
->mempool
);
536 static void rebalance(struct btree_head
*head
, struct btree_geo
*geo
,
537 unsigned long *key
, int level
, unsigned long *child
, int fill
)
539 unsigned long *parent
, *left
= NULL
, *right
= NULL
;
540 int i
, no_left
, no_right
;
543 /* Because we don't steal entries from a neigbour, this case
544 * can happen. Parent node contains a single child, this
545 * node, so merging with a sibling never happens.
547 btree_remove_level(head
, geo
, key
, level
+ 1);
548 mempool_free(child
, head
->mempool
);
552 parent
= find_level(head
, geo
, key
, level
+ 1);
553 i
= getpos(geo
, parent
, key
);
554 BUG_ON(bval(geo
, parent
, i
) != child
);
557 left
= bval(geo
, parent
, i
- 1);
558 no_left
= getfill(geo
, left
, 0);
559 if (fill
+ no_left
<= geo
->no_pairs
) {
560 merge(head
, geo
, level
,
567 if (i
+ 1 < getfill(geo
, parent
, i
)) {
568 right
= bval(geo
, parent
, i
+ 1);
569 no_right
= getfill(geo
, right
, 0);
570 if (fill
+ no_right
<= geo
->no_pairs
) {
571 merge(head
, geo
, level
,
579 * We could also try to steal one entry from the left or right
580 * neighbor. By not doing so we changed the invariant from
581 * "all nodes are at least half full" to "no two neighboring
582 * nodes can be merged". Which means that the average fill of
583 * all nodes is still half or better.
587 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
588 unsigned long *key
, int level
)
594 if (level
> head
->height
) {
595 /* we recursed all the way up */
601 node
= find_level(head
, geo
, key
, level
);
602 pos
= getpos(geo
, node
, key
);
603 fill
= getfill(geo
, node
, pos
);
604 if ((level
== 1) && (keycmp(geo
, node
, pos
, key
) != 0))
606 ret
= bval(geo
, node
, pos
);
608 /* remove and shift */
609 for (i
= pos
; i
< fill
- 1; i
++) {
610 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ 1));
611 setval(geo
, node
, i
, bval(geo
, node
, i
+ 1));
613 clearpair(geo
, node
, fill
- 1);
615 if (fill
- 1 < geo
->no_pairs
/ 2) {
616 if (level
< head
->height
)
617 rebalance(head
, geo
, key
, level
, node
, fill
- 1);
618 else if (fill
- 1 == 1)
619 btree_shrink(head
, geo
);
625 void *btree_remove(struct btree_head
*head
, struct btree_geo
*geo
,
628 if (head
->height
== 0)
631 return btree_remove_level(head
, geo
, key
, 1);
633 EXPORT_SYMBOL_GPL(btree_remove
);
635 int btree_merge(struct btree_head
*target
, struct btree_head
*victim
,
636 struct btree_geo
*geo
, gfp_t gfp
)
638 unsigned long key
[geo
->keylen
];
639 unsigned long dup
[geo
->keylen
];
643 BUG_ON(target
== victim
);
645 if (!(target
->node
)) {
646 /* target is empty, just copy fields over */
647 target
->node
= victim
->node
;
648 target
->height
= victim
->height
;
649 __btree_init(victim
);
653 /* TODO: This needs some optimizations. Currently we do three tree
654 * walks to remove a single object from the victim.
657 if (!btree_last(victim
, geo
, key
))
659 val
= btree_lookup(victim
, geo
, key
);
660 err
= btree_insert(target
, geo
, key
, val
, gfp
);
663 /* We must make a copy of the key, as the original will get
664 * mangled inside btree_remove. */
665 longcpy(dup
, key
, geo
->keylen
);
666 btree_remove(victim
, geo
, dup
);
670 EXPORT_SYMBOL_GPL(btree_merge
);
672 static size_t __btree_for_each(struct btree_head
*head
, struct btree_geo
*geo
,
673 unsigned long *node
, unsigned long opaque
,
674 void (*func
)(void *elem
, unsigned long opaque
,
675 unsigned long *key
, size_t index
,
677 void *func2
, int reap
, int height
, size_t count
)
680 unsigned long *child
;
682 for (i
= 0; i
< geo
->no_pairs
; i
++) {
683 child
= bval(geo
, node
, i
);
687 count
= __btree_for_each(head
, geo
, child
, opaque
,
688 func
, func2
, reap
, height
- 1, count
);
690 func(child
, opaque
, bkey(geo
, node
, i
), count
++,
694 mempool_free(node
, head
->mempool
);
698 static void empty(void *elem
, unsigned long opaque
, unsigned long *key
,
699 size_t index
, void *func2
)
703 void visitorl(void *elem
, unsigned long opaque
, unsigned long *key
,
704 size_t index
, void *__func
)
706 visitorl_t func
= __func
;
708 func(elem
, opaque
, *key
, index
);
710 EXPORT_SYMBOL_GPL(visitorl
);
712 void visitor32(void *elem
, unsigned long opaque
, unsigned long *__key
,
713 size_t index
, void *__func
)
715 visitor32_t func
= __func
;
716 u32
*key
= (void *)__key
;
718 func(elem
, opaque
, *key
, index
);
720 EXPORT_SYMBOL_GPL(visitor32
);
722 void visitor64(void *elem
, unsigned long opaque
, unsigned long *__key
,
723 size_t index
, void *__func
)
725 visitor64_t func
= __func
;
726 u64
*key
= (void *)__key
;
728 func(elem
, opaque
, *key
, index
);
730 EXPORT_SYMBOL_GPL(visitor64
);
732 void visitor128(void *elem
, unsigned long opaque
, unsigned long *__key
,
733 size_t index
, void *__func
)
735 visitor128_t func
= __func
;
736 u64
*key
= (void *)__key
;
738 func(elem
, opaque
, key
[0], key
[1], index
);
740 EXPORT_SYMBOL_GPL(visitor128
);
742 size_t btree_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
743 unsigned long opaque
,
744 void (*func
)(void *elem
, unsigned long opaque
,
746 size_t index
, void *func2
),
754 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
755 func2
, 0, head
->height
, 0);
758 EXPORT_SYMBOL_GPL(btree_visitor
);
760 size_t btree_grim_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
761 unsigned long opaque
,
762 void (*func
)(void *elem
, unsigned long opaque
,
764 size_t index
, void *func2
),
772 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
773 func2
, 1, head
->height
, 0);
777 EXPORT_SYMBOL_GPL(btree_grim_visitor
);
779 static int __init
btree_module_init(void)
781 btree_cachep
= kmem_cache_create("btree_node", NODESIZE
, 0,
782 SLAB_HWCACHE_ALIGN
, NULL
);
786 static void __exit
btree_module_exit(void)
788 kmem_cache_destroy(btree_cachep
);
791 /* If core code starts using btree, initialization should happen even earlier */
792 module_init(btree_module_init
);
793 module_exit(btree_module_exit
);
795 MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
796 MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
797 MODULE_LICENSE("GPL");