1 // SPDX-License-Identifier: GPL-2.0-only
3 * lib/btree.c - Simple In-memory B+Tree
5 * Copyright (c) 2007-2008 Joern Engel <joern@purestorage.com>
6 * Bits and pieces stolen from Peter Zijlstra's code, which is
7 * Copyright 2007, Red Hat Inc. Peter Zijlstra
9 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
11 * A relatively simple B+Tree implementation. I have written it as a learning
12 * exercise to understand how B+Trees work. Turned out to be useful as well.
14 * B+Trees can be used similar to Linux radix trees (which don't have anything
15 * in common with textbook radix trees, beware). Prerequisite for them working
16 * well is that access to a random tree node is much faster than a large number
17 * of operations within each node.
19 * Disks have fulfilled the prerequisite for a long time. More recently DRAM
20 * has gained similar properties, as memory access times, when measured in cpu
21 * cycles, have increased. Cacheline sizes have increased as well, which also
24 * Compared to radix trees, B+Trees are more efficient when dealing with a
25 * sparsely populated address space. Between 25% and 50% of the memory is
26 * occupied with valid pointers. When densely populated, radix trees contain
27 * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
30 * This particular implementation stores pointers identified by a long value.
31 * Storing NULL pointers is illegal, lookup will return NULL when no entry
34 * A tricks was used that is not commonly found in textbooks. The lowest
35 * values are to the right, not to the left. All used slots within a node
36 * are on the left, all unused slots contain NUL values. Most operations
37 * simply loop once over all slots and terminate on the first NUL.
40 #include <linux/btree.h>
41 #include <linux/cache.h>
42 #include <linux/kernel.h>
43 #include <linux/slab.h>
44 #include <linux/module.h>
46 #define MAX(a, b) ((a) > (b) ? (a) : (b))
47 #define NODESIZE MAX(L1_CACHE_BYTES, 128)
55 struct btree_geo btree_geo32
= {
57 .no_pairs
= NODESIZE
/ sizeof(long) / 2,
58 .no_longs
= NODESIZE
/ sizeof(long) / 2,
60 EXPORT_SYMBOL_GPL(btree_geo32
);
62 #define LONG_PER_U64 (64 / BITS_PER_LONG)
63 struct btree_geo btree_geo64
= {
64 .keylen
= LONG_PER_U64
,
65 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
),
66 .no_longs
= LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + LONG_PER_U64
)),
68 EXPORT_SYMBOL_GPL(btree_geo64
);
70 struct btree_geo btree_geo128
= {
71 .keylen
= 2 * LONG_PER_U64
,
72 .no_pairs
= NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
),
73 .no_longs
= 2 * LONG_PER_U64
* (NODESIZE
/ sizeof(long) / (1 + 2 * LONG_PER_U64
)),
75 EXPORT_SYMBOL_GPL(btree_geo128
);
77 #define MAX_KEYLEN (2 * LONG_PER_U64)
79 static struct kmem_cache
*btree_cachep
;
81 void *btree_alloc(gfp_t gfp_mask
, void *pool_data
)
83 return kmem_cache_alloc(btree_cachep
, gfp_mask
);
85 EXPORT_SYMBOL_GPL(btree_alloc
);
87 void btree_free(void *element
, void *pool_data
)
89 kmem_cache_free(btree_cachep
, element
);
91 EXPORT_SYMBOL_GPL(btree_free
);
93 static unsigned long *btree_node_alloc(struct btree_head
*head
, gfp_t gfp
)
97 node
= mempool_alloc(head
->mempool
, gfp
);
99 memset(node
, 0, NODESIZE
);
103 static int longcmp(const unsigned long *l1
, const unsigned long *l2
, size_t n
)
107 for (i
= 0; i
< n
; i
++) {
116 static unsigned long *longcpy(unsigned long *dest
, const unsigned long *src
,
121 for (i
= 0; i
< n
; i
++)
126 static unsigned long *longset(unsigned long *s
, unsigned long c
, size_t n
)
130 for (i
= 0; i
< n
; i
++)
135 static void dec_key(struct btree_geo
*geo
, unsigned long *key
)
140 for (i
= geo
->keylen
- 1; i
>= 0; i
--) {
148 static unsigned long *bkey(struct btree_geo
*geo
, unsigned long *node
, int n
)
150 return &node
[n
* geo
->keylen
];
153 static void *bval(struct btree_geo
*geo
, unsigned long *node
, int n
)
155 return (void *)node
[geo
->no_longs
+ n
];
158 static void setkey(struct btree_geo
*geo
, unsigned long *node
, int n
,
161 longcpy(bkey(geo
, node
, n
), key
, geo
->keylen
);
164 static void setval(struct btree_geo
*geo
, unsigned long *node
, int n
,
167 node
[geo
->no_longs
+ n
] = (unsigned long) val
;
170 static void clearpair(struct btree_geo
*geo
, unsigned long *node
, int n
)
172 longset(bkey(geo
, node
, n
), 0, geo
->keylen
);
173 node
[geo
->no_longs
+ n
] = 0;
176 static inline void __btree_init(struct btree_head
*head
)
182 void btree_init_mempool(struct btree_head
*head
, mempool_t
*mempool
)
185 head
->mempool
= mempool
;
187 EXPORT_SYMBOL_GPL(btree_init_mempool
);
189 int btree_init(struct btree_head
*head
)
192 head
->mempool
= mempool_create(0, btree_alloc
, btree_free
, NULL
);
197 EXPORT_SYMBOL_GPL(btree_init
);
199 void btree_destroy(struct btree_head
*head
)
201 mempool_free(head
->node
, head
->mempool
);
202 mempool_destroy(head
->mempool
);
203 head
->mempool
= NULL
;
205 EXPORT_SYMBOL_GPL(btree_destroy
);
207 void *btree_last(struct btree_head
*head
, struct btree_geo
*geo
,
210 int height
= head
->height
;
211 unsigned long *node
= head
->node
;
216 for ( ; height
> 1; height
--)
217 node
= bval(geo
, node
, 0);
219 longcpy(key
, bkey(geo
, node
, 0), geo
->keylen
);
220 return bval(geo
, node
, 0);
222 EXPORT_SYMBOL_GPL(btree_last
);
224 static int keycmp(struct btree_geo
*geo
, unsigned long *node
, int pos
,
227 return longcmp(bkey(geo
, node
, pos
), key
, geo
->keylen
);
230 static int keyzero(struct btree_geo
*geo
, unsigned long *key
)
234 for (i
= 0; i
< geo
->keylen
; i
++)
241 void *btree_lookup(struct btree_head
*head
, struct btree_geo
*geo
,
244 int i
, height
= head
->height
;
245 unsigned long *node
= head
->node
;
250 for ( ; height
> 1; height
--) {
251 for (i
= 0; i
< geo
->no_pairs
; i
++)
252 if (keycmp(geo
, node
, i
, key
) <= 0)
254 if (i
== geo
->no_pairs
)
256 node
= bval(geo
, node
, i
);
264 for (i
= 0; i
< geo
->no_pairs
; i
++)
265 if (keycmp(geo
, node
, i
, key
) == 0)
266 return bval(geo
, node
, i
);
269 EXPORT_SYMBOL_GPL(btree_lookup
);
271 int btree_update(struct btree_head
*head
, struct btree_geo
*geo
,
272 unsigned long *key
, void *val
)
274 int i
, height
= head
->height
;
275 unsigned long *node
= head
->node
;
280 for ( ; height
> 1; height
--) {
281 for (i
= 0; i
< geo
->no_pairs
; i
++)
282 if (keycmp(geo
, node
, i
, key
) <= 0)
284 if (i
== geo
->no_pairs
)
286 node
= bval(geo
, node
, i
);
294 for (i
= 0; i
< geo
->no_pairs
; i
++)
295 if (keycmp(geo
, node
, i
, key
) == 0) {
296 setval(geo
, node
, i
, val
);
301 EXPORT_SYMBOL_GPL(btree_update
);
304 * Usually this function is quite similar to normal lookup. But the key of
305 * a parent node may be smaller than the smallest key of all its siblings.
306 * In such a case we cannot just return NULL, as we have only proven that no
307 * key smaller than __key, but larger than this parent key exists.
308 * So we set __key to the parent key and retry. We have to use the smallest
309 * such parent key, which is the last parent key we encountered.
311 void *btree_get_prev(struct btree_head
*head
, struct btree_geo
*geo
,
312 unsigned long *__key
)
315 unsigned long *node
, *oldnode
;
316 unsigned long *retry_key
= NULL
, key
[MAX_KEYLEN
];
318 if (keyzero(geo
, __key
))
321 if (head
->height
== 0)
323 longcpy(key
, __key
, geo
->keylen
);
328 for (height
= head
->height
; height
> 1; height
--) {
329 for (i
= 0; i
< geo
->no_pairs
; i
++)
330 if (keycmp(geo
, node
, i
, key
) <= 0)
332 if (i
== geo
->no_pairs
)
335 node
= bval(geo
, node
, i
);
338 retry_key
= bkey(geo
, oldnode
, i
);
344 for (i
= 0; i
< geo
->no_pairs
; i
++) {
345 if (keycmp(geo
, node
, i
, key
) <= 0) {
346 if (bval(geo
, node
, i
)) {
347 longcpy(__key
, bkey(geo
, node
, i
), geo
->keylen
);
348 return bval(geo
, node
, i
);
355 longcpy(key
, retry_key
, geo
->keylen
);
361 EXPORT_SYMBOL_GPL(btree_get_prev
);
363 static int getpos(struct btree_geo
*geo
, unsigned long *node
,
368 for (i
= 0; i
< geo
->no_pairs
; i
++) {
369 if (keycmp(geo
, node
, i
, key
) <= 0)
375 static int getfill(struct btree_geo
*geo
, unsigned long *node
, int start
)
379 for (i
= start
; i
< geo
->no_pairs
; i
++)
380 if (!bval(geo
, node
, i
))
386 * locate the correct leaf node in the btree
388 static unsigned long *find_level(struct btree_head
*head
, struct btree_geo
*geo
,
389 unsigned long *key
, int level
)
391 unsigned long *node
= head
->node
;
394 for (height
= head
->height
; height
> level
; height
--) {
395 for (i
= 0; i
< geo
->no_pairs
; i
++)
396 if (keycmp(geo
, node
, i
, key
) <= 0)
399 if ((i
== geo
->no_pairs
) || !bval(geo
, node
, i
)) {
400 /* right-most key is too large, update it */
401 /* FIXME: If the right-most key on higher levels is
402 * always zero, this wouldn't be necessary. */
404 setkey(geo
, node
, i
, key
);
407 node
= bval(geo
, node
, i
);
413 static int btree_grow(struct btree_head
*head
, struct btree_geo
*geo
,
419 node
= btree_node_alloc(head
, gfp
);
423 fill
= getfill(geo
, head
->node
, 0);
424 setkey(geo
, node
, 0, bkey(geo
, head
->node
, fill
- 1));
425 setval(geo
, node
, 0, head
->node
);
432 static void btree_shrink(struct btree_head
*head
, struct btree_geo
*geo
)
437 if (head
->height
<= 1)
441 fill
= getfill(geo
, node
, 0);
443 head
->node
= bval(geo
, node
, 0);
445 mempool_free(node
, head
->mempool
);
448 static int btree_insert_level(struct btree_head
*head
, struct btree_geo
*geo
,
449 unsigned long *key
, void *val
, int level
,
453 int i
, pos
, fill
, err
;
456 if (head
->height
< level
) {
457 err
= btree_grow(head
, geo
, gfp
);
463 node
= find_level(head
, geo
, key
, level
);
464 pos
= getpos(geo
, node
, key
);
465 fill
= getfill(geo
, node
, pos
);
466 /* two identical keys are not allowed */
467 BUG_ON(pos
< fill
&& keycmp(geo
, node
, pos
, key
) == 0);
469 if (fill
== geo
->no_pairs
) {
470 /* need to split node */
473 new = btree_node_alloc(head
, gfp
);
476 err
= btree_insert_level(head
, geo
,
477 bkey(geo
, node
, fill
/ 2 - 1),
478 new, level
+ 1, gfp
);
480 mempool_free(new, head
->mempool
);
483 for (i
= 0; i
< fill
/ 2; i
++) {
484 setkey(geo
, new, i
, bkey(geo
, node
, i
));
485 setval(geo
, new, i
, bval(geo
, node
, i
));
486 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ fill
/ 2));
487 setval(geo
, node
, i
, bval(geo
, node
, i
+ fill
/ 2));
488 clearpair(geo
, node
, i
+ fill
/ 2);
491 setkey(geo
, node
, i
, bkey(geo
, node
, fill
- 1));
492 setval(geo
, node
, i
, bval(geo
, node
, fill
- 1));
493 clearpair(geo
, node
, fill
- 1);
497 BUG_ON(fill
>= geo
->no_pairs
);
499 /* shift and insert */
500 for (i
= fill
; i
> pos
; i
--) {
501 setkey(geo
, node
, i
, bkey(geo
, node
, i
- 1));
502 setval(geo
, node
, i
, bval(geo
, node
, i
- 1));
504 setkey(geo
, node
, pos
, key
);
505 setval(geo
, node
, pos
, val
);
510 int btree_insert(struct btree_head
*head
, struct btree_geo
*geo
,
511 unsigned long *key
, void *val
, gfp_t gfp
)
514 return btree_insert_level(head
, geo
, key
, val
, 1, gfp
);
516 EXPORT_SYMBOL_GPL(btree_insert
);
518 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
519 unsigned long *key
, int level
);
520 static void merge(struct btree_head
*head
, struct btree_geo
*geo
, int level
,
521 unsigned long *left
, int lfill
,
522 unsigned long *right
, int rfill
,
523 unsigned long *parent
, int lpos
)
527 for (i
= 0; i
< rfill
; i
++) {
528 /* Move all keys to the left */
529 setkey(geo
, left
, lfill
+ i
, bkey(geo
, right
, i
));
530 setval(geo
, left
, lfill
+ i
, bval(geo
, right
, i
));
532 /* Exchange left and right child in parent */
533 setval(geo
, parent
, lpos
, right
);
534 setval(geo
, parent
, lpos
+ 1, left
);
535 /* Remove left (formerly right) child from parent */
536 btree_remove_level(head
, geo
, bkey(geo
, parent
, lpos
), level
+ 1);
537 mempool_free(right
, head
->mempool
);
540 static void rebalance(struct btree_head
*head
, struct btree_geo
*geo
,
541 unsigned long *key
, int level
, unsigned long *child
, int fill
)
543 unsigned long *parent
, *left
= NULL
, *right
= NULL
;
544 int i
, no_left
, no_right
;
547 /* Because we don't steal entries from a neighbour, this case
548 * can happen. Parent node contains a single child, this
549 * node, so merging with a sibling never happens.
551 btree_remove_level(head
, geo
, key
, level
+ 1);
552 mempool_free(child
, head
->mempool
);
556 parent
= find_level(head
, geo
, key
, level
+ 1);
557 i
= getpos(geo
, parent
, key
);
558 BUG_ON(bval(geo
, parent
, i
) != child
);
561 left
= bval(geo
, parent
, i
- 1);
562 no_left
= getfill(geo
, left
, 0);
563 if (fill
+ no_left
<= geo
->no_pairs
) {
564 merge(head
, geo
, level
,
571 if (i
+ 1 < getfill(geo
, parent
, i
)) {
572 right
= bval(geo
, parent
, i
+ 1);
573 no_right
= getfill(geo
, right
, 0);
574 if (fill
+ no_right
<= geo
->no_pairs
) {
575 merge(head
, geo
, level
,
583 * We could also try to steal one entry from the left or right
584 * neighbor. By not doing so we changed the invariant from
585 * "all nodes are at least half full" to "no two neighboring
586 * nodes can be merged". Which means that the average fill of
587 * all nodes is still half or better.
591 static void *btree_remove_level(struct btree_head
*head
, struct btree_geo
*geo
,
592 unsigned long *key
, int level
)
598 if (level
> head
->height
) {
599 /* we recursed all the way up */
605 node
= find_level(head
, geo
, key
, level
);
606 pos
= getpos(geo
, node
, key
);
607 fill
= getfill(geo
, node
, pos
);
608 if ((level
== 1) && (keycmp(geo
, node
, pos
, key
) != 0))
610 ret
= bval(geo
, node
, pos
);
612 /* remove and shift */
613 for (i
= pos
; i
< fill
- 1; i
++) {
614 setkey(geo
, node
, i
, bkey(geo
, node
, i
+ 1));
615 setval(geo
, node
, i
, bval(geo
, node
, i
+ 1));
617 clearpair(geo
, node
, fill
- 1);
619 if (fill
- 1 < geo
->no_pairs
/ 2) {
620 if (level
< head
->height
)
621 rebalance(head
, geo
, key
, level
, node
, fill
- 1);
622 else if (fill
- 1 == 1)
623 btree_shrink(head
, geo
);
629 void *btree_remove(struct btree_head
*head
, struct btree_geo
*geo
,
632 if (head
->height
== 0)
635 return btree_remove_level(head
, geo
, key
, 1);
637 EXPORT_SYMBOL_GPL(btree_remove
);
639 int btree_merge(struct btree_head
*target
, struct btree_head
*victim
,
640 struct btree_geo
*geo
, gfp_t gfp
)
642 unsigned long key
[MAX_KEYLEN
];
643 unsigned long dup
[MAX_KEYLEN
];
647 BUG_ON(target
== victim
);
649 if (!(target
->node
)) {
650 /* target is empty, just copy fields over */
651 target
->node
= victim
->node
;
652 target
->height
= victim
->height
;
653 __btree_init(victim
);
657 /* TODO: This needs some optimizations. Currently we do three tree
658 * walks to remove a single object from the victim.
661 if (!btree_last(victim
, geo
, key
))
663 val
= btree_lookup(victim
, geo
, key
);
664 err
= btree_insert(target
, geo
, key
, val
, gfp
);
667 /* We must make a copy of the key, as the original will get
668 * mangled inside btree_remove. */
669 longcpy(dup
, key
, geo
->keylen
);
670 btree_remove(victim
, geo
, dup
);
674 EXPORT_SYMBOL_GPL(btree_merge
);
676 static size_t __btree_for_each(struct btree_head
*head
, struct btree_geo
*geo
,
677 unsigned long *node
, unsigned long opaque
,
678 void (*func
)(void *elem
, unsigned long opaque
,
679 unsigned long *key
, size_t index
,
681 void *func2
, int reap
, int height
, size_t count
)
684 unsigned long *child
;
686 for (i
= 0; i
< geo
->no_pairs
; i
++) {
687 child
= bval(geo
, node
, i
);
691 count
= __btree_for_each(head
, geo
, child
, opaque
,
692 func
, func2
, reap
, height
- 1, count
);
694 func(child
, opaque
, bkey(geo
, node
, i
), count
++,
698 mempool_free(node
, head
->mempool
);
702 static void empty(void *elem
, unsigned long opaque
, unsigned long *key
,
703 size_t index
, void *func2
)
707 void visitorl(void *elem
, unsigned long opaque
, unsigned long *key
,
708 size_t index
, void *__func
)
710 visitorl_t func
= __func
;
712 func(elem
, opaque
, *key
, index
);
714 EXPORT_SYMBOL_GPL(visitorl
);
716 void visitor32(void *elem
, unsigned long opaque
, unsigned long *__key
,
717 size_t index
, void *__func
)
719 visitor32_t func
= __func
;
720 u32
*key
= (void *)__key
;
722 func(elem
, opaque
, *key
, index
);
724 EXPORT_SYMBOL_GPL(visitor32
);
726 void visitor64(void *elem
, unsigned long opaque
, unsigned long *__key
,
727 size_t index
, void *__func
)
729 visitor64_t func
= __func
;
730 u64
*key
= (void *)__key
;
732 func(elem
, opaque
, *key
, index
);
734 EXPORT_SYMBOL_GPL(visitor64
);
736 void visitor128(void *elem
, unsigned long opaque
, unsigned long *__key
,
737 size_t index
, void *__func
)
739 visitor128_t func
= __func
;
740 u64
*key
= (void *)__key
;
742 func(elem
, opaque
, key
[0], key
[1], index
);
744 EXPORT_SYMBOL_GPL(visitor128
);
746 size_t btree_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
747 unsigned long opaque
,
748 void (*func
)(void *elem
, unsigned long opaque
,
750 size_t index
, void *func2
),
758 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
759 func2
, 0, head
->height
, 0);
762 EXPORT_SYMBOL_GPL(btree_visitor
);
764 size_t btree_grim_visitor(struct btree_head
*head
, struct btree_geo
*geo
,
765 unsigned long opaque
,
766 void (*func
)(void *elem
, unsigned long opaque
,
768 size_t index
, void *func2
),
776 count
= __btree_for_each(head
, geo
, head
->node
, opaque
, func
,
777 func2
, 1, head
->height
, 0);
781 EXPORT_SYMBOL_GPL(btree_grim_visitor
);
783 static int __init
btree_module_init(void)
785 btree_cachep
= kmem_cache_create("btree_node", NODESIZE
, 0,
786 SLAB_HWCACHE_ALIGN
, NULL
);
790 static void __exit
btree_module_exit(void)
792 kmem_cache_destroy(btree_cachep
);
795 /* If core code starts using btree, initialization should happen even earlier */
796 module_init(btree_module_init
);
797 module_exit(btree_module_exit
);
799 MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
800 MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
801 MODULE_LICENSE("GPL");