/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 * Copyright (C) 2006 Nick Piggin
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
22 #include <linux/errno.h>
23 #include <linux/init.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/radix-tree.h>
27 #include <linux/percpu.h>
28 #include <linux/slab.h>
29 #include <linux/notifier.h>
30 #include <linux/cpu.h>
31 #include <linux/gfp.h>
32 #include <linux/string.h>
33 #include <linux/bitops.h>
34 #include <linux/rcupdate.h>
35 #include <linux/spinlock.h>
39 #define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
41 #define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
44 #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
45 #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
47 #define RADIX_TREE_TAG_LONGS \
48 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
/*
 * One node of the radix tree: an array of RADIX_TREE_MAP_SIZE child slots
 * plus one tag bitmap per tag, one bit per slot.
 * NOTE(review): extraction dropped lines from this definition -- the code
 * below also uses a 'count' member (radix_tree_shrink) and, under
 * CONFIG_RADIX_TREE_CONCURRENT, a 'lock' spinlock (radix_node_lock);
 * the closing brace is missing too.  Restore from the original file.
 */
50 struct radix_tree_node
{
51 unsigned int height
; /* Height from the bottom */
/* Allows deferred (RCU) freeing; see radix_tree_node_rcu_free(). */
53 struct rcu_head rcu_head
;
/* Child pointers (interior nodes) or items (leaf level). */
54 void *slots
[RADIX_TREE_MAP_SIZE
];
/* Per-tag bitmaps; bit N describes slots[N]. */
55 unsigned long tags
[RADIX_TREE_MAX_TAGS
][RADIX_TREE_TAG_LONGS
];
56 #ifdef CONFIG_RADIX_TREE_CONCURRENT
/*
 * One step of a root-to-leaf walk, recorded so tag changes can be
 * propagated back up the tree (see radix_tree_tag_clear()).
 * NOTE(review): extraction dropped member lines here -- later code also
 * uses 'offset' (pathp->offset) and, under CONFIG_RADIX_TREE_CONCURRENT,
 * 'locked' (pathp->locked); the closing brace is missing.  Restore from
 * the original file.
 */
61 struct radix_tree_path
{
62 struct radix_tree_node
*node
;
64 #ifdef CONFIG_RADIX_TREE_CONCURRENT
69 #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
70 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
71 RADIX_TREE_MAP_SHIFT))
74 * The height_to_maxindex array needs to be one deeper than the maximum
75 * path as height 0 holds only 1 entry.
77 static unsigned long height_to_maxindex
[RADIX_TREE_MAX_PATH
+ 1] __read_mostly
;
79 #ifdef CONFIG_RADIX_TREE_CONCURRENT
80 static struct lock_class_key radix_node_class
[RADIX_TREE_MAX_PATH
];
82 #ifdef CONFIG_DEBUG_LOCK_ALLOC
83 static const char *radix_node_key_string
[RADIX_TREE_MAX_PATH
] = {
103 #ifdef CONFIG_RADIX_TREE_OPTIMISTIC
104 static DEFINE_PER_CPU(unsigned long[RADIX_TREE_MAX_PATH
+1], optimistic_histogram
);
/*
 * Record one optimistic-locking hit in this CPU's histogram, bucketed by
 * how many tree levels were skipped.  Heights beyond the maximum path are
 * clamped into the last bucket (also used as the "failed" bucket by
 * radix_optimistic_lock()).
 * NOTE(review): the function's braces were dropped by extraction.
 */
106 static void optimistic_hit(unsigned long height
)
108 if (height
> RADIX_TREE_MAX_PATH
)
109 height
= RADIX_TREE_MAX_PATH
;
/* per-cpu counter: no locking needed, caller has preemption context */
111 __get_cpu_var(optimistic_histogram
)[height
]++;
114 #ifdef CONFIG_PROC_FS
116 #include <linux/seq_file.h>
117 #include <linux/uaccess.h>
119 static void *frag_start(struct seq_file
*m
, loff_t
*pos
)
121 if (*pos
< 0 || *pos
> RADIX_TREE_MAX_PATH
)
124 m
->private = (void *)(unsigned long)*pos
;
128 static void *frag_next(struct seq_file
*m
, void *arg
, loff_t
*pos
)
130 if (*pos
< RADIX_TREE_MAX_PATH
) {
132 (*((unsigned long *)&m
->private))++;
138 static void frag_stop(struct seq_file
*m
, void *arg
)
142 unsigned long get_optimistic_stat(unsigned long index
)
144 unsigned long total
= 0;
147 for_each_possible_cpu(cpu
) {
148 total
+= per_cpu(optimistic_histogram
, cpu
)[index
];
153 static int frag_show(struct seq_file
*m
, void *arg
)
155 unsigned long index
= (unsigned long)m
->private;
156 unsigned long hits
= get_optimistic_stat(index
);
159 seq_printf(m
, "levels skipped\thits\n");
161 if (index
< RADIX_TREE_MAX_PATH
)
162 seq_printf(m
, "%9lu\t%9lu\n", index
, hits
);
164 seq_printf(m
, "failed\t%9lu\n", hits
);
169 struct seq_operations optimistic_op
= {
176 static void optimistic_reset(void)
180 for_each_possible_cpu(cpu
) {
181 for (height
= 0; height
<= RADIX_TREE_MAX_PATH
; height
++)
182 per_cpu(optimistic_histogram
, cpu
)[height
] = 0;
186 ssize_t
optimistic_write(struct file
*file
, const char __user
*buf
,
187 size_t count
, loff_t
*ppos
)
191 if (get_user(c
, buf
))
199 #endif // CONFIG_PROC_FS
200 #endif // CONFIG_RADIX_TREE_OPTIMISTIC
203 * Radix tree node cache.
205 static struct kmem_cache
*radix_tree_node_cachep
;
208 * Per-cpu pool of preloaded nodes
/*
 * Per-cpu pool of preallocated nodes, filled by radix_tree_preload() so a
 * subsequent non-blocking insertion can always get a node.
 * NOTE(review): extraction dropped the 'int nr;' member (the pool fill
 * count, used as rtp->nr in radix_tree_node_alloc()/radix_tree_preload())
 * and the closing brace.  Restore from the original file.
 */
210 struct radix_tree_preload
{
212 struct radix_tree_node
*nodes
[RADIX_TREE_MAX_PATH
];
/* One pool per CPU, zero-initialized (empty) at boot. */
214 DEFINE_PER_CPU(struct radix_tree_preload
, radix_tree_preloads
) = { 0, };
216 static inline gfp_t
root_gfp_mask(struct radix_tree_root
*root
)
218 return root
->gfp_mask
& __GFP_BITS_MASK
;
222 * This assumes that the caller has performed appropriate preallocation, and
223 * that the caller has pinned this thread of control to the current CPU.
225 static struct radix_tree_node
*
226 radix_tree_node_alloc(struct radix_tree_root
*root
, int height
)
228 struct radix_tree_node
*ret
= NULL
;
229 gfp_t gfp_mask
= root_gfp_mask(root
);
231 if (!(gfp_mask
& __GFP_WAIT
)) {
232 struct radix_tree_preload
*rtp
;
235 * Provided the caller has preloaded here, we will always
236 * succeed in getting a node here (and never reach
239 rtp
= &__get_cpu_var(radix_tree_preloads
);
241 ret
= rtp
->nodes
[rtp
->nr
- 1];
242 rtp
->nodes
[rtp
->nr
- 1] = NULL
;
247 ret
= kmem_cache_alloc(radix_tree_node_cachep
,
248 set_migrateflags(gfp_mask
, __GFP_RECLAIMABLE
));
250 BUG_ON(radix_tree_is_indirect_ptr(ret
));
251 #ifdef CONFIG_RADIX_TREE_CONCURRENT
252 spin_lock_init(&ret
->lock
);
253 lockdep_set_class_and_name(&ret
->lock
,
254 &radix_node_class
[height
],
255 radix_node_key_string
[height
]);
257 ret
->height
= height
;
261 static void radix_tree_node_rcu_free(struct rcu_head
*head
)
263 struct radix_tree_node
*node
=
264 container_of(head
, struct radix_tree_node
, rcu_head
);
265 kmem_cache_free(radix_tree_node_cachep
, node
);
/*
 * Defer freeing of @node until after an RCU grace period, so concurrent
 * lockless lookups holding a reference remain safe.
 * NOTE(review): extraction dropped the 'static void' return-type line and
 * the braces of this function.
 */
269 radix_tree_node_free(struct radix_tree_node
*node
)
271 call_rcu(&node
->rcu_head
, radix_tree_node_rcu_free
);
275 * Load up this CPU's radix_tree_node buffer with sufficient objects to
276 * ensure that the addition of a single element in the tree cannot fail. On
277 * success, return zero, with preemption disabled. On error, return -ENOMEM
278 * with preemption not disabled.
280 int radix_tree_preload(gfp_t gfp_mask
)
282 struct radix_tree_preload
*rtp
;
283 struct radix_tree_node
*node
;
287 rtp
= &__get_cpu_var(radix_tree_preloads
);
288 while (rtp
->nr
< ARRAY_SIZE(rtp
->nodes
)) {
290 node
= kmem_cache_alloc(radix_tree_node_cachep
,
291 set_migrateflags(gfp_mask
, __GFP_RECLAIMABLE
));
295 rtp
= &__get_cpu_var(radix_tree_preloads
);
296 if (rtp
->nr
< ARRAY_SIZE(rtp
->nodes
))
297 rtp
->nodes
[rtp
->nr
++] = node
;
299 kmem_cache_free(radix_tree_node_cachep
, node
);
305 EXPORT_SYMBOL(radix_tree_preload
);
307 static inline void tag_set(struct radix_tree_node
*node
, unsigned int tag
,
310 __set_bit(offset
, node
->tags
[tag
]);
313 static inline void tag_clear(struct radix_tree_node
*node
, unsigned int tag
,
316 __clear_bit(offset
, node
->tags
[tag
]);
319 static inline int tag_get(struct radix_tree_node
*node
, unsigned int tag
,
322 return test_bit(offset
, node
->tags
[tag
]);
325 static inline void root_tag_set(struct radix_tree_root
*root
, unsigned int tag
)
327 root
->gfp_mask
|= (__force gfp_t
)(1 << (tag
+ __GFP_BITS_SHIFT
));
331 static inline void root_tag_clear(struct radix_tree_root
*root
, unsigned int tag
)
333 root
->gfp_mask
&= (__force gfp_t
)~(1 << (tag
+ __GFP_BITS_SHIFT
));
336 static inline void root_tag_clear_all(struct radix_tree_root
*root
)
338 root
->gfp_mask
&= __GFP_BITS_MASK
;
341 static inline int root_tag_get(struct radix_tree_root
*root
, unsigned int tag
)
343 return (__force
unsigned)root
->gfp_mask
& (1 << (tag
+ __GFP_BITS_SHIFT
));
347 * Returns 1 if any slot in the node has this tag set.
348 * Otherwise returns 0.
350 static inline int any_tag_set(struct radix_tree_node
*node
, unsigned int tag
)
353 for (idx
= 0; idx
< RADIX_TREE_TAG_LONGS
; idx
++) {
354 if (node
->tags
[tag
][idx
])
360 static inline int any_tag_set_but(struct radix_tree_node
*node
,
361 unsigned int tag
, int offset
)
364 int offset_idx
= offset
/ BITS_PER_LONG
;
365 unsigned long offset_mask
= ~(1UL << (offset
% BITS_PER_LONG
));
366 for (idx
= 0; idx
< RADIX_TREE_TAG_LONGS
; idx
++) {
367 unsigned long mask
= ~0UL;
368 if (idx
== offset_idx
)
370 if (node
->tags
[tag
][idx
] & mask
)
377 * Return the maximum key which can be store into a
378 * radix tree with height HEIGHT.
380 static inline unsigned long radix_tree_maxindex(unsigned int height
)
382 return height_to_maxindex
[height
];
386 * Extend a radix tree so it can store key @index.
388 static int radix_tree_extend(struct radix_tree_root
*root
, unsigned long index
)
390 struct radix_tree_node
*node
;
394 /* Figure out what the height should be. */
395 height
= root
->height
+ 1;
396 while (index
> radix_tree_maxindex(height
))
399 if (root
->rnode
== NULL
) {
400 root
->height
= height
;
405 unsigned int newheight
= root
->height
+ 1;
406 if (!(node
= radix_tree_node_alloc(root
, newheight
)))
409 /* Increase the height. */
410 node
->slots
[0] = radix_tree_indirect_to_ptr(root
->rnode
);
412 /* Propagate the aggregated tag info into the new root */
413 for (tag
= 0; tag
< RADIX_TREE_MAX_TAGS
; tag
++) {
414 if (root_tag_get(root
, tag
))
415 tag_set(node
, tag
, 0);
419 node
= radix_tree_ptr_to_indirect(node
);
420 rcu_assign_pointer(root
->rnode
, node
);
421 root
->height
= newheight
;
422 } while (height
> root
->height
);
427 #ifdef CONFIG_RADIX_TREE_CONCURRENT
428 static inline struct radix_tree_context
*
429 radix_tree_get_context(struct radix_tree_root
**rootp
)
431 struct radix_tree_context
*context
= NULL
;
432 unsigned long addr
= (unsigned long)*rootp
;
435 context
= (struct radix_tree_context
*)(addr
- 1);
436 *rootp
= context
->root
;
442 #define RADIX_TREE_CONTEXT(context, root) \
443 struct radix_tree_context *context = \
444 radix_tree_get_context(&root)
446 static inline spinlock_t
*radix_node_lock(struct radix_tree_root
*root
,
447 struct radix_tree_node
*node
)
449 spinlock_t
*locked
= &node
->lock
;
454 static inline void radix_ladder_lock(struct radix_tree_context
*context
,
455 struct radix_tree_node
*node
)
458 struct radix_tree_root
*root
= context
->root
;
459 spinlock_t
*locked
= radix_node_lock(root
, node
);
461 spin_unlock(context
->locked
);
462 context
->locked
= locked
;
467 static inline void radix_path_init(struct radix_tree_context
*context
,
468 struct radix_tree_path
*pathp
)
470 pathp
->locked
= context
? context
->locked
: NULL
;
473 static inline void radix_path_lock(struct radix_tree_context
*context
,
474 struct radix_tree_path
*pathp
, struct radix_tree_node
*node
)
477 struct radix_tree_root
*root
= context
->root
;
478 spinlock_t
*locked
= radix_node_lock(root
, node
);
480 context
->locked
= locked
;
481 pathp
->locked
= locked
;
483 pathp
->locked
= NULL
;
486 static inline void radix_path_unlock(struct radix_tree_context
*context
,
487 struct radix_tree_path
*punlock
)
489 if (context
&& punlock
->locked
&&
490 context
->locked
!= punlock
->locked
)
491 spin_unlock(punlock
->locked
);
494 #define RADIX_TREE_CONTEXT(context, root) do { } while (0)
495 #define radix_ladder_lock(context, node) do { } while (0)
496 #define radix_path_init(context, pathp) do { } while (0)
497 #define radix_path_lock(context, pathp, node) do { } while (0)
498 #define radix_path_unlock(context, punlock) do { } while (0)
501 #ifdef CONFIG_RADIX_TREE_OPTIMISTIC
502 typedef int (*radix_valid_fn
)(struct radix_tree_node
*, int, int);
504 static struct radix_tree_node
*
505 radix_optimistic_lookup(struct radix_tree_context
*context
, unsigned long index
,
506 int tag
, radix_valid_fn valid
)
508 unsigned int height
, shift
;
509 struct radix_tree_node
*node
, *ret
= NULL
, **slot
;
510 struct radix_tree_root
*root
= context
->root
;
512 node
= rcu_dereference(root
->rnode
);
516 if (!radix_tree_is_indirect_ptr(node
))
519 node
= radix_tree_indirect_to_ptr(node
);
521 height
= node
->height
;
522 if (index
> radix_tree_maxindex(height
))
525 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
527 int offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
528 if ((*valid
)(node
, offset
, tag
))
530 slot
= (struct radix_tree_node
**)(node
->slots
+ offset
);
531 node
= rcu_dereference(*slot
);
535 shift
-= RADIX_TREE_MAP_SHIFT
;
537 } while (height
> 0);
542 static struct radix_tree_node
*
543 __radix_optimistic_lock(struct radix_tree_context
*context
, unsigned long index
,
544 int tag
, radix_valid_fn valid
)
546 struct radix_tree_node
*node
;
548 unsigned int shift
, offset
;
550 node
= radix_optimistic_lookup(context
, index
, tag
, valid
);
554 locked
= radix_node_lock(context
->root
, node
);
559 if (node
!= radix_optimistic_lookup(context
, index
, tag
, valid
))
562 /* check if the node got freed */
566 /* check if the node is still a valid termination point */
567 shift
= (node
->height
- 1) * RADIX_TREE_MAP_SHIFT
;
568 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
569 if (!(*valid
)(node
, offset
, tag
))
573 context
->locked
= locked
;
582 static struct radix_tree_node
*
583 radix_optimistic_lock(struct radix_tree_context
*context
, unsigned long index
,
584 int tag
, radix_valid_fn valid
)
586 struct radix_tree_node
*node
= NULL
;
589 node
= __radix_optimistic_lock(context
, index
, tag
, valid
);
591 BUG_ON(context
->locked
);
592 spin_lock(&context
->root
->lock
);
593 context
->locked
= &context
->root
->lock
;
594 optimistic_hit(RADIX_TREE_MAX_PATH
);
596 optimistic_hit(context
->root
->height
- node
->height
);
/*
 * radix_valid_fn that accepts every node/offset: used by operations
 * (insert, lookup) whose optimistic termination point needs no extra
 * condition.  Always returns 1.
 * (Restores the 'return 1;' body dropped by extraction.)
 */
static int radix_valid_always(struct radix_tree_node *node, int offset, int tag)
{
	return 1;
}
/*
 * radix_valid_fn for tag-set operations: a node is a valid optimistic
 * termination point only if @tag is already set at @offset (so the walk
 * below cannot change tags above this node).
 */
static int radix_valid_tag(struct radix_tree_node *node, int offset, int tag)
{
	return tag_get(node, tag, offset);
}
611 #define radix_optimistic_lock(context, index, tag, valid) NULL
615 * radix_tree_insert - insert into a radix tree
616 * @root: radix tree root
618 * @item: item to insert
620 * Insert an item into the radix tree at position @index.
622 int radix_tree_insert(struct radix_tree_root
*root
,
623 unsigned long index
, void *item
)
625 struct radix_tree_node
*node
= NULL
, *slot
;
626 unsigned int height
, shift
;
630 RADIX_TREE_CONTEXT(context
, root
);
632 BUG_ON(radix_tree_is_indirect_ptr(item
));
634 node
= radix_optimistic_lock(context
, index
, 0, radix_valid_always
);
636 height
= node
->height
;
637 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
641 /* Make sure the tree is high enough. */
642 if (index
> radix_tree_maxindex(root
->height
)) {
643 error
= radix_tree_extend(root
, index
);
648 slot
= radix_tree_indirect_to_ptr(root
->rnode
);
649 height
= root
->height
;
650 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
652 offset
= 0; /* uninitialised var warning */
655 /* Have to add a child node. */
656 if (!(slot
= radix_tree_node_alloc(root
, height
)))
659 rcu_assign_pointer(node
->slots
[offset
], slot
);
662 rcu_assign_pointer(root
->rnode
,
663 radix_tree_ptr_to_indirect(slot
));
666 /* Go a level down */
668 radix_ladder_lock(context
, node
);
671 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
672 slot
= node
->slots
[offset
];
673 shift
-= RADIX_TREE_MAP_SHIFT
;
682 rcu_assign_pointer(node
->slots
[offset
], item
);
683 for (tag
= 0; tag
< RADIX_TREE_MAX_TAGS
; tag
++)
684 BUG_ON(tag_get(node
, tag
, offset
));
686 rcu_assign_pointer(root
->rnode
, item
);
687 for (tag
= 0; tag
< RADIX_TREE_MAX_TAGS
; tag
++)
688 BUG_ON(root_tag_get(root
, tag
));
693 EXPORT_SYMBOL(radix_tree_insert
);
696 * radix_tree_lookup_slot - lookup a slot in a radix tree
697 * @root: radix tree root
700 * Returns: the slot corresponding to the position @index in the
701 * radix tree @root. This is useful for update-if-exists operations.
703 * This function can be called under rcu_read_lock iff the slot is not
704 * modified by radix_tree_replace_slot, otherwise it must be called
705 * exclusive from other writers. Any dereference of the slot must be done
706 * using radix_tree_deref_slot.
708 void **radix_tree_lookup_slot(struct radix_tree_root
*root
, unsigned long index
)
710 unsigned int height
, shift
;
711 struct radix_tree_node
*node
, **slot
;
713 node
= rcu_dereference(root
->rnode
);
717 if (!radix_tree_is_indirect_ptr(node
)) {
720 return (void **)&root
->rnode
;
722 node
= radix_tree_indirect_to_ptr(node
);
724 height
= node
->height
;
725 if (index
> radix_tree_maxindex(height
))
728 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
731 slot
= (struct radix_tree_node
**)
732 (node
->slots
+ ((index
>>shift
) & RADIX_TREE_MAP_MASK
));
733 node
= rcu_dereference(*slot
);
737 shift
-= RADIX_TREE_MAP_SHIFT
;
739 } while (height
> 0);
741 return (void **)slot
;
743 EXPORT_SYMBOL(radix_tree_lookup_slot
);
746 * radix_tree_lookup - perform lookup operation on a radix tree
747 * @root: radix tree root
750 * Lookup the item at the position @index in the radix tree @root.
752 * This function can be called under rcu_read_lock, however the caller
753 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
754 * them safely). No RCU barriers are required to access or modify the
755 * returned item, however.
757 void *radix_tree_lookup(struct radix_tree_root
*root
, unsigned long index
)
759 unsigned int height
, shift
;
760 struct radix_tree_node
*node
, **slot
;
761 RADIX_TREE_CONTEXT(context
, root
);
763 node
= radix_optimistic_lock(context
, index
, 0, radix_valid_always
);
767 node
= rcu_dereference(root
->rnode
);
771 if (!radix_tree_is_indirect_ptr(node
)) {
776 node
= radix_tree_indirect_to_ptr(node
);
779 height
= node
->height
;
780 if (index
> radix_tree_maxindex(height
))
783 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
786 slot
= (struct radix_tree_node
**)
787 (node
->slots
+ ((index
>>shift
) & RADIX_TREE_MAP_MASK
));
788 node
= rcu_dereference(*slot
);
792 radix_ladder_lock(context
, node
);
794 shift
-= RADIX_TREE_MAP_SHIFT
;
796 } while (height
> 0);
800 EXPORT_SYMBOL(radix_tree_lookup
);
803 * radix_tree_tag_set - set a tag on a radix tree node
804 * @root: radix tree root
808 * Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
809 * corresponding to @index in the radix tree. From
810 * the root all the way down to the leaf node.
812 * Returns the address of the tagged item. Setting a tag on a not-present
815 void *radix_tree_tag_set(struct radix_tree_root
*root
,
816 unsigned long index
, unsigned int tag
)
818 unsigned int height
, shift
;
819 struct radix_tree_node
*slot
;
820 RADIX_TREE_CONTEXT(context
, root
);
822 slot
= radix_optimistic_lock(context
, index
, tag
, radix_valid_tag
);
824 height
= slot
->height
;
825 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
829 height
= root
->height
;
830 BUG_ON(index
> radix_tree_maxindex(height
));
832 slot
= radix_tree_indirect_to_ptr(root
->rnode
);
833 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
835 /* set the root's tag bit */
836 if (slot
&& !root_tag_get(root
, tag
))
837 root_tag_set(root
, tag
);
842 radix_ladder_lock(context
, slot
);
845 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
846 if (!tag_get(slot
, tag
, offset
))
847 tag_set(slot
, tag
, offset
);
848 slot
= slot
->slots
[offset
];
849 BUG_ON(slot
== NULL
);
850 shift
-= RADIX_TREE_MAP_SHIFT
;
856 EXPORT_SYMBOL(radix_tree_tag_set
);
/*
 * radix_valid_fn for tag-clear operations: a node is a valid optimistic
 * termination point when either the tag is not set at @offset, or some
 * other slot in this node still carries the tag -- in both cases
 * the change can never propagate upwards from here.
 * (Restores the 'int this, other;' declaration dropped by extraction.)
 */
int radix_valid_tag_clear(struct radix_tree_node *node, int offset, int tag)
{
	int this, other;

	this = tag_get(node, tag, offset);
	other = any_tag_set_but(node, tag, offset);

	return !this || other;
}
873 * radix_tree_tag_clear - clear a tag on a radix tree node
874 * @root: radix tree root
878 * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
879 * corresponding to @index in the radix tree. If
880 * this causes the leaf node to have no tags set then clear the tag in the
881 * next-to-leaf node, etc.
883 * Returns the address of the tagged item on success, else NULL. ie:
884 * has the same return value and semantics as radix_tree_lookup().
886 void *radix_tree_tag_clear(struct radix_tree_root
*root
,
887 unsigned long index
, unsigned int tag
)
890 * The radix tree path needs to be one longer than the maximum path
891 * since the "list" is null terminated.
893 struct radix_tree_path path
[RADIX_TREE_MAX_PATH
+ 1], *pathp
= path
;
894 struct radix_tree_path
*punlock
= path
, *piter
;
895 struct radix_tree_node
*slot
= NULL
;
896 unsigned int height
, shift
, offset
;
898 RADIX_TREE_CONTEXT(context
, root
);
900 slot
= radix_optimistic_lock(context
, index
, tag
,
901 radix_valid_tag_clear
);
903 height
= slot
->height
;
904 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
905 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
906 pathp
->offset
= offset
;
908 radix_path_init(context
, pathp
);
913 radix_path_init(context
, pathp
);
915 height
= root
->height
;
916 if (index
> radix_tree_maxindex(height
))
919 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
920 slot
= radix_tree_indirect_to_ptr(root
->rnode
);
926 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
928 pathp
->offset
= offset
;
930 radix_path_lock(context
, pathp
, slot
);
932 if (radix_valid_tag_clear(slot
, offset
, tag
)) {
933 for (; punlock
< pathp
; punlock
++)
934 radix_path_unlock(context
, punlock
);
938 slot
= slot
->slots
[offset
];
939 shift
-= RADIX_TREE_MAP_SHIFT
;
946 for (piter
= pathp
; piter
>= punlock
; piter
--) {
948 if (!tag_get(piter
->node
, tag
, piter
->offset
))
950 tag_clear(piter
->node
, tag
, piter
->offset
);
951 if (any_tag_set(piter
->node
, tag
))
954 if (root_tag_get(root
, tag
))
955 root_tag_clear(root
, tag
);
960 for (; punlock
< pathp
; punlock
++)
961 radix_path_unlock(context
, punlock
);
964 EXPORT_SYMBOL(radix_tree_tag_clear
);
966 #ifndef __KERNEL__ /* Only the test harness uses this at present */
968 * radix_tree_tag_get - get a tag on a radix tree node
969 * @root: radix tree root
971 * @tag: tag index (< RADIX_TREE_MAX_TAGS)
975 * 0: tag not present or not set
978 int radix_tree_tag_get(struct radix_tree_root
*root
,
979 unsigned long index
, unsigned int tag
)
981 unsigned int height
, shift
;
982 struct radix_tree_node
*node
;
983 int saw_unset_tag
= 0;
985 /* check the root's tag bit */
986 if (!root_tag_get(root
, tag
))
989 node
= rcu_dereference(root
->rnode
);
993 if (!radix_tree_is_indirect_ptr(node
))
995 node
= radix_tree_indirect_to_ptr(node
);
997 height
= node
->height
;
998 if (index
> radix_tree_maxindex(height
))
1001 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
1009 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
1012 * This is just a debug check. Later, we can bale as soon as
1013 * we see an unset tag.
1015 if (!tag_get(node
, tag
, offset
))
1018 int ret
= tag_get(node
, tag
, offset
);
1020 BUG_ON(ret
&& saw_unset_tag
);
1023 node
= rcu_dereference(node
->slots
[offset
]);
1024 shift
-= RADIX_TREE_MAP_SHIFT
;
1028 EXPORT_SYMBOL(radix_tree_tag_get
);
1032 * radix_tree_next_hole - find the next hole (not-present entry)
1035 * @max_scan: maximum range to search
1037 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
1040 * Returns: the index of the hole if found, otherwise returns an index
1041 * outside of the set specified (in which case 'return - index >= max_scan'
1044 * radix_tree_next_hole may be called under rcu_read_lock. However, like
1045 * radix_tree_gang_lookup, this will not atomically search a snapshot of the
1046 * tree at a single point in time. For example, if a hole is created at index
1047 * 5, then subsequently a hole is created at index 10, radix_tree_next_hole
1048 * covering both indexes may return 10 if called under rcu_read_lock.
/*
 * Scan forward from @index for at most @max_scan positions, looking for
 * the first empty (not-present) slot; a failed radix_tree_lookup() means
 * @index is a hole.
 * NOTE(review): extraction dropped interior lines here -- the declaration
 * of 'i', the loop body advancing 'index' (with 32-bit wraparound break),
 * and the return statement.  Restore from the original file.
 */
1050 unsigned long radix_tree_next_hole(struct radix_tree_root
*root
,
1051 unsigned long index
, unsigned long max_scan
)
1055 for (i
= 0; i
< max_scan
; i
++) {
1056 if (!radix_tree_lookup(root
, index
))
1065 EXPORT_SYMBOL(radix_tree_next_hole
);
1068 __lookup(struct radix_tree_node
*slot
, void ***results
, unsigned long index
,
1069 unsigned int max_items
, unsigned long *next_index
)
1071 unsigned int nr_found
= 0;
1072 unsigned int shift
, height
;
1075 height
= slot
->height
;
1078 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
1080 for ( ; height
> 1; height
--) {
1081 i
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
1083 if (slot
->slots
[i
] != NULL
)
1085 index
&= ~((1UL << shift
) - 1);
1086 index
+= 1UL << shift
;
1088 goto out
; /* 32-bit wraparound */
1090 if (i
== RADIX_TREE_MAP_SIZE
)
1094 shift
-= RADIX_TREE_MAP_SHIFT
;
1095 slot
= rcu_dereference(slot
->slots
[i
]);
1100 /* Bottom level: grab some items */
1101 for (i
= index
& RADIX_TREE_MAP_MASK
; i
< RADIX_TREE_MAP_SIZE
; i
++) {
1103 if (slot
->slots
[i
]) {
1104 results
[nr_found
++] = &(slot
->slots
[i
]);
1105 if (nr_found
== max_items
)
1110 *next_index
= index
;
1115 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
1116 * @root: radix tree root
1117 * @results: where the results of the lookup are placed
1118 * @first_index: start the lookup from this key
1119 * @max_items: place up to this many items at *results
1121 * Performs an index-ascending scan of the tree for present items. Places
1122 * them at *@results and returns the number of items which were placed at
1125 * The implementation is naive.
1127 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
1128 * rcu_read_lock. In this case, rather than the returned results being
1129 * an atomic snapshot of the tree at a single point in time, the semantics
1130 * of an RCU protected gang lookup are as though multiple radix_tree_lookups
1131 * have been issued in individual locks, and results stored in 'results'.
1134 radix_tree_gang_lookup(struct radix_tree_root
*root
, void **results
,
1135 unsigned long first_index
, unsigned int max_items
)
1137 unsigned long max_index
;
1138 struct radix_tree_node
*node
;
1139 unsigned long cur_index
= first_index
;
1142 node
= rcu_dereference(root
->rnode
);
1146 if (!radix_tree_is_indirect_ptr(node
)) {
1147 if (first_index
> 0)
1152 node
= radix_tree_indirect_to_ptr(node
);
1154 max_index
= radix_tree_maxindex(node
->height
);
1157 while (ret
< max_items
) {
1158 unsigned int nr_found
, slots_found
, i
;
1159 unsigned long next_index
; /* Index of next search */
1161 if (cur_index
> max_index
)
1163 slots_found
= __lookup(node
, (void ***)results
+ ret
, cur_index
,
1164 max_items
- ret
, &next_index
);
1166 for (i
= 0; i
< slots_found
; i
++) {
1167 struct radix_tree_node
*slot
;
1168 slot
= *(((void ***)results
)[ret
+ i
]);
1171 results
[ret
+ nr_found
] = rcu_dereference(slot
);
1175 if (next_index
== 0)
1177 cur_index
= next_index
;
1182 EXPORT_SYMBOL(radix_tree_gang_lookup
);
1185 * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
1186 * @root: radix tree root
1187 * @results: where the results of the lookup are placed
1188 * @first_index: start the lookup from this key
1189 * @max_items: place up to this many items at *results
1191 * Performs an index-ascending scan of the tree for present items. Places
1192 * their slots at *@results and returns the number of items which were
1193 * placed at *@results.
1195 * The implementation is naive.
1197 * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
1198 * be dereferenced with radix_tree_deref_slot, and if using only RCU
1199 * protection, radix_tree_deref_slot may fail requiring a retry.
1202 radix_tree_gang_lookup_slot(struct radix_tree_root
*root
, void ***results
,
1203 unsigned long first_index
, unsigned int max_items
)
1205 unsigned long max_index
;
1206 struct radix_tree_node
*node
;
1207 unsigned long cur_index
= first_index
;
1210 node
= rcu_dereference(root
->rnode
);
1214 if (!radix_tree_is_indirect_ptr(node
)) {
1215 if (first_index
> 0)
1217 results
[0] = (void **)&root
->rnode
;
1220 node
= radix_tree_indirect_to_ptr(node
);
1222 max_index
= radix_tree_maxindex(node
->height
);
1225 while (ret
< max_items
) {
1226 unsigned int slots_found
;
1227 unsigned long next_index
; /* Index of next search */
1229 if (cur_index
> max_index
)
1231 slots_found
= __lookup(node
, results
+ ret
, cur_index
,
1232 max_items
- ret
, &next_index
);
1234 if (next_index
== 0)
1236 cur_index
= next_index
;
1241 EXPORT_SYMBOL(radix_tree_gang_lookup_slot
);
1244 * FIXME: the two tag_get()s here should use find_next_bit() instead of
1245 * open-coding the search.
1248 __lookup_tag(struct radix_tree_node
*slot
, void ***results
, unsigned long index
,
1249 unsigned int max_items
, unsigned long *next_index
, unsigned int tag
)
1251 unsigned int nr_found
= 0;
1252 unsigned int shift
, height
;
1254 height
= slot
->height
;
1257 shift
= (height
-1) * RADIX_TREE_MAP_SHIFT
;
1259 while (height
> 0) {
1260 unsigned long i
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
1263 if (tag_get(slot
, tag
, i
))
1265 index
&= ~((1UL << shift
) - 1);
1266 index
+= 1UL << shift
;
1268 goto out
; /* 32-bit wraparound */
1270 if (i
== RADIX_TREE_MAP_SIZE
)
1274 if (height
== 0) { /* Bottom level: grab some items */
1275 unsigned long j
= index
& RADIX_TREE_MAP_MASK
;
1277 for ( ; j
< RADIX_TREE_MAP_SIZE
; j
++) {
1278 struct radix_tree_node
*node
;
1280 if (!tag_get(slot
, tag
, j
))
1282 node
= slot
->slots
[j
];
1284 * Even though the tag was found set, we need to
1285 * recheck that we have a non-NULL node, because
1286 * if this lookup is lockless, it may have been
1287 * subsequently deleted.
1289 * Similar care must be taken in any place that
1290 * lookup ->slots[x] without a lock (ie. can't
1291 * rely on its value remaining the same).
1293 if (slot
->slots
[j
]) {
1294 results
[nr_found
++] = &slot
->slots
[j
];
1295 if (nr_found
== max_items
)
1300 shift
-= RADIX_TREE_MAP_SHIFT
;
1301 slot
= rcu_dereference(slot
->slots
[i
]);
1306 *next_index
= index
;
1311 * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
1313 * @root: radix tree root
1314 * @results: where the results of the lookup are placed
1315 * @first_index: start the lookup from this key
1316 * @max_items: place up to this many items at *results
1317 * @tag: the tag index (< RADIX_TREE_MAX_TAGS)
1319 * Performs an index-ascending scan of the tree for present items which
1320 * have the tag indexed by @tag set. Places the items at *@results and
1321 * returns the number of items which were placed at *@results.
1324 radix_tree_gang_lookup_tag(struct radix_tree_root
*root
, void **results
,
1325 unsigned long first_index
, unsigned int max_items
,
1328 struct radix_tree_node
*node
;
1329 unsigned long max_index
;
1330 unsigned long cur_index
= first_index
;
1333 /* check the root's tag bit */
1334 if (!root_tag_get(root
, tag
))
1337 node
= rcu_dereference(root
->rnode
);
1341 if (!radix_tree_is_indirect_ptr(node
)) {
1342 if (first_index
> 0)
1347 node
= radix_tree_indirect_to_ptr(node
);
1349 max_index
= radix_tree_maxindex(node
->height
);
1352 while (ret
< max_items
) {
1353 unsigned int slots_found
, nr_found
, i
;
1354 unsigned long next_index
; /* Index of next search */
1356 if (cur_index
> max_index
)
1358 slots_found
= __lookup_tag(node
, (void ***)results
+ ret
,
1359 cur_index
, max_items
- ret
, &next_index
, tag
);
1361 for (i
= 0; i
< slots_found
; i
++) {
1362 struct radix_tree_node
*slot
;
1363 slot
= *((void ***)results
)[ret
+ i
];
1366 results
[ret
+ nr_found
] = rcu_dereference(slot
);
1370 if (next_index
== 0)
1372 cur_index
= next_index
;
1377 EXPORT_SYMBOL(radix_tree_gang_lookup_tag
);
1380 * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
1381 * radix tree based on a tag
1382 * @root: radix tree root
1383 * @results: where the results of the lookup are placed
1384 * @first_index: start the lookup from this key
1385 * @max_items: place up to this many items at *results
1386 * @tag: the tag index (< RADIX_TREE_MAX_TAGS)
1388 * Performs an index-ascending scan of the tree for present items which
1389 * have the tag indexed by @tag set. Places the slots at *@results and
1390 * returns the number of slots which were placed at *@results.
1393 radix_tree_gang_lookup_tag_slot(struct radix_tree_root
*root
, void ***results
,
1394 unsigned long first_index
, unsigned int max_items
,
1397 struct radix_tree_node
*node
;
1398 unsigned long max_index
;
1399 unsigned long cur_index
= first_index
;
1402 /* check the root's tag bit */
1403 if (!root_tag_get(root
, tag
))
1406 node
= rcu_dereference(root
->rnode
);
1410 if (!radix_tree_is_indirect_ptr(node
)) {
1411 if (first_index
> 0)
1413 results
[0] = (void **)&root
->rnode
;
1416 node
= radix_tree_indirect_to_ptr(node
);
1418 max_index
= radix_tree_maxindex(node
->height
);
1421 while (ret
< max_items
) {
1422 unsigned int slots_found
;
1423 unsigned long next_index
; /* Index of next search */
1425 if (cur_index
> max_index
)
1427 slots_found
= __lookup_tag(node
, results
+ ret
,
1428 cur_index
, max_items
- ret
, &next_index
, tag
);
1430 if (next_index
== 0)
1432 cur_index
= next_index
;
1437 EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot
);
1441 * radix_tree_shrink - shrink height of a radix tree to minimal
1442 * @root radix tree root
1444 static inline void radix_tree_shrink(struct radix_tree_root
*root
)
1446 /* try to shrink tree height */
1447 while (root
->height
> 0) {
1448 struct radix_tree_node
*to_free
= root
->rnode
;
1452 BUG_ON(!radix_tree_is_indirect_ptr(to_free
));
1453 to_free
= radix_tree_indirect_to_ptr(to_free
);
1456 * The candidate node has more than one child, or its child
1457 * is not at the leftmost slot, we cannot shrink.
1459 if (to_free
->count
!= 1)
1461 if (!to_free
->slots
[0])
1465 * We don't need rcu_assign_pointer(), since we are simply
1466 * moving the node from one part of the tree to another. If
1467 * it was safe to dereference the old pointer to it
1468 * (to_free->slots[0]), it will be safe to dereference the new
1469 * one (root->rnode).
1471 newptr
= to_free
->slots
[0];
1472 if (root
->height
> 1)
1473 newptr
= radix_tree_ptr_to_indirect(newptr
);
1474 root
->rnode
= newptr
;
1476 /* must only free zeroed nodes into the slab */
1477 for (tag
= 0; tag
< RADIX_TREE_MAX_TAGS
; tag
++)
1478 tag_clear(to_free
, tag
, 0);
1479 to_free
->slots
[0] = NULL
;
1485 int radix_valid_delete(struct radix_tree_node
*node
, int offset
, int tag
)
1488 * we need to check for > 2, because nodes with a single child
1489 * can still be deleted, see radix_tree_shrink().
1491 int unlock
= (node
->count
> 2);
1496 for (tag
= 0; tag
< RADIX_TREE_MAX_TAGS
; tag
++) {
1497 if (!radix_valid_tag_clear(node
, offset
, tag
)) {
1507 * radix_tree_delete - delete an item from a radix tree
1508 * @root: radix tree root
1511 * Remove the item at @index from the radix tree rooted at @root.
1513 * Returns the address of the deleted item, or NULL if it was not present.
1515 void *radix_tree_delete(struct radix_tree_root
*root
, unsigned long index
)
1518 * The radix tree path needs to be one longer than the maximum path
1519 * since the "list" is null terminated.
1521 struct radix_tree_path path
[RADIX_TREE_MAX_PATH
+ 1], *pathp
= path
;
1522 struct radix_tree_path
*punlock
= path
, *piter
;
1523 struct radix_tree_node
*slot
= NULL
;
1524 unsigned int height
, shift
;
1527 RADIX_TREE_CONTEXT(context
, root
);
1529 slot
= radix_optimistic_lock(context
, index
, 0, radix_valid_delete
);
1531 height
= slot
->height
;
1532 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
1533 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
1534 pathp
->offset
= offset
;
1536 radix_path_init(context
, pathp
);
1541 radix_path_init(context
, pathp
);
1543 height
= root
->height
;
1544 if (index
> radix_tree_maxindex(height
))
1549 root_tag_clear_all(root
);
1553 slot
= radix_tree_indirect_to_ptr(slot
);
1555 shift
= (height
- 1) * RADIX_TREE_MAP_SHIFT
;
1562 offset
= (index
>> shift
) & RADIX_TREE_MAP_MASK
;
1563 pathp
->offset
= offset
;
1565 radix_path_lock(context
, pathp
, slot
);
1567 if (radix_valid_delete(slot
, offset
, 0)) {
1568 for (; punlock
< pathp
; punlock
++)
1569 radix_path_unlock(context
, punlock
);
1573 slot
= slot
->slots
[offset
];
1574 shift
-= RADIX_TREE_MAP_SHIFT
;
1576 } while (height
> 0);
1582 * Clear all tags associated with the just-deleted item
1584 for (tag
= 0; tag
< RADIX_TREE_MAX_TAGS
; tag
++) {
1585 for (piter
= pathp
; piter
>= punlock
; piter
--) {
1587 if (!tag_get(piter
->node
, tag
, piter
->offset
))
1589 tag_clear(piter
->node
, tag
, piter
->offset
);
1590 if (any_tag_set(piter
->node
, tag
))
1593 if (root_tag_get(root
, tag
))
1594 root_tag_clear(root
, tag
);
1599 /* Now unhook the nodes we do not need anymore */
1600 for (piter
= pathp
; piter
>= punlock
&& piter
->node
; piter
--) {
1601 piter
->node
->slots
[piter
->offset
] = NULL
;
1602 piter
->node
->count
--;
1604 if (piter
->node
->count
) {
1606 radix_tree_indirect_to_ptr(root
->rnode
))
1607 radix_tree_shrink(root
);
1612 BUG_ON(piter
->node
);
1614 root_tag_clear_all(root
);
1619 for (; punlock
<= pathp
; punlock
++) {
1620 radix_path_unlock(context
, punlock
);
1621 if (punlock
->node
&& punlock
->node
->count
== 0)
1622 radix_tree_node_free(punlock
->node
);
1626 EXPORT_SYMBOL(radix_tree_delete
);
/**
 *	radix_tree_tagged - test whether any items in the tree are tagged
 *	@root:		radix tree root
 *	@tag:		tag to test
 */
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
{
	/* The root tag summarizes the tag state of the whole tree. */
	return root_tag_get(root, tag);
}
EXPORT_SYMBOL(radix_tree_tagged);
1640 radix_tree_node_ctor(struct kmem_cache
*cachep
, void *node
)
1642 memset(node
, 0, sizeof(struct radix_tree_node
));
1645 static __init
unsigned long __maxindex(unsigned int height
)
1647 unsigned int width
= height
* RADIX_TREE_MAP_SHIFT
;
1648 int shift
= RADIX_TREE_INDEX_BITS
- width
;
1652 if (shift
>= BITS_PER_LONG
)
1654 return ~0UL >> shift
;
1657 static __init
void radix_tree_init_maxindex(void)
1661 for (i
= 0; i
< ARRAY_SIZE(height_to_maxindex
); i
++)
1662 height_to_maxindex
[i
] = __maxindex(i
);
1665 static int radix_tree_callback(struct notifier_block
*nfb
,
1666 unsigned long action
,
1669 int cpu
= (long)hcpu
;
1670 struct radix_tree_preload
*rtp
;
1672 /* Free per-cpu pool of perloaded nodes */
1673 if (action
== CPU_DEAD
|| action
== CPU_DEAD_FROZEN
) {
1674 rtp
= &per_cpu(radix_tree_preloads
, cpu
);
1676 kmem_cache_free(radix_tree_node_cachep
,
1677 rtp
->nodes
[rtp
->nr
-1]);
1678 rtp
->nodes
[rtp
->nr
-1] = NULL
;
1685 void __init
radix_tree_init(void)
1687 radix_tree_node_cachep
= kmem_cache_create("radix_tree_node",
1688 sizeof(struct radix_tree_node
), 0,
1689 SLAB_PANIC
, radix_tree_node_ctor
);
1690 radix_tree_init_maxindex();
1691 hotcpu_notifier(radix_tree_callback
, 0);