2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally descibed in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
26 * Code from fib_hash has been reused which includes the following header:
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
33 * IPv4 FIB: lookup engine and maintenance routines.
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
43 * Substantial contributions to this work comes from:
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
51 #define VERSION "0.408"
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
83 #define MAX_STAT_DEPTH 32
85 #define KEYLENGTH (8*sizeof(t_key))
87 typedef unsigned int t_key
;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
103 unsigned long parent
;
105 struct hlist_head list
;
110 struct hlist_node hlist
;
113 struct list_head falh
;
117 unsigned long parent
;
119 unsigned char pos
; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits
; /* 2log(KEYLENGTH) bits needed */
121 unsigned int full_children
; /* KEYLENGTH bits needed */
122 unsigned int empty_children
; /* KEYLENGTH bits needed */
125 struct work_struct work
;
126 struct tnode
*tnode_free
;
128 struct node
*child
[0];
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats
{
134 unsigned int backtrack
;
135 unsigned int semantic_match_passed
;
136 unsigned int semantic_match_miss
;
137 unsigned int null_node_hit
;
138 unsigned int resize_node_skipped
;
143 unsigned int totdepth
;
144 unsigned int maxdepth
;
147 unsigned int nullpointers
;
148 unsigned int prefixes
;
149 unsigned int nodesizes
[MAX_STAT_DEPTH
];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats
;
159 static void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
);
160 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
,
162 static struct node
*resize(struct trie
*t
, struct tnode
*tn
);
163 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
);
164 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode
*tnode_free_head
;
168 static struct kmem_cache
*fn_alias_kmem __read_mostly
;
169 static struct kmem_cache
*trie_leaf_kmem __read_mostly
;
171 static inline struct tnode
*node_parent(struct node
*node
)
173 return (struct tnode
*)(node
->parent
& ~NODE_TYPE_MASK
);
176 static inline struct tnode
*node_parent_rcu(struct node
*node
)
178 struct tnode
*ret
= node_parent(node
);
180 return rcu_dereference(ret
);
183 /* Same as rcu_assign_pointer
184 * but that macro() assumes that value is a pointer.
186 static inline void node_set_parent(struct node
*node
, struct tnode
*ptr
)
189 node
->parent
= (unsigned long)ptr
| NODE_TYPE(node
);
192 static inline struct node
*tnode_get_child(struct tnode
*tn
, unsigned int i
)
194 BUG_ON(i
>= 1U << tn
->bits
);
199 static inline struct node
*tnode_get_child_rcu(struct tnode
*tn
, unsigned int i
)
201 struct node
*ret
= tnode_get_child(tn
, i
);
203 return rcu_dereference(ret
);
206 static inline int tnode_child_length(const struct tnode
*tn
)
208 return 1 << tn
->bits
;
211 static inline t_key
mask_pfx(t_key k
, unsigned short l
)
213 return (l
== 0) ? 0 : k
>> (KEYLENGTH
-l
) << (KEYLENGTH
-l
);
216 static inline t_key
tkey_extract_bits(t_key a
, int offset
, int bits
)
218 if (offset
< KEYLENGTH
)
219 return ((t_key
)(a
<< offset
)) >> (KEYLENGTH
- bits
);
224 static inline int tkey_equals(t_key a
, t_key b
)
229 static inline int tkey_sub_equals(t_key a
, int offset
, int bits
, t_key b
)
231 if (bits
== 0 || offset
>= KEYLENGTH
)
233 bits
= bits
> KEYLENGTH
? KEYLENGTH
: bits
;
234 return ((a
^ b
) << offset
) >> (KEYLENGTH
- bits
) == 0;
237 static inline int tkey_mismatch(t_key a
, int offset
, t_key b
)
244 while ((diff
<< i
) >> (KEYLENGTH
-1) == 0)
250 To understand this stuff, an understanding of keys and all their bits is
251 necessary. Every node in the trie has a key associated with it, but not
252 all of the bits in that key are significant.
254 Consider a node 'n' and its parent 'tp'.
256 If n is a leaf, every bit in its key is significant. Its presence is
257 necessitated by path compression, since during a tree traversal (when
258 searching for a leaf - unless we are doing an insertion) we will completely
259 ignore all skipped bits we encounter. Thus we need to verify, at the end of
260 a potentially successful search, that we have indeed been walking the
263 Note that we can never "miss" the correct key in the tree if present by
264 following the wrong path. Path compression ensures that segments of the key
265 that are the same for all keys with a given prefix are skipped, but the
266 skipped part *is* identical for each node in the subtrie below the skipped
267 bit! trie_insert() in this implementation takes care of that - note the
268 call to tkey_sub_equals() in trie_insert().
270 if n is an internal node - a 'tnode' here, the various parts of its key
271 have many different meanings.
274 _________________________________________________________________
275 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276 -----------------------------------------------------------------
277 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
279 _________________________________________________________________
280 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281 -----------------------------------------------------------------
282 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
289 First, let's just ignore the bits that come before the parent tp, that is
290 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291 not use them for anything.
293 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294 index into the parent's child array. That is, they will be used to find
295 'n' among tp's children.
297 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
300 All the bits we have seen so far are significant to the node n. The rest
301 of the bits are really not needed or indeed known in n->key.
303 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304 n's child array, and will of course be different for each child.
307 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
312 static inline void check_tnode(const struct tnode
*tn
)
314 WARN_ON(tn
&& tn
->pos
+tn
->bits
> 32);
317 static const int halve_threshold
= 25;
318 static const int inflate_threshold
= 50;
319 static const int halve_threshold_root
= 8;
320 static const int inflate_threshold_root
= 15;
323 static void __alias_free_mem(struct rcu_head
*head
)
325 struct fib_alias
*fa
= container_of(head
, struct fib_alias
, rcu
);
326 kmem_cache_free(fn_alias_kmem
, fa
);
329 static inline void alias_free_mem_rcu(struct fib_alias
*fa
)
331 call_rcu(&fa
->rcu
, __alias_free_mem
);
334 static void __leaf_free_rcu(struct rcu_head
*head
)
336 struct leaf
*l
= container_of(head
, struct leaf
, rcu
);
337 kmem_cache_free(trie_leaf_kmem
, l
);
340 static inline void free_leaf(struct leaf
*l
)
342 call_rcu_bh(&l
->rcu
, __leaf_free_rcu
);
345 static void __leaf_info_free_rcu(struct rcu_head
*head
)
347 kfree(container_of(head
, struct leaf_info
, rcu
));
350 static inline void free_leaf_info(struct leaf_info
*leaf
)
352 call_rcu(&leaf
->rcu
, __leaf_info_free_rcu
);
355 static struct tnode
*tnode_alloc(size_t size
)
357 if (size
<= PAGE_SIZE
)
358 return kzalloc(size
, GFP_KERNEL
);
360 return __vmalloc(size
, GFP_KERNEL
| __GFP_ZERO
, PAGE_KERNEL
);
363 static void __tnode_vfree(struct work_struct
*arg
)
365 struct tnode
*tn
= container_of(arg
, struct tnode
, work
);
369 static void __tnode_free_rcu(struct rcu_head
*head
)
371 struct tnode
*tn
= container_of(head
, struct tnode
, rcu
);
372 size_t size
= sizeof(struct tnode
) +
373 (sizeof(struct node
*) << tn
->bits
);
375 if (size
<= PAGE_SIZE
)
378 INIT_WORK(&tn
->work
, __tnode_vfree
);
379 schedule_work(&tn
->work
);
383 static inline void tnode_free(struct tnode
*tn
)
386 free_leaf((struct leaf
*) tn
);
388 call_rcu(&tn
->rcu
, __tnode_free_rcu
);
391 static void tnode_free_safe(struct tnode
*tn
)
395 if (node_parent((struct node
*) tn
)) {
396 tn
->tnode_free
= tnode_free_head
;
397 tnode_free_head
= tn
;
403 static void tnode_free_flush(void)
407 while ((tn
= tnode_free_head
)) {
408 tnode_free_head
= tn
->tnode_free
;
409 tn
->tnode_free
= NULL
;
414 static struct leaf
*leaf_new(void)
416 struct leaf
*l
= kmem_cache_alloc(trie_leaf_kmem
, GFP_KERNEL
);
419 INIT_HLIST_HEAD(&l
->list
);
424 static struct leaf_info
*leaf_info_new(int plen
)
426 struct leaf_info
*li
= kmalloc(sizeof(struct leaf_info
), GFP_KERNEL
);
429 INIT_LIST_HEAD(&li
->falh
);
434 static struct tnode
*tnode_new(t_key key
, int pos
, int bits
)
436 size_t sz
= sizeof(struct tnode
) + (sizeof(struct node
*) << bits
);
437 struct tnode
*tn
= tnode_alloc(sz
);
440 tn
->parent
= T_TNODE
;
444 tn
->full_children
= 0;
445 tn
->empty_children
= 1<<bits
;
448 pr_debug("AT %p s=%u %lu\n", tn
, (unsigned int) sizeof(struct tnode
),
449 (unsigned long) (sizeof(struct node
) << bits
));
454 * Check whether a tnode 'n' is "full", i.e. it is an internal node
455 * and no bits are skipped. See discussion in dyntree paper p. 6
458 static inline int tnode_full(const struct tnode
*tn
, const struct node
*n
)
460 if (n
== NULL
|| IS_LEAF(n
))
463 return ((struct tnode
*) n
)->pos
== tn
->pos
+ tn
->bits
;
466 static inline void put_child(struct trie
*t
, struct tnode
*tn
, int i
,
469 tnode_put_child_reorg(tn
, i
, n
, -1);
473 * Add a child at position i overwriting the old value.
474 * Update the value of full_children and empty_children.
477 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
,
480 struct node
*chi
= tn
->child
[i
];
483 BUG_ON(i
>= 1<<tn
->bits
);
485 /* update emptyChildren */
486 if (n
== NULL
&& chi
!= NULL
)
487 tn
->empty_children
++;
488 else if (n
!= NULL
&& chi
== NULL
)
489 tn
->empty_children
--;
491 /* update fullChildren */
493 wasfull
= tnode_full(tn
, chi
);
495 isfull
= tnode_full(tn
, n
);
496 if (wasfull
&& !isfull
)
498 else if (!wasfull
&& isfull
)
502 node_set_parent(n
, tn
);
504 rcu_assign_pointer(tn
->child
[i
], n
);
507 static struct node
*resize(struct trie
*t
, struct tnode
*tn
)
511 struct tnode
*old_tn
;
512 int inflate_threshold_use
;
513 int halve_threshold_use
;
519 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
520 tn
, inflate_threshold
, halve_threshold
);
523 if (tn
->empty_children
== tnode_child_length(tn
)) {
528 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
529 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
536 /* compress one level */
537 node_set_parent(n
, NULL
);
542 * Double as long as the resulting node has a number of
543 * nonempty nodes that are above the threshold.
547 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
548 * the Helsinki University of Technology and Matti Tikkanen of Nokia
549 * Telecommunications, page 6:
550 * "A node is doubled if the ratio of non-empty children to all
551 * children in the *doubled* node is at least 'high'."
553 * 'high' in this instance is the variable 'inflate_threshold'. It
554 * is expressed as a percentage, so we multiply it with
555 * tnode_child_length() and instead of multiplying by 2 (since the
556 * child array will be doubled by inflate()) and multiplying
557 * the left-hand side by 100 (to handle the percentage thing) we
558 * multiply the left-hand side by 50.
560 * The left-hand side may look a bit weird: tnode_child_length(tn)
561 * - tn->empty_children is of course the number of non-null children
562 * in the current node. tn->full_children is the number of "full"
563 * children, that is non-null tnodes with a skip value of 0.
564 * All of those will be doubled in the resulting inflated tnode, so
565 * we just count them one extra time here.
567 * A clearer way to write this would be:
569 * to_be_doubled = tn->full_children;
570 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
573 * new_child_length = tnode_child_length(tn) * 2;
575 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
577 * if (new_fill_factor >= inflate_threshold)
579 * ...and so on, tho it would mess up the while () loop.
582 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
586 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
587 * inflate_threshold * new_child_length
589 * expand not_to_be_doubled and to_be_doubled, and shorten:
590 * 100 * (tnode_child_length(tn) - tn->empty_children +
591 * tn->full_children) >= inflate_threshold * new_child_length
593 * expand new_child_length:
594 * 100 * (tnode_child_length(tn) - tn->empty_children +
595 * tn->full_children) >=
596 * inflate_threshold * tnode_child_length(tn) * 2
599 * 50 * (tn->full_children + tnode_child_length(tn) -
600 * tn->empty_children) >= inflate_threshold *
601 * tnode_child_length(tn)
607 /* Keep root node larger */
610 inflate_threshold_use
= inflate_threshold_root
;
612 inflate_threshold_use
= inflate_threshold
;
616 while ((tn
->full_children
> 0 && max_resize
-- &&
617 50 * (tn
->full_children
+ tnode_child_length(tn
)
618 - tn
->empty_children
)
619 >= inflate_threshold_use
* tnode_child_length(tn
))) {
626 #ifdef CONFIG_IP_FIB_TRIE_STATS
627 t
->stats
.resize_node_skipped
++;
633 if (max_resize
< 0) {
635 pr_warning("Fix inflate_threshold_root."
636 " Now=%d size=%d bits\n",
637 inflate_threshold_root
, tn
->bits
);
639 pr_warning("Fix inflate_threshold."
640 " Now=%d size=%d bits\n",
641 inflate_threshold
, tn
->bits
);
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
652 /* Keep root node larger */
655 halve_threshold_use
= halve_threshold_root
;
657 halve_threshold_use
= halve_threshold
;
661 while (tn
->bits
> 1 && max_resize
-- &&
662 100 * (tnode_child_length(tn
) - tn
->empty_children
) <
663 halve_threshold_use
* tnode_child_length(tn
)) {
669 #ifdef CONFIG_IP_FIB_TRIE_STATS
670 t
->stats
.resize_node_skipped
++;
676 if (max_resize
< 0) {
678 pr_warning("Fix halve_threshold_root."
679 " Now=%d size=%d bits\n",
680 halve_threshold_root
, tn
->bits
);
682 pr_warning("Fix halve_threshold."
683 " Now=%d size=%d bits\n",
684 halve_threshold
, tn
->bits
);
687 /* Only one child remains */
688 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
689 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
696 /* compress one level */
698 node_set_parent(n
, NULL
);
703 return (struct node
*) tn
;
706 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
)
708 struct tnode
*oldtnode
= tn
;
709 int olen
= tnode_child_length(tn
);
712 pr_debug("In inflate\n");
714 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
+ 1);
717 return ERR_PTR(-ENOMEM
);
720 * Preallocate and store tnodes before the actual work so we
721 * don't get into an inconsistent state if memory allocation
722 * fails. In case of failure we return the oldnode and inflate
723 * of tnode is ignored.
726 for (i
= 0; i
< olen
; i
++) {
729 inode
= (struct tnode
*) tnode_get_child(oldtnode
, i
);
732 inode
->pos
== oldtnode
->pos
+ oldtnode
->bits
&&
734 struct tnode
*left
, *right
;
735 t_key m
= ~0U << (KEYLENGTH
- 1) >> inode
->pos
;
737 left
= tnode_new(inode
->key
&(~m
), inode
->pos
+ 1,
742 right
= tnode_new(inode
->key
|m
, inode
->pos
+ 1,
750 put_child(t
, tn
, 2*i
, (struct node
*) left
);
751 put_child(t
, tn
, 2*i
+1, (struct node
*) right
);
755 for (i
= 0; i
< olen
; i
++) {
757 struct node
*node
= tnode_get_child(oldtnode
, i
);
758 struct tnode
*left
, *right
;
765 /* A leaf or an internal node with skipped bits */
767 if (IS_LEAF(node
) || ((struct tnode
*) node
)->pos
>
768 tn
->pos
+ tn
->bits
- 1) {
769 if (tkey_extract_bits(node
->key
,
770 oldtnode
->pos
+ oldtnode
->bits
,
772 put_child(t
, tn
, 2*i
, node
);
774 put_child(t
, tn
, 2*i
+1, node
);
778 /* An internal node with two children */
779 inode
= (struct tnode
*) node
;
781 if (inode
->bits
== 1) {
782 put_child(t
, tn
, 2*i
, inode
->child
[0]);
783 put_child(t
, tn
, 2*i
+1, inode
->child
[1]);
785 tnode_free_safe(inode
);
789 /* An internal node with more than two children */
791 /* We will replace this node 'inode' with two new
792 * ones, 'left' and 'right', each with half of the
793 * original children. The two new nodes will have
794 * a position one bit further down the key and this
795 * means that the "significant" part of their keys
796 * (see the discussion near the top of this file)
797 * will differ by one bit, which will be "0" in
798 * left's key and "1" in right's key. Since we are
799 * moving the key position by one step, the bit that
800 * we are moving away from - the bit at position
801 * (inode->pos) - is the one that will differ between
802 * left and right. So... we synthesize that bit in the
804 * The mask 'm' below will be a single "one" bit at
805 * the position (inode->pos)
808 /* Use the old key, but set the new significant
812 left
= (struct tnode
*) tnode_get_child(tn
, 2*i
);
813 put_child(t
, tn
, 2*i
, NULL
);
817 right
= (struct tnode
*) tnode_get_child(tn
, 2*i
+1);
818 put_child(t
, tn
, 2*i
+1, NULL
);
822 size
= tnode_child_length(left
);
823 for (j
= 0; j
< size
; j
++) {
824 put_child(t
, left
, j
, inode
->child
[j
]);
825 put_child(t
, right
, j
, inode
->child
[j
+ size
]);
827 put_child(t
, tn
, 2*i
, resize(t
, left
));
828 put_child(t
, tn
, 2*i
+1, resize(t
, right
));
830 tnode_free_safe(inode
);
832 tnode_free_safe(oldtnode
);
836 int size
= tnode_child_length(tn
);
839 for (j
= 0; j
< size
; j
++)
841 tnode_free((struct tnode
*)tn
->child
[j
]);
845 return ERR_PTR(-ENOMEM
);
849 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
)
851 struct tnode
*oldtnode
= tn
;
852 struct node
*left
, *right
;
854 int olen
= tnode_child_length(tn
);
856 pr_debug("In halve\n");
858 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
- 1);
861 return ERR_PTR(-ENOMEM
);
864 * Preallocate and store tnodes before the actual work so we
865 * don't get into an inconsistent state if memory allocation
866 * fails. In case of failure we return the oldnode and halve
867 * of tnode is ignored.
870 for (i
= 0; i
< olen
; i
+= 2) {
871 left
= tnode_get_child(oldtnode
, i
);
872 right
= tnode_get_child(oldtnode
, i
+1);
874 /* Two nonempty children */
878 newn
= tnode_new(left
->key
, tn
->pos
+ tn
->bits
, 1);
883 put_child(t
, tn
, i
/2, (struct node
*)newn
);
888 for (i
= 0; i
< olen
; i
+= 2) {
889 struct tnode
*newBinNode
;
891 left
= tnode_get_child(oldtnode
, i
);
892 right
= tnode_get_child(oldtnode
, i
+1);
894 /* At least one of the children is empty */
896 if (right
== NULL
) /* Both are empty */
898 put_child(t
, tn
, i
/2, right
);
903 put_child(t
, tn
, i
/2, left
);
907 /* Two nonempty children */
908 newBinNode
= (struct tnode
*) tnode_get_child(tn
, i
/2);
909 put_child(t
, tn
, i
/2, NULL
);
910 put_child(t
, newBinNode
, 0, left
);
911 put_child(t
, newBinNode
, 1, right
);
912 put_child(t
, tn
, i
/2, resize(t
, newBinNode
));
914 tnode_free_safe(oldtnode
);
918 int size
= tnode_child_length(tn
);
921 for (j
= 0; j
< size
; j
++)
923 tnode_free((struct tnode
*)tn
->child
[j
]);
927 return ERR_PTR(-ENOMEM
);
931 /* readside must use rcu_read_lock currently dump routines
932 via get_fa_head and dump */
934 static struct leaf_info
*find_leaf_info(struct leaf
*l
, int plen
)
936 struct hlist_head
*head
= &l
->list
;
937 struct hlist_node
*node
;
938 struct leaf_info
*li
;
940 hlist_for_each_entry_rcu(li
, node
, head
, hlist
)
941 if (li
->plen
== plen
)
947 static inline struct list_head
*get_fa_head(struct leaf
*l
, int plen
)
949 struct leaf_info
*li
= find_leaf_info(l
, plen
);
957 static void insert_leaf_info(struct hlist_head
*head
, struct leaf_info
*new)
959 struct leaf_info
*li
= NULL
, *last
= NULL
;
960 struct hlist_node
*node
;
962 if (hlist_empty(head
)) {
963 hlist_add_head_rcu(&new->hlist
, head
);
965 hlist_for_each_entry(li
, node
, head
, hlist
) {
966 if (new->plen
> li
->plen
)
972 hlist_add_after_rcu(&last
->hlist
, &new->hlist
);
974 hlist_add_before_rcu(&new->hlist
, &li
->hlist
);
978 /* rcu_read_lock needs to be hold by caller from readside */
981 fib_find_node(struct trie
*t
, u32 key
)
988 n
= rcu_dereference(t
->trie
);
990 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
991 tn
= (struct tnode
*) n
;
995 if (tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
996 pos
= tn
->pos
+ tn
->bits
;
997 n
= tnode_get_child_rcu(tn
,
998 tkey_extract_bits(key
,
1004 /* Case we have found a leaf. Compare prefixes */
1006 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
))
1007 return (struct leaf
*)n
;
1012 static struct node
*trie_rebalance(struct trie
*t
, struct tnode
*tn
)
1020 while (tn
!= NULL
&& (tp
= node_parent((struct node
*)tn
)) != NULL
) {
1021 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1022 wasfull
= tnode_full(tp
, tnode_get_child(tp
, cindex
));
1023 tn
= (struct tnode
*) resize(t
, (struct tnode
*)tn
);
1025 tnode_put_child_reorg((struct tnode
*)tp
, cindex
,
1026 (struct node
*)tn
, wasfull
);
1028 tp
= node_parent((struct node
*) tn
);
1035 /* Handle last (top) tnode */
1037 tn
= (struct tnode
*)resize(t
, (struct tnode
*)tn
);
1041 return (struct node
*)tn
;
1044 /* only used from updater-side */
1046 static struct list_head
*fib_insert_node(struct trie
*t
, u32 key
, int plen
)
1049 struct tnode
*tp
= NULL
, *tn
= NULL
;
1053 struct list_head
*fa_head
= NULL
;
1054 struct leaf_info
*li
;
1060 /* If we point to NULL, stop. Either the tree is empty and we should
1061 * just put a new leaf in if, or we have reached an empty child slot,
1062 * and we should just put our new leaf in that.
1063 * If we point to a T_TNODE, check if it matches our key. Note that
1064 * a T_TNODE might be skipping any number of bits - its 'pos' need
1065 * not be the parent's 'pos'+'bits'!
1067 * If it does match the current key, get pos/bits from it, extract
1068 * the index from our key, push the T_TNODE and walk the tree.
1070 * If it doesn't, we have to replace it with a new T_TNODE.
1072 * If we point to a T_LEAF, it might or might not have the same key
1073 * as we do. If it does, just change the value, update the T_LEAF's
1074 * value, and return it.
1075 * If it doesn't, we need to replace it with a T_TNODE.
1078 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
1079 tn
= (struct tnode
*) n
;
1083 if (tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
1085 pos
= tn
->pos
+ tn
->bits
;
1086 n
= tnode_get_child(tn
,
1087 tkey_extract_bits(key
,
1091 BUG_ON(n
&& node_parent(n
) != tn
);
1097 * n ----> NULL, LEAF or TNODE
1099 * tp is n's (parent) ----> NULL or TNODE
1102 BUG_ON(tp
&& IS_LEAF(tp
));
1104 /* Case 1: n is a leaf. Compare prefixes */
1106 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
1107 l
= (struct leaf
*) n
;
1108 li
= leaf_info_new(plen
);
1113 fa_head
= &li
->falh
;
1114 insert_leaf_info(&l
->list
, li
);
1123 li
= leaf_info_new(plen
);
1130 fa_head
= &li
->falh
;
1131 insert_leaf_info(&l
->list
, li
);
1133 if (t
->trie
&& n
== NULL
) {
1134 /* Case 2: n is NULL, and will just insert a new leaf */
1136 node_set_parent((struct node
*)l
, tp
);
1138 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1139 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)l
);
1141 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1143 * Add a new tnode here
1144 * first tnode need some special handling
1148 pos
= tp
->pos
+tp
->bits
;
1153 newpos
= tkey_mismatch(key
, pos
, n
->key
);
1154 tn
= tnode_new(n
->key
, newpos
, 1);
1157 tn
= tnode_new(key
, newpos
, 1); /* First tnode */
1166 node_set_parent((struct node
*)tn
, tp
);
1168 missbit
= tkey_extract_bits(key
, newpos
, 1);
1169 put_child(t
, tn
, missbit
, (struct node
*)l
);
1170 put_child(t
, tn
, 1-missbit
, n
);
1173 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1174 put_child(t
, (struct tnode
*)tp
, cindex
,
1177 rcu_assign_pointer(t
->trie
, (struct node
*)tn
);
1182 if (tp
&& tp
->pos
+ tp
->bits
> 32)
1183 pr_warning("fib_trie"
1184 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1185 tp
, tp
->pos
, tp
->bits
, key
, plen
);
1187 /* Rebalance the trie */
1189 rcu_assign_pointer(t
->trie
, trie_rebalance(t
, tp
));
1195 * Caller must hold RTNL.
1197 static int fn_trie_insert(struct fib_table
*tb
, struct fib_config
*cfg
)
1199 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1200 struct fib_alias
*fa
, *new_fa
;
1201 struct list_head
*fa_head
= NULL
;
1202 struct fib_info
*fi
;
1203 int plen
= cfg
->fc_dst_len
;
1204 u8 tos
= cfg
->fc_tos
;
1212 key
= ntohl(cfg
->fc_dst
);
1214 pr_debug("Insert table=%u %08x/%d\n", tb
->tb_id
, key
, plen
);
1216 mask
= ntohl(inet_make_mask(plen
));
1223 fi
= fib_create_info(cfg
);
1229 l
= fib_find_node(t
, key
);
1233 fa_head
= get_fa_head(l
, plen
);
1234 fa
= fib_find_alias(fa_head
, tos
, fi
->fib_priority
);
1237 /* Now fa, if non-NULL, points to the first fib alias
1238 * with the same keys [prefix,tos,priority], if such key already
1239 * exists or to the node before which we will insert new one.
1241 * If fa is NULL, we will need to allocate a new one and
1242 * insert to the head of f.
1244 * If f is NULL, no fib node matched the destination key
1245 * and we need to allocate a new one of those as well.
1248 if (fa
&& fa
->fa_tos
== tos
&&
1249 fa
->fa_info
->fib_priority
== fi
->fib_priority
) {
1250 struct fib_alias
*fa_first
, *fa_match
;
1253 if (cfg
->fc_nlflags
& NLM_F_EXCL
)
1257 * 1. Find exact match for type, scope, fib_info to avoid
1259 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1263 fa
= list_entry(fa
->fa_list
.prev
, struct fib_alias
, fa_list
);
1264 list_for_each_entry_continue(fa
, fa_head
, fa_list
) {
1265 if (fa
->fa_tos
!= tos
)
1267 if (fa
->fa_info
->fib_priority
!= fi
->fib_priority
)
1269 if (fa
->fa_type
== cfg
->fc_type
&&
1270 fa
->fa_scope
== cfg
->fc_scope
&&
1271 fa
->fa_info
== fi
) {
1277 if (cfg
->fc_nlflags
& NLM_F_REPLACE
) {
1278 struct fib_info
*fi_drop
;
1288 new_fa
= kmem_cache_alloc(fn_alias_kmem
, GFP_KERNEL
);
1292 fi_drop
= fa
->fa_info
;
1293 new_fa
->fa_tos
= fa
->fa_tos
;
1294 new_fa
->fa_info
= fi
;
1295 new_fa
->fa_type
= cfg
->fc_type
;
1296 new_fa
->fa_scope
= cfg
->fc_scope
;
1297 state
= fa
->fa_state
;
1298 new_fa
->fa_state
= state
& ~FA_S_ACCESSED
;
1300 list_replace_rcu(&fa
->fa_list
, &new_fa
->fa_list
);
1301 alias_free_mem_rcu(fa
);
1303 fib_release_info(fi_drop
);
1304 if (state
& FA_S_ACCESSED
)
1305 rt_cache_flush(cfg
->fc_nlinfo
.nl_net
, -1);
1306 rtmsg_fib(RTM_NEWROUTE
, htonl(key
), new_fa
, plen
,
1307 tb
->tb_id
, &cfg
->fc_nlinfo
, NLM_F_REPLACE
);
1311 /* Error if we find a perfect match which
1312 * uses the same scope, type, and nexthop
1318 if (!(cfg
->fc_nlflags
& NLM_F_APPEND
))
1322 if (!(cfg
->fc_nlflags
& NLM_F_CREATE
))
1326 new_fa
= kmem_cache_alloc(fn_alias_kmem
, GFP_KERNEL
);
1330 new_fa
->fa_info
= fi
;
1331 new_fa
->fa_tos
= tos
;
1332 new_fa
->fa_type
= cfg
->fc_type
;
1333 new_fa
->fa_scope
= cfg
->fc_scope
;
1334 new_fa
->fa_state
= 0;
1336 * Insert new entry to the list.
1340 fa_head
= fib_insert_node(t
, key
, plen
);
1341 if (unlikely(!fa_head
)) {
1343 goto out_free_new_fa
;
1347 list_add_tail_rcu(&new_fa
->fa_list
,
1348 (fa
? &fa
->fa_list
: fa_head
));
1350 rt_cache_flush(cfg
->fc_nlinfo
.nl_net
, -1);
1351 rtmsg_fib(RTM_NEWROUTE
, htonl(key
), new_fa
, plen
, tb
->tb_id
,
1352 &cfg
->fc_nlinfo
, 0);
1357 kmem_cache_free(fn_alias_kmem
, new_fa
);
1359 fib_release_info(fi
);
1364 /* should be called with rcu_read_lock */
1365 static int check_leaf(struct trie
*t
, struct leaf
*l
,
1366 t_key key
, const struct flowi
*flp
,
1367 struct fib_result
*res
)
1369 struct leaf_info
*li
;
1370 struct hlist_head
*hhead
= &l
->list
;
1371 struct hlist_node
*node
;
1373 hlist_for_each_entry_rcu(li
, node
, hhead
, hlist
) {
1375 int plen
= li
->plen
;
1376 __be32 mask
= inet_make_mask(plen
);
1378 if (l
->key
!= (key
& ntohl(mask
)))
1381 err
= fib_semantic_match(&li
->falh
, flp
, res
, plen
);
1383 #ifdef CONFIG_IP_FIB_TRIE_STATS
1385 t
->stats
.semantic_match_passed
++;
1387 t
->stats
.semantic_match_miss
++;
1396 static int fn_trie_lookup(struct fib_table
*tb
, const struct flowi
*flp
,
1397 struct fib_result
*res
)
1399 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1404 t_key key
= ntohl(flp
->fl4_dst
);
1407 int current_prefix_length
= KEYLENGTH
;
1409 t_key node_prefix
, key_prefix
, pref_mismatch
;
1414 n
= rcu_dereference(t
->trie
);
1418 #ifdef CONFIG_IP_FIB_TRIE_STATS
1424 ret
= check_leaf(t
, (struct leaf
*)n
, key
, flp
, res
);
1428 pn
= (struct tnode
*) n
;
1436 cindex
= tkey_extract_bits(mask_pfx(key
, current_prefix_length
),
1439 n
= tnode_get_child(pn
, cindex
);
1442 #ifdef CONFIG_IP_FIB_TRIE_STATS
1443 t
->stats
.null_node_hit
++;
1449 ret
= check_leaf(t
, (struct leaf
*)n
, key
, flp
, res
);
1455 cn
= (struct tnode
*)n
;
1458 * It's a tnode, and we can do some extra checks here if we
1459 * like, to avoid descending into a dead-end branch.
1460 * This tnode is in the parent's child array at index
1461 * key[p_pos..p_pos+p_bits] but potentially with some bits
1462 * chopped off, so in reality the index may be just a
1463 * subprefix, padded with zero at the end.
1464 * We can also take a look at any skipped bits in this
1465 * tnode - everything up to p_pos is supposed to be ok,
1466 * and the non-chopped bits of the index (se previous
1467 * paragraph) are also guaranteed ok, but the rest is
1468 * considered unknown.
1470 * The skipped bits are key[pos+bits..cn->pos].
1473 /* If current_prefix_length < pos+bits, we are already doing
1474 * actual prefix matching, which means everything from
1475 * pos+(bits-chopped_off) onward must be zero along some
1476 * branch of this subtree - otherwise there is *no* valid
1477 * prefix present. Here we can only check the skipped
1478 * bits. Remember, since we have already indexed into the
1479 * parent's child array, we know that the bits we chopped of
1483 /* NOTA BENE: Checking only skipped bits
1484 for the new node here */
1486 if (current_prefix_length
< pos
+bits
) {
1487 if (tkey_extract_bits(cn
->key
, current_prefix_length
,
1488 cn
->pos
- current_prefix_length
)
1494 * If chopped_off=0, the index is fully validated and we
1495 * only need to look at the skipped bits for this, the new,
1496 * tnode. What we actually want to do is to find out if
1497 * these skipped bits match our key perfectly, or if we will
1498 * have to count on finding a matching prefix further down,
1499 * because if we do, we would like to have some way of
1500 * verifying the existence of such a prefix at this point.
1503 /* The only thing we can do at this point is to verify that
1504 * any such matching prefix can indeed be a prefix to our
1505 * key, and if the bits in the node we are inspecting that
1506 * do not match our key are not ZERO, this cannot be true.
1507 * Thus, find out where there is a mismatch (before cn->pos)
1508 * and verify that all the mismatching bits are zero in the
1513 * Note: We aren't very concerned about the piece of
1514 * the key that precede pn->pos+pn->bits, since these
1515 * have already been checked. The bits after cn->pos
1516 * aren't checked since these are by definition
1517 * "unknown" at this point. Thus, what we want to see
1518 * is if we are about to enter the "prefix matching"
1519 * state, and in that case verify that the skipped
1520 * bits that will prevail throughout this subtree are
1521 * zero, as they have to be if we are to find a
1525 node_prefix
= mask_pfx(cn
->key
, cn
->pos
);
1526 key_prefix
= mask_pfx(key
, cn
->pos
);
1527 pref_mismatch
= key_prefix
^node_prefix
;
1531 * In short: If skipped bits in this node do not match
1532 * the search key, enter the "prefix matching"
1535 if (pref_mismatch
) {
1536 while (!(pref_mismatch
& (1<<(KEYLENGTH
-1)))) {
1538 pref_mismatch
= pref_mismatch
<< 1;
1540 key_prefix
= tkey_extract_bits(cn
->key
, mp
, cn
->pos
-mp
);
1542 if (key_prefix
!= 0)
1545 if (current_prefix_length
>= cn
->pos
)
1546 current_prefix_length
= mp
;
1549 pn
= (struct tnode
*)n
; /* Descend */
1556 /* As zero don't change the child key (cindex) */
1557 while ((chopped_off
<= pn
->bits
)
1558 && !(cindex
& (1<<(chopped_off
-1))))
1561 /* Decrease current_... with bits chopped off */
1562 if (current_prefix_length
> pn
->pos
+ pn
->bits
- chopped_off
)
1563 current_prefix_length
= pn
->pos
+ pn
->bits
1567 * Either we do the actual chop off according or if we have
1568 * chopped off all bits in this tnode walk up to our parent.
1571 if (chopped_off
<= pn
->bits
) {
1572 cindex
&= ~(1 << (chopped_off
-1));
1574 struct tnode
*parent
= node_parent((struct node
*) pn
);
1578 /* Get Child's index */
1579 cindex
= tkey_extract_bits(pn
->key
, parent
->pos
, parent
->bits
);
1583 #ifdef CONFIG_IP_FIB_TRIE_STATS
1584 t
->stats
.backtrack
++;
1597 * Remove the leaf and return parent.
1599 static void trie_leaf_remove(struct trie
*t
, struct leaf
*l
)
1601 struct tnode
*tp
= node_parent((struct node
*) l
);
1603 pr_debug("entering trie_leaf_remove(%p)\n", l
);
1606 t_key cindex
= tkey_extract_bits(l
->key
, tp
->pos
, tp
->bits
);
1607 put_child(t
, (struct tnode
*)tp
, cindex
, NULL
);
1608 rcu_assign_pointer(t
->trie
, trie_rebalance(t
, tp
));
1610 rcu_assign_pointer(t
->trie
, NULL
);
1616 * Caller must hold RTNL.
1618 static int fn_trie_delete(struct fib_table
*tb
, struct fib_config
*cfg
)
1620 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1622 int plen
= cfg
->fc_dst_len
;
1623 u8 tos
= cfg
->fc_tos
;
1624 struct fib_alias
*fa
, *fa_to_delete
;
1625 struct list_head
*fa_head
;
1627 struct leaf_info
*li
;
1632 key
= ntohl(cfg
->fc_dst
);
1633 mask
= ntohl(inet_make_mask(plen
));
1639 l
= fib_find_node(t
, key
);
1644 fa_head
= get_fa_head(l
, plen
);
1645 fa
= fib_find_alias(fa_head
, tos
, 0);
1650 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key
, plen
, tos
, t
);
1652 fa_to_delete
= NULL
;
1653 fa
= list_entry(fa
->fa_list
.prev
, struct fib_alias
, fa_list
);
1654 list_for_each_entry_continue(fa
, fa_head
, fa_list
) {
1655 struct fib_info
*fi
= fa
->fa_info
;
1657 if (fa
->fa_tos
!= tos
)
1660 if ((!cfg
->fc_type
|| fa
->fa_type
== cfg
->fc_type
) &&
1661 (cfg
->fc_scope
== RT_SCOPE_NOWHERE
||
1662 fa
->fa_scope
== cfg
->fc_scope
) &&
1663 (!cfg
->fc_protocol
||
1664 fi
->fib_protocol
== cfg
->fc_protocol
) &&
1665 fib_nh_match(cfg
, fi
) == 0) {
1675 rtmsg_fib(RTM_DELROUTE
, htonl(key
), fa
, plen
, tb
->tb_id
,
1676 &cfg
->fc_nlinfo
, 0);
1678 l
= fib_find_node(t
, key
);
1679 li
= find_leaf_info(l
, plen
);
1681 list_del_rcu(&fa
->fa_list
);
1683 if (list_empty(fa_head
)) {
1684 hlist_del_rcu(&li
->hlist
);
1688 if (hlist_empty(&l
->list
))
1689 trie_leaf_remove(t
, l
);
1691 if (fa
->fa_state
& FA_S_ACCESSED
)
1692 rt_cache_flush(cfg
->fc_nlinfo
.nl_net
, -1);
1694 fib_release_info(fa
->fa_info
);
1695 alias_free_mem_rcu(fa
);
1699 static int trie_flush_list(struct list_head
*head
)
1701 struct fib_alias
*fa
, *fa_node
;
1704 list_for_each_entry_safe(fa
, fa_node
, head
, fa_list
) {
1705 struct fib_info
*fi
= fa
->fa_info
;
1707 if (fi
&& (fi
->fib_flags
& RTNH_F_DEAD
)) {
1708 list_del_rcu(&fa
->fa_list
);
1709 fib_release_info(fa
->fa_info
);
1710 alias_free_mem_rcu(fa
);
1717 static int trie_flush_leaf(struct leaf
*l
)
1720 struct hlist_head
*lih
= &l
->list
;
1721 struct hlist_node
*node
, *tmp
;
1722 struct leaf_info
*li
= NULL
;
1724 hlist_for_each_entry_safe(li
, node
, tmp
, lih
, hlist
) {
1725 found
+= trie_flush_list(&li
->falh
);
1727 if (list_empty(&li
->falh
)) {
1728 hlist_del_rcu(&li
->hlist
);
1736 * Scan for the next right leaf starting at node p->child[idx]
1737 * Since we have back pointer, no recursion necessary.
1739 static struct leaf
*leaf_walk_rcu(struct tnode
*p
, struct node
*c
)
1745 idx
= tkey_extract_bits(c
->key
, p
->pos
, p
->bits
) + 1;
1749 while (idx
< 1u << p
->bits
) {
1750 c
= tnode_get_child_rcu(p
, idx
++);
1755 prefetch(p
->child
[idx
]);
1756 return (struct leaf
*) c
;
1759 /* Rescan start scanning in new node */
1760 p
= (struct tnode
*) c
;
1764 /* Node empty, walk back up to parent */
1765 c
= (struct node
*) p
;
1766 } while ( (p
= node_parent_rcu(c
)) != NULL
);
1768 return NULL
; /* Root of trie */
1771 static struct leaf
*trie_firstleaf(struct trie
*t
)
1773 struct tnode
*n
= (struct tnode
*) rcu_dereference(t
->trie
);
1778 if (IS_LEAF(n
)) /* trie is just a leaf */
1779 return (struct leaf
*) n
;
1781 return leaf_walk_rcu(n
, NULL
);
1784 static struct leaf
*trie_nextleaf(struct leaf
*l
)
1786 struct node
*c
= (struct node
*) l
;
1787 struct tnode
*p
= node_parent(c
);
1790 return NULL
; /* trie with just one leaf */
1792 return leaf_walk_rcu(p
, c
);
1795 static struct leaf
*trie_leafindex(struct trie
*t
, int index
)
1797 struct leaf
*l
= trie_firstleaf(t
);
1799 while (l
&& index
-- > 0)
1800 l
= trie_nextleaf(l
);
1807 * Caller must hold RTNL.
1809 static int fn_trie_flush(struct fib_table
*tb
)
1811 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1812 struct leaf
*l
, *ll
= NULL
;
1815 for (l
= trie_firstleaf(t
); l
; l
= trie_nextleaf(l
)) {
1816 found
+= trie_flush_leaf(l
);
1818 if (ll
&& hlist_empty(&ll
->list
))
1819 trie_leaf_remove(t
, ll
);
1823 if (ll
&& hlist_empty(&ll
->list
))
1824 trie_leaf_remove(t
, ll
);
1826 pr_debug("trie_flush found=%d\n", found
);
1830 static void fn_trie_select_default(struct fib_table
*tb
,
1831 const struct flowi
*flp
,
1832 struct fib_result
*res
)
1834 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1835 int order
, last_idx
;
1836 struct fib_info
*fi
= NULL
;
1837 struct fib_info
*last_resort
;
1838 struct fib_alias
*fa
= NULL
;
1839 struct list_head
*fa_head
;
1848 l
= fib_find_node(t
, 0);
1852 fa_head
= get_fa_head(l
, 0);
1856 if (list_empty(fa_head
))
1859 list_for_each_entry_rcu(fa
, fa_head
, fa_list
) {
1860 struct fib_info
*next_fi
= fa
->fa_info
;
1862 if (fa
->fa_scope
!= res
->scope
||
1863 fa
->fa_type
!= RTN_UNICAST
)
1866 if (next_fi
->fib_priority
> res
->fi
->fib_priority
)
1868 if (!next_fi
->fib_nh
[0].nh_gw
||
1869 next_fi
->fib_nh
[0].nh_scope
!= RT_SCOPE_LINK
)
1871 fa
->fa_state
|= FA_S_ACCESSED
;
1874 if (next_fi
!= res
->fi
)
1876 } else if (!fib_detect_death(fi
, order
, &last_resort
,
1877 &last_idx
, tb
->tb_default
)) {
1878 fib_result_assign(res
, fi
);
1879 tb
->tb_default
= order
;
1885 if (order
<= 0 || fi
== NULL
) {
1886 tb
->tb_default
= -1;
1890 if (!fib_detect_death(fi
, order
, &last_resort
, &last_idx
,
1892 fib_result_assign(res
, fi
);
1893 tb
->tb_default
= order
;
1897 fib_result_assign(res
, last_resort
);
1898 tb
->tb_default
= last_idx
;
1903 static int fn_trie_dump_fa(t_key key
, int plen
, struct list_head
*fah
,
1904 struct fib_table
*tb
,
1905 struct sk_buff
*skb
, struct netlink_callback
*cb
)
1908 struct fib_alias
*fa
;
1909 __be32 xkey
= htonl(key
);
1914 /* rcu_read_lock is hold by caller */
1916 list_for_each_entry_rcu(fa
, fah
, fa_list
) {
1922 if (fib_dump_info(skb
, NETLINK_CB(cb
->skb
).pid
,
1931 fa
->fa_info
, NLM_F_MULTI
) < 0) {
1941 static int fn_trie_dump_leaf(struct leaf
*l
, struct fib_table
*tb
,
1942 struct sk_buff
*skb
, struct netlink_callback
*cb
)
1944 struct leaf_info
*li
;
1945 struct hlist_node
*node
;
1951 /* rcu_read_lock is hold by caller */
1952 hlist_for_each_entry_rcu(li
, node
, &l
->list
, hlist
) {
1961 if (list_empty(&li
->falh
))
1964 if (fn_trie_dump_fa(l
->key
, li
->plen
, &li
->falh
, tb
, skb
, cb
) < 0) {
1975 static int fn_trie_dump(struct fib_table
*tb
, struct sk_buff
*skb
,
1976 struct netlink_callback
*cb
)
1979 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1980 t_key key
= cb
->args
[2];
1981 int count
= cb
->args
[3];
1984 /* Dump starting at last key.
1985 * Note: 0.0.0.0/0 (ie default) is first key.
1988 l
= trie_firstleaf(t
);
1990 /* Normally, continue from last key, but if that is missing
1991 * fallback to using slow rescan
1993 l
= fib_find_node(t
, key
);
1995 l
= trie_leafindex(t
, count
);
1999 cb
->args
[2] = l
->key
;
2000 if (fn_trie_dump_leaf(l
, tb
, skb
, cb
) < 0) {
2001 cb
->args
[3] = count
;
2007 l
= trie_nextleaf(l
);
2008 memset(&cb
->args
[4], 0,
2009 sizeof(cb
->args
) - 4*sizeof(cb
->args
[0]));
2011 cb
->args
[3] = count
;
2017 void __init
fib_hash_init(void)
2019 fn_alias_kmem
= kmem_cache_create("ip_fib_alias",
2020 sizeof(struct fib_alias
),
2021 0, SLAB_PANIC
, NULL
);
2023 trie_leaf_kmem
= kmem_cache_create("ip_fib_trie",
2024 max(sizeof(struct leaf
),
2025 sizeof(struct leaf_info
)),
2026 0, SLAB_PANIC
, NULL
);
2030 /* Fix more generic FIB names for init later */
2031 struct fib_table
*fib_hash_table(u32 id
)
2033 struct fib_table
*tb
;
2036 tb
= kmalloc(sizeof(struct fib_table
) + sizeof(struct trie
),
2042 tb
->tb_default
= -1;
2043 tb
->tb_lookup
= fn_trie_lookup
;
2044 tb
->tb_insert
= fn_trie_insert
;
2045 tb
->tb_delete
= fn_trie_delete
;
2046 tb
->tb_flush
= fn_trie_flush
;
2047 tb
->tb_select_default
= fn_trie_select_default
;
2048 tb
->tb_dump
= fn_trie_dump
;
2050 t
= (struct trie
*) tb
->tb_data
;
2051 memset(t
, 0, sizeof(*t
));
2053 if (id
== RT_TABLE_LOCAL
)
2054 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION
);
2059 #ifdef CONFIG_PROC_FS
2060 /* Depth first Trie walk iterator */
2061 struct fib_trie_iter
{
2062 struct seq_net_private p
;
2063 struct fib_table
*tb
;
2064 struct tnode
*tnode
;
2069 static struct node
*fib_trie_get_next(struct fib_trie_iter
*iter
)
2071 struct tnode
*tn
= iter
->tnode
;
2072 unsigned cindex
= iter
->index
;
2075 /* A single entry routing table */
2079 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2080 iter
->tnode
, iter
->index
, iter
->depth
);
2082 while (cindex
< (1<<tn
->bits
)) {
2083 struct node
*n
= tnode_get_child_rcu(tn
, cindex
);
2088 iter
->index
= cindex
+ 1;
2090 /* push down one level */
2091 iter
->tnode
= (struct tnode
*) n
;
2101 /* Current node exhausted, pop back up */
2102 p
= node_parent_rcu((struct node
*)tn
);
2104 cindex
= tkey_extract_bits(tn
->key
, p
->pos
, p
->bits
)+1;
2114 static struct node
*fib_trie_get_first(struct fib_trie_iter
*iter
,
2122 n
= rcu_dereference(t
->trie
);
2127 iter
->tnode
= (struct tnode
*) n
;
2139 static void trie_collect_stats(struct trie
*t
, struct trie_stat
*s
)
2142 struct fib_trie_iter iter
;
2144 memset(s
, 0, sizeof(*s
));
2147 for (n
= fib_trie_get_first(&iter
, t
); n
; n
= fib_trie_get_next(&iter
)) {
2149 struct leaf
*l
= (struct leaf
*)n
;
2150 struct leaf_info
*li
;
2151 struct hlist_node
*tmp
;
2154 s
->totdepth
+= iter
.depth
;
2155 if (iter
.depth
> s
->maxdepth
)
2156 s
->maxdepth
= iter
.depth
;
2158 hlist_for_each_entry_rcu(li
, tmp
, &l
->list
, hlist
)
2161 const struct tnode
*tn
= (const struct tnode
*) n
;
2165 if (tn
->bits
< MAX_STAT_DEPTH
)
2166 s
->nodesizes
[tn
->bits
]++;
2168 for (i
= 0; i
< (1<<tn
->bits
); i
++)
2177 * This outputs /proc/net/fib_triestats
2179 static void trie_show_stats(struct seq_file
*seq
, struct trie_stat
*stat
)
2181 unsigned i
, max
, pointers
, bytes
, avdepth
;
2184 avdepth
= stat
->totdepth
*100 / stat
->leaves
;
2188 seq_printf(seq
, "\tAver depth: %u.%02d\n",
2189 avdepth
/ 100, avdepth
% 100);
2190 seq_printf(seq
, "\tMax depth: %u\n", stat
->maxdepth
);
2192 seq_printf(seq
, "\tLeaves: %u\n", stat
->leaves
);
2193 bytes
= sizeof(struct leaf
) * stat
->leaves
;
2195 seq_printf(seq
, "\tPrefixes: %u\n", stat
->prefixes
);
2196 bytes
+= sizeof(struct leaf_info
) * stat
->prefixes
;
2198 seq_printf(seq
, "\tInternal nodes: %u\n\t", stat
->tnodes
);
2199 bytes
+= sizeof(struct tnode
) * stat
->tnodes
;
2201 max
= MAX_STAT_DEPTH
;
2202 while (max
> 0 && stat
->nodesizes
[max
-1] == 0)
2206 for (i
= 1; i
<= max
; i
++)
2207 if (stat
->nodesizes
[i
] != 0) {
2208 seq_printf(seq
, " %u: %u", i
, stat
->nodesizes
[i
]);
2209 pointers
+= (1<<i
) * stat
->nodesizes
[i
];
2211 seq_putc(seq
, '\n');
2212 seq_printf(seq
, "\tPointers: %u\n", pointers
);
2214 bytes
+= sizeof(struct node
*) * pointers
;
2215 seq_printf(seq
, "Null ptrs: %u\n", stat
->nullpointers
);
2216 seq_printf(seq
, "Total size: %u kB\n", (bytes
+ 1023) / 1024);
2219 #ifdef CONFIG_IP_FIB_TRIE_STATS
2220 static void trie_show_usage(struct seq_file
*seq
,
2221 const struct trie_use_stats
*stats
)
2223 seq_printf(seq
, "\nCounters:\n---------\n");
2224 seq_printf(seq
, "gets = %u\n", stats
->gets
);
2225 seq_printf(seq
, "backtracks = %u\n", stats
->backtrack
);
2226 seq_printf(seq
, "semantic match passed = %u\n",
2227 stats
->semantic_match_passed
);
2228 seq_printf(seq
, "semantic match miss = %u\n",
2229 stats
->semantic_match_miss
);
2230 seq_printf(seq
, "null node hit= %u\n", stats
->null_node_hit
);
2231 seq_printf(seq
, "skipped node resize = %u\n\n",
2232 stats
->resize_node_skipped
);
2234 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2236 static void fib_table_print(struct seq_file
*seq
, struct fib_table
*tb
)
2238 if (tb
->tb_id
== RT_TABLE_LOCAL
)
2239 seq_puts(seq
, "Local:\n");
2240 else if (tb
->tb_id
== RT_TABLE_MAIN
)
2241 seq_puts(seq
, "Main:\n");
2243 seq_printf(seq
, "Id %d:\n", tb
->tb_id
);
2247 static int fib_triestat_seq_show(struct seq_file
*seq
, void *v
)
2249 struct net
*net
= (struct net
*)seq
->private;
2253 "Basic info: size of leaf:"
2254 " %Zd bytes, size of tnode: %Zd bytes.\n",
2255 sizeof(struct leaf
), sizeof(struct tnode
));
2257 for (h
= 0; h
< FIB_TABLE_HASHSZ
; h
++) {
2258 struct hlist_head
*head
= &net
->ipv4
.fib_table_hash
[h
];
2259 struct hlist_node
*node
;
2260 struct fib_table
*tb
;
2262 hlist_for_each_entry_rcu(tb
, node
, head
, tb_hlist
) {
2263 struct trie
*t
= (struct trie
*) tb
->tb_data
;
2264 struct trie_stat stat
;
2269 fib_table_print(seq
, tb
);
2271 trie_collect_stats(t
, &stat
);
2272 trie_show_stats(seq
, &stat
);
2273 #ifdef CONFIG_IP_FIB_TRIE_STATS
2274 trie_show_usage(seq
, &t
->stats
);
2282 static int fib_triestat_seq_open(struct inode
*inode
, struct file
*file
)
2284 return single_open_net(inode
, file
, fib_triestat_seq_show
);
2287 static const struct file_operations fib_triestat_fops
= {
2288 .owner
= THIS_MODULE
,
2289 .open
= fib_triestat_seq_open
,
2291 .llseek
= seq_lseek
,
2292 .release
= single_release_net
,
2295 static struct node
*fib_trie_get_idx(struct seq_file
*seq
, loff_t pos
)
2297 struct fib_trie_iter
*iter
= seq
->private;
2298 struct net
*net
= seq_file_net(seq
);
2302 for (h
= 0; h
< FIB_TABLE_HASHSZ
; h
++) {
2303 struct hlist_head
*head
= &net
->ipv4
.fib_table_hash
[h
];
2304 struct hlist_node
*node
;
2305 struct fib_table
*tb
;
2307 hlist_for_each_entry_rcu(tb
, node
, head
, tb_hlist
) {
2310 for (n
= fib_trie_get_first(iter
,
2311 (struct trie
*) tb
->tb_data
);
2312 n
; n
= fib_trie_get_next(iter
))
2323 static void *fib_trie_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2327 return fib_trie_get_idx(seq
, *pos
);
2330 static void *fib_trie_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2332 struct fib_trie_iter
*iter
= seq
->private;
2333 struct net
*net
= seq_file_net(seq
);
2334 struct fib_table
*tb
= iter
->tb
;
2335 struct hlist_node
*tb_node
;
2340 /* next node in same table */
2341 n
= fib_trie_get_next(iter
);
2345 /* walk rest of this hash chain */
2346 h
= tb
->tb_id
& (FIB_TABLE_HASHSZ
- 1);
2347 while ( (tb_node
= rcu_dereference(tb
->tb_hlist
.next
)) ) {
2348 tb
= hlist_entry(tb_node
, struct fib_table
, tb_hlist
);
2349 n
= fib_trie_get_first(iter
, (struct trie
*) tb
->tb_data
);
2354 /* new hash chain */
2355 while (++h
< FIB_TABLE_HASHSZ
) {
2356 struct hlist_head
*head
= &net
->ipv4
.fib_table_hash
[h
];
2357 hlist_for_each_entry_rcu(tb
, tb_node
, head
, tb_hlist
) {
2358 n
= fib_trie_get_first(iter
, (struct trie
*) tb
->tb_data
);
2370 static void fib_trie_seq_stop(struct seq_file
*seq
, void *v
)
2376 static void seq_indent(struct seq_file
*seq
, int n
)
2378 while (n
-- > 0) seq_puts(seq
, " ");
2381 static inline const char *rtn_scope(char *buf
, size_t len
, enum rt_scope_t s
)
2384 case RT_SCOPE_UNIVERSE
: return "universe";
2385 case RT_SCOPE_SITE
: return "site";
2386 case RT_SCOPE_LINK
: return "link";
2387 case RT_SCOPE_HOST
: return "host";
2388 case RT_SCOPE_NOWHERE
: return "nowhere";
2390 snprintf(buf
, len
, "scope=%d", s
);
2395 static const char *rtn_type_names
[__RTN_MAX
] = {
2396 [RTN_UNSPEC
] = "UNSPEC",
2397 [RTN_UNICAST
] = "UNICAST",
2398 [RTN_LOCAL
] = "LOCAL",
2399 [RTN_BROADCAST
] = "BROADCAST",
2400 [RTN_ANYCAST
] = "ANYCAST",
2401 [RTN_MULTICAST
] = "MULTICAST",
2402 [RTN_BLACKHOLE
] = "BLACKHOLE",
2403 [RTN_UNREACHABLE
] = "UNREACHABLE",
2404 [RTN_PROHIBIT
] = "PROHIBIT",
2405 [RTN_THROW
] = "THROW",
2407 [RTN_XRESOLVE
] = "XRESOLVE",
2410 static inline const char *rtn_type(char *buf
, size_t len
, unsigned t
)
2412 if (t
< __RTN_MAX
&& rtn_type_names
[t
])
2413 return rtn_type_names
[t
];
2414 snprintf(buf
, len
, "type %u", t
);
2418 /* Pretty print the trie */
2419 static int fib_trie_seq_show(struct seq_file
*seq
, void *v
)
2421 const struct fib_trie_iter
*iter
= seq
->private;
2424 if (!node_parent_rcu(n
))
2425 fib_table_print(seq
, iter
->tb
);
2428 struct tnode
*tn
= (struct tnode
*) n
;
2429 __be32 prf
= htonl(mask_pfx(tn
->key
, tn
->pos
));
2431 seq_indent(seq
, iter
->depth
-1);
2432 seq_printf(seq
, " +-- %pI4/%d %d %d %d\n",
2433 &prf
, tn
->pos
, tn
->bits
, tn
->full_children
,
2434 tn
->empty_children
);
2437 struct leaf
*l
= (struct leaf
*) n
;
2438 struct leaf_info
*li
;
2439 struct hlist_node
*node
;
2440 __be32 val
= htonl(l
->key
);
2442 seq_indent(seq
, iter
->depth
);
2443 seq_printf(seq
, " |-- %pI4\n", &val
);
2445 hlist_for_each_entry_rcu(li
, node
, &l
->list
, hlist
) {
2446 struct fib_alias
*fa
;
2448 list_for_each_entry_rcu(fa
, &li
->falh
, fa_list
) {
2449 char buf1
[32], buf2
[32];
2451 seq_indent(seq
, iter
->depth
+1);
2452 seq_printf(seq
, " /%d %s %s", li
->plen
,
2453 rtn_scope(buf1
, sizeof(buf1
),
2455 rtn_type(buf2
, sizeof(buf2
),
2458 seq_printf(seq
, " tos=%d", fa
->fa_tos
);
2459 seq_putc(seq
, '\n');
2467 static const struct seq_operations fib_trie_seq_ops
= {
2468 .start
= fib_trie_seq_start
,
2469 .next
= fib_trie_seq_next
,
2470 .stop
= fib_trie_seq_stop
,
2471 .show
= fib_trie_seq_show
,
2474 static int fib_trie_seq_open(struct inode
*inode
, struct file
*file
)
2476 return seq_open_net(inode
, file
, &fib_trie_seq_ops
,
2477 sizeof(struct fib_trie_iter
));
2480 static const struct file_operations fib_trie_fops
= {
2481 .owner
= THIS_MODULE
,
2482 .open
= fib_trie_seq_open
,
2484 .llseek
= seq_lseek
,
2485 .release
= seq_release_net
,
2488 struct fib_route_iter
{
2489 struct seq_net_private p
;
2490 struct trie
*main_trie
;
2495 static struct leaf
*fib_route_get_idx(struct fib_route_iter
*iter
, loff_t pos
)
2497 struct leaf
*l
= NULL
;
2498 struct trie
*t
= iter
->main_trie
;
2500 /* use cache location of last found key */
2501 if (iter
->pos
> 0 && pos
>= iter
->pos
&& (l
= fib_find_node(t
, iter
->key
)))
2505 l
= trie_firstleaf(t
);
2508 while (l
&& pos
-- > 0) {
2510 l
= trie_nextleaf(l
);
2514 iter
->key
= pos
; /* remember it */
2516 iter
->pos
= 0; /* forget it */
2521 static void *fib_route_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2524 struct fib_route_iter
*iter
= seq
->private;
2525 struct fib_table
*tb
;
2528 tb
= fib_get_table(seq_file_net(seq
), RT_TABLE_MAIN
);
2532 iter
->main_trie
= (struct trie
*) tb
->tb_data
;
2534 return SEQ_START_TOKEN
;
2536 return fib_route_get_idx(iter
, *pos
- 1);
2539 static void *fib_route_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2541 struct fib_route_iter
*iter
= seq
->private;
2545 if (v
== SEQ_START_TOKEN
) {
2547 l
= trie_firstleaf(iter
->main_trie
);
2550 l
= trie_nextleaf(l
);
2560 static void fib_route_seq_stop(struct seq_file
*seq
, void *v
)
2566 static unsigned fib_flag_trans(int type
, __be32 mask
, const struct fib_info
*fi
)
2568 static unsigned type2flags
[RTN_MAX
+ 1] = {
2569 [7] = RTF_REJECT
, [8] = RTF_REJECT
,
2571 unsigned flags
= type2flags
[type
];
2573 if (fi
&& fi
->fib_nh
->nh_gw
)
2574 flags
|= RTF_GATEWAY
;
2575 if (mask
== htonl(0xFFFFFFFF))
2582 * This outputs /proc/net/route.
2583 * The format of the file is not supposed to be changed
2584 * and needs to be same as fib_hash output to avoid breaking
2587 static int fib_route_seq_show(struct seq_file
*seq
, void *v
)
2590 struct leaf_info
*li
;
2591 struct hlist_node
*node
;
2593 if (v
== SEQ_START_TOKEN
) {
2594 seq_printf(seq
, "%-127s\n", "Iface\tDestination\tGateway "
2595 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2600 hlist_for_each_entry_rcu(li
, node
, &l
->list
, hlist
) {
2601 struct fib_alias
*fa
;
2602 __be32 mask
, prefix
;
2604 mask
= inet_make_mask(li
->plen
);
2605 prefix
= htonl(l
->key
);
2607 list_for_each_entry_rcu(fa
, &li
->falh
, fa_list
) {
2608 const struct fib_info
*fi
= fa
->fa_info
;
2609 unsigned flags
= fib_flag_trans(fa
->fa_type
, mask
, fi
);
2612 if (fa
->fa_type
== RTN_BROADCAST
2613 || fa
->fa_type
== RTN_MULTICAST
)
2618 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2619 "%d\t%08X\t%d\t%u\t%u%n",
2620 fi
->fib_dev
? fi
->fib_dev
->name
: "*",
2622 fi
->fib_nh
->nh_gw
, flags
, 0, 0,
2626 fi
->fib_advmss
+ 40 : 0),
2628 fi
->fib_rtt
>> 3, &len
);
2631 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2632 "%d\t%08X\t%d\t%u\t%u%n",
2633 prefix
, 0, flags
, 0, 0, 0,
2634 mask
, 0, 0, 0, &len
);
2636 seq_printf(seq
, "%*s\n", 127 - len
, "");
2643 static const struct seq_operations fib_route_seq_ops
= {
2644 .start
= fib_route_seq_start
,
2645 .next
= fib_route_seq_next
,
2646 .stop
= fib_route_seq_stop
,
2647 .show
= fib_route_seq_show
,
2650 static int fib_route_seq_open(struct inode
*inode
, struct file
*file
)
2652 return seq_open_net(inode
, file
, &fib_route_seq_ops
,
2653 sizeof(struct fib_route_iter
));
2656 static const struct file_operations fib_route_fops
= {
2657 .owner
= THIS_MODULE
,
2658 .open
= fib_route_seq_open
,
2660 .llseek
= seq_lseek
,
2661 .release
= seq_release_net
,
2664 int __net_init
fib_proc_init(struct net
*net
)
2666 if (!proc_net_fops_create(net
, "fib_trie", S_IRUGO
, &fib_trie_fops
))
2669 if (!proc_net_fops_create(net
, "fib_triestat", S_IRUGO
,
2670 &fib_triestat_fops
))
2673 if (!proc_net_fops_create(net
, "route", S_IRUGO
, &fib_route_fops
))
2679 proc_net_remove(net
, "fib_triestat");
2681 proc_net_remove(net
, "fib_trie");
2686 void __net_exit
fib_proc_exit(struct net
*net
)
2688 proc_net_remove(net
, "fib_trie");
2689 proc_net_remove(net
, "fib_triestat");
2690 proc_net_remove(net
, "route");
2693 #endif /* CONFIG_PROC_FS */