/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
 *     Agricultural Sciences.
 *
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally described in:
 *
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Substantial contributions to this work come from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
 */
#define VERSION "0.407"

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_STAT_DEPTH 32

#define KEYLENGTH (8*sizeof(t_key))
#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))

typedef unsigned int t_key;
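/*
 * Illustrative note (added for clarity, not in the original source): with
 * KEYLENGTH == 32, MASK_PFX(k, l) keeps only the l most significant bits of
 * k, e.g. MASK_PFX(0xC0A80101, 24) == 0xC0A80100, and TKEY_GET_MASK(offset,
 * bits) builds a mask of 'bits' one-bits starting 'offset' bits below the
 * most significant bit, e.g. TKEY_GET_MASK(8, 4) == 0x00F00000.
 */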
#define NODE_TYPE_MASK	0x1UL
#define NODE_PARENT(node) \
	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))

#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define NODE_SET_PARENT(node, ptr)		\
	rcu_assign_pointer((node)->parent,	\
			   ((unsigned long)(ptr)) | NODE_TYPE(node))

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
struct node {
	t_key key;
	unsigned long parent;
};

struct leaf {
	t_key key;
	unsigned long parent;
	struct hlist_head list;
	struct rcu_head rcu;
};

struct leaf_info {
	struct hlist_node hlist;
	struct rcu_head rcu;
	int plen;
	struct list_head falh;
};

struct tnode {
	t_key key;
	unsigned long parent;
	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short full_children;	/* KEYLENGTH bits needed */
	unsigned short empty_children;	/* KEYLENGTH bits needed */
	struct rcu_head rcu;
	struct node *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
	struct node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
	unsigned int revision;
};
static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
static struct node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
static void tnode_free(struct tnode *tn);

static kmem_cache_t *fn_alias_kmem __read_mostly;
static struct trie *trie_local = NULL, *trie_main = NULL;
/* rcu_read_lock needs to be held by the caller on the read side */

static inline struct node *tnode_get_child(struct tnode *tn, int i)
{
	BUG_ON(i >= 1 << tn->bits);

	return rcu_dereference(tn->child[i]);
}

static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}

static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	else
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
	return a == b;
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
		i++;
	return i;
}
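/*
 * Illustrative usage (added for clarity, not in the original source):
 * for the key 0xC0A80100 (192.168.1.0), tkey_extract_bits(0xC0A80100, 0, 8)
 * yields 0xC0, and tkey_sub_equals(0xC0A80100, 0, 24, 0xC0A801FF) is true
 * because the two keys agree in their 24 most significant bits.
 */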
/*
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
  correct key path.

  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
  call to tkey_sub_equals() in trie_insert().

  If n is an internal node - a 'tnode' here - the various parts of its key
  have many different meanings.

  Example:
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
  n->bits = 4

  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  index into the parent's child array. That is, they will be used to find
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

  All the bits we have seen so far are significant to the node n. The rest
  of the bits are really not needed or indeed known in n->key.

  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  n's child array, and will of course be different for each child.

  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.
*/
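/*
 * Worked example (added for clarity, not part of the original comment):
 * with tp->pos = 7, tp->bits = 3, n->pos = 15 and n->bits = 4 as in the
 * figure above, tkey_extract_bits(n->key, 7, 3) gives the "N" bits that
 * index n in tp's child array, bits 10..14 are the skipped "S" bits that
 * must be re-checked with tkey_sub_equals(), and
 * tkey_extract_bits(key, 15, 4) selects one of n's 16 children ("C").
 */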
static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}

static int halve_threshold = 25;
static int inflate_threshold = 50;
static int halve_threshold_root = 15;
static int inflate_threshold_root = 25;
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}

static void __leaf_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf, rcu));
}

static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}

static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}

static struct tnode *tnode_alloc(unsigned int size)
{
	struct page *pages;

	if (size <= PAGE_SIZE)
		return kcalloc(size, 1, GFP_KERNEL);

	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
	if (!pages)
		return NULL;

	return page_address(pages);
}
static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	unsigned int size = sizeof(struct tnode) +
		(1 << tn->bits) * sizeof(struct node *);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else
		free_pages((unsigned long)tn, get_order(size));
}

static inline void tnode_free(struct tnode *tn)
{
	if (IS_LEAF(tn)) {
		struct leaf *l = (struct leaf *) tn;
		call_rcu_bh(&l->rcu, __leaf_free_rcu);
	} else
		call_rcu(&tn->rcu, __tnode_free_rcu);
}

static struct leaf *leaf_new(void)
{
	struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
	int nchildren = 1<<bits;
	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
	struct tnode *tn = tnode_alloc(sz);

	if (tn) {
		tn->parent = T_TNODE;
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}

	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
		 (unsigned int) (sizeof(struct node) * 1<<bits));
	return tn;
}
/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

static inline int tnode_full(const struct tnode *tn, const struct node *n)
{
	if (n == NULL || IS_LEAF(n))
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */

static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		NODE_SET_PARENT(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
static struct node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */
			NODE_SET_PARENT(n, NULL);
			tnode_free(tn);
			return n;
		}
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *     new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, though it would mess up the while () loop.
	 *
	 * So instead the condition is kept in the form:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *     inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *     inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *     tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *     tn->full_children) >=
	 *     inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *     tn->empty_children) >= inflate_threshold *
	 *     tnode_child_length(tn)
	 */
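	/*
	 * Worked example (added for clarity, not part of the original
	 * comment): with tnode_child_length(tn) == 16, tn->empty_children
	 * == 5 and tn->full_children == 4, the left-hand side is
	 * 50 * (4 + 16 - 5) = 750 and the right-hand side is 50 * 16 = 800
	 * for the default inflate_threshold of 50, so the node is not
	 * doubled; with 6 full children the left-hand side becomes 850 and
	 * the node is inflated.
	 */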
556 /* Keep root node larger */
559 inflate_threshold_use
= inflate_threshold_root
;
561 inflate_threshold_use
= inflate_threshold
;
564 while ((tn
->full_children
> 0 &&
565 50 * (tn
->full_children
+ tnode_child_length(tn
) - tn
->empty_children
) >=
566 inflate_threshold_use
* tnode_child_length(tn
))) {
572 #ifdef CONFIG_IP_FIB_TRIE_STATS
573 t
->stats
.resize_node_skipped
++;
582 * Halve as long as the number of empty children in this
583 * node is above threshold.
587 /* Keep root node larger */
590 halve_threshold_use
= halve_threshold_root
;
592 halve_threshold_use
= halve_threshold
;
595 while (tn
->bits
> 1 &&
596 100 * (tnode_child_length(tn
) - tn
->empty_children
) <
597 halve_threshold_use
* tnode_child_length(tn
)) {
603 #ifdef CONFIG_IP_FIB_TRIE_STATS
604 t
->stats
.resize_node_skipped
++;
611 /* Only one child remains */
612 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
613 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
620 /* compress one level */
622 NODE_SET_PARENT(n
, NULL
);
627 return (struct node
*) tn
;
630 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
)
633 struct tnode
*oldtnode
= tn
;
634 int olen
= tnode_child_length(tn
);
637 pr_debug("In inflate\n");
639 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
+ 1);
642 return ERR_PTR(-ENOMEM
);
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the old node and the
	 * inflation of the tnode is ignored.
651 for (i
= 0; i
< olen
; i
++) {
652 struct tnode
*inode
= (struct tnode
*) tnode_get_child(oldtnode
, i
);
656 inode
->pos
== oldtnode
->pos
+ oldtnode
->bits
&&
658 struct tnode
*left
, *right
;
659 t_key m
= TKEY_GET_MASK(inode
->pos
, 1);
661 left
= tnode_new(inode
->key
&(~m
), inode
->pos
+ 1,
666 right
= tnode_new(inode
->key
|m
, inode
->pos
+ 1,
674 put_child(t
, tn
, 2*i
, (struct node
*) left
);
675 put_child(t
, tn
, 2*i
+1, (struct node
*) right
);
679 for (i
= 0; i
< olen
; i
++) {
680 struct node
*node
= tnode_get_child(oldtnode
, i
);
681 struct tnode
*left
, *right
;
688 /* A leaf or an internal node with skipped bits */
690 if (IS_LEAF(node
) || ((struct tnode
*) node
)->pos
>
691 tn
->pos
+ tn
->bits
- 1) {
692 if (tkey_extract_bits(node
->key
, oldtnode
->pos
+ oldtnode
->bits
,
694 put_child(t
, tn
, 2*i
, node
);
696 put_child(t
, tn
, 2*i
+1, node
);
700 /* An internal node with two children */
701 inode
= (struct tnode
*) node
;
703 if (inode
->bits
== 1) {
704 put_child(t
, tn
, 2*i
, inode
->child
[0]);
705 put_child(t
, tn
, 2*i
+1, inode
->child
[1]);
711 /* An internal node with more than two children */
713 /* We will replace this node 'inode' with two new
714 * ones, 'left' and 'right', each with half of the
715 * original children. The two new nodes will have
716 * a position one bit further down the key and this
717 * means that the "significant" part of their keys
718 * (see the discussion near the top of this file)
719 * will differ by one bit, which will be "0" in
720 * left's key and "1" in right's key. Since we are
721 * moving the key position by one step, the bit that
722 * we are moving away from - the bit at position
723 * (inode->pos) - is the one that will differ between
724 * left and right. So... we synthesize that bit in the
726 * The mask 'm' below will be a single "one" bit at
727 * the position (inode->pos)
730 /* Use the old key, but set the new significant
734 left
= (struct tnode
*) tnode_get_child(tn
, 2*i
);
735 put_child(t
, tn
, 2*i
, NULL
);
739 right
= (struct tnode
*) tnode_get_child(tn
, 2*i
+1);
740 put_child(t
, tn
, 2*i
+1, NULL
);
744 size
= tnode_child_length(left
);
745 for (j
= 0; j
< size
; j
++) {
746 put_child(t
, left
, j
, inode
->child
[j
]);
747 put_child(t
, right
, j
, inode
->child
[j
+ size
]);
749 put_child(t
, tn
, 2*i
, resize(t
, left
));
750 put_child(t
, tn
, 2*i
+1, resize(t
, right
));
754 tnode_free(oldtnode
);
758 int size
= tnode_child_length(tn
);
761 for (j
= 0; j
< size
; j
++)
763 tnode_free((struct tnode
*)tn
->child
[j
]);
767 return ERR_PTR(-ENOMEM
);
771 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
)
773 struct tnode
*oldtnode
= tn
;
774 struct node
*left
, *right
;
776 int olen
= tnode_child_length(tn
);
778 pr_debug("In halve\n");
780 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
- 1);
783 return ERR_PTR(-ENOMEM
);
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the old node and the
	 * halving of the tnode is ignored.
792 for (i
= 0; i
< olen
; i
+= 2) {
793 left
= tnode_get_child(oldtnode
, i
);
794 right
= tnode_get_child(oldtnode
, i
+1);
796 /* Two nonempty children */
800 newn
= tnode_new(left
->key
, tn
->pos
+ tn
->bits
, 1);
805 put_child(t
, tn
, i
/2, (struct node
*)newn
);
810 for (i
= 0; i
< olen
; i
+= 2) {
811 struct tnode
*newBinNode
;
813 left
= tnode_get_child(oldtnode
, i
);
814 right
= tnode_get_child(oldtnode
, i
+1);
816 /* At least one of the children is empty */
818 if (right
== NULL
) /* Both are empty */
820 put_child(t
, tn
, i
/2, right
);
825 put_child(t
, tn
, i
/2, left
);
829 /* Two nonempty children */
830 newBinNode
= (struct tnode
*) tnode_get_child(tn
, i
/2);
831 put_child(t
, tn
, i
/2, NULL
);
832 put_child(t
, newBinNode
, 0, left
);
833 put_child(t
, newBinNode
, 1, right
);
834 put_child(t
, tn
, i
/2, resize(t
, newBinNode
));
836 tnode_free(oldtnode
);
840 int size
= tnode_child_length(tn
);
843 for (j
= 0; j
< size
; j
++)
845 tnode_free((struct tnode
*)tn
->child
[j
]);
849 return ERR_PTR(-ENOMEM
);
853 static void trie_init(struct trie
*t
)
859 rcu_assign_pointer(t
->trie
, NULL
);
861 #ifdef CONFIG_IP_FIB_TRIE_STATS
862 memset(&t
->stats
, 0, sizeof(struct trie_use_stats
));
/* The read side must hold rcu_read_lock; currently the dump routines
   reach this via get_fa_head and dump */
869 static struct leaf_info
*find_leaf_info(struct leaf
*l
, int plen
)
871 struct hlist_head
*head
= &l
->list
;
872 struct hlist_node
*node
;
873 struct leaf_info
*li
;
875 hlist_for_each_entry_rcu(li
, node
, head
, hlist
)
876 if (li
->plen
== plen
)
882 static inline struct list_head
* get_fa_head(struct leaf
*l
, int plen
)
884 struct leaf_info
*li
= find_leaf_info(l
, plen
);
892 static void insert_leaf_info(struct hlist_head
*head
, struct leaf_info
*new)
894 struct leaf_info
*li
= NULL
, *last
= NULL
;
895 struct hlist_node
*node
;
897 if (hlist_empty(head
)) {
898 hlist_add_head_rcu(&new->hlist
, head
);
900 hlist_for_each_entry(li
, node
, head
, hlist
) {
901 if (new->plen
> li
->plen
)
907 hlist_add_after_rcu(&last
->hlist
, &new->hlist
);
909 hlist_add_before_rcu(&new->hlist
, &li
->hlist
);
/* rcu_read_lock needs to be held by the caller on the read side */
916 fib_find_node(struct trie
*t
, u32 key
)
923 n
= rcu_dereference(t
->trie
);
925 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
926 tn
= (struct tnode
*) n
;
930 if (tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
931 pos
= tn
->pos
+ tn
->bits
;
932 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
936 /* Case we have found a leaf. Compare prefixes */
938 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
))
939 return (struct leaf
*)n
;
944 static struct node
*trie_rebalance(struct trie
*t
, struct tnode
*tn
)
948 struct tnode
*tp
= NULL
;
952 while (tn
!= NULL
&& NODE_PARENT(tn
) != NULL
) {
954 tp
= NODE_PARENT(tn
);
955 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
956 wasfull
= tnode_full(tp
, tnode_get_child(tp
, cindex
));
957 tn
= (struct tnode
*) resize (t
, (struct tnode
*)tn
);
958 tnode_put_child_reorg((struct tnode
*)tp
, cindex
,(struct node
*)tn
, wasfull
);
960 if (!NODE_PARENT(tn
))
963 tn
= NODE_PARENT(tn
);
965 /* Handle last (top) tnode */
967 tn
= (struct tnode
*) resize(t
, (struct tnode
*)tn
);
969 return (struct node
*) tn
;
972 /* only used from updater-side */
974 static struct list_head
*
975 fib_insert_node(struct trie
*t
, int *err
, u32 key
, int plen
)
978 struct tnode
*tp
= NULL
, *tn
= NULL
;
982 struct list_head
*fa_head
= NULL
;
983 struct leaf_info
*li
;
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in it, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */
1007 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
1008 tn
= (struct tnode
*) n
;
1012 if (tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
1014 pos
= tn
->pos
+ tn
->bits
;
1015 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1017 BUG_ON(n
&& NODE_PARENT(n
) != tn
);
1023 * n ----> NULL, LEAF or TNODE
1025 * tp is n's (parent) ----> NULL or TNODE
1028 BUG_ON(tp
&& IS_LEAF(tp
));
1030 /* Case 1: n is a leaf. Compare prefixes */
1032 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
1033 struct leaf
*l
= (struct leaf
*) n
;
1035 li
= leaf_info_new(plen
);
1042 fa_head
= &li
->falh
;
1043 insert_leaf_info(&l
->list
, li
);
1055 li
= leaf_info_new(plen
);
1058 tnode_free((struct tnode
*) l
);
1063 fa_head
= &li
->falh
;
1064 insert_leaf_info(&l
->list
, li
);
1066 if (t
->trie
&& n
== NULL
) {
1067 /* Case 2: n is NULL, and will just insert a new leaf */
1069 NODE_SET_PARENT(l
, tp
);
1071 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1072 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)l
);
1074 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
	 * Add a new tnode here
	 * first tnode needs some special handling
1081 pos
= tp
->pos
+tp
->bits
;
1086 newpos
= tkey_mismatch(key
, pos
, n
->key
);
1087 tn
= tnode_new(n
->key
, newpos
, 1);
1090 tn
= tnode_new(key
, newpos
, 1); /* First tnode */
1095 tnode_free((struct tnode
*) l
);
1100 NODE_SET_PARENT(tn
, tp
);
1102 missbit
= tkey_extract_bits(key
, newpos
, 1);
1103 put_child(t
, tn
, missbit
, (struct node
*)l
);
1104 put_child(t
, tn
, 1-missbit
, n
);
1107 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1108 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)tn
);
1110 rcu_assign_pointer(t
->trie
, (struct node
*)tn
); /* First tnode */
1115 if (tp
&& tp
->pos
+ tp
->bits
> 32)
1116 printk(KERN_WARNING
"fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1117 tp
, tp
->pos
, tp
->bits
, key
, plen
);
1119 /* Rebalance the trie */
1121 rcu_assign_pointer(t
->trie
, trie_rebalance(t
, tp
));
1129 fn_trie_insert(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1130 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1132 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1133 struct fib_alias
*fa
, *new_fa
;
1134 struct list_head
*fa_head
= NULL
;
1135 struct fib_info
*fi
;
1136 int plen
= r
->rtm_dst_len
;
1137 int type
= r
->rtm_type
;
1138 u8 tos
= r
->rtm_tos
;
1148 memcpy(&key
, rta
->rta_dst
, 4);
1152 pr_debug("Insert table=%d %08x/%d\n", tb
->tb_id
, key
, plen
);
1154 mask
= ntohl(inet_make_mask(plen
));
1161 fi
= fib_create_info(r
, rta
, nlhdr
, &err
);
1166 l
= fib_find_node(t
, key
);
1170 fa_head
= get_fa_head(l
, plen
);
1171 fa
= fib_find_alias(fa_head
, tos
, fi
->fib_priority
);
	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert the new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert it at the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */
1185 if (fa
&& fa
->fa_info
->fib_priority
== fi
->fib_priority
) {
1186 struct fib_alias
*fa_orig
;
1189 if (nlhdr
->nlmsg_flags
& NLM_F_EXCL
)
1192 if (nlhdr
->nlmsg_flags
& NLM_F_REPLACE
) {
1193 struct fib_info
*fi_drop
;
1197 new_fa
= kmem_cache_alloc(fn_alias_kmem
, SLAB_KERNEL
);
1201 fi_drop
= fa
->fa_info
;
1202 new_fa
->fa_tos
= fa
->fa_tos
;
1203 new_fa
->fa_info
= fi
;
1204 new_fa
->fa_type
= type
;
1205 new_fa
->fa_scope
= r
->rtm_scope
;
1206 state
= fa
->fa_state
;
1207 new_fa
->fa_state
&= ~FA_S_ACCESSED
;
1209 list_replace_rcu(&fa
->fa_list
, &new_fa
->fa_list
);
1210 alias_free_mem_rcu(fa
);
1212 fib_release_info(fi_drop
);
1213 if (state
& FA_S_ACCESSED
)
1218 /* Error if we find a perfect match which
1219 * uses the same scope, type, and nexthop
1223 list_for_each_entry(fa
, fa_orig
->fa_list
.prev
, fa_list
) {
1224 if (fa
->fa_tos
!= tos
)
1226 if (fa
->fa_info
->fib_priority
!= fi
->fib_priority
)
1228 if (fa
->fa_type
== type
&&
1229 fa
->fa_scope
== r
->rtm_scope
&&
1230 fa
->fa_info
== fi
) {
1234 if (!(nlhdr
->nlmsg_flags
& NLM_F_APPEND
))
1238 if (!(nlhdr
->nlmsg_flags
& NLM_F_CREATE
))
1242 new_fa
= kmem_cache_alloc(fn_alias_kmem
, SLAB_KERNEL
);
1246 new_fa
->fa_info
= fi
;
1247 new_fa
->fa_tos
= tos
;
1248 new_fa
->fa_type
= type
;
1249 new_fa
->fa_scope
= r
->rtm_scope
;
1250 new_fa
->fa_state
= 0;
	 * Insert the new entry into the list.
1256 fa_head
= fib_insert_node(t
, &err
, key
, plen
);
1259 goto out_free_new_fa
;
1262 list_add_tail_rcu(&new_fa
->fa_list
,
1263 (fa
? &fa
->fa_list
: fa_head
));
1266 rtmsg_fib(RTM_NEWROUTE
, htonl(key
), new_fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1271 kmem_cache_free(fn_alias_kmem
, new_fa
);
1273 fib_release_info(fi
);
1279 /* should be called with rcu_read_lock */
1280 static inline int check_leaf(struct trie
*t
, struct leaf
*l
,
1281 t_key key
, int *plen
, const struct flowi
*flp
,
1282 struct fib_result
*res
)
1286 struct leaf_info
*li
;
1287 struct hlist_head
*hhead
= &l
->list
;
1288 struct hlist_node
*node
;
1290 hlist_for_each_entry_rcu(li
, node
, hhead
, hlist
) {
1292 mask
= ntohl(inet_make_mask(i
));
1293 if (l
->key
!= (key
& mask
))
1296 if ((err
= fib_semantic_match(&li
->falh
, flp
, res
, l
->key
, mask
, i
)) <= 0) {
1298 #ifdef CONFIG_IP_FIB_TRIE_STATS
1299 t
->stats
.semantic_match_passed
++;
1303 #ifdef CONFIG_IP_FIB_TRIE_STATS
1304 t
->stats
.semantic_match_miss
++;
1311 fn_trie_lookup(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1313 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1318 t_key key
= ntohl(flp
->fl4_dst
);
1321 int current_prefix_length
= KEYLENGTH
;
1323 t_key node_prefix
, key_prefix
, pref_mismatch
;
1328 n
= rcu_dereference(t
->trie
);
1332 #ifdef CONFIG_IP_FIB_TRIE_STATS
1338 if ((ret
= check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
)) <= 0)
1342 pn
= (struct tnode
*) n
;
1350 cindex
= tkey_extract_bits(MASK_PFX(key
, current_prefix_length
), pos
, bits
);
1352 n
= tnode_get_child(pn
, cindex
);
1355 #ifdef CONFIG_IP_FIB_TRIE_STATS
1356 t
->stats
.null_node_hit
++;
1362 if ((ret
= check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
)) <= 0)
1370 cn
= (struct tnode
*)n
;
		/* It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (see previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */
		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped off
		 * *are* zero.
		 */

		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1400 if (current_prefix_length
< pos
+bits
) {
1401 if (tkey_extract_bits(cn
->key
, current_prefix_length
,
1402 cn
->pos
- current_prefix_length
) != 0 ||
1408 * If chopped_off=0, the index is fully validated and we
1409 * only need to look at the skipped bits for this, the new,
1410 * tnode. What we actually want to do is to find out if
1411 * these skipped bits match our key perfectly, or if we will
1412 * have to count on finding a matching prefix further down,
1413 * because if we do, we would like to have some way of
1414 * verifying the existence of such a prefix at this point.
		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * node's key.
		 */

		/* Note: We aren't very concerned about the piece of the key
		 * that precedes pn->pos+pn->bits, since these bits have
		 * already been checked. The bits after cn->pos aren't checked
		 * since these are by definition "unknown" at this point.
		 * Thus, what we want to see is if we are about to enter the
		 * "prefix matching" state, and in that case verify that the
		 * skipped bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a matching
		 * prefix.
		 */
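		/*
		 * Illustrative example (added for clarity, not in the
		 * original source): with cn->pos == 16, cn->key == 0xC0A80000
		 * and key == 0xC0A90000, the two 16-bit prefixes differ,
		 * pref_mismatch == 0x00010000 and the loop below finds the
		 * first mismatching bit at mp == 15; tkey_extract_bits() is
		 * then used to check that the node's bits from mp up to
		 * cn->pos are all zero, which any shorter matching prefix in
		 * this subtree would require.
		 */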
1436 node_prefix
= MASK_PFX(cn
->key
, cn
->pos
);
1437 key_prefix
= MASK_PFX(key
, cn
->pos
);
1438 pref_mismatch
= key_prefix
^node_prefix
;
		/* In short: If skipped bits in this node do not match the search
		 * key, enter the "prefix matching" state directly.
1444 if (pref_mismatch
) {
1445 while (!(pref_mismatch
& (1<<(KEYLENGTH
-1)))) {
1447 pref_mismatch
= pref_mismatch
<<1;
1449 key_prefix
= tkey_extract_bits(cn
->key
, mp
, cn
->pos
-mp
);
1451 if (key_prefix
!= 0)
1454 if (current_prefix_length
>= cn
->pos
)
1455 current_prefix_length
= mp
;
1458 pn
= (struct tnode
*)n
; /* Descend */
		/* Chopping off zero bits does not change the child key (cindex) */
1466 while ((chopped_off
<= pn
->bits
) && !(cindex
& (1<<(chopped_off
-1))))
1469 /* Decrease current_... with bits chopped off */
1470 if (current_prefix_length
> pn
->pos
+ pn
->bits
- chopped_off
)
1471 current_prefix_length
= pn
->pos
+ pn
->bits
- chopped_off
;
	 * Either we do the actual chop off here, or, if we have
	 * chopped off all bits in this tnode, walk up to our parent.
1478 if (chopped_off
<= pn
->bits
) {
1479 cindex
&= ~(1 << (chopped_off
-1));
1481 if (NODE_PARENT(pn
) == NULL
)
1484 /* Get Child's index */
1485 cindex
= tkey_extract_bits(pn
->key
, NODE_PARENT(pn
)->pos
, NODE_PARENT(pn
)->bits
);
1486 pn
= NODE_PARENT(pn
);
1489 #ifdef CONFIG_IP_FIB_TRIE_STATS
1490 t
->stats
.backtrack
++;
1502 /* only called from updater side */
1503 static int trie_leaf_remove(struct trie
*t
, t_key key
)
1506 struct tnode
*tp
= NULL
;
1507 struct node
*n
= t
->trie
;
1510 pr_debug("entering trie_leaf_remove(%p)\n", n
);
	/* Note that in the case of skipped bits, those bits are *not* checked!
1513 * When we finish this, we will have NULL or a T_LEAF, and the
1514 * T_LEAF may or may not match our key.
1517 while (n
!= NULL
&& IS_TNODE(n
)) {
1518 struct tnode
*tn
= (struct tnode
*) n
;
1520 n
= tnode_get_child(tn
,tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1522 BUG_ON(n
&& NODE_PARENT(n
) != tn
);
1524 l
= (struct leaf
*) n
;
1526 if (!n
|| !tkey_equals(l
->key
, key
))
1531 * Remove the leaf and rebalance the tree
1538 tp
= NODE_PARENT(n
);
1539 tnode_free((struct tnode
*) n
);
1542 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1543 put_child(t
, (struct tnode
*)tp
, cindex
, NULL
);
1544 rcu_assign_pointer(t
->trie
, trie_rebalance(t
, tp
));
1546 rcu_assign_pointer(t
->trie
, NULL
);
1553 fn_trie_delete(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1554 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1556 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1558 int plen
= r
->rtm_dst_len
;
1559 u8 tos
= r
->rtm_tos
;
1560 struct fib_alias
*fa
, *fa_to_delete
;
1561 struct list_head
*fa_head
;
1563 struct leaf_info
*li
;
1571 memcpy(&key
, rta
->rta_dst
, 4);
1574 mask
= ntohl(inet_make_mask(plen
));
1580 l
= fib_find_node(t
, key
);
1585 fa_head
= get_fa_head(l
, plen
);
1586 fa
= fib_find_alias(fa_head
, tos
, 0);
1591 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key
, plen
, tos
, t
);
1593 fa_to_delete
= NULL
;
1594 fa_head
= fa
->fa_list
.prev
;
1596 list_for_each_entry(fa
, fa_head
, fa_list
) {
1597 struct fib_info
*fi
= fa
->fa_info
;
1599 if (fa
->fa_tos
!= tos
)
1602 if ((!r
->rtm_type
||
1603 fa
->fa_type
== r
->rtm_type
) &&
1604 (r
->rtm_scope
== RT_SCOPE_NOWHERE
||
1605 fa
->fa_scope
== r
->rtm_scope
) &&
1606 (!r
->rtm_protocol
||
1607 fi
->fib_protocol
== r
->rtm_protocol
) &&
1608 fib_nh_match(r
, nlhdr
, rta
, fi
) == 0) {
1618 rtmsg_fib(RTM_DELROUTE
, htonl(key
), fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1620 l
= fib_find_node(t
, key
);
1621 li
= find_leaf_info(l
, plen
);
1623 list_del_rcu(&fa
->fa_list
);
1625 if (list_empty(fa_head
)) {
1626 hlist_del_rcu(&li
->hlist
);
1630 if (hlist_empty(&l
->list
))
1631 trie_leaf_remove(t
, key
);
1633 if (fa
->fa_state
& FA_S_ACCESSED
)
1636 fib_release_info(fa
->fa_info
);
1637 alias_free_mem_rcu(fa
);
1641 static int trie_flush_list(struct trie
*t
, struct list_head
*head
)
1643 struct fib_alias
*fa
, *fa_node
;
1646 list_for_each_entry_safe(fa
, fa_node
, head
, fa_list
) {
1647 struct fib_info
*fi
= fa
->fa_info
;
1649 if (fi
&& (fi
->fib_flags
& RTNH_F_DEAD
)) {
1650 list_del_rcu(&fa
->fa_list
);
1651 fib_release_info(fa
->fa_info
);
1652 alias_free_mem_rcu(fa
);
1659 static int trie_flush_leaf(struct trie
*t
, struct leaf
*l
)
1662 struct hlist_head
*lih
= &l
->list
;
1663 struct hlist_node
*node
, *tmp
;
1664 struct leaf_info
*li
= NULL
;
1666 hlist_for_each_entry_safe(li
, node
, tmp
, lih
, hlist
) {
1667 found
+= trie_flush_list(t
, &li
->falh
);
1669 if (list_empty(&li
->falh
)) {
1670 hlist_del_rcu(&li
->hlist
);
/* rcu_read_lock needs to be held by the caller on the read side */
1679 static struct leaf
*nextleaf(struct trie
*t
, struct leaf
*thisleaf
)
1681 struct node
*c
= (struct node
*) thisleaf
;
1684 struct node
*trie
= rcu_dereference(t
->trie
);
1690 if (IS_LEAF(trie
)) /* trie w. just a leaf */
1691 return (struct leaf
*) trie
;
1693 p
= (struct tnode
*) trie
; /* Start */
1695 p
= (struct tnode
*) NODE_PARENT(c
);
1700 /* Find the next child of the parent */
1702 pos
= 1 + tkey_extract_bits(c
->key
, p
->pos
, p
->bits
);
1706 last
= 1 << p
->bits
;
1707 for (idx
= pos
; idx
< last
; idx
++) {
1708 c
= rcu_dereference(p
->child
[idx
]);
		/* Descend if tnode */
1714 while (IS_TNODE(c
)) {
1715 p
= (struct tnode
*) c
;
1718 /* Rightmost non-NULL branch */
1719 if (p
&& IS_TNODE(p
))
1720 while (!(c
= rcu_dereference(p
->child
[idx
]))
1721 && idx
< (1<<p
->bits
)) idx
++;
1723 /* Done with this tnode? */
1724 if (idx
>= (1 << p
->bits
) || !c
)
1727 return (struct leaf
*) c
;
		/* No more children, go up one step */
1731 c
= (struct node
*) p
;
1732 p
= (struct tnode
*) NODE_PARENT(p
);
1734 return NULL
; /* Ready. Root of trie */
1737 static int fn_trie_flush(struct fib_table
*tb
)
1739 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1740 struct leaf
*ll
= NULL
, *l
= NULL
;
1745 for (h
= 0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1746 found
+= trie_flush_leaf(t
, l
);
1748 if (ll
&& hlist_empty(&ll
->list
))
1749 trie_leaf_remove(t
, ll
->key
);
1753 if (ll
&& hlist_empty(&ll
->list
))
1754 trie_leaf_remove(t
, ll
->key
);
1756 pr_debug("trie_flush found=%d\n", found
);
1760 static int trie_last_dflt
= -1;
1763 fn_trie_select_default(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1765 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1766 int order
, last_idx
;
1767 struct fib_info
*fi
= NULL
;
1768 struct fib_info
*last_resort
;
1769 struct fib_alias
*fa
= NULL
;
1770 struct list_head
*fa_head
;
1779 l
= fib_find_node(t
, 0);
1783 fa_head
= get_fa_head(l
, 0);
1787 if (list_empty(fa_head
))
1790 list_for_each_entry_rcu(fa
, fa_head
, fa_list
) {
1791 struct fib_info
*next_fi
= fa
->fa_info
;
1793 if (fa
->fa_scope
!= res
->scope
||
1794 fa
->fa_type
!= RTN_UNICAST
)
1797 if (next_fi
->fib_priority
> res
->fi
->fib_priority
)
1799 if (!next_fi
->fib_nh
[0].nh_gw
||
1800 next_fi
->fib_nh
[0].nh_scope
!= RT_SCOPE_LINK
)
1802 fa
->fa_state
|= FA_S_ACCESSED
;
1805 if (next_fi
!= res
->fi
)
1807 } else if (!fib_detect_death(fi
, order
, &last_resort
,
1808 &last_idx
, &trie_last_dflt
)) {
1810 fib_info_put(res
->fi
);
1812 atomic_inc(&fi
->fib_clntref
);
1813 trie_last_dflt
= order
;
1819 if (order
<= 0 || fi
== NULL
) {
1820 trie_last_dflt
= -1;
1824 if (!fib_detect_death(fi
, order
, &last_resort
, &last_idx
, &trie_last_dflt
)) {
1826 fib_info_put(res
->fi
);
1828 atomic_inc(&fi
->fib_clntref
);
1829 trie_last_dflt
= order
;
1832 if (last_idx
>= 0) {
1834 fib_info_put(res
->fi
);
1835 res
->fi
= last_resort
;
1837 atomic_inc(&last_resort
->fib_clntref
);
1839 trie_last_dflt
= last_idx
;
1844 static int fn_trie_dump_fa(t_key key
, int plen
, struct list_head
*fah
, struct fib_table
*tb
,
1845 struct sk_buff
*skb
, struct netlink_callback
*cb
)
1848 struct fib_alias
*fa
;
1850 u32 xkey
= htonl(key
);
	/* rcu_read_lock is held by the caller */
1857 list_for_each_entry_rcu(fa
, fah
, fa_list
) {
1862 BUG_ON(!fa
->fa_info
);
1864 if (fib_dump_info(skb
, NETLINK_CB(cb
->skb
).pid
,
1873 fa
->fa_info
, 0) < 0) {
1883 static int fn_trie_dump_plen(struct trie
*t
, int plen
, struct fib_table
*tb
, struct sk_buff
*skb
,
1884 struct netlink_callback
*cb
)
1887 struct list_head
*fa_head
;
1888 struct leaf
*l
= NULL
;
1892 for (h
= 0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1896 memset(&cb
->args
[3], 0,
1897 sizeof(cb
->args
) - 3*sizeof(cb
->args
[0]));
1899 fa_head
= get_fa_head(l
, plen
);
1904 if (list_empty(fa_head
))
1907 if (fn_trie_dump_fa(l
->key
, plen
, fa_head
, tb
, skb
, cb
)<0) {
1916 static int fn_trie_dump(struct fib_table
*tb
, struct sk_buff
*skb
, struct netlink_callback
*cb
)
1919 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1924 for (m
= 0; m
<= 32; m
++) {
1928 memset(&cb
->args
[2], 0,
1929 sizeof(cb
->args
) - 2*sizeof(cb
->args
[0]));
1931 if (fn_trie_dump_plen(t
, 32-m
, tb
, skb
, cb
)<0) {
1944 /* Fix more generic FIB names for init later */
1946 #ifdef CONFIG_IP_MULTIPLE_TABLES
1947 struct fib_table
* fib_hash_init(int id
)
1949 struct fib_table
* __init
fib_hash_init(int id
)
1952 struct fib_table
*tb
;
1955 if (fn_alias_kmem
== NULL
)
1956 fn_alias_kmem
= kmem_cache_create("ip_fib_alias",
1957 sizeof(struct fib_alias
),
1958 0, SLAB_HWCACHE_ALIGN
,
1961 tb
= kmalloc(sizeof(struct fib_table
) + sizeof(struct trie
),
1967 tb
->tb_lookup
= fn_trie_lookup
;
1968 tb
->tb_insert
= fn_trie_insert
;
1969 tb
->tb_delete
= fn_trie_delete
;
1970 tb
->tb_flush
= fn_trie_flush
;
1971 tb
->tb_select_default
= fn_trie_select_default
;
1972 tb
->tb_dump
= fn_trie_dump
;
1973 memset(tb
->tb_data
, 0, sizeof(struct trie
));
1975 t
= (struct trie
*) tb
->tb_data
;
1979 if (id
== RT_TABLE_LOCAL
)
1981 else if (id
== RT_TABLE_MAIN
)
1984 if (id
== RT_TABLE_LOCAL
)
1985 printk(KERN_INFO
"IPv4 FIB: Using LC-trie version %s\n", VERSION
);
1990 #ifdef CONFIG_PROC_FS
1991 /* Depth first Trie walk iterator */
1992 struct fib_trie_iter
{
1993 struct tnode
*tnode
;
1999 static struct node
*fib_trie_get_next(struct fib_trie_iter
*iter
)
2001 struct tnode
*tn
= iter
->tnode
;
2002 unsigned cindex
= iter
->index
;
2005 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2006 iter
->tnode
, iter
->index
, iter
->depth
);
2008 while (cindex
< (1<<tn
->bits
)) {
2009 struct node
*n
= tnode_get_child(tn
, cindex
);
2014 iter
->index
= cindex
+ 1;
2016 /* push down one level */
2017 iter
->tnode
= (struct tnode
*) n
;
2027 /* Current node exhausted, pop back up */
2028 p
= NODE_PARENT(tn
);
2030 cindex
= tkey_extract_bits(tn
->key
, p
->pos
, p
->bits
)+1;
2040 static struct node
*fib_trie_get_first(struct fib_trie_iter
*iter
,
2048 n
= rcu_dereference(t
->trie
);
2053 if (n
&& IS_TNODE(n
)) {
2054 iter
->tnode
= (struct tnode
*) n
;
2063 static void trie_collect_stats(struct trie
*t
, struct trie_stat
*s
)
2066 struct fib_trie_iter iter
;
2068 memset(s
, 0, sizeof(*s
));
2071 for (n
= fib_trie_get_first(&iter
, t
); n
;
2072 n
= fib_trie_get_next(&iter
)) {
2075 s
->totdepth
+= iter
.depth
;
2076 if (iter
.depth
> s
->maxdepth
)
2077 s
->maxdepth
= iter
.depth
;
2079 const struct tnode
*tn
= (const struct tnode
*) n
;
2083 if(tn
->bits
< MAX_STAT_DEPTH
)
2084 s
->nodesizes
[tn
->bits
]++;
2086 for (i
= 0; i
< (1<<tn
->bits
); i
++)
2095 * This outputs /proc/net/fib_triestats
2097 static void trie_show_stats(struct seq_file
*seq
, struct trie_stat
*stat
)
2099 unsigned i
, max
, pointers
, bytes
, avdepth
;
2102 avdepth
= stat
->totdepth
*100 / stat
->leaves
;
2106 seq_printf(seq
, "\tAver depth: %d.%02d\n", avdepth
/ 100, avdepth
% 100 );
2107 seq_printf(seq
, "\tMax depth: %u\n", stat
->maxdepth
);
2109 seq_printf(seq
, "\tLeaves: %u\n", stat
->leaves
);
2111 bytes
= sizeof(struct leaf
) * stat
->leaves
;
2112 seq_printf(seq
, "\tInternal nodes: %d\n\t", stat
->tnodes
);
2113 bytes
+= sizeof(struct tnode
) * stat
->tnodes
;
2115 max
= MAX_STAT_DEPTH
;
2116 while (max
> 0 && stat
->nodesizes
[max
-1] == 0)
2120 for (i
= 1; i
<= max
; i
++)
2121 if (stat
->nodesizes
[i
] != 0) {
2122 seq_printf(seq
, " %d: %d", i
, stat
->nodesizes
[i
]);
2123 pointers
+= (1<<i
) * stat
->nodesizes
[i
];
2125 seq_putc(seq
, '\n');
2126 seq_printf(seq
, "\tPointers: %d\n", pointers
);
2128 bytes
+= sizeof(struct node
*) * pointers
;
2129 seq_printf(seq
, "Null ptrs: %d\n", stat
->nullpointers
);
2130 seq_printf(seq
, "Total size: %d kB\n", (bytes
+ 1023) / 1024);
2132 #ifdef CONFIG_IP_FIB_TRIE_STATS
2133 seq_printf(seq
, "Counters:\n---------\n");
2134 seq_printf(seq
,"gets = %d\n", t
->stats
.gets
);
2135 seq_printf(seq
,"backtracks = %d\n", t
->stats
.backtrack
);
2136 seq_printf(seq
,"semantic match passed = %d\n", t
->stats
.semantic_match_passed
);
2137 seq_printf(seq
,"semantic match miss = %d\n", t
->stats
.semantic_match_miss
);
2138 seq_printf(seq
,"null node hit= %d\n", t
->stats
.null_node_hit
);
2139 seq_printf(seq
,"skipped node resize = %d\n", t
->stats
.resize_node_skipped
);
2141 memset(&(t
->stats
), 0, sizeof(t
->stats
));
2143 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2146 static int fib_triestat_seq_show(struct seq_file
*seq
, void *v
)
2148 struct trie_stat
*stat
;
2150 stat
= kmalloc(sizeof(*stat
), GFP_KERNEL
);
2154 seq_printf(seq
, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2155 sizeof(struct leaf
), sizeof(struct tnode
));
2158 seq_printf(seq
, "Local:\n");
2159 trie_collect_stats(trie_local
, stat
);
2160 trie_show_stats(seq
, stat
);
2164 seq_printf(seq
, "Main:\n");
2165 trie_collect_stats(trie_main
, stat
);
2166 trie_show_stats(seq
, stat
);
2173 static int fib_triestat_seq_open(struct inode
*inode
, struct file
*file
)
2175 return single_open(file
, fib_triestat_seq_show
, NULL
);
2178 static struct file_operations fib_triestat_fops
= {
2179 .owner
= THIS_MODULE
,
2180 .open
= fib_triestat_seq_open
,
2182 .llseek
= seq_lseek
,
2183 .release
= single_release
,
2186 static struct node
*fib_trie_get_idx(struct fib_trie_iter
*iter
,
2192 for (n
= fib_trie_get_first(iter
, trie_local
);
2193 n
; ++idx
, n
= fib_trie_get_next(iter
)) {
2198 for (n
= fib_trie_get_first(iter
, trie_main
);
2199 n
; ++idx
, n
= fib_trie_get_next(iter
)) {
2206 static void *fib_trie_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2210 return SEQ_START_TOKEN
;
2211 return fib_trie_get_idx(seq
->private, *pos
- 1);
2214 static void *fib_trie_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2216 struct fib_trie_iter
*iter
= seq
->private;
2220 if (v
== SEQ_START_TOKEN
)
2221 return fib_trie_get_idx(iter
, 0);
2223 v
= fib_trie_get_next(iter
);
2228 /* continue scan in next trie */
2229 if (iter
->trie
== trie_local
)
2230 return fib_trie_get_first(iter
, trie_main
);
2235 static void fib_trie_seq_stop(struct seq_file
*seq
, void *v
)
2240 static void seq_indent(struct seq_file
*seq
, int n
)
2242 while (n
-- > 0) seq_puts(seq
, " ");
2245 static inline const char *rtn_scope(enum rt_scope_t s
)
2247 static char buf
[32];
2250 case RT_SCOPE_UNIVERSE
: return "universe";
2251 case RT_SCOPE_SITE
: return "site";
2252 case RT_SCOPE_LINK
: return "link";
2253 case RT_SCOPE_HOST
: return "host";
2254 case RT_SCOPE_NOWHERE
: return "nowhere";
2256 snprintf(buf
, sizeof(buf
), "scope=%d", s
);
2261 static const char *rtn_type_names
[__RTN_MAX
] = {
2262 [RTN_UNSPEC
] = "UNSPEC",
2263 [RTN_UNICAST
] = "UNICAST",
2264 [RTN_LOCAL
] = "LOCAL",
2265 [RTN_BROADCAST
] = "BROADCAST",
2266 [RTN_ANYCAST
] = "ANYCAST",
2267 [RTN_MULTICAST
] = "MULTICAST",
2268 [RTN_BLACKHOLE
] = "BLACKHOLE",
2269 [RTN_UNREACHABLE
] = "UNREACHABLE",
2270 [RTN_PROHIBIT
] = "PROHIBIT",
2271 [RTN_THROW
] = "THROW",
2273 [RTN_XRESOLVE
] = "XRESOLVE",
2276 static inline const char *rtn_type(unsigned t
)
2278 static char buf
[32];
2280 if (t
< __RTN_MAX
&& rtn_type_names
[t
])
2281 return rtn_type_names
[t
];
2282 snprintf(buf
, sizeof(buf
), "type %d", t
);
2286 /* Pretty print the trie */
2287 static int fib_trie_seq_show(struct seq_file
*seq
, void *v
)
2289 const struct fib_trie_iter
*iter
= seq
->private;
2292 if (v
== SEQ_START_TOKEN
)
2296 struct tnode
*tn
= (struct tnode
*) n
;
2297 t_key prf
= ntohl(MASK_PFX(tn
->key
, tn
->pos
));
2299 if (!NODE_PARENT(n
)) {
2300 if (iter
->trie
== trie_local
)
2301 seq_puts(seq
, "<local>:\n");
2303 seq_puts(seq
, "<main>:\n");
2305 seq_indent(seq
, iter
->depth
-1);
2306 seq_printf(seq
, " +-- %d.%d.%d.%d/%d %d %d %d\n",
2307 NIPQUAD(prf
), tn
->pos
, tn
->bits
, tn
->full_children
,
2308 tn
->empty_children
);
2311 struct leaf
*l
= (struct leaf
*) n
;
2313 u32 val
= ntohl(l
->key
);
2315 seq_indent(seq
, iter
->depth
);
2316 seq_printf(seq
, " |-- %d.%d.%d.%d\n", NIPQUAD(val
));
2317 for (i
= 32; i
>= 0; i
--) {
2318 struct leaf_info
*li
= find_leaf_info(l
, i
);
2320 struct fib_alias
*fa
;
2321 list_for_each_entry_rcu(fa
, &li
->falh
, fa_list
) {
2322 seq_indent(seq
, iter
->depth
+1);
2323 seq_printf(seq
, " /%d %s %s", i
,
2324 rtn_scope(fa
->fa_scope
),
2325 rtn_type(fa
->fa_type
));
2327 seq_printf(seq
, "tos =%d\n",
2329 seq_putc(seq
, '\n');
2338 static struct seq_operations fib_trie_seq_ops
= {
2339 .start
= fib_trie_seq_start
,
2340 .next
= fib_trie_seq_next
,
2341 .stop
= fib_trie_seq_stop
,
2342 .show
= fib_trie_seq_show
,
2345 static int fib_trie_seq_open(struct inode
*inode
, struct file
*file
)
2347 struct seq_file
*seq
;
2349 struct fib_trie_iter
*s
= kmalloc(sizeof(*s
), GFP_KERNEL
);
2354 rc
= seq_open(file
, &fib_trie_seq_ops
);
2358 seq
= file
->private_data
;
2360 memset(s
, 0, sizeof(*s
));
2368 static struct file_operations fib_trie_fops
= {
2369 .owner
= THIS_MODULE
,
2370 .open
= fib_trie_seq_open
,
2372 .llseek
= seq_lseek
,
2373 .release
= seq_release_private
,
2376 static unsigned fib_flag_trans(int type
, u32 mask
, const struct fib_info
*fi
)
2378 static unsigned type2flags
[RTN_MAX
+ 1] = {
2379 [7] = RTF_REJECT
, [8] = RTF_REJECT
,
2381 unsigned flags
= type2flags
[type
];
2383 if (fi
&& fi
->fib_nh
->nh_gw
)
2384 flags
|= RTF_GATEWAY
;
2385 if (mask
== 0xFFFFFFFF)
2392 * This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
 *	and needs to be the same as the fib_hash output to avoid breaking
2397 static int fib_route_seq_show(struct seq_file
*seq
, void *v
)
2399 const struct fib_trie_iter
*iter
= seq
->private;
2404 if (v
== SEQ_START_TOKEN
) {
2405 seq_printf(seq
, "%-127s\n", "Iface\tDestination\tGateway "
2406 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2411 if (iter
->trie
== trie_local
)
2416 for (i
=32; i
>=0; i
--) {
2417 struct leaf_info
*li
= find_leaf_info(l
, i
);
2418 struct fib_alias
*fa
;
2424 mask
= inet_make_mask(li
->plen
);
2425 prefix
= htonl(l
->key
);
2427 list_for_each_entry_rcu(fa
, &li
->falh
, fa_list
) {
2428 const struct fib_info
*fi
= fa
->fa_info
;
2429 unsigned flags
= fib_flag_trans(fa
->fa_type
, mask
, fi
);
2431 if (fa
->fa_type
== RTN_BROADCAST
2432 || fa
->fa_type
== RTN_MULTICAST
)
2436 snprintf(bf
, sizeof(bf
),
2437 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2438 fi
->fib_dev
? fi
->fib_dev
->name
: "*",
2440 fi
->fib_nh
->nh_gw
, flags
, 0, 0,
2443 (fi
->fib_advmss
? fi
->fib_advmss
+ 40 : 0),
2447 snprintf(bf
, sizeof(bf
),
2448 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2449 prefix
, 0, flags
, 0, 0, 0,
2452 seq_printf(seq
, "%-127s\n", bf
);
2459 static struct seq_operations fib_route_seq_ops
= {
2460 .start
= fib_trie_seq_start
,
2461 .next
= fib_trie_seq_next
,
2462 .stop
= fib_trie_seq_stop
,
2463 .show
= fib_route_seq_show
,
2466 static int fib_route_seq_open(struct inode
*inode
, struct file
*file
)
2468 struct seq_file
*seq
;
2470 struct fib_trie_iter
*s
= kmalloc(sizeof(*s
), GFP_KERNEL
);
2475 rc
= seq_open(file
, &fib_route_seq_ops
);
2479 seq
= file
->private_data
;
2481 memset(s
, 0, sizeof(*s
));
2489 static struct file_operations fib_route_fops
= {
2490 .owner
= THIS_MODULE
,
2491 .open
= fib_route_seq_open
,
2493 .llseek
= seq_lseek
,
2494 .release
= seq_release_private
,
2497 int __init
fib_proc_init(void)
2499 if (!proc_net_fops_create("fib_trie", S_IRUGO
, &fib_trie_fops
))
2502 if (!proc_net_fops_create("fib_triestat", S_IRUGO
, &fib_triestat_fops
))
2505 if (!proc_net_fops_create("route", S_IRUGO
, &fib_route_fops
))
2511 proc_net_remove("fib_triestat");
2513 proc_net_remove("fib_trie");
2518 void __init
fib_proc_exit(void)
2520 proc_net_remove("fib_trie");
2521 proc_net_remove("fib_triestat");
2522 proc_net_remove("route");
2525 #endif /* CONFIG_PROC_FS */