ipv4: Fix fib_trie rebalancing
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / ipv4 / fib_trie.c
blobd1a39b1277d6beb141d82e19a036a10861990047
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally descibed in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
26 * Code from fib_hash has been reused which includes the following header:
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
33 * IPv4 FIB: lookup engine and maintenance routines.
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
43 * Substantial contributions to this work comes from:
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
51 #define VERSION "0.408"
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
75 #include <net/ip.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
78 #include <net/tcp.h>
79 #include <net/sock.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
83 #define MAX_STAT_DEPTH 32
85 #define KEYLENGTH (8*sizeof(t_key))
87 typedef unsigned int t_key;
89 #define T_TNODE 0
90 #define T_LEAF 1
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
97 struct node {
98 unsigned long parent;
99 t_key key;
102 struct leaf {
103 unsigned long parent;
104 t_key key;
105 struct hlist_head list;
106 struct rcu_head rcu;
109 struct leaf_info {
110 struct hlist_node hlist;
111 struct rcu_head rcu;
112 int plen;
113 struct list_head falh;
116 struct tnode {
117 unsigned long parent;
118 t_key key;
119 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
121 unsigned int full_children; /* KEYLENGTH bits needed */
122 unsigned int empty_children; /* KEYLENGTH bits needed */
123 union {
124 struct rcu_head rcu;
125 struct work_struct work;
126 struct tnode *tnode_free;
128 struct node *child[0];
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
138 unsigned int resize_node_skipped;
140 #endif
142 struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int prefixes;
149 unsigned int nodesizes[MAX_STAT_DEPTH];
152 struct trie {
153 struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156 #endif
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 int wasfull);
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
168 static struct kmem_cache *fn_alias_kmem __read_mostly;
169 static struct kmem_cache *trie_leaf_kmem __read_mostly;
171 static inline struct tnode *node_parent(struct node *node)
173 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
176 static inline struct tnode *node_parent_rcu(struct node *node)
178 struct tnode *ret = node_parent(node);
180 return rcu_dereference(ret);
183 /* Same as rcu_assign_pointer
184 * but that macro() assumes that value is a pointer.
186 static inline void node_set_parent(struct node *node, struct tnode *ptr)
188 smp_wmb();
189 node->parent = (unsigned long)ptr | NODE_TYPE(node);
192 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
194 BUG_ON(i >= 1U << tn->bits);
196 return tn->child[i];
199 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
201 struct node *ret = tnode_get_child(tn, i);
203 return rcu_dereference(ret);
206 static inline int tnode_child_length(const struct tnode *tn)
208 return 1 << tn->bits;
211 static inline t_key mask_pfx(t_key k, unsigned short l)
213 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
216 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
218 if (offset < KEYLENGTH)
219 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
220 else
221 return 0;
224 static inline int tkey_equals(t_key a, t_key b)
226 return a == b;
229 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
231 if (bits == 0 || offset >= KEYLENGTH)
232 return 1;
233 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
234 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237 static inline int tkey_mismatch(t_key a, int offset, t_key b)
239 t_key diff = a ^ b;
240 int i = offset;
242 if (!diff)
243 return 0;
244 while ((diff << i) >> (KEYLENGTH-1) == 0)
245 i++;
246 return i;
250 To understand this stuff, an understanding of keys and all their bits is
251 necessary. Every node in the trie has a key associated with it, but not
252 all of the bits in that key are significant.
254 Consider a node 'n' and its parent 'tp'.
256 If n is a leaf, every bit in its key is significant. Its presence is
257 necessitated by path compression, since during a tree traversal (when
258 searching for a leaf - unless we are doing an insertion) we will completely
259 ignore all skipped bits we encounter. Thus we need to verify, at the end of
260 a potentially successful search, that we have indeed been walking the
261 correct key path.
263 Note that we can never "miss" the correct key in the tree if present by
264 following the wrong path. Path compression ensures that segments of the key
265 that are the same for all keys with a given prefix are skipped, but the
266 skipped part *is* identical for each node in the subtrie below the skipped
267 bit! trie_insert() in this implementation takes care of that - note the
268 call to tkey_sub_equals() in trie_insert().
270 if n is an internal node - a 'tnode' here, the various parts of its key
271 have many different meanings.
273 Example:
274 _________________________________________________________________
275 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276 -----------------------------------------------------------------
277 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
279 _________________________________________________________________
280 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281 -----------------------------------------------------------------
282 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
284 tp->pos = 7
285 tp->bits = 3
286 n->pos = 15
287 n->bits = 4
289 First, let's just ignore the bits that come before the parent tp, that is
290 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291 not use them for anything.
293 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294 index into the parent's child array. That is, they will be used to find
295 'n' among tp's children.
297 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
298 for the node n.
300 All the bits we have seen so far are significant to the node n. The rest
301 of the bits are really not needed or indeed known in n->key.
303 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304 n's child array, and will of course be different for each child.
307 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
308 at this point.
312 static inline void check_tnode(const struct tnode *tn)
314 WARN_ON(tn && tn->pos+tn->bits > 32);
317 static const int halve_threshold = 25;
318 static const int inflate_threshold = 50;
319 static const int halve_threshold_root = 8;
320 static const int inflate_threshold_root = 15;
323 static void __alias_free_mem(struct rcu_head *head)
325 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
326 kmem_cache_free(fn_alias_kmem, fa);
329 static inline void alias_free_mem_rcu(struct fib_alias *fa)
331 call_rcu(&fa->rcu, __alias_free_mem);
334 static void __leaf_free_rcu(struct rcu_head *head)
336 struct leaf *l = container_of(head, struct leaf, rcu);
337 kmem_cache_free(trie_leaf_kmem, l);
340 static inline void free_leaf(struct leaf *l)
342 call_rcu_bh(&l->rcu, __leaf_free_rcu);
345 static void __leaf_info_free_rcu(struct rcu_head *head)
347 kfree(container_of(head, struct leaf_info, rcu));
350 static inline void free_leaf_info(struct leaf_info *leaf)
352 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
355 static struct tnode *tnode_alloc(size_t size)
357 if (size <= PAGE_SIZE)
358 return kzalloc(size, GFP_KERNEL);
359 else
360 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
363 static void __tnode_vfree(struct work_struct *arg)
365 struct tnode *tn = container_of(arg, struct tnode, work);
366 vfree(tn);
369 static void __tnode_free_rcu(struct rcu_head *head)
371 struct tnode *tn = container_of(head, struct tnode, rcu);
372 size_t size = sizeof(struct tnode) +
373 (sizeof(struct node *) << tn->bits);
375 if (size <= PAGE_SIZE)
376 kfree(tn);
377 else {
378 INIT_WORK(&tn->work, __tnode_vfree);
379 schedule_work(&tn->work);
383 static inline void tnode_free(struct tnode *tn)
385 if (IS_LEAF(tn))
386 free_leaf((struct leaf *) tn);
387 else
388 call_rcu(&tn->rcu, __tnode_free_rcu);
391 static void tnode_free_safe(struct tnode *tn)
393 BUG_ON(IS_LEAF(tn));
395 if (node_parent((struct node *) tn)) {
396 tn->tnode_free = tnode_free_head;
397 tnode_free_head = tn;
398 } else {
399 tnode_free(tn);
403 static void tnode_free_flush(void)
405 struct tnode *tn;
407 while ((tn = tnode_free_head)) {
408 tnode_free_head = tn->tnode_free;
409 tn->tnode_free = NULL;
410 tnode_free(tn);
414 static struct leaf *leaf_new(void)
416 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
417 if (l) {
418 l->parent = T_LEAF;
419 INIT_HLIST_HEAD(&l->list);
421 return l;
424 static struct leaf_info *leaf_info_new(int plen)
426 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
427 if (li) {
428 li->plen = plen;
429 INIT_LIST_HEAD(&li->falh);
431 return li;
434 static struct tnode *tnode_new(t_key key, int pos, int bits)
436 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
437 struct tnode *tn = tnode_alloc(sz);
439 if (tn) {
440 tn->parent = T_TNODE;
441 tn->pos = pos;
442 tn->bits = bits;
443 tn->key = key;
444 tn->full_children = 0;
445 tn->empty_children = 1<<bits;
448 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
449 (unsigned long) (sizeof(struct node) << bits));
450 return tn;
454 * Check whether a tnode 'n' is "full", i.e. it is an internal node
455 * and no bits are skipped. See discussion in dyntree paper p. 6
458 static inline int tnode_full(const struct tnode *tn, const struct node *n)
460 if (n == NULL || IS_LEAF(n))
461 return 0;
463 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
466 static inline void put_child(struct trie *t, struct tnode *tn, int i,
467 struct node *n)
469 tnode_put_child_reorg(tn, i, n, -1);
473 * Add a child at position i overwriting the old value.
474 * Update the value of full_children and empty_children.
477 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
478 int wasfull)
480 struct node *chi = tn->child[i];
481 int isfull;
483 BUG_ON(i >= 1<<tn->bits);
485 /* update emptyChildren */
486 if (n == NULL && chi != NULL)
487 tn->empty_children++;
488 else if (n != NULL && chi == NULL)
489 tn->empty_children--;
491 /* update fullChildren */
492 if (wasfull == -1)
493 wasfull = tnode_full(tn, chi);
495 isfull = tnode_full(tn, n);
496 if (wasfull && !isfull)
497 tn->full_children--;
498 else if (!wasfull && isfull)
499 tn->full_children++;
501 if (n)
502 node_set_parent(n, tn);
504 rcu_assign_pointer(tn->child[i], n);
507 static struct node *resize(struct trie *t, struct tnode *tn)
509 int i;
510 int err = 0;
511 struct tnode *old_tn;
512 int inflate_threshold_use;
513 int halve_threshold_use;
514 int max_resize;
516 if (!tn)
517 return NULL;
519 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
520 tn, inflate_threshold, halve_threshold);
522 /* No children */
523 if (tn->empty_children == tnode_child_length(tn)) {
524 tnode_free_safe(tn);
525 return NULL;
527 /* One child */
528 if (tn->empty_children == tnode_child_length(tn) - 1)
529 for (i = 0; i < tnode_child_length(tn); i++) {
530 struct node *n;
532 n = tn->child[i];
533 if (!n)
534 continue;
536 /* compress one level */
537 node_set_parent(n, NULL);
538 tnode_free_safe(tn);
539 return n;
542 * Double as long as the resulting node has a number of
543 * nonempty nodes that are above the threshold.
547 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
548 * the Helsinki University of Technology and Matti Tikkanen of Nokia
549 * Telecommunications, page 6:
550 * "A node is doubled if the ratio of non-empty children to all
551 * children in the *doubled* node is at least 'high'."
553 * 'high' in this instance is the variable 'inflate_threshold'. It
554 * is expressed as a percentage, so we multiply it with
555 * tnode_child_length() and instead of multiplying by 2 (since the
556 * child array will be doubled by inflate()) and multiplying
557 * the left-hand side by 100 (to handle the percentage thing) we
558 * multiply the left-hand side by 50.
560 * The left-hand side may look a bit weird: tnode_child_length(tn)
561 * - tn->empty_children is of course the number of non-null children
562 * in the current node. tn->full_children is the number of "full"
563 * children, that is non-null tnodes with a skip value of 0.
564 * All of those will be doubled in the resulting inflated tnode, so
565 * we just count them one extra time here.
567 * A clearer way to write this would be:
569 * to_be_doubled = tn->full_children;
570 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
571 * tn->full_children;
573 * new_child_length = tnode_child_length(tn) * 2;
575 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
576 * new_child_length;
577 * if (new_fill_factor >= inflate_threshold)
579 * ...and so on, tho it would mess up the while () loop.
581 * anyway,
582 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
583 * inflate_threshold
585 * avoid a division:
586 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
587 * inflate_threshold * new_child_length
589 * expand not_to_be_doubled and to_be_doubled, and shorten:
590 * 100 * (tnode_child_length(tn) - tn->empty_children +
591 * tn->full_children) >= inflate_threshold * new_child_length
593 * expand new_child_length:
594 * 100 * (tnode_child_length(tn) - tn->empty_children +
595 * tn->full_children) >=
596 * inflate_threshold * tnode_child_length(tn) * 2
598 * shorten again:
599 * 50 * (tn->full_children + tnode_child_length(tn) -
600 * tn->empty_children) >= inflate_threshold *
601 * tnode_child_length(tn)
605 check_tnode(tn);
607 /* Keep root node larger */
609 if (!tn->parent)
610 inflate_threshold_use = inflate_threshold_root;
611 else
612 inflate_threshold_use = inflate_threshold;
614 err = 0;
615 max_resize = 10;
616 while ((tn->full_children > 0 && max_resize-- &&
617 50 * (tn->full_children + tnode_child_length(tn)
618 - tn->empty_children)
619 >= inflate_threshold_use * tnode_child_length(tn))) {
621 old_tn = tn;
622 tn = inflate(t, tn);
624 if (IS_ERR(tn)) {
625 tn = old_tn;
626 #ifdef CONFIG_IP_FIB_TRIE_STATS
627 t->stats.resize_node_skipped++;
628 #endif
629 break;
633 if (max_resize < 0) {
634 if (!tn->parent)
635 pr_warning("Fix inflate_threshold_root."
636 " Now=%d size=%d bits\n",
637 inflate_threshold_root, tn->bits);
638 else
639 pr_warning("Fix inflate_threshold."
640 " Now=%d size=%d bits\n",
641 inflate_threshold, tn->bits);
644 check_tnode(tn);
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
652 /* Keep root node larger */
654 if (!tn->parent)
655 halve_threshold_use = halve_threshold_root;
656 else
657 halve_threshold_use = halve_threshold;
659 err = 0;
660 max_resize = 10;
661 while (tn->bits > 1 && max_resize-- &&
662 100 * (tnode_child_length(tn) - tn->empty_children) <
663 halve_threshold_use * tnode_child_length(tn)) {
665 old_tn = tn;
666 tn = halve(t, tn);
667 if (IS_ERR(tn)) {
668 tn = old_tn;
669 #ifdef CONFIG_IP_FIB_TRIE_STATS
670 t->stats.resize_node_skipped++;
671 #endif
672 break;
676 if (max_resize < 0) {
677 if (!tn->parent)
678 pr_warning("Fix halve_threshold_root."
679 " Now=%d size=%d bits\n",
680 halve_threshold_root, tn->bits);
681 else
682 pr_warning("Fix halve_threshold."
683 " Now=%d size=%d bits\n",
684 halve_threshold, tn->bits);
687 /* Only one child remains */
688 if (tn->empty_children == tnode_child_length(tn) - 1)
689 for (i = 0; i < tnode_child_length(tn); i++) {
690 struct node *n;
692 n = tn->child[i];
693 if (!n)
694 continue;
696 /* compress one level */
698 node_set_parent(n, NULL);
699 tnode_free_safe(tn);
700 return n;
703 return (struct node *) tn;
706 static struct tnode *inflate(struct trie *t, struct tnode *tn)
708 struct tnode *oldtnode = tn;
709 int olen = tnode_child_length(tn);
710 int i;
712 pr_debug("In inflate\n");
714 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
716 if (!tn)
717 return ERR_PTR(-ENOMEM);
720 * Preallocate and store tnodes before the actual work so we
721 * don't get into an inconsistent state if memory allocation
722 * fails. In case of failure we return the oldnode and inflate
723 * of tnode is ignored.
726 for (i = 0; i < olen; i++) {
727 struct tnode *inode;
729 inode = (struct tnode *) tnode_get_child(oldtnode, i);
730 if (inode &&
731 IS_TNODE(inode) &&
732 inode->pos == oldtnode->pos + oldtnode->bits &&
733 inode->bits > 1) {
734 struct tnode *left, *right;
735 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
737 left = tnode_new(inode->key&(~m), inode->pos + 1,
738 inode->bits - 1);
739 if (!left)
740 goto nomem;
742 right = tnode_new(inode->key|m, inode->pos + 1,
743 inode->bits - 1);
745 if (!right) {
746 tnode_free(left);
747 goto nomem;
750 put_child(t, tn, 2*i, (struct node *) left);
751 put_child(t, tn, 2*i+1, (struct node *) right);
755 for (i = 0; i < olen; i++) {
756 struct tnode *inode;
757 struct node *node = tnode_get_child(oldtnode, i);
758 struct tnode *left, *right;
759 int size, j;
761 /* An empty child */
762 if (node == NULL)
763 continue;
765 /* A leaf or an internal node with skipped bits */
767 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
768 tn->pos + tn->bits - 1) {
769 if (tkey_extract_bits(node->key,
770 oldtnode->pos + oldtnode->bits,
771 1) == 0)
772 put_child(t, tn, 2*i, node);
773 else
774 put_child(t, tn, 2*i+1, node);
775 continue;
778 /* An internal node with two children */
779 inode = (struct tnode *) node;
781 if (inode->bits == 1) {
782 put_child(t, tn, 2*i, inode->child[0]);
783 put_child(t, tn, 2*i+1, inode->child[1]);
785 tnode_free_safe(inode);
786 continue;
789 /* An internal node with more than two children */
791 /* We will replace this node 'inode' with two new
792 * ones, 'left' and 'right', each with half of the
793 * original children. The two new nodes will have
794 * a position one bit further down the key and this
795 * means that the "significant" part of their keys
796 * (see the discussion near the top of this file)
797 * will differ by one bit, which will be "0" in
798 * left's key and "1" in right's key. Since we are
799 * moving the key position by one step, the bit that
800 * we are moving away from - the bit at position
801 * (inode->pos) - is the one that will differ between
802 * left and right. So... we synthesize that bit in the
803 * two new keys.
804 * The mask 'm' below will be a single "one" bit at
805 * the position (inode->pos)
808 /* Use the old key, but set the new significant
809 * bit to zero.
812 left = (struct tnode *) tnode_get_child(tn, 2*i);
813 put_child(t, tn, 2*i, NULL);
815 BUG_ON(!left);
817 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
818 put_child(t, tn, 2*i+1, NULL);
820 BUG_ON(!right);
822 size = tnode_child_length(left);
823 for (j = 0; j < size; j++) {
824 put_child(t, left, j, inode->child[j]);
825 put_child(t, right, j, inode->child[j + size]);
827 put_child(t, tn, 2*i, resize(t, left));
828 put_child(t, tn, 2*i+1, resize(t, right));
830 tnode_free_safe(inode);
832 tnode_free_safe(oldtnode);
833 return tn;
834 nomem:
836 int size = tnode_child_length(tn);
837 int j;
839 for (j = 0; j < size; j++)
840 if (tn->child[j])
841 tnode_free((struct tnode *)tn->child[j]);
843 tnode_free(tn);
845 return ERR_PTR(-ENOMEM);
849 static struct tnode *halve(struct trie *t, struct tnode *tn)
851 struct tnode *oldtnode = tn;
852 struct node *left, *right;
853 int i;
854 int olen = tnode_child_length(tn);
856 pr_debug("In halve\n");
858 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
860 if (!tn)
861 return ERR_PTR(-ENOMEM);
864 * Preallocate and store tnodes before the actual work so we
865 * don't get into an inconsistent state if memory allocation
866 * fails. In case of failure we return the oldnode and halve
867 * of tnode is ignored.
870 for (i = 0; i < olen; i += 2) {
871 left = tnode_get_child(oldtnode, i);
872 right = tnode_get_child(oldtnode, i+1);
874 /* Two nonempty children */
875 if (left && right) {
876 struct tnode *newn;
878 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
880 if (!newn)
881 goto nomem;
883 put_child(t, tn, i/2, (struct node *)newn);
888 for (i = 0; i < olen; i += 2) {
889 struct tnode *newBinNode;
891 left = tnode_get_child(oldtnode, i);
892 right = tnode_get_child(oldtnode, i+1);
894 /* At least one of the children is empty */
895 if (left == NULL) {
896 if (right == NULL) /* Both are empty */
897 continue;
898 put_child(t, tn, i/2, right);
899 continue;
902 if (right == NULL) {
903 put_child(t, tn, i/2, left);
904 continue;
907 /* Two nonempty children */
908 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
909 put_child(t, tn, i/2, NULL);
910 put_child(t, newBinNode, 0, left);
911 put_child(t, newBinNode, 1, right);
912 put_child(t, tn, i/2, resize(t, newBinNode));
914 tnode_free_safe(oldtnode);
915 return tn;
916 nomem:
918 int size = tnode_child_length(tn);
919 int j;
921 for (j = 0; j < size; j++)
922 if (tn->child[j])
923 tnode_free((struct tnode *)tn->child[j]);
925 tnode_free(tn);
927 return ERR_PTR(-ENOMEM);
931 /* readside must use rcu_read_lock currently dump routines
932 via get_fa_head and dump */
934 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
936 struct hlist_head *head = &l->list;
937 struct hlist_node *node;
938 struct leaf_info *li;
940 hlist_for_each_entry_rcu(li, node, head, hlist)
941 if (li->plen == plen)
942 return li;
944 return NULL;
947 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
949 struct leaf_info *li = find_leaf_info(l, plen);
951 if (!li)
952 return NULL;
954 return &li->falh;
957 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
959 struct leaf_info *li = NULL, *last = NULL;
960 struct hlist_node *node;
962 if (hlist_empty(head)) {
963 hlist_add_head_rcu(&new->hlist, head);
964 } else {
965 hlist_for_each_entry(li, node, head, hlist) {
966 if (new->plen > li->plen)
967 break;
969 last = li;
971 if (last)
972 hlist_add_after_rcu(&last->hlist, &new->hlist);
973 else
974 hlist_add_before_rcu(&new->hlist, &li->hlist);
978 /* rcu_read_lock needs to be hold by caller from readside */
980 static struct leaf *
981 fib_find_node(struct trie *t, u32 key)
983 int pos;
984 struct tnode *tn;
985 struct node *n;
987 pos = 0;
988 n = rcu_dereference(t->trie);
990 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
991 tn = (struct tnode *) n;
993 check_tnode(tn);
995 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
996 pos = tn->pos + tn->bits;
997 n = tnode_get_child_rcu(tn,
998 tkey_extract_bits(key,
999 tn->pos,
1000 tn->bits));
1001 } else
1002 break;
1004 /* Case we have found a leaf. Compare prefixes */
1006 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1007 return (struct leaf *)n;
1009 return NULL;
1012 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
1014 int wasfull;
1015 t_key cindex, key;
1016 struct tnode *tp;
1018 key = tn->key;
1020 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1021 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1022 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1023 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1025 tnode_put_child_reorg((struct tnode *)tp, cindex,
1026 (struct node *)tn, wasfull);
1028 tp = node_parent((struct node *) tn);
1029 tnode_free_flush();
1030 if (!tp)
1031 break;
1032 tn = tp;
1035 /* Handle last (top) tnode */
1036 if (IS_TNODE(tn)) {
1037 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1038 tnode_free_flush();
1041 return (struct node *)tn;
1044 /* only used from updater-side */
1046 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1048 int pos, newpos;
1049 struct tnode *tp = NULL, *tn = NULL;
1050 struct node *n;
1051 struct leaf *l;
1052 int missbit;
1053 struct list_head *fa_head = NULL;
1054 struct leaf_info *li;
1055 t_key cindex;
1057 pos = 0;
1058 n = t->trie;
1060 /* If we point to NULL, stop. Either the tree is empty and we should
1061 * just put a new leaf in if, or we have reached an empty child slot,
1062 * and we should just put our new leaf in that.
1063 * If we point to a T_TNODE, check if it matches our key. Note that
1064 * a T_TNODE might be skipping any number of bits - its 'pos' need
1065 * not be the parent's 'pos'+'bits'!
1067 * If it does match the current key, get pos/bits from it, extract
1068 * the index from our key, push the T_TNODE and walk the tree.
1070 * If it doesn't, we have to replace it with a new T_TNODE.
1072 * If we point to a T_LEAF, it might or might not have the same key
1073 * as we do. If it does, just change the value, update the T_LEAF's
1074 * value, and return it.
1075 * If it doesn't, we need to replace it with a T_TNODE.
1078 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1079 tn = (struct tnode *) n;
1081 check_tnode(tn);
1083 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1084 tp = tn;
1085 pos = tn->pos + tn->bits;
1086 n = tnode_get_child(tn,
1087 tkey_extract_bits(key,
1088 tn->pos,
1089 tn->bits));
1091 BUG_ON(n && node_parent(n) != tn);
1092 } else
1093 break;
1097 * n ----> NULL, LEAF or TNODE
1099 * tp is n's (parent) ----> NULL or TNODE
1102 BUG_ON(tp && IS_LEAF(tp));
1104 /* Case 1: n is a leaf. Compare prefixes */
1106 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1107 l = (struct leaf *) n;
1108 li = leaf_info_new(plen);
1110 if (!li)
1111 return NULL;
1113 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li);
1115 goto done;
1117 l = leaf_new();
1119 if (!l)
1120 return NULL;
1122 l->key = key;
1123 li = leaf_info_new(plen);
1125 if (!li) {
1126 free_leaf(l);
1127 return NULL;
1130 fa_head = &li->falh;
1131 insert_leaf_info(&l->list, li);
1133 if (t->trie && n == NULL) {
1134 /* Case 2: n is NULL, and will just insert a new leaf */
1136 node_set_parent((struct node *)l, tp);
1138 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1139 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1140 } else {
1141 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1143 * Add a new tnode here
1144 * first tnode need some special handling
1147 if (tp)
1148 pos = tp->pos+tp->bits;
1149 else
1150 pos = 0;
1152 if (n) {
1153 newpos = tkey_mismatch(key, pos, n->key);
1154 tn = tnode_new(n->key, newpos, 1);
1155 } else {
1156 newpos = 0;
1157 tn = tnode_new(key, newpos, 1); /* First tnode */
1160 if (!tn) {
1161 free_leaf_info(li);
1162 free_leaf(l);
1163 return NULL;
1166 node_set_parent((struct node *)tn, tp);
1168 missbit = tkey_extract_bits(key, newpos, 1);
1169 put_child(t, tn, missbit, (struct node *)l);
1170 put_child(t, tn, 1-missbit, n);
1172 if (tp) {
1173 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1174 put_child(t, (struct tnode *)tp, cindex,
1175 (struct node *)tn);
1176 } else {
1177 rcu_assign_pointer(t->trie, (struct node *)tn);
1178 tp = tn;
1182 if (tp && tp->pos + tp->bits > 32)
1183 pr_warning("fib_trie"
1184 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1185 tp, tp->pos, tp->bits, key, plen);
1187 /* Rebalance the trie */
1189 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1190 done:
1191 return fa_head;
1195 * Caller must hold RTNL.
1197 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1199 struct trie *t = (struct trie *) tb->tb_data;
1200 struct fib_alias *fa, *new_fa;
1201 struct list_head *fa_head = NULL;
1202 struct fib_info *fi;
1203 int plen = cfg->fc_dst_len;
1204 u8 tos = cfg->fc_tos;
1205 u32 key, mask;
1206 int err;
1207 struct leaf *l;
1209 if (plen > 32)
1210 return -EINVAL;
1212 key = ntohl(cfg->fc_dst);
1214 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1216 mask = ntohl(inet_make_mask(plen));
1218 if (key & ~mask)
1219 return -EINVAL;
1221 key = key & mask;
1223 fi = fib_create_info(cfg);
1224 if (IS_ERR(fi)) {
1225 err = PTR_ERR(fi);
1226 goto err;
1229 l = fib_find_node(t, key);
1230 fa = NULL;
1232 if (l) {
1233 fa_head = get_fa_head(l, plen);
1234 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1237 /* Now fa, if non-NULL, points to the first fib alias
1238 * with the same keys [prefix,tos,priority], if such key already
1239 * exists or to the node before which we will insert new one.
1241 * If fa is NULL, we will need to allocate a new one and
1242 * insert to the head of f.
1244 * If f is NULL, no fib node matched the destination key
1245 * and we need to allocate a new one of those as well.
1248 if (fa && fa->fa_tos == tos &&
1249 fa->fa_info->fib_priority == fi->fib_priority) {
1250 struct fib_alias *fa_first, *fa_match;
1252 err = -EEXIST;
1253 if (cfg->fc_nlflags & NLM_F_EXCL)
1254 goto out;
1256 /* We have 2 goals:
1257 * 1. Find exact match for type, scope, fib_info to avoid
1258 * duplicate routes
1259 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1261 fa_match = NULL;
1262 fa_first = fa;
1263 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1264 list_for_each_entry_continue(fa, fa_head, fa_list) {
1265 if (fa->fa_tos != tos)
1266 break;
1267 if (fa->fa_info->fib_priority != fi->fib_priority)
1268 break;
1269 if (fa->fa_type == cfg->fc_type &&
1270 fa->fa_scope == cfg->fc_scope &&
1271 fa->fa_info == fi) {
1272 fa_match = fa;
1273 break;
1277 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1278 struct fib_info *fi_drop;
1279 u8 state;
1281 fa = fa_first;
1282 if (fa_match) {
1283 if (fa == fa_match)
1284 err = 0;
1285 goto out;
1287 err = -ENOBUFS;
1288 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1289 if (new_fa == NULL)
1290 goto out;
1292 fi_drop = fa->fa_info;
1293 new_fa->fa_tos = fa->fa_tos;
1294 new_fa->fa_info = fi;
1295 new_fa->fa_type = cfg->fc_type;
1296 new_fa->fa_scope = cfg->fc_scope;
1297 state = fa->fa_state;
1298 new_fa->fa_state = state & ~FA_S_ACCESSED;
1300 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1301 alias_free_mem_rcu(fa);
1303 fib_release_info(fi_drop);
1304 if (state & FA_S_ACCESSED)
1305 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1306 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1307 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1309 goto succeeded;
1311 /* Error if we find a perfect match which
1312 * uses the same scope, type, and nexthop
1313 * information.
1315 if (fa_match)
1316 goto out;
1318 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1319 fa = fa_first;
1321 err = -ENOENT;
1322 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1323 goto out;
1325 err = -ENOBUFS;
1326 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1327 if (new_fa == NULL)
1328 goto out;
1330 new_fa->fa_info = fi;
1331 new_fa->fa_tos = tos;
1332 new_fa->fa_type = cfg->fc_type;
1333 new_fa->fa_scope = cfg->fc_scope;
1334 new_fa->fa_state = 0;
1336 * Insert new entry to the list.
1339 if (!fa_head) {
1340 fa_head = fib_insert_node(t, key, plen);
1341 if (unlikely(!fa_head)) {
1342 err = -ENOMEM;
1343 goto out_free_new_fa;
1347 list_add_tail_rcu(&new_fa->fa_list,
1348 (fa ? &fa->fa_list : fa_head));
1350 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1351 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1352 &cfg->fc_nlinfo, 0);
1353 succeeded:
1354 return 0;
1356 out_free_new_fa:
1357 kmem_cache_free(fn_alias_kmem, new_fa);
1358 out:
1359 fib_release_info(fi);
1360 err:
1361 return err;
1364 /* should be called with rcu_read_lock */
1365 static int check_leaf(struct trie *t, struct leaf *l,
1366 t_key key, const struct flowi *flp,
1367 struct fib_result *res)
1369 struct leaf_info *li;
1370 struct hlist_head *hhead = &l->list;
1371 struct hlist_node *node;
1373 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1374 int err;
1375 int plen = li->plen;
1376 __be32 mask = inet_make_mask(plen);
1378 if (l->key != (key & ntohl(mask)))
1379 continue;
1381 err = fib_semantic_match(&li->falh, flp, res, plen);
1383 #ifdef CONFIG_IP_FIB_TRIE_STATS
1384 if (err <= 0)
1385 t->stats.semantic_match_passed++;
1386 else
1387 t->stats.semantic_match_miss++;
1388 #endif
1389 if (err <= 0)
1390 return err;
1393 return 1;
1396 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1397 struct fib_result *res)
1399 struct trie *t = (struct trie *) tb->tb_data;
1400 int ret;
1401 struct node *n;
1402 struct tnode *pn;
1403 int pos, bits;
1404 t_key key = ntohl(flp->fl4_dst);
1405 int chopped_off;
1406 t_key cindex = 0;
1407 int current_prefix_length = KEYLENGTH;
1408 struct tnode *cn;
1409 t_key node_prefix, key_prefix, pref_mismatch;
1410 int mp;
1412 rcu_read_lock();
1414 n = rcu_dereference(t->trie);
1415 if (!n)
1416 goto failed;
1418 #ifdef CONFIG_IP_FIB_TRIE_STATS
1419 t->stats.gets++;
1420 #endif
1422 /* Just a leaf? */
1423 if (IS_LEAF(n)) {
1424 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1425 goto found;
1428 pn = (struct tnode *) n;
1429 chopped_off = 0;
1431 while (pn) {
1432 pos = pn->pos;
1433 bits = pn->bits;
1435 if (!chopped_off)
1436 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1437 pos, bits);
1439 n = tnode_get_child(pn, cindex);
1441 if (n == NULL) {
1442 #ifdef CONFIG_IP_FIB_TRIE_STATS
1443 t->stats.null_node_hit++;
1444 #endif
1445 goto backtrace;
1448 if (IS_LEAF(n)) {
1449 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1450 if (ret > 0)
1451 goto backtrace;
1452 goto found;
1455 cn = (struct tnode *)n;
1458 * It's a tnode, and we can do some extra checks here if we
1459 * like, to avoid descending into a dead-end branch.
1460 * This tnode is in the parent's child array at index
1461 * key[p_pos..p_pos+p_bits] but potentially with some bits
1462 * chopped off, so in reality the index may be just a
1463 * subprefix, padded with zero at the end.
1464 * We can also take a look at any skipped bits in this
1465 * tnode - everything up to p_pos is supposed to be ok,
1466 * and the non-chopped bits of the index (se previous
1467 * paragraph) are also guaranteed ok, but the rest is
1468 * considered unknown.
1470 * The skipped bits are key[pos+bits..cn->pos].
1473 /* If current_prefix_length < pos+bits, we are already doing
1474 * actual prefix matching, which means everything from
1475 * pos+(bits-chopped_off) onward must be zero along some
1476 * branch of this subtree - otherwise there is *no* valid
1477 * prefix present. Here we can only check the skipped
1478 * bits. Remember, since we have already indexed into the
1479 * parent's child array, we know that the bits we chopped of
1480 * *are* zero.
1483 /* NOTA BENE: Checking only skipped bits
1484 for the new node here */
1486 if (current_prefix_length < pos+bits) {
1487 if (tkey_extract_bits(cn->key, current_prefix_length,
1488 cn->pos - current_prefix_length)
1489 || !(cn->child[0]))
1490 goto backtrace;
1494 * If chopped_off=0, the index is fully validated and we
1495 * only need to look at the skipped bits for this, the new,
1496 * tnode. What we actually want to do is to find out if
1497 * these skipped bits match our key perfectly, or if we will
1498 * have to count on finding a matching prefix further down,
1499 * because if we do, we would like to have some way of
1500 * verifying the existence of such a prefix at this point.
1503 /* The only thing we can do at this point is to verify that
1504 * any such matching prefix can indeed be a prefix to our
1505 * key, and if the bits in the node we are inspecting that
1506 * do not match our key are not ZERO, this cannot be true.
1507 * Thus, find out where there is a mismatch (before cn->pos)
1508 * and verify that all the mismatching bits are zero in the
1509 * new tnode's key.
1513 * Note: We aren't very concerned about the piece of
1514 * the key that precede pn->pos+pn->bits, since these
1515 * have already been checked. The bits after cn->pos
1516 * aren't checked since these are by definition
1517 * "unknown" at this point. Thus, what we want to see
1518 * is if we are about to enter the "prefix matching"
1519 * state, and in that case verify that the skipped
1520 * bits that will prevail throughout this subtree are
1521 * zero, as they have to be if we are to find a
1522 * matching prefix.
1525 node_prefix = mask_pfx(cn->key, cn->pos);
1526 key_prefix = mask_pfx(key, cn->pos);
1527 pref_mismatch = key_prefix^node_prefix;
1528 mp = 0;
1531 * In short: If skipped bits in this node do not match
1532 * the search key, enter the "prefix matching"
1533 * state.directly.
1535 if (pref_mismatch) {
1536 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1537 mp++;
1538 pref_mismatch = pref_mismatch << 1;
1540 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1542 if (key_prefix != 0)
1543 goto backtrace;
1545 if (current_prefix_length >= cn->pos)
1546 current_prefix_length = mp;
1549 pn = (struct tnode *)n; /* Descend */
1550 chopped_off = 0;
1551 continue;
1553 backtrace:
1554 chopped_off++;
1556 /* As zero don't change the child key (cindex) */
1557 while ((chopped_off <= pn->bits)
1558 && !(cindex & (1<<(chopped_off-1))))
1559 chopped_off++;
1561 /* Decrease current_... with bits chopped off */
1562 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1563 current_prefix_length = pn->pos + pn->bits
1564 - chopped_off;
1567 * Either we do the actual chop off according or if we have
1568 * chopped off all bits in this tnode walk up to our parent.
1571 if (chopped_off <= pn->bits) {
1572 cindex &= ~(1 << (chopped_off-1));
1573 } else {
1574 struct tnode *parent = node_parent((struct node *) pn);
1575 if (!parent)
1576 goto failed;
1578 /* Get Child's index */
1579 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1580 pn = parent;
1581 chopped_off = 0;
1583 #ifdef CONFIG_IP_FIB_TRIE_STATS
1584 t->stats.backtrack++;
1585 #endif
1586 goto backtrace;
1589 failed:
1590 ret = 1;
1591 found:
1592 rcu_read_unlock();
1593 return ret;
1597 * Remove the leaf and return parent.
1599 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1601 struct tnode *tp = node_parent((struct node *) l);
1603 pr_debug("entering trie_leaf_remove(%p)\n", l);
1605 if (tp) {
1606 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1607 put_child(t, (struct tnode *)tp, cindex, NULL);
1608 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1609 } else
1610 rcu_assign_pointer(t->trie, NULL);
1612 free_leaf(l);
1616 * Caller must hold RTNL.
1618 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1620 struct trie *t = (struct trie *) tb->tb_data;
1621 u32 key, mask;
1622 int plen = cfg->fc_dst_len;
1623 u8 tos = cfg->fc_tos;
1624 struct fib_alias *fa, *fa_to_delete;
1625 struct list_head *fa_head;
1626 struct leaf *l;
1627 struct leaf_info *li;
1629 if (plen > 32)
1630 return -EINVAL;
1632 key = ntohl(cfg->fc_dst);
1633 mask = ntohl(inet_make_mask(plen));
1635 if (key & ~mask)
1636 return -EINVAL;
1638 key = key & mask;
1639 l = fib_find_node(t, key);
1641 if (!l)
1642 return -ESRCH;
1644 fa_head = get_fa_head(l, plen);
1645 fa = fib_find_alias(fa_head, tos, 0);
1647 if (!fa)
1648 return -ESRCH;
1650 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1652 fa_to_delete = NULL;
1653 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1654 list_for_each_entry_continue(fa, fa_head, fa_list) {
1655 struct fib_info *fi = fa->fa_info;
1657 if (fa->fa_tos != tos)
1658 break;
1660 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1661 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1662 fa->fa_scope == cfg->fc_scope) &&
1663 (!cfg->fc_protocol ||
1664 fi->fib_protocol == cfg->fc_protocol) &&
1665 fib_nh_match(cfg, fi) == 0) {
1666 fa_to_delete = fa;
1667 break;
1671 if (!fa_to_delete)
1672 return -ESRCH;
1674 fa = fa_to_delete;
1675 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1676 &cfg->fc_nlinfo, 0);
1678 l = fib_find_node(t, key);
1679 li = find_leaf_info(l, plen);
1681 list_del_rcu(&fa->fa_list);
1683 if (list_empty(fa_head)) {
1684 hlist_del_rcu(&li->hlist);
1685 free_leaf_info(li);
1688 if (hlist_empty(&l->list))
1689 trie_leaf_remove(t, l);
1691 if (fa->fa_state & FA_S_ACCESSED)
1692 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1694 fib_release_info(fa->fa_info);
1695 alias_free_mem_rcu(fa);
1696 return 0;
1699 static int trie_flush_list(struct list_head *head)
1701 struct fib_alias *fa, *fa_node;
1702 int found = 0;
1704 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1705 struct fib_info *fi = fa->fa_info;
1707 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1708 list_del_rcu(&fa->fa_list);
1709 fib_release_info(fa->fa_info);
1710 alias_free_mem_rcu(fa);
1711 found++;
1714 return found;
1717 static int trie_flush_leaf(struct leaf *l)
1719 int found = 0;
1720 struct hlist_head *lih = &l->list;
1721 struct hlist_node *node, *tmp;
1722 struct leaf_info *li = NULL;
1724 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1725 found += trie_flush_list(&li->falh);
1727 if (list_empty(&li->falh)) {
1728 hlist_del_rcu(&li->hlist);
1729 free_leaf_info(li);
1732 return found;
1736 * Scan for the next right leaf starting at node p->child[idx]
1737 * Since we have back pointer, no recursion necessary.
1739 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1741 do {
1742 t_key idx;
1744 if (c)
1745 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1746 else
1747 idx = 0;
1749 while (idx < 1u << p->bits) {
1750 c = tnode_get_child_rcu(p, idx++);
1751 if (!c)
1752 continue;
1754 if (IS_LEAF(c)) {
1755 prefetch(p->child[idx]);
1756 return (struct leaf *) c;
1759 /* Rescan start scanning in new node */
1760 p = (struct tnode *) c;
1761 idx = 0;
1764 /* Node empty, walk back up to parent */
1765 c = (struct node *) p;
1766 } while ( (p = node_parent_rcu(c)) != NULL);
1768 return NULL; /* Root of trie */
1771 static struct leaf *trie_firstleaf(struct trie *t)
1773 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1775 if (!n)
1776 return NULL;
1778 if (IS_LEAF(n)) /* trie is just a leaf */
1779 return (struct leaf *) n;
1781 return leaf_walk_rcu(n, NULL);
1784 static struct leaf *trie_nextleaf(struct leaf *l)
1786 struct node *c = (struct node *) l;
1787 struct tnode *p = node_parent(c);
1789 if (!p)
1790 return NULL; /* trie with just one leaf */
1792 return leaf_walk_rcu(p, c);
1795 static struct leaf *trie_leafindex(struct trie *t, int index)
1797 struct leaf *l = trie_firstleaf(t);
1799 while (l && index-- > 0)
1800 l = trie_nextleaf(l);
1802 return l;
1807 * Caller must hold RTNL.
1809 static int fn_trie_flush(struct fib_table *tb)
1811 struct trie *t = (struct trie *) tb->tb_data;
1812 struct leaf *l, *ll = NULL;
1813 int found = 0;
1815 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1816 found += trie_flush_leaf(l);
1818 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll);
1820 ll = l;
1823 if (ll && hlist_empty(&ll->list))
1824 trie_leaf_remove(t, ll);
1826 pr_debug("trie_flush found=%d\n", found);
1827 return found;
1830 static void fn_trie_select_default(struct fib_table *tb,
1831 const struct flowi *flp,
1832 struct fib_result *res)
1834 struct trie *t = (struct trie *) tb->tb_data;
1835 int order, last_idx;
1836 struct fib_info *fi = NULL;
1837 struct fib_info *last_resort;
1838 struct fib_alias *fa = NULL;
1839 struct list_head *fa_head;
1840 struct leaf *l;
1842 last_idx = -1;
1843 last_resort = NULL;
1844 order = -1;
1846 rcu_read_lock();
1848 l = fib_find_node(t, 0);
1849 if (!l)
1850 goto out;
1852 fa_head = get_fa_head(l, 0);
1853 if (!fa_head)
1854 goto out;
1856 if (list_empty(fa_head))
1857 goto out;
1859 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1860 struct fib_info *next_fi = fa->fa_info;
1862 if (fa->fa_scope != res->scope ||
1863 fa->fa_type != RTN_UNICAST)
1864 continue;
1866 if (next_fi->fib_priority > res->fi->fib_priority)
1867 break;
1868 if (!next_fi->fib_nh[0].nh_gw ||
1869 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1870 continue;
1871 fa->fa_state |= FA_S_ACCESSED;
1873 if (fi == NULL) {
1874 if (next_fi != res->fi)
1875 break;
1876 } else if (!fib_detect_death(fi, order, &last_resort,
1877 &last_idx, tb->tb_default)) {
1878 fib_result_assign(res, fi);
1879 tb->tb_default = order;
1880 goto out;
1882 fi = next_fi;
1883 order++;
1885 if (order <= 0 || fi == NULL) {
1886 tb->tb_default = -1;
1887 goto out;
1890 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1891 tb->tb_default)) {
1892 fib_result_assign(res, fi);
1893 tb->tb_default = order;
1894 goto out;
1896 if (last_idx >= 0)
1897 fib_result_assign(res, last_resort);
1898 tb->tb_default = last_idx;
1899 out:
1900 rcu_read_unlock();
1903 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1904 struct fib_table *tb,
1905 struct sk_buff *skb, struct netlink_callback *cb)
1907 int i, s_i;
1908 struct fib_alias *fa;
1909 __be32 xkey = htonl(key);
1911 s_i = cb->args[5];
1912 i = 0;
1914 /* rcu_read_lock is hold by caller */
1916 list_for_each_entry_rcu(fa, fah, fa_list) {
1917 if (i < s_i) {
1918 i++;
1919 continue;
1922 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1923 cb->nlh->nlmsg_seq,
1924 RTM_NEWROUTE,
1925 tb->tb_id,
1926 fa->fa_type,
1927 fa->fa_scope,
1928 xkey,
1929 plen,
1930 fa->fa_tos,
1931 fa->fa_info, NLM_F_MULTI) < 0) {
1932 cb->args[5] = i;
1933 return -1;
1935 i++;
1937 cb->args[5] = i;
1938 return skb->len;
1941 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1942 struct sk_buff *skb, struct netlink_callback *cb)
1944 struct leaf_info *li;
1945 struct hlist_node *node;
1946 int i, s_i;
1948 s_i = cb->args[4];
1949 i = 0;
1951 /* rcu_read_lock is hold by caller */
1952 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1953 if (i < s_i) {
1954 i++;
1955 continue;
1958 if (i > s_i)
1959 cb->args[5] = 0;
1961 if (list_empty(&li->falh))
1962 continue;
1964 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1965 cb->args[4] = i;
1966 return -1;
1968 i++;
1971 cb->args[4] = i;
1972 return skb->len;
1975 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1976 struct netlink_callback *cb)
1978 struct leaf *l;
1979 struct trie *t = (struct trie *) tb->tb_data;
1980 t_key key = cb->args[2];
1981 int count = cb->args[3];
1983 rcu_read_lock();
1984 /* Dump starting at last key.
1985 * Note: 0.0.0.0/0 (ie default) is first key.
1987 if (count == 0)
1988 l = trie_firstleaf(t);
1989 else {
1990 /* Normally, continue from last key, but if that is missing
1991 * fallback to using slow rescan
1993 l = fib_find_node(t, key);
1994 if (!l)
1995 l = trie_leafindex(t, count);
1998 while (l) {
1999 cb->args[2] = l->key;
2000 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
2001 cb->args[3] = count;
2002 rcu_read_unlock();
2003 return -1;
2006 ++count;
2007 l = trie_nextleaf(l);
2008 memset(&cb->args[4], 0,
2009 sizeof(cb->args) - 4*sizeof(cb->args[0]));
2011 cb->args[3] = count;
2012 rcu_read_unlock();
2014 return skb->len;
2017 void __init fib_hash_init(void)
2019 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2020 sizeof(struct fib_alias),
2021 0, SLAB_PANIC, NULL);
2023 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2024 max(sizeof(struct leaf),
2025 sizeof(struct leaf_info)),
2026 0, SLAB_PANIC, NULL);
2030 /* Fix more generic FIB names for init later */
2031 struct fib_table *fib_hash_table(u32 id)
2033 struct fib_table *tb;
2034 struct trie *t;
2036 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2037 GFP_KERNEL);
2038 if (tb == NULL)
2039 return NULL;
2041 tb->tb_id = id;
2042 tb->tb_default = -1;
2043 tb->tb_lookup = fn_trie_lookup;
2044 tb->tb_insert = fn_trie_insert;
2045 tb->tb_delete = fn_trie_delete;
2046 tb->tb_flush = fn_trie_flush;
2047 tb->tb_select_default = fn_trie_select_default;
2048 tb->tb_dump = fn_trie_dump;
2050 t = (struct trie *) tb->tb_data;
2051 memset(t, 0, sizeof(*t));
2053 if (id == RT_TABLE_LOCAL)
2054 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2056 return tb;
2059 #ifdef CONFIG_PROC_FS
2060 /* Depth first Trie walk iterator */
2061 struct fib_trie_iter {
2062 struct seq_net_private p;
2063 struct fib_table *tb;
2064 struct tnode *tnode;
2065 unsigned index;
2066 unsigned depth;
2069 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2071 struct tnode *tn = iter->tnode;
2072 unsigned cindex = iter->index;
2073 struct tnode *p;
2075 /* A single entry routing table */
2076 if (!tn)
2077 return NULL;
2079 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2080 iter->tnode, iter->index, iter->depth);
2081 rescan:
2082 while (cindex < (1<<tn->bits)) {
2083 struct node *n = tnode_get_child_rcu(tn, cindex);
2085 if (n) {
2086 if (IS_LEAF(n)) {
2087 iter->tnode = tn;
2088 iter->index = cindex + 1;
2089 } else {
2090 /* push down one level */
2091 iter->tnode = (struct tnode *) n;
2092 iter->index = 0;
2093 ++iter->depth;
2095 return n;
2098 ++cindex;
2101 /* Current node exhausted, pop back up */
2102 p = node_parent_rcu((struct node *)tn);
2103 if (p) {
2104 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2105 tn = p;
2106 --iter->depth;
2107 goto rescan;
2110 /* got root? */
2111 return NULL;
2114 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2115 struct trie *t)
2117 struct node *n;
2119 if (!t)
2120 return NULL;
2122 n = rcu_dereference(t->trie);
2123 if (!n)
2124 return NULL;
2126 if (IS_TNODE(n)) {
2127 iter->tnode = (struct tnode *) n;
2128 iter->index = 0;
2129 iter->depth = 1;
2130 } else {
2131 iter->tnode = NULL;
2132 iter->index = 0;
2133 iter->depth = 0;
2136 return n;
2139 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2141 struct node *n;
2142 struct fib_trie_iter iter;
2144 memset(s, 0, sizeof(*s));
2146 rcu_read_lock();
2147 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2148 if (IS_LEAF(n)) {
2149 struct leaf *l = (struct leaf *)n;
2150 struct leaf_info *li;
2151 struct hlist_node *tmp;
2153 s->leaves++;
2154 s->totdepth += iter.depth;
2155 if (iter.depth > s->maxdepth)
2156 s->maxdepth = iter.depth;
2158 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2159 ++s->prefixes;
2160 } else {
2161 const struct tnode *tn = (const struct tnode *) n;
2162 int i;
2164 s->tnodes++;
2165 if (tn->bits < MAX_STAT_DEPTH)
2166 s->nodesizes[tn->bits]++;
2168 for (i = 0; i < (1<<tn->bits); i++)
2169 if (!tn->child[i])
2170 s->nullpointers++;
2173 rcu_read_unlock();
2177 * This outputs /proc/net/fib_triestats
2179 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2181 unsigned i, max, pointers, bytes, avdepth;
2183 if (stat->leaves)
2184 avdepth = stat->totdepth*100 / stat->leaves;
2185 else
2186 avdepth = 0;
2188 seq_printf(seq, "\tAver depth: %u.%02d\n",
2189 avdepth / 100, avdepth % 100);
2190 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2192 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2193 bytes = sizeof(struct leaf) * stat->leaves;
2195 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2196 bytes += sizeof(struct leaf_info) * stat->prefixes;
2198 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2199 bytes += sizeof(struct tnode) * stat->tnodes;
2201 max = MAX_STAT_DEPTH;
2202 while (max > 0 && stat->nodesizes[max-1] == 0)
2203 max--;
2205 pointers = 0;
2206 for (i = 1; i <= max; i++)
2207 if (stat->nodesizes[i] != 0) {
2208 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2209 pointers += (1<<i) * stat->nodesizes[i];
2211 seq_putc(seq, '\n');
2212 seq_printf(seq, "\tPointers: %u\n", pointers);
2214 bytes += sizeof(struct node *) * pointers;
2215 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2216 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2219 #ifdef CONFIG_IP_FIB_TRIE_STATS
2220 static void trie_show_usage(struct seq_file *seq,
2221 const struct trie_use_stats *stats)
2223 seq_printf(seq, "\nCounters:\n---------\n");
2224 seq_printf(seq, "gets = %u\n", stats->gets);
2225 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2226 seq_printf(seq, "semantic match passed = %u\n",
2227 stats->semantic_match_passed);
2228 seq_printf(seq, "semantic match miss = %u\n",
2229 stats->semantic_match_miss);
2230 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2231 seq_printf(seq, "skipped node resize = %u\n\n",
2232 stats->resize_node_skipped);
2234 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2236 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2238 if (tb->tb_id == RT_TABLE_LOCAL)
2239 seq_puts(seq, "Local:\n");
2240 else if (tb->tb_id == RT_TABLE_MAIN)
2241 seq_puts(seq, "Main:\n");
2242 else
2243 seq_printf(seq, "Id %d:\n", tb->tb_id);
2247 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2249 struct net *net = (struct net *)seq->private;
2250 unsigned int h;
2252 seq_printf(seq,
2253 "Basic info: size of leaf:"
2254 " %Zd bytes, size of tnode: %Zd bytes.\n",
2255 sizeof(struct leaf), sizeof(struct tnode));
2257 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2258 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2259 struct hlist_node *node;
2260 struct fib_table *tb;
2262 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2263 struct trie *t = (struct trie *) tb->tb_data;
2264 struct trie_stat stat;
2266 if (!t)
2267 continue;
2269 fib_table_print(seq, tb);
2271 trie_collect_stats(t, &stat);
2272 trie_show_stats(seq, &stat);
2273 #ifdef CONFIG_IP_FIB_TRIE_STATS
2274 trie_show_usage(seq, &t->stats);
2275 #endif
2279 return 0;
2282 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2284 return single_open_net(inode, file, fib_triestat_seq_show);
2287 static const struct file_operations fib_triestat_fops = {
2288 .owner = THIS_MODULE,
2289 .open = fib_triestat_seq_open,
2290 .read = seq_read,
2291 .llseek = seq_lseek,
2292 .release = single_release_net,
2295 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2297 struct fib_trie_iter *iter = seq->private;
2298 struct net *net = seq_file_net(seq);
2299 loff_t idx = 0;
2300 unsigned int h;
2302 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2303 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2304 struct hlist_node *node;
2305 struct fib_table *tb;
2307 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2308 struct node *n;
2310 for (n = fib_trie_get_first(iter,
2311 (struct trie *) tb->tb_data);
2312 n; n = fib_trie_get_next(iter))
2313 if (pos == idx++) {
2314 iter->tb = tb;
2315 return n;
2320 return NULL;
2323 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2324 __acquires(RCU)
2326 rcu_read_lock();
2327 return fib_trie_get_idx(seq, *pos);
2330 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2332 struct fib_trie_iter *iter = seq->private;
2333 struct net *net = seq_file_net(seq);
2334 struct fib_table *tb = iter->tb;
2335 struct hlist_node *tb_node;
2336 unsigned int h;
2337 struct node *n;
2339 ++*pos;
2340 /* next node in same table */
2341 n = fib_trie_get_next(iter);
2342 if (n)
2343 return n;
2345 /* walk rest of this hash chain */
2346 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2347 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2348 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2349 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2350 if (n)
2351 goto found;
2354 /* new hash chain */
2355 while (++h < FIB_TABLE_HASHSZ) {
2356 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2357 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2358 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2359 if (n)
2360 goto found;
2363 return NULL;
2365 found:
2366 iter->tb = tb;
2367 return n;
2370 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2371 __releases(RCU)
2373 rcu_read_unlock();
2376 static void seq_indent(struct seq_file *seq, int n)
2378 while (n-- > 0) seq_puts(seq, " ");
2381 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2383 switch (s) {
2384 case RT_SCOPE_UNIVERSE: return "universe";
2385 case RT_SCOPE_SITE: return "site";
2386 case RT_SCOPE_LINK: return "link";
2387 case RT_SCOPE_HOST: return "host";
2388 case RT_SCOPE_NOWHERE: return "nowhere";
2389 default:
2390 snprintf(buf, len, "scope=%d", s);
2391 return buf;
2395 static const char *rtn_type_names[__RTN_MAX] = {
2396 [RTN_UNSPEC] = "UNSPEC",
2397 [RTN_UNICAST] = "UNICAST",
2398 [RTN_LOCAL] = "LOCAL",
2399 [RTN_BROADCAST] = "BROADCAST",
2400 [RTN_ANYCAST] = "ANYCAST",
2401 [RTN_MULTICAST] = "MULTICAST",
2402 [RTN_BLACKHOLE] = "BLACKHOLE",
2403 [RTN_UNREACHABLE] = "UNREACHABLE",
2404 [RTN_PROHIBIT] = "PROHIBIT",
2405 [RTN_THROW] = "THROW",
2406 [RTN_NAT] = "NAT",
2407 [RTN_XRESOLVE] = "XRESOLVE",
2410 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2412 if (t < __RTN_MAX && rtn_type_names[t])
2413 return rtn_type_names[t];
2414 snprintf(buf, len, "type %u", t);
2415 return buf;
2418 /* Pretty print the trie */
2419 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2421 const struct fib_trie_iter *iter = seq->private;
2422 struct node *n = v;
2424 if (!node_parent_rcu(n))
2425 fib_table_print(seq, iter->tb);
2427 if (IS_TNODE(n)) {
2428 struct tnode *tn = (struct tnode *) n;
2429 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2431 seq_indent(seq, iter->depth-1);
2432 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2433 &prf, tn->pos, tn->bits, tn->full_children,
2434 tn->empty_children);
2436 } else {
2437 struct leaf *l = (struct leaf *) n;
2438 struct leaf_info *li;
2439 struct hlist_node *node;
2440 __be32 val = htonl(l->key);
2442 seq_indent(seq, iter->depth);
2443 seq_printf(seq, " |-- %pI4\n", &val);
2445 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2446 struct fib_alias *fa;
2448 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2449 char buf1[32], buf2[32];
2451 seq_indent(seq, iter->depth+1);
2452 seq_printf(seq, " /%d %s %s", li->plen,
2453 rtn_scope(buf1, sizeof(buf1),
2454 fa->fa_scope),
2455 rtn_type(buf2, sizeof(buf2),
2456 fa->fa_type));
2457 if (fa->fa_tos)
2458 seq_printf(seq, " tos=%d", fa->fa_tos);
2459 seq_putc(seq, '\n');
2464 return 0;
2467 static const struct seq_operations fib_trie_seq_ops = {
2468 .start = fib_trie_seq_start,
2469 .next = fib_trie_seq_next,
2470 .stop = fib_trie_seq_stop,
2471 .show = fib_trie_seq_show,
2474 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2476 return seq_open_net(inode, file, &fib_trie_seq_ops,
2477 sizeof(struct fib_trie_iter));
2480 static const struct file_operations fib_trie_fops = {
2481 .owner = THIS_MODULE,
2482 .open = fib_trie_seq_open,
2483 .read = seq_read,
2484 .llseek = seq_lseek,
2485 .release = seq_release_net,
2488 struct fib_route_iter {
2489 struct seq_net_private p;
2490 struct trie *main_trie;
2491 loff_t pos;
2492 t_key key;
2495 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2497 struct leaf *l = NULL;
2498 struct trie *t = iter->main_trie;
2500 /* use cache location of last found key */
2501 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2502 pos -= iter->pos;
2503 else {
2504 iter->pos = 0;
2505 l = trie_firstleaf(t);
2508 while (l && pos-- > 0) {
2509 iter->pos++;
2510 l = trie_nextleaf(l);
2513 if (l)
2514 iter->key = pos; /* remember it */
2515 else
2516 iter->pos = 0; /* forget it */
2518 return l;
2521 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2522 __acquires(RCU)
2524 struct fib_route_iter *iter = seq->private;
2525 struct fib_table *tb;
2527 rcu_read_lock();
2528 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2529 if (!tb)
2530 return NULL;
2532 iter->main_trie = (struct trie *) tb->tb_data;
2533 if (*pos == 0)
2534 return SEQ_START_TOKEN;
2535 else
2536 return fib_route_get_idx(iter, *pos - 1);
2539 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2541 struct fib_route_iter *iter = seq->private;
2542 struct leaf *l = v;
2544 ++*pos;
2545 if (v == SEQ_START_TOKEN) {
2546 iter->pos = 0;
2547 l = trie_firstleaf(iter->main_trie);
2548 } else {
2549 iter->pos++;
2550 l = trie_nextleaf(l);
2553 if (l)
2554 iter->key = l->key;
2555 else
2556 iter->pos = 0;
2557 return l;
2560 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2561 __releases(RCU)
2563 rcu_read_unlock();
2566 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2568 static unsigned type2flags[RTN_MAX + 1] = {
2569 [7] = RTF_REJECT, [8] = RTF_REJECT,
2571 unsigned flags = type2flags[type];
2573 if (fi && fi->fib_nh->nh_gw)
2574 flags |= RTF_GATEWAY;
2575 if (mask == htonl(0xFFFFFFFF))
2576 flags |= RTF_HOST;
2577 flags |= RTF_UP;
2578 return flags;
2582 * This outputs /proc/net/route.
2583 * The format of the file is not supposed to be changed
2584 * and needs to be same as fib_hash output to avoid breaking
2585 * legacy utilities
2587 static int fib_route_seq_show(struct seq_file *seq, void *v)
2589 struct leaf *l = v;
2590 struct leaf_info *li;
2591 struct hlist_node *node;
2593 if (v == SEQ_START_TOKEN) {
2594 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2595 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2596 "\tWindow\tIRTT");
2597 return 0;
2600 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2601 struct fib_alias *fa;
2602 __be32 mask, prefix;
2604 mask = inet_make_mask(li->plen);
2605 prefix = htonl(l->key);
2607 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2608 const struct fib_info *fi = fa->fa_info;
2609 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2610 int len;
2612 if (fa->fa_type == RTN_BROADCAST
2613 || fa->fa_type == RTN_MULTICAST)
2614 continue;
2616 if (fi)
2617 seq_printf(seq,
2618 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2619 "%d\t%08X\t%d\t%u\t%u%n",
2620 fi->fib_dev ? fi->fib_dev->name : "*",
2621 prefix,
2622 fi->fib_nh->nh_gw, flags, 0, 0,
2623 fi->fib_priority,
2624 mask,
2625 (fi->fib_advmss ?
2626 fi->fib_advmss + 40 : 0),
2627 fi->fib_window,
2628 fi->fib_rtt >> 3, &len);
2629 else
2630 seq_printf(seq,
2631 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2632 "%d\t%08X\t%d\t%u\t%u%n",
2633 prefix, 0, flags, 0, 0, 0,
2634 mask, 0, 0, 0, &len);
2636 seq_printf(seq, "%*s\n", 127 - len, "");
2640 return 0;
2643 static const struct seq_operations fib_route_seq_ops = {
2644 .start = fib_route_seq_start,
2645 .next = fib_route_seq_next,
2646 .stop = fib_route_seq_stop,
2647 .show = fib_route_seq_show,
2650 static int fib_route_seq_open(struct inode *inode, struct file *file)
2652 return seq_open_net(inode, file, &fib_route_seq_ops,
2653 sizeof(struct fib_route_iter));
2656 static const struct file_operations fib_route_fops = {
2657 .owner = THIS_MODULE,
2658 .open = fib_route_seq_open,
2659 .read = seq_read,
2660 .llseek = seq_lseek,
2661 .release = seq_release_net,
2664 int __net_init fib_proc_init(struct net *net)
2666 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2667 goto out1;
2669 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2670 &fib_triestat_fops))
2671 goto out2;
2673 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2674 goto out3;
2676 return 0;
2678 out3:
2679 proc_net_remove(net, "fib_triestat");
2680 out2:
2681 proc_net_remove(net, "fib_trie");
2682 out1:
2683 return -ENOMEM;
2686 void __net_exit fib_proc_exit(struct net *net)
2688 proc_net_remove(net, "fib_trie");
2689 proc_net_remove(net, "fib_triestat");
2690 proc_net_remove(net, "route");
2693 #endif /* CONFIG_PROC_FS */