2 * Linux INET6 implementation
3 * Forwarding Information Database
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 * Yuji SEKIYA @USAGI: Support default route on router node;
15 * remove ip6_null_entry from the top of
17 * Ville Nuorvala: Fixed routing subtrees.
20 #define pr_fmt(fmt) "IPv6: " fmt
22 #include <linux/errno.h>
23 #include <linux/types.h>
24 #include <linux/net.h>
25 #include <linux/route.h>
26 #include <linux/netdevice.h>
27 #include <linux/in6.h>
28 #include <linux/init.h>
29 #include <linux/list.h>
30 #include <linux/slab.h>
33 #include <net/ndisc.h>
34 #include <net/addrconf.h>
35 #include <net/lwtunnel.h>
36 #include <net/fib_notifier.h>
38 #include <net/ip6_fib.h>
39 #include <net/ip6_route.h>
41 static struct kmem_cache
*fib6_node_kmem __read_mostly
;
46 int (*func
)(struct rt6_info
*, void *arg
);
51 #ifdef CONFIG_IPV6_SUBTREES
52 #define FWS_INIT FWS_S
54 #define FWS_INIT FWS_L
57 static struct rt6_info
*fib6_find_prefix(struct net
*net
,
58 struct fib6_table
*table
,
59 struct fib6_node
*fn
);
60 static struct fib6_node
*fib6_repair_tree(struct net
*net
,
61 struct fib6_table
*table
,
62 struct fib6_node
*fn
);
63 static int fib6_walk(struct net
*net
, struct fib6_walker
*w
);
64 static int fib6_walk_continue(struct fib6_walker
*w
);
67 * A routing update causes an increase of the serial number on the
68 * affected subtree. This allows for cached routes to be asynchronously
69 * tested when modifications are made to the destination cache as a
70 * result of redirects, path MTU changes, etc.
73 static void fib6_gc_timer_cb(struct timer_list
*t
);
75 #define FOR_WALKERS(net, w) \
76 list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
78 static void fib6_walker_link(struct net
*net
, struct fib6_walker
*w
)
80 write_lock_bh(&net
->ipv6
.fib6_walker_lock
);
81 list_add(&w
->lh
, &net
->ipv6
.fib6_walkers
);
82 write_unlock_bh(&net
->ipv6
.fib6_walker_lock
);
85 static void fib6_walker_unlink(struct net
*net
, struct fib6_walker
*w
)
87 write_lock_bh(&net
->ipv6
.fib6_walker_lock
);
89 write_unlock_bh(&net
->ipv6
.fib6_walker_lock
);
92 static int fib6_new_sernum(struct net
*net
)
97 old
= atomic_read(&net
->ipv6
.fib6_sernum
);
98 new = old
< INT_MAX
? old
+ 1 : 1;
99 } while (atomic_cmpxchg(&net
->ipv6
.fib6_sernum
,
105 FIB6_NO_SERNUM_CHANGE
= 0,
108 void fib6_update_sernum(struct rt6_info
*rt
)
110 struct net
*net
= dev_net(rt
->dst
.dev
);
111 struct fib6_node
*fn
;
113 fn
= rcu_dereference_protected(rt
->rt6i_node
,
114 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
116 fn
->fn_sernum
= fib6_new_sernum(net
);
120 * Auxiliary address test functions for the radix tree.
122 * These assume a 32bit processor (although it will work on
129 #if defined(__LITTLE_ENDIAN)
130 # define BITOP_BE32_SWIZZLE (0x1F & ~7)
132 # define BITOP_BE32_SWIZZLE 0
135 static __be32
addr_bit_set(const void *token
, int fn_bit
)
137 const __be32
*addr
= token
;
140 * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
141 * is an optimized version of
142 * htonl(1 << ((~fn_bit)&0x1F))
143 * See include/asm-generic/bitops/le.h.
145 return (__force __be32
)(1 << ((~fn_bit
^ BITOP_BE32_SWIZZLE
) & 0x1f)) &
149 static struct fib6_node
*node_alloc(struct net
*net
)
151 struct fib6_node
*fn
;
153 fn
= kmem_cache_zalloc(fib6_node_kmem
, GFP_ATOMIC
);
155 net
->ipv6
.rt6_stats
->fib_nodes
++;
160 static void node_free_immediate(struct net
*net
, struct fib6_node
*fn
)
162 kmem_cache_free(fib6_node_kmem
, fn
);
163 net
->ipv6
.rt6_stats
->fib_nodes
--;
166 static void node_free_rcu(struct rcu_head
*head
)
168 struct fib6_node
*fn
= container_of(head
, struct fib6_node
, rcu
);
170 kmem_cache_free(fib6_node_kmem
, fn
);
173 static void node_free(struct net
*net
, struct fib6_node
*fn
)
175 call_rcu(&fn
->rcu
, node_free_rcu
);
176 net
->ipv6
.rt6_stats
->fib_nodes
--;
179 void rt6_free_pcpu(struct rt6_info
*non_pcpu_rt
)
183 if (!non_pcpu_rt
->rt6i_pcpu
)
186 for_each_possible_cpu(cpu
) {
187 struct rt6_info
**ppcpu_rt
;
188 struct rt6_info
*pcpu_rt
;
190 ppcpu_rt
= per_cpu_ptr(non_pcpu_rt
->rt6i_pcpu
, cpu
);
193 dst_dev_put(&pcpu_rt
->dst
);
194 dst_release(&pcpu_rt
->dst
);
199 EXPORT_SYMBOL_GPL(rt6_free_pcpu
);
201 static void fib6_free_table(struct fib6_table
*table
)
203 inetpeer_invalidate_tree(&table
->tb6_peers
);
207 static void fib6_link_table(struct net
*net
, struct fib6_table
*tb
)
212 * Initialize table lock at a single place to give lockdep a key,
213 * tables aren't visible prior to being linked to the list.
215 spin_lock_init(&tb
->tb6_lock
);
216 h
= tb
->tb6_id
& (FIB6_TABLE_HASHSZ
- 1);
219 * No protection necessary, this is the only list mutation
220 * operation, tables never disappear once they exist.
222 hlist_add_head_rcu(&tb
->tb6_hlist
, &net
->ipv6
.fib_table_hash
[h
]);
225 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
227 static struct fib6_table
*fib6_alloc_table(struct net
*net
, u32 id
)
229 struct fib6_table
*table
;
231 table
= kzalloc(sizeof(*table
), GFP_ATOMIC
);
234 rcu_assign_pointer(table
->tb6_root
.leaf
,
235 net
->ipv6
.ip6_null_entry
);
236 table
->tb6_root
.fn_flags
= RTN_ROOT
| RTN_TL_ROOT
| RTN_RTINFO
;
237 inet_peer_base_init(&table
->tb6_peers
);
243 struct fib6_table
*fib6_new_table(struct net
*net
, u32 id
)
245 struct fib6_table
*tb
;
249 tb
= fib6_get_table(net
, id
);
253 tb
= fib6_alloc_table(net
, id
);
255 fib6_link_table(net
, tb
);
259 EXPORT_SYMBOL_GPL(fib6_new_table
);
261 struct fib6_table
*fib6_get_table(struct net
*net
, u32 id
)
263 struct fib6_table
*tb
;
264 struct hlist_head
*head
;
269 h
= id
& (FIB6_TABLE_HASHSZ
- 1);
271 head
= &net
->ipv6
.fib_table_hash
[h
];
272 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
) {
273 if (tb
->tb6_id
== id
) {
282 EXPORT_SYMBOL_GPL(fib6_get_table
);
284 static void __net_init
fib6_tables_init(struct net
*net
)
286 fib6_link_table(net
, net
->ipv6
.fib6_main_tbl
);
287 fib6_link_table(net
, net
->ipv6
.fib6_local_tbl
);
291 struct fib6_table
*fib6_new_table(struct net
*net
, u32 id
)
293 return fib6_get_table(net
, id
);
296 struct fib6_table
*fib6_get_table(struct net
*net
, u32 id
)
298 return net
->ipv6
.fib6_main_tbl
;
301 struct dst_entry
*fib6_rule_lookup(struct net
*net
, struct flowi6
*fl6
,
302 const struct sk_buff
*skb
,
303 int flags
, pol_lookup_t lookup
)
307 rt
= lookup(net
, net
->ipv6
.fib6_main_tbl
, fl6
, skb
, flags
);
308 if (rt
->dst
.error
== -EAGAIN
) {
310 rt
= net
->ipv6
.ip6_null_entry
;
317 static void __net_init
fib6_tables_init(struct net
*net
)
319 fib6_link_table(net
, net
->ipv6
.fib6_main_tbl
);
324 unsigned int fib6_tables_seq_read(struct net
*net
)
326 unsigned int h
, fib_seq
= 0;
329 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
330 struct hlist_head
*head
= &net
->ipv6
.fib_table_hash
[h
];
331 struct fib6_table
*tb
;
333 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
)
334 fib_seq
+= tb
->fib_seq
;
341 static int call_fib6_entry_notifier(struct notifier_block
*nb
, struct net
*net
,
342 enum fib_event_type event_type
,
345 struct fib6_entry_notifier_info info
= {
349 return call_fib6_notifier(nb
, net
, event_type
, &info
.info
);
352 static int call_fib6_entry_notifiers(struct net
*net
,
353 enum fib_event_type event_type
,
355 struct netlink_ext_ack
*extack
)
357 struct fib6_entry_notifier_info info
= {
358 .info
.extack
= extack
,
362 rt
->rt6i_table
->fib_seq
++;
363 return call_fib6_notifiers(net
, event_type
, &info
.info
);
366 struct fib6_dump_arg
{
368 struct notifier_block
*nb
;
371 static void fib6_rt_dump(struct rt6_info
*rt
, struct fib6_dump_arg
*arg
)
373 if (rt
== arg
->net
->ipv6
.ip6_null_entry
)
375 call_fib6_entry_notifier(arg
->nb
, arg
->net
, FIB_EVENT_ENTRY_ADD
, rt
);
378 static int fib6_node_dump(struct fib6_walker
*w
)
382 for_each_fib6_walker_rt(w
)
383 fib6_rt_dump(rt
, w
->args
);
388 static void fib6_table_dump(struct net
*net
, struct fib6_table
*tb
,
389 struct fib6_walker
*w
)
391 w
->root
= &tb
->tb6_root
;
392 spin_lock_bh(&tb
->tb6_lock
);
394 spin_unlock_bh(&tb
->tb6_lock
);
397 /* Called with rcu_read_lock() */
398 int fib6_tables_dump(struct net
*net
, struct notifier_block
*nb
)
400 struct fib6_dump_arg arg
;
401 struct fib6_walker
*w
;
404 w
= kzalloc(sizeof(*w
), GFP_ATOMIC
);
408 w
->func
= fib6_node_dump
;
413 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
414 struct hlist_head
*head
= &net
->ipv6
.fib_table_hash
[h
];
415 struct fib6_table
*tb
;
417 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
)
418 fib6_table_dump(net
, tb
, w
);
426 static int fib6_dump_node(struct fib6_walker
*w
)
431 for_each_fib6_walker_rt(w
) {
432 res
= rt6_dump_route(rt
, w
->args
);
434 /* Frame is full, suspend walking */
439 /* Multipath routes are dumped in one route with the
440 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
441 * last sibling of this route (no need to dump the
442 * sibling routes again)
444 if (rt
->rt6i_nsiblings
)
445 rt
= list_last_entry(&rt
->rt6i_siblings
,
453 static void fib6_dump_end(struct netlink_callback
*cb
)
455 struct net
*net
= sock_net(cb
->skb
->sk
);
456 struct fib6_walker
*w
= (void *)cb
->args
[2];
461 fib6_walker_unlink(net
, w
);
466 cb
->done
= (void *)cb
->args
[3];
470 static int fib6_dump_done(struct netlink_callback
*cb
)
473 return cb
->done
? cb
->done(cb
) : 0;
476 static int fib6_dump_table(struct fib6_table
*table
, struct sk_buff
*skb
,
477 struct netlink_callback
*cb
)
479 struct net
*net
= sock_net(skb
->sk
);
480 struct fib6_walker
*w
;
483 w
= (void *)cb
->args
[2];
484 w
->root
= &table
->tb6_root
;
486 if (cb
->args
[4] == 0) {
490 spin_lock_bh(&table
->tb6_lock
);
491 res
= fib6_walk(net
, w
);
492 spin_unlock_bh(&table
->tb6_lock
);
495 cb
->args
[5] = w
->root
->fn_sernum
;
498 if (cb
->args
[5] != w
->root
->fn_sernum
) {
499 /* Begin at the root if the tree changed */
500 cb
->args
[5] = w
->root
->fn_sernum
;
507 spin_lock_bh(&table
->tb6_lock
);
508 res
= fib6_walk_continue(w
);
509 spin_unlock_bh(&table
->tb6_lock
);
511 fib6_walker_unlink(net
, w
);
519 static int inet6_dump_fib(struct sk_buff
*skb
, struct netlink_callback
*cb
)
521 struct net
*net
= sock_net(skb
->sk
);
523 unsigned int e
= 0, s_e
;
524 struct rt6_rtnl_dump_arg arg
;
525 struct fib6_walker
*w
;
526 struct fib6_table
*tb
;
527 struct hlist_head
*head
;
533 w
= (void *)cb
->args
[2];
537 * 1. hook callback destructor.
539 cb
->args
[3] = (long)cb
->done
;
540 cb
->done
= fib6_dump_done
;
543 * 2. allocate and initialize walker.
545 w
= kzalloc(sizeof(*w
), GFP_ATOMIC
);
548 w
->func
= fib6_dump_node
;
549 cb
->args
[2] = (long)w
;
558 for (h
= s_h
; h
< FIB6_TABLE_HASHSZ
; h
++, s_e
= 0) {
560 head
= &net
->ipv6
.fib_table_hash
[h
];
561 hlist_for_each_entry_rcu(tb
, head
, tb6_hlist
) {
564 res
= fib6_dump_table(tb
, skb
, cb
);
576 res
= res
< 0 ? res
: skb
->len
;
585 * return the appropriate node for a routing tree "add" operation
586 * by either creating and inserting or by returning an existing
590 static struct fib6_node
*fib6_add_1(struct net
*net
,
591 struct fib6_table
*table
,
592 struct fib6_node
*root
,
593 struct in6_addr
*addr
, int plen
,
594 int offset
, int allow_create
,
595 int replace_required
,
596 struct netlink_ext_ack
*extack
)
598 struct fib6_node
*fn
, *in
, *ln
;
599 struct fib6_node
*pn
= NULL
;
604 RT6_TRACE("fib6_add_1\n");
606 /* insert node in tree */
611 struct rt6_info
*leaf
= rcu_dereference_protected(fn
->leaf
,
612 lockdep_is_held(&table
->tb6_lock
));
613 key
= (struct rt6key
*)((u8
*)leaf
+ offset
);
618 if (plen
< fn
->fn_bit
||
619 !ipv6_prefix_equal(&key
->addr
, addr
, fn
->fn_bit
)) {
621 if (replace_required
) {
622 NL_SET_ERR_MSG(extack
,
623 "Can not replace route - no match found");
624 pr_warn("Can't replace route, no match found\n");
625 return ERR_PTR(-ENOENT
);
627 pr_warn("NLM_F_CREATE should be set when creating new route\n");
636 if (plen
== fn
->fn_bit
) {
637 /* clean up an intermediate node */
638 if (!(fn
->fn_flags
& RTN_RTINFO
)) {
639 RCU_INIT_POINTER(fn
->leaf
, NULL
);
641 /* remove null_entry in the root node */
642 } else if (fn
->fn_flags
& RTN_TL_ROOT
&&
643 rcu_access_pointer(fn
->leaf
) ==
644 net
->ipv6
.ip6_null_entry
) {
645 RCU_INIT_POINTER(fn
->leaf
, NULL
);
652 * We have more bits to go
655 /* Try to walk down on tree. */
656 dir
= addr_bit_set(addr
, fn
->fn_bit
);
659 rcu_dereference_protected(fn
->right
,
660 lockdep_is_held(&table
->tb6_lock
)) :
661 rcu_dereference_protected(fn
->left
,
662 lockdep_is_held(&table
->tb6_lock
));
666 /* We should not create new node because
667 * NLM_F_REPLACE was specified without NLM_F_CREATE
668 * I assume it is safe to require NLM_F_CREATE when
669 * REPLACE flag is used! Later we may want to remove the
670 * check for replace_required, because according
671 * to netlink specification, NLM_F_CREATE
672 * MUST be specified if new route is created.
673 * That would keep IPv6 consistent with IPv4
675 if (replace_required
) {
676 NL_SET_ERR_MSG(extack
,
677 "Can not replace route - no match found");
678 pr_warn("Can't replace route, no match found\n");
679 return ERR_PTR(-ENOENT
);
681 pr_warn("NLM_F_CREATE should be set when creating new route\n");
684 * We walked to the bottom of tree.
685 * Create new leaf node without children.
688 ln
= node_alloc(net
);
691 return ERR_PTR(-ENOMEM
);
693 RCU_INIT_POINTER(ln
->parent
, pn
);
696 rcu_assign_pointer(pn
->right
, ln
);
698 rcu_assign_pointer(pn
->left
, ln
);
705 * split since we don't have a common prefix anymore or
706 * we have a less significant route.
707 * we've to insert an intermediate node on the list
708 * this new node will point to the one we need to create
712 pn
= rcu_dereference_protected(fn
->parent
,
713 lockdep_is_held(&table
->tb6_lock
));
715 /* find 1st bit in difference between the 2 addrs.
717 See comment in __ipv6_addr_diff: bit may be an invalid value,
718 but if it is >= plen, the value is ignored in any case.
721 bit
= __ipv6_addr_diff(addr
, &key
->addr
, sizeof(*addr
));
726 * (new leaf node)[ln] (old node)[fn]
729 in
= node_alloc(net
);
730 ln
= node_alloc(net
);
734 node_free_immediate(net
, in
);
736 node_free_immediate(net
, ln
);
737 return ERR_PTR(-ENOMEM
);
741 * new intermediate node.
743 * be off since that an address that chooses one of
744 * the branches would not match less specific routes
745 * in the other branch
750 RCU_INIT_POINTER(in
->parent
, pn
);
752 atomic_inc(&rcu_dereference_protected(in
->leaf
,
753 lockdep_is_held(&table
->tb6_lock
))->rt6i_ref
);
755 /* update parent pointer */
757 rcu_assign_pointer(pn
->right
, in
);
759 rcu_assign_pointer(pn
->left
, in
);
763 RCU_INIT_POINTER(ln
->parent
, in
);
764 rcu_assign_pointer(fn
->parent
, in
);
766 if (addr_bit_set(addr
, bit
)) {
767 rcu_assign_pointer(in
->right
, ln
);
768 rcu_assign_pointer(in
->left
, fn
);
770 rcu_assign_pointer(in
->left
, ln
);
771 rcu_assign_pointer(in
->right
, fn
);
773 } else { /* plen <= bit */
776 * (new leaf node)[ln]
778 * (old node)[fn] NULL
781 ln
= node_alloc(net
);
784 return ERR_PTR(-ENOMEM
);
788 RCU_INIT_POINTER(ln
->parent
, pn
);
790 if (addr_bit_set(&key
->addr
, plen
))
791 RCU_INIT_POINTER(ln
->right
, fn
);
793 RCU_INIT_POINTER(ln
->left
, fn
);
795 rcu_assign_pointer(fn
->parent
, ln
);
798 rcu_assign_pointer(pn
->right
, ln
);
800 rcu_assign_pointer(pn
->left
, ln
);
805 static void fib6_copy_metrics(u32
*mp
, const struct mx6_config
*mxc
)
809 for (i
= 0; i
< RTAX_MAX
; i
++) {
810 if (test_bit(i
, mxc
->mx_valid
))
815 static int fib6_commit_metrics(struct dst_entry
*dst
, struct mx6_config
*mxc
)
820 if (dst
->flags
& DST_HOST
) {
821 u32
*mp
= dst_metrics_write_ptr(dst
);
826 fib6_copy_metrics(mp
, mxc
);
828 dst_init_metrics(dst
, mxc
->mx
, false);
830 /* We've stolen mx now. */
837 static void fib6_purge_rt(struct rt6_info
*rt
, struct fib6_node
*fn
,
840 struct fib6_table
*table
= rt
->rt6i_table
;
842 if (atomic_read(&rt
->rt6i_ref
) != 1) {
843 /* This route is used as dummy address holder in some split
844 * nodes. It is not leaked, but it still holds other resources,
845 * which must be released in time. So, scan ascendant nodes
846 * and replace dummy references to this route with references
847 * to still alive ones.
850 struct rt6_info
*leaf
= rcu_dereference_protected(fn
->leaf
,
851 lockdep_is_held(&table
->tb6_lock
));
852 struct rt6_info
*new_leaf
;
853 if (!(fn
->fn_flags
& RTN_RTINFO
) && leaf
== rt
) {
854 new_leaf
= fib6_find_prefix(net
, table
, fn
);
855 atomic_inc(&new_leaf
->rt6i_ref
);
856 rcu_assign_pointer(fn
->leaf
, new_leaf
);
859 fn
= rcu_dereference_protected(fn
->parent
,
860 lockdep_is_held(&table
->tb6_lock
));
866 * Insert routing information in a node.
869 static int fib6_add_rt2node(struct fib6_node
*fn
, struct rt6_info
*rt
,
870 struct nl_info
*info
, struct mx6_config
*mxc
,
871 struct netlink_ext_ack
*extack
)
873 struct rt6_info
*leaf
= rcu_dereference_protected(fn
->leaf
,
874 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
875 struct rt6_info
*iter
= NULL
;
876 struct rt6_info __rcu
**ins
;
877 struct rt6_info __rcu
**fallback_ins
= NULL
;
878 int replace
= (info
->nlh
&&
879 (info
->nlh
->nlmsg_flags
& NLM_F_REPLACE
));
880 int add
= (!info
->nlh
||
881 (info
->nlh
->nlmsg_flags
& NLM_F_CREATE
));
883 bool rt_can_ecmp
= rt6_qualify_for_ecmp(rt
);
884 u16 nlflags
= NLM_F_EXCL
;
887 if (info
->nlh
&& (info
->nlh
->nlmsg_flags
& NLM_F_APPEND
))
888 nlflags
|= NLM_F_APPEND
;
892 for (iter
= leaf
; iter
;
893 iter
= rcu_dereference_protected(iter
->rt6_next
,
894 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
))) {
896 * Search for duplicates
899 if (iter
->rt6i_metric
== rt
->rt6i_metric
) {
901 * Same priority level
904 (info
->nlh
->nlmsg_flags
& NLM_F_EXCL
))
907 nlflags
&= ~NLM_F_EXCL
;
909 if (rt_can_ecmp
== rt6_qualify_for_ecmp(iter
)) {
914 fallback_ins
= fallback_ins
?: ins
;
918 if (rt6_duplicate_nexthop(iter
, rt
)) {
919 if (rt
->rt6i_nsiblings
)
920 rt
->rt6i_nsiblings
= 0;
921 if (!(iter
->rt6i_flags
& RTF_EXPIRES
))
923 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
924 rt6_clean_expires(iter
);
926 rt6_set_expires(iter
, rt
->dst
.expires
);
927 iter
->rt6i_pmtu
= rt
->rt6i_pmtu
;
930 /* If we have the same destination and the same metric,
931 * but not the same gateway, then the route we try to
932 * add is sibling to this route, increment our counter
933 * of siblings, and later we will add our route to the
935 * Only static routes (which don't have flag
936 * RTF_EXPIRES) are used for ECMPv6.
938 * To avoid a long list, we only add siblings if the
939 * route has a gateway.
942 rt6_qualify_for_ecmp(iter
))
943 rt
->rt6i_nsiblings
++;
946 if (iter
->rt6i_metric
> rt
->rt6i_metric
)
950 ins
= &iter
->rt6_next
;
953 if (fallback_ins
&& !found
) {
954 /* No ECMP-able route found, replace first non-ECMP one */
956 iter
= rcu_dereference_protected(*ins
,
957 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
961 /* Reset round-robin state, if necessary */
962 if (ins
== &fn
->leaf
)
965 /* Link this route to its sibling routes. */
966 if (rt
->rt6i_nsiblings
) {
967 unsigned int rt6i_nsiblings
;
968 struct rt6_info
*sibling
, *temp_sibling
;
970 /* Find the first route that has the same metric */
973 if (sibling
->rt6i_metric
== rt
->rt6i_metric
&&
974 rt6_qualify_for_ecmp(sibling
)) {
975 list_add_tail(&rt
->rt6i_siblings
,
976 &sibling
->rt6i_siblings
);
979 sibling
= rcu_dereference_protected(sibling
->rt6_next
,
980 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
982 /* For each sibling in the list, increment the counter of
983 * siblings. BUG() if counters do not match, list of siblings
987 list_for_each_entry_safe(sibling
, temp_sibling
,
988 &rt
->rt6i_siblings
, rt6i_siblings
) {
989 sibling
->rt6i_nsiblings
++;
990 BUG_ON(sibling
->rt6i_nsiblings
!= rt
->rt6i_nsiblings
);
993 BUG_ON(rt6i_nsiblings
!= rt
->rt6i_nsiblings
);
994 rt6_multipath_rebalance(temp_sibling
);
1002 pr_warn("NLM_F_CREATE should be set when creating new route\n");
1005 nlflags
|= NLM_F_CREATE
;
1006 err
= fib6_commit_metrics(&rt
->dst
, mxc
);
1010 rcu_assign_pointer(rt
->rt6_next
, iter
);
1011 atomic_inc(&rt
->rt6i_ref
);
1012 rcu_assign_pointer(rt
->rt6i_node
, fn
);
1013 rcu_assign_pointer(*ins
, rt
);
1014 call_fib6_entry_notifiers(info
->nl_net
, FIB_EVENT_ENTRY_ADD
,
1016 if (!info
->skip_notify
)
1017 inet6_rt_notify(RTM_NEWROUTE
, rt
, info
, nlflags
);
1018 info
->nl_net
->ipv6
.rt6_stats
->fib_rt_entries
++;
1020 if (!(fn
->fn_flags
& RTN_RTINFO
)) {
1021 info
->nl_net
->ipv6
.rt6_stats
->fib_route_nodes
++;
1022 fn
->fn_flags
|= RTN_RTINFO
;
1031 pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1035 err
= fib6_commit_metrics(&rt
->dst
, mxc
);
1039 atomic_inc(&rt
->rt6i_ref
);
1040 rcu_assign_pointer(rt
->rt6i_node
, fn
);
1041 rt
->rt6_next
= iter
->rt6_next
;
1042 rcu_assign_pointer(*ins
, rt
);
1043 call_fib6_entry_notifiers(info
->nl_net
, FIB_EVENT_ENTRY_REPLACE
,
1045 if (!info
->skip_notify
)
1046 inet6_rt_notify(RTM_NEWROUTE
, rt
, info
, NLM_F_REPLACE
);
1047 if (!(fn
->fn_flags
& RTN_RTINFO
)) {
1048 info
->nl_net
->ipv6
.rt6_stats
->fib_route_nodes
++;
1049 fn
->fn_flags
|= RTN_RTINFO
;
1051 nsiblings
= iter
->rt6i_nsiblings
;
1052 iter
->rt6i_node
= NULL
;
1053 fib6_purge_rt(iter
, fn
, info
->nl_net
);
1054 if (rcu_access_pointer(fn
->rr_ptr
) == iter
)
1059 /* Replacing an ECMP route, remove all siblings */
1060 ins
= &rt
->rt6_next
;
1061 iter
= rcu_dereference_protected(*ins
,
1062 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1064 if (iter
->rt6i_metric
> rt
->rt6i_metric
)
1066 if (rt6_qualify_for_ecmp(iter
)) {
1067 *ins
= iter
->rt6_next
;
1068 iter
->rt6i_node
= NULL
;
1069 fib6_purge_rt(iter
, fn
, info
->nl_net
);
1070 if (rcu_access_pointer(fn
->rr_ptr
) == iter
)
1074 info
->nl_net
->ipv6
.rt6_stats
->fib_rt_entries
--;
1076 ins
= &iter
->rt6_next
;
1078 iter
= rcu_dereference_protected(*ins
,
1079 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1081 WARN_ON(nsiblings
!= 0);
1088 static void fib6_start_gc(struct net
*net
, struct rt6_info
*rt
)
1090 if (!timer_pending(&net
->ipv6
.ip6_fib_timer
) &&
1091 (rt
->rt6i_flags
& (RTF_EXPIRES
| RTF_CACHE
)))
1092 mod_timer(&net
->ipv6
.ip6_fib_timer
,
1093 jiffies
+ net
->ipv6
.sysctl
.ip6_rt_gc_interval
);
1096 void fib6_force_start_gc(struct net
*net
)
1098 if (!timer_pending(&net
->ipv6
.ip6_fib_timer
))
1099 mod_timer(&net
->ipv6
.ip6_fib_timer
,
1100 jiffies
+ net
->ipv6
.sysctl
.ip6_rt_gc_interval
);
1103 static void __fib6_update_sernum_upto_root(struct rt6_info
*rt
,
1106 struct fib6_node
*fn
= rcu_dereference_protected(rt
->rt6i_node
,
1107 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1109 /* paired with smp_rmb() in rt6_get_cookie_safe() */
1112 fn
->fn_sernum
= sernum
;
1113 fn
= rcu_dereference_protected(fn
->parent
,
1114 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1118 void fib6_update_sernum_upto_root(struct net
*net
, struct rt6_info
*rt
)
1120 __fib6_update_sernum_upto_root(rt
, fib6_new_sernum(net
));
1124 * Add routing information to the routing tree.
1125 * <destination addr>/<source addr>
1126 * with source addr info in sub-trees
1127 * Need to own table->tb6_lock
1130 int fib6_add(struct fib6_node
*root
, struct rt6_info
*rt
,
1131 struct nl_info
*info
, struct mx6_config
*mxc
,
1132 struct netlink_ext_ack
*extack
)
1134 struct fib6_table
*table
= rt
->rt6i_table
;
1135 struct fib6_node
*fn
, *pn
= NULL
;
1137 int allow_create
= 1;
1138 int replace_required
= 0;
1139 int sernum
= fib6_new_sernum(info
->nl_net
);
1141 if (WARN_ON_ONCE(!atomic_read(&rt
->dst
.__refcnt
)))
1143 if (WARN_ON_ONCE(rt
->rt6i_flags
& RTF_CACHE
))
1147 if (!(info
->nlh
->nlmsg_flags
& NLM_F_CREATE
))
1149 if (info
->nlh
->nlmsg_flags
& NLM_F_REPLACE
)
1150 replace_required
= 1;
1152 if (!allow_create
&& !replace_required
)
1153 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1155 fn
= fib6_add_1(info
->nl_net
, table
, root
,
1156 &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
,
1157 offsetof(struct rt6_info
, rt6i_dst
), allow_create
,
1158 replace_required
, extack
);
1167 #ifdef CONFIG_IPV6_SUBTREES
1168 if (rt
->rt6i_src
.plen
) {
1169 struct fib6_node
*sn
;
1171 if (!rcu_access_pointer(fn
->subtree
)) {
1172 struct fib6_node
*sfn
;
1184 /* Create subtree root node */
1185 sfn
= node_alloc(info
->nl_net
);
1189 atomic_inc(&info
->nl_net
->ipv6
.ip6_null_entry
->rt6i_ref
);
1190 rcu_assign_pointer(sfn
->leaf
,
1191 info
->nl_net
->ipv6
.ip6_null_entry
);
1192 sfn
->fn_flags
= RTN_ROOT
;
1194 /* Now add the first leaf node to new subtree */
1196 sn
= fib6_add_1(info
->nl_net
, table
, sfn
,
1197 &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
,
1198 offsetof(struct rt6_info
, rt6i_src
),
1199 allow_create
, replace_required
, extack
);
1202 /* If it failed, discard the just-allocated
1203 root, and then (on failure) the stale node
1206 node_free_immediate(info
->nl_net
, sfn
);
1211 /* Now link new subtree to main tree */
1212 rcu_assign_pointer(sfn
->parent
, fn
);
1213 rcu_assign_pointer(fn
->subtree
, sfn
);
1215 sn
= fib6_add_1(info
->nl_net
, table
, FIB6_SUBTREE(fn
),
1216 &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
,
1217 offsetof(struct rt6_info
, rt6i_src
),
1218 allow_create
, replace_required
, extack
);
1226 if (!rcu_access_pointer(fn
->leaf
)) {
1227 if (fn
->fn_flags
& RTN_TL_ROOT
) {
1228 /* put back null_entry for root node */
1229 rcu_assign_pointer(fn
->leaf
,
1230 info
->nl_net
->ipv6
.ip6_null_entry
);
1232 atomic_inc(&rt
->rt6i_ref
);
1233 rcu_assign_pointer(fn
->leaf
, rt
);
1240 err
= fib6_add_rt2node(fn
, rt
, info
, mxc
, extack
);
1242 __fib6_update_sernum_upto_root(rt
, sernum
);
1243 fib6_start_gc(info
->nl_net
, rt
);
1248 #ifdef CONFIG_IPV6_SUBTREES
1250 * If fib6_add_1 has cleared the old leaf pointer in the
1251 * super-tree leaf node we have to find a new one for it.
1254 struct rt6_info
*pn_leaf
=
1255 rcu_dereference_protected(pn
->leaf
,
1256 lockdep_is_held(&table
->tb6_lock
));
1257 if (pn_leaf
== rt
) {
1259 RCU_INIT_POINTER(pn
->leaf
, NULL
);
1260 atomic_dec(&rt
->rt6i_ref
);
1262 if (!pn_leaf
&& !(pn
->fn_flags
& RTN_RTINFO
)) {
1263 pn_leaf
= fib6_find_prefix(info
->nl_net
, table
,
1269 info
->nl_net
->ipv6
.ip6_null_entry
;
1272 atomic_inc(&pn_leaf
->rt6i_ref
);
1273 rcu_assign_pointer(pn
->leaf
, pn_leaf
);
1282 /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1283 * 1. fn is an intermediate node and we failed to add the new
1284 * route to it in both subtree creation failure and fib6_add_rt2node()
1286 * 2. fn is the root node in the table and we fail to add the first
1287 * default route to it.
1290 (!(fn
->fn_flags
& (RTN_RTINFO
|RTN_ROOT
)) ||
1291 (fn
->fn_flags
& RTN_TL_ROOT
&&
1292 !rcu_access_pointer(fn
->leaf
))))
1293 fib6_repair_tree(info
->nl_net
, table
, fn
);
1294 /* Always release dst as dst->__refcnt is guaranteed
1295 * to be taken before entering this function
1297 dst_release_immediate(&rt
->dst
);
1302 * Routing tree lookup
1306 struct lookup_args
{
1307 int offset
; /* key offset on rt6_info */
1308 const struct in6_addr
*addr
; /* search key */
1311 static struct fib6_node
*fib6_lookup_1(struct fib6_node
*root
,
1312 struct lookup_args
*args
)
1314 struct fib6_node
*fn
;
1317 if (unlikely(args
->offset
== 0))
1327 struct fib6_node
*next
;
1329 dir
= addr_bit_set(args
->addr
, fn
->fn_bit
);
1331 next
= dir
? rcu_dereference(fn
->right
) :
1332 rcu_dereference(fn
->left
);
1342 struct fib6_node
*subtree
= FIB6_SUBTREE(fn
);
1344 if (subtree
|| fn
->fn_flags
& RTN_RTINFO
) {
1345 struct rt6_info
*leaf
= rcu_dereference(fn
->leaf
);
1351 key
= (struct rt6key
*) ((u8
*)leaf
+ args
->offset
);
1353 if (ipv6_prefix_equal(&key
->addr
, args
->addr
, key
->plen
)) {
1354 #ifdef CONFIG_IPV6_SUBTREES
1356 struct fib6_node
*sfn
;
1357 sfn
= fib6_lookup_1(subtree
, args
+ 1);
1363 if (fn
->fn_flags
& RTN_RTINFO
)
1368 if (fn
->fn_flags
& RTN_ROOT
)
1371 fn
= rcu_dereference(fn
->parent
);
1377 /* called with rcu_read_lock() held
1379 struct fib6_node
*fib6_lookup(struct fib6_node
*root
, const struct in6_addr
*daddr
,
1380 const struct in6_addr
*saddr
)
1382 struct fib6_node
*fn
;
1383 struct lookup_args args
[] = {
1385 .offset
= offsetof(struct rt6_info
, rt6i_dst
),
1388 #ifdef CONFIG_IPV6_SUBTREES
1390 .offset
= offsetof(struct rt6_info
, rt6i_src
),
1395 .offset
= 0, /* sentinel */
1399 fn
= fib6_lookup_1(root
, daddr
? args
: args
+ 1);
1400 if (!fn
|| fn
->fn_flags
& RTN_TL_ROOT
)
1407 * Get node with specified destination prefix (and source prefix,
1408 * if subtrees are used)
1409 * exact_match == true means we try to find fn with exact match of
1410 * the passed in prefix addr
1411 * exact_match == false means we try to find fn with longest prefix
1412 * match of the passed in prefix addr. This is useful for finding fn
1413 * for cached route as it will be stored in the exception table under
1414 * the node with longest prefix length.
1418 static struct fib6_node
*fib6_locate_1(struct fib6_node
*root
,
1419 const struct in6_addr
*addr
,
1420 int plen
, int offset
,
1423 struct fib6_node
*fn
, *prev
= NULL
;
1425 for (fn
= root
; fn
; ) {
1426 struct rt6_info
*leaf
= rcu_dereference(fn
->leaf
);
1429 /* This node is being deleted */
1431 if (plen
<= fn
->fn_bit
)
1437 key
= (struct rt6key
*)((u8
*)leaf
+ offset
);
1442 if (plen
< fn
->fn_bit
||
1443 !ipv6_prefix_equal(&key
->addr
, addr
, fn
->fn_bit
))
1446 if (plen
== fn
->fn_bit
)
1453 * We have more bits to go
1455 if (addr_bit_set(addr
, fn
->fn_bit
))
1456 fn
= rcu_dereference(fn
->right
);
1458 fn
= rcu_dereference(fn
->left
);
1467 struct fib6_node
*fib6_locate(struct fib6_node
*root
,
1468 const struct in6_addr
*daddr
, int dst_len
,
1469 const struct in6_addr
*saddr
, int src_len
,
1472 struct fib6_node
*fn
;
1474 fn
= fib6_locate_1(root
, daddr
, dst_len
,
1475 offsetof(struct rt6_info
, rt6i_dst
),
1478 #ifdef CONFIG_IPV6_SUBTREES
1480 WARN_ON(saddr
== NULL
);
1482 struct fib6_node
*subtree
= FIB6_SUBTREE(fn
);
1485 fn
= fib6_locate_1(subtree
, saddr
, src_len
,
1486 offsetof(struct rt6_info
, rt6i_src
),
1493 if (fn
&& fn
->fn_flags
& RTN_RTINFO
)
1505 static struct rt6_info
*fib6_find_prefix(struct net
*net
,
1506 struct fib6_table
*table
,
1507 struct fib6_node
*fn
)
1509 struct fib6_node
*child_left
, *child_right
;
1511 if (fn
->fn_flags
& RTN_ROOT
)
1512 return net
->ipv6
.ip6_null_entry
;
1515 child_left
= rcu_dereference_protected(fn
->left
,
1516 lockdep_is_held(&table
->tb6_lock
));
1517 child_right
= rcu_dereference_protected(fn
->right
,
1518 lockdep_is_held(&table
->tb6_lock
));
1520 return rcu_dereference_protected(child_left
->leaf
,
1521 lockdep_is_held(&table
->tb6_lock
));
1523 return rcu_dereference_protected(child_right
->leaf
,
1524 lockdep_is_held(&table
->tb6_lock
));
1526 fn
= FIB6_SUBTREE(fn
);
1532 * Called to trim the tree of intermediate nodes when possible. "fn"
1533 * is the node we want to try and remove.
1534 * Need to own table->tb6_lock
1537 static struct fib6_node
*fib6_repair_tree(struct net
*net
,
1538 struct fib6_table
*table
,
1539 struct fib6_node
*fn
)
1543 struct fib6_node
*child
;
1544 struct fib6_walker
*w
;
1547 /* Set fn->leaf to null_entry for root node. */
1548 if (fn
->fn_flags
& RTN_TL_ROOT
) {
1549 rcu_assign_pointer(fn
->leaf
, net
->ipv6
.ip6_null_entry
);
1554 struct fib6_node
*fn_r
= rcu_dereference_protected(fn
->right
,
1555 lockdep_is_held(&table
->tb6_lock
));
1556 struct fib6_node
*fn_l
= rcu_dereference_protected(fn
->left
,
1557 lockdep_is_held(&table
->tb6_lock
));
1558 struct fib6_node
*pn
= rcu_dereference_protected(fn
->parent
,
1559 lockdep_is_held(&table
->tb6_lock
));
1560 struct fib6_node
*pn_r
= rcu_dereference_protected(pn
->right
,
1561 lockdep_is_held(&table
->tb6_lock
));
1562 struct fib6_node
*pn_l
= rcu_dereference_protected(pn
->left
,
1563 lockdep_is_held(&table
->tb6_lock
));
1564 struct rt6_info
*fn_leaf
= rcu_dereference_protected(fn
->leaf
,
1565 lockdep_is_held(&table
->tb6_lock
));
1566 struct rt6_info
*pn_leaf
= rcu_dereference_protected(pn
->leaf
,
1567 lockdep_is_held(&table
->tb6_lock
));
1568 struct rt6_info
*new_fn_leaf
;
1570 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn
->fn_bit
, iter
);
1573 WARN_ON(fn
->fn_flags
& RTN_RTINFO
);
1574 WARN_ON(fn
->fn_flags
& RTN_TL_ROOT
);
1580 child
= fn_r
, children
|= 1;
1582 child
= fn_l
, children
|= 2;
1584 if (children
== 3 || FIB6_SUBTREE(fn
)
1585 #ifdef CONFIG_IPV6_SUBTREES
1586 /* Subtree root (i.e. fn) may have one child */
1587 || (children
&& fn
->fn_flags
& RTN_ROOT
)
1590 new_fn_leaf
= fib6_find_prefix(net
, table
, fn
);
1593 WARN_ON(!new_fn_leaf
);
1594 new_fn_leaf
= net
->ipv6
.ip6_null_entry
;
1597 atomic_inc(&new_fn_leaf
->rt6i_ref
);
1598 rcu_assign_pointer(fn
->leaf
, new_fn_leaf
);
1602 #ifdef CONFIG_IPV6_SUBTREES
1603 if (FIB6_SUBTREE(pn
) == fn
) {
1604 WARN_ON(!(fn
->fn_flags
& RTN_ROOT
));
1605 RCU_INIT_POINTER(pn
->subtree
, NULL
);
1608 WARN_ON(fn
->fn_flags
& RTN_ROOT
);
1611 rcu_assign_pointer(pn
->right
, child
);
1612 else if (pn_l
== fn
)
1613 rcu_assign_pointer(pn
->left
, child
);
1619 rcu_assign_pointer(child
->parent
, pn
);
1621 #ifdef CONFIG_IPV6_SUBTREES
1625 read_lock(&net
->ipv6
.fib6_walker_lock
);
1626 FOR_WALKERS(net
, w
) {
1628 if (w
->node
== fn
) {
1629 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w
, w
->state
, nstate
);
1634 if (w
->node
== fn
) {
1637 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w
, w
->state
);
1638 w
->state
= w
->state
>= FWS_R
? FWS_U
: FWS_INIT
;
1640 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w
, w
->state
);
1641 w
->state
= w
->state
>= FWS_C
? FWS_U
: FWS_INIT
;
1646 read_unlock(&net
->ipv6
.fib6_walker_lock
);
1649 if (pn
->fn_flags
& RTN_RTINFO
|| FIB6_SUBTREE(pn
))
1652 RCU_INIT_POINTER(pn
->leaf
, NULL
);
1653 rt6_release(pn_leaf
);
1658 static void fib6_del_route(struct fib6_table
*table
, struct fib6_node
*fn
,
1659 struct rt6_info __rcu
**rtp
, struct nl_info
*info
)
1661 struct fib6_walker
*w
;
1662 struct rt6_info
*rt
= rcu_dereference_protected(*rtp
,
1663 lockdep_is_held(&table
->tb6_lock
));
1664 struct net
*net
= info
->nl_net
;
1666 RT6_TRACE("fib6_del_route\n");
1668 WARN_ON_ONCE(rt
->rt6i_flags
& RTF_CACHE
);
1671 *rtp
= rt
->rt6_next
;
1672 rt
->rt6i_node
= NULL
;
1673 net
->ipv6
.rt6_stats
->fib_rt_entries
--;
1674 net
->ipv6
.rt6_stats
->fib_discarded_routes
++;
1676 /* Flush all cached dst in exception table */
1677 rt6_flush_exceptions(rt
);
1679 /* Reset round-robin state, if necessary */
1680 if (rcu_access_pointer(fn
->rr_ptr
) == rt
)
1683 /* Remove this entry from other siblings */
1684 if (rt
->rt6i_nsiblings
) {
1685 struct rt6_info
*sibling
, *next_sibling
;
1687 list_for_each_entry_safe(sibling
, next_sibling
,
1688 &rt
->rt6i_siblings
, rt6i_siblings
)
1689 sibling
->rt6i_nsiblings
--;
1690 rt
->rt6i_nsiblings
= 0;
1691 list_del_init(&rt
->rt6i_siblings
);
1692 rt6_multipath_rebalance(next_sibling
);
1695 /* Adjust walkers */
1696 read_lock(&net
->ipv6
.fib6_walker_lock
);
1697 FOR_WALKERS(net
, w
) {
1698 if (w
->state
== FWS_C
&& w
->leaf
== rt
) {
1699 RT6_TRACE("walker %p adjusted by delroute\n", w
);
1700 w
->leaf
= rcu_dereference_protected(rt
->rt6_next
,
1701 lockdep_is_held(&table
->tb6_lock
));
1706 read_unlock(&net
->ipv6
.fib6_walker_lock
);
1708 /* If it was last route, call fib6_repair_tree() to:
1709 * 1. For root node, put back null_entry as how the table was created.
1710 * 2. For other nodes, expunge its radix tree node.
1712 if (!rcu_access_pointer(fn
->leaf
)) {
1713 if (!(fn
->fn_flags
& RTN_TL_ROOT
)) {
1714 fn
->fn_flags
&= ~RTN_RTINFO
;
1715 net
->ipv6
.rt6_stats
->fib_route_nodes
--;
1717 fn
= fib6_repair_tree(net
, table
, fn
);
1720 fib6_purge_rt(rt
, fn
, net
);
1722 call_fib6_entry_notifiers(net
, FIB_EVENT_ENTRY_DEL
, rt
, NULL
);
1723 if (!info
->skip_notify
)
1724 inet6_rt_notify(RTM_DELROUTE
, rt
, info
, 0);
1728 /* Need to own table->tb6_lock */
1729 int fib6_del(struct rt6_info
*rt
, struct nl_info
*info
)
1731 struct fib6_node
*fn
= rcu_dereference_protected(rt
->rt6i_node
,
1732 lockdep_is_held(&rt
->rt6i_table
->tb6_lock
));
1733 struct fib6_table
*table
= rt
->rt6i_table
;
1734 struct net
*net
= info
->nl_net
;
1735 struct rt6_info __rcu
**rtp
;
1736 struct rt6_info __rcu
**rtp_next
;
1739 if (rt
->dst
.obsolete
> 0) {
1744 if (!fn
|| rt
== net
->ipv6
.ip6_null_entry
)
1747 WARN_ON(!(fn
->fn_flags
& RTN_RTINFO
));
1749 /* remove cached dst from exception table */
1750 if (rt
->rt6i_flags
& RTF_CACHE
)
1751 return rt6_remove_exception_rt(rt
);
1754 * Walk the leaf entries looking for ourself
1757 for (rtp
= &fn
->leaf
; *rtp
; rtp
= rtp_next
) {
1758 struct rt6_info
*cur
= rcu_dereference_protected(*rtp
,
1759 lockdep_is_held(&table
->tb6_lock
));
1761 fib6_del_route(table
, fn
, rtp
, info
);
1764 rtp_next
= &cur
->rt6_next
;
1770 * Tree traversal function.
1772 * Certainly, it is not interrupt safe.
1773 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1774 * It means, that we can modify tree during walking
1775 * and use this function for garbage collection, clone pruning,
1776 * cleaning tree when a device goes down etc. etc.
1778 * It guarantees that every node will be traversed,
1779 * and that it will be traversed only once.
1781 * Callback function w->func may return:
1782 * 0 -> continue walking.
1783 * positive value -> walking is suspended (used by tree dumps,
1784 * and probably by gc, if it will be split to several slices)
1785 * negative value -> terminate walking.
1787 * The function itself returns:
1788 * 0 -> walk is complete.
1789 * >0 -> walk is incomplete (i.e. suspended)
1790 * <0 -> walk is terminated by an error.
1792 * This function is called with tb6_lock held.
1795 static int fib6_walk_continue(struct fib6_walker
*w
)
1797 struct fib6_node
*fn
, *pn
, *left
, *right
;
1799 /* w->root should always be table->tb6_root */
1800 WARN_ON_ONCE(!(w
->root
->fn_flags
& RTN_TL_ROOT
));
1808 #ifdef CONFIG_IPV6_SUBTREES
1810 if (FIB6_SUBTREE(fn
)) {
1811 w
->node
= FIB6_SUBTREE(fn
);
1818 left
= rcu_dereference_protected(fn
->left
, 1);
1821 w
->state
= FWS_INIT
;
1827 right
= rcu_dereference_protected(fn
->right
, 1);
1830 w
->state
= FWS_INIT
;
1834 w
->leaf
= rcu_dereference_protected(fn
->leaf
, 1);
1837 if (w
->leaf
&& fn
->fn_flags
& RTN_RTINFO
) {
1858 pn
= rcu_dereference_protected(fn
->parent
, 1);
1859 left
= rcu_dereference_protected(pn
->left
, 1);
1860 right
= rcu_dereference_protected(pn
->right
, 1);
1862 #ifdef CONFIG_IPV6_SUBTREES
1863 if (FIB6_SUBTREE(pn
) == fn
) {
1864 WARN_ON(!(fn
->fn_flags
& RTN_ROOT
));
1875 w
->leaf
= rcu_dereference_protected(w
->node
->leaf
, 1);
1885 static int fib6_walk(struct net
*net
, struct fib6_walker
*w
)
1889 w
->state
= FWS_INIT
;
1892 fib6_walker_link(net
, w
);
1893 res
= fib6_walk_continue(w
);
1895 fib6_walker_unlink(net
, w
);
1899 static int fib6_clean_node(struct fib6_walker
*w
)
1902 struct rt6_info
*rt
;
1903 struct fib6_cleaner
*c
= container_of(w
, struct fib6_cleaner
, w
);
1904 struct nl_info info
= {
1908 if (c
->sernum
!= FIB6_NO_SERNUM_CHANGE
&&
1909 w
->node
->fn_sernum
!= c
->sernum
)
1910 w
->node
->fn_sernum
= c
->sernum
;
1913 WARN_ON_ONCE(c
->sernum
== FIB6_NO_SERNUM_CHANGE
);
1918 for_each_fib6_walker_rt(w
) {
1919 res
= c
->func(rt
, c
->arg
);
1922 res
= fib6_del(rt
, &info
);
1925 pr_debug("%s: del failed: rt=%p@%p err=%d\n",
1927 rcu_access_pointer(rt
->rt6i_node
),
1933 } else if (res
== -2) {
1934 if (WARN_ON(!rt
->rt6i_nsiblings
))
1936 rt
= list_last_entry(&rt
->rt6i_siblings
,
1937 struct rt6_info
, rt6i_siblings
);
1947 * Convenient frontend to tree walker.
1949 * func is called on each route.
1950 * It may return -2 -> skip multipath route.
1951 * -1 -> delete this route.
1952 * 0 -> continue walking
1955 static void fib6_clean_tree(struct net
*net
, struct fib6_node
*root
,
1956 int (*func
)(struct rt6_info
*, void *arg
),
1957 int sernum
, void *arg
)
1959 struct fib6_cleaner c
;
1962 c
.w
.func
= fib6_clean_node
;
1970 fib6_walk(net
, &c
.w
);
1973 static void __fib6_clean_all(struct net
*net
,
1974 int (*func
)(struct rt6_info
*, void *),
1975 int sernum
, void *arg
)
1977 struct fib6_table
*table
;
1978 struct hlist_head
*head
;
1982 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
1983 head
= &net
->ipv6
.fib_table_hash
[h
];
1984 hlist_for_each_entry_rcu(table
, head
, tb6_hlist
) {
1985 spin_lock_bh(&table
->tb6_lock
);
1986 fib6_clean_tree(net
, &table
->tb6_root
,
1988 spin_unlock_bh(&table
->tb6_lock
);
1994 void fib6_clean_all(struct net
*net
, int (*func
)(struct rt6_info
*, void *),
1997 __fib6_clean_all(net
, func
, FIB6_NO_SERNUM_CHANGE
, arg
);
2000 static void fib6_flush_trees(struct net
*net
)
2002 int new_sernum
= fib6_new_sernum(net
);
2004 __fib6_clean_all(net
, NULL
, new_sernum
, NULL
);
2008 * Garbage collection
2011 static int fib6_age(struct rt6_info
*rt
, void *arg
)
2013 struct fib6_gc_args
*gc_args
= arg
;
2014 unsigned long now
= jiffies
;
2017 * check addrconf expiration here.
2018 * Routes are expired even if they are in use.
2021 if (rt
->rt6i_flags
& RTF_EXPIRES
&& rt
->dst
.expires
) {
2022 if (time_after(now
, rt
->dst
.expires
)) {
2023 RT6_TRACE("expiring %p\n", rt
);
2029 /* Also age clones in the exception table.
2030 * Note, that clones are aged out
2031 * only if they are not in use now.
2033 rt6_age_exceptions(rt
, gc_args
, now
);
2038 void fib6_run_gc(unsigned long expires
, struct net
*net
, bool force
)
2040 struct fib6_gc_args gc_args
;
2044 spin_lock_bh(&net
->ipv6
.fib6_gc_lock
);
2045 } else if (!spin_trylock_bh(&net
->ipv6
.fib6_gc_lock
)) {
2046 mod_timer(&net
->ipv6
.ip6_fib_timer
, jiffies
+ HZ
);
2049 gc_args
.timeout
= expires
? (int)expires
:
2050 net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2053 fib6_clean_all(net
, fib6_age
, &gc_args
);
2055 net
->ipv6
.ip6_rt_last_gc
= now
;
2058 mod_timer(&net
->ipv6
.ip6_fib_timer
,
2060 + net
->ipv6
.sysctl
.ip6_rt_gc_interval
));
2062 del_timer(&net
->ipv6
.ip6_fib_timer
);
2063 spin_unlock_bh(&net
->ipv6
.fib6_gc_lock
);
2066 static void fib6_gc_timer_cb(struct timer_list
*t
)
2068 struct net
*arg
= from_timer(arg
, t
, ipv6
.ip6_fib_timer
);
2070 fib6_run_gc(0, arg
, true);
2073 static int __net_init
fib6_net_init(struct net
*net
)
2075 size_t size
= sizeof(struct hlist_head
) * FIB6_TABLE_HASHSZ
;
2078 err
= fib6_notifier_init(net
);
2082 spin_lock_init(&net
->ipv6
.fib6_gc_lock
);
2083 rwlock_init(&net
->ipv6
.fib6_walker_lock
);
2084 INIT_LIST_HEAD(&net
->ipv6
.fib6_walkers
);
2085 timer_setup(&net
->ipv6
.ip6_fib_timer
, fib6_gc_timer_cb
, 0);
2087 net
->ipv6
.rt6_stats
= kzalloc(sizeof(*net
->ipv6
.rt6_stats
), GFP_KERNEL
);
2088 if (!net
->ipv6
.rt6_stats
)
2091 /* Avoid false sharing : Use at least a full cache line */
2092 size
= max_t(size_t, size
, L1_CACHE_BYTES
);
2094 net
->ipv6
.fib_table_hash
= kzalloc(size
, GFP_KERNEL
);
2095 if (!net
->ipv6
.fib_table_hash
)
2098 net
->ipv6
.fib6_main_tbl
= kzalloc(sizeof(*net
->ipv6
.fib6_main_tbl
),
2100 if (!net
->ipv6
.fib6_main_tbl
)
2101 goto out_fib_table_hash
;
2103 net
->ipv6
.fib6_main_tbl
->tb6_id
= RT6_TABLE_MAIN
;
2104 rcu_assign_pointer(net
->ipv6
.fib6_main_tbl
->tb6_root
.leaf
,
2105 net
->ipv6
.ip6_null_entry
);
2106 net
->ipv6
.fib6_main_tbl
->tb6_root
.fn_flags
=
2107 RTN_ROOT
| RTN_TL_ROOT
| RTN_RTINFO
;
2108 inet_peer_base_init(&net
->ipv6
.fib6_main_tbl
->tb6_peers
);
2110 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2111 net
->ipv6
.fib6_local_tbl
= kzalloc(sizeof(*net
->ipv6
.fib6_local_tbl
),
2113 if (!net
->ipv6
.fib6_local_tbl
)
2114 goto out_fib6_main_tbl
;
2115 net
->ipv6
.fib6_local_tbl
->tb6_id
= RT6_TABLE_LOCAL
;
2116 rcu_assign_pointer(net
->ipv6
.fib6_local_tbl
->tb6_root
.leaf
,
2117 net
->ipv6
.ip6_null_entry
);
2118 net
->ipv6
.fib6_local_tbl
->tb6_root
.fn_flags
=
2119 RTN_ROOT
| RTN_TL_ROOT
| RTN_RTINFO
;
2120 inet_peer_base_init(&net
->ipv6
.fib6_local_tbl
->tb6_peers
);
2122 fib6_tables_init(net
);
2126 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2128 kfree(net
->ipv6
.fib6_main_tbl
);
2131 kfree(net
->ipv6
.fib_table_hash
);
2133 kfree(net
->ipv6
.rt6_stats
);
2135 fib6_notifier_exit(net
);
2139 static void fib6_net_exit(struct net
*net
)
2143 del_timer_sync(&net
->ipv6
.ip6_fib_timer
);
2145 for (i
= 0; i
< FIB6_TABLE_HASHSZ
; i
++) {
2146 struct hlist_head
*head
= &net
->ipv6
.fib_table_hash
[i
];
2147 struct hlist_node
*tmp
;
2148 struct fib6_table
*tb
;
2150 hlist_for_each_entry_safe(tb
, tmp
, head
, tb6_hlist
) {
2151 hlist_del(&tb
->tb6_hlist
);
2152 fib6_free_table(tb
);
2156 kfree(net
->ipv6
.fib_table_hash
);
2157 kfree(net
->ipv6
.rt6_stats
);
2158 fib6_notifier_exit(net
);
2161 static struct pernet_operations fib6_net_ops
= {
2162 .init
= fib6_net_init
,
2163 .exit
= fib6_net_exit
,
2166 int __init
fib6_init(void)
2170 fib6_node_kmem
= kmem_cache_create("fib6_nodes",
2171 sizeof(struct fib6_node
),
2172 0, SLAB_HWCACHE_ALIGN
,
2174 if (!fib6_node_kmem
)
2177 ret
= register_pernet_subsys(&fib6_net_ops
);
2179 goto out_kmem_cache_create
;
2181 ret
= rtnl_register_module(THIS_MODULE
, PF_INET6
, RTM_GETROUTE
, NULL
,
2184 goto out_unregister_subsys
;
2186 __fib6_flush_trees
= fib6_flush_trees
;
2190 out_unregister_subsys
:
2191 unregister_pernet_subsys(&fib6_net_ops
);
2192 out_kmem_cache_create
:
2193 kmem_cache_destroy(fib6_node_kmem
);
2197 void fib6_gc_cleanup(void)
2199 unregister_pernet_subsys(&fib6_net_ops
);
2200 kmem_cache_destroy(fib6_node_kmem
);
2203 #ifdef CONFIG_PROC_FS
2205 struct ipv6_route_iter
{
2206 struct seq_net_private p
;
2207 struct fib6_walker w
;
2209 struct fib6_table
*tbl
;
2213 static int ipv6_route_seq_show(struct seq_file
*seq
, void *v
)
2215 struct rt6_info
*rt
= v
;
2216 struct ipv6_route_iter
*iter
= seq
->private;
2218 seq_printf(seq
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2220 #ifdef CONFIG_IPV6_SUBTREES
2221 seq_printf(seq
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2223 seq_puts(seq
, "00000000000000000000000000000000 00 ");
2225 if (rt
->rt6i_flags
& RTF_GATEWAY
)
2226 seq_printf(seq
, "%pi6", &rt
->rt6i_gateway
);
2228 seq_puts(seq
, "00000000000000000000000000000000");
2230 seq_printf(seq
, " %08x %08x %08x %08x %8s\n",
2231 rt
->rt6i_metric
, atomic_read(&rt
->dst
.__refcnt
),
2232 rt
->dst
.__use
, rt
->rt6i_flags
,
2233 rt
->dst
.dev
? rt
->dst
.dev
->name
: "");
2234 iter
->w
.leaf
= NULL
;
2238 static int ipv6_route_yield(struct fib6_walker
*w
)
2240 struct ipv6_route_iter
*iter
= w
->args
;
2246 iter
->w
.leaf
= rcu_dereference_protected(
2247 iter
->w
.leaf
->rt6_next
,
2248 lockdep_is_held(&iter
->tbl
->tb6_lock
));
2250 if (!iter
->skip
&& iter
->w
.leaf
)
2252 } while (iter
->w
.leaf
);
2257 static void ipv6_route_seq_setup_walk(struct ipv6_route_iter
*iter
,
2260 memset(&iter
->w
, 0, sizeof(iter
->w
));
2261 iter
->w
.func
= ipv6_route_yield
;
2262 iter
->w
.root
= &iter
->tbl
->tb6_root
;
2263 iter
->w
.state
= FWS_INIT
;
2264 iter
->w
.node
= iter
->w
.root
;
2265 iter
->w
.args
= iter
;
2266 iter
->sernum
= iter
->w
.root
->fn_sernum
;
2267 INIT_LIST_HEAD(&iter
->w
.lh
);
2268 fib6_walker_link(net
, &iter
->w
);
2271 static struct fib6_table
*ipv6_route_seq_next_table(struct fib6_table
*tbl
,
2275 struct hlist_node
*node
;
2278 h
= (tbl
->tb6_id
& (FIB6_TABLE_HASHSZ
- 1)) + 1;
2279 node
= rcu_dereference_bh(hlist_next_rcu(&tbl
->tb6_hlist
));
2285 while (!node
&& h
< FIB6_TABLE_HASHSZ
) {
2286 node
= rcu_dereference_bh(
2287 hlist_first_rcu(&net
->ipv6
.fib_table_hash
[h
++]));
2289 return hlist_entry_safe(node
, struct fib6_table
, tb6_hlist
);
2292 static void ipv6_route_check_sernum(struct ipv6_route_iter
*iter
)
2294 if (iter
->sernum
!= iter
->w
.root
->fn_sernum
) {
2295 iter
->sernum
= iter
->w
.root
->fn_sernum
;
2296 iter
->w
.state
= FWS_INIT
;
2297 iter
->w
.node
= iter
->w
.root
;
2298 WARN_ON(iter
->w
.skip
);
2299 iter
->w
.skip
= iter
->w
.count
;
2303 static void *ipv6_route_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2307 struct net
*net
= seq_file_net(seq
);
2308 struct ipv6_route_iter
*iter
= seq
->private;
2313 n
= rcu_dereference_bh(((struct rt6_info
*)v
)->rt6_next
);
2320 ipv6_route_check_sernum(iter
);
2321 spin_lock_bh(&iter
->tbl
->tb6_lock
);
2322 r
= fib6_walk_continue(&iter
->w
);
2323 spin_unlock_bh(&iter
->tbl
->tb6_lock
);
2327 return iter
->w
.leaf
;
2329 fib6_walker_unlink(net
, &iter
->w
);
2332 fib6_walker_unlink(net
, &iter
->w
);
2334 iter
->tbl
= ipv6_route_seq_next_table(iter
->tbl
, net
);
2338 ipv6_route_seq_setup_walk(iter
, net
);
2342 static void *ipv6_route_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2345 struct net
*net
= seq_file_net(seq
);
2346 struct ipv6_route_iter
*iter
= seq
->private;
2349 iter
->tbl
= ipv6_route_seq_next_table(NULL
, net
);
2353 ipv6_route_seq_setup_walk(iter
, net
);
2354 return ipv6_route_seq_next(seq
, NULL
, pos
);
2360 static bool ipv6_route_iter_active(struct ipv6_route_iter
*iter
)
2362 struct fib6_walker
*w
= &iter
->w
;
2363 return w
->node
&& !(w
->state
== FWS_U
&& w
->node
== w
->root
);
2366 static void ipv6_route_seq_stop(struct seq_file
*seq
, void *v
)
2369 struct net
*net
= seq_file_net(seq
);
2370 struct ipv6_route_iter
*iter
= seq
->private;
2372 if (ipv6_route_iter_active(iter
))
2373 fib6_walker_unlink(net
, &iter
->w
);
2375 rcu_read_unlock_bh();
2378 static const struct seq_operations ipv6_route_seq_ops
= {
2379 .start
= ipv6_route_seq_start
,
2380 .next
= ipv6_route_seq_next
,
2381 .stop
= ipv6_route_seq_stop
,
2382 .show
= ipv6_route_seq_show
2385 int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2387 return seq_open_net(inode
, file
, &ipv6_route_seq_ops
,
2388 sizeof(struct ipv6_route_iter
));
2391 #endif /* CONFIG_PROC_FS */