2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
51 #include <linux/rtnetlink.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
57 #include <asm/uaccess.h>
60 #include <linux/sysctl.h>
63 /* Set to 3 to get tracing. */
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #define RT6_TRACE(x...) do { ; } while (0)
74 #define CLONE_OFFLINK_ROUTE 0
76 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
77 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
78 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
79 static void ip6_dst_destroy(struct dst_entry
*);
80 static void ip6_dst_ifdown(struct dst_entry
*,
81 struct net_device
*dev
, int how
);
82 static int ip6_dst_gc(struct dst_ops
*ops
);
84 static int ip6_pkt_discard(struct sk_buff
*skb
);
85 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
86 static void ip6_link_failure(struct sk_buff
*skb
);
87 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
91 struct in6_addr
*prefix
, int prefixlen
,
92 struct in6_addr
*gwaddr
, int ifindex
,
94 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
95 struct in6_addr
*prefix
, int prefixlen
,
96 struct in6_addr
*gwaddr
, int ifindex
);
99 static struct dst_ops ip6_dst_ops_template
= {
101 .protocol
= cpu_to_be16(ETH_P_IPV6
),
104 .check
= ip6_dst_check
,
105 .destroy
= ip6_dst_destroy
,
106 .ifdown
= ip6_dst_ifdown
,
107 .negative_advice
= ip6_negative_advice
,
108 .link_failure
= ip6_link_failure
,
109 .update_pmtu
= ip6_rt_update_pmtu
,
110 .local_out
= __ip6_local_out
,
111 .entries
= ATOMIC_INIT(0),
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
118 static struct dst_ops ip6_dst_blackhole_ops
= {
120 .protocol
= cpu_to_be16(ETH_P_IPV6
),
121 .destroy
= ip6_dst_destroy
,
122 .check
= ip6_dst_check
,
123 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
124 .entries
= ATOMIC_INIT(0),
127 static struct rt6_info ip6_null_entry_template
= {
130 .__refcnt
= ATOMIC_INIT(1),
133 .error
= -ENETUNREACH
,
134 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
135 .input
= ip6_pkt_discard
,
136 .output
= ip6_pkt_discard_out
,
139 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
140 .rt6i_metric
= ~(u32
) 0,
141 .rt6i_ref
= ATOMIC_INIT(1),
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
146 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
147 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
);
149 static struct rt6_info ip6_prohibit_entry_template
= {
152 .__refcnt
= ATOMIC_INIT(1),
156 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
157 .input
= ip6_pkt_prohibit
,
158 .output
= ip6_pkt_prohibit_out
,
161 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
162 .rt6i_metric
= ~(u32
) 0,
163 .rt6i_ref
= ATOMIC_INIT(1),
166 static struct rt6_info ip6_blk_hole_entry_template
= {
169 .__refcnt
= ATOMIC_INIT(1),
173 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
174 .input
= dst_discard
,
175 .output
= dst_discard
,
178 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
179 .rt6i_metric
= ~(u32
) 0,
180 .rt6i_ref
= ATOMIC_INIT(1),
185 /* allocate dst with ip6_dst_ops */
186 static inline struct rt6_info
*ip6_dst_alloc(struct dst_ops
*ops
)
188 return (struct rt6_info
*)dst_alloc(ops
);
191 static void ip6_dst_destroy(struct dst_entry
*dst
)
193 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
194 struct inet6_dev
*idev
= rt
->rt6i_idev
;
197 rt
->rt6i_idev
= NULL
;
202 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
205 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
206 struct inet6_dev
*idev
= rt
->rt6i_idev
;
207 struct net_device
*loopback_dev
=
208 dev_net(dev
)->loopback_dev
;
210 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
211 struct inet6_dev
*loopback_idev
=
212 in6_dev_get(loopback_dev
);
213 if (loopback_idev
!= NULL
) {
214 rt
->rt6i_idev
= loopback_idev
;
220 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
222 return (rt
->rt6i_flags
& RTF_EXPIRES
&&
223 time_after(jiffies
, rt
->rt6i_expires
));
226 static inline int rt6_need_strict(struct in6_addr
*daddr
)
228 return (ipv6_addr_type(daddr
) &
229 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
));
233 * Route lookup. Any table->tb6_lock is implied.
236 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
238 struct in6_addr
*saddr
,
242 struct rt6_info
*local
= NULL
;
243 struct rt6_info
*sprt
;
245 if (!oif
&& ipv6_addr_any(saddr
))
248 for (sprt
= rt
; sprt
; sprt
= sprt
->u
.dst
.rt6_next
) {
249 struct net_device
*dev
= sprt
->rt6i_dev
;
252 if (dev
->ifindex
== oif
)
254 if (dev
->flags
& IFF_LOOPBACK
) {
255 if (sprt
->rt6i_idev
== NULL
||
256 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
257 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
259 if (local
&& (!oif
||
260 local
->rt6i_idev
->dev
->ifindex
== oif
))
266 if (ipv6_chk_addr(net
, saddr
, dev
,
267 flags
& RT6_LOOKUP_F_IFACE
))
276 if (flags
& RT6_LOOKUP_F_IFACE
)
277 return net
->ipv6
.ip6_null_entry
;
283 #ifdef CONFIG_IPV6_ROUTER_PREF
284 static void rt6_probe(struct rt6_info
*rt
)
286 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
288 * Okay, this does not seem to be appropriate
289 * for now, however, we need to check if it
290 * is really so; aka Router Reachability Probing.
292 * Router Reachability Probe MUST be rate-limited
293 * to no more than one per minute.
295 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
297 read_lock_bh(&neigh
->lock
);
298 if (!(neigh
->nud_state
& NUD_VALID
) &&
299 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
300 struct in6_addr mcaddr
;
301 struct in6_addr
*target
;
303 neigh
->updated
= jiffies
;
304 read_unlock_bh(&neigh
->lock
);
306 target
= (struct in6_addr
*)&neigh
->primary_key
;
307 addrconf_addr_solict_mult(target
, &mcaddr
);
308 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
310 read_unlock_bh(&neigh
->lock
);
313 static inline void rt6_probe(struct rt6_info
*rt
)
320 * Default Router Selection (RFC 2461 6.3.6)
322 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
324 struct net_device
*dev
= rt
->rt6i_dev
;
325 if (!oif
|| dev
->ifindex
== oif
)
327 if ((dev
->flags
& IFF_LOOPBACK
) &&
328 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
333 static inline int rt6_check_neigh(struct rt6_info
*rt
)
335 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
337 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
338 !(rt
->rt6i_flags
& RTF_GATEWAY
))
341 read_lock_bh(&neigh
->lock
);
342 if (neigh
->nud_state
& NUD_VALID
)
344 #ifdef CONFIG_IPV6_ROUTER_PREF
345 else if (neigh
->nud_state
& NUD_FAILED
)
350 read_unlock_bh(&neigh
->lock
);
356 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
361 m
= rt6_check_dev(rt
, oif
);
362 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
364 #ifdef CONFIG_IPV6_ROUTER_PREF
365 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
367 n
= rt6_check_neigh(rt
);
368 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
373 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
374 int *mpri
, struct rt6_info
*match
)
378 if (rt6_check_expired(rt
))
381 m
= rt6_score_route(rt
, oif
, strict
);
386 if (strict
& RT6_LOOKUP_F_REACHABLE
)
390 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
398 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
399 struct rt6_info
*rr_head
,
400 u32 metric
, int oif
, int strict
)
402 struct rt6_info
*rt
, *match
;
406 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
407 rt
= rt
->u
.dst
.rt6_next
)
408 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
409 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
410 rt
= rt
->u
.dst
.rt6_next
)
411 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
416 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
418 struct rt6_info
*match
, *rt0
;
421 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
422 __func__
, fn
->leaf
, oif
);
426 fn
->rr_ptr
= rt0
= fn
->leaf
;
428 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
431 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
432 struct rt6_info
*next
= rt0
->u
.dst
.rt6_next
;
434 /* no entries matched; do round-robin */
435 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
442 RT6_TRACE("%s() => %p\n",
445 net
= dev_net(rt0
->rt6i_dev
);
446 return (match
? match
: net
->ipv6
.ip6_null_entry
);
449 #ifdef CONFIG_IPV6_ROUTE_INFO
450 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
451 struct in6_addr
*gwaddr
)
453 struct net
*net
= dev_net(dev
);
454 struct route_info
*rinfo
= (struct route_info
*) opt
;
455 struct in6_addr prefix_buf
, *prefix
;
457 unsigned long lifetime
;
460 if (len
< sizeof(struct route_info
)) {
464 /* Sanity check for prefix_len and length */
465 if (rinfo
->length
> 3) {
467 } else if (rinfo
->prefix_len
> 128) {
469 } else if (rinfo
->prefix_len
> 64) {
470 if (rinfo
->length
< 2) {
473 } else if (rinfo
->prefix_len
> 0) {
474 if (rinfo
->length
< 1) {
479 pref
= rinfo
->route_pref
;
480 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
481 pref
= ICMPV6_ROUTER_PREF_MEDIUM
;
483 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
485 if (rinfo
->length
== 3)
486 prefix
= (struct in6_addr
*)rinfo
->prefix
;
488 /* this function is safe */
489 ipv6_addr_prefix(&prefix_buf
,
490 (struct in6_addr
*)rinfo
->prefix
,
492 prefix
= &prefix_buf
;
495 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
498 if (rt
&& !lifetime
) {
504 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
507 rt
->rt6i_flags
= RTF_ROUTEINFO
|
508 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
511 if (!addrconf_finite_timeout(lifetime
)) {
512 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
514 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
515 rt
->rt6i_flags
|= RTF_EXPIRES
;
517 dst_release(&rt
->u
.dst
);
523 #define BACKTRACK(__net, saddr) \
525 if (rt == __net->ipv6.ip6_null_entry) { \
526 struct fib6_node *pn; \
528 if (fn->fn_flags & RTN_TL_ROOT) \
531 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
532 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
535 if (fn->fn_flags & RTN_RTINFO) \
541 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
542 struct fib6_table
*table
,
543 struct flowi
*fl
, int flags
)
545 struct fib6_node
*fn
;
548 read_lock_bh(&table
->tb6_lock
);
549 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
552 rt
= rt6_device_match(net
, rt
, &fl
->fl6_src
, fl
->oif
, flags
);
553 BACKTRACK(net
, &fl
->fl6_src
);
555 dst_use(&rt
->u
.dst
, jiffies
);
556 read_unlock_bh(&table
->tb6_lock
);
561 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
562 const struct in6_addr
*saddr
, int oif
, int strict
)
572 struct dst_entry
*dst
;
573 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
576 memcpy(&fl
.fl6_src
, saddr
, sizeof(*saddr
));
577 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
580 dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_lookup
);
582 return (struct rt6_info
*) dst
;
589 EXPORT_SYMBOL(rt6_lookup
);
591 /* ip6_ins_rt is called with FREE table->tb6_lock.
592 It takes a new route entry; if the addition fails for any reason, the
593 route is freed. In any case, if caller does not hold it, it may
597 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
)
600 struct fib6_table
*table
;
602 table
= rt
->rt6i_table
;
603 write_lock_bh(&table
->tb6_lock
);
604 err
= fib6_add(&table
->tb6_root
, rt
, info
);
605 write_unlock_bh(&table
->tb6_lock
);
610 int ip6_ins_rt(struct rt6_info
*rt
)
612 struct nl_info info
= {
613 .nl_net
= dev_net(rt
->rt6i_dev
),
615 return __ip6_ins_rt(rt
, &info
);
618 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
619 struct in6_addr
*saddr
)
627 rt
= ip6_rt_copy(ort
);
630 struct neighbour
*neigh
;
631 int attempts
= !in_softirq();
633 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
634 if (rt
->rt6i_dst
.plen
!= 128 &&
635 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
636 rt
->rt6i_flags
|= RTF_ANYCAST
;
637 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
640 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
641 rt
->rt6i_dst
.plen
= 128;
642 rt
->rt6i_flags
|= RTF_CACHE
;
643 rt
->u
.dst
.flags
|= DST_HOST
;
645 #ifdef CONFIG_IPV6_SUBTREES
646 if (rt
->rt6i_src
.plen
&& saddr
) {
647 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
648 rt
->rt6i_src
.plen
= 128;
653 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
655 struct net
*net
= dev_net(rt
->rt6i_dev
);
656 int saved_rt_min_interval
=
657 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
658 int saved_rt_elasticity
=
659 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
661 if (attempts
-- > 0) {
662 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
663 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
665 ip6_dst_gc(net
->ipv6
.ip6_dst_ops
);
667 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
669 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
670 saved_rt_min_interval
;
676 "Neighbour table overflow.\n");
677 dst_free(&rt
->u
.dst
);
680 rt
->rt6i_nexthop
= neigh
;
687 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
689 struct rt6_info
*rt
= ip6_rt_copy(ort
);
691 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
692 rt
->rt6i_dst
.plen
= 128;
693 rt
->rt6i_flags
|= RTF_CACHE
;
694 rt
->u
.dst
.flags
|= DST_HOST
;
695 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
700 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
701 struct flowi
*fl
, int flags
)
703 struct fib6_node
*fn
;
704 struct rt6_info
*rt
, *nrt
;
708 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
710 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
713 read_lock_bh(&table
->tb6_lock
);
716 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
719 rt
= rt6_select(fn
, oif
, strict
| reachable
);
721 BACKTRACK(net
, &fl
->fl6_src
);
722 if (rt
== net
->ipv6
.ip6_null_entry
||
723 rt
->rt6i_flags
& RTF_CACHE
)
726 dst_hold(&rt
->u
.dst
);
727 read_unlock_bh(&table
->tb6_lock
);
729 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
730 nrt
= rt6_alloc_cow(rt
, &fl
->fl6_dst
, &fl
->fl6_src
);
732 #if CLONE_OFFLINK_ROUTE
733 nrt
= rt6_alloc_clone(rt
, &fl
->fl6_dst
);
739 dst_release(&rt
->u
.dst
);
740 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
742 dst_hold(&rt
->u
.dst
);
744 err
= ip6_ins_rt(nrt
);
753 * Race condition! In the gap, when table->tb6_lock was
754 * released someone could insert this route. Relookup.
756 dst_release(&rt
->u
.dst
);
764 dst_hold(&rt
->u
.dst
);
765 read_unlock_bh(&table
->tb6_lock
);
767 rt
->u
.dst
.lastuse
= jiffies
;
773 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
774 struct flowi
*fl
, int flags
)
776 return ip6_pol_route(net
, table
, fl
->iif
, fl
, flags
);
779 void ip6_route_input(struct sk_buff
*skb
)
781 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
782 struct net
*net
= dev_net(skb
->dev
);
783 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
785 .iif
= skb
->dev
->ifindex
,
790 .flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
794 .proto
= iph
->nexthdr
,
797 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
798 flags
|= RT6_LOOKUP_F_IFACE
;
800 skb
->dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_input
);
803 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
804 struct flowi
*fl
, int flags
)
806 return ip6_pol_route(net
, table
, fl
->oif
, fl
, flags
);
809 struct dst_entry
* ip6_route_output(struct net
*net
, struct sock
*sk
,
814 if (rt6_need_strict(&fl
->fl6_dst
))
815 flags
|= RT6_LOOKUP_F_IFACE
;
817 if (!ipv6_addr_any(&fl
->fl6_src
))
818 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
820 unsigned int prefs
= inet6_sk(sk
)->srcprefs
;
821 if (prefs
& IPV6_PREFER_SRC_TMP
)
822 flags
|= RT6_LOOKUP_F_SRCPREF_TMP
;
823 if (prefs
& IPV6_PREFER_SRC_PUBLIC
)
824 flags
|= RT6_LOOKUP_F_SRCPREF_PUBLIC
;
825 if (prefs
& IPV6_PREFER_SRC_COA
)
826 flags
|= RT6_LOOKUP_F_SRCPREF_COA
;
829 return fib6_rule_lookup(net
, fl
, flags
, ip6_pol_route_output
);
832 EXPORT_SYMBOL(ip6_route_output
);
834 int ip6_dst_blackhole(struct sock
*sk
, struct dst_entry
**dstp
, struct flowi
*fl
)
836 struct rt6_info
*ort
= (struct rt6_info
*) *dstp
;
837 struct rt6_info
*rt
= (struct rt6_info
*)
838 dst_alloc(&ip6_dst_blackhole_ops
);
839 struct dst_entry
*new = NULL
;
844 atomic_set(&new->__refcnt
, 1);
846 new->input
= dst_discard
;
847 new->output
= dst_discard
;
849 memcpy(new->metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
850 new->dev
= ort
->u
.dst
.dev
;
853 rt
->rt6i_idev
= ort
->rt6i_idev
;
855 in6_dev_hold(rt
->rt6i_idev
);
856 rt
->rt6i_expires
= 0;
858 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
859 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
862 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
863 #ifdef CONFIG_IPV6_SUBTREES
864 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
872 return (new ? 0 : -ENOMEM
);
874 EXPORT_SYMBOL_GPL(ip6_dst_blackhole
);
877 * Destination cache support functions
880 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
884 rt
= (struct rt6_info
*) dst
;
886 if (rt
&& rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
))
892 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
894 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
897 if (rt
->rt6i_flags
& RTF_CACHE
)
905 static void ip6_link_failure(struct sk_buff
*skb
)
909 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0, skb
->dev
);
911 rt
= (struct rt6_info
*) skb
->dst
;
913 if (rt
->rt6i_flags
&RTF_CACHE
) {
914 dst_set_expires(&rt
->u
.dst
, 0);
915 rt
->rt6i_flags
|= RTF_EXPIRES
;
916 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
917 rt
->rt6i_node
->fn_sernum
= -1;
921 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
923 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
925 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
926 rt6
->rt6i_flags
|= RTF_MODIFIED
;
927 if (mtu
< IPV6_MIN_MTU
) {
929 dst
->metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
931 dst
->metrics
[RTAX_MTU
-1] = mtu
;
932 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
936 static int ipv6_get_mtu(struct net_device
*dev
);
938 static inline unsigned int ipv6_advmss(struct net
*net
, unsigned int mtu
)
940 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
942 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
943 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
946 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
947 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
948 * IPV6_MAXPLEN is also valid and means: "any MSS,
949 * rely only on pmtu discovery"
951 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
956 static struct dst_entry
*icmp6_dst_gc_list
;
957 static DEFINE_SPINLOCK(icmp6_dst_lock
);
959 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
960 struct neighbour
*neigh
,
961 const struct in6_addr
*addr
)
964 struct inet6_dev
*idev
= in6_dev_get(dev
);
965 struct net
*net
= dev_net(dev
);
967 if (unlikely(idev
== NULL
))
970 rt
= ip6_dst_alloc(net
->ipv6
.ip6_dst_ops
);
971 if (unlikely(rt
== NULL
)) {
980 neigh
= ndisc_get_neigh(dev
, addr
);
986 rt
->rt6i_idev
= idev
;
987 rt
->rt6i_nexthop
= neigh
;
988 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
989 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = 255;
990 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
991 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
992 rt
->u
.dst
.output
= ip6_output
;
994 #if 0 /* there's no chance to use these for ndisc */
995 rt
->u
.dst
.flags
= ipv6_addr_type(addr
) & IPV6_ADDR_UNICAST
998 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
999 rt
->rt6i_dst
.plen
= 128;
1002 spin_lock_bh(&icmp6_dst_lock
);
1003 rt
->u
.dst
.next
= icmp6_dst_gc_list
;
1004 icmp6_dst_gc_list
= &rt
->u
.dst
;
1005 spin_unlock_bh(&icmp6_dst_lock
);
1007 fib6_force_start_gc(net
);
1013 int icmp6_dst_gc(void)
1015 struct dst_entry
*dst
, *next
, **pprev
;
1020 spin_lock_bh(&icmp6_dst_lock
);
1021 pprev
= &icmp6_dst_gc_list
;
1023 while ((dst
= *pprev
) != NULL
) {
1024 if (!atomic_read(&dst
->__refcnt
)) {
1033 spin_unlock_bh(&icmp6_dst_lock
);
1038 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1041 struct dst_entry
*dst
, **pprev
;
1043 spin_lock_bh(&icmp6_dst_lock
);
1044 pprev
= &icmp6_dst_gc_list
;
1045 while ((dst
= *pprev
) != NULL
) {
1046 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1047 if (func(rt
, arg
)) {
1054 spin_unlock_bh(&icmp6_dst_lock
);
1057 static int ip6_dst_gc(struct dst_ops
*ops
)
1059 unsigned long now
= jiffies
;
1060 struct net
*net
= ops
->dst_net
;
1061 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1062 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1063 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1064 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1065 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1067 if (time_after(rt_last_gc
+ rt_min_interval
, now
) &&
1068 atomic_read(&ops
->entries
) <= rt_max_size
)
1071 net
->ipv6
.ip6_rt_gc_expire
++;
1072 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
);
1073 net
->ipv6
.ip6_rt_last_gc
= now
;
1074 if (atomic_read(&ops
->entries
) < ops
->gc_thresh
)
1075 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1077 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1078 return (atomic_read(&ops
->entries
) > rt_max_size
);
1081 /* Clean host part of a prefix. Not necessary in radix tree,
1082 but results in cleaner routing tables.
1084 Remove it only when all the things will work!
1087 static int ipv6_get_mtu(struct net_device
*dev
)
1089 int mtu
= IPV6_MIN_MTU
;
1090 struct inet6_dev
*idev
;
1092 idev
= in6_dev_get(dev
);
1094 mtu
= idev
->cnf
.mtu6
;
1100 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1102 int hoplimit
= dst_metric(dst
, RTAX_HOPLIMIT
);
1104 struct net_device
*dev
= dst
->dev
;
1105 struct inet6_dev
*idev
= in6_dev_get(dev
);
1107 hoplimit
= idev
->cnf
.hop_limit
;
1110 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1119 int ip6_route_add(struct fib6_config
*cfg
)
1122 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1123 struct rt6_info
*rt
= NULL
;
1124 struct net_device
*dev
= NULL
;
1125 struct inet6_dev
*idev
= NULL
;
1126 struct fib6_table
*table
;
1129 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1131 #ifndef CONFIG_IPV6_SUBTREES
1132 if (cfg
->fc_src_len
)
1135 if (cfg
->fc_ifindex
) {
1137 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1140 idev
= in6_dev_get(dev
);
1145 if (cfg
->fc_metric
== 0)
1146 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1148 table
= fib6_new_table(net
, cfg
->fc_table
);
1149 if (table
== NULL
) {
1154 rt
= ip6_dst_alloc(net
->ipv6
.ip6_dst_ops
);
1161 rt
->u
.dst
.obsolete
= -1;
1162 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1163 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1166 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1167 cfg
->fc_protocol
= RTPROT_BOOT
;
1168 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1170 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1172 if (addr_type
& IPV6_ADDR_MULTICAST
)
1173 rt
->u
.dst
.input
= ip6_mc_input
;
1175 rt
->u
.dst
.input
= ip6_forward
;
1177 rt
->u
.dst
.output
= ip6_output
;
1179 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1180 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1181 if (rt
->rt6i_dst
.plen
== 128)
1182 rt
->u
.dst
.flags
= DST_HOST
;
1184 #ifdef CONFIG_IPV6_SUBTREES
1185 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1186 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1189 rt
->rt6i_metric
= cfg
->fc_metric
;
1191 /* We cannot add true routes via loopback here,
1192 they would result in kernel looping; promote them to reject routes
1194 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1195 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
))) {
1196 /* hold loopback dev/idev if we haven't done so. */
1197 if (dev
!= net
->loopback_dev
) {
1202 dev
= net
->loopback_dev
;
1204 idev
= in6_dev_get(dev
);
1210 rt
->u
.dst
.output
= ip6_pkt_discard_out
;
1211 rt
->u
.dst
.input
= ip6_pkt_discard
;
1212 rt
->u
.dst
.error
= -ENETUNREACH
;
1213 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1217 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1218 struct in6_addr
*gw_addr
;
1221 gw_addr
= &cfg
->fc_gateway
;
1222 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1223 gwa_type
= ipv6_addr_type(gw_addr
);
1225 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1226 struct rt6_info
*grt
;
1228 /* IPv6 strictly inhibits using non-link-local
1229 addresses as the nexthop address.
1230 Otherwise, the router will not be able to send redirects.
1231 It is very good, but in some (rare!) circumstances
1232 (SIT, PtP, NBMA NOARP links) it is handy to allow
1233 some exceptions. --ANK
1236 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1239 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1241 err
= -EHOSTUNREACH
;
1245 if (dev
!= grt
->rt6i_dev
) {
1246 dst_release(&grt
->u
.dst
);
1250 dev
= grt
->rt6i_dev
;
1251 idev
= grt
->rt6i_idev
;
1253 in6_dev_hold(grt
->rt6i_idev
);
1255 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1257 dst_release(&grt
->u
.dst
);
1263 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1271 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1272 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1273 if (IS_ERR(rt
->rt6i_nexthop
)) {
1274 err
= PTR_ERR(rt
->rt6i_nexthop
);
1275 rt
->rt6i_nexthop
= NULL
;
1280 rt
->rt6i_flags
= cfg
->fc_flags
;
1287 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1288 int type
= nla_type(nla
);
1291 if (type
> RTAX_MAX
) {
1296 rt
->u
.dst
.metrics
[type
- 1] = nla_get_u32(nla
);
1301 if (dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
) == 0)
1302 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1303 if (!dst_mtu(&rt
->u
.dst
))
1304 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(dev
);
1305 if (!dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
))
1306 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1307 rt
->u
.dst
.dev
= dev
;
1308 rt
->rt6i_idev
= idev
;
1309 rt
->rt6i_table
= table
;
1311 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1313 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1321 dst_free(&rt
->u
.dst
);
1325 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1328 struct fib6_table
*table
;
1329 struct net
*net
= dev_net(rt
->rt6i_dev
);
1331 if (rt
== net
->ipv6
.ip6_null_entry
)
1334 table
= rt
->rt6i_table
;
1335 write_lock_bh(&table
->tb6_lock
);
1337 err
= fib6_del(rt
, info
);
1338 dst_release(&rt
->u
.dst
);
1340 write_unlock_bh(&table
->tb6_lock
);
1345 int ip6_del_rt(struct rt6_info
*rt
)
1347 struct nl_info info
= {
1348 .nl_net
= dev_net(rt
->rt6i_dev
),
1350 return __ip6_del_rt(rt
, &info
);
1353 static int ip6_route_del(struct fib6_config
*cfg
)
1355 struct fib6_table
*table
;
1356 struct fib6_node
*fn
;
1357 struct rt6_info
*rt
;
1360 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1364 read_lock_bh(&table
->tb6_lock
);
1366 fn
= fib6_locate(&table
->tb6_root
,
1367 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1368 &cfg
->fc_src
, cfg
->fc_src_len
);
1371 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1372 if (cfg
->fc_ifindex
&&
1373 (rt
->rt6i_dev
== NULL
||
1374 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1376 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1377 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1379 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1381 dst_hold(&rt
->u
.dst
);
1382 read_unlock_bh(&table
->tb6_lock
);
1384 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1387 read_unlock_bh(&table
->tb6_lock
);
1395 struct ip6rd_flowi
{
1397 struct in6_addr gateway
;
1400 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1401 struct fib6_table
*table
,
1405 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl
;
1406 struct rt6_info
*rt
;
1407 struct fib6_node
*fn
;
1410 * Get the "current" route for this destination and
1411 * check if the redirect has come from the appropriate router.
1413 * RFC 2461 specifies that redirects should only be
1414 * accepted if they come from the nexthop to the target.
1415 * Due to the way the routes are chosen, this notion
1416 * is a bit fuzzy and one might need to check all possible
1420 read_lock_bh(&table
->tb6_lock
);
1421 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
1423 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1425 * Current route is on-link; redirect is always invalid.
1427 * Seems, previous statement is not true. It could
1428 * be node, which looks for us as on-link (f.e. proxy ndisc)
1429 * But then router serving it might decide, that we should
1430 * know truth 8)8) --ANK (980726).
1432 if (rt6_check_expired(rt
))
1434 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1436 if (fl
->oif
!= rt
->rt6i_dev
->ifindex
)
1438 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1444 rt
= net
->ipv6
.ip6_null_entry
;
1445 BACKTRACK(net
, &fl
->fl6_src
);
1447 dst_hold(&rt
->u
.dst
);
1449 read_unlock_bh(&table
->tb6_lock
);
1454 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1455 struct in6_addr
*src
,
1456 struct in6_addr
*gateway
,
1457 struct net_device
*dev
)
1459 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1460 struct net
*net
= dev_net(dev
);
1461 struct ip6rd_flowi rdfl
= {
1463 .oif
= dev
->ifindex
,
1471 .gateway
= *gateway
,
1474 if (rt6_need_strict(dest
))
1475 flags
|= RT6_LOOKUP_F_IFACE
;
1477 return (struct rt6_info
*)fib6_rule_lookup(net
, (struct flowi
*)&rdfl
,
1478 flags
, __ip6_route_redirect
);
1481 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1482 struct in6_addr
*saddr
,
1483 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1485 struct rt6_info
*rt
, *nrt
= NULL
;
1486 struct netevent_redirect netevent
;
1487 struct net
*net
= dev_net(neigh
->dev
);
1489 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1491 if (rt
== net
->ipv6
.ip6_null_entry
) {
1492 if (net_ratelimit())
1493 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1494 "for redirect target\n");
1499 * We have finally decided to accept it.
1502 neigh_update(neigh
, lladdr
, NUD_STALE
,
1503 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1504 NEIGH_UPDATE_F_OVERRIDE
|
1505 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1506 NEIGH_UPDATE_F_ISROUTER
))
1510 * Redirect received -> path was valid.
1511 * Look, redirects are sent only in response to data packets,
1512 * so that this nexthop apparently is reachable. --ANK
1514 dst_confirm(&rt
->u
.dst
);
1516 /* Duplicate redirect: silently ignore. */
1517 if (neigh
== rt
->u
.dst
.neighbour
)
1520 nrt
= ip6_rt_copy(rt
);
1524 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1526 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1528 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1529 nrt
->rt6i_dst
.plen
= 128;
1530 nrt
->u
.dst
.flags
|= DST_HOST
;
1532 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1533 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1534 /* Reset pmtu, it may be better */
1535 nrt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(neigh
->dev
);
1536 nrt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(dev_net(neigh
->dev
),
1537 dst_mtu(&nrt
->u
.dst
));
1539 if (ip6_ins_rt(nrt
))
1542 netevent
.old
= &rt
->u
.dst
;
1543 netevent
.new = &nrt
->u
.dst
;
1544 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1546 if (rt
->rt6i_flags
&RTF_CACHE
) {
1552 dst_release(&rt
->u
.dst
);
/*
 *	Handle ICMP "packet too big" messages
 *	i.e. Path MTU discovery
 */
1561 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1562 struct net_device
*dev
, u32 pmtu
)
1564 struct rt6_info
*rt
, *nrt
;
1565 struct net
*net
= dev_net(dev
);
1568 rt
= rt6_lookup(net
, daddr
, saddr
, dev
->ifindex
, 0);
1572 if (pmtu
>= dst_mtu(&rt
->u
.dst
))
1575 if (pmtu
< IPV6_MIN_MTU
) {
1577 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1578 * MTU (1280) and a fragment header should always be included
1579 * after a node receiving Too Big message reporting PMTU is
1580 * less than the IPv6 Minimum Link MTU.
1582 pmtu
= IPV6_MIN_MTU
;
1586 /* New mtu received -> path was valid.
1587 They are sent only in response to data packets,
1588 so that this nexthop apparently is reachable. --ANK
1590 dst_confirm(&rt
->u
.dst
);
1592 /* Host route. If it is static, it would be better
1593 not to override it, but add new one, so that
1594 when cache entry will expire old pmtu
1595 would return automatically.
1597 if (rt
->rt6i_flags
& RTF_CACHE
) {
1598 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1600 rt
->u
.dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1601 dst_set_expires(&rt
->u
.dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1602 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1607 Two cases are possible:
1608 1. It is connected route. Action: COW
1609 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1611 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1612 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1614 nrt
= rt6_alloc_clone(rt
, daddr
);
1617 nrt
->u
.dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1619 nrt
->u
.dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1621 /* According to RFC 1981, detecting PMTU increase shouldn't be
1622 * happened within 5 mins, the recommended timer is 10 mins.
1623 * Here this route expiration time is set to ip6_rt_mtu_expires
1624 * which is 10 mins. After 10 mins the decreased pmtu is expired
1625 * and detecting PMTU increase will be automatically happened.
1627 dst_set_expires(&nrt
->u
.dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1628 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1633 dst_release(&rt
->u
.dst
);
/*
 *	Misc support functions
 */
1640 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1642 struct net
*net
= dev_net(ort
->rt6i_dev
);
1643 struct rt6_info
*rt
= ip6_dst_alloc(net
->ipv6
.ip6_dst_ops
);
1646 rt
->u
.dst
.input
= ort
->u
.dst
.input
;
1647 rt
->u
.dst
.output
= ort
->u
.dst
.output
;
1649 memcpy(rt
->u
.dst
.metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
1650 rt
->u
.dst
.error
= ort
->u
.dst
.error
;
1651 rt
->u
.dst
.dev
= ort
->u
.dst
.dev
;
1653 dev_hold(rt
->u
.dst
.dev
);
1654 rt
->rt6i_idev
= ort
->rt6i_idev
;
1656 in6_dev_hold(rt
->rt6i_idev
);
1657 rt
->u
.dst
.lastuse
= jiffies
;
1658 rt
->rt6i_expires
= 0;
1660 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1661 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1662 rt
->rt6i_metric
= 0;
1664 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1665 #ifdef CONFIG_IPV6_SUBTREES
1666 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1668 rt
->rt6i_table
= ort
->rt6i_table
;
1673 #ifdef CONFIG_IPV6_ROUTE_INFO
1674 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1675 struct in6_addr
*prefix
, int prefixlen
,
1676 struct in6_addr
*gwaddr
, int ifindex
)
1678 struct fib6_node
*fn
;
1679 struct rt6_info
*rt
= NULL
;
1680 struct fib6_table
*table
;
1682 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1686 write_lock_bh(&table
->tb6_lock
);
1687 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1691 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1692 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1694 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1696 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1698 dst_hold(&rt
->u
.dst
);
1702 write_unlock_bh(&table
->tb6_lock
);
1706 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1707 struct in6_addr
*prefix
, int prefixlen
,
1708 struct in6_addr
*gwaddr
, int ifindex
,
1711 struct fib6_config cfg
= {
1712 .fc_table
= RT6_TABLE_INFO
,
1713 .fc_metric
= IP6_RT_PRIO_USER
,
1714 .fc_ifindex
= ifindex
,
1715 .fc_dst_len
= prefixlen
,
1716 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1717 RTF_UP
| RTF_PREF(pref
),
1719 .fc_nlinfo
.nlh
= NULL
,
1720 .fc_nlinfo
.nl_net
= net
,
1723 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1724 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1726 /* We should treat it as a default route if prefix length is 0. */
1728 cfg
.fc_flags
|= RTF_DEFAULT
;
1730 ip6_route_add(&cfg
);
1732 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1736 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1738 struct rt6_info
*rt
;
1739 struct fib6_table
*table
;
1741 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1745 write_lock_bh(&table
->tb6_lock
);
1746 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->u
.dst
.rt6_next
) {
1747 if (dev
== rt
->rt6i_dev
&&
1748 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1749 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1753 dst_hold(&rt
->u
.dst
);
1754 write_unlock_bh(&table
->tb6_lock
);
1758 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1759 struct net_device
*dev
,
1762 struct fib6_config cfg
= {
1763 .fc_table
= RT6_TABLE_DFLT
,
1764 .fc_metric
= IP6_RT_PRIO_USER
,
1765 .fc_ifindex
= dev
->ifindex
,
1766 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1767 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1769 .fc_nlinfo
.nlh
= NULL
,
1770 .fc_nlinfo
.nl_net
= dev_net(dev
),
1773 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1775 ip6_route_add(&cfg
);
1777 return rt6_get_dflt_router(gwaddr
, dev
);
1780 void rt6_purge_dflt_routers(struct net
*net
)
1782 struct rt6_info
*rt
;
1783 struct fib6_table
*table
;
1785 /* NOTE: Keep consistent with rt6_get_dflt_router */
1786 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1791 read_lock_bh(&table
->tb6_lock
);
1792 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1793 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1794 dst_hold(&rt
->u
.dst
);
1795 read_unlock_bh(&table
->tb6_lock
);
1800 read_unlock_bh(&table
->tb6_lock
);
1803 static void rtmsg_to_fib6_config(struct net
*net
,
1804 struct in6_rtmsg
*rtmsg
,
1805 struct fib6_config
*cfg
)
1807 memset(cfg
, 0, sizeof(*cfg
));
1809 cfg
->fc_table
= RT6_TABLE_MAIN
;
1810 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1811 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1812 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1813 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1814 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1815 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1817 cfg
->fc_nlinfo
.nl_net
= net
;
1819 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1820 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1821 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1824 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1826 struct fib6_config cfg
;
1827 struct in6_rtmsg rtmsg
;
1831 case SIOCADDRT
: /* Add a route */
1832 case SIOCDELRT
: /* Delete a route */
1833 if (!capable(CAP_NET_ADMIN
))
1835 err
= copy_from_user(&rtmsg
, arg
,
1836 sizeof(struct in6_rtmsg
));
1840 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1845 err
= ip6_route_add(&cfg
);
1848 err
= ip6_route_del(&cfg
);
/*
 *	Drop the packet on the floor
 */
1865 static int ip6_pkt_drop(struct sk_buff
*skb
, int code
, int ipstats_mib_noroutes
)
1868 struct dst_entry
*dst
= skb
->dst
;
1869 switch (ipstats_mib_noroutes
) {
1870 case IPSTATS_MIB_INNOROUTES
:
1871 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1872 if (type
== IPV6_ADDR_ANY
|| type
== IPV6_ADDR_RESERVED
) {
1873 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1874 IPSTATS_MIB_INADDRERRORS
);
1878 case IPSTATS_MIB_OUTNOROUTES
:
1879 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1880 ipstats_mib_noroutes
);
1883 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0, skb
->dev
);
1888 static int ip6_pkt_discard(struct sk_buff
*skb
)
1890 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1893 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1895 skb
->dev
= skb
->dst
->dev
;
1896 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1901 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1903 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1906 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1908 skb
->dev
= skb
->dst
->dev
;
1909 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */
1918 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1919 const struct in6_addr
*addr
,
1922 struct net
*net
= dev_net(idev
->dev
);
1923 struct rt6_info
*rt
= ip6_dst_alloc(net
->ipv6
.ip6_dst_ops
);
1924 struct neighbour
*neigh
;
1927 return ERR_PTR(-ENOMEM
);
1929 dev_hold(net
->loopback_dev
);
1932 rt
->u
.dst
.flags
= DST_HOST
;
1933 rt
->u
.dst
.input
= ip6_input
;
1934 rt
->u
.dst
.output
= ip6_output
;
1935 rt
->rt6i_dev
= net
->loopback_dev
;
1936 rt
->rt6i_idev
= idev
;
1937 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
1938 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1939 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1940 rt
->u
.dst
.obsolete
= -1;
1942 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
1944 rt
->rt6i_flags
|= RTF_ANYCAST
;
1946 rt
->rt6i_flags
|= RTF_LOCAL
;
1947 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
1948 if (IS_ERR(neigh
)) {
1949 dst_free(&rt
->u
.dst
);
1951 /* We are casting this because that is the return
1952 * value type. But an errno encoded pointer is the
1953 * same regardless of the underlying pointer type,
1954 * and that's what we are returning. So this is OK.
1956 return (struct rt6_info
*) neigh
;
1958 rt
->rt6i_nexthop
= neigh
;
1960 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1961 rt
->rt6i_dst
.plen
= 128;
1962 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
1964 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
1969 struct arg_dev_net
{
1970 struct net_device
*dev
;
1974 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
1976 struct net_device
*dev
= ((struct arg_dev_net
*)arg
)->dev
;
1977 struct net
*net
= ((struct arg_dev_net
*)arg
)->net
;
1979 if (((void *)rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
1980 rt
!= net
->ipv6
.ip6_null_entry
) {
1981 RT6_TRACE("deleted by ifdown %p\n", rt
);
1987 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
1989 struct arg_dev_net adn
= {
1994 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
1995 icmp6_clean_all(fib6_ifdown
, &adn
);
1998 struct rt6_mtu_change_arg
2000 struct net_device
*dev
;
2004 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2006 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2007 struct inet6_dev
*idev
;
2008 struct net
*net
= dev_net(arg
->dev
);
2010 /* In IPv6 pmtu discovery is not optional,
2011 so that RTAX_MTU lock cannot disable it.
2012 We still use this lock to block changes
2013 caused by addrconf/ndisc.
2016 idev
= __in6_dev_get(arg
->dev
);
2020 /* For administrative MTU increase, there is no way to discover
2021 IPv6 PMTU increase, so PMTU increase should be updated here.
2022 Since RFC 1981 doesn't include administrative MTU increase
2023 update PMTU increase is a MUST. (i.e. jumbo frame)
2026 If new MTU is less than route PMTU, this new MTU will be the
2027 lowest MTU in the path, update the route PMTU to reflect PMTU
2028 decreases; if new MTU is greater than route PMTU, and the
2029 old MTU is the lowest MTU in the path, update the route PMTU
2030 to reflect the increase. In this case if the other nodes' MTU
2031 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2034 if (rt
->rt6i_dev
== arg
->dev
&&
2035 !dst_metric_locked(&rt
->u
.dst
, RTAX_MTU
) &&
2036 (dst_mtu(&rt
->u
.dst
) >= arg
->mtu
||
2037 (dst_mtu(&rt
->u
.dst
) < arg
->mtu
&&
2038 dst_mtu(&rt
->u
.dst
) == idev
->cnf
.mtu6
))) {
2039 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = arg
->mtu
;
2040 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, arg
->mtu
);
2045 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2047 struct rt6_mtu_change_arg arg
= {
2052 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2055 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2056 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2057 [RTA_OIF
] = { .type
= NLA_U32
},
2058 [RTA_IIF
] = { .type
= NLA_U32
},
2059 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2060 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2063 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2064 struct fib6_config
*cfg
)
2067 struct nlattr
*tb
[RTA_MAX
+1];
2070 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2075 rtm
= nlmsg_data(nlh
);
2076 memset(cfg
, 0, sizeof(*cfg
));
2078 cfg
->fc_table
= rtm
->rtm_table
;
2079 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2080 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2081 cfg
->fc_flags
= RTF_UP
;
2082 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2084 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2085 cfg
->fc_flags
|= RTF_REJECT
;
2087 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2088 cfg
->fc_nlinfo
.nlh
= nlh
;
2089 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2091 if (tb
[RTA_GATEWAY
]) {
2092 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2093 cfg
->fc_flags
|= RTF_GATEWAY
;
2097 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2099 if (nla_len(tb
[RTA_DST
]) < plen
)
2102 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2106 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2108 if (nla_len(tb
[RTA_SRC
]) < plen
)
2111 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2115 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2117 if (tb
[RTA_PRIORITY
])
2118 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2120 if (tb
[RTA_METRICS
]) {
2121 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2122 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2126 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2133 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2135 struct fib6_config cfg
;
2138 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2142 return ip6_route_del(&cfg
);
2145 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2147 struct fib6_config cfg
;
2150 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2154 return ip6_route_add(&cfg
);
2157 static inline size_t rt6_nlmsg_size(void)
2159 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2160 + nla_total_size(16) /* RTA_SRC */
2161 + nla_total_size(16) /* RTA_DST */
2162 + nla_total_size(16) /* RTA_GATEWAY */
2163 + nla_total_size(16) /* RTA_PREFSRC */
2164 + nla_total_size(4) /* RTA_TABLE */
2165 + nla_total_size(4) /* RTA_IIF */
2166 + nla_total_size(4) /* RTA_OIF */
2167 + nla_total_size(4) /* RTA_PRIORITY */
2168 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2169 + nla_total_size(sizeof(struct rta_cacheinfo
));
2172 static int rt6_fill_node(struct net
*net
,
2173 struct sk_buff
*skb
, struct rt6_info
*rt
,
2174 struct in6_addr
*dst
, struct in6_addr
*src
,
2175 int iif
, int type
, u32 pid
, u32 seq
,
2176 int prefix
, int nowait
, unsigned int flags
)
2179 struct nlmsghdr
*nlh
;
2183 if (prefix
) { /* user wants prefix routes only */
2184 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2185 /* success since this is not a prefix route */
2190 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2194 rtm
= nlmsg_data(nlh
);
2195 rtm
->rtm_family
= AF_INET6
;
2196 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2197 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2200 table
= rt
->rt6i_table
->tb6_id
;
2202 table
= RT6_TABLE_UNSPEC
;
2203 rtm
->rtm_table
= table
;
2204 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2205 if (rt
->rt6i_flags
&RTF_REJECT
)
2206 rtm
->rtm_type
= RTN_UNREACHABLE
;
2207 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2208 rtm
->rtm_type
= RTN_LOCAL
;
2210 rtm
->rtm_type
= RTN_UNICAST
;
2212 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2213 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2214 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2215 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2216 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2217 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2218 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2219 rtm
->rtm_protocol
= RTPROT_RA
;
2221 if (rt
->rt6i_flags
&RTF_CACHE
)
2222 rtm
->rtm_flags
|= RTM_F_CLONED
;
2225 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2226 rtm
->rtm_dst_len
= 128;
2227 } else if (rtm
->rtm_dst_len
)
2228 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2229 #ifdef CONFIG_IPV6_SUBTREES
2231 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2232 rtm
->rtm_src_len
= 128;
2233 } else if (rtm
->rtm_src_len
)
2234 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2237 #ifdef CONFIG_IPV6_MROUTE
2238 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2239 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2244 goto nla_put_failure
;
2246 if (err
== -EMSGSIZE
)
2247 goto nla_put_failure
;
2252 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2254 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->u
.dst
);
2255 struct in6_addr saddr_buf
;
2256 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2257 dst
, 0, &saddr_buf
) == 0)
2258 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2261 if (rtnetlink_put_metrics(skb
, rt
->u
.dst
.metrics
) < 0)
2262 goto nla_put_failure
;
2264 if (rt
->u
.dst
.neighbour
)
2265 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->u
.dst
.neighbour
->primary_key
);
2268 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2270 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2272 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2274 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2275 expires
= rt
->rt6i_expires
- jiffies
;
2279 if (rtnl_put_cacheinfo(skb
, &rt
->u
.dst
, 0, 0, 0,
2280 expires
, rt
->u
.dst
.error
) < 0)
2281 goto nla_put_failure
;
2283 return nlmsg_end(skb
, nlh
);
2286 nlmsg_cancel(skb
, nlh
);
2290 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2292 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2295 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2296 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2297 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2301 return rt6_fill_node(arg
->net
,
2302 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2303 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2304 prefix
, 0, NLM_F_MULTI
);
2307 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2309 struct net
*net
= sock_net(in_skb
->sk
);
2310 struct nlattr
*tb
[RTA_MAX
+1];
2311 struct rt6_info
*rt
;
2312 struct sk_buff
*skb
;
2317 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2322 memset(&fl
, 0, sizeof(fl
));
2325 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2328 ipv6_addr_copy(&fl
.fl6_src
, nla_data(tb
[RTA_SRC
]));
2332 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2335 ipv6_addr_copy(&fl
.fl6_dst
, nla_data(tb
[RTA_DST
]));
2339 iif
= nla_get_u32(tb
[RTA_IIF
]);
2342 fl
.oif
= nla_get_u32(tb
[RTA_OIF
]);
2345 struct net_device
*dev
;
2346 dev
= __dev_get_by_index(net
, iif
);
2353 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2359 /* Reserve room for dummy headers, this skb can pass
2360 through good chunk of routing engine.
2362 skb_reset_mac_header(skb
);
2363 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2365 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl
);
2366 skb
->dst
= &rt
->u
.dst
;
2368 err
= rt6_fill_node(net
, skb
, rt
, &fl
.fl6_dst
, &fl
.fl6_src
, iif
,
2369 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2370 nlh
->nlmsg_seq
, 0, 0, 0);
2376 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2381 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2383 struct sk_buff
*skb
;
2384 struct net
*net
= info
->nl_net
;
2389 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2391 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2395 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2396 event
, info
->pid
, seq
, 0, 0, 0);
2398 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2399 WARN_ON(err
== -EMSGSIZE
);
2403 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2404 info
->nlh
, gfp_any());
2408 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2411 static int ip6_route_dev_notify(struct notifier_block
*this,
2412 unsigned long event
, void *data
)
2414 struct net_device
*dev
= (struct net_device
*)data
;
2415 struct net
*net
= dev_net(dev
);
2417 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2418 net
->ipv6
.ip6_null_entry
->u
.dst
.dev
= dev
;
2419 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2420 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2421 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.dev
= dev
;
2422 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2423 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.dev
= dev
;
2424 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2435 #ifdef CONFIG_PROC_FS
2437 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2448 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2450 struct seq_file
*m
= p_arg
;
2452 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2454 #ifdef CONFIG_IPV6_SUBTREES
2455 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2457 seq_puts(m
, "00000000000000000000000000000000 00 ");
2460 if (rt
->rt6i_nexthop
) {
2461 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2463 seq_puts(m
, "00000000000000000000000000000000");
2465 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2466 rt
->rt6i_metric
, atomic_read(&rt
->u
.dst
.__refcnt
),
2467 rt
->u
.dst
.__use
, rt
->rt6i_flags
,
2468 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2472 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2474 struct net
*net
= (struct net
*)m
->private;
2475 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2479 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2481 return single_open_net(inode
, file
, ipv6_route_show
);
2484 static const struct file_operations ipv6_route_proc_fops
= {
2485 .owner
= THIS_MODULE
,
2486 .open
= ipv6_route_open
,
2488 .llseek
= seq_lseek
,
2489 .release
= single_release_net
,
2492 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2494 struct net
*net
= (struct net
*)seq
->private;
2495 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2496 net
->ipv6
.rt6_stats
->fib_nodes
,
2497 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2498 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2499 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2500 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2501 atomic_read(&net
->ipv6
.ip6_dst_ops
->entries
),
2502 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2507 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2509 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2512 static const struct file_operations rt6_stats_seq_fops
= {
2513 .owner
= THIS_MODULE
,
2514 .open
= rt6_stats_seq_open
,
2516 .llseek
= seq_lseek
,
2517 .release
= single_release_net
,
2519 #endif /* CONFIG_PROC_FS */
2521 #ifdef CONFIG_SYSCTL
2524 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
, struct file
* filp
,
2525 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2527 struct net
*net
= current
->nsproxy
->net_ns
;
2528 int delay
= net
->ipv6
.sysctl
.flush_delay
;
2530 proc_dointvec(ctl
, write
, filp
, buffer
, lenp
, ppos
);
2531 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2537 ctl_table ipv6_route_table_template
[] = {
2539 .procname
= "flush",
2540 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2541 .maxlen
= sizeof(int),
2543 .proc_handler
= ipv6_sysctl_rtcache_flush
2546 .ctl_name
= NET_IPV6_ROUTE_GC_THRESH
,
2547 .procname
= "gc_thresh",
2548 .data
= &ip6_dst_ops_template
.gc_thresh
,
2549 .maxlen
= sizeof(int),
2551 .proc_handler
= proc_dointvec
,
2554 .ctl_name
= NET_IPV6_ROUTE_MAX_SIZE
,
2555 .procname
= "max_size",
2556 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2557 .maxlen
= sizeof(int),
2559 .proc_handler
= proc_dointvec
,
2562 .ctl_name
= NET_IPV6_ROUTE_GC_MIN_INTERVAL
,
2563 .procname
= "gc_min_interval",
2564 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2565 .maxlen
= sizeof(int),
2567 .proc_handler
= proc_dointvec_jiffies
,
2568 .strategy
= sysctl_jiffies
,
2571 .ctl_name
= NET_IPV6_ROUTE_GC_TIMEOUT
,
2572 .procname
= "gc_timeout",
2573 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2574 .maxlen
= sizeof(int),
2576 .proc_handler
= proc_dointvec_jiffies
,
2577 .strategy
= sysctl_jiffies
,
2580 .ctl_name
= NET_IPV6_ROUTE_GC_INTERVAL
,
2581 .procname
= "gc_interval",
2582 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2583 .maxlen
= sizeof(int),
2585 .proc_handler
= proc_dointvec_jiffies
,
2586 .strategy
= sysctl_jiffies
,
2589 .ctl_name
= NET_IPV6_ROUTE_GC_ELASTICITY
,
2590 .procname
= "gc_elasticity",
2591 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2592 .maxlen
= sizeof(int),
2594 .proc_handler
= proc_dointvec_jiffies
,
2595 .strategy
= sysctl_jiffies
,
2598 .ctl_name
= NET_IPV6_ROUTE_MTU_EXPIRES
,
2599 .procname
= "mtu_expires",
2600 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2601 .maxlen
= sizeof(int),
2603 .proc_handler
= proc_dointvec_jiffies
,
2604 .strategy
= sysctl_jiffies
,
2607 .ctl_name
= NET_IPV6_ROUTE_MIN_ADVMSS
,
2608 .procname
= "min_adv_mss",
2609 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2610 .maxlen
= sizeof(int),
2612 .proc_handler
= proc_dointvec_jiffies
,
2613 .strategy
= sysctl_jiffies
,
2616 .ctl_name
= NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS
,
2617 .procname
= "gc_min_interval_ms",
2618 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2619 .maxlen
= sizeof(int),
2621 .proc_handler
= proc_dointvec_ms_jiffies
,
2622 .strategy
= sysctl_ms_jiffies
,
2627 struct ctl_table
*ipv6_route_sysctl_init(struct net
*net
)
2629 struct ctl_table
*table
;
2631 table
= kmemdup(ipv6_route_table_template
,
2632 sizeof(ipv6_route_table_template
),
2636 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2637 table
[1].data
= &net
->ipv6
.ip6_dst_ops
->gc_thresh
;
2638 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2639 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2640 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2641 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2642 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2643 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2644 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2651 static int ip6_route_net_init(struct net
*net
)
2655 net
->ipv6
.ip6_dst_ops
= kmemdup(&ip6_dst_ops_template
,
2656 sizeof(*net
->ipv6
.ip6_dst_ops
),
2658 if (!net
->ipv6
.ip6_dst_ops
)
2660 net
->ipv6
.ip6_dst_ops
->dst_net
= hold_net(net
);
2662 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2663 sizeof(*net
->ipv6
.ip6_null_entry
),
2665 if (!net
->ipv6
.ip6_null_entry
)
2666 goto out_ip6_dst_ops
;
2667 net
->ipv6
.ip6_null_entry
->u
.dst
.path
=
2668 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2669 net
->ipv6
.ip6_null_entry
->u
.dst
.ops
= net
->ipv6
.ip6_dst_ops
;
2671 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2672 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2673 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2675 if (!net
->ipv6
.ip6_prohibit_entry
)
2676 goto out_ip6_null_entry
;
2677 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.path
=
2678 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2679 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.ops
= net
->ipv6
.ip6_dst_ops
;
2681 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2682 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2684 if (!net
->ipv6
.ip6_blk_hole_entry
)
2685 goto out_ip6_prohibit_entry
;
2686 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.path
=
2687 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2688 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.ops
= net
->ipv6
.ip6_dst_ops
;
2691 net
->ipv6
.sysctl
.flush_delay
= 0;
2692 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2693 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2694 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2695 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2696 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2697 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2698 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2700 #ifdef CONFIG_PROC_FS
2701 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2702 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2704 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2710 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2711 out_ip6_prohibit_entry
:
2712 kfree(net
->ipv6
.ip6_prohibit_entry
);
2714 kfree(net
->ipv6
.ip6_null_entry
);
2717 release_net(net
->ipv6
.ip6_dst_ops
->dst_net
);
2718 kfree(net
->ipv6
.ip6_dst_ops
);
2722 static void ip6_route_net_exit(struct net
*net
)
2724 #ifdef CONFIG_PROC_FS
2725 proc_net_remove(net
, "ipv6_route");
2726 proc_net_remove(net
, "rt6_stats");
2728 kfree(net
->ipv6
.ip6_null_entry
);
2729 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2730 kfree(net
->ipv6
.ip6_prohibit_entry
);
2731 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2733 release_net(net
->ipv6
.ip6_dst_ops
->dst_net
);
2734 kfree(net
->ipv6
.ip6_dst_ops
);
2737 static struct pernet_operations ip6_route_net_ops
= {
2738 .init
= ip6_route_net_init
,
2739 .exit
= ip6_route_net_exit
,
2742 static struct notifier_block ip6_route_dev_notifier
= {
2743 .notifier_call
= ip6_route_dev_notify
,
2747 int __init
ip6_route_init(void)
2752 ip6_dst_ops_template
.kmem_cachep
=
2753 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2754 SLAB_HWCACHE_ALIGN
, NULL
);
2755 if (!ip6_dst_ops_template
.kmem_cachep
)
2758 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2760 goto out_kmem_cache
;
2762 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2764 /* Registering of the loopback is done before this portion of code,
2765 * the loopback reference in rt6_info will not be taken, do it
2766 * manually for init_net */
2767 init_net
.ipv6
.ip6_null_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2768 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2769 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2770 init_net
.ipv6
.ip6_prohibit_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2771 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2772 init_net
.ipv6
.ip6_blk_hole_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2773 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2777 goto out_register_subsys
;
2783 ret
= fib6_rules_init();
2788 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2789 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2790 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2791 goto fib6_rules_init
;
2793 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2795 goto fib6_rules_init
;
2801 fib6_rules_cleanup();
2806 out_register_subsys
:
2807 unregister_pernet_subsys(&ip6_route_net_ops
);
2809 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2813 void ip6_route_cleanup(void)
2815 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2816 fib6_rules_cleanup();
2819 unregister_pernet_subsys(&ip6_route_net_ops
);
2820 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);