2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
51 #include <linux/rtnetlink.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
57 #include <asm/uaccess.h>
60 #include <linux/sysctl.h>
63 /* Set to 3 to get tracing. */
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #define RT6_TRACE(x...) do { ; } while (0)
74 #define CLONE_OFFLINK_ROUTE 0
76 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
77 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
78 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
79 static void ip6_dst_destroy(struct dst_entry
*);
80 static void ip6_dst_ifdown(struct dst_entry
*,
81 struct net_device
*dev
, int how
);
82 static int ip6_dst_gc(struct dst_ops
*ops
);
84 static int ip6_pkt_discard(struct sk_buff
*skb
);
85 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
86 static void ip6_link_failure(struct sk_buff
*skb
);
87 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
91 struct in6_addr
*prefix
, int prefixlen
,
92 struct in6_addr
*gwaddr
, int ifindex
,
94 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
95 struct in6_addr
*prefix
, int prefixlen
,
96 struct in6_addr
*gwaddr
, int ifindex
);
99 static struct dst_ops ip6_dst_ops_template
= {
101 .protocol
= cpu_to_be16(ETH_P_IPV6
),
104 .check
= ip6_dst_check
,
105 .destroy
= ip6_dst_destroy
,
106 .ifdown
= ip6_dst_ifdown
,
107 .negative_advice
= ip6_negative_advice
,
108 .link_failure
= ip6_link_failure
,
109 .update_pmtu
= ip6_rt_update_pmtu
,
110 .local_out
= __ip6_local_out
,
111 .entries
= ATOMIC_INIT(0),
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
118 static struct dst_ops ip6_dst_blackhole_ops
= {
120 .protocol
= cpu_to_be16(ETH_P_IPV6
),
121 .destroy
= ip6_dst_destroy
,
122 .check
= ip6_dst_check
,
123 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
124 .entries
= ATOMIC_INIT(0),
127 static struct rt6_info ip6_null_entry_template
= {
130 .__refcnt
= ATOMIC_INIT(1),
133 .error
= -ENETUNREACH
,
134 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
135 .input
= ip6_pkt_discard
,
136 .output
= ip6_pkt_discard_out
,
139 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
140 .rt6i_protocol
= RTPROT_KERNEL
,
141 .rt6i_metric
= ~(u32
) 0,
142 .rt6i_ref
= ATOMIC_INIT(1),
145 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
147 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
148 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
);
150 static struct rt6_info ip6_prohibit_entry_template
= {
153 .__refcnt
= ATOMIC_INIT(1),
157 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
158 .input
= ip6_pkt_prohibit
,
159 .output
= ip6_pkt_prohibit_out
,
162 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
163 .rt6i_protocol
= RTPROT_KERNEL
,
164 .rt6i_metric
= ~(u32
) 0,
165 .rt6i_ref
= ATOMIC_INIT(1),
168 static struct rt6_info ip6_blk_hole_entry_template
= {
171 .__refcnt
= ATOMIC_INIT(1),
175 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
176 .input
= dst_discard
,
177 .output
= dst_discard
,
180 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
181 .rt6i_protocol
= RTPROT_KERNEL
,
182 .rt6i_metric
= ~(u32
) 0,
183 .rt6i_ref
= ATOMIC_INIT(1),
/*
 * Allocate a fresh rt6_info backed by the given dst_ops
 * (normally &net->ipv6.ip6_dst_ops).  Returns NULL when the
 * underlying dst allocator fails.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *dst = dst_alloc(ops);

	return (struct rt6_info *)dst;
}
194 static void ip6_dst_destroy(struct dst_entry
*dst
)
196 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
197 struct inet6_dev
*idev
= rt
->rt6i_idev
;
200 rt
->rt6i_idev
= NULL
;
205 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
208 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
209 struct inet6_dev
*idev
= rt
->rt6i_idev
;
210 struct net_device
*loopback_dev
=
211 dev_net(dev
)->loopback_dev
;
213 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
214 struct inet6_dev
*loopback_idev
=
215 in6_dev_get(loopback_dev
);
216 if (loopback_idev
!= NULL
) {
217 rt
->rt6i_idev
= loopback_idev
;
223 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
225 return (rt
->rt6i_flags
& RTF_EXPIRES
&&
226 time_after(jiffies
, rt
->rt6i_expires
));
229 static inline int rt6_need_strict(struct in6_addr
*daddr
)
231 return (ipv6_addr_type(daddr
) &
232 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
));
236 * Route lookup. Any table->tb6_lock is implied.
239 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
241 struct in6_addr
*saddr
,
245 struct rt6_info
*local
= NULL
;
246 struct rt6_info
*sprt
;
248 if (!oif
&& ipv6_addr_any(saddr
))
251 for (sprt
= rt
; sprt
; sprt
= sprt
->u
.dst
.rt6_next
) {
252 struct net_device
*dev
= sprt
->rt6i_dev
;
255 if (dev
->ifindex
== oif
)
257 if (dev
->flags
& IFF_LOOPBACK
) {
258 if (sprt
->rt6i_idev
== NULL
||
259 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
260 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
262 if (local
&& (!oif
||
263 local
->rt6i_idev
->dev
->ifindex
== oif
))
269 if (ipv6_chk_addr(net
, saddr
, dev
,
270 flags
& RT6_LOOKUP_F_IFACE
))
279 if (flags
& RT6_LOOKUP_F_IFACE
)
280 return net
->ipv6
.ip6_null_entry
;
286 #ifdef CONFIG_IPV6_ROUTER_PREF
287 static void rt6_probe(struct rt6_info
*rt
)
289 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
291 * Okay, this does not seem to be appropriate
292 * for now, however, we need to check if it
293 * is really so; aka Router Reachability Probing.
295 * Router Reachability Probe MUST be rate-limited
296 * to no more than one per minute.
298 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
300 read_lock_bh(&neigh
->lock
);
301 if (!(neigh
->nud_state
& NUD_VALID
) &&
302 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
303 struct in6_addr mcaddr
;
304 struct in6_addr
*target
;
306 neigh
->updated
= jiffies
;
307 read_unlock_bh(&neigh
->lock
);
309 target
= (struct in6_addr
*)&neigh
->primary_key
;
310 addrconf_addr_solict_mult(target
, &mcaddr
);
311 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
313 read_unlock_bh(&neigh
->lock
);
316 static inline void rt6_probe(struct rt6_info
*rt
)
323 * Default Router Selection (RFC 2461 6.3.6)
325 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
327 struct net_device
*dev
= rt
->rt6i_dev
;
328 if (!oif
|| dev
->ifindex
== oif
)
330 if ((dev
->flags
& IFF_LOOPBACK
) &&
331 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
336 static inline int rt6_check_neigh(struct rt6_info
*rt
)
338 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
340 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
341 !(rt
->rt6i_flags
& RTF_GATEWAY
))
344 read_lock_bh(&neigh
->lock
);
345 if (neigh
->nud_state
& NUD_VALID
)
347 #ifdef CONFIG_IPV6_ROUTER_PREF
348 else if (neigh
->nud_state
& NUD_FAILED
)
353 read_unlock_bh(&neigh
->lock
);
359 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
364 m
= rt6_check_dev(rt
, oif
);
365 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
367 #ifdef CONFIG_IPV6_ROUTER_PREF
368 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
370 n
= rt6_check_neigh(rt
);
371 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
376 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
377 int *mpri
, struct rt6_info
*match
)
381 if (rt6_check_expired(rt
))
384 m
= rt6_score_route(rt
, oif
, strict
);
389 if (strict
& RT6_LOOKUP_F_REACHABLE
)
393 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
401 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
402 struct rt6_info
*rr_head
,
403 u32 metric
, int oif
, int strict
)
405 struct rt6_info
*rt
, *match
;
409 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
410 rt
= rt
->u
.dst
.rt6_next
)
411 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
412 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
413 rt
= rt
->u
.dst
.rt6_next
)
414 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
419 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
421 struct rt6_info
*match
, *rt0
;
424 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
425 __func__
, fn
->leaf
, oif
);
429 fn
->rr_ptr
= rt0
= fn
->leaf
;
431 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
434 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
435 struct rt6_info
*next
= rt0
->u
.dst
.rt6_next
;
437 /* no entries matched; do round-robin */
438 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
445 RT6_TRACE("%s() => %p\n",
448 net
= dev_net(rt0
->rt6i_dev
);
449 return (match
? match
: net
->ipv6
.ip6_null_entry
);
452 #ifdef CONFIG_IPV6_ROUTE_INFO
453 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
454 struct in6_addr
*gwaddr
)
456 struct net
*net
= dev_net(dev
);
457 struct route_info
*rinfo
= (struct route_info
*) opt
;
458 struct in6_addr prefix_buf
, *prefix
;
460 unsigned long lifetime
;
463 if (len
< sizeof(struct route_info
)) {
467 /* Sanity check for prefix_len and length */
468 if (rinfo
->length
> 3) {
470 } else if (rinfo
->prefix_len
> 128) {
472 } else if (rinfo
->prefix_len
> 64) {
473 if (rinfo
->length
< 2) {
476 } else if (rinfo
->prefix_len
> 0) {
477 if (rinfo
->length
< 1) {
482 pref
= rinfo
->route_pref
;
483 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
486 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
488 if (rinfo
->length
== 3)
489 prefix
= (struct in6_addr
*)rinfo
->prefix
;
491 /* this function is safe */
492 ipv6_addr_prefix(&prefix_buf
,
493 (struct in6_addr
*)rinfo
->prefix
,
495 prefix
= &prefix_buf
;
498 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
501 if (rt
&& !lifetime
) {
507 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
510 rt
->rt6i_flags
= RTF_ROUTEINFO
|
511 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
514 if (!addrconf_finite_timeout(lifetime
)) {
515 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
517 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
518 rt
->rt6i_flags
|= RTF_EXPIRES
;
520 dst_release(&rt
->u
.dst
);
526 #define BACKTRACK(__net, saddr) \
528 if (rt == __net->ipv6.ip6_null_entry) { \
529 struct fib6_node *pn; \
531 if (fn->fn_flags & RTN_TL_ROOT) \
534 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
535 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
538 if (fn->fn_flags & RTN_RTINFO) \
544 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
545 struct fib6_table
*table
,
546 struct flowi
*fl
, int flags
)
548 struct fib6_node
*fn
;
551 read_lock_bh(&table
->tb6_lock
);
552 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
555 rt
= rt6_device_match(net
, rt
, &fl
->fl6_src
, fl
->oif
, flags
);
556 BACKTRACK(net
, &fl
->fl6_src
);
558 dst_use(&rt
->u
.dst
, jiffies
);
559 read_unlock_bh(&table
->tb6_lock
);
564 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
565 const struct in6_addr
*saddr
, int oif
, int strict
)
575 struct dst_entry
*dst
;
576 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
579 memcpy(&fl
.fl6_src
, saddr
, sizeof(*saddr
));
580 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
583 dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_lookup
);
585 return (struct rt6_info
*) dst
;
592 EXPORT_SYMBOL(rt6_lookup
);
/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes new route entry, the addition fails by any reason the
   route is freed. In any case, if caller does not hold it, it may
   damage it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	/* Serialize against other writers of this table; fib6_add()
	 * frees the route itself on failure (see comment above). */
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info);
	write_unlock_bh(&table->tb6_lock);

	return err;
}
/* Insert @rt into its FIB table, with netlink notification info taken
 * from the route's own network namespace. */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->rt6i_dev),
	};
	return __ip6_ins_rt(rt, &info);
}
621 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
622 struct in6_addr
*saddr
)
630 rt
= ip6_rt_copy(ort
);
633 struct neighbour
*neigh
;
634 int attempts
= !in_softirq();
636 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
637 if (rt
->rt6i_dst
.plen
!= 128 &&
638 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
639 rt
->rt6i_flags
|= RTF_ANYCAST
;
640 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
643 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
644 rt
->rt6i_dst
.plen
= 128;
645 rt
->rt6i_flags
|= RTF_CACHE
;
646 rt
->u
.dst
.flags
|= DST_HOST
;
648 #ifdef CONFIG_IPV6_SUBTREES
649 if (rt
->rt6i_src
.plen
&& saddr
) {
650 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
651 rt
->rt6i_src
.plen
= 128;
656 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
658 struct net
*net
= dev_net(rt
->rt6i_dev
);
659 int saved_rt_min_interval
=
660 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
661 int saved_rt_elasticity
=
662 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
664 if (attempts
-- > 0) {
665 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
666 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
668 ip6_dst_gc(&net
->ipv6
.ip6_dst_ops
);
670 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
672 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
673 saved_rt_min_interval
;
679 "Neighbour table overflow.\n");
680 dst_free(&rt
->u
.dst
);
683 rt
->rt6i_nexthop
= neigh
;
690 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
692 struct rt6_info
*rt
= ip6_rt_copy(ort
);
694 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
695 rt
->rt6i_dst
.plen
= 128;
696 rt
->rt6i_flags
|= RTF_CACHE
;
697 rt
->u
.dst
.flags
|= DST_HOST
;
698 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
703 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
704 struct flowi
*fl
, int flags
)
706 struct fib6_node
*fn
;
707 struct rt6_info
*rt
, *nrt
;
711 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
713 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
716 read_lock_bh(&table
->tb6_lock
);
719 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
722 rt
= rt6_select(fn
, oif
, strict
| reachable
);
724 BACKTRACK(net
, &fl
->fl6_src
);
725 if (rt
== net
->ipv6
.ip6_null_entry
||
726 rt
->rt6i_flags
& RTF_CACHE
)
729 dst_hold(&rt
->u
.dst
);
730 read_unlock_bh(&table
->tb6_lock
);
732 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
733 nrt
= rt6_alloc_cow(rt
, &fl
->fl6_dst
, &fl
->fl6_src
);
735 #if CLONE_OFFLINK_ROUTE
736 nrt
= rt6_alloc_clone(rt
, &fl
->fl6_dst
);
742 dst_release(&rt
->u
.dst
);
743 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
745 dst_hold(&rt
->u
.dst
);
747 err
= ip6_ins_rt(nrt
);
756 * Race condition! In the gap, when table->tb6_lock was
757 * released someone could insert this route. Relookup.
759 dst_release(&rt
->u
.dst
);
767 dst_hold(&rt
->u
.dst
);
768 read_unlock_bh(&table
->tb6_lock
);
770 rt
->u
.dst
.lastuse
= jiffies
;
/* Policy-routing lookup hook for the input path: route using the
 * packet's incoming interface (fl->iif). */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi *fl, int flags)
{
	return ip6_pol_route(net, table, fl->iif, fl, flags);
}
782 void ip6_route_input(struct sk_buff
*skb
)
784 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
785 struct net
*net
= dev_net(skb
->dev
);
786 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
788 .iif
= skb
->dev
->ifindex
,
793 .flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
797 .proto
= iph
->nexthdr
,
800 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
801 flags
|= RT6_LOOKUP_F_IFACE
;
803 skb_dst_set(skb
, fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_input
));
/* Policy-routing lookup hook for the output path: route using the
 * flow's outgoing interface (fl->oif). */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi *fl, int flags)
{
	return ip6_pol_route(net, table, fl->oif, fl, flags);
}
812 struct dst_entry
* ip6_route_output(struct net
*net
, struct sock
*sk
,
817 if (rt6_need_strict(&fl
->fl6_dst
))
818 flags
|= RT6_LOOKUP_F_IFACE
;
820 if (!ipv6_addr_any(&fl
->fl6_src
))
821 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
823 unsigned int prefs
= inet6_sk(sk
)->srcprefs
;
824 if (prefs
& IPV6_PREFER_SRC_TMP
)
825 flags
|= RT6_LOOKUP_F_SRCPREF_TMP
;
826 if (prefs
& IPV6_PREFER_SRC_PUBLIC
)
827 flags
|= RT6_LOOKUP_F_SRCPREF_PUBLIC
;
828 if (prefs
& IPV6_PREFER_SRC_COA
)
829 flags
|= RT6_LOOKUP_F_SRCPREF_COA
;
832 return fib6_rule_lookup(net
, fl
, flags
, ip6_pol_route_output
);
835 EXPORT_SYMBOL(ip6_route_output
);
837 int ip6_dst_blackhole(struct sock
*sk
, struct dst_entry
**dstp
, struct flowi
*fl
)
839 struct rt6_info
*ort
= (struct rt6_info
*) *dstp
;
840 struct rt6_info
*rt
= (struct rt6_info
*)
841 dst_alloc(&ip6_dst_blackhole_ops
);
842 struct dst_entry
*new = NULL
;
847 atomic_set(&new->__refcnt
, 1);
849 new->input
= dst_discard
;
850 new->output
= dst_discard
;
852 memcpy(new->metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
853 new->dev
= ort
->u
.dst
.dev
;
856 rt
->rt6i_idev
= ort
->rt6i_idev
;
858 in6_dev_hold(rt
->rt6i_idev
);
859 rt
->rt6i_expires
= 0;
861 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
862 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
865 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
866 #ifdef CONFIG_IPV6_SUBTREES
867 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
875 return (new ? 0 : -ENOMEM
);
877 EXPORT_SYMBOL_GPL(ip6_dst_blackhole
);
880 * Destination cache support functions
883 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
887 rt
= (struct rt6_info
*) dst
;
889 if (rt
&& rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
))
895 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
897 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
900 if (rt
->rt6i_flags
& RTF_CACHE
) {
901 if (rt6_check_expired(rt
)) {
913 static void ip6_link_failure(struct sk_buff
*skb
)
917 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0, skb
->dev
);
919 rt
= (struct rt6_info
*) skb_dst(skb
);
921 if (rt
->rt6i_flags
&RTF_CACHE
) {
922 dst_set_expires(&rt
->u
.dst
, 0);
923 rt
->rt6i_flags
|= RTF_EXPIRES
;
924 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
925 rt
->rt6i_node
->fn_sernum
= -1;
929 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
931 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
933 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
934 rt6
->rt6i_flags
|= RTF_MODIFIED
;
935 if (mtu
< IPV6_MIN_MTU
) {
937 dst
->metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
939 dst
->metrics
[RTAX_MTU
-1] = mtu
;
940 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
944 static int ipv6_get_mtu(struct net_device
*dev
);
/* Derive the advertised MSS from a path MTU: subtract the fixed
 * IPv6 + TCP header overhead, then clamp to the per-netns minimum
 * and to the non-jumbogram maximum. */
static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
{
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
964 static struct dst_entry
*icmp6_dst_gc_list
;
965 static DEFINE_SPINLOCK(icmp6_dst_lock
);
967 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
968 struct neighbour
*neigh
,
969 const struct in6_addr
*addr
)
972 struct inet6_dev
*idev
= in6_dev_get(dev
);
973 struct net
*net
= dev_net(dev
);
975 if (unlikely(idev
== NULL
))
978 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
979 if (unlikely(rt
== NULL
)) {
988 neigh
= ndisc_get_neigh(dev
, addr
);
994 rt
->rt6i_idev
= idev
;
995 rt
->rt6i_nexthop
= neigh
;
996 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
997 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = 255;
998 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
999 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1000 rt
->u
.dst
.output
= ip6_output
;
1002 #if 0 /* there's no chance to use these for ndisc */
1003 rt
->u
.dst
.flags
= ipv6_addr_type(addr
) & IPV6_ADDR_UNICAST
1006 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1007 rt
->rt6i_dst
.plen
= 128;
1010 spin_lock_bh(&icmp6_dst_lock
);
1011 rt
->u
.dst
.next
= icmp6_dst_gc_list
;
1012 icmp6_dst_gc_list
= &rt
->u
.dst
;
1013 spin_unlock_bh(&icmp6_dst_lock
);
1015 fib6_force_start_gc(net
);
1021 int icmp6_dst_gc(void)
1023 struct dst_entry
*dst
, *next
, **pprev
;
1028 spin_lock_bh(&icmp6_dst_lock
);
1029 pprev
= &icmp6_dst_gc_list
;
1031 while ((dst
= *pprev
) != NULL
) {
1032 if (!atomic_read(&dst
->__refcnt
)) {
1041 spin_unlock_bh(&icmp6_dst_lock
);
1046 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1049 struct dst_entry
*dst
, **pprev
;
1051 spin_lock_bh(&icmp6_dst_lock
);
1052 pprev
= &icmp6_dst_gc_list
;
1053 while ((dst
= *pprev
) != NULL
) {
1054 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1055 if (func(rt
, arg
)) {
1062 spin_unlock_bh(&icmp6_dst_lock
);
/* dst_ops garbage-collection callback for the IPv6 routing cache.
 * Returns nonzero when the cache is still over its size limit after
 * the pass (telling dst_alloc() to fail). */
static int ip6_dst_gc(struct dst_ops *ops)
{
	unsigned long now = jiffies;
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;

	/* Rate-limit: skip a full GC pass if one ran recently and the
	 * cache is still within its size budget. */
	if (time_after(rt_last_gc + rt_min_interval, now) &&
	    atomic_read(&ops->entries) <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
	net->ipv6.ip6_rt_last_gc = now;
	/* Cache shrank below the threshold: relax the expire pressure. */
	if (atomic_read(&ops->entries) < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponentially decay the GC aggressiveness between passes. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return (atomic_read(&ops->entries) > rt_max_size);
}
1089 /* Clean host part of a prefix. Not necessary in radix tree,
1090 but results in cleaner routing tables.
1092 Remove it only when all the things will work!
1095 static int ipv6_get_mtu(struct net_device
*dev
)
1097 int mtu
= IPV6_MIN_MTU
;
1098 struct inet6_dev
*idev
;
1100 idev
= in6_dev_get(dev
);
1102 mtu
= idev
->cnf
.mtu6
;
1108 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1110 int hoplimit
= dst_metric(dst
, RTAX_HOPLIMIT
);
1112 struct net_device
*dev
= dst
->dev
;
1113 struct inet6_dev
*idev
= in6_dev_get(dev
);
1115 hoplimit
= idev
->cnf
.hop_limit
;
1118 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1127 int ip6_route_add(struct fib6_config
*cfg
)
1130 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1131 struct rt6_info
*rt
= NULL
;
1132 struct net_device
*dev
= NULL
;
1133 struct inet6_dev
*idev
= NULL
;
1134 struct fib6_table
*table
;
1137 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1139 #ifndef CONFIG_IPV6_SUBTREES
1140 if (cfg
->fc_src_len
)
1143 if (cfg
->fc_ifindex
) {
1145 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1148 idev
= in6_dev_get(dev
);
1153 if (cfg
->fc_metric
== 0)
1154 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1156 table
= fib6_new_table(net
, cfg
->fc_table
);
1157 if (table
== NULL
) {
1162 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1169 rt
->u
.dst
.obsolete
= -1;
1170 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1171 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1174 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1175 cfg
->fc_protocol
= RTPROT_BOOT
;
1176 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1178 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1180 if (addr_type
& IPV6_ADDR_MULTICAST
)
1181 rt
->u
.dst
.input
= ip6_mc_input
;
1183 rt
->u
.dst
.input
= ip6_forward
;
1185 rt
->u
.dst
.output
= ip6_output
;
1187 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1188 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1189 if (rt
->rt6i_dst
.plen
== 128)
1190 rt
->u
.dst
.flags
= DST_HOST
;
1192 #ifdef CONFIG_IPV6_SUBTREES
1193 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1194 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1197 rt
->rt6i_metric
= cfg
->fc_metric
;
1199 /* We cannot add true routes via loopback here,
1200 they would result in kernel looping; promote them to reject routes
1202 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1203 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
))) {
1204 /* hold loopback dev/idev if we haven't done so. */
1205 if (dev
!= net
->loopback_dev
) {
1210 dev
= net
->loopback_dev
;
1212 idev
= in6_dev_get(dev
);
1218 rt
->u
.dst
.output
= ip6_pkt_discard_out
;
1219 rt
->u
.dst
.input
= ip6_pkt_discard
;
1220 rt
->u
.dst
.error
= -ENETUNREACH
;
1221 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1225 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1226 struct in6_addr
*gw_addr
;
1229 gw_addr
= &cfg
->fc_gateway
;
1230 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1231 gwa_type
= ipv6_addr_type(gw_addr
);
1233 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1234 struct rt6_info
*grt
;
1236 /* IPv6 strictly inhibits using not link-local
1237 addresses as nexthop address.
1238 Otherwise, router will not able to send redirects.
1239 It is very good, but in some (rare!) circumstances
1240 (SIT, PtP, NBMA NOARP links) it is handy to allow
1241 some exceptions. --ANK
1244 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1247 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1249 err
= -EHOSTUNREACH
;
1253 if (dev
!= grt
->rt6i_dev
) {
1254 dst_release(&grt
->u
.dst
);
1258 dev
= grt
->rt6i_dev
;
1259 idev
= grt
->rt6i_idev
;
1261 in6_dev_hold(grt
->rt6i_idev
);
1263 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1265 dst_release(&grt
->u
.dst
);
1271 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1279 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1280 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1281 if (IS_ERR(rt
->rt6i_nexthop
)) {
1282 err
= PTR_ERR(rt
->rt6i_nexthop
);
1283 rt
->rt6i_nexthop
= NULL
;
1288 rt
->rt6i_flags
= cfg
->fc_flags
;
1295 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1296 int type
= nla_type(nla
);
1299 if (type
> RTAX_MAX
) {
1304 rt
->u
.dst
.metrics
[type
- 1] = nla_get_u32(nla
);
1309 if (dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
) == 0)
1310 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1311 if (!dst_mtu(&rt
->u
.dst
))
1312 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(dev
);
1313 if (!dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
))
1314 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1315 rt
->u
.dst
.dev
= dev
;
1316 rt
->rt6i_idev
= idev
;
1317 rt
->rt6i_table
= table
;
1319 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1321 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1329 dst_free(&rt
->u
.dst
);
1333 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1336 struct fib6_table
*table
;
1337 struct net
*net
= dev_net(rt
->rt6i_dev
);
1339 if (rt
== net
->ipv6
.ip6_null_entry
)
1342 table
= rt
->rt6i_table
;
1343 write_lock_bh(&table
->tb6_lock
);
1345 err
= fib6_del(rt
, info
);
1346 dst_release(&rt
->u
.dst
);
1348 write_unlock_bh(&table
->tb6_lock
);
/* Delete @rt from its FIB table, with netlink notification info taken
 * from the route's own network namespace. */
int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->rt6i_dev),
	};
	return __ip6_del_rt(rt, &info);
}
1361 static int ip6_route_del(struct fib6_config
*cfg
)
1363 struct fib6_table
*table
;
1364 struct fib6_node
*fn
;
1365 struct rt6_info
*rt
;
1368 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1372 read_lock_bh(&table
->tb6_lock
);
1374 fn
= fib6_locate(&table
->tb6_root
,
1375 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1376 &cfg
->fc_src
, cfg
->fc_src_len
);
1379 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1380 if (cfg
->fc_ifindex
&&
1381 (rt
->rt6i_dev
== NULL
||
1382 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1384 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1385 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1387 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1389 dst_hold(&rt
->u
.dst
);
1390 read_unlock_bh(&table
->tb6_lock
);
1392 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1395 read_unlock_bh(&table
->tb6_lock
);
1403 struct ip6rd_flowi
{
1405 struct in6_addr gateway
;
1408 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1409 struct fib6_table
*table
,
1413 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl
;
1414 struct rt6_info
*rt
;
1415 struct fib6_node
*fn
;
1418 * Get the "current" route for this destination and
1419 * check if the redirect has come from the appropriate router.
1421 * RFC 2461 specifies that redirects should only be
1422 * accepted if they come from the nexthop to the target.
1423 * Due to the way the routes are chosen, this notion
1424 * is a bit fuzzy and one might need to check all possible
1428 read_lock_bh(&table
->tb6_lock
);
1429 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
1431 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1433 * Current route is on-link; redirect is always invalid.
1435 * Seems, previous statement is not true. It could
1436 * be node, which looks for us as on-link (f.e. proxy ndisc)
1437 * But then router serving it might decide, that we should
1438 * know truth 8)8) --ANK (980726).
1440 if (rt6_check_expired(rt
))
1442 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1444 if (fl
->oif
!= rt
->rt6i_dev
->ifindex
)
1446 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1452 rt
= net
->ipv6
.ip6_null_entry
;
1453 BACKTRACK(net
, &fl
->fl6_src
);
1455 dst_hold(&rt
->u
.dst
);
1457 read_unlock_bh(&table
->tb6_lock
);
1462 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1463 struct in6_addr
*src
,
1464 struct in6_addr
*gateway
,
1465 struct net_device
*dev
)
1467 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1468 struct net
*net
= dev_net(dev
);
1469 struct ip6rd_flowi rdfl
= {
1471 .oif
= dev
->ifindex
,
1481 ipv6_addr_copy(&rdfl
.gateway
, gateway
);
1483 if (rt6_need_strict(dest
))
1484 flags
|= RT6_LOOKUP_F_IFACE
;
1486 return (struct rt6_info
*)fib6_rule_lookup(net
, (struct flowi
*)&rdfl
,
1487 flags
, __ip6_route_redirect
);
1490 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1491 struct in6_addr
*saddr
,
1492 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1494 struct rt6_info
*rt
, *nrt
= NULL
;
1495 struct netevent_redirect netevent
;
1496 struct net
*net
= dev_net(neigh
->dev
);
1498 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1500 if (rt
== net
->ipv6
.ip6_null_entry
) {
1501 if (net_ratelimit())
1502 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1503 "for redirect target\n");
1508 * We have finally decided to accept it.
1511 neigh_update(neigh
, lladdr
, NUD_STALE
,
1512 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1513 NEIGH_UPDATE_F_OVERRIDE
|
1514 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1515 NEIGH_UPDATE_F_ISROUTER
))
1519 * Redirect received -> path was valid.
1520 * Look, redirects are sent only in response to data packets,
1521 * so that this nexthop apparently is reachable. --ANK
1523 dst_confirm(&rt
->u
.dst
);
1525 /* Duplicate redirect: silently ignore. */
1526 if (neigh
== rt
->u
.dst
.neighbour
)
1529 nrt
= ip6_rt_copy(rt
);
1533 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1535 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1537 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1538 nrt
->rt6i_dst
.plen
= 128;
1539 nrt
->u
.dst
.flags
|= DST_HOST
;
1541 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1542 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1543 /* Reset pmtu, it may be better */
1544 nrt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(neigh
->dev
);
1545 nrt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(dev_net(neigh
->dev
),
1546 dst_mtu(&nrt
->u
.dst
));
1548 if (ip6_ins_rt(nrt
))
1551 netevent
.old
= &rt
->u
.dst
;
1552 netevent
.new = &nrt
->u
.dst
;
1553 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1555 if (rt
->rt6i_flags
&RTF_CACHE
) {
1561 dst_release(&rt
->u
.dst
);
1566 * Handle ICMP "packet too big" messages
1567 * i.e. Path MTU discovery
1570 static void rt6_do_pmtu_disc(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1571 struct net
*net
, u32 pmtu
, int ifindex
)
1573 struct rt6_info
*rt
, *nrt
;
1576 rt
= rt6_lookup(net
, daddr
, saddr
, ifindex
, 0);
1580 if (pmtu
>= dst_mtu(&rt
->u
.dst
))
1583 if (pmtu
< IPV6_MIN_MTU
) {
1585 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1586 * MTU (1280) and a fragment header should always be included
1587 * after a node receiving Too Big message reporting PMTU is
1588 * less than the IPv6 Minimum Link MTU.
1590 pmtu
= IPV6_MIN_MTU
;
1594 /* New mtu received -> path was valid.
1595 They are sent only in response to data packets,
1596 so that this nexthop apparently is reachable. --ANK
1598 dst_confirm(&rt
->u
.dst
);
1600 /* Host route. If it is static, it would be better
1601 not to override it, but add new one, so that
1602 when cache entry will expire old pmtu
1603 would return automatically.
1605 if (rt
->rt6i_flags
& RTF_CACHE
) {
1606 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1608 rt
->u
.dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1609 dst_set_expires(&rt
->u
.dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1610 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1615 Two cases are possible:
1616 1. It is connected route. Action: COW
1617 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1619 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1620 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1622 nrt
= rt6_alloc_clone(rt
, daddr
);
1625 nrt
->u
.dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1627 nrt
->u
.dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1629 /* According to RFC 1981, detecting PMTU increase shouldn't be
1630 * happened within 5 mins, the recommended timer is 10 mins.
1631 * Here this route expiration time is set to ip6_rt_mtu_expires
1632 * which is 10 mins. After 10 mins the decreased pmtu is expired
1633 * and detecting PMTU increase will be automatically happened.
1635 dst_set_expires(&nrt
->u
.dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1636 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1641 dst_release(&rt
->u
.dst
);
1644 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1645 struct net_device
*dev
, u32 pmtu
)
1647 struct net
*net
= dev_net(dev
);
1650 * RFC 1981 states that a node "MUST reduce the size of the packets it
1651 * is sending along the path" that caused the Packet Too Big message.
1652 * Since it's not possible in the general case to determine which
1653 * interface was used to send the original packet, we update the MTU
1654 * on the interface that will be used to send future packets. We also
1655 * update the MTU on the interface that received the Packet Too Big in
1656 * case the original packet was forced out that interface with
1657 * SO_BINDTODEVICE or similar. This is the next best thing to the
1658 * correct behaviour, which would be to update the MTU on all
1661 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, 0);
1662 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, dev
->ifindex
);
1666 * Misc support functions
1669 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1671 struct net
*net
= dev_net(ort
->rt6i_dev
);
1672 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1675 rt
->u
.dst
.input
= ort
->u
.dst
.input
;
1676 rt
->u
.dst
.output
= ort
->u
.dst
.output
;
1678 memcpy(rt
->u
.dst
.metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
1679 rt
->u
.dst
.error
= ort
->u
.dst
.error
;
1680 rt
->u
.dst
.dev
= ort
->u
.dst
.dev
;
1682 dev_hold(rt
->u
.dst
.dev
);
1683 rt
->rt6i_idev
= ort
->rt6i_idev
;
1685 in6_dev_hold(rt
->rt6i_idev
);
1686 rt
->u
.dst
.lastuse
= jiffies
;
1687 rt
->rt6i_expires
= 0;
1689 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1690 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1691 rt
->rt6i_metric
= 0;
1693 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1694 #ifdef CONFIG_IPV6_SUBTREES
1695 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1697 rt
->rt6i_table
= ort
->rt6i_table
;
1702 #ifdef CONFIG_IPV6_ROUTE_INFO
1703 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1704 struct in6_addr
*prefix
, int prefixlen
,
1705 struct in6_addr
*gwaddr
, int ifindex
)
1707 struct fib6_node
*fn
;
1708 struct rt6_info
*rt
= NULL
;
1709 struct fib6_table
*table
;
1711 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1715 write_lock_bh(&table
->tb6_lock
);
1716 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1720 for (rt
= fn
->leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1721 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1723 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1725 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1727 dst_hold(&rt
->u
.dst
);
1731 write_unlock_bh(&table
->tb6_lock
);
1735 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1736 struct in6_addr
*prefix
, int prefixlen
,
1737 struct in6_addr
*gwaddr
, int ifindex
,
1740 struct fib6_config cfg
= {
1741 .fc_table
= RT6_TABLE_INFO
,
1742 .fc_metric
= IP6_RT_PRIO_USER
,
1743 .fc_ifindex
= ifindex
,
1744 .fc_dst_len
= prefixlen
,
1745 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1746 RTF_UP
| RTF_PREF(pref
),
1748 .fc_nlinfo
.nlh
= NULL
,
1749 .fc_nlinfo
.nl_net
= net
,
1752 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1753 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1755 /* We should treat it as a default route if prefix length is 0. */
1757 cfg
.fc_flags
|= RTF_DEFAULT
;
1759 ip6_route_add(&cfg
);
1761 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1765 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1767 struct rt6_info
*rt
;
1768 struct fib6_table
*table
;
1770 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1774 write_lock_bh(&table
->tb6_lock
);
1775 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->u
.dst
.rt6_next
) {
1776 if (dev
== rt
->rt6i_dev
&&
1777 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1778 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1782 dst_hold(&rt
->u
.dst
);
1783 write_unlock_bh(&table
->tb6_lock
);
1787 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1788 struct net_device
*dev
,
1791 struct fib6_config cfg
= {
1792 .fc_table
= RT6_TABLE_DFLT
,
1793 .fc_metric
= IP6_RT_PRIO_USER
,
1794 .fc_ifindex
= dev
->ifindex
,
1795 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1796 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1798 .fc_nlinfo
.nlh
= NULL
,
1799 .fc_nlinfo
.nl_net
= dev_net(dev
),
1802 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1804 ip6_route_add(&cfg
);
1806 return rt6_get_dflt_router(gwaddr
, dev
);
1809 void rt6_purge_dflt_routers(struct net
*net
)
1811 struct rt6_info
*rt
;
1812 struct fib6_table
*table
;
1814 /* NOTE: Keep consistent with rt6_get_dflt_router */
1815 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1820 read_lock_bh(&table
->tb6_lock
);
1821 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->u
.dst
.rt6_next
) {
1822 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1823 dst_hold(&rt
->u
.dst
);
1824 read_unlock_bh(&table
->tb6_lock
);
1829 read_unlock_bh(&table
->tb6_lock
);
1832 static void rtmsg_to_fib6_config(struct net
*net
,
1833 struct in6_rtmsg
*rtmsg
,
1834 struct fib6_config
*cfg
)
1836 memset(cfg
, 0, sizeof(*cfg
));
1838 cfg
->fc_table
= RT6_TABLE_MAIN
;
1839 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1840 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1841 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1842 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1843 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1844 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1846 cfg
->fc_nlinfo
.nl_net
= net
;
1848 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1849 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1850 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1853 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1855 struct fib6_config cfg
;
1856 struct in6_rtmsg rtmsg
;
1860 case SIOCADDRT
: /* Add a route */
1861 case SIOCDELRT
: /* Delete a route */
1862 if (!capable(CAP_NET_ADMIN
))
1864 err
= copy_from_user(&rtmsg
, arg
,
1865 sizeof(struct in6_rtmsg
));
1869 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1874 err
= ip6_route_add(&cfg
);
1877 err
= ip6_route_del(&cfg
);
1891 * Drop the packet on the floor
1894 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
1897 struct dst_entry
*dst
= skb_dst(skb
);
1898 switch (ipstats_mib_noroutes
) {
1899 case IPSTATS_MIB_INNOROUTES
:
1900 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1901 if (type
== IPV6_ADDR_ANY
|| type
== IPV6_ADDR_RESERVED
) {
1902 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1903 IPSTATS_MIB_INADDRERRORS
);
1907 case IPSTATS_MIB_OUTNOROUTES
:
1908 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1909 ipstats_mib_noroutes
);
1912 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0, skb
->dev
);
1917 static int ip6_pkt_discard(struct sk_buff
*skb
)
1919 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1922 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1924 skb
->dev
= skb_dst(skb
)->dev
;
1925 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1928 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1930 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1932 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1935 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1937 skb
->dev
= skb_dst(skb
)->dev
;
1938 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
1944 * Allocate a dst for local (unicast / anycast) address.
1947 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1948 const struct in6_addr
*addr
,
1951 struct net
*net
= dev_net(idev
->dev
);
1952 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1953 struct neighbour
*neigh
;
1956 return ERR_PTR(-ENOMEM
);
1958 dev_hold(net
->loopback_dev
);
1961 rt
->u
.dst
.flags
= DST_HOST
;
1962 rt
->u
.dst
.input
= ip6_input
;
1963 rt
->u
.dst
.output
= ip6_output
;
1964 rt
->rt6i_dev
= net
->loopback_dev
;
1965 rt
->rt6i_idev
= idev
;
1966 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
1967 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->u
.dst
));
1968 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1969 rt
->u
.dst
.obsolete
= -1;
1971 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
1973 rt
->rt6i_flags
|= RTF_ANYCAST
;
1975 rt
->rt6i_flags
|= RTF_LOCAL
;
1976 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
1977 if (IS_ERR(neigh
)) {
1978 dst_free(&rt
->u
.dst
);
1980 /* We are casting this because that is the return
1981 * value type. But an errno encoded pointer is the
1982 * same regardless of the underlying pointer type,
1983 * and that's what we are returning. So this is OK.
1985 return (struct rt6_info
*) neigh
;
1987 rt
->rt6i_nexthop
= neigh
;
1989 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1990 rt
->rt6i_dst
.plen
= 128;
1991 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
1993 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
1998 struct arg_dev_net
{
1999 struct net_device
*dev
;
2003 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
2005 struct net_device
*dev
= ((struct arg_dev_net
*)arg
)->dev
;
2006 struct net
*net
= ((struct arg_dev_net
*)arg
)->net
;
2008 if (((void *)rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
2009 rt
!= net
->ipv6
.ip6_null_entry
) {
2010 RT6_TRACE("deleted by ifdown %p\n", rt
);
2016 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
2018 struct arg_dev_net adn
= {
2023 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
2024 icmp6_clean_all(fib6_ifdown
, &adn
);
2027 struct rt6_mtu_change_arg
2029 struct net_device
*dev
;
2033 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2035 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2036 struct inet6_dev
*idev
;
2037 struct net
*net
= dev_net(arg
->dev
);
2039 /* In IPv6 pmtu discovery is not optional,
2040 so that RTAX_MTU lock cannot disable it.
2041 We still use this lock to block changes
2042 caused by addrconf/ndisc.
2045 idev
= __in6_dev_get(arg
->dev
);
2049 /* For administrative MTU increase, there is no way to discover
2050 IPv6 PMTU increase, so PMTU increase should be updated here.
2051 Since RFC 1981 doesn't include administrative MTU increase
2052 update PMTU increase is a MUST. (i.e. jumbo frame)
2055 If new MTU is less than route PMTU, this new MTU will be the
2056 lowest MTU in the path, update the route PMTU to reflect PMTU
2057 decreases; if new MTU is greater than route PMTU, and the
2058 old MTU is the lowest MTU in the path, update the route PMTU
2059 to reflect the increase. In this case if the other nodes' MTU
2060 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2063 if (rt
->rt6i_dev
== arg
->dev
&&
2064 !dst_metric_locked(&rt
->u
.dst
, RTAX_MTU
) &&
2065 (dst_mtu(&rt
->u
.dst
) >= arg
->mtu
||
2066 (dst_mtu(&rt
->u
.dst
) < arg
->mtu
&&
2067 dst_mtu(&rt
->u
.dst
) == idev
->cnf
.mtu6
))) {
2068 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = arg
->mtu
;
2069 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, arg
->mtu
);
2074 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2076 struct rt6_mtu_change_arg arg
= {
2081 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2084 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2085 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2086 [RTA_OIF
] = { .type
= NLA_U32
},
2087 [RTA_IIF
] = { .type
= NLA_U32
},
2088 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2089 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2092 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2093 struct fib6_config
*cfg
)
2096 struct nlattr
*tb
[RTA_MAX
+1];
2099 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2104 rtm
= nlmsg_data(nlh
);
2105 memset(cfg
, 0, sizeof(*cfg
));
2107 cfg
->fc_table
= rtm
->rtm_table
;
2108 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2109 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2110 cfg
->fc_flags
= RTF_UP
;
2111 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2113 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2114 cfg
->fc_flags
|= RTF_REJECT
;
2116 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2117 cfg
->fc_nlinfo
.nlh
= nlh
;
2118 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2120 if (tb
[RTA_GATEWAY
]) {
2121 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2122 cfg
->fc_flags
|= RTF_GATEWAY
;
2126 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2128 if (nla_len(tb
[RTA_DST
]) < plen
)
2131 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2135 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2137 if (nla_len(tb
[RTA_SRC
]) < plen
)
2140 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2144 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2146 if (tb
[RTA_PRIORITY
])
2147 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2149 if (tb
[RTA_METRICS
]) {
2150 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2151 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2155 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2162 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2164 struct fib6_config cfg
;
2167 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2171 return ip6_route_del(&cfg
);
2174 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2176 struct fib6_config cfg
;
2179 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2183 return ip6_route_add(&cfg
);
2186 static inline size_t rt6_nlmsg_size(void)
2188 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2189 + nla_total_size(16) /* RTA_SRC */
2190 + nla_total_size(16) /* RTA_DST */
2191 + nla_total_size(16) /* RTA_GATEWAY */
2192 + nla_total_size(16) /* RTA_PREFSRC */
2193 + nla_total_size(4) /* RTA_TABLE */
2194 + nla_total_size(4) /* RTA_IIF */
2195 + nla_total_size(4) /* RTA_OIF */
2196 + nla_total_size(4) /* RTA_PRIORITY */
2197 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2198 + nla_total_size(sizeof(struct rta_cacheinfo
));
2201 static int rt6_fill_node(struct net
*net
,
2202 struct sk_buff
*skb
, struct rt6_info
*rt
,
2203 struct in6_addr
*dst
, struct in6_addr
*src
,
2204 int iif
, int type
, u32 pid
, u32 seq
,
2205 int prefix
, int nowait
, unsigned int flags
)
2208 struct nlmsghdr
*nlh
;
2212 if (prefix
) { /* user wants prefix routes only */
2213 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2214 /* success since this is not a prefix route */
2219 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2223 rtm
= nlmsg_data(nlh
);
2224 rtm
->rtm_family
= AF_INET6
;
2225 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2226 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2229 table
= rt
->rt6i_table
->tb6_id
;
2231 table
= RT6_TABLE_UNSPEC
;
2232 rtm
->rtm_table
= table
;
2233 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2234 if (rt
->rt6i_flags
&RTF_REJECT
)
2235 rtm
->rtm_type
= RTN_UNREACHABLE
;
2236 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2237 rtm
->rtm_type
= RTN_LOCAL
;
2239 rtm
->rtm_type
= RTN_UNICAST
;
2241 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2242 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2243 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2244 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2245 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2246 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2247 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2248 rtm
->rtm_protocol
= RTPROT_RA
;
2250 if (rt
->rt6i_flags
&RTF_CACHE
)
2251 rtm
->rtm_flags
|= RTM_F_CLONED
;
2254 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2255 rtm
->rtm_dst_len
= 128;
2256 } else if (rtm
->rtm_dst_len
)
2257 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2258 #ifdef CONFIG_IPV6_SUBTREES
2260 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2261 rtm
->rtm_src_len
= 128;
2262 } else if (rtm
->rtm_src_len
)
2263 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2266 #ifdef CONFIG_IPV6_MROUTE
2267 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2268 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2273 goto nla_put_failure
;
2275 if (err
== -EMSGSIZE
)
2276 goto nla_put_failure
;
2281 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2283 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->u
.dst
);
2284 struct in6_addr saddr_buf
;
2285 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2286 dst
, 0, &saddr_buf
) == 0)
2287 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2290 if (rtnetlink_put_metrics(skb
, rt
->u
.dst
.metrics
) < 0)
2291 goto nla_put_failure
;
2293 if (rt
->u
.dst
.neighbour
)
2294 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->u
.dst
.neighbour
->primary_key
);
2297 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2299 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2301 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2303 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2304 expires
= rt
->rt6i_expires
- jiffies
;
2308 if (rtnl_put_cacheinfo(skb
, &rt
->u
.dst
, 0, 0, 0,
2309 expires
, rt
->u
.dst
.error
) < 0)
2310 goto nla_put_failure
;
2312 return nlmsg_end(skb
, nlh
);
2315 nlmsg_cancel(skb
, nlh
);
2319 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2321 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2324 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2325 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2326 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2330 return rt6_fill_node(arg
->net
,
2331 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2332 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2333 prefix
, 0, NLM_F_MULTI
);
2336 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2338 struct net
*net
= sock_net(in_skb
->sk
);
2339 struct nlattr
*tb
[RTA_MAX
+1];
2340 struct rt6_info
*rt
;
2341 struct sk_buff
*skb
;
2346 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2351 memset(&fl
, 0, sizeof(fl
));
2354 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2357 ipv6_addr_copy(&fl
.fl6_src
, nla_data(tb
[RTA_SRC
]));
2361 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2364 ipv6_addr_copy(&fl
.fl6_dst
, nla_data(tb
[RTA_DST
]));
2368 iif
= nla_get_u32(tb
[RTA_IIF
]);
2371 fl
.oif
= nla_get_u32(tb
[RTA_OIF
]);
2374 struct net_device
*dev
;
2375 dev
= __dev_get_by_index(net
, iif
);
2382 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2388 /* Reserve room for dummy headers, this skb can pass
2389 through good chunk of routing engine.
2391 skb_reset_mac_header(skb
);
2392 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2394 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl
);
2395 skb_dst_set(skb
, &rt
->u
.dst
);
2397 err
= rt6_fill_node(net
, skb
, rt
, &fl
.fl6_dst
, &fl
.fl6_src
, iif
,
2398 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2399 nlh
->nlmsg_seq
, 0, 0, 0);
2405 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2410 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2412 struct sk_buff
*skb
;
2413 struct net
*net
= info
->nl_net
;
2418 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2420 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2424 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2425 event
, info
->pid
, seq
, 0, 0, 0);
2427 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2428 WARN_ON(err
== -EMSGSIZE
);
2432 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2433 info
->nlh
, gfp_any());
2437 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2440 static int ip6_route_dev_notify(struct notifier_block
*this,
2441 unsigned long event
, void *data
)
2443 struct net_device
*dev
= (struct net_device
*)data
;
2444 struct net
*net
= dev_net(dev
);
2446 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2447 net
->ipv6
.ip6_null_entry
->u
.dst
.dev
= dev
;
2448 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2449 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2450 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.dev
= dev
;
2451 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2452 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.dev
= dev
;
2453 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2464 #ifdef CONFIG_PROC_FS
2466 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2477 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2479 struct seq_file
*m
= p_arg
;
2481 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2483 #ifdef CONFIG_IPV6_SUBTREES
2484 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2486 seq_puts(m
, "00000000000000000000000000000000 00 ");
2489 if (rt
->rt6i_nexthop
) {
2490 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2492 seq_puts(m
, "00000000000000000000000000000000");
2494 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2495 rt
->rt6i_metric
, atomic_read(&rt
->u
.dst
.__refcnt
),
2496 rt
->u
.dst
.__use
, rt
->rt6i_flags
,
2497 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2501 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2503 struct net
*net
= (struct net
*)m
->private;
2504 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2508 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2510 return single_open_net(inode
, file
, ipv6_route_show
);
2513 static const struct file_operations ipv6_route_proc_fops
= {
2514 .owner
= THIS_MODULE
,
2515 .open
= ipv6_route_open
,
2517 .llseek
= seq_lseek
,
2518 .release
= single_release_net
,
2521 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2523 struct net
*net
= (struct net
*)seq
->private;
2524 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2525 net
->ipv6
.rt6_stats
->fib_nodes
,
2526 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2527 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2528 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2529 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2530 atomic_read(&net
->ipv6
.ip6_dst_ops
.entries
),
2531 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2536 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2538 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2541 static const struct file_operations rt6_stats_seq_fops
= {
2542 .owner
= THIS_MODULE
,
2543 .open
= rt6_stats_seq_open
,
2545 .llseek
= seq_lseek
,
2546 .release
= single_release_net
,
2548 #endif /* CONFIG_PROC_FS */
2550 #ifdef CONFIG_SYSCTL
2553 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
,
2554 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2556 struct net
*net
= current
->nsproxy
->net_ns
;
2557 int delay
= net
->ipv6
.sysctl
.flush_delay
;
2559 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
2560 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2566 ctl_table ipv6_route_table_template
[] = {
2568 .procname
= "flush",
2569 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2570 .maxlen
= sizeof(int),
2572 .proc_handler
= ipv6_sysctl_rtcache_flush
2575 .procname
= "gc_thresh",
2576 .data
= &ip6_dst_ops_template
.gc_thresh
,
2577 .maxlen
= sizeof(int),
2579 .proc_handler
= proc_dointvec
,
2582 .procname
= "max_size",
2583 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2584 .maxlen
= sizeof(int),
2586 .proc_handler
= proc_dointvec
,
2589 .procname
= "gc_min_interval",
2590 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2591 .maxlen
= sizeof(int),
2593 .proc_handler
= proc_dointvec_jiffies
,
2596 .procname
= "gc_timeout",
2597 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2598 .maxlen
= sizeof(int),
2600 .proc_handler
= proc_dointvec_jiffies
,
2603 .procname
= "gc_interval",
2604 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2605 .maxlen
= sizeof(int),
2607 .proc_handler
= proc_dointvec_jiffies
,
2610 .procname
= "gc_elasticity",
2611 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2612 .maxlen
= sizeof(int),
2614 .proc_handler
= proc_dointvec_jiffies
,
2617 .procname
= "mtu_expires",
2618 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2619 .maxlen
= sizeof(int),
2621 .proc_handler
= proc_dointvec_jiffies
,
2624 .procname
= "min_adv_mss",
2625 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2626 .maxlen
= sizeof(int),
2628 .proc_handler
= proc_dointvec_jiffies
,
2631 .procname
= "gc_min_interval_ms",
2632 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2633 .maxlen
= sizeof(int),
2635 .proc_handler
= proc_dointvec_ms_jiffies
,
2640 struct ctl_table
*ipv6_route_sysctl_init(struct net
*net
)
2642 struct ctl_table
*table
;
2644 table
= kmemdup(ipv6_route_table_template
,
2645 sizeof(ipv6_route_table_template
),
2649 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2650 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
2651 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2652 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2653 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2654 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2655 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2656 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2657 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2658 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2665 static int ip6_route_net_init(struct net
*net
)
2669 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
2670 sizeof(net
->ipv6
.ip6_dst_ops
));
2672 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2673 sizeof(*net
->ipv6
.ip6_null_entry
),
2675 if (!net
->ipv6
.ip6_null_entry
)
2676 goto out_ip6_dst_ops
;
2677 net
->ipv6
.ip6_null_entry
->u
.dst
.path
=
2678 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2679 net
->ipv6
.ip6_null_entry
->u
.dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2681 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2682 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2683 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2685 if (!net
->ipv6
.ip6_prohibit_entry
)
2686 goto out_ip6_null_entry
;
2687 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.path
=
2688 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2689 net
->ipv6
.ip6_prohibit_entry
->u
.dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2691 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2692 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2694 if (!net
->ipv6
.ip6_blk_hole_entry
)
2695 goto out_ip6_prohibit_entry
;
2696 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.path
=
2697 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2698 net
->ipv6
.ip6_blk_hole_entry
->u
.dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2701 net
->ipv6
.sysctl
.flush_delay
= 0;
2702 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2703 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2704 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2705 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2706 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2707 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2708 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2710 #ifdef CONFIG_PROC_FS
2711 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2712 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2714 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2720 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2721 out_ip6_prohibit_entry
:
2722 kfree(net
->ipv6
.ip6_prohibit_entry
);
2724 kfree(net
->ipv6
.ip6_null_entry
);
2730 static void ip6_route_net_exit(struct net
*net
)
2732 #ifdef CONFIG_PROC_FS
2733 proc_net_remove(net
, "ipv6_route");
2734 proc_net_remove(net
, "rt6_stats");
2736 kfree(net
->ipv6
.ip6_null_entry
);
2737 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2738 kfree(net
->ipv6
.ip6_prohibit_entry
);
2739 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2743 static struct pernet_operations ip6_route_net_ops
= {
2744 .init
= ip6_route_net_init
,
2745 .exit
= ip6_route_net_exit
,
2748 static struct notifier_block ip6_route_dev_notifier
= {
2749 .notifier_call
= ip6_route_dev_notify
,
2753 int __init
ip6_route_init(void)
2758 ip6_dst_ops_template
.kmem_cachep
=
2759 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2760 SLAB_HWCACHE_ALIGN
, NULL
);
2761 if (!ip6_dst_ops_template
.kmem_cachep
)
2764 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2766 goto out_kmem_cache
;
2768 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2770 /* Registering of the loopback is done before this portion of code,
2771 * the loopback reference in rt6_info will not be taken, do it
2772 * manually for init_net */
2773 init_net
.ipv6
.ip6_null_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2774 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2775 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2776 init_net
.ipv6
.ip6_prohibit_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2777 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2778 init_net
.ipv6
.ip6_blk_hole_entry
->u
.dst
.dev
= init_net
.loopback_dev
;
2779 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2783 goto out_register_subsys
;
2789 ret
= fib6_rules_init();
2794 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2795 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2796 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2797 goto fib6_rules_init
;
2799 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2801 goto fib6_rules_init
;
2807 fib6_rules_cleanup();
2812 out_register_subsys
:
2813 unregister_pernet_subsys(&ip6_route_net_ops
);
2815 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2819 void ip6_route_cleanup(void)
2821 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2822 fib6_rules_cleanup();
2825 unregister_pernet_subsys(&ip6_route_net_ops
);
2826 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);