2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
75 #define CLONE_OFFLINK_ROUTE 0
77 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
78 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
79 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
80 static void ip6_dst_destroy(struct dst_entry
*);
81 static void ip6_dst_ifdown(struct dst_entry
*,
82 struct net_device
*dev
, int how
);
83 static int ip6_dst_gc(struct dst_ops
*ops
);
85 static int ip6_pkt_discard(struct sk_buff
*skb
);
86 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
87 static void ip6_link_failure(struct sk_buff
*skb
);
88 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
92 struct in6_addr
*prefix
, int prefixlen
,
93 struct in6_addr
*gwaddr
, int ifindex
,
95 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
96 struct in6_addr
*prefix
, int prefixlen
,
97 struct in6_addr
*gwaddr
, int ifindex
);
100 static struct dst_ops ip6_dst_ops_template
= {
102 .protocol
= cpu_to_be16(ETH_P_IPV6
),
105 .check
= ip6_dst_check
,
106 .destroy
= ip6_dst_destroy
,
107 .ifdown
= ip6_dst_ifdown
,
108 .negative_advice
= ip6_negative_advice
,
109 .link_failure
= ip6_link_failure
,
110 .update_pmtu
= ip6_rt_update_pmtu
,
111 .local_out
= __ip6_local_out
,
112 .entries
= ATOMIC_INIT(0),
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
119 static struct dst_ops ip6_dst_blackhole_ops
= {
121 .protocol
= cpu_to_be16(ETH_P_IPV6
),
122 .destroy
= ip6_dst_destroy
,
123 .check
= ip6_dst_check
,
124 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
125 .entries
= ATOMIC_INIT(0),
128 static struct rt6_info ip6_null_entry_template
= {
130 .__refcnt
= ATOMIC_INIT(1),
133 .error
= -ENETUNREACH
,
134 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
135 .input
= ip6_pkt_discard
,
136 .output
= ip6_pkt_discard_out
,
138 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
139 .rt6i_protocol
= RTPROT_KERNEL
,
140 .rt6i_metric
= ~(u32
) 0,
141 .rt6i_ref
= ATOMIC_INIT(1),
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
146 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
147 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
);
149 static struct rt6_info ip6_prohibit_entry_template
= {
151 .__refcnt
= ATOMIC_INIT(1),
155 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
156 .input
= ip6_pkt_prohibit
,
157 .output
= ip6_pkt_prohibit_out
,
159 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
160 .rt6i_protocol
= RTPROT_KERNEL
,
161 .rt6i_metric
= ~(u32
) 0,
162 .rt6i_ref
= ATOMIC_INIT(1),
165 static struct rt6_info ip6_blk_hole_entry_template
= {
167 .__refcnt
= ATOMIC_INIT(1),
171 .metrics
= { [RTAX_HOPLIMIT
- 1] = 255, },
172 .input
= dst_discard
,
173 .output
= dst_discard
,
175 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
176 .rt6i_protocol
= RTPROT_KERNEL
,
177 .rt6i_metric
= ~(u32
) 0,
178 .rt6i_ref
= ATOMIC_INIT(1),
/*
 * Typed convenience wrapper: passes @ops straight to dst_alloc() and
 * casts the result to struct rt6_info *.  The result is returned
 * unchecked, so callers must test it themselves.
 */
183 /* allocate dst with ip6_dst_ops */
184 static inline struct rt6_info
*ip6_dst_alloc(struct dst_ops
*ops
)
186 return (struct rt6_info
*)dst_alloc(ops
);
189 static void ip6_dst_destroy(struct dst_entry
*dst
)
191 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
192 struct inet6_dev
*idev
= rt
->rt6i_idev
;
195 rt
->rt6i_idev
= NULL
;
200 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
203 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
204 struct inet6_dev
*idev
= rt
->rt6i_idev
;
205 struct net_device
*loopback_dev
=
206 dev_net(dev
)->loopback_dev
;
208 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
209 struct inet6_dev
*loopback_idev
=
210 in6_dev_get(loopback_dev
);
211 if (loopback_idev
!= NULL
) {
212 rt
->rt6i_idev
= loopback_idev
;
/*
 * Report whether a route has outlived its lifetime.
 *
 * Returns non-zero only when @rt has RTF_EXPIRES set in rt6i_flags AND
 * the current jiffies value is after rt->rt6i_expires.  Routes without
 * RTF_EXPIRES are never reported as expired, whatever rt6i_expires holds.
 */
218 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
220 return (rt
->rt6i_flags
& RTF_EXPIRES
&&
221 time_after(jiffies
, rt
->rt6i_expires
));
/*
 * Test whether @daddr is a multicast, link-local or loopback address.
 *
 * Returns the masked result of ipv6_addr_type(): non-zero when any of
 * IPV6_ADDR_MULTICAST, IPV6_ADDR_LINKLOCAL or IPV6_ADDR_LOOPBACK is set,
 * zero otherwise.  Callers in this file use a non-zero result to add
 * RT6_LOOKUP_F_IFACE to the lookup flags.
 */
224 static inline int rt6_need_strict(struct in6_addr
*daddr
)
226 return (ipv6_addr_type(daddr
) &
227 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
));
231 * Route lookup. Any table->tb6_lock is implied.
234 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
236 struct in6_addr
*saddr
,
240 struct rt6_info
*local
= NULL
;
241 struct rt6_info
*sprt
;
243 if (!oif
&& ipv6_addr_any(saddr
))
246 for (sprt
= rt
; sprt
; sprt
= sprt
->dst
.rt6_next
) {
247 struct net_device
*dev
= sprt
->rt6i_dev
;
250 if (dev
->ifindex
== oif
)
252 if (dev
->flags
& IFF_LOOPBACK
) {
253 if (sprt
->rt6i_idev
== NULL
||
254 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
255 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
257 if (local
&& (!oif
||
258 local
->rt6i_idev
->dev
->ifindex
== oif
))
264 if (ipv6_chk_addr(net
, saddr
, dev
,
265 flags
& RT6_LOOKUP_F_IFACE
))
274 if (flags
& RT6_LOOKUP_F_IFACE
)
275 return net
->ipv6
.ip6_null_entry
;
281 #ifdef CONFIG_IPV6_ROUTER_PREF
282 static void rt6_probe(struct rt6_info
*rt
)
284 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
286 * Okay, this does not seem to be appropriate
287 * for now, however, we need to check if it
288 * is really so; aka Router Reachability Probing.
290 * Router Reachability Probe MUST be rate-limited
291 * to no more than one per minute.
293 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
295 read_lock_bh(&neigh
->lock
);
296 if (!(neigh
->nud_state
& NUD_VALID
) &&
297 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
298 struct in6_addr mcaddr
;
299 struct in6_addr
*target
;
301 neigh
->updated
= jiffies
;
302 read_unlock_bh(&neigh
->lock
);
304 target
= (struct in6_addr
*)&neigh
->primary_key
;
305 addrconf_addr_solict_mult(target
, &mcaddr
);
306 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
308 read_unlock_bh(&neigh
->lock
);
311 static inline void rt6_probe(struct rt6_info
*rt
)
317 * Default Router Selection (RFC 2461 6.3.6)
319 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
321 struct net_device
*dev
= rt
->rt6i_dev
;
322 if (!oif
|| dev
->ifindex
== oif
)
324 if ((dev
->flags
& IFF_LOOPBACK
) &&
325 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
330 static inline int rt6_check_neigh(struct rt6_info
*rt
)
332 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
334 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
335 !(rt
->rt6i_flags
& RTF_GATEWAY
))
338 read_lock_bh(&neigh
->lock
);
339 if (neigh
->nud_state
& NUD_VALID
)
341 #ifdef CONFIG_IPV6_ROUTER_PREF
342 else if (neigh
->nud_state
& NUD_FAILED
)
347 read_unlock_bh(&neigh
->lock
);
353 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
358 m
= rt6_check_dev(rt
, oif
);
359 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
361 #ifdef CONFIG_IPV6_ROUTER_PREF
362 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
364 n
= rt6_check_neigh(rt
);
365 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
370 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
371 int *mpri
, struct rt6_info
*match
)
375 if (rt6_check_expired(rt
))
378 m
= rt6_score_route(rt
, oif
, strict
);
383 if (strict
& RT6_LOOKUP_F_REACHABLE
)
387 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
395 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
396 struct rt6_info
*rr_head
,
397 u32 metric
, int oif
, int strict
)
399 struct rt6_info
*rt
, *match
;
403 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
404 rt
= rt
->dst
.rt6_next
)
405 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
406 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
407 rt
= rt
->dst
.rt6_next
)
408 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
413 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
415 struct rt6_info
*match
, *rt0
;
418 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
419 __func__
, fn
->leaf
, oif
);
423 fn
->rr_ptr
= rt0
= fn
->leaf
;
425 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
428 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
429 struct rt6_info
*next
= rt0
->dst
.rt6_next
;
431 /* no entries matched; do round-robin */
432 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
439 RT6_TRACE("%s() => %p\n",
442 net
= dev_net(rt0
->rt6i_dev
);
443 return (match
? match
: net
->ipv6
.ip6_null_entry
);
446 #ifdef CONFIG_IPV6_ROUTE_INFO
447 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
448 struct in6_addr
*gwaddr
)
450 struct net
*net
= dev_net(dev
);
451 struct route_info
*rinfo
= (struct route_info
*) opt
;
452 struct in6_addr prefix_buf
, *prefix
;
454 unsigned long lifetime
;
457 if (len
< sizeof(struct route_info
)) {
461 /* Sanity check for prefix_len and length */
462 if (rinfo
->length
> 3) {
464 } else if (rinfo
->prefix_len
> 128) {
466 } else if (rinfo
->prefix_len
> 64) {
467 if (rinfo
->length
< 2) {
470 } else if (rinfo
->prefix_len
> 0) {
471 if (rinfo
->length
< 1) {
476 pref
= rinfo
->route_pref
;
477 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
480 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
482 if (rinfo
->length
== 3)
483 prefix
= (struct in6_addr
*)rinfo
->prefix
;
485 /* this function is safe */
486 ipv6_addr_prefix(&prefix_buf
,
487 (struct in6_addr
*)rinfo
->prefix
,
489 prefix
= &prefix_buf
;
492 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
495 if (rt
&& !lifetime
) {
501 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
504 rt
->rt6i_flags
= RTF_ROUTEINFO
|
505 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
508 if (!addrconf_finite_timeout(lifetime
)) {
509 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
511 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
512 rt
->rt6i_flags
|= RTF_EXPIRES
;
514 dst_release(&rt
->dst
);
520 #define BACKTRACK(__net, saddr) \
522 if (rt == __net->ipv6.ip6_null_entry) { \
523 struct fib6_node *pn; \
525 if (fn->fn_flags & RTN_TL_ROOT) \
528 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
529 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
532 if (fn->fn_flags & RTN_RTINFO) \
538 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
539 struct fib6_table
*table
,
540 struct flowi
*fl
, int flags
)
542 struct fib6_node
*fn
;
545 read_lock_bh(&table
->tb6_lock
);
546 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
549 rt
= rt6_device_match(net
, rt
, &fl
->fl6_src
, fl
->oif
, flags
);
550 BACKTRACK(net
, &fl
->fl6_src
);
552 dst_use(&rt
->dst
, jiffies
);
553 read_unlock_bh(&table
->tb6_lock
);
558 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
559 const struct in6_addr
*saddr
, int oif
, int strict
)
569 struct dst_entry
*dst
;
570 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
573 memcpy(&fl
.fl6_src
, saddr
, sizeof(*saddr
));
574 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
577 dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_lookup
);
579 return (struct rt6_info
*) dst
;
586 EXPORT_SYMBOL(rt6_lookup
);
588 /* ip6_ins_rt is called with FREE table->tb6_lock.
589 It takes a new route entry; if the addition fails for any reason, the
590 route is freed. In any case, if caller does not hold it, it may
594 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
)
597 struct fib6_table
*table
;
599 table
= rt
->rt6i_table
;
600 write_lock_bh(&table
->tb6_lock
);
601 err
= fib6_add(&table
->tb6_root
, rt
, info
);
602 write_unlock_bh(&table
->tb6_lock
);
/*
 * Insert @rt using a default netlink info block.
 *
 * Builds a struct nl_info whose only explicitly set field, nl_net, is
 * the network namespace of the route's device (dev_net(rt->rt6i_dev)),
 * then hands off to __ip6_ins_rt().  The return value is whatever
 * __ip6_ins_rt() returns.
 */
607 int ip6_ins_rt(struct rt6_info
*rt
)
609 struct nl_info info
= {
610 .nl_net
= dev_net(rt
->rt6i_dev
),
612 return __ip6_ins_rt(rt
, &info
);
615 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
616 struct in6_addr
*saddr
)
624 rt
= ip6_rt_copy(ort
);
627 struct neighbour
*neigh
;
628 int attempts
= !in_softirq();
630 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
631 if (rt
->rt6i_dst
.plen
!= 128 &&
632 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
633 rt
->rt6i_flags
|= RTF_ANYCAST
;
634 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
637 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
638 rt
->rt6i_dst
.plen
= 128;
639 rt
->rt6i_flags
|= RTF_CACHE
;
640 rt
->dst
.flags
|= DST_HOST
;
642 #ifdef CONFIG_IPV6_SUBTREES
643 if (rt
->rt6i_src
.plen
&& saddr
) {
644 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
645 rt
->rt6i_src
.plen
= 128;
650 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
652 struct net
*net
= dev_net(rt
->rt6i_dev
);
653 int saved_rt_min_interval
=
654 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
655 int saved_rt_elasticity
=
656 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
658 if (attempts
-- > 0) {
659 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
660 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
662 ip6_dst_gc(&net
->ipv6
.ip6_dst_ops
);
664 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
666 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
667 saved_rt_min_interval
;
673 "ipv6: Neighbour table overflow.\n");
677 rt
->rt6i_nexthop
= neigh
;
684 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
686 struct rt6_info
*rt
= ip6_rt_copy(ort
);
688 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
689 rt
->rt6i_dst
.plen
= 128;
690 rt
->rt6i_flags
|= RTF_CACHE
;
691 rt
->dst
.flags
|= DST_HOST
;
692 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
697 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
698 struct flowi
*fl
, int flags
)
700 struct fib6_node
*fn
;
701 struct rt6_info
*rt
, *nrt
;
705 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
707 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
710 read_lock_bh(&table
->tb6_lock
);
713 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
716 rt
= rt6_select(fn
, oif
, strict
| reachable
);
718 BACKTRACK(net
, &fl
->fl6_src
);
719 if (rt
== net
->ipv6
.ip6_null_entry
||
720 rt
->rt6i_flags
& RTF_CACHE
)
724 read_unlock_bh(&table
->tb6_lock
);
726 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
727 nrt
= rt6_alloc_cow(rt
, &fl
->fl6_dst
, &fl
->fl6_src
);
729 #if CLONE_OFFLINK_ROUTE
730 nrt
= rt6_alloc_clone(rt
, &fl
->fl6_dst
);
736 dst_release(&rt
->dst
);
737 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
741 err
= ip6_ins_rt(nrt
);
750 * Race condition! In the gap, when table->tb6_lock was
751 * released someone could insert this route. Relookup.
753 dst_release(&rt
->dst
);
762 read_unlock_bh(&table
->tb6_lock
);
764 rt
->dst
.lastuse
= jiffies
;
/*
 * Lookup callback for the receive path (passed to fib6_rule_lookup()
 * by ip6_route_input()).
 *
 * Forwards to ip6_pol_route(), supplying the flow's incoming interface
 * index (fl->iif) as the interface argument.
 */
770 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
771 struct flowi
*fl
, int flags
)
773 return ip6_pol_route(net
, table
, fl
->iif
, fl
, flags
);
776 void ip6_route_input(struct sk_buff
*skb
)
778 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
779 struct net
*net
= dev_net(skb
->dev
);
780 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
782 .iif
= skb
->dev
->ifindex
,
787 .flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
791 .proto
= iph
->nexthdr
,
794 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
795 flags
|= RT6_LOOKUP_F_IFACE
;
797 skb_dst_set(skb
, fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_input
));
/*
 * Lookup callback for the transmit path (passed to fib6_rule_lookup()
 * by ip6_route_output()).
 *
 * Forwards to ip6_pol_route(), supplying the flow's outgoing interface
 * index (fl->oif) as the interface argument.
 */
800 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
801 struct flowi
*fl
, int flags
)
803 return ip6_pol_route(net
, table
, fl
->oif
, fl
, flags
);
806 struct dst_entry
* ip6_route_output(struct net
*net
, struct sock
*sk
,
811 if ((sk
&& sk
->sk_bound_dev_if
) || rt6_need_strict(&fl
->fl6_dst
))
812 flags
|= RT6_LOOKUP_F_IFACE
;
814 if (!ipv6_addr_any(&fl
->fl6_src
))
815 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
817 flags
|= rt6_srcprefs2flags(inet6_sk(sk
)->srcprefs
);
819 return fib6_rule_lookup(net
, fl
, flags
, ip6_pol_route_output
);
822 EXPORT_SYMBOL(ip6_route_output
);
824 int ip6_dst_blackhole(struct sock
*sk
, struct dst_entry
**dstp
, struct flowi
*fl
)
826 struct rt6_info
*ort
= (struct rt6_info
*) *dstp
;
827 struct rt6_info
*rt
= (struct rt6_info
*)
828 dst_alloc(&ip6_dst_blackhole_ops
);
829 struct dst_entry
*new = NULL
;
834 atomic_set(&new->__refcnt
, 1);
836 new->input
= dst_discard
;
837 new->output
= dst_discard
;
839 memcpy(new->metrics
, ort
->dst
.metrics
, RTAX_MAX
*sizeof(u32
));
840 new->dev
= ort
->dst
.dev
;
843 rt
->rt6i_idev
= ort
->rt6i_idev
;
845 in6_dev_hold(rt
->rt6i_idev
);
846 rt
->rt6i_expires
= 0;
848 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
849 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
852 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
853 #ifdef CONFIG_IPV6_SUBTREES
854 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
862 return (new ? 0 : -ENOMEM
);
864 EXPORT_SYMBOL_GPL(ip6_dst_blackhole
);
867 * Destination cache support functions
870 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
874 rt
= (struct rt6_info
*) dst
;
876 if (rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
))
882 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
884 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
887 if (rt
->rt6i_flags
& RTF_CACHE
) {
888 if (rt6_check_expired(rt
)) {
900 static void ip6_link_failure(struct sk_buff
*skb
)
904 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0);
906 rt
= (struct rt6_info
*) skb_dst(skb
);
908 if (rt
->rt6i_flags
&RTF_CACHE
) {
909 dst_set_expires(&rt
->dst
, 0);
910 rt
->rt6i_flags
|= RTF_EXPIRES
;
911 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
912 rt
->rt6i_node
->fn_sernum
= -1;
916 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
918 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
920 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
921 rt6
->rt6i_flags
|= RTF_MODIFIED
;
922 if (mtu
< IPV6_MIN_MTU
) {
924 dst
->metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
926 dst
->metrics
[RTAX_MTU
-1] = mtu
;
927 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
931 static int ipv6_get_mtu(struct net_device
*dev
);
933 static inline unsigned int ipv6_advmss(struct net
*net
, unsigned int mtu
)
935 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
937 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
938 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
941 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
942 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
943 * IPV6_MAXPLEN is also valid and means: "any MSS,
944 * rely only on pmtu discovery"
946 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
951 static struct dst_entry
*icmp6_dst_gc_list
;
952 static DEFINE_SPINLOCK(icmp6_dst_lock
);
954 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
955 struct neighbour
*neigh
,
956 const struct in6_addr
*addr
)
959 struct inet6_dev
*idev
= in6_dev_get(dev
);
960 struct net
*net
= dev_net(dev
);
962 if (unlikely(idev
== NULL
))
965 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
966 if (unlikely(rt
== NULL
)) {
975 neigh
= ndisc_get_neigh(dev
, addr
);
981 rt
->rt6i_idev
= idev
;
982 rt
->rt6i_nexthop
= neigh
;
983 atomic_set(&rt
->dst
.__refcnt
, 1);
984 rt
->dst
.metrics
[RTAX_HOPLIMIT
-1] = 255;
985 rt
->dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
986 rt
->dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->dst
));
987 rt
->dst
.output
= ip6_output
;
990 spin_lock_bh(&icmp6_dst_lock
);
991 rt
->dst
.next
= icmp6_dst_gc_list
;
992 icmp6_dst_gc_list
= &rt
->dst
;
993 spin_unlock_bh(&icmp6_dst_lock
);
995 fib6_force_start_gc(net
);
1001 int icmp6_dst_gc(void)
1003 struct dst_entry
*dst
, *next
, **pprev
;
1008 spin_lock_bh(&icmp6_dst_lock
);
1009 pprev
= &icmp6_dst_gc_list
;
1011 while ((dst
= *pprev
) != NULL
) {
1012 if (!atomic_read(&dst
->__refcnt
)) {
1021 spin_unlock_bh(&icmp6_dst_lock
);
1026 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1029 struct dst_entry
*dst
, **pprev
;
1031 spin_lock_bh(&icmp6_dst_lock
);
1032 pprev
= &icmp6_dst_gc_list
;
1033 while ((dst
= *pprev
) != NULL
) {
1034 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1035 if (func(rt
, arg
)) {
1042 spin_unlock_bh(&icmp6_dst_lock
);
1045 static int ip6_dst_gc(struct dst_ops
*ops
)
1047 unsigned long now
= jiffies
;
1048 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
1049 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1050 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1051 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1052 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1053 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1055 if (time_after(rt_last_gc
+ rt_min_interval
, now
) &&
1056 atomic_read(&ops
->entries
) <= rt_max_size
)
1059 net
->ipv6
.ip6_rt_gc_expire
++;
1060 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
);
1061 net
->ipv6
.ip6_rt_last_gc
= now
;
1062 if (atomic_read(&ops
->entries
) < ops
->gc_thresh
)
1063 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1065 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1066 return (atomic_read(&ops
->entries
) > rt_max_size
);
1069 /* Clean host part of a prefix. Not necessary in radix tree,
1070 but results in cleaner routing tables.
1072 Remove it only when all the things will work!
1075 static int ipv6_get_mtu(struct net_device
*dev
)
1077 int mtu
= IPV6_MIN_MTU
;
1078 struct inet6_dev
*idev
;
1081 idev
= __in6_dev_get(dev
);
1083 mtu
= idev
->cnf
.mtu6
;
1088 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1090 int hoplimit
= dst_metric(dst
, RTAX_HOPLIMIT
);
1092 struct net_device
*dev
= dst
->dev
;
1093 struct inet6_dev
*idev
;
1096 idev
= __in6_dev_get(dev
);
1098 hoplimit
= idev
->cnf
.hop_limit
;
1100 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1110 int ip6_route_add(struct fib6_config
*cfg
)
1113 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1114 struct rt6_info
*rt
= NULL
;
1115 struct net_device
*dev
= NULL
;
1116 struct inet6_dev
*idev
= NULL
;
1117 struct fib6_table
*table
;
1120 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1122 #ifndef CONFIG_IPV6_SUBTREES
1123 if (cfg
->fc_src_len
)
1126 if (cfg
->fc_ifindex
) {
1128 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1131 idev
= in6_dev_get(dev
);
1136 if (cfg
->fc_metric
== 0)
1137 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1139 table
= fib6_new_table(net
, cfg
->fc_table
);
1140 if (table
== NULL
) {
1145 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1152 rt
->dst
.obsolete
= -1;
1153 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1154 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1157 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1158 cfg
->fc_protocol
= RTPROT_BOOT
;
1159 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1161 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1163 if (addr_type
& IPV6_ADDR_MULTICAST
)
1164 rt
->dst
.input
= ip6_mc_input
;
1166 rt
->dst
.input
= ip6_forward
;
1168 rt
->dst
.output
= ip6_output
;
1170 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1171 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1172 if (rt
->rt6i_dst
.plen
== 128)
1173 rt
->dst
.flags
= DST_HOST
;
1175 #ifdef CONFIG_IPV6_SUBTREES
1176 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1177 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1180 rt
->rt6i_metric
= cfg
->fc_metric
;
1182 /* We cannot add true routes via loopback here,
1183 they would result in kernel looping; promote them to reject routes
1185 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1186 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
))) {
1187 /* hold loopback dev/idev if we haven't done so. */
1188 if (dev
!= net
->loopback_dev
) {
1193 dev
= net
->loopback_dev
;
1195 idev
= in6_dev_get(dev
);
1201 rt
->dst
.output
= ip6_pkt_discard_out
;
1202 rt
->dst
.input
= ip6_pkt_discard
;
1203 rt
->dst
.error
= -ENETUNREACH
;
1204 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1208 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1209 struct in6_addr
*gw_addr
;
1212 gw_addr
= &cfg
->fc_gateway
;
1213 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1214 gwa_type
= ipv6_addr_type(gw_addr
);
1216 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1217 struct rt6_info
*grt
;
1219 /* IPv6 strictly inhibits using not link-local
1220 addresses as nexthop address.
1221 Otherwise, router will not be able to send redirects.
1222 It is very good, but in some (rare!) circumstances
1223 (SIT, PtP, NBMA NOARP links) it is handy to allow
1224 some exceptions. --ANK
1227 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1230 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1232 err
= -EHOSTUNREACH
;
1236 if (dev
!= grt
->rt6i_dev
) {
1237 dst_release(&grt
->dst
);
1241 dev
= grt
->rt6i_dev
;
1242 idev
= grt
->rt6i_idev
;
1244 in6_dev_hold(grt
->rt6i_idev
);
1246 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1248 dst_release(&grt
->dst
);
1254 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1262 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1263 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1264 if (IS_ERR(rt
->rt6i_nexthop
)) {
1265 err
= PTR_ERR(rt
->rt6i_nexthop
);
1266 rt
->rt6i_nexthop
= NULL
;
1271 rt
->rt6i_flags
= cfg
->fc_flags
;
1278 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1279 int type
= nla_type(nla
);
1282 if (type
> RTAX_MAX
) {
1287 rt
->dst
.metrics
[type
- 1] = nla_get_u32(nla
);
1292 if (dst_metric(&rt
->dst
, RTAX_HOPLIMIT
) == 0)
1293 rt
->dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1294 if (!dst_mtu(&rt
->dst
))
1295 rt
->dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(dev
);
1296 if (!dst_metric(&rt
->dst
, RTAX_ADVMSS
))
1297 rt
->dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->dst
));
1299 rt
->rt6i_idev
= idev
;
1300 rt
->rt6i_table
= table
;
1302 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1304 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1316 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1319 struct fib6_table
*table
;
1320 struct net
*net
= dev_net(rt
->rt6i_dev
);
1322 if (rt
== net
->ipv6
.ip6_null_entry
)
1325 table
= rt
->rt6i_table
;
1326 write_lock_bh(&table
->tb6_lock
);
1328 err
= fib6_del(rt
, info
);
1329 dst_release(&rt
->dst
);
1331 write_unlock_bh(&table
->tb6_lock
);
/*
 * Delete @rt using a default netlink info block.
 *
 * Builds a struct nl_info whose only explicitly set field, nl_net, is
 * the network namespace of the route's device (dev_net(rt->rt6i_dev)),
 * then hands off to __ip6_del_rt().  The return value is whatever
 * __ip6_del_rt() returns.
 */
1336 int ip6_del_rt(struct rt6_info
*rt
)
1338 struct nl_info info
= {
1339 .nl_net
= dev_net(rt
->rt6i_dev
),
1341 return __ip6_del_rt(rt
, &info
);
1344 static int ip6_route_del(struct fib6_config
*cfg
)
1346 struct fib6_table
*table
;
1347 struct fib6_node
*fn
;
1348 struct rt6_info
*rt
;
1351 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1355 read_lock_bh(&table
->tb6_lock
);
1357 fn
= fib6_locate(&table
->tb6_root
,
1358 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1359 &cfg
->fc_src
, cfg
->fc_src_len
);
1362 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1363 if (cfg
->fc_ifindex
&&
1364 (rt
->rt6i_dev
== NULL
||
1365 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1367 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1368 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1370 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1373 read_unlock_bh(&table
->tb6_lock
);
1375 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1378 read_unlock_bh(&table
->tb6_lock
);
1386 struct ip6rd_flowi
{
1388 struct in6_addr gateway
;
1391 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1392 struct fib6_table
*table
,
1396 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl
;
1397 struct rt6_info
*rt
;
1398 struct fib6_node
*fn
;
1401 * Get the "current" route for this destination and
1402 * check if the redirect has come from appropriate router.
1404 * RFC 2461 specifies that redirects should only be
1405 * accepted if they come from the nexthop to the target.
1406 * Due to the way the routes are chosen, this notion
1407 * is a bit fuzzy and one might need to check all possible
1411 read_lock_bh(&table
->tb6_lock
);
1412 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
1414 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1416 * Current route is on-link; redirect is always invalid.
1418 * Seems, previous statement is not true. It could
1419 * be node, which looks for us as on-link (f.e. proxy ndisc)
1420 * But then router serving it might decide, that we should
1421 * know truth 8)8) --ANK (980726).
1423 if (rt6_check_expired(rt
))
1425 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1427 if (fl
->oif
!= rt
->rt6i_dev
->ifindex
)
1429 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1435 rt
= net
->ipv6
.ip6_null_entry
;
1436 BACKTRACK(net
, &fl
->fl6_src
);
1440 read_unlock_bh(&table
->tb6_lock
);
1445 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1446 struct in6_addr
*src
,
1447 struct in6_addr
*gateway
,
1448 struct net_device
*dev
)
1450 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1451 struct net
*net
= dev_net(dev
);
1452 struct ip6rd_flowi rdfl
= {
1454 .oif
= dev
->ifindex
,
1464 ipv6_addr_copy(&rdfl
.gateway
, gateway
);
1466 if (rt6_need_strict(dest
))
1467 flags
|= RT6_LOOKUP_F_IFACE
;
1469 return (struct rt6_info
*)fib6_rule_lookup(net
, (struct flowi
*)&rdfl
,
1470 flags
, __ip6_route_redirect
);
1473 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1474 struct in6_addr
*saddr
,
1475 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1477 struct rt6_info
*rt
, *nrt
= NULL
;
1478 struct netevent_redirect netevent
;
1479 struct net
*net
= dev_net(neigh
->dev
);
1481 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1483 if (rt
== net
->ipv6
.ip6_null_entry
) {
1484 if (net_ratelimit())
1485 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1486 "for redirect target\n");
1491 * We have finally decided to accept it.
1494 neigh_update(neigh
, lladdr
, NUD_STALE
,
1495 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1496 NEIGH_UPDATE_F_OVERRIDE
|
1497 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1498 NEIGH_UPDATE_F_ISROUTER
))
1502 * Redirect received -> path was valid.
1503 * Look, redirects are sent only in response to data packets,
1504 * so that this nexthop apparently is reachable. --ANK
1506 dst_confirm(&rt
->dst
);
1508 /* Duplicate redirect: silently ignore. */
1509 if (neigh
== rt
->dst
.neighbour
)
1512 nrt
= ip6_rt_copy(rt
);
1516 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1518 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1520 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1521 nrt
->rt6i_dst
.plen
= 128;
1522 nrt
->dst
.flags
|= DST_HOST
;
1524 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1525 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1526 /* Reset pmtu, it may be better */
1527 nrt
->dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(neigh
->dev
);
1528 nrt
->dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(dev_net(neigh
->dev
),
1529 dst_mtu(&nrt
->dst
));
1531 if (ip6_ins_rt(nrt
))
1534 netevent
.old
= &rt
->dst
;
1535 netevent
.new = &nrt
->dst
;
1536 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1538 if (rt
->rt6i_flags
&RTF_CACHE
) {
1544 dst_release(&rt
->dst
);
1548 * Handle ICMP "packet too big" messages
1549 * i.e. Path MTU discovery
1552 static void rt6_do_pmtu_disc(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1553 struct net
*net
, u32 pmtu
, int ifindex
)
1555 struct rt6_info
*rt
, *nrt
;
1558 rt
= rt6_lookup(net
, daddr
, saddr
, ifindex
, 0);
1562 if (pmtu
>= dst_mtu(&rt
->dst
))
1565 if (pmtu
< IPV6_MIN_MTU
) {
1567 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1568 * MTU (1280) and a fragment header should always be included
1569 * after a node receiving Too Big message reporting PMTU is
1570 * less than the IPv6 Minimum Link MTU.
1572 pmtu
= IPV6_MIN_MTU
;
1576 /* New mtu received -> path was valid.
1577 They are sent only in response to data packets,
1578 so that this nexthop apparently is reachable. --ANK
1580 dst_confirm(&rt
->dst
);
1582 /* Host route. If it is static, it would be better
1583 not to override it, but add new one, so that
1584 when cache entry will expire old pmtu
1585 would return automatically.
1587 if (rt
->rt6i_flags
& RTF_CACHE
) {
1588 rt
->dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1590 rt
->dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1591 dst_set_expires(&rt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1592 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1597 Two cases are possible:
1598 1. It is connected route. Action: COW
1599 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1601 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1602 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1604 nrt
= rt6_alloc_clone(rt
, daddr
);
1607 nrt
->dst
.metrics
[RTAX_MTU
-1] = pmtu
;
1609 nrt
->dst
.metrics
[RTAX_FEATURES
-1] |= RTAX_FEATURE_ALLFRAG
;
1611 /* According to RFC 1981, detecting PMTU increase shouldn't be
1612 * happened within 5 mins, the recommended timer is 10 mins.
1613 * Here this route expiration time is set to ip6_rt_mtu_expires
1614 * which is 10 mins. After 10 mins the decreased pmtu is expired
1615 * and detecting PMTU increase will be automatically happened.
1617 dst_set_expires(&nrt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1618 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1623 dst_release(&rt
->dst
);
1626 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1627 struct net_device
*dev
, u32 pmtu
)
1629 struct net
*net
= dev_net(dev
);
1632 * RFC 1981 states that a node "MUST reduce the size of the packets it
1633 * is sending along the path" that caused the Packet Too Big message.
1634 * Since it's not possible in the general case to determine which
1635 * interface was used to send the original packet, we update the MTU
1636 * on the interface that will be used to send future packets. We also
1637 * update the MTU on the interface that received the Packet Too Big in
1638 * case the original packet was forced out that interface with
1639 * SO_BINDTODEVICE or similar. This is the next best thing to the
1640 * correct behaviour, which would be to update the MTU on all
1643 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, 0);
1644 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, dev
->ifindex
);
1648 * Misc support functions
1651 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1653 struct net
*net
= dev_net(ort
->rt6i_dev
);
1654 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1657 rt
->dst
.input
= ort
->dst
.input
;
1658 rt
->dst
.output
= ort
->dst
.output
;
1660 memcpy(rt
->dst
.metrics
, ort
->dst
.metrics
, RTAX_MAX
*sizeof(u32
));
1661 rt
->dst
.error
= ort
->dst
.error
;
1662 rt
->dst
.dev
= ort
->dst
.dev
;
1664 dev_hold(rt
->dst
.dev
);
1665 rt
->rt6i_idev
= ort
->rt6i_idev
;
1667 in6_dev_hold(rt
->rt6i_idev
);
1668 rt
->dst
.lastuse
= jiffies
;
1669 rt
->rt6i_expires
= 0;
1671 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1672 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1673 rt
->rt6i_metric
= 0;
1675 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1676 #ifdef CONFIG_IPV6_SUBTREES
1677 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1679 rt
->rt6i_table
= ort
->rt6i_table
;
1684 #ifdef CONFIG_IPV6_ROUTE_INFO
1685 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1686 struct in6_addr
*prefix
, int prefixlen
,
1687 struct in6_addr
*gwaddr
, int ifindex
)
1689 struct fib6_node
*fn
;
1690 struct rt6_info
*rt
= NULL
;
1691 struct fib6_table
*table
;
1693 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1697 write_lock_bh(&table
->tb6_lock
);
1698 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1702 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1703 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1705 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1707 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1713 write_unlock_bh(&table
->tb6_lock
);
1717 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1718 struct in6_addr
*prefix
, int prefixlen
,
1719 struct in6_addr
*gwaddr
, int ifindex
,
1722 struct fib6_config cfg
= {
1723 .fc_table
= RT6_TABLE_INFO
,
1724 .fc_metric
= IP6_RT_PRIO_USER
,
1725 .fc_ifindex
= ifindex
,
1726 .fc_dst_len
= prefixlen
,
1727 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1728 RTF_UP
| RTF_PREF(pref
),
1730 .fc_nlinfo
.nlh
= NULL
,
1731 .fc_nlinfo
.nl_net
= net
,
1734 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1735 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1737 /* We should treat it as a default route if prefix length is 0. */
1739 cfg
.fc_flags
|= RTF_DEFAULT
;
1741 ip6_route_add(&cfg
);
1743 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1747 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1749 struct rt6_info
*rt
;
1750 struct fib6_table
*table
;
1752 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1756 write_lock_bh(&table
->tb6_lock
);
1757 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->dst
.rt6_next
) {
1758 if (dev
== rt
->rt6i_dev
&&
1759 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1760 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1765 write_unlock_bh(&table
->tb6_lock
);
1769 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1770 struct net_device
*dev
,
1773 struct fib6_config cfg
= {
1774 .fc_table
= RT6_TABLE_DFLT
,
1775 .fc_metric
= IP6_RT_PRIO_USER
,
1776 .fc_ifindex
= dev
->ifindex
,
1777 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1778 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1780 .fc_nlinfo
.nlh
= NULL
,
1781 .fc_nlinfo
.nl_net
= dev_net(dev
),
1784 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1786 ip6_route_add(&cfg
);
1788 return rt6_get_dflt_router(gwaddr
, dev
);
1791 void rt6_purge_dflt_routers(struct net
*net
)
1793 struct rt6_info
*rt
;
1794 struct fib6_table
*table
;
1796 /* NOTE: Keep consistent with rt6_get_dflt_router */
1797 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1802 read_lock_bh(&table
->tb6_lock
);
1803 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1804 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1806 read_unlock_bh(&table
->tb6_lock
);
1811 read_unlock_bh(&table
->tb6_lock
);
1814 static void rtmsg_to_fib6_config(struct net
*net
,
1815 struct in6_rtmsg
*rtmsg
,
1816 struct fib6_config
*cfg
)
1818 memset(cfg
, 0, sizeof(*cfg
));
1820 cfg
->fc_table
= RT6_TABLE_MAIN
;
1821 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1822 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1823 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1824 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1825 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1826 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1828 cfg
->fc_nlinfo
.nl_net
= net
;
1830 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1831 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1832 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1835 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1837 struct fib6_config cfg
;
1838 struct in6_rtmsg rtmsg
;
1842 case SIOCADDRT
: /* Add a route */
1843 case SIOCDELRT
: /* Delete a route */
1844 if (!capable(CAP_NET_ADMIN
))
1846 err
= copy_from_user(&rtmsg
, arg
,
1847 sizeof(struct in6_rtmsg
));
1851 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1856 err
= ip6_route_add(&cfg
);
1859 err
= ip6_route_del(&cfg
);
1873 * Drop the packet on the floor
1876 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
1879 struct dst_entry
*dst
= skb_dst(skb
);
1880 switch (ipstats_mib_noroutes
) {
1881 case IPSTATS_MIB_INNOROUTES
:
1882 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1883 if (type
== IPV6_ADDR_ANY
) {
1884 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1885 IPSTATS_MIB_INADDRERRORS
);
1889 case IPSTATS_MIB_OUTNOROUTES
:
1890 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1891 ipstats_mib_noroutes
);
1894 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0);
1899 static int ip6_pkt_discard(struct sk_buff
*skb
)
1901 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1904 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1906 skb
->dev
= skb_dst(skb
)->dev
;
1907 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1910 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1912 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1914 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1917 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1919 skb
->dev
= skb_dst(skb
)->dev
;
1920 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
1926 * Allocate a dst for local (unicast / anycast) address.
1929 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1930 const struct in6_addr
*addr
,
1933 struct net
*net
= dev_net(idev
->dev
);
1934 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1935 struct neighbour
*neigh
;
1938 return ERR_PTR(-ENOMEM
);
1940 dev_hold(net
->loopback_dev
);
1943 rt
->dst
.flags
= DST_HOST
;
1944 rt
->dst
.input
= ip6_input
;
1945 rt
->dst
.output
= ip6_output
;
1946 rt
->rt6i_dev
= net
->loopback_dev
;
1947 rt
->rt6i_idev
= idev
;
1948 rt
->dst
.metrics
[RTAX_MTU
-1] = ipv6_get_mtu(rt
->rt6i_dev
);
1949 rt
->dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, dst_mtu(&rt
->dst
));
1950 rt
->dst
.metrics
[RTAX_HOPLIMIT
-1] = -1;
1951 rt
->dst
.obsolete
= -1;
1953 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
1955 rt
->rt6i_flags
|= RTF_ANYCAST
;
1957 rt
->rt6i_flags
|= RTF_LOCAL
;
1958 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
1959 if (IS_ERR(neigh
)) {
1962 /* We are casting this because that is the return
1963 * value type. But an errno encoded pointer is the
1964 * same regardless of the underlying pointer type,
1965 * and that's what we are returning. So this is OK.
1967 return (struct rt6_info
*) neigh
;
1969 rt
->rt6i_nexthop
= neigh
;
1971 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1972 rt
->rt6i_dst
.plen
= 128;
1973 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
1975 atomic_set(&rt
->dst
.__refcnt
, 1);
1980 struct arg_dev_net
{
1981 struct net_device
*dev
;
1985 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
1987 struct net_device
*dev
= ((struct arg_dev_net
*)arg
)->dev
;
1988 struct net
*net
= ((struct arg_dev_net
*)arg
)->net
;
1990 if (((void *)rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
1991 rt
!= net
->ipv6
.ip6_null_entry
) {
1992 RT6_TRACE("deleted by ifdown %p\n", rt
);
1998 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
2000 struct arg_dev_net adn
= {
2005 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
2006 icmp6_clean_all(fib6_ifdown
, &adn
);
2009 struct rt6_mtu_change_arg
2011 struct net_device
*dev
;
2015 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2017 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2018 struct inet6_dev
*idev
;
2019 struct net
*net
= dev_net(arg
->dev
);
2021 /* In IPv6 pmtu discovery is not optional,
2022 so that RTAX_MTU lock cannot disable it.
2023 We still use this lock to block changes
2024 caused by addrconf/ndisc.
2027 idev
= __in6_dev_get(arg
->dev
);
2031 /* For administrative MTU increase, there is no way to discover
2032 IPv6 PMTU increase, so PMTU increase should be updated here.
2033 Since RFC 1981 doesn't include administrative MTU increase
2034 update PMTU increase is a MUST. (i.e. jumbo frame)
2037 If new MTU is less than route PMTU, this new MTU will be the
2038 lowest MTU in the path, update the route PMTU to reflect PMTU
2039 decreases; if new MTU is greater than route PMTU, and the
2040 old MTU is the lowest MTU in the path, update the route PMTU
2041 to reflect the increase. In this case if the other nodes' MTU
2042 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2045 if (rt
->rt6i_dev
== arg
->dev
&&
2046 !dst_metric_locked(&rt
->dst
, RTAX_MTU
) &&
2047 (dst_mtu(&rt
->dst
) >= arg
->mtu
||
2048 (dst_mtu(&rt
->dst
) < arg
->mtu
&&
2049 dst_mtu(&rt
->dst
) == idev
->cnf
.mtu6
))) {
2050 rt
->dst
.metrics
[RTAX_MTU
-1] = arg
->mtu
;
2051 rt
->dst
.metrics
[RTAX_ADVMSS
-1] = ipv6_advmss(net
, arg
->mtu
);
2056 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2058 struct rt6_mtu_change_arg arg
= {
2063 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2066 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2067 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2068 [RTA_OIF
] = { .type
= NLA_U32
},
2069 [RTA_IIF
] = { .type
= NLA_U32
},
2070 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2071 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2074 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2075 struct fib6_config
*cfg
)
2078 struct nlattr
*tb
[RTA_MAX
+1];
2081 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2086 rtm
= nlmsg_data(nlh
);
2087 memset(cfg
, 0, sizeof(*cfg
));
2089 cfg
->fc_table
= rtm
->rtm_table
;
2090 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2091 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2092 cfg
->fc_flags
= RTF_UP
;
2093 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2095 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2096 cfg
->fc_flags
|= RTF_REJECT
;
2098 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2099 cfg
->fc_nlinfo
.nlh
= nlh
;
2100 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2102 if (tb
[RTA_GATEWAY
]) {
2103 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2104 cfg
->fc_flags
|= RTF_GATEWAY
;
2108 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2110 if (nla_len(tb
[RTA_DST
]) < plen
)
2113 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2117 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2119 if (nla_len(tb
[RTA_SRC
]) < plen
)
2122 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2126 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2128 if (tb
[RTA_PRIORITY
])
2129 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2131 if (tb
[RTA_METRICS
]) {
2132 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2133 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2137 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2144 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2146 struct fib6_config cfg
;
2149 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2153 return ip6_route_del(&cfg
);
2156 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2158 struct fib6_config cfg
;
2161 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2165 return ip6_route_add(&cfg
);
2168 static inline size_t rt6_nlmsg_size(void)
2170 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2171 + nla_total_size(16) /* RTA_SRC */
2172 + nla_total_size(16) /* RTA_DST */
2173 + nla_total_size(16) /* RTA_GATEWAY */
2174 + nla_total_size(16) /* RTA_PREFSRC */
2175 + nla_total_size(4) /* RTA_TABLE */
2176 + nla_total_size(4) /* RTA_IIF */
2177 + nla_total_size(4) /* RTA_OIF */
2178 + nla_total_size(4) /* RTA_PRIORITY */
2179 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2180 + nla_total_size(sizeof(struct rta_cacheinfo
));
2183 static int rt6_fill_node(struct net
*net
,
2184 struct sk_buff
*skb
, struct rt6_info
*rt
,
2185 struct in6_addr
*dst
, struct in6_addr
*src
,
2186 int iif
, int type
, u32 pid
, u32 seq
,
2187 int prefix
, int nowait
, unsigned int flags
)
2190 struct nlmsghdr
*nlh
;
2194 if (prefix
) { /* user wants prefix routes only */
2195 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2196 /* success since this is not a prefix route */
2201 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2205 rtm
= nlmsg_data(nlh
);
2206 rtm
->rtm_family
= AF_INET6
;
2207 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2208 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2211 table
= rt
->rt6i_table
->tb6_id
;
2213 table
= RT6_TABLE_UNSPEC
;
2214 rtm
->rtm_table
= table
;
2215 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2216 if (rt
->rt6i_flags
&RTF_REJECT
)
2217 rtm
->rtm_type
= RTN_UNREACHABLE
;
2218 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2219 rtm
->rtm_type
= RTN_LOCAL
;
2221 rtm
->rtm_type
= RTN_UNICAST
;
2223 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2224 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2225 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2226 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2227 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2228 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2229 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2230 rtm
->rtm_protocol
= RTPROT_RA
;
2232 if (rt
->rt6i_flags
&RTF_CACHE
)
2233 rtm
->rtm_flags
|= RTM_F_CLONED
;
2236 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2237 rtm
->rtm_dst_len
= 128;
2238 } else if (rtm
->rtm_dst_len
)
2239 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2240 #ifdef CONFIG_IPV6_SUBTREES
2242 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2243 rtm
->rtm_src_len
= 128;
2244 } else if (rtm
->rtm_src_len
)
2245 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2248 #ifdef CONFIG_IPV6_MROUTE
2249 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2250 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2255 goto nla_put_failure
;
2257 if (err
== -EMSGSIZE
)
2258 goto nla_put_failure
;
2263 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2265 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->dst
);
2266 struct in6_addr saddr_buf
;
2267 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2268 dst
, 0, &saddr_buf
) == 0)
2269 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2272 if (rtnetlink_put_metrics(skb
, rt
->dst
.metrics
) < 0)
2273 goto nla_put_failure
;
2275 if (rt
->dst
.neighbour
)
2276 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->dst
.neighbour
->primary_key
);
2279 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2281 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2283 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2285 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2286 expires
= rt
->rt6i_expires
- jiffies
;
2290 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, 0, 0,
2291 expires
, rt
->dst
.error
) < 0)
2292 goto nla_put_failure
;
2294 return nlmsg_end(skb
, nlh
);
2297 nlmsg_cancel(skb
, nlh
);
2301 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2303 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2306 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2307 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2308 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2312 return rt6_fill_node(arg
->net
,
2313 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2314 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2315 prefix
, 0, NLM_F_MULTI
);
2318 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2320 struct net
*net
= sock_net(in_skb
->sk
);
2321 struct nlattr
*tb
[RTA_MAX
+1];
2322 struct rt6_info
*rt
;
2323 struct sk_buff
*skb
;
2328 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2333 memset(&fl
, 0, sizeof(fl
));
2336 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2339 ipv6_addr_copy(&fl
.fl6_src
, nla_data(tb
[RTA_SRC
]));
2343 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2346 ipv6_addr_copy(&fl
.fl6_dst
, nla_data(tb
[RTA_DST
]));
2350 iif
= nla_get_u32(tb
[RTA_IIF
]);
2353 fl
.oif
= nla_get_u32(tb
[RTA_OIF
]);
2356 struct net_device
*dev
;
2357 dev
= __dev_get_by_index(net
, iif
);
2364 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2370 /* Reserve room for dummy headers, this skb can pass
2371 through good chunk of routing engine.
2373 skb_reset_mac_header(skb
);
2374 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2376 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl
);
2377 skb_dst_set(skb
, &rt
->dst
);
2379 err
= rt6_fill_node(net
, skb
, rt
, &fl
.fl6_dst
, &fl
.fl6_src
, iif
,
2380 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2381 nlh
->nlmsg_seq
, 0, 0, 0);
2387 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2392 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2394 struct sk_buff
*skb
;
2395 struct net
*net
= info
->nl_net
;
2400 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2402 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2406 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2407 event
, info
->pid
, seq
, 0, 0, 0);
2409 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2410 WARN_ON(err
== -EMSGSIZE
);
2414 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2415 info
->nlh
, gfp_any());
2419 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2422 static int ip6_route_dev_notify(struct notifier_block
*this,
2423 unsigned long event
, void *data
)
2425 struct net_device
*dev
= (struct net_device
*)data
;
2426 struct net
*net
= dev_net(dev
);
2428 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2429 net
->ipv6
.ip6_null_entry
->dst
.dev
= dev
;
2430 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2431 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2432 net
->ipv6
.ip6_prohibit_entry
->dst
.dev
= dev
;
2433 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2434 net
->ipv6
.ip6_blk_hole_entry
->dst
.dev
= dev
;
2435 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2446 #ifdef CONFIG_PROC_FS
2448 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2459 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2461 struct seq_file
*m
= p_arg
;
2463 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2465 #ifdef CONFIG_IPV6_SUBTREES
2466 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2468 seq_puts(m
, "00000000000000000000000000000000 00 ");
2471 if (rt
->rt6i_nexthop
) {
2472 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2474 seq_puts(m
, "00000000000000000000000000000000");
2476 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2477 rt
->rt6i_metric
, atomic_read(&rt
->dst
.__refcnt
),
2478 rt
->dst
.__use
, rt
->rt6i_flags
,
2479 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2483 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2485 struct net
*net
= (struct net
*)m
->private;
2486 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2490 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2492 return single_open_net(inode
, file
, ipv6_route_show
);
2495 static const struct file_operations ipv6_route_proc_fops
= {
2496 .owner
= THIS_MODULE
,
2497 .open
= ipv6_route_open
,
2499 .llseek
= seq_lseek
,
2500 .release
= single_release_net
,
2503 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2505 struct net
*net
= (struct net
*)seq
->private;
2506 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2507 net
->ipv6
.rt6_stats
->fib_nodes
,
2508 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2509 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2510 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2511 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2512 atomic_read(&net
->ipv6
.ip6_dst_ops
.entries
),
2513 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2518 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2520 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2523 static const struct file_operations rt6_stats_seq_fops
= {
2524 .owner
= THIS_MODULE
,
2525 .open
= rt6_stats_seq_open
,
2527 .llseek
= seq_lseek
,
2528 .release
= single_release_net
,
2530 #endif /* CONFIG_PROC_FS */
2532 #ifdef CONFIG_SYSCTL
2535 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
,
2536 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2538 struct net
*net
= current
->nsproxy
->net_ns
;
2539 int delay
= net
->ipv6
.sysctl
.flush_delay
;
2541 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
2542 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2548 ctl_table ipv6_route_table_template
[] = {
2550 .procname
= "flush",
2551 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2552 .maxlen
= sizeof(int),
2554 .proc_handler
= ipv6_sysctl_rtcache_flush
2557 .procname
= "gc_thresh",
2558 .data
= &ip6_dst_ops_template
.gc_thresh
,
2559 .maxlen
= sizeof(int),
2561 .proc_handler
= proc_dointvec
,
2564 .procname
= "max_size",
2565 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2566 .maxlen
= sizeof(int),
2568 .proc_handler
= proc_dointvec
,
2571 .procname
= "gc_min_interval",
2572 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2573 .maxlen
= sizeof(int),
2575 .proc_handler
= proc_dointvec_jiffies
,
2578 .procname
= "gc_timeout",
2579 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2580 .maxlen
= sizeof(int),
2582 .proc_handler
= proc_dointvec_jiffies
,
2585 .procname
= "gc_interval",
2586 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2587 .maxlen
= sizeof(int),
2589 .proc_handler
= proc_dointvec_jiffies
,
2592 .procname
= "gc_elasticity",
2593 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2594 .maxlen
= sizeof(int),
2596 .proc_handler
= proc_dointvec
,
2599 .procname
= "mtu_expires",
2600 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2601 .maxlen
= sizeof(int),
2603 .proc_handler
= proc_dointvec_jiffies
,
2606 .procname
= "min_adv_mss",
2607 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2608 .maxlen
= sizeof(int),
2610 .proc_handler
= proc_dointvec
,
2613 .procname
= "gc_min_interval_ms",
2614 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2615 .maxlen
= sizeof(int),
2617 .proc_handler
= proc_dointvec_ms_jiffies
,
2622 struct ctl_table
* __net_init
ipv6_route_sysctl_init(struct net
*net
)
2624 struct ctl_table
*table
;
2626 table
= kmemdup(ipv6_route_table_template
,
2627 sizeof(ipv6_route_table_template
),
2631 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2632 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
2633 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2634 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2635 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2636 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2637 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2638 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2639 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2640 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2647 static int __net_init
ip6_route_net_init(struct net
*net
)
2651 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
2652 sizeof(net
->ipv6
.ip6_dst_ops
));
2654 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2655 sizeof(*net
->ipv6
.ip6_null_entry
),
2657 if (!net
->ipv6
.ip6_null_entry
)
2658 goto out_ip6_dst_ops
;
2659 net
->ipv6
.ip6_null_entry
->dst
.path
=
2660 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2661 net
->ipv6
.ip6_null_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2663 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2664 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2665 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2667 if (!net
->ipv6
.ip6_prohibit_entry
)
2668 goto out_ip6_null_entry
;
2669 net
->ipv6
.ip6_prohibit_entry
->dst
.path
=
2670 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2671 net
->ipv6
.ip6_prohibit_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2673 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2674 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2676 if (!net
->ipv6
.ip6_blk_hole_entry
)
2677 goto out_ip6_prohibit_entry
;
2678 net
->ipv6
.ip6_blk_hole_entry
->dst
.path
=
2679 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2680 net
->ipv6
.ip6_blk_hole_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2683 net
->ipv6
.sysctl
.flush_delay
= 0;
2684 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2685 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2686 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2687 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2688 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2689 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2690 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2692 #ifdef CONFIG_PROC_FS
2693 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2694 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2696 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2702 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2703 out_ip6_prohibit_entry
:
2704 kfree(net
->ipv6
.ip6_prohibit_entry
);
2706 kfree(net
->ipv6
.ip6_null_entry
);
2712 static void __net_exit
ip6_route_net_exit(struct net
*net
)
2714 #ifdef CONFIG_PROC_FS
2715 proc_net_remove(net
, "ipv6_route");
2716 proc_net_remove(net
, "rt6_stats");
2718 kfree(net
->ipv6
.ip6_null_entry
);
2719 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2720 kfree(net
->ipv6
.ip6_prohibit_entry
);
2721 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2725 static struct pernet_operations ip6_route_net_ops
= {
2726 .init
= ip6_route_net_init
,
2727 .exit
= ip6_route_net_exit
,
2730 static struct notifier_block ip6_route_dev_notifier
= {
2731 .notifier_call
= ip6_route_dev_notify
,
2735 int __init
ip6_route_init(void)
2740 ip6_dst_ops_template
.kmem_cachep
=
2741 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2742 SLAB_HWCACHE_ALIGN
, NULL
);
2743 if (!ip6_dst_ops_template
.kmem_cachep
)
2746 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2748 goto out_kmem_cache
;
2750 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2752 /* Registering of the loopback is done before this portion of code,
2753 * the loopback reference in rt6_info will not be taken, do it
2754 * manually for init_net */
2755 init_net
.ipv6
.ip6_null_entry
->dst
.dev
= init_net
.loopback_dev
;
2756 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2757 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2758 init_net
.ipv6
.ip6_prohibit_entry
->dst
.dev
= init_net
.loopback_dev
;
2759 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2760 init_net
.ipv6
.ip6_blk_hole_entry
->dst
.dev
= init_net
.loopback_dev
;
2761 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2765 goto out_register_subsys
;
2771 ret
= fib6_rules_init();
2776 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2777 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2778 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2779 goto fib6_rules_init
;
2781 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2783 goto fib6_rules_init
;
2789 fib6_rules_cleanup();
2794 out_register_subsys
:
2795 unregister_pernet_subsys(&ip6_route_net_ops
);
2797 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2801 void ip6_route_cleanup(void)
2803 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2804 fib6_rules_cleanup();
2807 unregister_pernet_subsys(&ip6_route_net_ops
);
2808 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);