/* net/ipv6/route.c */
/*
 *      Linux INET6 implementation
 *      FIB front-end.
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*      Changes:
 *
 *      YOSHIFUJI Hideaki @USAGI
 *              reworked default router selection.
 *              - respect outgoing interface
 *              - select from (probably) reachable routers (i.e.
 *                routers in REACHABLE, STALE, DELAY or PROBE states).
 *              - always select the same router if it is (probably)
 *                reachable.  otherwise, round-robin the list.
 *      Ville Nuorvala
 *              Fixed routing subtrees.
 */
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
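/*
 * Illustrative note (not from the original file): RT6_TRACE() takes
 * printk-style arguments and compiles away entirely at the default
 * RT6_DEBUG level of 2, e.g.
 *
 *      RT6_TRACE("%s() => %p\n", __func__, match);
 */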
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
                                    const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void              ip6_dst_destroy(struct dst_entry *);
static void              ip6_dst_ifdown(struct dst_entry *,
                                        struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int               ip6_pkt_discard(struct sk_buff *skb);
static int               ip6_pkt_discard_out(struct sk_buff *skb);
static void              ip6_link_failure(struct sk_buff *skb);
static void              ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
#endif
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        struct rt6_info *rt = (struct rt6_info *) dst;
        struct inet_peer *peer;
        u32 *p = NULL;

        if (!(rt->dst.flags & DST_HOST))
                return NULL;

        if (!rt->rt6i_peer)
                rt6_bind_peer(rt, 1);

        peer = rt->rt6i_peer;
        if (peer) {
                u32 *old_p = __DST_METRICS_PTR(old);
                unsigned long prev, new;

                p = peer->metrics;
                if (inet_metrics_new(peer))
                        memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

                new = (unsigned long) p;
                prev = cmpxchg(&dst->_metrics, old, new);

                if (prev != old) {
                        p = __DST_METRICS_PTR(prev);
                        if (prev & DST_METRICS_READ_ONLY)
                                p = NULL;
                }
        }
        return p;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
}
static struct dst_ops ip6_dst_ops_template = {
        .family          = AF_INET6,
        .protocol        = cpu_to_be16(ETH_P_IPV6),
        .gc              = ip6_dst_gc,
        .gc_thresh       = 1024,
        .check           = ip6_dst_check,
        .default_advmss  = ip6_default_advmss,
        .mtu             = ip6_mtu,
        .cow_metrics     = ipv6_cow_metrics,
        .destroy         = ip6_dst_destroy,
        .ifdown          = ip6_dst_ifdown,
        .negative_advice = ip6_negative_advice,
        .link_failure    = ip6_link_failure,
        .update_pmtu     = ip6_rt_update_pmtu,
        .local_out       = __ip6_local_out,
        .neigh_lookup    = ip6_neigh_lookup,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

        return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                         unsigned long old)
{
        return NULL;
}

static struct dst_ops ip6_dst_blackhole_ops = {
        .family          = AF_INET6,
        .protocol        = cpu_to_be16(ETH_P_IPV6),
        .destroy         = ip6_dst_destroy,
        .check           = ip6_dst_check,
        .mtu             = ip6_blackhole_mtu,
        .default_advmss  = ip6_default_advmss,
        .update_pmtu     = ip6_rt_blackhole_update_pmtu,
        .cow_metrics     = ip6_rt_blackhole_cow_metrics,
        .neigh_lookup    = ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -ENETUNREACH,
                .input    = ip6_pkt_discard,
                .output   = ip6_pkt_discard_out,
        },
        .rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric   = ~(u32) 0,
        .rt6i_ref      = ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EACCES,
                .input    = ip6_pkt_prohibit,
                .output   = ip6_pkt_prohibit_out,
        },
        .rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric   = ~(u32) 0,
        .rt6i_ref      = ATOMIC_INIT(1),
};

static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EINVAL,
                .input    = dst_discard,
                .output   = dst_discard,
        },
        .rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric   = ~(u32) 0,
        .rt6i_ref      = ATOMIC_INIT(1),
};

#endif
/* allocate dst with ip6_dst_ops */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
                                             struct net_device *dev,
                                             int flags)
{
        struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);

        if (rt)
                memset(&rt->rt6i_table, 0,
                       sizeof(*rt) - sizeof(struct dst_entry));

        return rt;
}
static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct inet6_dev *idev = rt->rt6i_idev;
        struct inet_peer *peer = rt->rt6i_peer;

        if (!(rt->dst.flags & DST_HOST))
                dst_destroy_metrics_generic(dst);

        if (idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }
        if (peer) {
                rt->rt6i_peer = NULL;
                inet_putpeer(peer);
        }
}

static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

static u32 rt6_peer_genid(void)
{
        return atomic_read(&__rt6_peer_genid);
}

void rt6_bind_peer(struct rt6_info *rt, int create)
{
        struct inet_peer *peer;

        peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
        if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
                inet_putpeer(peer);
        else
                rt->rt6i_peer_genid = rt6_peer_genid();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                           int how)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct inet6_dev *idev = rt->rt6i_idev;
        struct net_device *loopback_dev =
                dev_net(dev)->loopback_dev;

        if (dev != loopback_dev && idev && idev->dev == dev) {
                struct inet6_dev *loopback_idev =
                        in6_dev_get(loopback_dev);
                if (loopback_idev) {
                        rt->rt6i_idev = loopback_idev;
                        in6_dev_put(idev);
                }
        }
}

static __inline__ int rt6_check_expired(const struct rt6_info *rt)
{
        return (rt->rt6i_flags & RTF_EXPIRES) &&
                time_after(jiffies, rt->rt6i_expires);
}

static inline int rt6_need_strict(const struct in6_addr *daddr)
{
        return ipv6_addr_type(daddr) &
                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
/*
 *      Route lookup. Any table->tb6_lock is implied.
 */
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329 struct rt6_info *rt,
330 const struct in6_addr *saddr,
331 int oif,
332 int flags)
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr))
338 goto out;
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
343 if (oif) {
344 if (dev->ifindex == oif)
345 return sprt;
346 if (dev->flags & IFF_LOOPBACK) {
347 if (!sprt->rt6i_idev ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
350 continue;
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
353 continue;
355 local = sprt;
357 } else {
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
360 return sprt;
364 if (oif) {
365 if (local)
366 return local;
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
371 out:
372 return rt;
375 #ifdef CONFIG_IPV6_ROUTER_PREF
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
387 rcu_read_lock();
388 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
390 goto out;
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
403 } else {
404 read_unlock_bh(&neigh->lock);
406 out:
407 rcu_read_unlock();
409 #else
410 static inline void rt6_probe(struct rt6_info *rt)
413 #endif
/*
 *      Default Router Selection (RFC 2461 6.3.6)
 */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
422 return 2;
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425 return 1;
426 return 0;
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
432 int m;
434 rcu_read_lock();
435 neigh = dst_get_neighbour_noref(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
438 m = 1;
439 else if (neigh) {
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
442 m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
445 m = 0;
446 #endif
447 else
448 m = 1;
449 read_unlock_bh(&neigh->lock);
450 } else
451 m = 0;
452 rcu_read_unlock();
453 return m;
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457 int strict)
459 int m, n;
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
463 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469 return -1;
470 return m;
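/*
 * Illustrative sketch (not from the original file): rt6_score_route()
 * packs two criteria into one integer.  rt6_check_dev() supplies the low
 * bits (2 = exact oif match, 1 = reachable via loopback, 0 = no match)
 * and, with CONFIG_IPV6_ROUTER_PREF, the decoded RA preference is OR-ed
 * in shifted left by two:
 *
 *      m = 2 | (pref << 2);    (exact interface match plus preference bits)
 *
 * so routes that differ only in router preference are still ordered,
 * while a reachability failure (rt6_check_neigh() == 0) can veto the
 * route entirely under RT6_LOOKUP_F_REACHABLE.
 */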
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
476 int m;
478 if (rt6_check_expired(rt))
479 goto out;
481 m = rt6_score_route(rt, oif, strict);
482 if (m < 0)
483 goto out;
485 if (m > *mpri) {
486 if (strict & RT6_LOOKUP_F_REACHABLE)
487 rt6_probe(match);
488 *mpri = m;
489 match = rt;
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491 rt6_probe(rt);
494 out:
495 return match;
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
503 int mpri = -1;
505 match = NULL;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
513 return match;
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
519 struct net *net;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
524 rt0 = fn->rr_ptr;
525 if (!rt0)
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
530 if (!match &&
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536 next = fn->leaf;
538 if (next != rt0)
539 fn->rr_ptr = next;
542 RT6_TRACE("%s() => %p\n",
543 __func__, match);
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
556 unsigned int pref;
557 unsigned long lifetime;
558 struct rt6_info *rt;
560 if (len < sizeof(struct route_info)) {
561 return -EINVAL;
        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
581 return -EINVAL;
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
587 else {
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
591 rinfo->prefix_len);
592 prefix = &prefix_buf;
595 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
596 dev->ifindex);
598 if (rt && !lifetime) {
599 ip6_del_rt(rt);
600 rt = NULL;
603 if (!rt && lifetime)
604 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
605 pref);
606 else if (rt)
607 rt->rt6i_flags = RTF_ROUTEINFO |
608 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
610 if (rt) {
611 if (!addrconf_finite_timeout(lifetime)) {
612 rt->rt6i_flags &= ~RTF_EXPIRES;
613 } else {
614 rt->rt6i_expires = jiffies + HZ * lifetime;
615 rt->rt6i_flags |= RTF_EXPIRES;
617 dst_release(&rt->dst);
619 return 0;
621 #endif
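/*
 * Illustrative note (not from the original file): the length field of an
 * RA Route Information option counts units of 8 octets, which is what the
 * sanity checks in rt6_route_rcv() above reflect -- only length 3 carries
 * the full 128-bit prefix, and a prefix_len above 64 needs at least
 * length 2; shorter options are masked down via ipv6_addr_prefix().
 */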
#define BACKTRACK(__net, saddr) \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
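/*
 * Illustrative sketch (not from the original file): BACKTRACK() assumes
 * its caller provides a local fib6_node pointer named 'fn', a 'restart'
 * label that redoes the per-node route selection, and an 'out' label for
 * the give-up path, roughly:
 *
 *      fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 * restart:
 *      rt = <select a route from fn>;
 *      BACKTRACK(net, &fl6->saddr);
 * out:
 *      <return rt>;
 *
 * ip6_pol_route_lookup() below follows exactly this pattern; the macro
 * walks up towards the tree root, re-entering source-routing subtrees
 * where they exist.
 */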
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642 struct fib6_table *table,
643 struct flowi6 *fl6, int flags)
645 struct fib6_node *fn;
646 struct rt6_info *rt;
648 read_lock_bh(&table->tb6_lock);
649 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651 rt = fn->leaf;
652 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653 BACKTRACK(net, &fl6->saddr);
654 out:
655 dst_use(&rt->dst, jiffies);
656 read_unlock_bh(&table->tb6_lock);
657 return rt;
661 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
662 int flags)
664 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
666 EXPORT_SYMBOL_GPL(ip6_route_lookup);
668 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
669 const struct in6_addr *saddr, int oif, int strict)
671 struct flowi6 fl6 = {
672 .flowi6_oif = oif,
673 .daddr = *daddr,
675 struct dst_entry *dst;
676 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
678 if (saddr) {
679 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
680 flags |= RT6_LOOKUP_F_HAS_SADDR;
683 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
684 if (dst->error == 0)
685 return (struct rt6_info *) dst;
687 dst_release(dst);
689 return NULL;
692 EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is freed. In any case, if the caller does not hold a
 * reference, the route may be destroyed.
 */
700 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
702 int err;
703 struct fib6_table *table;
705 table = rt->rt6i_table;
706 write_lock_bh(&table->tb6_lock);
707 err = fib6_add(&table->tb6_root, rt, info);
708 write_unlock_bh(&table->tb6_lock);
710 return err;
713 int ip6_ins_rt(struct rt6_info *rt)
715 struct nl_info info = {
716 .nl_net = dev_net(rt->rt6i_dev),
718 return __ip6_ins_rt(rt, &info);
721 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
722 const struct in6_addr *daddr,
723 const struct in6_addr *saddr)
725 struct rt6_info *rt;
        /*
         *      Clone the route.
         */
731 rt = ip6_rt_copy(ort, daddr);
733 if (rt) {
734 struct neighbour *neigh;
735 int attempts = !in_softirq();
737 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
738 if (ort->rt6i_dst.plen != 128 &&
739 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
740 rt->rt6i_flags |= RTF_ANYCAST;
741 rt->rt6i_gateway = *daddr;
744 rt->rt6i_flags |= RTF_CACHE;
746 #ifdef CONFIG_IPV6_SUBTREES
747 if (rt->rt6i_src.plen && saddr) {
748 rt->rt6i_src.addr = *saddr;
749 rt->rt6i_src.plen = 128;
751 #endif
753 retry:
754 neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway,
755 rt->rt6i_dev);
756 if (IS_ERR(neigh)) {
757 struct net *net = dev_net(rt->rt6i_dev);
758 int saved_rt_min_interval =
759 net->ipv6.sysctl.ip6_rt_gc_min_interval;
760 int saved_rt_elasticity =
761 net->ipv6.sysctl.ip6_rt_gc_elasticity;
763 if (attempts-- > 0) {
764 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
765 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
767 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
769 net->ipv6.sysctl.ip6_rt_gc_elasticity =
770 saved_rt_elasticity;
771 net->ipv6.sysctl.ip6_rt_gc_min_interval =
772 saved_rt_min_interval;
773 goto retry;
776 if (net_ratelimit())
777 printk(KERN_WARNING
778 "ipv6: Neighbour table overflow.\n");
779 dst_free(&rt->dst);
780 return NULL;
782 dst_set_neighbour(&rt->dst, neigh);
786 return rt;
789 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
790 const struct in6_addr *daddr)
792 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
794 if (rt) {
795 rt->rt6i_flags |= RTF_CACHE;
796 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
798 return rt;
801 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
802 struct flowi6 *fl6, int flags)
804 struct fib6_node *fn;
805 struct rt6_info *rt, *nrt;
806 int strict = 0;
807 int attempts = 3;
808 int err;
809 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
811 strict |= flags & RT6_LOOKUP_F_IFACE;
813 relookup:
814 read_lock_bh(&table->tb6_lock);
816 restart_2:
817 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
819 restart:
820 rt = rt6_select(fn, oif, strict | reachable);
822 BACKTRACK(net, &fl6->saddr);
823 if (rt == net->ipv6.ip6_null_entry ||
824 rt->rt6i_flags & RTF_CACHE)
825 goto out;
827 dst_hold(&rt->dst);
828 read_unlock_bh(&table->tb6_lock);
830 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
831 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
832 else if (!(rt->dst.flags & DST_HOST))
833 nrt = rt6_alloc_clone(rt, &fl6->daddr);
834 else
835 goto out2;
837 dst_release(&rt->dst);
838 rt = nrt ? : net->ipv6.ip6_null_entry;
840 dst_hold(&rt->dst);
841 if (nrt) {
842 err = ip6_ins_rt(nrt);
843 if (!err)
844 goto out2;
847 if (--attempts <= 0)
848 goto out2;
        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
854 dst_release(&rt->dst);
855 goto relookup;
857 out:
858 if (reachable) {
859 reachable = 0;
860 goto restart_2;
862 dst_hold(&rt->dst);
863 read_unlock_bh(&table->tb6_lock);
864 out2:
865 rt->dst.lastuse = jiffies;
866 rt->dst.__use++;
868 return rt;
871 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
872 struct flowi6 *fl6, int flags)
874 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
877 void ip6_route_input(struct sk_buff *skb)
879 const struct ipv6hdr *iph = ipv6_hdr(skb);
880 struct net *net = dev_net(skb->dev);
881 int flags = RT6_LOOKUP_F_HAS_SADDR;
882 struct flowi6 fl6 = {
883 .flowi6_iif = skb->dev->ifindex,
884 .daddr = iph->daddr,
885 .saddr = iph->saddr,
886 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
887 .flowi6_mark = skb->mark,
888 .flowi6_proto = iph->nexthdr,
891 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
892 flags |= RT6_LOOKUP_F_IFACE;
894 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
897 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
898 struct flowi6 *fl6, int flags)
900 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
903 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
904 struct flowi6 *fl6)
906 int flags = 0;
908 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
909 flags |= RT6_LOOKUP_F_IFACE;
911 if (!ipv6_addr_any(&fl6->saddr))
912 flags |= RT6_LOOKUP_F_HAS_SADDR;
913 else if (sk)
914 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
916 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
919 EXPORT_SYMBOL(ip6_route_output);
921 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
923 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
924 struct dst_entry *new = NULL;
926 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
927 if (rt) {
928 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
930 new = &rt->dst;
932 new->__use = 1;
933 new->input = dst_discard;
934 new->output = dst_discard;
936 if (dst_metrics_read_only(&ort->dst))
937 new->_metrics = ort->dst._metrics;
938 else
939 dst_copy_metrics(new, &ort->dst);
940 rt->rt6i_idev = ort->rt6i_idev;
941 if (rt->rt6i_idev)
942 in6_dev_hold(rt->rt6i_idev);
943 rt->rt6i_expires = 0;
945 rt->rt6i_gateway = ort->rt6i_gateway;
946 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
947 rt->rt6i_metric = 0;
949 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
950 #ifdef CONFIG_IPV6_SUBTREES
951 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
952 #endif
954 dst_free(new);
957 dst_release(dst_orig);
958 return new ? new : ERR_PTR(-ENOMEM);
/*
 *      Destination cache support functions
 */
965 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
967 struct rt6_info *rt;
969 rt = (struct rt6_info *) dst;
971 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
972 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
973 if (!rt->rt6i_peer)
974 rt6_bind_peer(rt, 0);
975 rt->rt6i_peer_genid = rt6_peer_genid();
977 return dst;
979 return NULL;
982 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
984 struct rt6_info *rt = (struct rt6_info *) dst;
986 if (rt) {
987 if (rt->rt6i_flags & RTF_CACHE) {
988 if (rt6_check_expired(rt)) {
989 ip6_del_rt(rt);
990 dst = NULL;
992 } else {
993 dst_release(dst);
994 dst = NULL;
997 return dst;
1000 static void ip6_link_failure(struct sk_buff *skb)
1002 struct rt6_info *rt;
1004 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1006 rt = (struct rt6_info *) skb_dst(skb);
1007 if (rt) {
1008 if (rt->rt6i_flags & RTF_CACHE) {
1009 dst_set_expires(&rt->dst, 0);
1010 rt->rt6i_flags |= RTF_EXPIRES;
1011 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1012 rt->rt6i_node->fn_sernum = -1;
1016 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1018 struct rt6_info *rt6 = (struct rt6_info*)dst;
1020 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1021 rt6->rt6i_flags |= RTF_MODIFIED;
1022 if (mtu < IPV6_MIN_MTU) {
1023 u32 features = dst_metric(dst, RTAX_FEATURES);
1024 mtu = IPV6_MIN_MTU;
1025 features |= RTAX_FEATURE_ALLFRAG;
1026 dst_metric_set(dst, RTAX_FEATURES, features);
1028 dst_metric_set(dst, RTAX_MTU, mtu);
1032 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1034 struct net_device *dev = dst->dev;
1035 unsigned int mtu = dst_mtu(dst);
1036 struct net *net = dev_net(dev);
1038 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1040 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1041 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
        /*
         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
         * IPV6_MAXPLEN is also valid and means: "any MSS,
         * rely only on pmtu discovery"
         */
1049 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1050 mtu = IPV6_MAXPLEN;
1051 return mtu;
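/*
 * Illustrative arithmetic (not from the original file): for a standard
 * 1500-byte Ethernet MTU, ip6_default_advmss() above yields
 * 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr) = 1500 - 40 - 20
 * = 1440 bytes, unless the ip6_rt_min_advmss sysctl demands more; anything
 * larger than IPV6_MAXPLEN - sizeof(struct tcphdr) collapses to
 * IPV6_MAXPLEN, i.e. "any MSS, rely on PMTU discovery".
 */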
1054 static unsigned int ip6_mtu(const struct dst_entry *dst)
1056 struct inet6_dev *idev;
1057 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1059 if (mtu)
1060 return mtu;
1062 mtu = IPV6_MIN_MTU;
1064 rcu_read_lock();
1065 idev = __in6_dev_get(dst->dev);
1066 if (idev)
1067 mtu = idev->cnf.mtu6;
1068 rcu_read_unlock();
1070 return mtu;
1073 static struct dst_entry *icmp6_dst_gc_list;
1074 static DEFINE_SPINLOCK(icmp6_dst_lock);
1076 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1077 struct neighbour *neigh,
1078 struct flowi6 *fl6)
1080 struct dst_entry *dst;
1081 struct rt6_info *rt;
1082 struct inet6_dev *idev = in6_dev_get(dev);
1083 struct net *net = dev_net(dev);
1085 if (unlikely(!idev))
1086 return NULL;
1088 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1089 if (unlikely(!rt)) {
1090 in6_dev_put(idev);
1091 dst = ERR_PTR(-ENOMEM);
1092 goto out;
1095 if (neigh)
1096 neigh_hold(neigh);
1097 else {
1098 neigh = __neigh_lookup_errno(&nd_tbl, &fl6->daddr, dev);
1099 if (IS_ERR(neigh)) {
1100 dst_free(&rt->dst);
1101 return ERR_CAST(neigh);
1105 rt->dst.flags |= DST_HOST;
1106 rt->dst.output = ip6_output;
1107 dst_set_neighbour(&rt->dst, neigh);
1108 atomic_set(&rt->dst.__refcnt, 1);
1109 rt->rt6i_dst.addr = fl6->daddr;
1110 rt->rt6i_dst.plen = 128;
1111 rt->rt6i_idev = idev;
1112 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1114 spin_lock_bh(&icmp6_dst_lock);
1115 rt->dst.next = icmp6_dst_gc_list;
1116 icmp6_dst_gc_list = &rt->dst;
1117 spin_unlock_bh(&icmp6_dst_lock);
1119 fib6_force_start_gc(net);
1121 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1123 out:
1124 return dst;
1127 int icmp6_dst_gc(void)
1129 struct dst_entry *dst, **pprev;
1130 int more = 0;
1132 spin_lock_bh(&icmp6_dst_lock);
1133 pprev = &icmp6_dst_gc_list;
1135 while ((dst = *pprev) != NULL) {
1136 if (!atomic_read(&dst->__refcnt)) {
1137 *pprev = dst->next;
1138 dst_free(dst);
1139 } else {
1140 pprev = &dst->next;
1141 ++more;
1145 spin_unlock_bh(&icmp6_dst_lock);
1147 return more;
1150 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1151 void *arg)
1153 struct dst_entry *dst, **pprev;
1155 spin_lock_bh(&icmp6_dst_lock);
1156 pprev = &icmp6_dst_gc_list;
1157 while ((dst = *pprev) != NULL) {
1158 struct rt6_info *rt = (struct rt6_info *) dst;
1159 if (func(rt, arg)) {
1160 *pprev = dst->next;
1161 dst_free(dst);
1162 } else {
1163 pprev = &dst->next;
1166 spin_unlock_bh(&icmp6_dst_lock);
1169 static int ip6_dst_gc(struct dst_ops *ops)
1171 unsigned long now = jiffies;
1172 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1173 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1174 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1175 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1176 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1177 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1178 int entries;
1180 entries = dst_entries_get_fast(ops);
1181 if (time_after(rt_last_gc + rt_min_interval, now) &&
1182 entries <= rt_max_size)
1183 goto out;
1185 net->ipv6.ip6_rt_gc_expire++;
1186 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1187 net->ipv6.ip6_rt_last_gc = now;
1188 entries = dst_entries_get_slow(ops);
1189 if (entries < ops->gc_thresh)
1190 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1191 out:
1192 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1193 return entries > rt_max_size;
/* Clean host part of a prefix. Not necessary in radix tree,
   but results in cleaner routing tables.

   Remove it only when all the things will work!
 */
1202 int ip6_dst_hoplimit(struct dst_entry *dst)
1204 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1205 if (hoplimit == 0) {
1206 struct net_device *dev = dst->dev;
1207 struct inet6_dev *idev;
1209 rcu_read_lock();
1210 idev = __in6_dev_get(dev);
1211 if (idev)
1212 hoplimit = idev->cnf.hop_limit;
1213 else
1214 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1215 rcu_read_unlock();
1217 return hoplimit;
1219 EXPORT_SYMBOL(ip6_dst_hoplimit);
1225 int ip6_route_add(struct fib6_config *cfg)
1227 int err;
1228 struct net *net = cfg->fc_nlinfo.nl_net;
1229 struct rt6_info *rt = NULL;
1230 struct net_device *dev = NULL;
1231 struct inet6_dev *idev = NULL;
1232 struct fib6_table *table;
1233 int addr_type;
1235 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1236 return -EINVAL;
1237 #ifndef CONFIG_IPV6_SUBTREES
1238 if (cfg->fc_src_len)
1239 return -EINVAL;
1240 #endif
1241 if (cfg->fc_ifindex) {
1242 err = -ENODEV;
1243 dev = dev_get_by_index(net, cfg->fc_ifindex);
1244 if (!dev)
1245 goto out;
1246 idev = in6_dev_get(dev);
1247 if (!idev)
1248 goto out;
1251 if (cfg->fc_metric == 0)
1252 cfg->fc_metric = IP6_RT_PRIO_USER;
1254 err = -ENOBUFS;
1255 if (cfg->fc_nlinfo.nlh &&
1256 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1257 table = fib6_get_table(net, cfg->fc_table);
1258 if (!table) {
1259 printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1260 table = fib6_new_table(net, cfg->fc_table);
1262 } else {
1263 table = fib6_new_table(net, cfg->fc_table);
1266 if (!table)
1267 goto out;
1269 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1271 if (!rt) {
1272 err = -ENOMEM;
1273 goto out;
1276 rt->dst.obsolete = -1;
1277 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1278 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1281 if (cfg->fc_protocol == RTPROT_UNSPEC)
1282 cfg->fc_protocol = RTPROT_BOOT;
1283 rt->rt6i_protocol = cfg->fc_protocol;
1285 addr_type = ipv6_addr_type(&cfg->fc_dst);
1287 if (addr_type & IPV6_ADDR_MULTICAST)
1288 rt->dst.input = ip6_mc_input;
1289 else if (cfg->fc_flags & RTF_LOCAL)
1290 rt->dst.input = ip6_input;
1291 else
1292 rt->dst.input = ip6_forward;
1294 rt->dst.output = ip6_output;
1296 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1297 rt->rt6i_dst.plen = cfg->fc_dst_len;
1298 if (rt->rt6i_dst.plen == 128)
1299 rt->dst.flags |= DST_HOST;
1301 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1302 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1303 if (!metrics) {
1304 err = -ENOMEM;
1305 goto out;
1307 dst_init_metrics(&rt->dst, metrics, 0);
1309 #ifdef CONFIG_IPV6_SUBTREES
1310 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1311 rt->rt6i_src.plen = cfg->fc_src_len;
1312 #endif
1314 rt->rt6i_metric = cfg->fc_metric;
        /* We cannot add true routes via loopback here;
           they would result in kernel looping. Promote them to reject routes.
         */
1319 if ((cfg->fc_flags & RTF_REJECT) ||
1320 (dev && (dev->flags & IFF_LOOPBACK) &&
1321 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1322 !(cfg->fc_flags & RTF_LOCAL))) {
1323 /* hold loopback dev/idev if we haven't done so. */
1324 if (dev != net->loopback_dev) {
1325 if (dev) {
1326 dev_put(dev);
1327 in6_dev_put(idev);
1329 dev = net->loopback_dev;
1330 dev_hold(dev);
1331 idev = in6_dev_get(dev);
1332 if (!idev) {
1333 err = -ENODEV;
1334 goto out;
1337 rt->dst.output = ip6_pkt_discard_out;
1338 rt->dst.input = ip6_pkt_discard;
1339 rt->dst.error = -ENETUNREACH;
1340 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1341 goto install_route;
1344 if (cfg->fc_flags & RTF_GATEWAY) {
1345 const struct in6_addr *gw_addr;
1346 int gwa_type;
1348 gw_addr = &cfg->fc_gateway;
1349 rt->rt6i_gateway = *gw_addr;
1350 gwa_type = ipv6_addr_type(gw_addr);
1352 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1353 struct rt6_info *grt;
                        /* IPv6 strictly inhibits using non-link-local
                           addresses as a nexthop address.
                           Otherwise, the router will not be able to send
                           redirects. That is very good, but in some (rare!)
                           circumstances (SIT, PtP, NBMA NOARP links) it is
                           handy to allow some exceptions. --ANK
                         */
1362 err = -EINVAL;
1363 if (!(gwa_type & IPV6_ADDR_UNICAST))
1364 goto out;
1366 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1368 err = -EHOSTUNREACH;
1369 if (!grt)
1370 goto out;
1371 if (dev) {
1372 if (dev != grt->rt6i_dev) {
1373 dst_release(&grt->dst);
1374 goto out;
1376 } else {
1377 dev = grt->rt6i_dev;
1378 idev = grt->rt6i_idev;
1379 dev_hold(dev);
1380 in6_dev_hold(grt->rt6i_idev);
1382 if (!(grt->rt6i_flags & RTF_GATEWAY))
1383 err = 0;
1384 dst_release(&grt->dst);
1386 if (err)
1387 goto out;
1389 err = -EINVAL;
1390 if (!dev || (dev->flags & IFF_LOOPBACK))
1391 goto out;
1394 err = -ENODEV;
1395 if (!dev)
1396 goto out;
1398 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1399 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1400 err = -EINVAL;
1401 goto out;
1403 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1404 rt->rt6i_prefsrc.plen = 128;
1405 } else
1406 rt->rt6i_prefsrc.plen = 0;
1408 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1409 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1410 if (IS_ERR(n)) {
1411 err = PTR_ERR(n);
1412 goto out;
1414 dst_set_neighbour(&rt->dst, n);
1417 rt->rt6i_flags = cfg->fc_flags;
1419 install_route:
1420 if (cfg->fc_mx) {
1421 struct nlattr *nla;
1422 int remaining;
1424 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1425 int type = nla_type(nla);
1427 if (type) {
1428 if (type > RTAX_MAX) {
1429 err = -EINVAL;
1430 goto out;
1433 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1438 rt->dst.dev = dev;
1439 rt->rt6i_idev = idev;
1440 rt->rt6i_table = table;
1442 cfg->fc_nlinfo.nl_net = dev_net(dev);
1444 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1446 out:
1447 if (dev)
1448 dev_put(dev);
1449 if (idev)
1450 in6_dev_put(idev);
1451 if (rt)
1452 dst_free(&rt->dst);
1453 return err;
1456 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1458 int err;
1459 struct fib6_table *table;
1460 struct net *net = dev_net(rt->rt6i_dev);
1462 if (rt == net->ipv6.ip6_null_entry)
1463 return -ENOENT;
1465 table = rt->rt6i_table;
1466 write_lock_bh(&table->tb6_lock);
1468 err = fib6_del(rt, info);
1469 dst_release(&rt->dst);
1471 write_unlock_bh(&table->tb6_lock);
1473 return err;
1476 int ip6_del_rt(struct rt6_info *rt)
1478 struct nl_info info = {
1479 .nl_net = dev_net(rt->rt6i_dev),
1481 return __ip6_del_rt(rt, &info);
1484 static int ip6_route_del(struct fib6_config *cfg)
1486 struct fib6_table *table;
1487 struct fib6_node *fn;
1488 struct rt6_info *rt;
1489 int err = -ESRCH;
1491 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1492 if (!table)
1493 return err;
1495 read_lock_bh(&table->tb6_lock);
1497 fn = fib6_locate(&table->tb6_root,
1498 &cfg->fc_dst, cfg->fc_dst_len,
1499 &cfg->fc_src, cfg->fc_src_len);
1501 if (fn) {
1502 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1503 if (cfg->fc_ifindex &&
1504 (!rt->rt6i_dev ||
1505 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1506 continue;
1507 if (cfg->fc_flags & RTF_GATEWAY &&
1508 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1509 continue;
1510 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1511 continue;
1512 dst_hold(&rt->dst);
1513 read_unlock_bh(&table->tb6_lock);
1515 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1518 read_unlock_bh(&table->tb6_lock);
1520 return err;
/*
 *      Handle redirects
 */
1526 struct ip6rd_flowi {
1527 struct flowi6 fl6;
1528 struct in6_addr gateway;
1531 static struct rt6_info *__ip6_route_redirect(struct net *net,
1532 struct fib6_table *table,
1533 struct flowi6 *fl6,
1534 int flags)
1536 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1537 struct rt6_info *rt;
1538 struct fib6_node *fn;
        /*
         * Get the "current" route for this destination and
         * check if the redirect has come from an appropriate router.
         *
         * RFC 2461 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */
1551 read_lock_bh(&table->tb6_lock);
1552 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1553 restart:
1554 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                /*
                 * Current route is on-link; redirect is always invalid.
                 *
                 * Seems, previous statement is not true. It could
                 * be a node which looks for us as on-link (f.e. proxy ndisc).
                 * But then the router serving it might decide that we should
                 * know the truth 8)8) --ANK (980726).
                 */
1563 if (rt6_check_expired(rt))
1564 continue;
1565 if (!(rt->rt6i_flags & RTF_GATEWAY))
1566 continue;
1567 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1568 continue;
1569 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1570 continue;
1571 break;
1574 if (!rt)
1575 rt = net->ipv6.ip6_null_entry;
1576 BACKTRACK(net, &fl6->saddr);
1577 out:
1578 dst_hold(&rt->dst);
1580 read_unlock_bh(&table->tb6_lock);
1582 return rt;
1585 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1586 const struct in6_addr *src,
1587 const struct in6_addr *gateway,
1588 struct net_device *dev)
1590 int flags = RT6_LOOKUP_F_HAS_SADDR;
1591 struct net *net = dev_net(dev);
1592 struct ip6rd_flowi rdfl = {
1593 .fl6 = {
1594 .flowi6_oif = dev->ifindex,
1595 .daddr = *dest,
1596 .saddr = *src,
1600 rdfl.gateway = *gateway;
1602 if (rt6_need_strict(dest))
1603 flags |= RT6_LOOKUP_F_IFACE;
1605 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1606 flags, __ip6_route_redirect);
1609 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1610 const struct in6_addr *saddr,
1611 struct neighbour *neigh, u8 *lladdr, int on_link)
1613 struct rt6_info *rt, *nrt = NULL;
1614 struct netevent_redirect netevent;
1615 struct net *net = dev_net(neigh->dev);
1617 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1619 if (rt == net->ipv6.ip6_null_entry) {
1620 if (net_ratelimit())
1621 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1622 "for redirect target\n");
1623 goto out;
        /*
         *      We have finally decided to accept it.
         */
1630 neigh_update(neigh, lladdr, NUD_STALE,
1631 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1632 NEIGH_UPDATE_F_OVERRIDE|
1633 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1634 NEIGH_UPDATE_F_ISROUTER))
        /*
         * Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
1642 dst_confirm(&rt->dst);
1644 /* Duplicate redirect: silently ignore. */
1645 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1646 goto out;
1648 nrt = ip6_rt_copy(rt, dest);
1649 if (!nrt)
1650 goto out;
1652 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1653 if (on_link)
1654 nrt->rt6i_flags &= ~RTF_GATEWAY;
1656 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1657 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1659 if (ip6_ins_rt(nrt))
1660 goto out;
1662 netevent.old = &rt->dst;
1663 netevent.new = &nrt->dst;
1664 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1666 if (rt->rt6i_flags & RTF_CACHE) {
1667 ip6_del_rt(rt);
1668 return;
1671 out:
1672 dst_release(&rt->dst);
/*
 *      Handle ICMP "packet too big" messages
 *      i.e. Path MTU discovery
 */
1680 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1681 struct net *net, u32 pmtu, int ifindex)
1683 struct rt6_info *rt, *nrt;
1684 int allfrag = 0;
1685 again:
1686 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1687 if (!rt)
1688 return;
1690 if (rt6_check_expired(rt)) {
1691 ip6_del_rt(rt);
1692 goto again;
1695 if (pmtu >= dst_mtu(&rt->dst))
1696 goto out;
1698 if (pmtu < IPV6_MIN_MTU) {
                /*
                 * According to RFC 2460, the PMTU is set to the IPv6 Minimum
                 * Link MTU (1280) and a fragment header should always be
                 * included after a node receives a Too Big message reporting
                 * a PMTU less than the IPv6 Minimum Link MTU.
                 */
1705 pmtu = IPV6_MIN_MTU;
1706 allfrag = 1;
        /* New mtu received -> path was valid.
           Packet Too Big messages are sent only in response to data packets,
           so this nexthop apparently is reachable. --ANK
         */
1713 dst_confirm(&rt->dst);
        /* Host route. If it is static, it would be better
           not to override it, but to add a new one, so that
           when the cache entry expires the old pmtu
           comes back automatically.
         */
1720 if (rt->rt6i_flags & RTF_CACHE) {
1721 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1722 if (allfrag) {
1723 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1724 features |= RTAX_FEATURE_ALLFRAG;
1725 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1727 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1728 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1729 goto out;
        /* Network route.
           Two cases are possible:
           1. It is a connected route. Action: COW.
           2. It is a gatewayed or NONEXTHOP route. Action: clone it.
         */
1737 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1738 nrt = rt6_alloc_cow(rt, daddr, saddr);
1739 else
1740 nrt = rt6_alloc_clone(rt, daddr);
1742 if (nrt) {
1743 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1744 if (allfrag) {
1745 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1746 features |= RTAX_FEATURE_ALLFRAG;
1747 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
                /* According to RFC 1981, a PMTU increase should not be
                 * probed for within 5 minutes; the recommended timer is
                 * 10 minutes. Here the route expiration time is set to
                 * ip6_rt_mtu_expires, which is 10 minutes. After that the
                 * decreased pmtu expires and PMTU increase detection
                 * happens automatically.
                 */
1756 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1757 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1759 ip6_ins_rt(nrt);
1761 out:
1762 dst_release(&rt->dst);
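/*
 * Illustrative example (not from the original file): if a Packet Too Big
 * message reports a PMTU of 1000 for an existing cached host route, the
 * code above clamps the stored MTU to IPV6_MIN_MTU (1280) and sets
 * RTAX_FEATURE_ALLFRAG, so later packets carry a fragment header as the
 * RFC 2460 comment requires; the entry then expires after
 * ip6_rt_mtu_expires (10 minutes by default).
 */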
1765 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1766 struct net_device *dev, u32 pmtu)
1768 struct net *net = dev_net(dev);
        /*
         * RFC 1981 states that a node "MUST reduce the size of the packets it
         * is sending along the path" that caused the Packet Too Big message.
         * Since it's not possible in the general case to determine which
         * interface was used to send the original packet, we update the MTU
         * on the interface that will be used to send future packets. We also
         * update the MTU on the interface that received the Packet Too Big in
         * case the original packet was forced out that interface with
         * SO_BINDTODEVICE or similar. This is the next best thing to the
         * correct behaviour, which would be to update the MTU on all
         * interfaces.
         */
1782 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1783 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
/*
 *      Misc support functions
 */
1790 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1791 const struct in6_addr *dest)
1793 struct net *net = dev_net(ort->rt6i_dev);
1794 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1795 ort->dst.dev, 0);
1797 if (rt) {
1798 rt->dst.input = ort->dst.input;
1799 rt->dst.output = ort->dst.output;
1800 rt->dst.flags |= DST_HOST;
1802 rt->rt6i_dst.addr = *dest;
1803 rt->rt6i_dst.plen = 128;
1804 dst_copy_metrics(&rt->dst, &ort->dst);
1805 rt->dst.error = ort->dst.error;
1806 rt->rt6i_idev = ort->rt6i_idev;
1807 if (rt->rt6i_idev)
1808 in6_dev_hold(rt->rt6i_idev);
1809 rt->dst.lastuse = jiffies;
1810 rt->rt6i_expires = 0;
1812 rt->rt6i_gateway = ort->rt6i_gateway;
1813 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1814 rt->rt6i_metric = 0;
1816 #ifdef CONFIG_IPV6_SUBTREES
1817 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1818 #endif
1819 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1820 rt->rt6i_table = ort->rt6i_table;
1822 return rt;
1825 #ifdef CONFIG_IPV6_ROUTE_INFO
1826 static struct rt6_info *rt6_get_route_info(struct net *net,
1827 const struct in6_addr *prefix, int prefixlen,
1828 const struct in6_addr *gwaddr, int ifindex)
1830 struct fib6_node *fn;
1831 struct rt6_info *rt = NULL;
1832 struct fib6_table *table;
1834 table = fib6_get_table(net, RT6_TABLE_INFO);
1835 if (!table)
1836 return NULL;
1838 write_lock_bh(&table->tb6_lock);
1839 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1840 if (!fn)
1841 goto out;
1843 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1844 if (rt->rt6i_dev->ifindex != ifindex)
1845 continue;
1846 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1847 continue;
1848 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1849 continue;
1850 dst_hold(&rt->dst);
1851 break;
1853 out:
1854 write_unlock_bh(&table->tb6_lock);
1855 return rt;
1858 static struct rt6_info *rt6_add_route_info(struct net *net,
1859 const struct in6_addr *prefix, int prefixlen,
1860 const struct in6_addr *gwaddr, int ifindex,
1861 unsigned pref)
1863 struct fib6_config cfg = {
1864 .fc_table = RT6_TABLE_INFO,
1865 .fc_metric = IP6_RT_PRIO_USER,
1866 .fc_ifindex = ifindex,
1867 .fc_dst_len = prefixlen,
1868 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1869 RTF_UP | RTF_PREF(pref),
1870 .fc_nlinfo.pid = 0,
1871 .fc_nlinfo.nlh = NULL,
1872 .fc_nlinfo.nl_net = net,
1875 cfg.fc_dst = *prefix;
1876 cfg.fc_gateway = *gwaddr;
1878 /* We should treat it as a default route if prefix length is 0. */
1879 if (!prefixlen)
1880 cfg.fc_flags |= RTF_DEFAULT;
1882 ip6_route_add(&cfg);
1884 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1886 #endif
1888 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1890 struct rt6_info *rt;
1891 struct fib6_table *table;
1893 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1894 if (!table)
1895 return NULL;
1897 write_lock_bh(&table->tb6_lock);
1898 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1899 if (dev == rt->rt6i_dev &&
1900 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1901 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1902 break;
1904 if (rt)
1905 dst_hold(&rt->dst);
1906 write_unlock_bh(&table->tb6_lock);
1907 return rt;
1910 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1911 struct net_device *dev,
1912 unsigned int pref)
1914 struct fib6_config cfg = {
1915 .fc_table = RT6_TABLE_DFLT,
1916 .fc_metric = IP6_RT_PRIO_USER,
1917 .fc_ifindex = dev->ifindex,
1918 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1919 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1920 .fc_nlinfo.pid = 0,
1921 .fc_nlinfo.nlh = NULL,
1922 .fc_nlinfo.nl_net = dev_net(dev),
1925 cfg.fc_gateway = *gwaddr;
1927 ip6_route_add(&cfg);
1929 return rt6_get_dflt_router(gwaddr, dev);
1932 void rt6_purge_dflt_routers(struct net *net)
1934 struct rt6_info *rt;
1935 struct fib6_table *table;
1937 /* NOTE: Keep consistent with rt6_get_dflt_router */
1938 table = fib6_get_table(net, RT6_TABLE_DFLT);
1939 if (!table)
1940 return;
1942 restart:
1943 read_lock_bh(&table->tb6_lock);
1944 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1945 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1946 dst_hold(&rt->dst);
1947 read_unlock_bh(&table->tb6_lock);
1948 ip6_del_rt(rt);
1949 goto restart;
1952 read_unlock_bh(&table->tb6_lock);
1955 static void rtmsg_to_fib6_config(struct net *net,
1956 struct in6_rtmsg *rtmsg,
1957 struct fib6_config *cfg)
1959 memset(cfg, 0, sizeof(*cfg));
1961 cfg->fc_table = RT6_TABLE_MAIN;
1962 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1963 cfg->fc_metric = rtmsg->rtmsg_metric;
1964 cfg->fc_expires = rtmsg->rtmsg_info;
1965 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1966 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1967 cfg->fc_flags = rtmsg->rtmsg_flags;
1969 cfg->fc_nlinfo.nl_net = net;
1971 cfg->fc_dst = rtmsg->rtmsg_dst;
1972 cfg->fc_src = rtmsg->rtmsg_src;
1973 cfg->fc_gateway = rtmsg->rtmsg_gateway;
1976 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1978 struct fib6_config cfg;
1979 struct in6_rtmsg rtmsg;
1980 int err;
1982 switch(cmd) {
1983 case SIOCADDRT: /* Add a route */
1984 case SIOCDELRT: /* Delete a route */
1985 if (!capable(CAP_NET_ADMIN))
1986 return -EPERM;
1987 err = copy_from_user(&rtmsg, arg,
1988 sizeof(struct in6_rtmsg));
1989 if (err)
1990 return -EFAULT;
1992 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1994 rtnl_lock();
1995 switch (cmd) {
1996 case SIOCADDRT:
1997 err = ip6_route_add(&cfg);
1998 break;
1999 case SIOCDELRT:
2000 err = ip6_route_del(&cfg);
2001 break;
2002 default:
2003 err = -EINVAL;
2005 rtnl_unlock();
2007 return err;
2010 return -EINVAL;
/*
 *      Drop the packet on the floor
 */
2017 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2019 int type;
2020 struct dst_entry *dst = skb_dst(skb);
2021 switch (ipstats_mib_noroutes) {
2022 case IPSTATS_MIB_INNOROUTES:
2023 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2024 if (type == IPV6_ADDR_ANY) {
2025 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2026 IPSTATS_MIB_INADDRERRORS);
2027 break;
2029 /* FALLTHROUGH */
2030 case IPSTATS_MIB_OUTNOROUTES:
2031 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2032 ipstats_mib_noroutes);
2033 break;
2035 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2036 kfree_skb(skb);
2037 return 0;
2040 static int ip6_pkt_discard(struct sk_buff *skb)
2042 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2045 static int ip6_pkt_discard_out(struct sk_buff *skb)
2047 skb->dev = skb_dst(skb)->dev;
2048 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2051 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2053 static int ip6_pkt_prohibit(struct sk_buff *skb)
2055 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2058 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2060 skb->dev = skb_dst(skb)->dev;
2061 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2064 #endif
/*
 *      Allocate a dst for local (unicast / anycast) address.
 */
2070 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2071 const struct in6_addr *addr,
2072 bool anycast)
2074 struct net *net = dev_net(idev->dev);
2075 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2076 net->loopback_dev, 0);
2077 struct neighbour *neigh;
2079 if (!rt) {
2080 if (net_ratelimit())
2081 pr_warning("IPv6: Maximum number of routes reached,"
2082 " consider increasing route/max_size.\n");
2083 return ERR_PTR(-ENOMEM);
2086 in6_dev_hold(idev);
2088 rt->dst.flags |= DST_HOST;
2089 rt->dst.input = ip6_input;
2090 rt->dst.output = ip6_output;
2091 rt->rt6i_idev = idev;
2092 rt->dst.obsolete = -1;
2094 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2095 if (anycast)
2096 rt->rt6i_flags |= RTF_ANYCAST;
2097 else
2098 rt->rt6i_flags |= RTF_LOCAL;
2099 neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
2100 if (IS_ERR(neigh)) {
2101 dst_free(&rt->dst);
2103 return ERR_CAST(neigh);
2105 dst_set_neighbour(&rt->dst, neigh);
2107 rt->rt6i_dst.addr = *addr;
2108 rt->rt6i_dst.plen = 128;
2109 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2111 atomic_set(&rt->dst.__refcnt, 1);
2113 return rt;
2116 int ip6_route_get_saddr(struct net *net,
2117 struct rt6_info *rt,
2118 const struct in6_addr *daddr,
2119 unsigned int prefs,
2120 struct in6_addr *saddr)
2122 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2123 int err = 0;
2124 if (rt->rt6i_prefsrc.plen)
2125 *saddr = rt->rt6i_prefsrc.addr;
2126 else
2127 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2128 daddr, prefs, saddr);
2129 return err;
2132 /* remove deleted ip from prefsrc entries */
2133 struct arg_dev_net_ip {
2134 struct net_device *dev;
2135 struct net *net;
2136 struct in6_addr *addr;
2139 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2141 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2142 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2143 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2145 if (((void *)rt->rt6i_dev == dev || !dev) &&
2146 rt != net->ipv6.ip6_null_entry &&
2147 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2148 /* remove prefsrc entry */
2149 rt->rt6i_prefsrc.plen = 0;
2151 return 0;
2154 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2156 struct net *net = dev_net(ifp->idev->dev);
2157 struct arg_dev_net_ip adni = {
2158 .dev = ifp->idev->dev,
2159 .net = net,
2160 .addr = &ifp->addr,
2162 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2165 struct arg_dev_net {
2166 struct net_device *dev;
2167 struct net *net;
2170 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2172 const struct arg_dev_net *adn = arg;
2173 const struct net_device *dev = adn->dev;
2175 if ((rt->rt6i_dev == dev || !dev) &&
2176 rt != adn->net->ipv6.ip6_null_entry) {
2177 RT6_TRACE("deleted by ifdown %p\n", rt);
2178 return -1;
2180 return 0;
2183 void rt6_ifdown(struct net *net, struct net_device *dev)
2185 struct arg_dev_net adn = {
2186 .dev = dev,
2187 .net = net,
2190 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2191 icmp6_clean_all(fib6_ifdown, &adn);
2194 struct rt6_mtu_change_arg
2196 struct net_device *dev;
2197 unsigned mtu;
2200 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2202 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2203 struct inet6_dev *idev;
        /* In IPv6 pmtu discovery is not optional,
           so that RTAX_MTU lock cannot disable it.
           We still use this lock to block changes
           caused by addrconf/ndisc.
         */
2211 idev = __in6_dev_get(arg->dev);
2212 if (!idev)
2213 return 0;
        /* For an administrative MTU increase, there is no way to discover
           an IPv6 PMTU increase, so the PMTU must be updated here.
           Since RFC 1981 doesn't cover administrative MTU increases,
           updating the PMTU on increase is a MUST (e.g. jumbo frames).

           If the new MTU is less than the route PMTU, the new MTU will be
           the lowest MTU in the path; update the route PMTU to reflect the
           decrease. If the new MTU is greater than the route PMTU, and the
           old MTU was the lowest MTU in the path, update the route PMTU to
           reflect the increase. In this case, if another node's MTU is now
           the lowest in the path, a Too Big message will trigger PMTU
           discovery again.
         */
2229 if (rt->rt6i_dev == arg->dev &&
2230 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2231 (dst_mtu(&rt->dst) >= arg->mtu ||
2232 (dst_mtu(&rt->dst) < arg->mtu &&
2233 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2234 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2236 return 0;
2239 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2241 struct rt6_mtu_change_arg arg = {
2242 .dev = dev,
2243 .mtu = mtu,
2246 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2249 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2250 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2251 [RTA_OIF] = { .type = NLA_U32 },
2252 [RTA_IIF] = { .type = NLA_U32 },
2253 [RTA_PRIORITY] = { .type = NLA_U32 },
2254 [RTA_METRICS] = { .type = NLA_NESTED },
2257 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2258 struct fib6_config *cfg)
2260 struct rtmsg *rtm;
2261 struct nlattr *tb[RTA_MAX+1];
2262 int err;
2264 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2265 if (err < 0)
2266 goto errout;
2268 err = -EINVAL;
2269 rtm = nlmsg_data(nlh);
2270 memset(cfg, 0, sizeof(*cfg));
2272 cfg->fc_table = rtm->rtm_table;
2273 cfg->fc_dst_len = rtm->rtm_dst_len;
2274 cfg->fc_src_len = rtm->rtm_src_len;
2275 cfg->fc_flags = RTF_UP;
2276 cfg->fc_protocol = rtm->rtm_protocol;
2278 if (rtm->rtm_type == RTN_UNREACHABLE)
2279 cfg->fc_flags |= RTF_REJECT;
2281 if (rtm->rtm_type == RTN_LOCAL)
2282 cfg->fc_flags |= RTF_LOCAL;
2284 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2285 cfg->fc_nlinfo.nlh = nlh;
2286 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2288 if (tb[RTA_GATEWAY]) {
2289 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2290 cfg->fc_flags |= RTF_GATEWAY;
2293 if (tb[RTA_DST]) {
2294 int plen = (rtm->rtm_dst_len + 7) >> 3;
2296 if (nla_len(tb[RTA_DST]) < plen)
2297 goto errout;
2299 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2302 if (tb[RTA_SRC]) {
2303 int plen = (rtm->rtm_src_len + 7) >> 3;
2305 if (nla_len(tb[RTA_SRC]) < plen)
2306 goto errout;
2308 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2311 if (tb[RTA_PREFSRC])
2312 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2314 if (tb[RTA_OIF])
2315 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2317 if (tb[RTA_PRIORITY])
2318 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2320 if (tb[RTA_METRICS]) {
2321 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2322 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2325 if (tb[RTA_TABLE])
2326 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2328 err = 0;
2329 errout:
2330 return err;
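/*
 * RTM_NEWROUTE/RTM_DELROUTE doit handlers: thin wrappers that convert
 * the request with rtm_to_fib6_config() and hand the result to
 * ip6_route_add() or ip6_route_del().
 */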
2333 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2335 struct fib6_config cfg;
2336 int err;
2338 err = rtm_to_fib6_config(skb, nlh, &cfg);
2339 if (err < 0)
2340 return err;
2342 return ip6_route_del(&cfg);
2345 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2347 struct fib6_config cfg;
2348 int err;
2350 err = rtm_to_fib6_config(skb, nlh, &cfg);
2351 if (err < 0)
2352 return err;
2354 return ip6_route_add(&cfg);
2357 static inline size_t rt6_nlmsg_size(void)
2359 return NLMSG_ALIGN(sizeof(struct rtmsg))
2360 + nla_total_size(16) /* RTA_SRC */
2361 + nla_total_size(16) /* RTA_DST */
2362 + nla_total_size(16) /* RTA_GATEWAY */
2363 + nla_total_size(16) /* RTA_PREFSRC */
2364 + nla_total_size(4) /* RTA_TABLE */
2365 + nla_total_size(4) /* RTA_IIF */
2366 + nla_total_size(4) /* RTA_OIF */
2367 + nla_total_size(4) /* RTA_PRIORITY */
2368 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2369 + nla_total_size(sizeof(struct rta_cacheinfo));
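/*
 * rt6_fill_node() encodes one rt6_info as an rtmsg plus RTA_* attributes.
 * rt6_nlmsg_size() above is the matching worst-case size estimate used
 * when allocating notification skbs; the two must stay in sync (see the
 * -EMSGSIZE WARN_ON in inet6_rt_notify()).
 */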
2372 static int rt6_fill_node(struct net *net,
2373 struct sk_buff *skb, struct rt6_info *rt,
2374 struct in6_addr *dst, struct in6_addr *src,
2375 int iif, int type, u32 pid, u32 seq,
2376 int prefix, int nowait, unsigned int flags)
2378 struct rtmsg *rtm;
2379 struct nlmsghdr *nlh;
2380 long expires;
2381 u32 table;
2382 struct neighbour *n;
2384 if (prefix) { /* user wants prefix routes only */
2385 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2386 /* success since this is not a prefix route */
2387 return 1;
2391 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2392 if (!nlh)
2393 return -EMSGSIZE;
2395 rtm = nlmsg_data(nlh);
2396 rtm->rtm_family = AF_INET6;
2397 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2398 rtm->rtm_src_len = rt->rt6i_src.plen;
2399 rtm->rtm_tos = 0;
2400 if (rt->rt6i_table)
2401 table = rt->rt6i_table->tb6_id;
2402 else
2403 table = RT6_TABLE_UNSPEC;
2404 rtm->rtm_table = table;
2405 NLA_PUT_U32(skb, RTA_TABLE, table);
2406 if (rt->rt6i_flags & RTF_REJECT)
2407 rtm->rtm_type = RTN_UNREACHABLE;
2408 else if (rt->rt6i_flags & RTF_LOCAL)
2409 rtm->rtm_type = RTN_LOCAL;
2410 else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
2411 rtm->rtm_type = RTN_LOCAL;
2412 else
2413 rtm->rtm_type = RTN_UNICAST;
2414 rtm->rtm_flags = 0;
2415 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2416 rtm->rtm_protocol = rt->rt6i_protocol;
2417 if (rt->rt6i_flags & RTF_DYNAMIC)
2418 rtm->rtm_protocol = RTPROT_REDIRECT;
2419 else if (rt->rt6i_flags & RTF_ADDRCONF)
2420 rtm->rtm_protocol = RTPROT_KERNEL;
2421 else if (rt->rt6i_flags & RTF_DEFAULT)
2422 rtm->rtm_protocol = RTPROT_RA;
2424 if (rt->rt6i_flags & RTF_CACHE)
2425 rtm->rtm_flags |= RTM_F_CLONED;
2427 if (dst) {
2428 NLA_PUT(skb, RTA_DST, 16, dst);
2429 rtm->rtm_dst_len = 128;
2430 } else if (rtm->rtm_dst_len)
2431 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2432 #ifdef CONFIG_IPV6_SUBTREES
2433 if (src) {
2434 NLA_PUT(skb, RTA_SRC, 16, src);
2435 rtm->rtm_src_len = 128;
2436 } else if (rtm->rtm_src_len)
2437 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2438 #endif
2439 if (iif) {
2440 #ifdef CONFIG_IPV6_MROUTE
2441 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2442 int err = ip6mr_get_route(net, skb, rtm, nowait);
2443 if (err <= 0) {
2444 if (!nowait) {
2445 if (err == 0)
2446 return 0;
2447 goto nla_put_failure;
2448 } else {
2449 if (err == -EMSGSIZE)
2450 goto nla_put_failure;
2453 } else
2454 #endif
2455 NLA_PUT_U32(skb, RTA_IIF, iif);
2456 } else if (dst) {
2457 struct in6_addr saddr_buf;
2458 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2459 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2462 if (rt->rt6i_prefsrc.plen) {
2463 struct in6_addr saddr_buf;
2464 saddr_buf = rt->rt6i_prefsrc.addr;
2465 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2468 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2469 goto nla_put_failure;
2471 rcu_read_lock();
2472 n = dst_get_neighbour_noref(&rt->dst);
2473 if (n)
2474 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2475 rcu_read_unlock();
2477 if (rt->dst.dev)
2478 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2480 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2482 if (!(rt->rt6i_flags & RTF_EXPIRES))
2483 expires = 0;
2484 else if (rt->rt6i_expires - jiffies < INT_MAX)
2485 expires = rt->rt6i_expires - jiffies;
2486 else
2487 expires = INT_MAX;
2489 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2490 expires, rt->dst.error) < 0)
2491 goto nla_put_failure;
2493 return nlmsg_end(skb, nlh);
2495 nla_put_failure:
2496 nlmsg_cancel(skb, nlh);
2497 return -EMSGSIZE;
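/*
 * Dump callback for RTM_GETROUTE with NLM_F_DUMP: called once per route
 * by the fib6 walker.  If the request carries RTM_F_PREFIX, only
 * RTF_PREFIX_RT routes are reported; rt6_fill_node() returns 1 for the
 * others so they are skipped without aborting the dump.
 */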
2500 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2502 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2503 int prefix;
2505 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2506 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2507 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2508 } else
2509 prefix = 0;
2511 return rt6_fill_node(arg->net,
2512 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2513 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2514 prefix, 0, NLM_F_MULTI);
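/*
 * RTM_GETROUTE doit handler: builds a flowi6 from the RTA_SRC/RTA_DST/
 * RTA_IIF/RTA_OIF attributes, resolves it with ip6_route_output() and
 * unicasts a single RTM_NEWROUTE reply back to the requester.
 */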
2517 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2519 struct net *net = sock_net(in_skb->sk);
2520 struct nlattr *tb[RTA_MAX+1];
2521 struct rt6_info *rt;
2522 struct sk_buff *skb;
2523 struct rtmsg *rtm;
2524 struct flowi6 fl6;
2525 int err, iif = 0;
2527 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2528 if (err < 0)
2529 goto errout;
2531 err = -EINVAL;
2532 memset(&fl6, 0, sizeof(fl6));
2534 if (tb[RTA_SRC]) {
2535 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2536 goto errout;
2538 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2541 if (tb[RTA_DST]) {
2542 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2543 goto errout;
2545 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2548 if (tb[RTA_IIF])
2549 iif = nla_get_u32(tb[RTA_IIF]);
2551 if (tb[RTA_OIF])
2552 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2554 if (iif) {
2555 struct net_device *dev;
2556 dev = __dev_get_by_index(net, iif);
2557 if (!dev) {
2558 err = -ENODEV;
2559 goto errout;
2563 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2564 if (!skb) {
2565 err = -ENOBUFS;
2566 goto errout;
2569 /* Reserve room for dummy headers; this skb can pass
2570    through a good chunk of the routing engine.
2572 skb_reset_mac_header(skb);
2573 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2575 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2576 skb_dst_set(skb, &rt->dst);
2578 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2579 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2580 nlh->nlmsg_seq, 0, 0, 0);
2581 if (err < 0) {
2582 kfree_skb(skb);
2583 goto errout;
2586 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2587 errout:
2588 return err;
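/*
 * inet6_rt_notify() multicasts the given rtnetlink event for @rt to
 * RTNLGRP_IPV6_ROUTE listeners; allocation or fill failures are
 * reported through rtnl_set_sk_err().
 */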
2591 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2593 struct sk_buff *skb;
2594 struct net *net = info->nl_net;
2595 u32 seq;
2596 int err;
2598 err = -ENOBUFS;
2599 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2601 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2602 if (!skb)
2603 goto errout;
2605 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2606 event, info->pid, seq, 0, 0, 0);
2607 if (err < 0) {
2608 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2609 WARN_ON(err == -EMSGSIZE);
2610 kfree_skb(skb);
2611 goto errout;
2613 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2614 info->nlh, gfp_any());
2615 return;
2616 errout:
2617 if (err < 0)
2618 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
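/*
 * Netdevice notifier: when the per-netns loopback device registers,
 * point ip6_null_entry (and, with multiple tables, the prohibit and
 * blackhole entries) at it so these template routes always have a
 * valid device and inet6_dev.
 */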
2621 static int ip6_route_dev_notify(struct notifier_block *this,
2622 unsigned long event, void *data)
2624 struct net_device *dev = (struct net_device *)data;
2625 struct net *net = dev_net(dev);
2627 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2628 net->ipv6.ip6_null_entry->dst.dev = dev;
2629 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2630 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2631 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2632 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2633 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2634 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2635 #endif
2638 return NOTIFY_OK;
2642 * /proc
2645 #ifdef CONFIG_PROC_FS
2647 struct rt6_proc_arg
2649 char *buffer;
2650 int offset;
2651 int length;
2652 int skip;
2653 int len;
2656 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2658 struct seq_file *m = p_arg;
2659 struct neighbour *n;
2661 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2663 #ifdef CONFIG_IPV6_SUBTREES
2664 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2665 #else
2666 seq_puts(m, "00000000000000000000000000000000 00 ");
2667 #endif
2668 rcu_read_lock();
2669 n = dst_get_neighbour_noref(&rt->dst);
2670 if (n) {
2671 seq_printf(m, "%pi6", n->primary_key);
2672 } else {
2673 seq_puts(m, "00000000000000000000000000000000");
2675 rcu_read_unlock();
2676 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2677 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2678 rt->dst.__use, rt->rt6i_flags,
2679 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2680 return 0;
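/*
 * Each /proc/net/ipv6_route line emitted by rt6_info_route() above is:
 *   dst-addr dst-plen src-addr src-plen next-hop metric refcnt use
 *   flags devname
 * with addresses printed as raw hex (%pi6), prefix lengths as %02x,
 * the metric/refcnt/use/flags as %08x, and the device name last.
 */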
2683 static int ipv6_route_show(struct seq_file *m, void *v)
2685 struct net *net = (struct net *)m->private;
2686 fib6_clean_all(net, rt6_info_route, 0, m);
2687 return 0;
2690 static int ipv6_route_open(struct inode *inode, struct file *file)
2692 return single_open_net(inode, file, ipv6_route_show);
2695 static const struct file_operations ipv6_route_proc_fops = {
2696 .owner = THIS_MODULE,
2697 .open = ipv6_route_open,
2698 .read = seq_read,
2699 .llseek = seq_lseek,
2700 .release = single_release_net,
2703 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2705 struct net *net = (struct net *)seq->private;
2706 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2707 net->ipv6.rt6_stats->fib_nodes,
2708 net->ipv6.rt6_stats->fib_route_nodes,
2709 net->ipv6.rt6_stats->fib_rt_alloc,
2710 net->ipv6.rt6_stats->fib_rt_entries,
2711 net->ipv6.rt6_stats->fib_rt_cache,
2712 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2713 net->ipv6.rt6_stats->fib_discarded_routes);
2715 return 0;
2718 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2720 return single_open_net(inode, file, rt6_stats_seq_show);
2723 static const struct file_operations rt6_stats_seq_fops = {
2724 .owner = THIS_MODULE,
2725 .open = rt6_stats_seq_open,
2726 .read = seq_read,
2727 .llseek = seq_lseek,
2728 .release = single_release_net,
2730 #endif /* CONFIG_PROC_FS */
2732 #ifdef CONFIG_SYSCTL
2734 static
2735 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2736 void __user *buffer, size_t *lenp, loff_t *ppos)
2738 struct net *net;
2739 int delay;
2740 if (!write)
2741 return -EINVAL;
2743 net = (struct net *)ctl->extra1;
2744 delay = net->ipv6.sysctl.flush_delay;
2745 proc_dointvec(ctl, write, buffer, lenp, ppos);
2746 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2747 return 0;
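/*
 * Per-netns sysctls exposed under net.ipv6.route.  The write-only
 * "flush" entry goes through ipv6_sysctl_rtcache_flush() above, so
 * (assuming a standard procfs mount) something like
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * triggers a fib6_run_gc() pass; the remaining entries tune the dst
 * garbage collector and PMTU expiry behaviour.
 */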
2750 ctl_table ipv6_route_table_template[] = {
2752 .procname = "flush",
2753 .data = &init_net.ipv6.sysctl.flush_delay,
2754 .maxlen = sizeof(int),
2755 .mode = 0200,
2756 .proc_handler = ipv6_sysctl_rtcache_flush
2759 .procname = "gc_thresh",
2760 .data = &ip6_dst_ops_template.gc_thresh,
2761 .maxlen = sizeof(int),
2762 .mode = 0644,
2763 .proc_handler = proc_dointvec,
2766 .procname = "max_size",
2767 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2768 .maxlen = sizeof(int),
2769 .mode = 0644,
2770 .proc_handler = proc_dointvec,
2773 .procname = "gc_min_interval",
2774 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2775 .maxlen = sizeof(int),
2776 .mode = 0644,
2777 .proc_handler = proc_dointvec_jiffies,
2780 .procname = "gc_timeout",
2781 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2782 .maxlen = sizeof(int),
2783 .mode = 0644,
2784 .proc_handler = proc_dointvec_jiffies,
2787 .procname = "gc_interval",
2788 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2789 .maxlen = sizeof(int),
2790 .mode = 0644,
2791 .proc_handler = proc_dointvec_jiffies,
2794 .procname = "gc_elasticity",
2795 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2796 .maxlen = sizeof(int),
2797 .mode = 0644,
2798 .proc_handler = proc_dointvec,
2801 .procname = "mtu_expires",
2802 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2803 .maxlen = sizeof(int),
2804 .mode = 0644,
2805 .proc_handler = proc_dointvec_jiffies,
2808 .procname = "min_adv_mss",
2809 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2810 .maxlen = sizeof(int),
2811 .mode = 0644,
2812 .proc_handler = proc_dointvec,
2815 .procname = "gc_min_interval_ms",
2816 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2817 .maxlen = sizeof(int),
2818 .mode = 0644,
2819 .proc_handler = proc_dointvec_ms_jiffies,
2824 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2826 struct ctl_table *table;
2828 table = kmemdup(ipv6_route_table_template,
2829 sizeof(ipv6_route_table_template),
2830 GFP_KERNEL);
2832 if (table) {
2833 table[0].data = &net->ipv6.sysctl.flush_delay;
2834 table[0].extra1 = net;
2835 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2836 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2837 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2838 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2839 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2840 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2841 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2842 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2843 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2846 return table;
2848 #endif
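/*
 * Per-namespace setup: copy the dst_ops template, allocate the
 * ip6_null_entry (plus the prohibit and blackhole entries when multiple
 * tables are configured), seed the sysctl defaults and create the
 * /proc/net files.  Errors unwind in reverse order through the labels
 * at the bottom of the function.
 */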
2850 static int __net_init ip6_route_net_init(struct net *net)
2852 int ret = -ENOMEM;
2854 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2855 sizeof(net->ipv6.ip6_dst_ops));
2857 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2858 goto out_ip6_dst_ops;
2860 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2861 sizeof(*net->ipv6.ip6_null_entry),
2862 GFP_KERNEL);
2863 if (!net->ipv6.ip6_null_entry)
2864 goto out_ip6_dst_entries;
2865 net->ipv6.ip6_null_entry->dst.path =
2866 (struct dst_entry *)net->ipv6.ip6_null_entry;
2867 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2868 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2869 ip6_template_metrics, true);
2871 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2872 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2873 sizeof(*net->ipv6.ip6_prohibit_entry),
2874 GFP_KERNEL);
2875 if (!net->ipv6.ip6_prohibit_entry)
2876 goto out_ip6_null_entry;
2877 net->ipv6.ip6_prohibit_entry->dst.path =
2878 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2879 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2880 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2881 ip6_template_metrics, true);
2883 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2884 sizeof(*net->ipv6.ip6_blk_hole_entry),
2885 GFP_KERNEL);
2886 if (!net->ipv6.ip6_blk_hole_entry)
2887 goto out_ip6_prohibit_entry;
2888 net->ipv6.ip6_blk_hole_entry->dst.path =
2889 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2890 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2891 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2892 ip6_template_metrics, true);
2893 #endif
2895 net->ipv6.sysctl.flush_delay = 0;
2896 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2897 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2898 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2899 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2900 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2901 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2902 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2904 #ifdef CONFIG_PROC_FS
2905 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2906 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2907 #endif
2908 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2910 ret = 0;
2911 out:
2912 return ret;
2914 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2915 out_ip6_prohibit_entry:
2916 kfree(net->ipv6.ip6_prohibit_entry);
2917 out_ip6_null_entry:
2918 kfree(net->ipv6.ip6_null_entry);
2919 #endif
2920 out_ip6_dst_entries:
2921 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2922 out_ip6_dst_ops:
2923 goto out;
2926 static void __net_exit ip6_route_net_exit(struct net *net)
2928 #ifdef CONFIG_PROC_FS
2929 proc_net_remove(net, "ipv6_route");
2930 proc_net_remove(net, "rt6_stats");
2931 #endif
2932 kfree(net->ipv6.ip6_null_entry);
2933 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2934 kfree(net->ipv6.ip6_prohibit_entry);
2935 kfree(net->ipv6.ip6_blk_hole_entry);
2936 #endif
2937 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
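/*
 * Module init/exit.  ip6_route_init() creates the rt6_info slab cache,
 * registers the pernet operations above, wires init_net's template
 * routes to the loopback device, and only then brings up the fib6 core,
 * xfrm6, fib6 rules, the RTM_* rtnetlink handlers and the netdevice
 * notifier; ip6_route_cleanup() tears all of this down in reverse.
 */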
2940 static struct pernet_operations ip6_route_net_ops = {
2941 .init = ip6_route_net_init,
2942 .exit = ip6_route_net_exit,
2945 static struct notifier_block ip6_route_dev_notifier = {
2946 .notifier_call = ip6_route_dev_notify,
2947 .priority = 0,
2950 int __init ip6_route_init(void)
2952 int ret;
2954 ret = -ENOMEM;
2955 ip6_dst_ops_template.kmem_cachep =
2956 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2957 SLAB_HWCACHE_ALIGN, NULL);
2958 if (!ip6_dst_ops_template.kmem_cachep)
2959 goto out;
2961 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2962 if (ret)
2963 goto out_kmem_cache;
2965 ret = register_pernet_subsys(&ip6_route_net_ops);
2966 if (ret)
2967 goto out_dst_entries;
2969 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2971 /* The loopback device is registered before this portion of code runs,
2972  * so the loopback reference in rt6_info is not taken automatically;
2973  * do it manually for init_net */
2974 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2975 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2977 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2978 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2979 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2980 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2981 #endif
2982 ret = fib6_init();
2983 if (ret)
2984 goto out_register_subsys;
2986 ret = xfrm6_init();
2987 if (ret)
2988 goto out_fib6_init;
2990 ret = fib6_rules_init();
2991 if (ret)
2992 goto xfrm6_init;
2994 ret = -ENOBUFS;
2995 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2996 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2997 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2998 goto fib6_rules_init;
3000 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3001 if (ret)
3002 goto fib6_rules_init;
3004 out:
3005 return ret;
3007 fib6_rules_init:
3008 fib6_rules_cleanup();
3009 xfrm6_init:
3010 xfrm6_fini();
3011 out_fib6_init:
3012 fib6_gc_cleanup();
3013 out_register_subsys:
3014 unregister_pernet_subsys(&ip6_route_net_ops);
3015 out_dst_entries:
3016 dst_entries_destroy(&ip6_dst_blackhole_ops);
3017 out_kmem_cache:
3018 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3019 goto out;
3022 void ip6_route_cleanup(void)
3024 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3025 fib6_rules_cleanup();
3026 xfrm6_fini();
3027 fib6_gc_cleanup();
3028 unregister_pernet_subsys(&ip6_route_net_ops);
3029 dst_entries_destroy(&ip6_dst_blackhole_ops);
3030 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);