/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
				    const struct in6_addr *dest);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
#endif
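
/*
 * Copy-on-write the dst metrics of a host route into its inet_peer so
 * that later per-destination metric writes do not touch the shared,
 * read-only template metrics.  Returns the peer's metrics array, or
 * NULL when the route is not a host route or the metrics stay read-only.
 */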
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!(rt->dst.flags & DST_HOST))
		return NULL;

	if (!rt->rt6i_peer)
		rt6_bind_peer(rt, 1);

	peer = rt->rt6i_peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		}
	}
	return p;
}
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ip6_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 255,
};

static struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
/* allocate dst with ip6_dst_ops */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
					     struct net_device *dev,
					     int flags)
{
	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);

	if (rt != NULL)
		memset(&rt->rt6i_table, 0,
		       sizeof(*rt) - sizeof(struct dst_entry));

	return rt;
}
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct inet_peer *peer = rt->rt6i_peer;

	if (!(rt->dst.flags & DST_HOST))
		dst_destroy_metrics_generic(dst);

	if (idev != NULL) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	if (peer) {
		rt->rt6i_peer = NULL;
		inet_putpeer(peer);
	}
}
static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

static u32 rt6_peer_genid(void)
{
	return atomic_read(&__rt6_peer_genid);
}

void rt6_bind_peer(struct rt6_info *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt6i_peer_genid = rt6_peer_genid();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
		struct inet6_dev *loopback_idev =
			in6_dev_get(loopback_dev);
		if (loopback_idev != NULL) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static __inline__ int rt6_check_expired(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & RTF_EXPIRES) &&
		time_after(jiffies, rt->rt6i_expires);
}

static inline int rt6_need_strict(const struct in6_addr *daddr)
{
	return ipv6_addr_type(daddr) &
		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}

/*
 *	Route lookup. Any table->tb6_lock is implied.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->rt6i_dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (sprt->rt6i_idev == NULL ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE && oif)
						continue;
					if (local && (!oif ||
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;
	read_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		read_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
422 return 2;
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425 return 1;
426 return 0;
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
432 int m;
434 rcu_read_lock();
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
438 m = 1;
439 else if (neigh) {
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
442 m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
445 m = 0;
446 #endif
447 else
448 m = 1;
449 read_unlock_bh(&neigh->lock);
450 } else
451 m = 0;
452 rcu_read_unlock();
453 return m;
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457 int strict)
459 int m, n;
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
463 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469 return -1;
470 return m;
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
476 int m;
478 if (rt6_check_expired(rt))
479 goto out;
481 m = rt6_score_route(rt, oif, strict);
482 if (m < 0)
483 goto out;
485 if (m > *mpri) {
486 if (strict & RT6_LOOKUP_F_REACHABLE)
487 rt6_probe(match);
488 *mpri = m;
489 match = rt;
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491 rt6_probe(rt);
494 out:
495 return match;
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
503 int mpri = -1;
505 match = NULL;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
513 return match;
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
519 struct net *net;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
524 rt0 = fn->rr_ptr;
525 if (!rt0)
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
530 if (!match &&
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536 next = fn->leaf;
538 if (next != rt0)
539 fn->rr_ptr = next;
542 RT6_TRACE("%s() => %p\n",
543 __func__, match);
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
556 unsigned int pref;
557 unsigned long lifetime;
558 struct rt6_info *rt;
560 if (len < sizeof(struct route_info)) {
561 return -EINVAL;
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
566 return -EINVAL;
567 } else if (rinfo->prefix_len > 128) {
568 return -EINVAL;
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
571 return -EINVAL;
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
575 return -EINVAL;
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
581 return -EINVAL;
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
587 else {
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
591 rinfo->prefix_len);
592 prefix = &prefix_buf;
595 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
596 dev->ifindex);
598 if (rt && !lifetime) {
599 ip6_del_rt(rt);
600 rt = NULL;
603 if (!rt && lifetime)
604 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
605 pref);
606 else if (rt)
607 rt->rt6i_flags = RTF_ROUTEINFO |
608 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
610 if (rt) {
611 if (!addrconf_finite_timeout(lifetime)) {
612 rt->rt6i_flags &= ~RTF_EXPIRES;
613 } else {
614 rt->rt6i_expires = jiffies + HZ * lifetime;
615 rt->rt6i_flags |= RTF_EXPIRES;
617 dst_release(&rt->dst);
619 return 0;
621 #endif
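
/*
 * BACKTRACK() is used after a lookup that ended up on the null entry:
 * walk back up the fib6 tree (dropping into a source-routing subtree
 * where one exists) until a node carrying route info is found, then
 * jump back to the caller's restart label; bail out at the tree root.
 */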
623 #define BACKTRACK(__net, saddr) \
624 do { \
625 if (rt == __net->ipv6.ip6_null_entry) { \
626 struct fib6_node *pn; \
627 while (1) { \
628 if (fn->fn_flags & RTN_TL_ROOT) \
629 goto out; \
630 pn = fn->parent; \
631 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
632 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
633 else \
634 fn = pn; \
635 if (fn->fn_flags & RTN_RTINFO) \
636 goto restart; \
639 } while(0)
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642 struct fib6_table *table,
643 struct flowi6 *fl6, int flags)
645 struct fib6_node *fn;
646 struct rt6_info *rt;
648 read_lock_bh(&table->tb6_lock);
649 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651 rt = fn->leaf;
652 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653 BACKTRACK(net, &fl6->saddr);
654 out:
655 dst_use(&rt->dst, jiffies);
656 read_unlock_bh(&table->tb6_lock);
657 return rt;
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662 const struct in6_addr *saddr, int oif, int strict)
664 struct flowi6 fl6 = {
665 .flowi6_oif = oif,
666 .daddr = *daddr,
668 struct dst_entry *dst;
669 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
671 if (saddr) {
672 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673 flags |= RT6_LOOKUP_F_HAS_SADDR;
676 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677 if (dst->error == 0)
678 return (struct rt6_info *) dst;
680 dst_release(dst);
682 return NULL;
685 EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes a new route entry; if the addition fails for any reason the
   route is freed.  In any case, if the caller does not hold a reference,
   it may be destroyed.
 */
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
695 int err;
696 struct fib6_table *table;
698 table = rt->rt6i_table;
699 write_lock_bh(&table->tb6_lock);
700 err = fib6_add(&table->tb6_root, rt, info);
701 write_unlock_bh(&table->tb6_lock);
703 return err;
706 int ip6_ins_rt(struct rt6_info *rt)
708 struct nl_info info = {
709 .nl_net = dev_net(rt->rt6i_dev),
711 return __ip6_ins_rt(rt, &info);
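
/*
 * Clone a fib entry into a host route (RTF_CACHE) bound to @daddr and
 * resolve a neighbour entry for it.  If the neighbour table overflows,
 * garbage-collect the IPv6 dst cache once with relaxed limits and retry
 * before giving up.
 */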
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715 const struct in6_addr *daddr,
716 const struct in6_addr *saddr)
718 struct rt6_info *rt;
721 * Clone the route.
724 rt = ip6_rt_copy(ort, daddr);
726 if (rt) {
727 struct neighbour *neigh;
728 int attempts = !in_softirq();
730 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731 if (rt->rt6i_dst.plen != 128 &&
732 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733 rt->rt6i_flags |= RTF_ANYCAST;
734 rt->rt6i_gateway = *daddr;
737 rt->rt6i_flags |= RTF_CACHE;
739 #ifdef CONFIG_IPV6_SUBTREES
740 if (rt->rt6i_src.plen && saddr) {
741 rt->rt6i_src.addr = *saddr;
742 rt->rt6i_src.plen = 128;
744 #endif
746 retry:
747 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
748 if (IS_ERR(neigh)) {
749 struct net *net = dev_net(rt->rt6i_dev);
750 int saved_rt_min_interval =
751 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752 int saved_rt_elasticity =
753 net->ipv6.sysctl.ip6_rt_gc_elasticity;
755 if (attempts-- > 0) {
756 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
759 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761 net->ipv6.sysctl.ip6_rt_gc_elasticity =
762 saved_rt_elasticity;
763 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764 saved_rt_min_interval;
765 goto retry;
768 if (net_ratelimit())
769 printk(KERN_WARNING
770 "ipv6: Neighbour table overflow.\n");
771 dst_free(&rt->dst);
772 return NULL;
774 dst_set_neighbour(&rt->dst, neigh);
778 return rt;
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782 const struct in6_addr *daddr)
784 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
786 if (rt) {
787 rt->rt6i_flags |= RTF_CACHE;
788 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
790 return rt;
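
/*
 * Main route resolution for both the input and output paths: pick the
 * best entry with rt6_select() and, unless it is already a cached host
 * route, COW/clone it into one, insert it into the table, and retry a
 * few times if someone raced us while the table lock was dropped.
 */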
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794 struct flowi6 *fl6, int flags)
796 struct fib6_node *fn;
797 struct rt6_info *rt, *nrt;
798 int strict = 0;
799 int attempts = 3;
800 int err;
801 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
803 strict |= flags & RT6_LOOKUP_F_IFACE;
805 relookup:
806 read_lock_bh(&table->tb6_lock);
808 restart_2:
809 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
811 restart:
812 rt = rt6_select(fn, oif, strict | reachable);
814 BACKTRACK(net, &fl6->saddr);
815 if (rt == net->ipv6.ip6_null_entry ||
816 rt->rt6i_flags & RTF_CACHE)
817 goto out;
819 dst_hold(&rt->dst);
820 read_unlock_bh(&table->tb6_lock);
822 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
823 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
824 else if (!(rt->dst.flags & DST_HOST))
825 nrt = rt6_alloc_clone(rt, &fl6->daddr);
826 else
827 goto out2;
829 dst_release(&rt->dst);
830 rt = nrt ? : net->ipv6.ip6_null_entry;
832 dst_hold(&rt->dst);
833 if (nrt) {
834 err = ip6_ins_rt(nrt);
835 if (!err)
836 goto out2;
839 if (--attempts <= 0)
840 goto out2;
	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
846 dst_release(&rt->dst);
847 goto relookup;
849 out:
850 if (reachable) {
851 reachable = 0;
852 goto restart_2;
854 dst_hold(&rt->dst);
855 read_unlock_bh(&table->tb6_lock);
856 out2:
857 rt->dst.lastuse = jiffies;
858 rt->dst.__use++;
860 return rt;
863 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
864 struct flowi6 *fl6, int flags)
866 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
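
/*
 * Attach a routing decision to an incoming skb via the policy rules.
 * Strict interface matching is requested for link-local, multicast and
 * loopback destinations, except on PIM register devices.
 */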
869 void ip6_route_input(struct sk_buff *skb)
871 const struct ipv6hdr *iph = ipv6_hdr(skb);
872 struct net *net = dev_net(skb->dev);
873 int flags = RT6_LOOKUP_F_HAS_SADDR;
874 struct flowi6 fl6 = {
875 .flowi6_iif = skb->dev->ifindex,
876 .daddr = iph->daddr,
877 .saddr = iph->saddr,
878 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
879 .flowi6_mark = skb->mark,
880 .flowi6_proto = iph->nexthdr,
883 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
884 flags |= RT6_LOOKUP_F_IFACE;
886 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
889 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
890 struct flowi6 *fl6, int flags)
892 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
895 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
896 struct flowi6 *fl6)
898 int flags = 0;
900 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
901 flags |= RT6_LOOKUP_F_IFACE;
903 if (!ipv6_addr_any(&fl6->saddr))
904 flags |= RT6_LOOKUP_F_HAS_SADDR;
905 else if (sk)
906 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
908 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
911 EXPORT_SYMBOL(ip6_route_output);
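
/*
 * Build a copy of @dst_orig that silently discards all traffic (both
 * input and output are dst_discard) while keeping its addresses and
 * metrics; the reference on the original dst is dropped.  Used to
 * neutralize a route temporarily, e.g. while xfrm resolution is pending.
 */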
913 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
915 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
916 struct dst_entry *new = NULL;
918 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
919 if (rt) {
920 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
922 new = &rt->dst;
924 new->__use = 1;
925 new->input = dst_discard;
926 new->output = dst_discard;
928 if (dst_metrics_read_only(&ort->dst))
929 new->_metrics = ort->dst._metrics;
930 else
931 dst_copy_metrics(new, &ort->dst);
932 rt->rt6i_idev = ort->rt6i_idev;
933 if (rt->rt6i_idev)
934 in6_dev_hold(rt->rt6i_idev);
935 rt->rt6i_expires = 0;
937 rt->rt6i_gateway = ort->rt6i_gateway;
938 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
939 rt->rt6i_metric = 0;
941 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
942 #ifdef CONFIG_IPV6_SUBTREES
943 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
944 #endif
946 dst_free(new);
949 dst_release(dst_orig);
950 return new ? new : ERR_PTR(-ENOMEM);
954 * Destination cache support functions
957 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
959 struct rt6_info *rt;
961 rt = (struct rt6_info *) dst;
963 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
964 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
965 if (!rt->rt6i_peer)
966 rt6_bind_peer(rt, 0);
967 rt->rt6i_peer_genid = rt6_peer_genid();
969 return dst;
971 return NULL;
974 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
976 struct rt6_info *rt = (struct rt6_info *) dst;
978 if (rt) {
979 if (rt->rt6i_flags & RTF_CACHE) {
980 if (rt6_check_expired(rt)) {
981 ip6_del_rt(rt);
982 dst = NULL;
984 } else {
985 dst_release(dst);
986 dst = NULL;
989 return dst;
992 static void ip6_link_failure(struct sk_buff *skb)
994 struct rt6_info *rt;
996 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
998 rt = (struct rt6_info *) skb_dst(skb);
999 if (rt) {
1000 if (rt->rt6i_flags&RTF_CACHE) {
1001 dst_set_expires(&rt->dst, 0);
1002 rt->rt6i_flags |= RTF_EXPIRES;
1003 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1004 rt->rt6i_node->fn_sernum = -1;
1008 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1010 struct rt6_info *rt6 = (struct rt6_info*)dst;
1012 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1013 rt6->rt6i_flags |= RTF_MODIFIED;
1014 if (mtu < IPV6_MIN_MTU) {
1015 u32 features = dst_metric(dst, RTAX_FEATURES);
1016 mtu = IPV6_MIN_MTU;
1017 features |= RTAX_FEATURE_ALLFRAG;
1018 dst_metric_set(dst, RTAX_FEATURES, features);
1020 dst_metric_set(dst, RTAX_MTU, mtu);
1024 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1026 struct net_device *dev = dst->dev;
1027 unsigned int mtu = dst_mtu(dst);
1028 struct net *net = dev_net(dev);
1030 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1032 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1033 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
1041 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1042 mtu = IPV6_MAXPLEN;
1043 return mtu;
1046 static unsigned int ip6_mtu(const struct dst_entry *dst)
1048 struct inet6_dev *idev;
1049 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1051 if (mtu)
1052 return mtu;
1054 mtu = IPV6_MIN_MTU;
1056 rcu_read_lock();
1057 idev = __in6_dev_get(dst->dev);
1058 if (idev)
1059 mtu = idev->cnf.mtu6;
1060 rcu_read_unlock();
1062 return mtu;
1065 static struct dst_entry *icmp6_dst_gc_list;
1066 static DEFINE_SPINLOCK(icmp6_dst_lock);
1068 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1069 struct neighbour *neigh,
1070 const struct in6_addr *addr)
1072 struct rt6_info *rt;
1073 struct inet6_dev *idev = in6_dev_get(dev);
1074 struct net *net = dev_net(dev);
1076 if (unlikely(idev == NULL))
1077 return NULL;
1079 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1080 if (unlikely(rt == NULL)) {
1081 in6_dev_put(idev);
1082 goto out;
1085 if (neigh)
1086 neigh_hold(neigh);
1087 else {
1088 neigh = ndisc_get_neigh(dev, addr);
1089 if (IS_ERR(neigh))
1090 neigh = NULL;
1093 rt->dst.flags |= DST_HOST;
1094 rt->dst.output = ip6_output;
1095 dst_set_neighbour(&rt->dst, neigh);
1096 atomic_set(&rt->dst.__refcnt, 1);
1097 rt->rt6i_dst.addr = *addr;
1098 rt->rt6i_dst.plen = 128;
1099 rt->rt6i_idev = idev;
1100 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1102 spin_lock_bh(&icmp6_dst_lock);
1103 rt->dst.next = icmp6_dst_gc_list;
1104 icmp6_dst_gc_list = &rt->dst;
1105 spin_unlock_bh(&icmp6_dst_lock);
1107 fib6_force_start_gc(net);
1109 out:
1110 return &rt->dst;
1113 int icmp6_dst_gc(void)
1115 struct dst_entry *dst, **pprev;
1116 int more = 0;
1118 spin_lock_bh(&icmp6_dst_lock);
1119 pprev = &icmp6_dst_gc_list;
1121 while ((dst = *pprev) != NULL) {
1122 if (!atomic_read(&dst->__refcnt)) {
1123 *pprev = dst->next;
1124 dst_free(dst);
1125 } else {
1126 pprev = &dst->next;
1127 ++more;
1131 spin_unlock_bh(&icmp6_dst_lock);
1133 return more;
1136 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1137 void *arg)
1139 struct dst_entry *dst, **pprev;
1141 spin_lock_bh(&icmp6_dst_lock);
1142 pprev = &icmp6_dst_gc_list;
1143 while ((dst = *pprev) != NULL) {
1144 struct rt6_info *rt = (struct rt6_info *) dst;
1145 if (func(rt, arg)) {
1146 *pprev = dst->next;
1147 dst_free(dst);
1148 } else {
1149 pprev = &dst->next;
1152 spin_unlock_bh(&icmp6_dst_lock);
1155 static int ip6_dst_gc(struct dst_ops *ops)
1157 unsigned long now = jiffies;
1158 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1159 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1160 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1161 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1162 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1163 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1164 int entries;
1166 entries = dst_entries_get_fast(ops);
1167 if (time_after(rt_last_gc + rt_min_interval, now) &&
1168 entries <= rt_max_size)
1169 goto out;
1171 net->ipv6.ip6_rt_gc_expire++;
1172 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1173 net->ipv6.ip6_rt_last_gc = now;
1174 entries = dst_entries_get_slow(ops);
1175 if (entries < ops->gc_thresh)
1176 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1177 out:
1178 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1179 return entries > rt_max_size;
/* Clean host part of a prefix. Not necessary in radix tree,
   but results in cleaner routing tables.

   Remove it only when all the things will work!
 */
1188 int ip6_dst_hoplimit(struct dst_entry *dst)
1190 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1191 if (hoplimit == 0) {
1192 struct net_device *dev = dst->dev;
1193 struct inet6_dev *idev;
1195 rcu_read_lock();
1196 idev = __in6_dev_get(dev);
1197 if (idev)
1198 hoplimit = idev->cnf.hop_limit;
1199 else
1200 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1201 rcu_read_unlock();
1203 return hoplimit;
1205 EXPORT_SYMBOL(ip6_dst_hoplimit);
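
/*
 * Add a route described by @cfg: validate the request, allocate and fill
 * an rt6_info (reject-route promotion, gateway validation, prefsrc check,
 * netlink metrics), then insert it into the selected FIB table.
 */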
1211 int ip6_route_add(struct fib6_config *cfg)
1213 int err;
1214 struct net *net = cfg->fc_nlinfo.nl_net;
1215 struct rt6_info *rt = NULL;
1216 struct net_device *dev = NULL;
1217 struct inet6_dev *idev = NULL;
1218 struct fib6_table *table;
1219 int addr_type;
1221 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1222 return -EINVAL;
1223 #ifndef CONFIG_IPV6_SUBTREES
1224 if (cfg->fc_src_len)
1225 return -EINVAL;
1226 #endif
1227 if (cfg->fc_ifindex) {
1228 err = -ENODEV;
1229 dev = dev_get_by_index(net, cfg->fc_ifindex);
1230 if (!dev)
1231 goto out;
1232 idev = in6_dev_get(dev);
1233 if (!idev)
1234 goto out;
1237 if (cfg->fc_metric == 0)
1238 cfg->fc_metric = IP6_RT_PRIO_USER;
1240 err = -ENOBUFS;
1241 if (NULL != cfg->fc_nlinfo.nlh &&
1242 !(cfg->fc_nlinfo.nlh->nlmsg_flags&NLM_F_CREATE)) {
1243 table = fib6_get_table(net, cfg->fc_table);
1244 if (table == NULL) {
1245 printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1246 table = fib6_new_table(net, cfg->fc_table);
1248 } else {
1249 table = fib6_new_table(net, cfg->fc_table);
1251 if (table == NULL) {
1252 goto out;
1255 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1257 if (rt == NULL) {
1258 err = -ENOMEM;
1259 goto out;
1262 rt->dst.obsolete = -1;
1263 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1264 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1267 if (cfg->fc_protocol == RTPROT_UNSPEC)
1268 cfg->fc_protocol = RTPROT_BOOT;
1269 rt->rt6i_protocol = cfg->fc_protocol;
1271 addr_type = ipv6_addr_type(&cfg->fc_dst);
1273 if (addr_type & IPV6_ADDR_MULTICAST)
1274 rt->dst.input = ip6_mc_input;
1275 else if (cfg->fc_flags & RTF_LOCAL)
1276 rt->dst.input = ip6_input;
1277 else
1278 rt->dst.input = ip6_forward;
1280 rt->dst.output = ip6_output;
1282 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1283 rt->rt6i_dst.plen = cfg->fc_dst_len;
1284 if (rt->rt6i_dst.plen == 128)
1285 rt->dst.flags |= DST_HOST;
1287 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1288 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1289 if (!metrics) {
1290 err = -ENOMEM;
1291 goto out;
1293 dst_init_metrics(&rt->dst, metrics, 0);
1295 #ifdef CONFIG_IPV6_SUBTREES
1296 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1297 rt->rt6i_src.plen = cfg->fc_src_len;
1298 #endif
1300 rt->rt6i_metric = cfg->fc_metric;
	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
1305 if ((cfg->fc_flags & RTF_REJECT) ||
1306 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1307 && !(cfg->fc_flags&RTF_LOCAL))) {
1308 /* hold loopback dev/idev if we haven't done so. */
1309 if (dev != net->loopback_dev) {
1310 if (dev) {
1311 dev_put(dev);
1312 in6_dev_put(idev);
1314 dev = net->loopback_dev;
1315 dev_hold(dev);
1316 idev = in6_dev_get(dev);
1317 if (!idev) {
1318 err = -ENODEV;
1319 goto out;
1322 rt->dst.output = ip6_pkt_discard_out;
1323 rt->dst.input = ip6_pkt_discard;
1324 rt->dst.error = -ENETUNREACH;
1325 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1326 goto install_route;
1329 if (cfg->fc_flags & RTF_GATEWAY) {
1330 const struct in6_addr *gw_addr;
1331 int gwa_type;
1333 gw_addr = &cfg->fc_gateway;
1334 rt->rt6i_gateway = *gw_addr;
1335 gwa_type = ipv6_addr_type(gw_addr);
1337 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1338 struct rt6_info *grt;
			/* IPv6 strictly inhibits using non-link-local
			   addresses as nexthop address.
			   Otherwise, the router will not be able to send redirects.
			   That is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
1347 err = -EINVAL;
1348 if (!(gwa_type&IPV6_ADDR_UNICAST))
1349 goto out;
1351 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1353 err = -EHOSTUNREACH;
1354 if (grt == NULL)
1355 goto out;
1356 if (dev) {
1357 if (dev != grt->rt6i_dev) {
1358 dst_release(&grt->dst);
1359 goto out;
1361 } else {
1362 dev = grt->rt6i_dev;
1363 idev = grt->rt6i_idev;
1364 dev_hold(dev);
1365 in6_dev_hold(grt->rt6i_idev);
1367 if (!(grt->rt6i_flags&RTF_GATEWAY))
1368 err = 0;
1369 dst_release(&grt->dst);
1371 if (err)
1372 goto out;
1374 err = -EINVAL;
1375 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1376 goto out;
1379 err = -ENODEV;
1380 if (dev == NULL)
1381 goto out;
1383 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1384 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1385 err = -EINVAL;
1386 goto out;
1388 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1389 rt->rt6i_prefsrc.plen = 128;
1390 } else
1391 rt->rt6i_prefsrc.plen = 0;
1393 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1394 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1395 if (IS_ERR(n)) {
1396 err = PTR_ERR(n);
1397 goto out;
1399 dst_set_neighbour(&rt->dst, n);
1402 rt->rt6i_flags = cfg->fc_flags;
1404 install_route:
1405 if (cfg->fc_mx) {
1406 struct nlattr *nla;
1407 int remaining;
1409 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1410 int type = nla_type(nla);
1412 if (type) {
1413 if (type > RTAX_MAX) {
1414 err = -EINVAL;
1415 goto out;
1418 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1423 rt->dst.dev = dev;
1424 rt->rt6i_idev = idev;
1425 rt->rt6i_table = table;
1427 cfg->fc_nlinfo.nl_net = dev_net(dev);
1429 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1431 out:
1432 if (dev)
1433 dev_put(dev);
1434 if (idev)
1435 in6_dev_put(idev);
1436 if (rt)
1437 dst_free(&rt->dst);
1438 return err;
1441 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1443 int err;
1444 struct fib6_table *table;
1445 struct net *net = dev_net(rt->rt6i_dev);
1447 if (rt == net->ipv6.ip6_null_entry)
1448 return -ENOENT;
1450 table = rt->rt6i_table;
1451 write_lock_bh(&table->tb6_lock);
1453 err = fib6_del(rt, info);
1454 dst_release(&rt->dst);
1456 write_unlock_bh(&table->tb6_lock);
1458 return err;
1461 int ip6_del_rt(struct rt6_info *rt)
1463 struct nl_info info = {
1464 .nl_net = dev_net(rt->rt6i_dev),
1466 return __ip6_del_rt(rt, &info);
1469 static int ip6_route_del(struct fib6_config *cfg)
1471 struct fib6_table *table;
1472 struct fib6_node *fn;
1473 struct rt6_info *rt;
1474 int err = -ESRCH;
1476 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1477 if (table == NULL)
1478 return err;
1480 read_lock_bh(&table->tb6_lock);
1482 fn = fib6_locate(&table->tb6_root,
1483 &cfg->fc_dst, cfg->fc_dst_len,
1484 &cfg->fc_src, cfg->fc_src_len);
1486 if (fn) {
1487 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1488 if (cfg->fc_ifindex &&
1489 (rt->rt6i_dev == NULL ||
1490 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1491 continue;
1492 if (cfg->fc_flags & RTF_GATEWAY &&
1493 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1494 continue;
1495 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1496 continue;
1497 dst_hold(&rt->dst);
1498 read_unlock_bh(&table->tb6_lock);
1500 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1503 read_unlock_bh(&table->tb6_lock);
1505 return err;
/*
 *	Handle redirects
 */
1511 struct ip6rd_flowi {
1512 struct flowi6 fl6;
1513 struct in6_addr gateway;
1516 static struct rt6_info *__ip6_route_redirect(struct net *net,
1517 struct fib6_table *table,
1518 struct flowi6 *fl6,
1519 int flags)
1521 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1522 struct rt6_info *rt;
1523 struct fib6_node *fn;
	/*
	 * Get the "current" route for this destination and
	 * check if the redirect has come from an appropriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */
1536 read_lock_bh(&table->tb6_lock);
1537 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1538 restart:
1539 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		/*
		 *	Current route is on-link; redirect is always invalid.
		 *
		 *	Seems, previous statement is not true. It could
		 *	be a node which regards us as on-link (e.g. proxy ndisc),
		 *	but then the router serving it might decide that we should
		 *	know the truth 8)8) --ANK (980726).
		 */
1548 if (rt6_check_expired(rt))
1549 continue;
1550 if (!(rt->rt6i_flags & RTF_GATEWAY))
1551 continue;
1552 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1553 continue;
1554 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1555 continue;
1556 break;
1559 if (!rt)
1560 rt = net->ipv6.ip6_null_entry;
1561 BACKTRACK(net, &fl6->saddr);
1562 out:
1563 dst_hold(&rt->dst);
1565 read_unlock_bh(&table->tb6_lock);
1567 return rt;
1570 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1571 const struct in6_addr *src,
1572 const struct in6_addr *gateway,
1573 struct net_device *dev)
1575 int flags = RT6_LOOKUP_F_HAS_SADDR;
1576 struct net *net = dev_net(dev);
1577 struct ip6rd_flowi rdfl = {
1578 .fl6 = {
1579 .flowi6_oif = dev->ifindex,
1580 .daddr = *dest,
1581 .saddr = *src,
1585 rdfl.gateway = *gateway;
1587 if (rt6_need_strict(dest))
1588 flags |= RT6_LOOKUP_F_IFACE;
1590 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1591 flags, __ip6_route_redirect);
1594 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1595 const struct in6_addr *saddr,
1596 struct neighbour *neigh, u8 *lladdr, int on_link)
1598 struct rt6_info *rt, *nrt = NULL;
1599 struct netevent_redirect netevent;
1600 struct net *net = dev_net(neigh->dev);
1602 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1604 if (rt == net->ipv6.ip6_null_entry) {
1605 if (net_ratelimit())
1606 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1607 "for redirect target\n");
1608 goto out;
1612 * We have finally decided to accept it.
1615 neigh_update(neigh, lladdr, NUD_STALE,
1616 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1617 NEIGH_UPDATE_F_OVERRIDE|
1618 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1619 NEIGH_UPDATE_F_ISROUTER))
1623 * Redirect received -> path was valid.
1624 * Look, redirects are sent only in response to data packets,
1625 * so that this nexthop apparently is reachable. --ANK
1627 dst_confirm(&rt->dst);
1629 /* Duplicate redirect: silently ignore. */
1630 if (neigh == dst_get_neighbour_raw(&rt->dst))
1631 goto out;
1633 nrt = ip6_rt_copy(rt, dest);
1634 if (nrt == NULL)
1635 goto out;
1637 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1638 if (on_link)
1639 nrt->rt6i_flags &= ~RTF_GATEWAY;
1641 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1642 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1644 if (ip6_ins_rt(nrt))
1645 goto out;
1647 netevent.old = &rt->dst;
1648 netevent.new = &nrt->dst;
1649 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1651 if (rt->rt6i_flags&RTF_CACHE) {
1652 ip6_del_rt(rt);
1653 return;
1656 out:
1657 dst_release(&rt->dst);
/*
 *	Handle ICMP "packet too big" messages
 *	i.e. Path MTU discovery
 */
1665 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1666 struct net *net, u32 pmtu, int ifindex)
1668 struct rt6_info *rt, *nrt;
1669 int allfrag = 0;
1670 again:
1671 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1672 if (rt == NULL)
1673 return;
1675 if (rt6_check_expired(rt)) {
1676 ip6_del_rt(rt);
1677 goto again;
1680 if (pmtu >= dst_mtu(&rt->dst))
1681 goto out;
1683 if (pmtu < IPV6_MIN_MTU) {
		/*
		 * According to RFC 2460, the PMTU is set to the IPv6 Minimum Link
		 * MTU (1280) and a fragment header should always be included
		 * after a node receives a Too Big message reporting a PMTU that
		 * is less than the IPv6 Minimum Link MTU.
		 */
1690 pmtu = IPV6_MIN_MTU;
1691 allfrag = 1;
1694 /* New mtu received -> path was valid.
1695 They are sent only in response to data packets,
1696 so that this nexthop apparently is reachable. --ANK
1698 dst_confirm(&rt->dst);
1700 /* Host route. If it is static, it would be better
1701 not to override it, but add new one, so that
1702 when cache entry will expire old pmtu
1703 would return automatically.
1705 if (rt->rt6i_flags & RTF_CACHE) {
1706 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1707 if (allfrag) {
1708 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1709 features |= RTAX_FEATURE_ALLFRAG;
1710 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1712 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1713 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1714 goto out;
1717 /* Network route.
1718 Two cases are possible:
1719 1. It is connected route. Action: COW
1720 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1722 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1723 nrt = rt6_alloc_cow(rt, daddr, saddr);
1724 else
1725 nrt = rt6_alloc_clone(rt, daddr);
1727 if (nrt) {
1728 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1729 if (allfrag) {
1730 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1731 features |= RTAX_FEATURE_ALLFRAG;
1732 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
		/* According to RFC 1981, a PMTU increase shouldn't be
		 * detected within 5 minutes; the recommended timer is 10 minutes.
		 * Here this route expiration time is set to ip6_rt_mtu_expires,
		 * which is 10 minutes.  After 10 minutes the decreased pmtu expires
		 * and PMTU increase detection happens automatically.
		 */
1741 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1742 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1744 ip6_ins_rt(nrt);
1746 out:
1747 dst_release(&rt->dst);
1750 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1751 struct net_device *dev, u32 pmtu)
1753 struct net *net = dev_net(dev);
	/*
	 * RFC 1981 states that a node "MUST reduce the size of the packets it
	 * is sending along the path" that caused the Packet Too Big message.
	 * Since it's not possible in the general case to determine which
	 * interface was used to send the original packet, we update the MTU
	 * on the interface that will be used to send future packets. We also
	 * update the MTU on the interface that received the Packet Too Big in
	 * case the original packet was forced out that interface with
	 * SO_BINDTODEVICE or similar. This is the next best thing to the
	 * correct behaviour, which would be to update the MTU on all
	 * interfaces.
	 */
1767 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1768 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
/*
 *	Misc support functions
 */
1775 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1776 const struct in6_addr *dest)
1778 struct net *net = dev_net(ort->rt6i_dev);
1779 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1780 ort->dst.dev, 0);
1782 if (rt) {
1783 rt->dst.input = ort->dst.input;
1784 rt->dst.output = ort->dst.output;
1785 rt->dst.flags |= DST_HOST;
1787 rt->rt6i_dst.addr = *dest;
1788 rt->rt6i_dst.plen = 128;
1789 dst_copy_metrics(&rt->dst, &ort->dst);
1790 rt->dst.error = ort->dst.error;
1791 rt->rt6i_idev = ort->rt6i_idev;
1792 if (rt->rt6i_idev)
1793 in6_dev_hold(rt->rt6i_idev);
1794 rt->dst.lastuse = jiffies;
1795 rt->rt6i_expires = 0;
1797 rt->rt6i_gateway = ort->rt6i_gateway;
1798 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1799 rt->rt6i_metric = 0;
1801 #ifdef CONFIG_IPV6_SUBTREES
1802 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1803 #endif
1804 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1805 rt->rt6i_table = ort->rt6i_table;
1807 return rt;
1810 #ifdef CONFIG_IPV6_ROUTE_INFO
1811 static struct rt6_info *rt6_get_route_info(struct net *net,
1812 const struct in6_addr *prefix, int prefixlen,
1813 const struct in6_addr *gwaddr, int ifindex)
1815 struct fib6_node *fn;
1816 struct rt6_info *rt = NULL;
1817 struct fib6_table *table;
1819 table = fib6_get_table(net, RT6_TABLE_INFO);
1820 if (table == NULL)
1821 return NULL;
1823 write_lock_bh(&table->tb6_lock);
1824 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1825 if (!fn)
1826 goto out;
1828 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1829 if (rt->rt6i_dev->ifindex != ifindex)
1830 continue;
1831 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1832 continue;
1833 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1834 continue;
1835 dst_hold(&rt->dst);
1836 break;
1838 out:
1839 write_unlock_bh(&table->tb6_lock);
1840 return rt;
1843 static struct rt6_info *rt6_add_route_info(struct net *net,
1844 const struct in6_addr *prefix, int prefixlen,
1845 const struct in6_addr *gwaddr, int ifindex,
1846 unsigned pref)
1848 struct fib6_config cfg = {
1849 .fc_table = RT6_TABLE_INFO,
1850 .fc_metric = IP6_RT_PRIO_USER,
1851 .fc_ifindex = ifindex,
1852 .fc_dst_len = prefixlen,
1853 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1854 RTF_UP | RTF_PREF(pref),
1855 .fc_nlinfo.pid = 0,
1856 .fc_nlinfo.nlh = NULL,
1857 .fc_nlinfo.nl_net = net,
1860 cfg.fc_dst = *prefix;
1861 cfg.fc_gateway = *gwaddr;
1863 /* We should treat it as a default route if prefix length is 0. */
1864 if (!prefixlen)
1865 cfg.fc_flags |= RTF_DEFAULT;
1867 ip6_route_add(&cfg);
1869 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1871 #endif
1873 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1875 struct rt6_info *rt;
1876 struct fib6_table *table;
1878 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1879 if (table == NULL)
1880 return NULL;
1882 write_lock_bh(&table->tb6_lock);
1883 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1884 if (dev == rt->rt6i_dev &&
1885 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1886 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1887 break;
1889 if (rt)
1890 dst_hold(&rt->dst);
1891 write_unlock_bh(&table->tb6_lock);
1892 return rt;
1895 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1896 struct net_device *dev,
1897 unsigned int pref)
1899 struct fib6_config cfg = {
1900 .fc_table = RT6_TABLE_DFLT,
1901 .fc_metric = IP6_RT_PRIO_USER,
1902 .fc_ifindex = dev->ifindex,
1903 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1904 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1905 .fc_nlinfo.pid = 0,
1906 .fc_nlinfo.nlh = NULL,
1907 .fc_nlinfo.nl_net = dev_net(dev),
1910 cfg.fc_gateway = *gwaddr;
1912 ip6_route_add(&cfg);
1914 return rt6_get_dflt_router(gwaddr, dev);
1917 void rt6_purge_dflt_routers(struct net *net)
1919 struct rt6_info *rt;
1920 struct fib6_table *table;
1922 /* NOTE: Keep consistent with rt6_get_dflt_router */
1923 table = fib6_get_table(net, RT6_TABLE_DFLT);
1924 if (table == NULL)
1925 return;
1927 restart:
1928 read_lock_bh(&table->tb6_lock);
1929 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1930 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1931 dst_hold(&rt->dst);
1932 read_unlock_bh(&table->tb6_lock);
1933 ip6_del_rt(rt);
1934 goto restart;
1937 read_unlock_bh(&table->tb6_lock);
1940 static void rtmsg_to_fib6_config(struct net *net,
1941 struct in6_rtmsg *rtmsg,
1942 struct fib6_config *cfg)
1944 memset(cfg, 0, sizeof(*cfg));
1946 cfg->fc_table = RT6_TABLE_MAIN;
1947 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1948 cfg->fc_metric = rtmsg->rtmsg_metric;
1949 cfg->fc_expires = rtmsg->rtmsg_info;
1950 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1951 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1952 cfg->fc_flags = rtmsg->rtmsg_flags;
1954 cfg->fc_nlinfo.nl_net = net;
1956 cfg->fc_dst = rtmsg->rtmsg_dst;
1957 cfg->fc_src = rtmsg->rtmsg_src;
1958 cfg->fc_gateway = rtmsg->rtmsg_gateway;
1961 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1963 struct fib6_config cfg;
1964 struct in6_rtmsg rtmsg;
1965 int err;
1967 switch(cmd) {
1968 case SIOCADDRT: /* Add a route */
1969 case SIOCDELRT: /* Delete a route */
1970 if (!capable(CAP_NET_ADMIN))
1971 return -EPERM;
1972 err = copy_from_user(&rtmsg, arg,
1973 sizeof(struct in6_rtmsg));
1974 if (err)
1975 return -EFAULT;
1977 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1979 rtnl_lock();
1980 switch (cmd) {
1981 case SIOCADDRT:
1982 err = ip6_route_add(&cfg);
1983 break;
1984 case SIOCDELRT:
1985 err = ip6_route_del(&cfg);
1986 break;
1987 default:
1988 err = -EINVAL;
1990 rtnl_unlock();
1992 return err;
1995 return -EINVAL;
/*
 *	Drop the packet on the floor
 */
2002 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2004 int type;
2005 struct dst_entry *dst = skb_dst(skb);
2006 switch (ipstats_mib_noroutes) {
2007 case IPSTATS_MIB_INNOROUTES:
2008 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2009 if (type == IPV6_ADDR_ANY) {
2010 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2011 IPSTATS_MIB_INADDRERRORS);
2012 break;
2014 /* FALLTHROUGH */
2015 case IPSTATS_MIB_OUTNOROUTES:
2016 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2017 ipstats_mib_noroutes);
2018 break;
2020 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2021 kfree_skb(skb);
2022 return 0;
2025 static int ip6_pkt_discard(struct sk_buff *skb)
2027 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2030 static int ip6_pkt_discard_out(struct sk_buff *skb)
2032 skb->dev = skb_dst(skb)->dev;
2033 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2036 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2038 static int ip6_pkt_prohibit(struct sk_buff *skb)
2040 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2043 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2045 skb->dev = skb_dst(skb)->dev;
2046 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2049 #endif
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */
2055 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2056 const struct in6_addr *addr,
2057 int anycast)
2059 struct net *net = dev_net(idev->dev);
2060 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2061 net->loopback_dev, 0);
2062 struct neighbour *neigh;
2064 if (rt == NULL) {
2065 if (net_ratelimit())
2066 pr_warning("IPv6: Maximum number of routes reached,"
2067 " consider increasing route/max_size.\n");
2068 return ERR_PTR(-ENOMEM);
2071 in6_dev_hold(idev);
2073 rt->dst.flags |= DST_HOST;
2074 rt->dst.input = ip6_input;
2075 rt->dst.output = ip6_output;
2076 rt->rt6i_idev = idev;
2077 rt->dst.obsolete = -1;
2079 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2080 if (anycast)
2081 rt->rt6i_flags |= RTF_ANYCAST;
2082 else
2083 rt->rt6i_flags |= RTF_LOCAL;
2084 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2085 if (IS_ERR(neigh)) {
2086 dst_free(&rt->dst);
2088 return ERR_CAST(neigh);
2090 dst_set_neighbour(&rt->dst, neigh);
2092 rt->rt6i_dst.addr = *addr;
2093 rt->rt6i_dst.plen = 128;
2094 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2096 atomic_set(&rt->dst.__refcnt, 1);
2098 return rt;
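
/* Pick a source address for @daddr: prefer the route's configured
 * prefsrc, otherwise fall back to regular source address selection on
 * the route's device.
 */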
2101 int ip6_route_get_saddr(struct net *net,
2102 struct rt6_info *rt,
2103 const struct in6_addr *daddr,
2104 unsigned int prefs,
2105 struct in6_addr *saddr)
2107 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2108 int err = 0;
2109 if (rt->rt6i_prefsrc.plen)
2110 *saddr = rt->rt6i_prefsrc.addr;
2111 else
2112 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2113 daddr, prefs, saddr);
2114 return err;
2117 /* remove deleted ip from prefsrc entries */
2118 struct arg_dev_net_ip {
2119 struct net_device *dev;
2120 struct net *net;
2121 struct in6_addr *addr;
2124 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2126 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2127 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2128 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2130 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2131 rt != net->ipv6.ip6_null_entry &&
2132 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2133 /* remove prefsrc entry */
2134 rt->rt6i_prefsrc.plen = 0;
2136 return 0;
2139 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2141 struct net *net = dev_net(ifp->idev->dev);
2142 struct arg_dev_net_ip adni = {
2143 .dev = ifp->idev->dev,
2144 .net = net,
2145 .addr = &ifp->addr,
2147 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2150 struct arg_dev_net {
2151 struct net_device *dev;
2152 struct net *net;
2155 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2157 const struct arg_dev_net *adn = arg;
2158 const struct net_device *dev = adn->dev;
2160 if ((rt->rt6i_dev == dev || dev == NULL) &&
2161 rt != adn->net->ipv6.ip6_null_entry) {
2162 RT6_TRACE("deleted by ifdown %p\n", rt);
2163 return -1;
2165 return 0;
2168 void rt6_ifdown(struct net *net, struct net_device *dev)
2170 struct arg_dev_net adn = {
2171 .dev = dev,
2172 .net = net,
2175 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2176 icmp6_clean_all(fib6_ifdown, &adn);
2179 struct rt6_mtu_change_arg
2181 struct net_device *dev;
2182 unsigned mtu;
2185 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2187 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2188 struct inet6_dev *idev;
	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/
2196 idev = __in6_dev_get(arg->dev);
2197 if (idev == NULL)
2198 return 0;
	/* For an administrative MTU increase, there is no way to discover
	   an IPv6 PMTU increase, so the PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase,
	   updating the PMTU on increase is a MUST. (i.e. jumbo frame)

	   If the new MTU is less than the route PMTU, this new MTU will be the
	   lowest MTU in the path; update the route PMTU to reflect the PMTU
	   decrease.  If the new MTU is greater than the route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase.  In this case, if the other nodes' MTU
	   is also the lowest MTU in the path, a TOO BIG MESSAGE will lead to
	   PMTU discovery.
	 */
2214 if (rt->rt6i_dev == arg->dev &&
2215 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2216 (dst_mtu(&rt->dst) >= arg->mtu ||
2217 (dst_mtu(&rt->dst) < arg->mtu &&
2218 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2219 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2221 return 0;
2224 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2226 struct rt6_mtu_change_arg arg = {
2227 .dev = dev,
2228 .mtu = mtu,
2231 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
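
/* Netlink attribute policy and conversion of RTM_NEWROUTE/RTM_DELROUTE
 * requests into a struct fib6_config.
 */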
2234 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2235 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2236 [RTA_OIF] = { .type = NLA_U32 },
2237 [RTA_IIF] = { .type = NLA_U32 },
2238 [RTA_PRIORITY] = { .type = NLA_U32 },
2239 [RTA_METRICS] = { .type = NLA_NESTED },
2242 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2243 struct fib6_config *cfg)
2245 struct rtmsg *rtm;
2246 struct nlattr *tb[RTA_MAX+1];
2247 int err;
2249 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2250 if (err < 0)
2251 goto errout;
2253 err = -EINVAL;
2254 rtm = nlmsg_data(nlh);
2255 memset(cfg, 0, sizeof(*cfg));
2257 cfg->fc_table = rtm->rtm_table;
2258 cfg->fc_dst_len = rtm->rtm_dst_len;
2259 cfg->fc_src_len = rtm->rtm_src_len;
2260 cfg->fc_flags = RTF_UP;
2261 cfg->fc_protocol = rtm->rtm_protocol;
2263 if (rtm->rtm_type == RTN_UNREACHABLE)
2264 cfg->fc_flags |= RTF_REJECT;
2266 if (rtm->rtm_type == RTN_LOCAL)
2267 cfg->fc_flags |= RTF_LOCAL;
2269 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2270 cfg->fc_nlinfo.nlh = nlh;
2271 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2273 if (tb[RTA_GATEWAY]) {
2274 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2275 cfg->fc_flags |= RTF_GATEWAY;
2278 if (tb[RTA_DST]) {
2279 int plen = (rtm->rtm_dst_len + 7) >> 3;
2281 if (nla_len(tb[RTA_DST]) < plen)
2282 goto errout;
2284 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2287 if (tb[RTA_SRC]) {
2288 int plen = (rtm->rtm_src_len + 7) >> 3;
2290 if (nla_len(tb[RTA_SRC]) < plen)
2291 goto errout;
2293 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2296 if (tb[RTA_PREFSRC])
2297 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2299 if (tb[RTA_OIF])
2300 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2302 if (tb[RTA_PRIORITY])
2303 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2305 if (tb[RTA_METRICS]) {
2306 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2307 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2310 if (tb[RTA_TABLE])
2311 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2313 err = 0;
2314 errout:
2315 return err;
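/*
 * Illustration (not part of this file): rtm_to_fib6_config() above consumes
 * RTM_NEWROUTE/RTM_DELROUTE requests and maps the rtmsg fields and
 * attributes into a struct fib6_config.  A minimal standalone userspace
 * sketch that hand-builds such a request (a /64 device route) and sends it
 * over NETLINK_ROUTE.  The prefix and ifindex are placeholders, error and
 * ACK handling are trimmed, and iproute2/libmnl would normally do this.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	struct in6_addr dst;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_NEWROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 64;		/* becomes cfg->fc_dst_len */
	req.rtm.rtm_table = RT_TABLE_MAIN;	/* becomes cfg->fc_table */
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_type = RTN_UNICAST;

	/* RTA_DST: copied into cfg->fc_dst by nla_memcpy() above */
	inet_pton(AF_INET6, "2001:db8:1::", &dst);	/* placeholder prefix */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	/* RTA_OIF: becomes cfg->fc_ifindex */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_OIF;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	*(int *)RTA_DATA(rta) = 2;			/* placeholder ifindex */
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	sendto(fd, &req, req.nlh.nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return 0;
}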
2318 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2320 struct fib6_config cfg;
2321 int err;
2323 err = rtm_to_fib6_config(skb, nlh, &cfg);
2324 if (err < 0)
2325 return err;
2327 return ip6_route_del(&cfg);
2330 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2332 struct fib6_config cfg;
2333 int err;
2335 err = rtm_to_fib6_config(skb, nlh, &cfg);
2336 if (err < 0)
2337 return err;
2339 return ip6_route_add(&cfg);
2342 static inline size_t rt6_nlmsg_size(void)
2344 return NLMSG_ALIGN(sizeof(struct rtmsg))
2345 + nla_total_size(16) /* RTA_SRC */
2346 + nla_total_size(16) /* RTA_DST */
2347 + nla_total_size(16) /* RTA_GATEWAY */
2348 + nla_total_size(16) /* RTA_PREFSRC */
2349 + nla_total_size(4) /* RTA_TABLE */
2350 + nla_total_size(4) /* RTA_IIF */
2351 + nla_total_size(4) /* RTA_OIF */
2352 + nla_total_size(4) /* RTA_PRIORITY */
2353 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2354 + nla_total_size(sizeof(struct rta_cacheinfo));
2357 static int rt6_fill_node(struct net *net,
2358 struct sk_buff *skb, struct rt6_info *rt,
2359 struct in6_addr *dst, struct in6_addr *src,
2360 int iif, int type, u32 pid, u32 seq,
2361 int prefix, int nowait, unsigned int flags)
2363 struct rtmsg *rtm;
2364 struct nlmsghdr *nlh;
2365 long expires;
2366 u32 table;
2367 struct neighbour *n;
2369 if (prefix) { /* user wants prefix routes only */
2370 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2371 /* success since this is not a prefix route */
2372 return 1;
2376 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2377 if (nlh == NULL)
2378 return -EMSGSIZE;
2380 rtm = nlmsg_data(nlh);
2381 rtm->rtm_family = AF_INET6;
2382 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2383 rtm->rtm_src_len = rt->rt6i_src.plen;
2384 rtm->rtm_tos = 0;
2385 if (rt->rt6i_table)
2386 table = rt->rt6i_table->tb6_id;
2387 else
2388 table = RT6_TABLE_UNSPEC;
2389 rtm->rtm_table = table;
2390 NLA_PUT_U32(skb, RTA_TABLE, table);
2391 if (rt->rt6i_flags&RTF_REJECT)
2392 rtm->rtm_type = RTN_UNREACHABLE;
2393 else if (rt->rt6i_flags&RTF_LOCAL)
2394 rtm->rtm_type = RTN_LOCAL;
2395 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2396 rtm->rtm_type = RTN_LOCAL;
2397 else
2398 rtm->rtm_type = RTN_UNICAST;
2399 rtm->rtm_flags = 0;
2400 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2401 rtm->rtm_protocol = rt->rt6i_protocol;
2402 if (rt->rt6i_flags&RTF_DYNAMIC)
2403 rtm->rtm_protocol = RTPROT_REDIRECT;
2404 else if (rt->rt6i_flags & RTF_ADDRCONF)
2405 rtm->rtm_protocol = RTPROT_KERNEL;
2406 else if (rt->rt6i_flags&RTF_DEFAULT)
2407 rtm->rtm_protocol = RTPROT_RA;
2409 if (rt->rt6i_flags&RTF_CACHE)
2410 rtm->rtm_flags |= RTM_F_CLONED;
2412 if (dst) {
2413 NLA_PUT(skb, RTA_DST, 16, dst);
2414 rtm->rtm_dst_len = 128;
2415 } else if (rtm->rtm_dst_len)
2416 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2417 #ifdef CONFIG_IPV6_SUBTREES
2418 if (src) {
2419 NLA_PUT(skb, RTA_SRC, 16, src);
2420 rtm->rtm_src_len = 128;
2421 } else if (rtm->rtm_src_len)
2422 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2423 #endif
2424 if (iif) {
2425 #ifdef CONFIG_IPV6_MROUTE
2426 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2427 int err = ip6mr_get_route(net, skb, rtm, nowait);
2428 if (err <= 0) {
2429 if (!nowait) {
2430 if (err == 0)
2431 return 0;
2432 goto nla_put_failure;
2433 } else {
2434 if (err == -EMSGSIZE)
2435 goto nla_put_failure;
2438 } else
2439 #endif
2440 NLA_PUT_U32(skb, RTA_IIF, iif);
2441 } else if (dst) {
2442 struct in6_addr saddr_buf;
2443 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2444 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2447 if (rt->rt6i_prefsrc.plen) {
2448 struct in6_addr saddr_buf;
2449 saddr_buf = rt->rt6i_prefsrc.addr;
2450 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2453 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2454 goto nla_put_failure;
2456 rcu_read_lock();
2457 n = dst_get_neighbour(&rt->dst);
2458 if (n)
2459 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2460 rcu_read_unlock();
2462 if (rt->dst.dev)
2463 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2465 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2467 if (!(rt->rt6i_flags & RTF_EXPIRES))
2468 expires = 0;
2469 else if (rt->rt6i_expires - jiffies < INT_MAX)
2470 expires = rt->rt6i_expires - jiffies;
2471 else
2472 expires = INT_MAX;
2474 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2475 expires, rt->dst.error) < 0)
2476 goto nla_put_failure;
2478 return nlmsg_end(skb, nlh);
2480 nla_put_failure:
2481 nlmsg_cancel(skb, nlh);
2482 return -EMSGSIZE;
2485 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2487 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2488 int prefix;
2490 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2491 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2492 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2493 } else
2494 prefix = 0;
2496 return rt6_fill_node(arg->net,
2497 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2498 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2499 prefix, 0, NLM_F_MULTI);
2502 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2504 struct net *net = sock_net(in_skb->sk);
2505 struct nlattr *tb[RTA_MAX+1];
2506 struct rt6_info *rt;
2507 struct sk_buff *skb;
2508 struct rtmsg *rtm;
2509 struct flowi6 fl6;
2510 int err, iif = 0;
2512 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2513 if (err < 0)
2514 goto errout;
2516 err = -EINVAL;
2517 memset(&fl6, 0, sizeof(fl6));
2519 if (tb[RTA_SRC]) {
2520 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2521 goto errout;
2523 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2526 if (tb[RTA_DST]) {
2527 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2528 goto errout;
2530 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2533 if (tb[RTA_IIF])
2534 iif = nla_get_u32(tb[RTA_IIF]);
2536 if (tb[RTA_OIF])
2537 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2539 if (iif) {
2540 struct net_device *dev;
2541 dev = __dev_get_by_index(net, iif);
2542 if (!dev) {
2543 err = -ENODEV;
2544 goto errout;
2548 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2549 if (skb == NULL) {
2550 err = -ENOBUFS;
2551 goto errout;
2554 /* Reserve room for dummy headers; this skb can pass
2555 through a good chunk of the routing engine.
2557 skb_reset_mac_header(skb);
2558 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2560 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2561 skb_dst_set(skb, &rt->dst);
2563 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2564 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2565 nlh->nlmsg_seq, 0, 0, 0);
2566 if (err < 0) {
2567 kfree_skb(skb);
2568 goto errout;
2571 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2572 errout:
2573 return err;
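/*
 * Illustration (not part of this file): inet6_rtm_getroute() above services
 * "route get" queries, and the reply is assembled by rt6_fill_node().  A
 * standalone userspace sketch that asks for the route to a placeholder
 * destination and walks the returned attributes (RTA_OIF, RTA_GATEWAY,
 * RTA_PRIORITY).  Error and multipart handling are omitted for brevity.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[32];
	} req;
	char buf[4096], gw[INET6_ADDRSTRLEN];
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct nlmsghdr *nlh;
	struct rtattr *rta;
	struct in6_addr dst;
	int fd, len, alen;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;

	/* RTA_DST: ends up in fl6.daddr in inet6_rtm_getroute() */
	inet_pton(AF_INET6, "2001:db8::1", &dst);	/* placeholder destination */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	sendto(fd, &req, req.nlh.nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));

	len = recv(fd, buf, sizeof(buf), 0);
	for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
		if (nlh->nlmsg_type != RTM_NEWROUTE)
			continue;
		rta = RTM_RTA(NLMSG_DATA(nlh));
		alen = RTM_PAYLOAD(nlh);
		for (; RTA_OK(rta, alen); rta = RTA_NEXT(rta, alen)) {
			if (rta->rta_type == RTA_OIF)
				printf("oif %d\n", *(int *)RTA_DATA(rta));
			else if (rta->rta_type == RTA_GATEWAY)
				printf("gateway %s\n",
				       inet_ntop(AF_INET6, RTA_DATA(rta), gw, sizeof(gw)));
			else if (rta->rta_type == RTA_PRIORITY)
				printf("metric %u\n", *(unsigned int *)RTA_DATA(rta));
		}
	}

	close(fd);
	return 0;
}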
2576 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2578 struct sk_buff *skb;
2579 struct net *net = info->nl_net;
2580 u32 seq;
2581 int err;
2583 err = -ENOBUFS;
2584 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2586 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2587 if (skb == NULL)
2588 goto errout;
2590 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2591 event, info->pid, seq, 0, 0, 0);
2592 if (err < 0) {
2593 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2594 WARN_ON(err == -EMSGSIZE);
2595 kfree_skb(skb);
2596 goto errout;
2598 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2599 info->nlh, gfp_any());
2600 return;
2601 errout:
2602 if (err < 0)
2603 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2606 static int ip6_route_dev_notify(struct notifier_block *this,
2607 unsigned long event, void *data)
2609 struct net_device *dev = (struct net_device *)data;
2610 struct net *net = dev_net(dev);
2612 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2613 net->ipv6.ip6_null_entry->dst.dev = dev;
2614 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2615 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2616 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2617 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2618 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2619 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2620 #endif
2623 return NOTIFY_OK;
2627 * /proc
2630 #ifdef CONFIG_PROC_FS
2632 struct rt6_proc_arg
2634 char *buffer;
2635 int offset;
2636 int length;
2637 int skip;
2638 int len;
2641 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2643 struct seq_file *m = p_arg;
2644 struct neighbour *n;
2646 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2648 #ifdef CONFIG_IPV6_SUBTREES
2649 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2650 #else
2651 seq_puts(m, "00000000000000000000000000000000 00 ");
2652 #endif
2653 rcu_read_lock();
2654 n = dst_get_neighbour(&rt->dst);
2655 if (n) {
2656 seq_printf(m, "%pi6", n->primary_key);
2657 } else {
2658 seq_puts(m, "00000000000000000000000000000000");
2660 rcu_read_unlock();
2661 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2662 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2663 rt->dst.__use, rt->rt6i_flags,
2664 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2665 return 0;
2668 static int ipv6_route_show(struct seq_file *m, void *v)
2670 struct net *net = (struct net *)m->private;
2671 fib6_clean_all(net, rt6_info_route, 0, m);
2672 return 0;
2675 static int ipv6_route_open(struct inode *inode, struct file *file)
2677 return single_open_net(inode, file, ipv6_route_show);
2680 static const struct file_operations ipv6_route_proc_fops = {
2681 .owner = THIS_MODULE,
2682 .open = ipv6_route_open,
2683 .read = seq_read,
2684 .llseek = seq_lseek,
2685 .release = single_release_net,
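/*
 * Illustration (not part of this file): rt6_info_route() above emits one
 * line per route into /proc/net/ipv6_route in the legacy fixed format
 * "dst plen src plen gateway metric refcnt use flags devname", with
 * addresses as 32 hex digits and the numeric fields in hex.  A standalone
 * userspace sketch that parses those columns back out; it assumes every
 * line carries a device name, which is the common case.
 */
#include <stdio.h>

int main(void)
{
	char dst[33], src[33], gw[33], dev[17];
	unsigned int dplen, splen, metric, refcnt, use, flags;
	FILE *f = fopen("/proc/net/ipv6_route", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}

	/* Field order and widths follow the seq_printf() calls in rt6_info_route(). */
	while (fscanf(f, "%32s %x %32s %x %32s %x %x %x %x %16s",
		      dst, &dplen, src, &splen, gw,
		      &metric, &refcnt, &use, &flags, dev) == 10)
		printf("%s/%u dev %s metric %u flags %#x\n",
		       dst, dplen, dev, metric, flags);

	fclose(f);
	return 0;
}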
2688 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2690 struct net *net = (struct net *)seq->private;
2691 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2692 net->ipv6.rt6_stats->fib_nodes,
2693 net->ipv6.rt6_stats->fib_route_nodes,
2694 net->ipv6.rt6_stats->fib_rt_alloc,
2695 net->ipv6.rt6_stats->fib_rt_entries,
2696 net->ipv6.rt6_stats->fib_rt_cache,
2697 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2698 net->ipv6.rt6_stats->fib_discarded_routes);
2700 return 0;
2703 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2705 return single_open_net(inode, file, rt6_stats_seq_show);
2708 static const struct file_operations rt6_stats_seq_fops = {
2709 .owner = THIS_MODULE,
2710 .open = rt6_stats_seq_open,
2711 .read = seq_read,
2712 .llseek = seq_lseek,
2713 .release = single_release_net,
2715 #endif /* CONFIG_PROC_FS */
2717 #ifdef CONFIG_SYSCTL
2719 static
2720 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2721 void __user *buffer, size_t *lenp, loff_t *ppos)
2723 struct net *net;
2724 int delay;
2725 if (!write)
2726 return -EINVAL;
2728 net = (struct net *)ctl->extra1;
2729 delay = net->ipv6.sysctl.flush_delay;
2730 proc_dointvec(ctl, write, buffer, lenp, ppos);
2731 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2732 return 0;
2735 ctl_table ipv6_route_table_template[] = {
2737 .procname = "flush",
2738 .data = &init_net.ipv6.sysctl.flush_delay,
2739 .maxlen = sizeof(int),
2740 .mode = 0200,
2741 .proc_handler = ipv6_sysctl_rtcache_flush
2744 .procname = "gc_thresh",
2745 .data = &ip6_dst_ops_template.gc_thresh,
2746 .maxlen = sizeof(int),
2747 .mode = 0644,
2748 .proc_handler = proc_dointvec,
2751 .procname = "max_size",
2752 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2753 .maxlen = sizeof(int),
2754 .mode = 0644,
2755 .proc_handler = proc_dointvec,
2758 .procname = "gc_min_interval",
2759 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2760 .maxlen = sizeof(int),
2761 .mode = 0644,
2762 .proc_handler = proc_dointvec_jiffies,
2765 .procname = "gc_timeout",
2766 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2767 .maxlen = sizeof(int),
2768 .mode = 0644,
2769 .proc_handler = proc_dointvec_jiffies,
2772 .procname = "gc_interval",
2773 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2774 .maxlen = sizeof(int),
2775 .mode = 0644,
2776 .proc_handler = proc_dointvec_jiffies,
2779 .procname = "gc_elasticity",
2780 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2781 .maxlen = sizeof(int),
2782 .mode = 0644,
2783 .proc_handler = proc_dointvec,
2786 .procname = "mtu_expires",
2787 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2788 .maxlen = sizeof(int),
2789 .mode = 0644,
2790 .proc_handler = proc_dointvec_jiffies,
2793 .procname = "min_adv_mss",
2794 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2795 .maxlen = sizeof(int),
2796 .mode = 0644,
2797 .proc_handler = proc_dointvec,
2800 .procname = "gc_min_interval_ms",
2801 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2802 .maxlen = sizeof(int),
2803 .mode = 0644,
2804 .proc_handler = proc_dointvec_ms_jiffies,
2809 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2811 struct ctl_table *table;
2813 table = kmemdup(ipv6_route_table_template,
2814 sizeof(ipv6_route_table_template),
2815 GFP_KERNEL);
2817 if (table) {
2818 table[0].data = &net->ipv6.sysctl.flush_delay;
2819 table[0].extra1 = net;
2820 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2821 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2822 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2823 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2824 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2825 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2826 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2827 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2828 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2831 return table;
2833 #endif
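/*
 * Illustration (not part of this file): the table above is normally exposed
 * as net.ipv6.route.* (e.g. /proc/sys/net/ipv6/route/flush, .../gc_thresh);
 * the actual sysctl registration happens outside this file.  A standalone
 * userspace sketch that writes to the write-only "flush" entry, which goes
 * through ipv6_sysctl_rtcache_flush() and kicks fib6_run_gc().
 */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	/* "flush" is mode 0200, so this needs root privileges. */
	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* The written value is stored in net->ipv6.sysctl.flush_delay by
	 * proc_dointvec(); the write itself triggers a garbage-collection
	 * pass over the routing cache. */
	if (write(fd, "0\n", 2) < 0)
		perror("write");

	close(fd);
	return 0;
}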
2835 static int __net_init ip6_route_net_init(struct net *net)
2837 int ret = -ENOMEM;
2839 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2840 sizeof(net->ipv6.ip6_dst_ops));
2842 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2843 goto out_ip6_dst_ops;
2845 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2846 sizeof(*net->ipv6.ip6_null_entry),
2847 GFP_KERNEL);
2848 if (!net->ipv6.ip6_null_entry)
2849 goto out_ip6_dst_entries;
2850 net->ipv6.ip6_null_entry->dst.path =
2851 (struct dst_entry *)net->ipv6.ip6_null_entry;
2852 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2853 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2854 ip6_template_metrics, true);
2856 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2857 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2858 sizeof(*net->ipv6.ip6_prohibit_entry),
2859 GFP_KERNEL);
2860 if (!net->ipv6.ip6_prohibit_entry)
2861 goto out_ip6_null_entry;
2862 net->ipv6.ip6_prohibit_entry->dst.path =
2863 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2864 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2865 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2866 ip6_template_metrics, true);
2868 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2869 sizeof(*net->ipv6.ip6_blk_hole_entry),
2870 GFP_KERNEL);
2871 if (!net->ipv6.ip6_blk_hole_entry)
2872 goto out_ip6_prohibit_entry;
2873 net->ipv6.ip6_blk_hole_entry->dst.path =
2874 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2875 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2876 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2877 ip6_template_metrics, true);
2878 #endif
2880 net->ipv6.sysctl.flush_delay = 0;
2881 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2882 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2883 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2884 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2885 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2886 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2887 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2889 #ifdef CONFIG_PROC_FS
2890 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2891 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2892 #endif
2893 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2895 ret = 0;
2896 out:
2897 return ret;
2899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2900 out_ip6_prohibit_entry:
2901 kfree(net->ipv6.ip6_prohibit_entry);
2902 out_ip6_null_entry:
2903 kfree(net->ipv6.ip6_null_entry);
2904 #endif
2905 out_ip6_dst_entries:
2906 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2907 out_ip6_dst_ops:
2908 goto out;
2911 static void __net_exit ip6_route_net_exit(struct net *net)
2913 #ifdef CONFIG_PROC_FS
2914 proc_net_remove(net, "ipv6_route");
2915 proc_net_remove(net, "rt6_stats");
2916 #endif
2917 kfree(net->ipv6.ip6_null_entry);
2918 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2919 kfree(net->ipv6.ip6_prohibit_entry);
2920 kfree(net->ipv6.ip6_blk_hole_entry);
2921 #endif
2922 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2925 static struct pernet_operations ip6_route_net_ops = {
2926 .init = ip6_route_net_init,
2927 .exit = ip6_route_net_exit,
2930 static struct notifier_block ip6_route_dev_notifier = {
2931 .notifier_call = ip6_route_dev_notify,
2932 .priority = 0,
2935 int __init ip6_route_init(void)
2937 int ret;
2939 ret = -ENOMEM;
2940 ip6_dst_ops_template.kmem_cachep =
2941 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2942 SLAB_HWCACHE_ALIGN, NULL);
2943 if (!ip6_dst_ops_template.kmem_cachep)
2944 goto out;
2946 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2947 if (ret)
2948 goto out_kmem_cache;
2950 ret = register_pernet_subsys(&ip6_route_net_ops);
2951 if (ret)
2952 goto out_dst_entries;
2954 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2956 /* The loopback device is registered before this code runs, so the
2957 * loopback reference in rt6_info is not taken automatically; take it
2958 * manually for init_net */
2959 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2960 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2961 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2962 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2963 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2964 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2965 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2966 #endif
2967 ret = fib6_init();
2968 if (ret)
2969 goto out_register_subsys;
2971 ret = xfrm6_init();
2972 if (ret)
2973 goto out_fib6_init;
2975 ret = fib6_rules_init();
2976 if (ret)
2977 goto xfrm6_init;
2979 ret = -ENOBUFS;
2980 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2981 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2982 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2983 goto fib6_rules_init;
2985 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2986 if (ret)
2987 goto fib6_rules_init;
2989 out:
2990 return ret;
2992 fib6_rules_init:
2993 fib6_rules_cleanup();
2994 xfrm6_init:
2995 xfrm6_fini();
2996 out_fib6_init:
2997 fib6_gc_cleanup();
2998 out_register_subsys:
2999 unregister_pernet_subsys(&ip6_route_net_ops);
3000 out_dst_entries:
3001 dst_entries_destroy(&ip6_dst_blackhole_ops);
3002 out_kmem_cache:
3003 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3004 goto out;
3007 void ip6_route_cleanup(void)
3009 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3010 fib6_rules_cleanup();
3011 xfrm6_fini();
3012 fib6_gc_cleanup();
3013 unregister_pernet_subsys(&ip6_route_net_ops);
3014 dst_entries_destroy(&ip6_dst_blackhole_ops);
3015 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);