ipv6: don't use inetpeer to store metrics for routes.
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / ipv6 / route.c
blob1250f902067016d694ea59910778555ad68702bc
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
/*
 * RDBG() and RT6_TRACE() forward to printk only when RT6_DEBUG >= 3;
 * at lower levels both expand to nothing (RT6_TRACE to an empty
 * do/while so it is still usable as a statement).
 */
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
75 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
76 const struct in6_addr *dest);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops);
86 static int ip6_pkt_discard(struct sk_buff *skb);
87 static int ip6_pkt_discard_out(struct sk_buff *skb);
88 static void ip6_link_failure(struct sk_buff *skb);
89 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93 const struct in6_addr *prefix, int prefixlen,
94 const struct in6_addr *gwaddr, int ifindex,
95 unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97 const struct in6_addr *prefix, int prefixlen,
98 const struct in6_addr *gwaddr, int ifindex);
99 #endif
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 struct rt6_info *rt = (struct rt6_info *) dst;
104 struct inet_peer *peer;
105 u32 *p = NULL;
107 if (!(rt->dst.flags & DST_HOST))
108 return NULL;
110 if (!rt->rt6i_peer)
111 rt6_bind_peer(rt, 1);
113 peer = rt->rt6i_peer;
114 if (peer) {
115 u32 *old_p = __DST_METRICS_PTR(old);
116 unsigned long prev, new;
118 p = peer->metrics;
119 if (inet_metrics_new(peer))
120 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122 new = (unsigned long) p;
123 prev = cmpxchg(&dst->_metrics, old, new);
125 if (prev != old) {
126 p = __DST_METRICS_PTR(prev);
127 if (prev & DST_METRICS_READ_ONLY)
128 p = NULL;
131 return p;
134 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
139 static struct dst_ops ip6_dst_ops_template = {
140 .family = AF_INET6,
141 .protocol = cpu_to_be16(ETH_P_IPV6),
142 .gc = ip6_dst_gc,
143 .gc_thresh = 1024,
144 .check = ip6_dst_check,
145 .default_advmss = ip6_default_advmss,
146 .default_mtu = ip6_default_mtu,
147 .cow_metrics = ipv6_cow_metrics,
148 .destroy = ip6_dst_destroy,
149 .ifdown = ip6_dst_ifdown,
150 .negative_advice = ip6_negative_advice,
151 .link_failure = ip6_link_failure,
152 .update_pmtu = ip6_rt_update_pmtu,
153 .local_out = __ip6_local_out,
154 .neigh_lookup = ip6_neigh_lookup,
/* default_mtu hook for blackhole dsts: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	const unsigned int no_mtu = 0;

	return no_mtu;
}
162 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
167 unsigned long old)
169 return NULL;
172 static struct dst_ops ip6_dst_blackhole_ops = {
173 .family = AF_INET6,
174 .protocol = cpu_to_be16(ETH_P_IPV6),
175 .destroy = ip6_dst_destroy,
176 .check = ip6_dst_check,
177 .default_mtu = ip6_blackhole_default_mtu,
178 .default_advmss = ip6_default_advmss,
179 .update_pmtu = ip6_rt_blackhole_update_pmtu,
180 .cow_metrics = ip6_rt_blackhole_cow_metrics,
181 .neigh_lookup = ip6_neigh_lookup,
184 static const u32 ip6_template_metrics[RTAX_MAX] = {
185 [RTAX_HOPLIMIT - 1] = 255,
188 static struct rt6_info ip6_null_entry_template = {
189 .dst = {
190 .__refcnt = ATOMIC_INIT(1),
191 .__use = 1,
192 .obsolete = -1,
193 .error = -ENETUNREACH,
194 .input = ip6_pkt_discard,
195 .output = ip6_pkt_discard_out,
197 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
198 .rt6i_protocol = RTPROT_KERNEL,
199 .rt6i_metric = ~(u32) 0,
200 .rt6i_ref = ATOMIC_INIT(1),
203 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
205 static int ip6_pkt_prohibit(struct sk_buff *skb);
206 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
208 static struct rt6_info ip6_prohibit_entry_template = {
209 .dst = {
210 .__refcnt = ATOMIC_INIT(1),
211 .__use = 1,
212 .obsolete = -1,
213 .error = -EACCES,
214 .input = ip6_pkt_prohibit,
215 .output = ip6_pkt_prohibit_out,
217 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
218 .rt6i_protocol = RTPROT_KERNEL,
219 .rt6i_metric = ~(u32) 0,
220 .rt6i_ref = ATOMIC_INIT(1),
223 static struct rt6_info ip6_blk_hole_entry_template = {
224 .dst = {
225 .__refcnt = ATOMIC_INIT(1),
226 .__use = 1,
227 .obsolete = -1,
228 .error = -EINVAL,
229 .input = dst_discard,
230 .output = dst_discard,
232 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
233 .rt6i_protocol = RTPROT_KERNEL,
234 .rt6i_metric = ~(u32) 0,
235 .rt6i_ref = ATOMIC_INIT(1),
238 #endif
240 /* allocate dst with ip6_dst_ops */
241 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
242 struct net_device *dev,
243 int flags)
245 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
247 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
249 return rt;
252 static void ip6_dst_destroy(struct dst_entry *dst)
254 struct rt6_info *rt = (struct rt6_info *)dst;
255 struct inet6_dev *idev = rt->rt6i_idev;
256 struct inet_peer *peer = rt->rt6i_peer;
258 if (!(rt->dst.flags & DST_HOST))
259 dst_destroy_metrics_generic(dst);
261 if (idev != NULL) {
262 rt->rt6i_idev = NULL;
263 in6_dev_put(idev);
265 if (peer) {
266 rt->rt6i_peer = NULL;
267 inet_putpeer(peer);
271 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
273 static u32 rt6_peer_genid(void)
275 return atomic_read(&__rt6_peer_genid);
278 void rt6_bind_peer(struct rt6_info *rt, int create)
280 struct inet_peer *peer;
282 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
283 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
284 inet_putpeer(peer);
285 else
286 rt->rt6i_peer_genid = rt6_peer_genid();
289 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
290 int how)
292 struct rt6_info *rt = (struct rt6_info *)dst;
293 struct inet6_dev *idev = rt->rt6i_idev;
294 struct net_device *loopback_dev =
295 dev_net(dev)->loopback_dev;
297 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
298 struct inet6_dev *loopback_idev =
299 in6_dev_get(loopback_dev);
300 if (loopback_idev != NULL) {
301 rt->rt6i_idev = loopback_idev;
302 in6_dev_put(idev);
307 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
309 return (rt->rt6i_flags & RTF_EXPIRES) &&
310 time_after(jiffies, rt->rt6i_expires);
313 static inline int rt6_need_strict(const struct in6_addr *daddr)
315 return ipv6_addr_type(daddr) &
316 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
320 * Route lookup. Any table->tb6_lock is implied.
323 static inline struct rt6_info *rt6_device_match(struct net *net,
324 struct rt6_info *rt,
325 const struct in6_addr *saddr,
326 int oif,
327 int flags)
329 struct rt6_info *local = NULL;
330 struct rt6_info *sprt;
332 if (!oif && ipv6_addr_any(saddr))
333 goto out;
335 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
336 struct net_device *dev = sprt->rt6i_dev;
338 if (oif) {
339 if (dev->ifindex == oif)
340 return sprt;
341 if (dev->flags & IFF_LOOPBACK) {
342 if (sprt->rt6i_idev == NULL ||
343 sprt->rt6i_idev->dev->ifindex != oif) {
344 if (flags & RT6_LOOKUP_F_IFACE && oif)
345 continue;
346 if (local && (!oif ||
347 local->rt6i_idev->dev->ifindex == oif))
348 continue;
350 local = sprt;
352 } else {
353 if (ipv6_chk_addr(net, saddr, dev,
354 flags & RT6_LOOKUP_F_IFACE))
355 return sprt;
359 if (oif) {
360 if (local)
361 return local;
363 if (flags & RT6_LOOKUP_F_IFACE)
364 return net->ipv6.ip6_null_entry;
366 out:
367 return rt;
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the route's neighbour entry is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a neighbour solicitation to the solicited-node
 * multicast address of the neighbour.
 *
 * Per the original note, Router Reachability Probes MUST be rate-limited
 * to no more than one per minute; neigh->updated is bumped for that.
 * A NULL @rt is accepted and ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	int send_probe;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	send_probe = !(neigh->nud_state & NUD_VALID) &&
		     time_after(jiffies, neigh->updated +
				rt->rt6i_idev->cnf.rtr_probe_interval);
	if (send_probe)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (send_probe) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
411 * Default Router Selection (RFC 2461 6.3.6)
413 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
415 struct net_device *dev = rt->rt6i_dev;
416 if (!oif || dev->ifindex == oif)
417 return 2;
418 if ((dev->flags & IFF_LOOPBACK) &&
419 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
420 return 1;
421 return 0;
424 static inline int rt6_check_neigh(struct rt6_info *rt)
426 struct neighbour *neigh;
427 int m;
429 rcu_read_lock();
430 neigh = dst_get_neighbour(&rt->dst);
431 if (rt->rt6i_flags & RTF_NONEXTHOP ||
432 !(rt->rt6i_flags & RTF_GATEWAY))
433 m = 1;
434 else if (neigh) {
435 read_lock_bh(&neigh->lock);
436 if (neigh->nud_state & NUD_VALID)
437 m = 2;
438 #ifdef CONFIG_IPV6_ROUTER_PREF
439 else if (neigh->nud_state & NUD_FAILED)
440 m = 0;
441 #endif
442 else
443 m = 1;
444 read_unlock_bh(&neigh->lock);
445 } else
446 m = 0;
447 rcu_read_unlock();
448 return m;
451 static int rt6_score_route(struct rt6_info *rt, int oif,
452 int strict)
454 int m, n;
456 m = rt6_check_dev(rt, oif);
457 if (!m && (strict & RT6_LOOKUP_F_IFACE))
458 return -1;
459 #ifdef CONFIG_IPV6_ROUTER_PREF
460 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
461 #endif
462 n = rt6_check_neigh(rt);
463 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
464 return -1;
465 return m;
468 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
469 int *mpri, struct rt6_info *match)
471 int m;
473 if (rt6_check_expired(rt))
474 goto out;
476 m = rt6_score_route(rt, oif, strict);
477 if (m < 0)
478 goto out;
480 if (m > *mpri) {
481 if (strict & RT6_LOOKUP_F_REACHABLE)
482 rt6_probe(match);
483 *mpri = m;
484 match = rt;
485 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
486 rt6_probe(rt);
489 out:
490 return match;
493 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
494 struct rt6_info *rr_head,
495 u32 metric, int oif, int strict)
497 struct rt6_info *rt, *match;
498 int mpri = -1;
500 match = NULL;
501 for (rt = rr_head; rt && rt->rt6i_metric == metric;
502 rt = rt->dst.rt6_next)
503 match = find_match(rt, oif, strict, &mpri, match);
504 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
505 rt = rt->dst.rt6_next)
506 match = find_match(rt, oif, strict, &mpri, match);
508 return match;
511 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
513 struct rt6_info *match, *rt0;
514 struct net *net;
516 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
517 __func__, fn->leaf, oif);
519 rt0 = fn->rr_ptr;
520 if (!rt0)
521 fn->rr_ptr = rt0 = fn->leaf;
523 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
525 if (!match &&
526 (strict & RT6_LOOKUP_F_REACHABLE)) {
527 struct rt6_info *next = rt0->dst.rt6_next;
529 /* no entries matched; do round-robin */
530 if (!next || next->rt6i_metric != rt0->rt6i_metric)
531 next = fn->leaf;
533 if (next != rt0)
534 fn->rr_ptr = next;
537 RT6_TRACE("%s() => %p\n",
538 __func__, match);
540 net = dev_net(rt0->rt6i_dev);
541 return match ? match : net->ipv6.ip6_null_entry;
544 #ifdef CONFIG_IPV6_ROUTE_INFO
545 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
546 const struct in6_addr *gwaddr)
548 struct net *net = dev_net(dev);
549 struct route_info *rinfo = (struct route_info *) opt;
550 struct in6_addr prefix_buf, *prefix;
551 unsigned int pref;
552 unsigned long lifetime;
553 struct rt6_info *rt;
555 if (len < sizeof(struct route_info)) {
556 return -EINVAL;
559 /* Sanity check for prefix_len and length */
560 if (rinfo->length > 3) {
561 return -EINVAL;
562 } else if (rinfo->prefix_len > 128) {
563 return -EINVAL;
564 } else if (rinfo->prefix_len > 64) {
565 if (rinfo->length < 2) {
566 return -EINVAL;
568 } else if (rinfo->prefix_len > 0) {
569 if (rinfo->length < 1) {
570 return -EINVAL;
574 pref = rinfo->route_pref;
575 if (pref == ICMPV6_ROUTER_PREF_INVALID)
576 return -EINVAL;
578 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
580 if (rinfo->length == 3)
581 prefix = (struct in6_addr *)rinfo->prefix;
582 else {
583 /* this function is safe */
584 ipv6_addr_prefix(&prefix_buf,
585 (struct in6_addr *)rinfo->prefix,
586 rinfo->prefix_len);
587 prefix = &prefix_buf;
590 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
591 dev->ifindex);
593 if (rt && !lifetime) {
594 ip6_del_rt(rt);
595 rt = NULL;
598 if (!rt && lifetime)
599 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
600 pref);
601 else if (rt)
602 rt->rt6i_flags = RTF_ROUTEINFO |
603 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
605 if (rt) {
606 if (!addrconf_finite_timeout(lifetime)) {
607 rt->rt6i_flags &= ~RTF_EXPIRES;
608 } else {
609 rt->rt6i_expires = jiffies + HZ * lifetime;
610 rt->rt6i_flags |= RTF_EXPIRES;
612 dst_release(&rt->dst);
614 return 0;
616 #endif
/*
 * BACKTRACK(__net, saddr): when the current result `rt` is the null entry
 * of __net, walk from `fn` towards the tree root looking for another node
 * that carries route info (RTN_RTINFO).  If the parent has a subtree other
 * than `fn`, that subtree is searched with fib6_lookup() using saddr.
 *
 * The macro is not self-contained: it reads and writes the caller's local
 * variables `rt` and `fn`, jumps to the caller's `restart` label when a
 * candidate node is found, and to the caller's `out` label when the
 * top-level root (RTN_TL_ROOT) is reached.
 */
618 #define BACKTRACK(__net, saddr) \
619 do { \
620 if (rt == __net->ipv6.ip6_null_entry) { \
621 struct fib6_node *pn; \
622 while (1) { \
623 if (fn->fn_flags & RTN_TL_ROOT) \
624 goto out; \
625 pn = fn->parent; \
626 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
627 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
628 else \
629 fn = pn; \
630 if (fn->fn_flags & RTN_RTINFO) \
631 goto restart; \
634 } while(0)
636 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
637 struct fib6_table *table,
638 struct flowi6 *fl6, int flags)
640 struct fib6_node *fn;
641 struct rt6_info *rt;
643 read_lock_bh(&table->tb6_lock);
644 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
645 restart:
646 rt = fn->leaf;
647 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
648 BACKTRACK(net, &fl6->saddr);
649 out:
650 dst_use(&rt->dst, jiffies);
651 read_unlock_bh(&table->tb6_lock);
652 return rt;
656 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
657 const struct in6_addr *saddr, int oif, int strict)
659 struct flowi6 fl6 = {
660 .flowi6_oif = oif,
661 .daddr = *daddr,
663 struct dst_entry *dst;
664 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
666 if (saddr) {
667 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
668 flags |= RT6_LOOKUP_F_HAS_SADDR;
671 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
672 if (dst->error == 0)
673 return (struct rt6_info *) dst;
675 dst_release(dst);
677 return NULL;
680 EXPORT_SYMBOL(rt6_lookup);
682 /* ip6_ins_rt is called with FREE table->tb6_lock.
683 It takes new route entry, the addition fails by any reason the
684 route is freed. In any case, if caller does not hold it, it may
685 be destroyed.
688 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
690 int err;
691 struct fib6_table *table;
693 table = rt->rt6i_table;
694 write_lock_bh(&table->tb6_lock);
695 err = fib6_add(&table->tb6_root, rt, info);
696 write_unlock_bh(&table->tb6_lock);
698 return err;
701 int ip6_ins_rt(struct rt6_info *rt)
703 struct nl_info info = {
704 .nl_net = dev_net(rt->rt6i_dev),
706 return __ip6_ins_rt(rt, &info);
/*
 * Create an RTF_CACHE copy of @ort for destination @daddr and bind a
 * neighbour entry to it.
 *
 * For non-gateway routes the copy's gateway is set to @daddr, and the copy
 * is flagged RTF_ANYCAST when it is not a /128 and @daddr equals the
 * original route's destination address.  With CONFIG_IPV6_SUBTREES a
 * source-specific route is pinned to @saddr/128.
 *
 * Returns the new route, or NULL if the copy or the neighbour lookup
 * failed (the copy is freed in the latter case).
 */
709 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
710 const struct in6_addr *daddr,
711 const struct in6_addr *saddr)
713 struct rt6_info *rt;
716 * Clone the route.
719 rt = ip6_rt_copy(ort, daddr);
721 if (rt) {
722 struct neighbour *neigh;
/* One GC-and-retry attempt is allowed, but only outside softirq context. */
723 int attempts = !in_softirq();
725 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
726 if (rt->rt6i_dst.plen != 128 &&
727 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
728 rt->rt6i_flags |= RTF_ANYCAST;
729 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
732 rt->rt6i_flags |= RTF_CACHE;
734 #ifdef CONFIG_IPV6_SUBTREES
735 if (rt->rt6i_src.plen && saddr) {
736 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
737 rt->rt6i_src.plen = 128;
739 #endif
741 retry:
742 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
743 if (IS_ERR(neigh)) {
744 struct net *net = dev_net(rt->rt6i_dev);
/* Saved so the GC sysctls can be restored after the forced GC run below. */
745 int saved_rt_min_interval =
746 net->ipv6.sysctl.ip6_rt_gc_min_interval;
747 int saved_rt_elasticity =
748 net->ipv6.sysctl.ip6_rt_gc_elasticity;
750 if (attempts-- > 0) {
/*
 * Temporarily set elasticity to 1 and the minimum GC interval to 0,
 * run the route GC, restore the sysctls, then retry the neighbour lookup.
 */
751 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
752 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
754 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
756 net->ipv6.sysctl.ip6_rt_gc_elasticity =
757 saved_rt_elasticity;
758 net->ipv6.sysctl.ip6_rt_gc_min_interval =
759 saved_rt_min_interval;
760 goto retry;
/* No attempts left: log (rate-limited), free the copy and give up. */
763 if (net_ratelimit())
764 printk(KERN_WARNING
765 "ipv6: Neighbour table overflow.\n");
766 dst_free(&rt->dst);
767 return NULL;
769 dst_set_neighbour(&rt->dst, neigh);
773 return rt;
776 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
777 const struct in6_addr *daddr)
779 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
781 if (rt) {
782 rt->rt6i_flags |= RTF_CACHE;
783 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
785 return rt;
/*
 * Core policy-routing lookup shared by the input and output paths.
 *
 * Looks @fl6 up in @table, selects a route with rt6_select() and, unless
 * the result is the null entry or already an RTF_CACHE route, creates a
 * per-destination copy (rt6_alloc_cow() or rt6_alloc_clone()) and inserts
 * it with ip6_ins_rt().  The returned route carries a reference taken with
 * dst_hold() and has lastuse/__use updated.
 *
 * The first pass asks for a reachable router (RT6_LOOKUP_F_REACHABLE)
 * unless forwarding is enabled in devconf_all; if that pass ends at `out`,
 * the lookup is repeated once without the reachability requirement.
 */
788 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
789 struct flowi6 *fl6, int flags)
791 struct fib6_node *fn;
792 struct rt6_info *rt, *nrt;
793 int strict = 0;
/* Upper bound on relookups after a failed insert of the new copy. */
794 int attempts = 3;
795 int err;
796 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
798 strict |= flags & RT6_LOOKUP_F_IFACE;
800 relookup:
801 read_lock_bh(&table->tb6_lock);
803 restart_2:
804 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* `restart` and `out` are jump targets of BACKTRACK(), which also uses rt and fn. */
806 restart:
807 rt = rt6_select(fn, oif, strict | reachable);
809 BACKTRACK(net, &fl6->saddr);
810 if (rt == net->ipv6.ip6_null_entry ||
811 rt->rt6i_flags & RTF_CACHE)
812 goto out;
/* Pin the selected route, then drop the table lock before allocating. */
814 dst_hold(&rt->dst);
815 read_unlock_bh(&table->tb6_lock);
817 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
818 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
819 else if (!(rt->dst.flags & DST_HOST))
820 nrt = rt6_alloc_clone(rt, &fl6->daddr);
821 else
822 goto out2;
/* Swap the reference from the selected route to the copy (or the null entry). */
824 dst_release(&rt->dst);
825 rt = nrt ? : net->ipv6.ip6_null_entry;
827 dst_hold(&rt->dst);
828 if (nrt) {
829 err = ip6_ins_rt(nrt);
830 if (!err)
831 goto out2;
834 if (--attempts <= 0)
835 goto out2;
838 * Race condition! In the gap, when table->tb6_lock was
839 * released someone could insert this route. Relookup.
841 dst_release(&rt->dst);
842 goto relookup;
844 out:
/* Retry once without RT6_LOOKUP_F_REACHABLE; the table lock is still held here. */
845 if (reachable) {
846 reachable = 0;
847 goto restart_2;
849 dst_hold(&rt->dst);
850 read_unlock_bh(&table->tb6_lock);
851 out2:
852 rt->dst.lastuse = jiffies;
853 rt->dst.__use++;
855 return rt;
858 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
859 struct flowi6 *fl6, int flags)
861 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
864 void ip6_route_input(struct sk_buff *skb)
866 const struct ipv6hdr *iph = ipv6_hdr(skb);
867 struct net *net = dev_net(skb->dev);
868 int flags = RT6_LOOKUP_F_HAS_SADDR;
869 struct flowi6 fl6 = {
870 .flowi6_iif = skb->dev->ifindex,
871 .daddr = iph->daddr,
872 .saddr = iph->saddr,
873 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
874 .flowi6_mark = skb->mark,
875 .flowi6_proto = iph->nexthdr,
878 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
879 flags |= RT6_LOOKUP_F_IFACE;
881 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
884 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
885 struct flowi6 *fl6, int flags)
887 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
890 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
891 struct flowi6 *fl6)
893 int flags = 0;
895 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
896 flags |= RT6_LOOKUP_F_IFACE;
898 if (!ipv6_addr_any(&fl6->saddr))
899 flags |= RT6_LOOKUP_F_HAS_SADDR;
900 else if (sk)
901 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
903 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
906 EXPORT_SYMBOL(ip6_route_output);
908 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
910 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
911 struct dst_entry *new = NULL;
913 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
914 if (rt) {
915 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
917 new = &rt->dst;
919 new->__use = 1;
920 new->input = dst_discard;
921 new->output = dst_discard;
923 if (dst_metrics_read_only(&ort->dst))
924 new->_metrics = ort->dst._metrics;
925 else
926 dst_copy_metrics(new, &ort->dst);
927 rt->rt6i_idev = ort->rt6i_idev;
928 if (rt->rt6i_idev)
929 in6_dev_hold(rt->rt6i_idev);
930 rt->rt6i_expires = 0;
932 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
933 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
934 rt->rt6i_metric = 0;
936 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
937 #ifdef CONFIG_IPV6_SUBTREES
938 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
939 #endif
941 dst_free(new);
944 dst_release(dst_orig);
945 return new ? new : ERR_PTR(-ENOMEM);
949 * Destination cache support functions
952 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
954 struct rt6_info *rt;
956 rt = (struct rt6_info *) dst;
958 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
959 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
960 if (!rt->rt6i_peer)
961 rt6_bind_peer(rt, 0);
962 rt->rt6i_peer_genid = rt6_peer_genid();
964 return dst;
966 return NULL;
969 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
971 struct rt6_info *rt = (struct rt6_info *) dst;
973 if (rt) {
974 if (rt->rt6i_flags & RTF_CACHE) {
975 if (rt6_check_expired(rt)) {
976 ip6_del_rt(rt);
977 dst = NULL;
979 } else {
980 dst_release(dst);
981 dst = NULL;
984 return dst;
987 static void ip6_link_failure(struct sk_buff *skb)
989 struct rt6_info *rt;
991 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
993 rt = (struct rt6_info *) skb_dst(skb);
994 if (rt) {
995 if (rt->rt6i_flags&RTF_CACHE) {
996 dst_set_expires(&rt->dst, 0);
997 rt->rt6i_flags |= RTF_EXPIRES;
998 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
999 rt->rt6i_node->fn_sernum = -1;
1003 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1005 struct rt6_info *rt6 = (struct rt6_info*)dst;
1007 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1008 rt6->rt6i_flags |= RTF_MODIFIED;
1009 if (mtu < IPV6_MIN_MTU) {
1010 u32 features = dst_metric(dst, RTAX_FEATURES);
1011 mtu = IPV6_MIN_MTU;
1012 features |= RTAX_FEATURE_ALLFRAG;
1013 dst_metric_set(dst, RTAX_FEATURES, features);
1015 dst_metric_set(dst, RTAX_MTU, mtu);
1019 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1021 struct net_device *dev = dst->dev;
1022 unsigned int mtu = dst_mtu(dst);
1023 struct net *net = dev_net(dev);
1025 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1027 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1028 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1031 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1032 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1033 * IPV6_MAXPLEN is also valid and means: "any MSS,
1034 * rely only on pmtu discovery"
1036 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1037 mtu = IPV6_MAXPLEN;
1038 return mtu;
1041 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1043 unsigned int mtu = IPV6_MIN_MTU;
1044 struct inet6_dev *idev;
1046 rcu_read_lock();
1047 idev = __in6_dev_get(dst->dev);
1048 if (idev)
1049 mtu = idev->cnf.mtu6;
1050 rcu_read_unlock();
1052 return mtu;
1055 static struct dst_entry *icmp6_dst_gc_list;
1056 static DEFINE_SPINLOCK(icmp6_dst_lock);
1058 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1059 struct neighbour *neigh,
1060 const struct in6_addr *addr)
1062 struct rt6_info *rt;
1063 struct inet6_dev *idev = in6_dev_get(dev);
1064 struct net *net = dev_net(dev);
1066 if (unlikely(idev == NULL))
1067 return NULL;
1069 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1070 if (unlikely(rt == NULL)) {
1071 in6_dev_put(idev);
1072 goto out;
1075 if (neigh)
1076 neigh_hold(neigh);
1077 else {
1078 neigh = ndisc_get_neigh(dev, addr);
1079 if (IS_ERR(neigh))
1080 neigh = NULL;
1083 rt->dst.flags |= DST_HOST;
1084 rt->dst.output = ip6_output;
1085 dst_set_neighbour(&rt->dst, neigh);
1086 atomic_set(&rt->dst.__refcnt, 1);
1087 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1089 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1090 rt->rt6i_dst.plen = 128;
1091 rt->rt6i_idev = idev;
1093 spin_lock_bh(&icmp6_dst_lock);
1094 rt->dst.next = icmp6_dst_gc_list;
1095 icmp6_dst_gc_list = &rt->dst;
1096 spin_unlock_bh(&icmp6_dst_lock);
1098 fib6_force_start_gc(net);
1100 out:
1101 return &rt->dst;
1104 int icmp6_dst_gc(void)
1106 struct dst_entry *dst, **pprev;
1107 int more = 0;
1109 spin_lock_bh(&icmp6_dst_lock);
1110 pprev = &icmp6_dst_gc_list;
1112 while ((dst = *pprev) != NULL) {
1113 if (!atomic_read(&dst->__refcnt)) {
1114 *pprev = dst->next;
1115 dst_free(dst);
1116 } else {
1117 pprev = &dst->next;
1118 ++more;
1122 spin_unlock_bh(&icmp6_dst_lock);
1124 return more;
1127 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1128 void *arg)
1130 struct dst_entry *dst, **pprev;
1132 spin_lock_bh(&icmp6_dst_lock);
1133 pprev = &icmp6_dst_gc_list;
1134 while ((dst = *pprev) != NULL) {
1135 struct rt6_info *rt = (struct rt6_info *) dst;
1136 if (func(rt, arg)) {
1137 *pprev = dst->next;
1138 dst_free(dst);
1139 } else {
1140 pprev = &dst->next;
1143 spin_unlock_bh(&icmp6_dst_lock);
1146 static int ip6_dst_gc(struct dst_ops *ops)
1148 unsigned long now = jiffies;
1149 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1150 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1151 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1152 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1153 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1154 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1155 int entries;
1157 entries = dst_entries_get_fast(ops);
1158 if (time_after(rt_last_gc + rt_min_interval, now) &&
1159 entries <= rt_max_size)
1160 goto out;
1162 net->ipv6.ip6_rt_gc_expire++;
1163 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1164 net->ipv6.ip6_rt_last_gc = now;
1165 entries = dst_entries_get_slow(ops);
1166 if (entries < ops->gc_thresh)
1167 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1168 out:
1169 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1170 return entries > rt_max_size;
1173 /* Clean host part of a prefix. Not necessary in radix tree,
1174 but results in cleaner routing tables.
1176 Remove it only when all the things will work!
1179 int ip6_dst_hoplimit(struct dst_entry *dst)
1181 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1182 if (hoplimit == 0) {
1183 struct net_device *dev = dst->dev;
1184 struct inet6_dev *idev;
1186 rcu_read_lock();
1187 idev = __in6_dev_get(dev);
1188 if (idev)
1189 hoplimit = idev->cnf.hop_limit;
1190 else
1191 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1192 rcu_read_unlock();
1194 return hoplimit;
1196 EXPORT_SYMBOL(ip6_dst_hoplimit);
1202 int ip6_route_add(struct fib6_config *cfg)
1204 int err;
1205 struct net *net = cfg->fc_nlinfo.nl_net;
1206 struct rt6_info *rt = NULL;
1207 struct net_device *dev = NULL;
1208 struct inet6_dev *idev = NULL;
1209 struct fib6_table *table;
1210 int addr_type;
1212 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1213 return -EINVAL;
1214 #ifndef CONFIG_IPV6_SUBTREES
1215 if (cfg->fc_src_len)
1216 return -EINVAL;
1217 #endif
1218 if (cfg->fc_ifindex) {
1219 err = -ENODEV;
1220 dev = dev_get_by_index(net, cfg->fc_ifindex);
1221 if (!dev)
1222 goto out;
1223 idev = in6_dev_get(dev);
1224 if (!idev)
1225 goto out;
1228 if (cfg->fc_metric == 0)
1229 cfg->fc_metric = IP6_RT_PRIO_USER;
1231 table = fib6_new_table(net, cfg->fc_table);
1232 if (table == NULL) {
1233 err = -ENOBUFS;
1234 goto out;
1237 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1239 if (rt == NULL) {
1240 err = -ENOMEM;
1241 goto out;
1244 rt->dst.obsolete = -1;
1245 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1246 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1249 if (cfg->fc_protocol == RTPROT_UNSPEC)
1250 cfg->fc_protocol = RTPROT_BOOT;
1251 rt->rt6i_protocol = cfg->fc_protocol;
1253 addr_type = ipv6_addr_type(&cfg->fc_dst);
1255 if (addr_type & IPV6_ADDR_MULTICAST)
1256 rt->dst.input = ip6_mc_input;
1257 else if (cfg->fc_flags & RTF_LOCAL)
1258 rt->dst.input = ip6_input;
1259 else
1260 rt->dst.input = ip6_forward;
1262 rt->dst.output = ip6_output;
1264 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1265 rt->rt6i_dst.plen = cfg->fc_dst_len;
1266 if (rt->rt6i_dst.plen == 128)
1267 rt->dst.flags |= DST_HOST;
1269 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1270 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1271 if (!metrics) {
1272 err = -ENOMEM;
1273 goto out;
1275 dst_init_metrics(&rt->dst, metrics, 0);
1277 #ifdef CONFIG_IPV6_SUBTREES
1278 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1279 rt->rt6i_src.plen = cfg->fc_src_len;
1280 #endif
1282 rt->rt6i_metric = cfg->fc_metric;
1284 /* We cannot add true routes via loopback here,
1285 they would result in kernel looping; promote them to reject routes
1287 if ((cfg->fc_flags & RTF_REJECT) ||
1288 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1289 && !(cfg->fc_flags&RTF_LOCAL))) {
1290 /* hold loopback dev/idev if we haven't done so. */
1291 if (dev != net->loopback_dev) {
1292 if (dev) {
1293 dev_put(dev);
1294 in6_dev_put(idev);
1296 dev = net->loopback_dev;
1297 dev_hold(dev);
1298 idev = in6_dev_get(dev);
1299 if (!idev) {
1300 err = -ENODEV;
1301 goto out;
1304 rt->dst.output = ip6_pkt_discard_out;
1305 rt->dst.input = ip6_pkt_discard;
1306 rt->dst.error = -ENETUNREACH;
1307 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1308 goto install_route;
1311 if (cfg->fc_flags & RTF_GATEWAY) {
1312 const struct in6_addr *gw_addr;
1313 int gwa_type;
1315 gw_addr = &cfg->fc_gateway;
1316 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1317 gwa_type = ipv6_addr_type(gw_addr);
1319 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1320 struct rt6_info *grt;
1322 /* IPv6 strictly inhibits using not link-local
1323 addresses as nexthop address.
1324 Otherwise, router will not able to send redirects.
1325 It is very good, but in some (rare!) circumstances
1326 (SIT, PtP, NBMA NOARP links) it is handy to allow
1327 some exceptions. --ANK
1329 err = -EINVAL;
1330 if (!(gwa_type&IPV6_ADDR_UNICAST))
1331 goto out;
1333 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1335 err = -EHOSTUNREACH;
1336 if (grt == NULL)
1337 goto out;
1338 if (dev) {
1339 if (dev != grt->rt6i_dev) {
1340 dst_release(&grt->dst);
1341 goto out;
1343 } else {
1344 dev = grt->rt6i_dev;
1345 idev = grt->rt6i_idev;
1346 dev_hold(dev);
1347 in6_dev_hold(grt->rt6i_idev);
1349 if (!(grt->rt6i_flags&RTF_GATEWAY))
1350 err = 0;
1351 dst_release(&grt->dst);
1353 if (err)
1354 goto out;
1356 err = -EINVAL;
1357 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1358 goto out;
1361 err = -ENODEV;
1362 if (dev == NULL)
1363 goto out;
1365 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1366 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1367 err = -EINVAL;
1368 goto out;
1370 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1371 rt->rt6i_prefsrc.plen = 128;
1372 } else
1373 rt->rt6i_prefsrc.plen = 0;
1375 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1376 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1377 if (IS_ERR(n)) {
1378 err = PTR_ERR(n);
1379 goto out;
1381 dst_set_neighbour(&rt->dst, n);
1384 rt->rt6i_flags = cfg->fc_flags;
1386 install_route:
1387 if (cfg->fc_mx) {
1388 struct nlattr *nla;
1389 int remaining;
1391 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1392 int type = nla_type(nla);
1394 if (type) {
1395 if (type > RTAX_MAX) {
1396 err = -EINVAL;
1397 goto out;
1400 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1405 rt->dst.dev = dev;
1406 rt->rt6i_idev = idev;
1407 rt->rt6i_table = table;
1409 cfg->fc_nlinfo.nl_net = dev_net(dev);
1411 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1413 out:
1414 if (dev)
1415 dev_put(dev);
1416 if (idev)
1417 in6_dev_put(idev);
1418 if (rt)
1419 dst_free(&rt->dst);
1420 return err;
1423 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1425 int err;
1426 struct fib6_table *table;
1427 struct net *net = dev_net(rt->rt6i_dev);
1429 if (rt == net->ipv6.ip6_null_entry)
1430 return -ENOENT;
1432 table = rt->rt6i_table;
1433 write_lock_bh(&table->tb6_lock);
1435 err = fib6_del(rt, info);
1436 dst_release(&rt->dst);
1438 write_unlock_bh(&table->tb6_lock);
1440 return err;
1443 int ip6_del_rt(struct rt6_info *rt)
1445 struct nl_info info = {
1446 .nl_net = dev_net(rt->rt6i_dev),
1448 return __ip6_del_rt(rt, &info);
1451 static int ip6_route_del(struct fib6_config *cfg)
1453 struct fib6_table *table;
1454 struct fib6_node *fn;
1455 struct rt6_info *rt;
1456 int err = -ESRCH;
1458 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1459 if (table == NULL)
1460 return err;
1462 read_lock_bh(&table->tb6_lock);
1464 fn = fib6_locate(&table->tb6_root,
1465 &cfg->fc_dst, cfg->fc_dst_len,
1466 &cfg->fc_src, cfg->fc_src_len);
1468 if (fn) {
1469 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1470 if (cfg->fc_ifindex &&
1471 (rt->rt6i_dev == NULL ||
1472 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1473 continue;
1474 if (cfg->fc_flags & RTF_GATEWAY &&
1475 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1476 continue;
1477 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1478 continue;
1479 dst_hold(&rt->dst);
1480 read_unlock_bh(&table->tb6_lock);
1482 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1485 read_unlock_bh(&table->tb6_lock);
1487 return err;
/*
 *	Handle redirects
 */
1493 struct ip6rd_flowi {
1494 struct flowi6 fl6;
1495 struct in6_addr gateway;
1498 static struct rt6_info *__ip6_route_redirect(struct net *net,
1499 struct fib6_table *table,
1500 struct flowi6 *fl6,
1501 int flags)
1503 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1504 struct rt6_info *rt;
1505 struct fib6_node *fn;
1508 * Get the "current" route for this destination and
1509 * check if the redirect has come from approriate router.
1511 * RFC 2461 specifies that redirects should only be
1512 * accepted if they come from the nexthop to the target.
1513 * Due to the way the routes are chosen, this notion
1514 * is a bit fuzzy and one might need to check all possible
1515 * routes.
1518 read_lock_bh(&table->tb6_lock);
1519 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1520 restart:
1521 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1523 * Current route is on-link; redirect is always invalid.
1525 * Seems, previous statement is not true. It could
1526 * be node, which looks for us as on-link (f.e. proxy ndisc)
1527 * But then router serving it might decide, that we should
1528 * know truth 8)8) --ANK (980726).
1530 if (rt6_check_expired(rt))
1531 continue;
1532 if (!(rt->rt6i_flags & RTF_GATEWAY))
1533 continue;
1534 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1535 continue;
1536 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1537 continue;
1538 break;
1541 if (!rt)
1542 rt = net->ipv6.ip6_null_entry;
1543 BACKTRACK(net, &fl6->saddr);
1544 out:
1545 dst_hold(&rt->dst);
1547 read_unlock_bh(&table->tb6_lock);
1549 return rt;
1552 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1553 const struct in6_addr *src,
1554 const struct in6_addr *gateway,
1555 struct net_device *dev)
1557 int flags = RT6_LOOKUP_F_HAS_SADDR;
1558 struct net *net = dev_net(dev);
1559 struct ip6rd_flowi rdfl = {
1560 .fl6 = {
1561 .flowi6_oif = dev->ifindex,
1562 .daddr = *dest,
1563 .saddr = *src,
1567 ipv6_addr_copy(&rdfl.gateway, gateway);
1569 if (rt6_need_strict(dest))
1570 flags |= RT6_LOOKUP_F_IFACE;
1572 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1573 flags, __ip6_route_redirect);
1576 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1577 const struct in6_addr *saddr,
1578 struct neighbour *neigh, u8 *lladdr, int on_link)
1580 struct rt6_info *rt, *nrt = NULL;
1581 struct netevent_redirect netevent;
1582 struct net *net = dev_net(neigh->dev);
1584 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1586 if (rt == net->ipv6.ip6_null_entry) {
1587 if (net_ratelimit())
1588 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1589 "for redirect target\n");
1590 goto out;
1594 * We have finally decided to accept it.
1597 neigh_update(neigh, lladdr, NUD_STALE,
1598 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1599 NEIGH_UPDATE_F_OVERRIDE|
1600 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1601 NEIGH_UPDATE_F_ISROUTER))
1605 * Redirect received -> path was valid.
1606 * Look, redirects are sent only in response to data packets,
1607 * so that this nexthop apparently is reachable. --ANK
1609 dst_confirm(&rt->dst);
1611 /* Duplicate redirect: silently ignore. */
1612 if (neigh == dst_get_neighbour_raw(&rt->dst))
1613 goto out;
1615 nrt = ip6_rt_copy(rt, dest);
1616 if (nrt == NULL)
1617 goto out;
1619 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1620 if (on_link)
1621 nrt->rt6i_flags &= ~RTF_GATEWAY;
1623 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1624 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1626 if (ip6_ins_rt(nrt))
1627 goto out;
1629 netevent.old = &rt->dst;
1630 netevent.new = &nrt->dst;
1631 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1633 if (rt->rt6i_flags&RTF_CACHE) {
1634 ip6_del_rt(rt);
1635 return;
1638 out:
1639 dst_release(&rt->dst);
/*
 *	Handle ICMP "packet too big" messages
 *	i.e. Path MTU discovery
 */
1647 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1648 struct net *net, u32 pmtu, int ifindex)
1650 struct rt6_info *rt, *nrt;
1651 int allfrag = 0;
1652 again:
1653 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1654 if (rt == NULL)
1655 return;
1657 if (rt6_check_expired(rt)) {
1658 ip6_del_rt(rt);
1659 goto again;
1662 if (pmtu >= dst_mtu(&rt->dst))
1663 goto out;
1665 if (pmtu < IPV6_MIN_MTU) {
1667 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1668 * MTU (1280) and a fragment header should always be included
1669 * after a node receiving Too Big message reporting PMTU is
1670 * less than the IPv6 Minimum Link MTU.
1672 pmtu = IPV6_MIN_MTU;
1673 allfrag = 1;
1676 /* New mtu received -> path was valid.
1677 They are sent only in response to data packets,
1678 so that this nexthop apparently is reachable. --ANK
1680 dst_confirm(&rt->dst);
1682 /* Host route. If it is static, it would be better
1683 not to override it, but add new one, so that
1684 when cache entry will expire old pmtu
1685 would return automatically.
1687 if (rt->rt6i_flags & RTF_CACHE) {
1688 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1689 if (allfrag) {
1690 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1691 features |= RTAX_FEATURE_ALLFRAG;
1692 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1694 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1695 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1696 goto out;
1699 /* Network route.
1700 Two cases are possible:
1701 1. It is connected route. Action: COW
1702 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1704 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1705 nrt = rt6_alloc_cow(rt, daddr, saddr);
1706 else
1707 nrt = rt6_alloc_clone(rt, daddr);
1709 if (nrt) {
1710 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1711 if (allfrag) {
1712 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1713 features |= RTAX_FEATURE_ALLFRAG;
1714 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1717 /* According to RFC 1981, detecting PMTU increase shouldn't be
1718 * happened within 5 mins, the recommended timer is 10 mins.
1719 * Here this route expiration time is set to ip6_rt_mtu_expires
1720 * which is 10 mins. After 10 mins the decreased pmtu is expired
1721 * and detecting PMTU increase will be automatically happened.
1723 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1724 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1726 ip6_ins_rt(nrt);
1728 out:
1729 dst_release(&rt->dst);
1732 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1733 struct net_device *dev, u32 pmtu)
1735 struct net *net = dev_net(dev);
1738 * RFC 1981 states that a node "MUST reduce the size of the packets it
1739 * is sending along the path" that caused the Packet Too Big message.
1740 * Since it's not possible in the general case to determine which
1741 * interface was used to send the original packet, we update the MTU
1742 * on the interface that will be used to send future packets. We also
1743 * update the MTU on the interface that received the Packet Too Big in
1744 * case the original packet was forced out that interface with
1745 * SO_BINDTODEVICE or similar. This is the next best thing to the
1746 * correct behaviour, which would be to update the MTU on all
1747 * interfaces.
1749 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1750 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
/*
 *	Misc support functions
 */
1757 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1758 const struct in6_addr *dest)
1760 struct net *net = dev_net(ort->rt6i_dev);
1761 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1762 ort->dst.dev, 0);
1764 if (rt) {
1765 rt->dst.input = ort->dst.input;
1766 rt->dst.output = ort->dst.output;
1767 rt->dst.flags |= DST_HOST;
1769 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1770 rt->rt6i_dst.plen = 128;
1771 dst_copy_metrics(&rt->dst, &ort->dst);
1772 rt->dst.error = ort->dst.error;
1773 rt->rt6i_idev = ort->rt6i_idev;
1774 if (rt->rt6i_idev)
1775 in6_dev_hold(rt->rt6i_idev);
1776 rt->dst.lastuse = jiffies;
1777 rt->rt6i_expires = 0;
1779 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1780 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1781 rt->rt6i_metric = 0;
1783 #ifdef CONFIG_IPV6_SUBTREES
1784 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1785 #endif
1786 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1787 rt->rt6i_table = ort->rt6i_table;
1789 return rt;
1792 #ifdef CONFIG_IPV6_ROUTE_INFO
1793 static struct rt6_info *rt6_get_route_info(struct net *net,
1794 const struct in6_addr *prefix, int prefixlen,
1795 const struct in6_addr *gwaddr, int ifindex)
1797 struct fib6_node *fn;
1798 struct rt6_info *rt = NULL;
1799 struct fib6_table *table;
1801 table = fib6_get_table(net, RT6_TABLE_INFO);
1802 if (table == NULL)
1803 return NULL;
1805 write_lock_bh(&table->tb6_lock);
1806 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1807 if (!fn)
1808 goto out;
1810 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1811 if (rt->rt6i_dev->ifindex != ifindex)
1812 continue;
1813 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1814 continue;
1815 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1816 continue;
1817 dst_hold(&rt->dst);
1818 break;
1820 out:
1821 write_unlock_bh(&table->tb6_lock);
1822 return rt;
1825 static struct rt6_info *rt6_add_route_info(struct net *net,
1826 const struct in6_addr *prefix, int prefixlen,
1827 const struct in6_addr *gwaddr, int ifindex,
1828 unsigned pref)
1830 struct fib6_config cfg = {
1831 .fc_table = RT6_TABLE_INFO,
1832 .fc_metric = IP6_RT_PRIO_USER,
1833 .fc_ifindex = ifindex,
1834 .fc_dst_len = prefixlen,
1835 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1836 RTF_UP | RTF_PREF(pref),
1837 .fc_nlinfo.pid = 0,
1838 .fc_nlinfo.nlh = NULL,
1839 .fc_nlinfo.nl_net = net,
1842 ipv6_addr_copy(&cfg.fc_dst, prefix);
1843 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1845 /* We should treat it as a default route if prefix length is 0. */
1846 if (!prefixlen)
1847 cfg.fc_flags |= RTF_DEFAULT;
1849 ip6_route_add(&cfg);
1851 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1853 #endif
1855 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1857 struct rt6_info *rt;
1858 struct fib6_table *table;
1860 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1861 if (table == NULL)
1862 return NULL;
1864 write_lock_bh(&table->tb6_lock);
1865 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1866 if (dev == rt->rt6i_dev &&
1867 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1868 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1869 break;
1871 if (rt)
1872 dst_hold(&rt->dst);
1873 write_unlock_bh(&table->tb6_lock);
1874 return rt;
1877 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1878 struct net_device *dev,
1879 unsigned int pref)
1881 struct fib6_config cfg = {
1882 .fc_table = RT6_TABLE_DFLT,
1883 .fc_metric = IP6_RT_PRIO_USER,
1884 .fc_ifindex = dev->ifindex,
1885 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1886 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1887 .fc_nlinfo.pid = 0,
1888 .fc_nlinfo.nlh = NULL,
1889 .fc_nlinfo.nl_net = dev_net(dev),
1892 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1894 ip6_route_add(&cfg);
1896 return rt6_get_dflt_router(gwaddr, dev);
1899 void rt6_purge_dflt_routers(struct net *net)
1901 struct rt6_info *rt;
1902 struct fib6_table *table;
1904 /* NOTE: Keep consistent with rt6_get_dflt_router */
1905 table = fib6_get_table(net, RT6_TABLE_DFLT);
1906 if (table == NULL)
1907 return;
1909 restart:
1910 read_lock_bh(&table->tb6_lock);
1911 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1912 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1913 dst_hold(&rt->dst);
1914 read_unlock_bh(&table->tb6_lock);
1915 ip6_del_rt(rt);
1916 goto restart;
1919 read_unlock_bh(&table->tb6_lock);
1922 static void rtmsg_to_fib6_config(struct net *net,
1923 struct in6_rtmsg *rtmsg,
1924 struct fib6_config *cfg)
1926 memset(cfg, 0, sizeof(*cfg));
1928 cfg->fc_table = RT6_TABLE_MAIN;
1929 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1930 cfg->fc_metric = rtmsg->rtmsg_metric;
1931 cfg->fc_expires = rtmsg->rtmsg_info;
1932 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1933 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1934 cfg->fc_flags = rtmsg->rtmsg_flags;
1936 cfg->fc_nlinfo.nl_net = net;
1938 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1939 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1940 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1943 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1945 struct fib6_config cfg;
1946 struct in6_rtmsg rtmsg;
1947 int err;
1949 switch(cmd) {
1950 case SIOCADDRT: /* Add a route */
1951 case SIOCDELRT: /* Delete a route */
1952 if (!capable(CAP_NET_ADMIN))
1953 return -EPERM;
1954 err = copy_from_user(&rtmsg, arg,
1955 sizeof(struct in6_rtmsg));
1956 if (err)
1957 return -EFAULT;
1959 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1961 rtnl_lock();
1962 switch (cmd) {
1963 case SIOCADDRT:
1964 err = ip6_route_add(&cfg);
1965 break;
1966 case SIOCDELRT:
1967 err = ip6_route_del(&cfg);
1968 break;
1969 default:
1970 err = -EINVAL;
1972 rtnl_unlock();
1974 return err;
1977 return -EINVAL;
1981 * Drop the packet on the floor
1984 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1986 int type;
1987 struct dst_entry *dst = skb_dst(skb);
1988 switch (ipstats_mib_noroutes) {
1989 case IPSTATS_MIB_INNOROUTES:
1990 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1991 if (type == IPV6_ADDR_ANY) {
1992 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1993 IPSTATS_MIB_INADDRERRORS);
1994 break;
1996 /* FALLTHROUGH */
1997 case IPSTATS_MIB_OUTNOROUTES:
1998 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1999 ipstats_mib_noroutes);
2000 break;
2002 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2003 kfree_skb(skb);
2004 return 0;
2007 static int ip6_pkt_discard(struct sk_buff *skb)
2009 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2012 static int ip6_pkt_discard_out(struct sk_buff *skb)
2014 skb->dev = skb_dst(skb)->dev;
2015 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2018 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2020 static int ip6_pkt_prohibit(struct sk_buff *skb)
2022 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2025 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2027 skb->dev = skb_dst(skb)->dev;
2028 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2031 #endif
2034 * Allocate a dst for local (unicast / anycast) address.
2037 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2038 const struct in6_addr *addr,
2039 int anycast)
2041 struct net *net = dev_net(idev->dev);
2042 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2043 net->loopback_dev, 0);
2044 struct neighbour *neigh;
2046 if (rt == NULL) {
2047 if (net_ratelimit())
2048 pr_warning("IPv6: Maximum number of routes reached,"
2049 " consider increasing route/max_size.\n");
2050 return ERR_PTR(-ENOMEM);
2053 in6_dev_hold(idev);
2055 rt->dst.flags |= DST_HOST;
2056 rt->dst.input = ip6_input;
2057 rt->dst.output = ip6_output;
2058 rt->rt6i_idev = idev;
2059 rt->dst.obsolete = -1;
2061 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2062 if (anycast)
2063 rt->rt6i_flags |= RTF_ANYCAST;
2064 else
2065 rt->rt6i_flags |= RTF_LOCAL;
2066 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2067 if (IS_ERR(neigh)) {
2068 dst_free(&rt->dst);
2070 return ERR_CAST(neigh);
2072 dst_set_neighbour(&rt->dst, neigh);
2074 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2075 rt->rt6i_dst.plen = 128;
2076 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2078 atomic_set(&rt->dst.__refcnt, 1);
2080 return rt;
2083 int ip6_route_get_saddr(struct net *net,
2084 struct rt6_info *rt,
2085 const struct in6_addr *daddr,
2086 unsigned int prefs,
2087 struct in6_addr *saddr)
2089 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2090 int err = 0;
2091 if (rt->rt6i_prefsrc.plen)
2092 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2093 else
2094 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2095 daddr, prefs, saddr);
2096 return err;
/*
 * remove deleted ip from prefsrc entries
 *
 * Walker argument for fib6_remove_prefsrc().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter, NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address being removed */
};
2106 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2108 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2109 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2110 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2112 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2113 rt != net->ipv6.ip6_null_entry &&
2114 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2115 /* remove prefsrc entry */
2116 rt->rt6i_prefsrc.plen = 0;
2118 return 0;
2121 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2123 struct net *net = dev_net(ifp->idev->dev);
2124 struct arg_dev_net_ip adni = {
2125 .dev = ifp->idev->dev,
2126 .net = net,
2127 .addr = &ifp->addr,
2129 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down, NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
};
2137 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2139 const struct arg_dev_net *adn = arg;
2140 const struct net_device *dev = adn->dev;
2142 if ((rt->rt6i_dev == dev || dev == NULL) &&
2143 rt != adn->net->ipv6.ip6_null_entry) {
2144 RT6_TRACE("deleted by ifdown %p\n", rt);
2145 return -1;
2147 return 0;
2150 void rt6_ifdown(struct net *net, struct net_device *dev)
2152 struct arg_dev_net adn = {
2153 .dev = dev,
2154 .net = net,
2157 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2158 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* its new MTU */
};
2167 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2169 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2170 struct inet6_dev *idev;
2172 /* In IPv6 pmtu discovery is not optional,
2173 so that RTAX_MTU lock cannot disable it.
2174 We still use this lock to block changes
2175 caused by addrconf/ndisc.
2178 idev = __in6_dev_get(arg->dev);
2179 if (idev == NULL)
2180 return 0;
2182 /* For administrative MTU increase, there is no way to discover
2183 IPv6 PMTU increase, so PMTU increase should be updated here.
2184 Since RFC 1981 doesn't include administrative MTU increase
2185 update PMTU increase is a MUST. (i.e. jumbo frame)
2188 If new MTU is less than route PMTU, this new MTU will be the
2189 lowest MTU in the path, update the route PMTU to reflect PMTU
2190 decreases; if new MTU is greater than route PMTU, and the
2191 old MTU is the lowest MTU in the path, update the route PMTU
2192 to reflect the increase. In this case if the other nodes' MTU
2193 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2194 PMTU discouvery.
2196 if (rt->rt6i_dev == arg->dev &&
2197 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2198 (dst_mtu(&rt->dst) >= arg->mtu ||
2199 (dst_mtu(&rt->dst) < arg->mtu &&
2200 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2201 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2203 return 0;
2206 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2208 struct rt6_mtu_change_arg arg = {
2209 .dev = dev,
2210 .mtu = mtu,
2213 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2216 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2217 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2218 [RTA_OIF] = { .type = NLA_U32 },
2219 [RTA_IIF] = { .type = NLA_U32 },
2220 [RTA_PRIORITY] = { .type = NLA_U32 },
2221 [RTA_METRICS] = { .type = NLA_NESTED },
2224 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2225 struct fib6_config *cfg)
2227 struct rtmsg *rtm;
2228 struct nlattr *tb[RTA_MAX+1];
2229 int err;
2231 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2232 if (err < 0)
2233 goto errout;
2235 err = -EINVAL;
2236 rtm = nlmsg_data(nlh);
2237 memset(cfg, 0, sizeof(*cfg));
2239 cfg->fc_table = rtm->rtm_table;
2240 cfg->fc_dst_len = rtm->rtm_dst_len;
2241 cfg->fc_src_len = rtm->rtm_src_len;
2242 cfg->fc_flags = RTF_UP;
2243 cfg->fc_protocol = rtm->rtm_protocol;
2245 if (rtm->rtm_type == RTN_UNREACHABLE)
2246 cfg->fc_flags |= RTF_REJECT;
2248 if (rtm->rtm_type == RTN_LOCAL)
2249 cfg->fc_flags |= RTF_LOCAL;
2251 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2252 cfg->fc_nlinfo.nlh = nlh;
2253 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2255 if (tb[RTA_GATEWAY]) {
2256 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2257 cfg->fc_flags |= RTF_GATEWAY;
2260 if (tb[RTA_DST]) {
2261 int plen = (rtm->rtm_dst_len + 7) >> 3;
2263 if (nla_len(tb[RTA_DST]) < plen)
2264 goto errout;
2266 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2269 if (tb[RTA_SRC]) {
2270 int plen = (rtm->rtm_src_len + 7) >> 3;
2272 if (nla_len(tb[RTA_SRC]) < plen)
2273 goto errout;
2275 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2278 if (tb[RTA_PREFSRC])
2279 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2281 if (tb[RTA_OIF])
2282 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2284 if (tb[RTA_PRIORITY])
2285 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2287 if (tb[RTA_METRICS]) {
2288 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2289 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2292 if (tb[RTA_TABLE])
2293 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2295 err = 0;
2296 errout:
2297 return err;
2300 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2302 struct fib6_config cfg;
2303 int err;
2305 err = rtm_to_fib6_config(skb, nlh, &cfg);
2306 if (err < 0)
2307 return err;
2309 return ip6_route_del(&cfg);
2312 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2314 struct fib6_config cfg;
2315 int err;
2317 err = rtm_to_fib6_config(skb, nlh, &cfg);
2318 if (err < 0)
2319 return err;
2321 return ip6_route_add(&cfg);
2324 static inline size_t rt6_nlmsg_size(void)
2326 return NLMSG_ALIGN(sizeof(struct rtmsg))
2327 + nla_total_size(16) /* RTA_SRC */
2328 + nla_total_size(16) /* RTA_DST */
2329 + nla_total_size(16) /* RTA_GATEWAY */
2330 + nla_total_size(16) /* RTA_PREFSRC */
2331 + nla_total_size(4) /* RTA_TABLE */
2332 + nla_total_size(4) /* RTA_IIF */
2333 + nla_total_size(4) /* RTA_OIF */
2334 + nla_total_size(4) /* RTA_PRIORITY */
2335 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2336 + nla_total_size(sizeof(struct rta_cacheinfo));
2339 static int rt6_fill_node(struct net *net,
2340 struct sk_buff *skb, struct rt6_info *rt,
2341 struct in6_addr *dst, struct in6_addr *src,
2342 int iif, int type, u32 pid, u32 seq,
2343 int prefix, int nowait, unsigned int flags)
2345 struct rtmsg *rtm;
2346 struct nlmsghdr *nlh;
2347 long expires;
2348 u32 table;
2349 struct neighbour *n;
2351 if (prefix) { /* user wants prefix routes only */
2352 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2353 /* success since this is not a prefix route */
2354 return 1;
2358 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2359 if (nlh == NULL)
2360 return -EMSGSIZE;
2362 rtm = nlmsg_data(nlh);
2363 rtm->rtm_family = AF_INET6;
2364 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2365 rtm->rtm_src_len = rt->rt6i_src.plen;
2366 rtm->rtm_tos = 0;
2367 if (rt->rt6i_table)
2368 table = rt->rt6i_table->tb6_id;
2369 else
2370 table = RT6_TABLE_UNSPEC;
2371 rtm->rtm_table = table;
2372 NLA_PUT_U32(skb, RTA_TABLE, table);
2373 if (rt->rt6i_flags&RTF_REJECT)
2374 rtm->rtm_type = RTN_UNREACHABLE;
2375 else if (rt->rt6i_flags&RTF_LOCAL)
2376 rtm->rtm_type = RTN_LOCAL;
2377 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2378 rtm->rtm_type = RTN_LOCAL;
2379 else
2380 rtm->rtm_type = RTN_UNICAST;
2381 rtm->rtm_flags = 0;
2382 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2383 rtm->rtm_protocol = rt->rt6i_protocol;
2384 if (rt->rt6i_flags&RTF_DYNAMIC)
2385 rtm->rtm_protocol = RTPROT_REDIRECT;
2386 else if (rt->rt6i_flags & RTF_ADDRCONF)
2387 rtm->rtm_protocol = RTPROT_KERNEL;
2388 else if (rt->rt6i_flags&RTF_DEFAULT)
2389 rtm->rtm_protocol = RTPROT_RA;
2391 if (rt->rt6i_flags&RTF_CACHE)
2392 rtm->rtm_flags |= RTM_F_CLONED;
2394 if (dst) {
2395 NLA_PUT(skb, RTA_DST, 16, dst);
2396 rtm->rtm_dst_len = 128;
2397 } else if (rtm->rtm_dst_len)
2398 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2399 #ifdef CONFIG_IPV6_SUBTREES
2400 if (src) {
2401 NLA_PUT(skb, RTA_SRC, 16, src);
2402 rtm->rtm_src_len = 128;
2403 } else if (rtm->rtm_src_len)
2404 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2405 #endif
2406 if (iif) {
2407 #ifdef CONFIG_IPV6_MROUTE
2408 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2409 int err = ip6mr_get_route(net, skb, rtm, nowait);
2410 if (err <= 0) {
2411 if (!nowait) {
2412 if (err == 0)
2413 return 0;
2414 goto nla_put_failure;
2415 } else {
2416 if (err == -EMSGSIZE)
2417 goto nla_put_failure;
2420 } else
2421 #endif
2422 NLA_PUT_U32(skb, RTA_IIF, iif);
2423 } else if (dst) {
2424 struct in6_addr saddr_buf;
2425 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2426 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2429 if (rt->rt6i_prefsrc.plen) {
2430 struct in6_addr saddr_buf;
2431 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2432 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2435 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2436 goto nla_put_failure;
2438 rcu_read_lock();
2439 n = dst_get_neighbour(&rt->dst);
2440 if (n)
2441 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2442 rcu_read_unlock();
2444 if (rt->dst.dev)
2445 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2447 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2449 if (!(rt->rt6i_flags & RTF_EXPIRES))
2450 expires = 0;
2451 else if (rt->rt6i_expires - jiffies < INT_MAX)
2452 expires = rt->rt6i_expires - jiffies;
2453 else
2454 expires = INT_MAX;
2456 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2457 expires, rt->dst.error) < 0)
2458 goto nla_put_failure;
2460 return nlmsg_end(skb, nlh);
2462 nla_put_failure:
2463 nlmsg_cancel(skb, nlh);
2464 return -EMSGSIZE;
2467 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2469 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2470 int prefix;
2472 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2473 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2474 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2475 } else
2476 prefix = 0;
2478 return rt6_fill_node(arg->net,
2479 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2480 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2481 prefix, 0, NLM_F_MULTI);
2484 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2486 struct net *net = sock_net(in_skb->sk);
2487 struct nlattr *tb[RTA_MAX+1];
2488 struct rt6_info *rt;
2489 struct sk_buff *skb;
2490 struct rtmsg *rtm;
2491 struct flowi6 fl6;
2492 int err, iif = 0;
2494 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2495 if (err < 0)
2496 goto errout;
2498 err = -EINVAL;
2499 memset(&fl6, 0, sizeof(fl6));
2501 if (tb[RTA_SRC]) {
2502 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2503 goto errout;
2505 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2508 if (tb[RTA_DST]) {
2509 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2510 goto errout;
2512 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2515 if (tb[RTA_IIF])
2516 iif = nla_get_u32(tb[RTA_IIF]);
2518 if (tb[RTA_OIF])
2519 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2521 if (iif) {
2522 struct net_device *dev;
2523 dev = __dev_get_by_index(net, iif);
2524 if (!dev) {
2525 err = -ENODEV;
2526 goto errout;
2530 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2531 if (skb == NULL) {
2532 err = -ENOBUFS;
2533 goto errout;
2536 /* Reserve room for dummy headers, this skb can pass
2537 through good chunk of routing engine.
2539 skb_reset_mac_header(skb);
2540 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2542 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2543 skb_dst_set(skb, &rt->dst);
2545 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2546 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2547 nlh->nlmsg_seq, 0, 0, 0);
2548 if (err < 0) {
2549 kfree_skb(skb);
2550 goto errout;
2553 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2554 errout:
2555 return err;
2558 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2560 struct sk_buff *skb;
2561 struct net *net = info->nl_net;
2562 u32 seq;
2563 int err;
2565 err = -ENOBUFS;
2566 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2568 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2569 if (skb == NULL)
2570 goto errout;
2572 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2573 event, info->pid, seq, 0, 0, 0);
2574 if (err < 0) {
2575 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2576 WARN_ON(err == -EMSGSIZE);
2577 kfree_skb(skb);
2578 goto errout;
2580 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2581 info->nlh, gfp_any());
2582 return;
2583 errout:
2584 if (err < 0)
2585 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2588 static int ip6_route_dev_notify(struct notifier_block *this,
2589 unsigned long event, void *data)
2591 struct net_device *dev = (struct net_device *)data;
2592 struct net *net = dev_net(dev);
2594 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2595 net->ipv6.ip6_null_entry->dst.dev = dev;
2596 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2597 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2598 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2599 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2600 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2601 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2602 #endif
2605 return NOTIFY_OK;
/*
 *	/proc
 */
2612 #ifdef CONFIG_PROC_FS
/*
 * Cursor state for a buffer-based /proc dump.
 * NOTE(review): no user of this struct is visible in this part of the
 * file; the per-field meaning below is presumed from the names only.
 */
struct rt6_proc_arg {
	char *buffer;	/* presumably the output buffer */
	int offset;	/* presumably the requested start offset */
	int length;	/* presumably the requested length */
	int skip;	/* presumably bytes skipped so far */
	int len;	/* presumably bytes written so far */
};
2623 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2625 struct seq_file *m = p_arg;
2626 struct neighbour *n;
2628 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2630 #ifdef CONFIG_IPV6_SUBTREES
2631 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2632 #else
2633 seq_puts(m, "00000000000000000000000000000000 00 ");
2634 #endif
2635 rcu_read_lock();
2636 n = dst_get_neighbour(&rt->dst);
2637 if (n) {
2638 seq_printf(m, "%pi6", n->primary_key);
2639 } else {
2640 seq_puts(m, "00000000000000000000000000000000");
2642 rcu_read_unlock();
2643 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2644 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2645 rt->dst.__use, rt->rt6i_flags,
2646 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2647 return 0;
2650 static int ipv6_route_show(struct seq_file *m, void *v)
2652 struct net *net = (struct net *)m->private;
2653 fib6_clean_all(net, rt6_info_route, 0, m);
2654 return 0;
2657 static int ipv6_route_open(struct inode *inode, struct file *file)
2659 return single_open_net(inode, file, ipv6_route_show);
2662 static const struct file_operations ipv6_route_proc_fops = {
2663 .owner = THIS_MODULE,
2664 .open = ipv6_route_open,
2665 .read = seq_read,
2666 .llseek = seq_lseek,
2667 .release = single_release_net,
2670 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2672 struct net *net = (struct net *)seq->private;
2673 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2674 net->ipv6.rt6_stats->fib_nodes,
2675 net->ipv6.rt6_stats->fib_route_nodes,
2676 net->ipv6.rt6_stats->fib_rt_alloc,
2677 net->ipv6.rt6_stats->fib_rt_entries,
2678 net->ipv6.rt6_stats->fib_rt_cache,
2679 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2680 net->ipv6.rt6_stats->fib_discarded_routes);
2682 return 0;
2685 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2687 return single_open_net(inode, file, rt6_stats_seq_show);
2690 static const struct file_operations rt6_stats_seq_fops = {
2691 .owner = THIS_MODULE,
2692 .open = rt6_stats_seq_open,
2693 .read = seq_read,
2694 .llseek = seq_lseek,
2695 .release = single_release_net,
2697 #endif /* CONFIG_PROC_FS */
2699 #ifdef CONFIG_SYSCTL
2701 static
2702 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2703 void __user *buffer, size_t *lenp, loff_t *ppos)
2705 struct net *net;
2706 int delay;
2707 if (!write)
2708 return -EINVAL;
2710 net = (struct net *)ctl->extra1;
2711 delay = net->ipv6.sysctl.flush_delay;
2712 proc_dointvec(ctl, write, buffer, lenp, ppos);
2713 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2714 return 0;
2717 ctl_table ipv6_route_table_template[] = {
2719 .procname = "flush",
2720 .data = &init_net.ipv6.sysctl.flush_delay,
2721 .maxlen = sizeof(int),
2722 .mode = 0200,
2723 .proc_handler = ipv6_sysctl_rtcache_flush
2726 .procname = "gc_thresh",
2727 .data = &ip6_dst_ops_template.gc_thresh,
2728 .maxlen = sizeof(int),
2729 .mode = 0644,
2730 .proc_handler = proc_dointvec,
2733 .procname = "max_size",
2734 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2735 .maxlen = sizeof(int),
2736 .mode = 0644,
2737 .proc_handler = proc_dointvec,
2740 .procname = "gc_min_interval",
2741 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2742 .maxlen = sizeof(int),
2743 .mode = 0644,
2744 .proc_handler = proc_dointvec_jiffies,
2747 .procname = "gc_timeout",
2748 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2749 .maxlen = sizeof(int),
2750 .mode = 0644,
2751 .proc_handler = proc_dointvec_jiffies,
2754 .procname = "gc_interval",
2755 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2756 .maxlen = sizeof(int),
2757 .mode = 0644,
2758 .proc_handler = proc_dointvec_jiffies,
2761 .procname = "gc_elasticity",
2762 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2763 .maxlen = sizeof(int),
2764 .mode = 0644,
2765 .proc_handler = proc_dointvec,
2768 .procname = "mtu_expires",
2769 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2770 .maxlen = sizeof(int),
2771 .mode = 0644,
2772 .proc_handler = proc_dointvec_jiffies,
2775 .procname = "min_adv_mss",
2776 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2777 .maxlen = sizeof(int),
2778 .mode = 0644,
2779 .proc_handler = proc_dointvec,
2782 .procname = "gc_min_interval_ms",
2783 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2784 .maxlen = sizeof(int),
2785 .mode = 0644,
2786 .proc_handler = proc_dointvec_ms_jiffies,
2791 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2793 struct ctl_table *table;
2795 table = kmemdup(ipv6_route_table_template,
2796 sizeof(ipv6_route_table_template),
2797 GFP_KERNEL);
2799 if (table) {
2800 table[0].data = &net->ipv6.sysctl.flush_delay;
2801 table[0].extra1 = net;
2802 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2803 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2804 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2805 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2806 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2807 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2808 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2809 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2810 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2813 return table;
2815 #endif
2817 static int __net_init ip6_route_net_init(struct net *net)
2819 int ret = -ENOMEM;
2821 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2822 sizeof(net->ipv6.ip6_dst_ops));
2824 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2825 goto out_ip6_dst_ops;
2827 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2828 sizeof(*net->ipv6.ip6_null_entry),
2829 GFP_KERNEL);
2830 if (!net->ipv6.ip6_null_entry)
2831 goto out_ip6_dst_entries;
2832 net->ipv6.ip6_null_entry->dst.path =
2833 (struct dst_entry *)net->ipv6.ip6_null_entry;
2834 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2835 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2836 ip6_template_metrics, true);
2838 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2839 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2840 sizeof(*net->ipv6.ip6_prohibit_entry),
2841 GFP_KERNEL);
2842 if (!net->ipv6.ip6_prohibit_entry)
2843 goto out_ip6_null_entry;
2844 net->ipv6.ip6_prohibit_entry->dst.path =
2845 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2846 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2847 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2848 ip6_template_metrics, true);
2850 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2851 sizeof(*net->ipv6.ip6_blk_hole_entry),
2852 GFP_KERNEL);
2853 if (!net->ipv6.ip6_blk_hole_entry)
2854 goto out_ip6_prohibit_entry;
2855 net->ipv6.ip6_blk_hole_entry->dst.path =
2856 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2857 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2858 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2859 ip6_template_metrics, true);
2860 #endif
2862 net->ipv6.sysctl.flush_delay = 0;
2863 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2864 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2865 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2866 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2867 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2868 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2869 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2871 #ifdef CONFIG_PROC_FS
2872 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2873 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2874 #endif
2875 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2877 ret = 0;
2878 out:
2879 return ret;
2881 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2882 out_ip6_prohibit_entry:
2883 kfree(net->ipv6.ip6_prohibit_entry);
2884 out_ip6_null_entry:
2885 kfree(net->ipv6.ip6_null_entry);
2886 #endif
2887 out_ip6_dst_entries:
2888 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2889 out_ip6_dst_ops:
2890 goto out;
2893 static void __net_exit ip6_route_net_exit(struct net *net)
2895 #ifdef CONFIG_PROC_FS
2896 proc_net_remove(net, "ipv6_route");
2897 proc_net_remove(net, "rt6_stats");
2898 #endif
2899 kfree(net->ipv6.ip6_null_entry);
2900 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2901 kfree(net->ipv6.ip6_prohibit_entry);
2902 kfree(net->ipv6.ip6_blk_hole_entry);
2903 #endif
2904 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2907 static struct pernet_operations ip6_route_net_ops = {
2908 .init = ip6_route_net_init,
2909 .exit = ip6_route_net_exit,
2912 static struct notifier_block ip6_route_dev_notifier = {
2913 .notifier_call = ip6_route_dev_notify,
2914 .priority = 0,
2917 int __init ip6_route_init(void)
2919 int ret;
2921 ret = -ENOMEM;
2922 ip6_dst_ops_template.kmem_cachep =
2923 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2924 SLAB_HWCACHE_ALIGN, NULL);
2925 if (!ip6_dst_ops_template.kmem_cachep)
2926 goto out;
2928 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2929 if (ret)
2930 goto out_kmem_cache;
2932 ret = register_pernet_subsys(&ip6_route_net_ops);
2933 if (ret)
2934 goto out_dst_entries;
2936 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2938 /* Registering of the loopback is done before this portion of code,
2939 * the loopback reference in rt6_info will not be taken, do it
2940 * manually for init_net */
2941 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2942 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2943 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2944 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2945 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2946 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2947 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2948 #endif
2949 ret = fib6_init();
2950 if (ret)
2951 goto out_register_subsys;
2953 ret = xfrm6_init();
2954 if (ret)
2955 goto out_fib6_init;
2957 ret = fib6_rules_init();
2958 if (ret)
2959 goto xfrm6_init;
2961 ret = -ENOBUFS;
2962 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2963 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2964 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2965 goto fib6_rules_init;
2967 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2968 if (ret)
2969 goto fib6_rules_init;
2971 out:
2972 return ret;
2974 fib6_rules_init:
2975 fib6_rules_cleanup();
2976 xfrm6_init:
2977 xfrm6_fini();
2978 out_fib6_init:
2979 fib6_gc_cleanup();
2980 out_register_subsys:
2981 unregister_pernet_subsys(&ip6_route_net_ops);
2982 out_dst_entries:
2983 dst_entries_destroy(&ip6_dst_blackhole_ops);
2984 out_kmem_cache:
2985 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2986 goto out;
2989 void ip6_route_cleanup(void)
2991 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2992 fib6_rules_cleanup();
2993 xfrm6_fini();
2994 fib6_gc_cleanup();
2995 unregister_pernet_subsys(&ip6_route_net_ops);
2996 dst_entries_destroy(&ip6_dst_blackhole_ops);
2997 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);