net/ipv6/route.c
blob 57b82dc1ae91c8426b0894b5e4e6030453c66352
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
75 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
76 const struct in6_addr *dest);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops);
86 static int ip6_pkt_discard(struct sk_buff *skb);
87 static int ip6_pkt_discard_out(struct sk_buff *skb);
88 static void ip6_link_failure(struct sk_buff *skb);
89 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93 const struct in6_addr *prefix, int prefixlen,
94 const struct in6_addr *gwaddr, int ifindex,
95 unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97 const struct in6_addr *prefix, int prefixlen,
98 const struct in6_addr *gwaddr, int ifindex);
99 #endif
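/*
 * Copy-on-write metrics for host routes: DST_HOST routes keep their
 * writable metrics in the inet_peer entry for the destination.  On the
 * first write the read-only template metrics are copied into the peer's
 * array and swapped into dst->_metrics with cmpxchg(); if another CPU
 * won the race we reuse its pointer (or return NULL if it is read-only).
 */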
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 struct rt6_info *rt = (struct rt6_info *) dst;
104 struct inet_peer *peer;
105 u32 *p = NULL;
107 if (!(rt->dst.flags & DST_HOST))
108 return NULL;
110 if (!rt->rt6i_peer)
111 rt6_bind_peer(rt, 1);
113 peer = rt->rt6i_peer;
114 if (peer) {
115 u32 *old_p = __DST_METRICS_PTR(old);
116 unsigned long prev, new;
118 p = peer->metrics;
119 if (inet_metrics_new(peer))
120 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122 new = (unsigned long) p;
123 prev = cmpxchg(&dst->_metrics, old, new);
125 if (prev != old) {
126 p = __DST_METRICS_PTR(prev);
127 if (prev & DST_METRICS_READ_ONLY)
128 p = NULL;
131 return p;
134 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
139 static struct dst_ops ip6_dst_ops_template = {
140 .family = AF_INET6,
141 .protocol = cpu_to_be16(ETH_P_IPV6),
142 .gc = ip6_dst_gc,
143 .gc_thresh = 1024,
144 .check = ip6_dst_check,
145 .default_advmss = ip6_default_advmss,
146 .default_mtu = ip6_default_mtu,
147 .cow_metrics = ipv6_cow_metrics,
148 .destroy = ip6_dst_destroy,
149 .ifdown = ip6_dst_ifdown,
150 .negative_advice = ip6_negative_advice,
151 .link_failure = ip6_link_failure,
152 .update_pmtu = ip6_rt_update_pmtu,
153 .local_out = __ip6_local_out,
154 .neigh_lookup = ip6_neigh_lookup,
157 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
159 return 0;
162 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
167 unsigned long old)
169 return NULL;
172 static struct dst_ops ip6_dst_blackhole_ops = {
173 .family = AF_INET6,
174 .protocol = cpu_to_be16(ETH_P_IPV6),
175 .destroy = ip6_dst_destroy,
176 .check = ip6_dst_check,
177 .default_mtu = ip6_blackhole_default_mtu,
178 .default_advmss = ip6_default_advmss,
179 .update_pmtu = ip6_rt_blackhole_update_pmtu,
180 .cow_metrics = ip6_rt_blackhole_cow_metrics,
181 .neigh_lookup = ip6_neigh_lookup,
184 static const u32 ip6_template_metrics[RTAX_MAX] = {
185 [RTAX_HOPLIMIT - 1] = 255,
188 static struct rt6_info ip6_null_entry_template = {
189 .dst = {
190 .__refcnt = ATOMIC_INIT(1),
191 .__use = 1,
192 .obsolete = -1,
193 .error = -ENETUNREACH,
194 .input = ip6_pkt_discard,
195 .output = ip6_pkt_discard_out,
197 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
198 .rt6i_protocol = RTPROT_KERNEL,
199 .rt6i_metric = ~(u32) 0,
200 .rt6i_ref = ATOMIC_INIT(1),
203 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
205 static int ip6_pkt_prohibit(struct sk_buff *skb);
206 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
208 static struct rt6_info ip6_prohibit_entry_template = {
209 .dst = {
210 .__refcnt = ATOMIC_INIT(1),
211 .__use = 1,
212 .obsolete = -1,
213 .error = -EACCES,
214 .input = ip6_pkt_prohibit,
215 .output = ip6_pkt_prohibit_out,
217 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
218 .rt6i_protocol = RTPROT_KERNEL,
219 .rt6i_metric = ~(u32) 0,
220 .rt6i_ref = ATOMIC_INIT(1),
223 static struct rt6_info ip6_blk_hole_entry_template = {
224 .dst = {
225 .__refcnt = ATOMIC_INIT(1),
226 .__use = 1,
227 .obsolete = -1,
228 .error = -EINVAL,
229 .input = dst_discard,
230 .output = dst_discard,
232 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
233 .rt6i_protocol = RTPROT_KERNEL,
234 .rt6i_metric = ~(u32) 0,
235 .rt6i_ref = ATOMIC_INIT(1),
238 #endif
240 /* allocate dst with ip6_dst_ops */
241 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
242 struct net_device *dev,
243 int flags)
245 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
247 if (rt != NULL)
248 memset(&rt->rt6i_table, 0,
249 sizeof(*rt) - sizeof(struct dst_entry));
251 return rt;
254 static void ip6_dst_destroy(struct dst_entry *dst)
256 struct rt6_info *rt = (struct rt6_info *)dst;
257 struct inet6_dev *idev = rt->rt6i_idev;
258 struct inet_peer *peer = rt->rt6i_peer;
260 if (!(rt->dst.flags & DST_HOST))
261 dst_destroy_metrics_generic(dst);
263 if (idev != NULL) {
264 rt->rt6i_idev = NULL;
265 in6_dev_put(idev);
267 if (peer) {
268 rt->rt6i_peer = NULL;
269 inet_putpeer(peer);
273 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
275 static u32 rt6_peer_genid(void)
277 return atomic_read(&__rt6_peer_genid);
280 void rt6_bind_peer(struct rt6_info *rt, int create)
282 struct inet_peer *peer;
284 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
285 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
286 inet_putpeer(peer);
287 else
288 rt->rt6i_peer_genid = rt6_peer_genid();
291 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
292 int how)
294 struct rt6_info *rt = (struct rt6_info *)dst;
295 struct inet6_dev *idev = rt->rt6i_idev;
296 struct net_device *loopback_dev =
297 dev_net(dev)->loopback_dev;
299 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
300 struct inet6_dev *loopback_idev =
301 in6_dev_get(loopback_dev);
302 if (loopback_idev != NULL) {
303 rt->rt6i_idev = loopback_idev;
304 in6_dev_put(idev);
309 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
311 return (rt->rt6i_flags & RTF_EXPIRES) &&
312 time_after(jiffies, rt->rt6i_expires);
315 static inline int rt6_need_strict(const struct in6_addr *daddr)
317 return ipv6_addr_type(daddr) &
318 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 * Route lookup. Any table->tb6_lock is implied.
325 static inline struct rt6_info *rt6_device_match(struct net *net,
326 struct rt6_info *rt,
327 const struct in6_addr *saddr,
328 int oif,
329 int flags)
331 struct rt6_info *local = NULL;
332 struct rt6_info *sprt;
334 if (!oif && ipv6_addr_any(saddr))
335 goto out;
337 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
338 struct net_device *dev = sprt->rt6i_dev;
340 if (oif) {
341 if (dev->ifindex == oif)
342 return sprt;
343 if (dev->flags & IFF_LOOPBACK) {
344 if (sprt->rt6i_idev == NULL ||
345 sprt->rt6i_idev->dev->ifindex != oif) {
346 if (flags & RT6_LOOKUP_F_IFACE && oif)
347 continue;
348 if (local && (!oif ||
349 local->rt6i_idev->dev->ifindex == oif))
350 continue;
352 local = sprt;
354 } else {
355 if (ipv6_chk_addr(net, saddr, dev,
356 flags & RT6_LOOKUP_F_IFACE))
357 return sprt;
361 if (oif) {
362 if (local)
363 return local;
365 if (flags & RT6_LOOKUP_F_IFACE)
366 return net->ipv6.ip6_null_entry;
368 out:
369 return rt;
372 #ifdef CONFIG_IPV6_ROUTER_PREF
373 static void rt6_probe(struct rt6_info *rt)
375 struct neighbour *neigh;
377 * Okay, this does not seem to be appropriate
378 * for now, however, we need to check if it
379 * is really so; aka Router Reachability Probing.
381 * Router Reachability Probe MUST be rate-limited
382 * to no more than one per minute.
384 rcu_read_lock();
385 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
386 if (!neigh || (neigh->nud_state & NUD_VALID))
387 goto out;
388 read_lock_bh(&neigh->lock);
389 if (!(neigh->nud_state & NUD_VALID) &&
390 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
391 struct in6_addr mcaddr;
392 struct in6_addr *target;
394 neigh->updated = jiffies;
395 read_unlock_bh(&neigh->lock);
397 target = (struct in6_addr *)&neigh->primary_key;
398 addrconf_addr_solict_mult(target, &mcaddr);
399 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
400 } else {
401 read_unlock_bh(&neigh->lock);
403 out:
404 rcu_read_unlock();
406 #else
407 static inline void rt6_probe(struct rt6_info *rt)
410 #endif
413 * Default Router Selection (RFC 2461 6.3.6)
415 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
417 struct net_device *dev = rt->rt6i_dev;
418 if (!oif || dev->ifindex == oif)
419 return 2;
420 if ((dev->flags & IFF_LOOPBACK) &&
421 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
422 return 1;
423 return 0;
426 static inline int rt6_check_neigh(struct rt6_info *rt)
428 struct neighbour *neigh;
429 int m;
431 rcu_read_lock();
432 neigh = dst_get_neighbour(&rt->dst);
433 if (rt->rt6i_flags & RTF_NONEXTHOP ||
434 !(rt->rt6i_flags & RTF_GATEWAY))
435 m = 1;
436 else if (neigh) {
437 read_lock_bh(&neigh->lock);
438 if (neigh->nud_state & NUD_VALID)
439 m = 2;
440 #ifdef CONFIG_IPV6_ROUTER_PREF
441 else if (neigh->nud_state & NUD_FAILED)
442 m = 0;
443 #endif
444 else
445 m = 1;
446 read_unlock_bh(&neigh->lock);
447 } else
448 m = 0;
449 rcu_read_unlock();
450 return m;
453 static int rt6_score_route(struct rt6_info *rt, int oif,
454 int strict)
456 int m, n;
458 m = rt6_check_dev(rt, oif);
459 if (!m && (strict & RT6_LOOKUP_F_IFACE))
460 return -1;
461 #ifdef CONFIG_IPV6_ROUTER_PREF
462 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
463 #endif
464 n = rt6_check_neigh(rt);
465 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
466 return -1;
467 return m;
470 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
471 int *mpri, struct rt6_info *match)
473 int m;
475 if (rt6_check_expired(rt))
476 goto out;
478 m = rt6_score_route(rt, oif, strict);
479 if (m < 0)
480 goto out;
482 if (m > *mpri) {
483 if (strict & RT6_LOOKUP_F_REACHABLE)
484 rt6_probe(match);
485 *mpri = m;
486 match = rt;
487 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
488 rt6_probe(rt);
491 out:
492 return match;
495 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
496 struct rt6_info *rr_head,
497 u32 metric, int oif, int strict)
499 struct rt6_info *rt, *match;
500 int mpri = -1;
502 match = NULL;
503 for (rt = rr_head; rt && rt->rt6i_metric == metric;
504 rt = rt->dst.rt6_next)
505 match = find_match(rt, oif, strict, &mpri, match);
506 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
510 return match;
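/*
 * rt6_select() implements default router selection: score every route in
 * the node that shares rr_head's metric via find_rr_leaf()/rt6_score_route(),
 * preferring matching interfaces and (probably) reachable next hops.  If
 * nothing matched while RT6_LOOKUP_F_REACHABLE was set, advance fn->rr_ptr
 * so later lookups round-robin the equal-metric candidates; fall back to
 * ip6_null_entry when there is no usable route at all.
 */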
513 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
515 struct rt6_info *match, *rt0;
516 struct net *net;
518 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
519 __func__, fn->leaf, oif);
521 rt0 = fn->rr_ptr;
522 if (!rt0)
523 fn->rr_ptr = rt0 = fn->leaf;
525 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
527 if (!match &&
528 (strict & RT6_LOOKUP_F_REACHABLE)) {
529 struct rt6_info *next = rt0->dst.rt6_next;
531 /* no entries matched; do round-robin */
532 if (!next || next->rt6i_metric != rt0->rt6i_metric)
533 next = fn->leaf;
535 if (next != rt0)
536 fn->rr_ptr = next;
539 RT6_TRACE("%s() => %p\n",
540 __func__, match);
542 net = dev_net(rt0->rt6i_dev);
543 return match ? match : net->ipv6.ip6_null_entry;
546 #ifdef CONFIG_IPV6_ROUTE_INFO
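/*
 * rt6_route_rcv() handles a Route Information option received in a Router
 * Advertisement: it validates the option length against the prefix length,
 * rejects invalid preferences, and then adds, refreshes or deletes the
 * corresponding RTF_ROUTEINFO route according to the advertised lifetime.
 */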
547 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
548 const struct in6_addr *gwaddr)
550 struct net *net = dev_net(dev);
551 struct route_info *rinfo = (struct route_info *) opt;
552 struct in6_addr prefix_buf, *prefix;
553 unsigned int pref;
554 unsigned long lifetime;
555 struct rt6_info *rt;
557 if (len < sizeof(struct route_info)) {
558 return -EINVAL;
561 /* Sanity check for prefix_len and length */
562 if (rinfo->length > 3) {
563 return -EINVAL;
564 } else if (rinfo->prefix_len > 128) {
565 return -EINVAL;
566 } else if (rinfo->prefix_len > 64) {
567 if (rinfo->length < 2) {
568 return -EINVAL;
570 } else if (rinfo->prefix_len > 0) {
571 if (rinfo->length < 1) {
572 return -EINVAL;
576 pref = rinfo->route_pref;
577 if (pref == ICMPV6_ROUTER_PREF_INVALID)
578 return -EINVAL;
580 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
582 if (rinfo->length == 3)
583 prefix = (struct in6_addr *)rinfo->prefix;
584 else {
585 /* this function is safe */
586 ipv6_addr_prefix(&prefix_buf,
587 (struct in6_addr *)rinfo->prefix,
588 rinfo->prefix_len);
589 prefix = &prefix_buf;
592 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
593 dev->ifindex);
595 if (rt && !lifetime) {
596 ip6_del_rt(rt);
597 rt = NULL;
600 if (!rt && lifetime)
601 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
602 pref);
603 else if (rt)
604 rt->rt6i_flags = RTF_ROUTEINFO |
605 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
607 if (rt) {
608 if (!addrconf_finite_timeout(lifetime)) {
609 rt->rt6i_flags &= ~RTF_EXPIRES;
610 } else {
611 rt->rt6i_expires = jiffies + HZ * lifetime;
612 rt->rt6i_flags |= RTF_EXPIRES;
614 dst_release(&rt->dst);
616 return 0;
618 #endif
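/*
 * BACKTRACK(): if the lookup ended up on ip6_null_entry, walk back up the
 * fib6 tree - descending into a parent's source-routing subtree when one
 * exists - and jump to the caller's "restart" label as soon as a node
 * carrying route information (RTN_RTINFO) is found; give up at the root.
 */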
620 #define BACKTRACK(__net, saddr) \
621 do { \
622 if (rt == __net->ipv6.ip6_null_entry) { \
623 struct fib6_node *pn; \
624 while (1) { \
625 if (fn->fn_flags & RTN_TL_ROOT) \
626 goto out; \
627 pn = fn->parent; \
628 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
629 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
630 else \
631 fn = pn; \
632 if (fn->fn_flags & RTN_RTINFO) \
633 goto restart; \
636 } while(0)
638 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
639 struct fib6_table *table,
640 struct flowi6 *fl6, int flags)
642 struct fib6_node *fn;
643 struct rt6_info *rt;
645 read_lock_bh(&table->tb6_lock);
646 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
647 restart:
648 rt = fn->leaf;
649 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
650 BACKTRACK(net, &fl6->saddr);
651 out:
652 dst_use(&rt->dst, jiffies);
653 read_unlock_bh(&table->tb6_lock);
654 return rt;
658 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
659 const struct in6_addr *saddr, int oif, int strict)
661 struct flowi6 fl6 = {
662 .flowi6_oif = oif,
663 .daddr = *daddr,
665 struct dst_entry *dst;
666 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
668 if (saddr) {
669 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
670 flags |= RT6_LOOKUP_F_HAS_SADDR;
673 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
674 if (dst->error == 0)
675 return (struct rt6_info *) dst;
677 dst_release(dst);
679 return NULL;
682 EXPORT_SYMBOL(rt6_lookup);
684 /* ip6_ins_rt is called with FREE table->tb6_lock.
685 It takes a new route entry; if the addition fails for any reason,
686 the route is freed. In any case, if the caller does not hold a
687 reference, it may be destroyed.
690 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
692 int err;
693 struct fib6_table *table;
695 table = rt->rt6i_table;
696 write_lock_bh(&table->tb6_lock);
697 err = fib6_add(&table->tb6_root, rt, info);
698 write_unlock_bh(&table->tb6_lock);
700 return err;
703 int ip6_ins_rt(struct rt6_info *rt)
705 struct nl_info info = {
706 .nl_net = dev_net(rt->rt6i_dev),
708 return __ip6_ins_rt(rt, &info);
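/*
 * rt6_alloc_cow() clones 'ort' into a host (/128) RTF_CACHE entry for
 * 'daddr'; for non-gateway routes the destination itself becomes the
 * gateway (anycast copies also get RTF_ANYCAST).  A neighbour entry is
 * then resolved; on neighbour table overflow the GC sysctls are briefly
 * relaxed and the lookup retried once (only outside softirq context)
 * before the clone is dropped.
 */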
711 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
712 const struct in6_addr *daddr,
713 const struct in6_addr *saddr)
715 struct rt6_info *rt;
718 * Clone the route.
721 rt = ip6_rt_copy(ort, daddr);
723 if (rt) {
724 struct neighbour *neigh;
725 int attempts = !in_softirq();
727 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
728 if (rt->rt6i_dst.plen != 128 &&
729 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
730 rt->rt6i_flags |= RTF_ANYCAST;
731 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
734 rt->rt6i_flags |= RTF_CACHE;
736 #ifdef CONFIG_IPV6_SUBTREES
737 if (rt->rt6i_src.plen && saddr) {
738 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
739 rt->rt6i_src.plen = 128;
741 #endif
743 retry:
744 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
745 if (IS_ERR(neigh)) {
746 struct net *net = dev_net(rt->rt6i_dev);
747 int saved_rt_min_interval =
748 net->ipv6.sysctl.ip6_rt_gc_min_interval;
749 int saved_rt_elasticity =
750 net->ipv6.sysctl.ip6_rt_gc_elasticity;
752 if (attempts-- > 0) {
753 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
754 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
756 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
758 net->ipv6.sysctl.ip6_rt_gc_elasticity =
759 saved_rt_elasticity;
760 net->ipv6.sysctl.ip6_rt_gc_min_interval =
761 saved_rt_min_interval;
762 goto retry;
765 if (net_ratelimit())
766 printk(KERN_WARNING
767 "ipv6: Neighbour table overflow.\n");
768 dst_free(&rt->dst);
769 return NULL;
771 dst_set_neighbour(&rt->dst, neigh);
775 return rt;
778 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
779 const struct in6_addr *daddr)
781 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
783 if (rt) {
784 rt->rt6i_flags |= RTF_CACHE;
785 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
787 return rt;
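/*
 * ip6_pol_route() is the core lookup for both input and output paths:
 * pick a route under tb6_lock with rt6_select(); unless it is already an
 * RTF_CACHE entry (or the null route), drop the lock and create a
 * per-destination copy - rt6_alloc_cow() when a neighbour still has to be
 * bound, rt6_alloc_clone() for non-host routes - then insert it, retrying
 * a few times if another CPU raced us.  When forwarding is disabled the
 * first pass demands reachable next hops and is repeated without
 * RT6_LOOKUP_F_REACHABLE if nothing qualified.
 */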
790 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
791 struct flowi6 *fl6, int flags)
793 struct fib6_node *fn;
794 struct rt6_info *rt, *nrt;
795 int strict = 0;
796 int attempts = 3;
797 int err;
798 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
800 strict |= flags & RT6_LOOKUP_F_IFACE;
802 relookup:
803 read_lock_bh(&table->tb6_lock);
805 restart_2:
806 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
808 restart:
809 rt = rt6_select(fn, oif, strict | reachable);
811 BACKTRACK(net, &fl6->saddr);
812 if (rt == net->ipv6.ip6_null_entry ||
813 rt->rt6i_flags & RTF_CACHE)
814 goto out;
816 dst_hold(&rt->dst);
817 read_unlock_bh(&table->tb6_lock);
819 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
820 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
821 else if (!(rt->dst.flags & DST_HOST))
822 nrt = rt6_alloc_clone(rt, &fl6->daddr);
823 else
824 goto out2;
826 dst_release(&rt->dst);
827 rt = nrt ? : net->ipv6.ip6_null_entry;
829 dst_hold(&rt->dst);
830 if (nrt) {
831 err = ip6_ins_rt(nrt);
832 if (!err)
833 goto out2;
836 if (--attempts <= 0)
837 goto out2;
840 * Race condition! In the gap, when table->tb6_lock was
841 * released someone could insert this route. Relookup.
843 dst_release(&rt->dst);
844 goto relookup;
846 out:
847 if (reachable) {
848 reachable = 0;
849 goto restart_2;
851 dst_hold(&rt->dst);
852 read_unlock_bh(&table->tb6_lock);
853 out2:
854 rt->dst.lastuse = jiffies;
855 rt->dst.__use++;
857 return rt;
860 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
861 struct flowi6 *fl6, int flags)
863 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
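/*
 * ip6_route_input() attaches a route to an incoming packet: it builds a
 * flowi6 from the IPv6 header (addresses, flow label, mark, next header)
 * and stores the policy-lookup result with skb_dst_set().  Strict
 * interface matching is requested for link-local and multicast
 * destinations unless the packet arrived on a PIM register device.
 */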
866 void ip6_route_input(struct sk_buff *skb)
868 const struct ipv6hdr *iph = ipv6_hdr(skb);
869 struct net *net = dev_net(skb->dev);
870 int flags = RT6_LOOKUP_F_HAS_SADDR;
871 struct flowi6 fl6 = {
872 .flowi6_iif = skb->dev->ifindex,
873 .daddr = iph->daddr,
874 .saddr = iph->saddr,
875 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
876 .flowi6_mark = skb->mark,
877 .flowi6_proto = iph->nexthdr,
880 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
881 flags |= RT6_LOOKUP_F_IFACE;
883 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
886 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
887 struct flowi6 *fl6, int flags)
889 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
892 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
893 struct flowi6 *fl6)
895 int flags = 0;
897 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
898 flags |= RT6_LOOKUP_F_IFACE;
900 if (!ipv6_addr_any(&fl6->saddr))
901 flags |= RT6_LOOKUP_F_HAS_SADDR;
902 else if (sk)
903 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
905 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
908 EXPORT_SYMBOL(ip6_route_output);
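/*
 * ip6_blackhole_route() makes a standalone copy of dst_orig backed by
 * ip6_dst_blackhole_ops: input and output are both dst_discard, while
 * metrics, idev, gateway and flags are copied from the original.  The
 * copy never enters the FIB, and the reference on dst_orig is released
 * whether or not the allocation succeeded.
 */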
910 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
912 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
913 struct dst_entry *new = NULL;
915 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
916 if (rt) {
917 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
919 new = &rt->dst;
921 new->__use = 1;
922 new->input = dst_discard;
923 new->output = dst_discard;
925 if (dst_metrics_read_only(&ort->dst))
926 new->_metrics = ort->dst._metrics;
927 else
928 dst_copy_metrics(new, &ort->dst);
929 rt->rt6i_idev = ort->rt6i_idev;
930 if (rt->rt6i_idev)
931 in6_dev_hold(rt->rt6i_idev);
932 rt->rt6i_expires = 0;
934 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
935 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
936 rt->rt6i_metric = 0;
938 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
939 #ifdef CONFIG_IPV6_SUBTREES
940 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
941 #endif
943 dst_free(new);
946 dst_release(dst_orig);
947 return new ? new : ERR_PTR(-ENOMEM);
951 * Destination cache support functions
954 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
956 struct rt6_info *rt;
958 rt = (struct rt6_info *) dst;
960 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
961 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
962 if (!rt->rt6i_peer)
963 rt6_bind_peer(rt, 0);
964 rt->rt6i_peer_genid = rt6_peer_genid();
966 return dst;
968 return NULL;
971 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
973 struct rt6_info *rt = (struct rt6_info *) dst;
975 if (rt) {
976 if (rt->rt6i_flags & RTF_CACHE) {
977 if (rt6_check_expired(rt)) {
978 ip6_del_rt(rt);
979 dst = NULL;
981 } else {
982 dst_release(dst);
983 dst = NULL;
986 return dst;
989 static void ip6_link_failure(struct sk_buff *skb)
991 struct rt6_info *rt;
993 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
995 rt = (struct rt6_info *) skb_dst(skb);
996 if (rt) {
997 if (rt->rt6i_flags&RTF_CACHE) {
998 dst_set_expires(&rt->dst, 0);
999 rt->rt6i_flags |= RTF_EXPIRES;
1000 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1001 rt->rt6i_node->fn_sernum = -1;
1005 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1007 struct rt6_info *rt6 = (struct rt6_info*)dst;
1009 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1010 rt6->rt6i_flags |= RTF_MODIFIED;
1011 if (mtu < IPV6_MIN_MTU) {
1012 u32 features = dst_metric(dst, RTAX_FEATURES);
1013 mtu = IPV6_MIN_MTU;
1014 features |= RTAX_FEATURE_ALLFRAG;
1015 dst_metric_set(dst, RTAX_FEATURES, features);
1017 dst_metric_set(dst, RTAX_MTU, mtu);
1021 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1023 struct net_device *dev = dst->dev;
1024 unsigned int mtu = dst_mtu(dst);
1025 struct net *net = dev_net(dev);
1027 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1029 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1030 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1033 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1034 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1035 * IPV6_MAXPLEN is also valid and means: "any MSS,
1036 * rely only on pmtu discovery"
1038 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1039 mtu = IPV6_MAXPLEN;
1040 return mtu;
1043 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1045 unsigned int mtu = IPV6_MIN_MTU;
1046 struct inet6_dev *idev;
1048 rcu_read_lock();
1049 idev = __in6_dev_get(dst->dev);
1050 if (idev)
1051 mtu = idev->cnf.mtu6;
1052 rcu_read_unlock();
1054 return mtu;
1057 static struct dst_entry *icmp6_dst_gc_list;
1058 static DEFINE_SPINLOCK(icmp6_dst_lock);
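/*
 * dst entries for ICMPv6/ndisc output are allocated outside the FIB:
 * icmp6_dst_alloc() builds a host route to 'addr' bound to the given (or
 * freshly looked-up) neighbour and chains it on icmp6_dst_gc_list, from
 * which icmp6_dst_gc() frees entries once their refcount drops to zero.
 */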
1060 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1061 struct neighbour *neigh,
1062 const struct in6_addr *addr)
1064 struct rt6_info *rt;
1065 struct inet6_dev *idev = in6_dev_get(dev);
1066 struct net *net = dev_net(dev);
1068 if (unlikely(idev == NULL))
1069 return NULL;
1071 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1072 if (unlikely(rt == NULL)) {
1073 in6_dev_put(idev);
1074 goto out;
1077 if (neigh)
1078 neigh_hold(neigh);
1079 else {
1080 neigh = ndisc_get_neigh(dev, addr);
1081 if (IS_ERR(neigh))
1082 neigh = NULL;
1085 rt->dst.flags |= DST_HOST;
1086 rt->dst.output = ip6_output;
1087 dst_set_neighbour(&rt->dst, neigh);
1088 atomic_set(&rt->dst.__refcnt, 1);
1089 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1090 rt->rt6i_dst.plen = 128;
1091 rt->rt6i_idev = idev;
1092 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1094 spin_lock_bh(&icmp6_dst_lock);
1095 rt->dst.next = icmp6_dst_gc_list;
1096 icmp6_dst_gc_list = &rt->dst;
1097 spin_unlock_bh(&icmp6_dst_lock);
1099 fib6_force_start_gc(net);
1101 out:
1102 return &rt->dst;
1105 int icmp6_dst_gc(void)
1107 struct dst_entry *dst, **pprev;
1108 int more = 0;
1110 spin_lock_bh(&icmp6_dst_lock);
1111 pprev = &icmp6_dst_gc_list;
1113 while ((dst = *pprev) != NULL) {
1114 if (!atomic_read(&dst->__refcnt)) {
1115 *pprev = dst->next;
1116 dst_free(dst);
1117 } else {
1118 pprev = &dst->next;
1119 ++more;
1123 spin_unlock_bh(&icmp6_dst_lock);
1125 return more;
1128 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1129 void *arg)
1131 struct dst_entry *dst, **pprev;
1133 spin_lock_bh(&icmp6_dst_lock);
1134 pprev = &icmp6_dst_gc_list;
1135 while ((dst = *pprev) != NULL) {
1136 struct rt6_info *rt = (struct rt6_info *) dst;
1137 if (func(rt, arg)) {
1138 *pprev = dst->next;
1139 dst_free(dst);
1140 } else {
1141 pprev = &dst->next;
1144 spin_unlock_bh(&icmp6_dst_lock);
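/*
 * ip6_dst_gc() is driven by the ip6_rt_* sysctls: the expensive
 * fib6_run_gc() pass is skipped while the entry count stays at or below
 * ip6_rt_max_size and the minimum interval has not elapsed; the 'expire'
 * argument grows on each run and is decayed by the elasticity factor.
 * The return value reports whether the table is still over rt_max_size.
 */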
1147 static int ip6_dst_gc(struct dst_ops *ops)
1149 unsigned long now = jiffies;
1150 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1151 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1152 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1153 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1154 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1155 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1156 int entries;
1158 entries = dst_entries_get_fast(ops);
1159 if (time_after(rt_last_gc + rt_min_interval, now) &&
1160 entries <= rt_max_size)
1161 goto out;
1163 net->ipv6.ip6_rt_gc_expire++;
1164 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1165 net->ipv6.ip6_rt_last_gc = now;
1166 entries = dst_entries_get_slow(ops);
1167 if (entries < ops->gc_thresh)
1168 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1169 out:
1170 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1171 return entries > rt_max_size;
1174 /* Clean host part of a prefix. Not necessary in radix tree,
1175 but results in cleaner routing tables.
1177 Remove it only when all the things will work!
1180 int ip6_dst_hoplimit(struct dst_entry *dst)
1182 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1183 if (hoplimit == 0) {
1184 struct net_device *dev = dst->dev;
1185 struct inet6_dev *idev;
1187 rcu_read_lock();
1188 idev = __in6_dev_get(dev);
1189 if (idev)
1190 hoplimit = idev->cnf.hop_limit;
1191 else
1192 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1193 rcu_read_unlock();
1195 return hoplimit;
1197 EXPORT_SYMBOL(ip6_dst_hoplimit);
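/*
 * ip6_route_add() installs a route described by a fib6_config: it
 * validates the prefix lengths, resolves the output device and - for
 * RTF_GATEWAY routes - the next hop, promotes routes via loopback to
 * reject routes, applies any netlink metrics, binds a neighbour for
 * gateway/NONEXTHOP routes and finally inserts the new rt6_info into the
 * selected table with __ip6_ins_rt().
 */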
1203 int ip6_route_add(struct fib6_config *cfg)
1205 int err;
1206 struct net *net = cfg->fc_nlinfo.nl_net;
1207 struct rt6_info *rt = NULL;
1208 struct net_device *dev = NULL;
1209 struct inet6_dev *idev = NULL;
1210 struct fib6_table *table;
1211 int addr_type;
1213 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1214 return -EINVAL;
1215 #ifndef CONFIG_IPV6_SUBTREES
1216 if (cfg->fc_src_len)
1217 return -EINVAL;
1218 #endif
1219 if (cfg->fc_ifindex) {
1220 err = -ENODEV;
1221 dev = dev_get_by_index(net, cfg->fc_ifindex);
1222 if (!dev)
1223 goto out;
1224 idev = in6_dev_get(dev);
1225 if (!idev)
1226 goto out;
1229 if (cfg->fc_metric == 0)
1230 cfg->fc_metric = IP6_RT_PRIO_USER;
1232 table = fib6_new_table(net, cfg->fc_table);
1233 if (table == NULL) {
1234 err = -ENOBUFS;
1235 goto out;
1238 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1240 if (rt == NULL) {
1241 err = -ENOMEM;
1242 goto out;
1245 rt->dst.obsolete = -1;
1246 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1247 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1250 if (cfg->fc_protocol == RTPROT_UNSPEC)
1251 cfg->fc_protocol = RTPROT_BOOT;
1252 rt->rt6i_protocol = cfg->fc_protocol;
1254 addr_type = ipv6_addr_type(&cfg->fc_dst);
1256 if (addr_type & IPV6_ADDR_MULTICAST)
1257 rt->dst.input = ip6_mc_input;
1258 else if (cfg->fc_flags & RTF_LOCAL)
1259 rt->dst.input = ip6_input;
1260 else
1261 rt->dst.input = ip6_forward;
1263 rt->dst.output = ip6_output;
1265 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1266 rt->rt6i_dst.plen = cfg->fc_dst_len;
1267 if (rt->rt6i_dst.plen == 128)
1268 rt->dst.flags |= DST_HOST;
1270 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1271 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1272 if (!metrics) {
1273 err = -ENOMEM;
1274 goto out;
1276 dst_init_metrics(&rt->dst, metrics, 0);
1278 #ifdef CONFIG_IPV6_SUBTREES
1279 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1280 rt->rt6i_src.plen = cfg->fc_src_len;
1281 #endif
1283 rt->rt6i_metric = cfg->fc_metric;
1285 /* We cannot add true routes via loopback here,
1286 they would result in kernel looping; promote them to reject routes
1288 if ((cfg->fc_flags & RTF_REJECT) ||
1289 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1290 && !(cfg->fc_flags&RTF_LOCAL))) {
1291 /* hold loopback dev/idev if we haven't done so. */
1292 if (dev != net->loopback_dev) {
1293 if (dev) {
1294 dev_put(dev);
1295 in6_dev_put(idev);
1297 dev = net->loopback_dev;
1298 dev_hold(dev);
1299 idev = in6_dev_get(dev);
1300 if (!idev) {
1301 err = -ENODEV;
1302 goto out;
1305 rt->dst.output = ip6_pkt_discard_out;
1306 rt->dst.input = ip6_pkt_discard;
1307 rt->dst.error = -ENETUNREACH;
1308 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1309 goto install_route;
1312 if (cfg->fc_flags & RTF_GATEWAY) {
1313 const struct in6_addr *gw_addr;
1314 int gwa_type;
1316 gw_addr = &cfg->fc_gateway;
1317 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1318 gwa_type = ipv6_addr_type(gw_addr);
1320 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1321 struct rt6_info *grt;
1323 /* IPv6 strictly inhibits using not link-local
1324 addresses as nexthop address.
1325 Otherwise, the router will not be able to send redirects.
1326 It is very good, but in some (rare!) circumstances
1327 (SIT, PtP, NBMA NOARP links) it is handy to allow
1328 some exceptions. --ANK
1330 err = -EINVAL;
1331 if (!(gwa_type&IPV6_ADDR_UNICAST))
1332 goto out;
1334 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1336 err = -EHOSTUNREACH;
1337 if (grt == NULL)
1338 goto out;
1339 if (dev) {
1340 if (dev != grt->rt6i_dev) {
1341 dst_release(&grt->dst);
1342 goto out;
1344 } else {
1345 dev = grt->rt6i_dev;
1346 idev = grt->rt6i_idev;
1347 dev_hold(dev);
1348 in6_dev_hold(grt->rt6i_idev);
1350 if (!(grt->rt6i_flags&RTF_GATEWAY))
1351 err = 0;
1352 dst_release(&grt->dst);
1354 if (err)
1355 goto out;
1357 err = -EINVAL;
1358 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1359 goto out;
1362 err = -ENODEV;
1363 if (dev == NULL)
1364 goto out;
1366 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1367 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1368 err = -EINVAL;
1369 goto out;
1371 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1372 rt->rt6i_prefsrc.plen = 128;
1373 } else
1374 rt->rt6i_prefsrc.plen = 0;
1376 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1377 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1378 if (IS_ERR(n)) {
1379 err = PTR_ERR(n);
1380 goto out;
1382 dst_set_neighbour(&rt->dst, n);
1385 rt->rt6i_flags = cfg->fc_flags;
1387 install_route:
1388 if (cfg->fc_mx) {
1389 struct nlattr *nla;
1390 int remaining;
1392 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1393 int type = nla_type(nla);
1395 if (type) {
1396 if (type > RTAX_MAX) {
1397 err = -EINVAL;
1398 goto out;
1401 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1406 rt->dst.dev = dev;
1407 rt->rt6i_idev = idev;
1408 rt->rt6i_table = table;
1410 cfg->fc_nlinfo.nl_net = dev_net(dev);
1412 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1414 out:
1415 if (dev)
1416 dev_put(dev);
1417 if (idev)
1418 in6_dev_put(idev);
1419 if (rt)
1420 dst_free(&rt->dst);
1421 return err;
1424 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1426 int err;
1427 struct fib6_table *table;
1428 struct net *net = dev_net(rt->rt6i_dev);
1430 if (rt == net->ipv6.ip6_null_entry)
1431 return -ENOENT;
1433 table = rt->rt6i_table;
1434 write_lock_bh(&table->tb6_lock);
1436 err = fib6_del(rt, info);
1437 dst_release(&rt->dst);
1439 write_unlock_bh(&table->tb6_lock);
1441 return err;
1444 int ip6_del_rt(struct rt6_info *rt)
1446 struct nl_info info = {
1447 .nl_net = dev_net(rt->rt6i_dev),
1449 return __ip6_del_rt(rt, &info);
1452 static int ip6_route_del(struct fib6_config *cfg)
1454 struct fib6_table *table;
1455 struct fib6_node *fn;
1456 struct rt6_info *rt;
1457 int err = -ESRCH;
1459 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1460 if (table == NULL)
1461 return err;
1463 read_lock_bh(&table->tb6_lock);
1465 fn = fib6_locate(&table->tb6_root,
1466 &cfg->fc_dst, cfg->fc_dst_len,
1467 &cfg->fc_src, cfg->fc_src_len);
1469 if (fn) {
1470 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1471 if (cfg->fc_ifindex &&
1472 (rt->rt6i_dev == NULL ||
1473 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1474 continue;
1475 if (cfg->fc_flags & RTF_GATEWAY &&
1476 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1477 continue;
1478 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1479 continue;
1480 dst_hold(&rt->dst);
1481 read_unlock_bh(&table->tb6_lock);
1483 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1486 read_unlock_bh(&table->tb6_lock);
1488 return err;
1492 * Handle redirects
1494 struct ip6rd_flowi {
1495 struct flowi6 fl6;
1496 struct in6_addr gateway;
1499 static struct rt6_info *__ip6_route_redirect(struct net *net,
1500 struct fib6_table *table,
1501 struct flowi6 *fl6,
1502 int flags)
1504 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1505 struct rt6_info *rt;
1506 struct fib6_node *fn;
1509 * Get the "current" route for this destination and
1510 * check if the redirect has come from an appropriate router.
1512 * RFC 2461 specifies that redirects should only be
1513 * accepted if they come from the nexthop to the target.
1514 * Due to the way the routes are chosen, this notion
1515 * is a bit fuzzy and one might need to check all possible
1516 * routes.
1519 read_lock_bh(&table->tb6_lock);
1520 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1521 restart:
1522 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1524 * Current route is on-link; redirect is always invalid.
1526 * Seems, previous statement is not true. It could
1527 * be node, which looks for us as on-link (f.e. proxy ndisc)
1528 * But then router serving it might decide, that we should
1529 * know truth 8)8) --ANK (980726).
1531 if (rt6_check_expired(rt))
1532 continue;
1533 if (!(rt->rt6i_flags & RTF_GATEWAY))
1534 continue;
1535 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1536 continue;
1537 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1538 continue;
1539 break;
1542 if (!rt)
1543 rt = net->ipv6.ip6_null_entry;
1544 BACKTRACK(net, &fl6->saddr);
1545 out:
1546 dst_hold(&rt->dst);
1548 read_unlock_bh(&table->tb6_lock);
1550 return rt;
1553 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1554 const struct in6_addr *src,
1555 const struct in6_addr *gateway,
1556 struct net_device *dev)
1558 int flags = RT6_LOOKUP_F_HAS_SADDR;
1559 struct net *net = dev_net(dev);
1560 struct ip6rd_flowi rdfl = {
1561 .fl6 = {
1562 .flowi6_oif = dev->ifindex,
1563 .daddr = *dest,
1564 .saddr = *src,
1568 ipv6_addr_copy(&rdfl.gateway, gateway);
1570 if (rt6_need_strict(dest))
1571 flags |= RT6_LOOKUP_F_IFACE;
1573 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1574 flags, __ip6_route_redirect);
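/*
 * rt6_redirect() processes an accepted ICMPv6 redirect: __ip6_route_redirect()
 * above confirms the message came from the current next hop for 'dest',
 * the neighbour cache is updated from the advertised link-layer address,
 * and a new RTF_DYNAMIC|RTF_CACHE host route via the redirecting
 * neighbour replaces the old cache entry, with a NETEVENT_REDIRECT
 * notification for interested subsystems.
 */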
1577 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1578 const struct in6_addr *saddr,
1579 struct neighbour *neigh, u8 *lladdr, int on_link)
1581 struct rt6_info *rt, *nrt = NULL;
1582 struct netevent_redirect netevent;
1583 struct net *net = dev_net(neigh->dev);
1585 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1587 if (rt == net->ipv6.ip6_null_entry) {
1588 if (net_ratelimit())
1589 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1590 "for redirect target\n");
1591 goto out;
1595 * We have finally decided to accept it.
1598 neigh_update(neigh, lladdr, NUD_STALE,
1599 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1600 NEIGH_UPDATE_F_OVERRIDE|
1601 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1602 NEIGH_UPDATE_F_ISROUTER))
1606 * Redirect received -> path was valid.
1607 * Look, redirects are sent only in response to data packets,
1608 * so that this nexthop apparently is reachable. --ANK
1610 dst_confirm(&rt->dst);
1612 /* Duplicate redirect: silently ignore. */
1613 if (neigh == dst_get_neighbour_raw(&rt->dst))
1614 goto out;
1616 nrt = ip6_rt_copy(rt, dest);
1617 if (nrt == NULL)
1618 goto out;
1620 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1621 if (on_link)
1622 nrt->rt6i_flags &= ~RTF_GATEWAY;
1624 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1625 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1627 if (ip6_ins_rt(nrt))
1628 goto out;
1630 netevent.old = &rt->dst;
1631 netevent.new = &nrt->dst;
1632 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1634 if (rt->rt6i_flags&RTF_CACHE) {
1635 ip6_del_rt(rt);
1636 return;
1639 out:
1640 dst_release(&rt->dst);
1644 * Handle ICMP "packet too big" messages
1645 * i.e. Path MTU discovery
1648 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1649 struct net *net, u32 pmtu, int ifindex)
1651 struct rt6_info *rt, *nrt;
1652 int allfrag = 0;
1653 again:
1654 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1655 if (rt == NULL)
1656 return;
1658 if (rt6_check_expired(rt)) {
1659 ip6_del_rt(rt);
1660 goto again;
1663 if (pmtu >= dst_mtu(&rt->dst))
1664 goto out;
1666 if (pmtu < IPV6_MIN_MTU) {
1668 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1669 * MTU (1280) and a fragment header should always be included
1670 * after a node receives a Packet Too Big message reporting a PMTU
1671 * less than the IPv6 Minimum Link MTU.
1673 pmtu = IPV6_MIN_MTU;
1674 allfrag = 1;
1677 /* New mtu received -> path was valid.
1678 They are sent only in response to data packets,
1679 so that this nexthop apparently is reachable. --ANK
1681 dst_confirm(&rt->dst);
1683 /* Host route. If it is static, it is better not to
1684 override it but to add a new one, so that when the
1685 cache entry expires the old PMTU is restored
1686 automatically.
1688 if (rt->rt6i_flags & RTF_CACHE) {
1689 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1690 if (allfrag) {
1691 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1692 features |= RTAX_FEATURE_ALLFRAG;
1693 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1695 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1696 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1697 goto out;
1700 /* Network route.
1701 Two cases are possible:
1702 1. It is connected route. Action: COW
1703 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1705 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1706 nrt = rt6_alloc_cow(rt, daddr, saddr);
1707 else
1708 nrt = rt6_alloc_clone(rt, daddr);
1710 if (nrt) {
1711 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1712 if (allfrag) {
1713 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1714 features |= RTAX_FEATURE_ALLFRAG;
1715 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1718 /* According to RFC 1981, a PMTU increase should not be detected
1719 * within 5 minutes; the recommended timer is 10 minutes.
1720 * Here the route expiration time is set to ip6_rt_mtu_expires,
1721 * which is 10 minutes. After 10 minutes the decreased PMTU expires
1722 * and a PMTU increase can again be detected automatically.
1724 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1725 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1727 ip6_ins_rt(nrt);
1729 out:
1730 dst_release(&rt->dst);
1733 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1734 struct net_device *dev, u32 pmtu)
1736 struct net *net = dev_net(dev);
1739 * RFC 1981 states that a node "MUST reduce the size of the packets it
1740 * is sending along the path" that caused the Packet Too Big message.
1741 * Since it's not possible in the general case to determine which
1742 * interface was used to send the original packet, we update the MTU
1743 * on the interface that will be used to send future packets. We also
1744 * update the MTU on the interface that received the Packet Too Big in
1745 * case the original packet was forced out that interface with
1746 * SO_BINDTODEVICE or similar. This is the next best thing to the
1747 * correct behaviour, which would be to update the MTU on all
1748 * interfaces.
1750 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1751 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1755 * Misc support functions
1758 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1759 const struct in6_addr *dest)
1761 struct net *net = dev_net(ort->rt6i_dev);
1762 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1763 ort->dst.dev, 0);
1765 if (rt) {
1766 rt->dst.input = ort->dst.input;
1767 rt->dst.output = ort->dst.output;
1768 rt->dst.flags |= DST_HOST;
1770 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1771 rt->rt6i_dst.plen = 128;
1772 dst_copy_metrics(&rt->dst, &ort->dst);
1773 rt->dst.error = ort->dst.error;
1774 rt->rt6i_idev = ort->rt6i_idev;
1775 if (rt->rt6i_idev)
1776 in6_dev_hold(rt->rt6i_idev);
1777 rt->dst.lastuse = jiffies;
1778 rt->rt6i_expires = 0;
1780 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1781 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1782 rt->rt6i_metric = 0;
1784 #ifdef CONFIG_IPV6_SUBTREES
1785 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1786 #endif
1787 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1788 rt->rt6i_table = ort->rt6i_table;
1790 return rt;
1793 #ifdef CONFIG_IPV6_ROUTE_INFO
1794 static struct rt6_info *rt6_get_route_info(struct net *net,
1795 const struct in6_addr *prefix, int prefixlen,
1796 const struct in6_addr *gwaddr, int ifindex)
1798 struct fib6_node *fn;
1799 struct rt6_info *rt = NULL;
1800 struct fib6_table *table;
1802 table = fib6_get_table(net, RT6_TABLE_INFO);
1803 if (table == NULL)
1804 return NULL;
1806 write_lock_bh(&table->tb6_lock);
1807 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1808 if (!fn)
1809 goto out;
1811 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1812 if (rt->rt6i_dev->ifindex != ifindex)
1813 continue;
1814 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1815 continue;
1816 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1817 continue;
1818 dst_hold(&rt->dst);
1819 break;
1821 out:
1822 write_unlock_bh(&table->tb6_lock);
1823 return rt;
1826 static struct rt6_info *rt6_add_route_info(struct net *net,
1827 const struct in6_addr *prefix, int prefixlen,
1828 const struct in6_addr *gwaddr, int ifindex,
1829 unsigned pref)
1831 struct fib6_config cfg = {
1832 .fc_table = RT6_TABLE_INFO,
1833 .fc_metric = IP6_RT_PRIO_USER,
1834 .fc_ifindex = ifindex,
1835 .fc_dst_len = prefixlen,
1836 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1837 RTF_UP | RTF_PREF(pref),
1838 .fc_nlinfo.pid = 0,
1839 .fc_nlinfo.nlh = NULL,
1840 .fc_nlinfo.nl_net = net,
1843 ipv6_addr_copy(&cfg.fc_dst, prefix);
1844 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1846 /* We should treat it as a default route if prefix length is 0. */
1847 if (!prefixlen)
1848 cfg.fc_flags |= RTF_DEFAULT;
1850 ip6_route_add(&cfg);
1852 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1854 #endif
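/*
 * Default router bookkeeping: rt6_get_dflt_router() looks up the
 * RTF_ADDRCONF|RTF_DEFAULT entry in RT6_TABLE_DFLT for a given gateway
 * and device, rt6_add_dflt_router() installs such a route with the
 * requested preference, and rt6_purge_dflt_routers() deletes every
 * RTF_DEFAULT/RTF_ADDRCONF route from that table.
 */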
1856 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1858 struct rt6_info *rt;
1859 struct fib6_table *table;
1861 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1862 if (table == NULL)
1863 return NULL;
1865 write_lock_bh(&table->tb6_lock);
1866 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1867 if (dev == rt->rt6i_dev &&
1868 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1869 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1870 break;
1872 if (rt)
1873 dst_hold(&rt->dst);
1874 write_unlock_bh(&table->tb6_lock);
1875 return rt;
1878 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1879 struct net_device *dev,
1880 unsigned int pref)
1882 struct fib6_config cfg = {
1883 .fc_table = RT6_TABLE_DFLT,
1884 .fc_metric = IP6_RT_PRIO_USER,
1885 .fc_ifindex = dev->ifindex,
1886 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1887 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1888 .fc_nlinfo.pid = 0,
1889 .fc_nlinfo.nlh = NULL,
1890 .fc_nlinfo.nl_net = dev_net(dev),
1893 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1895 ip6_route_add(&cfg);
1897 return rt6_get_dflt_router(gwaddr, dev);
1900 void rt6_purge_dflt_routers(struct net *net)
1902 struct rt6_info *rt;
1903 struct fib6_table *table;
1905 /* NOTE: Keep consistent with rt6_get_dflt_router */
1906 table = fib6_get_table(net, RT6_TABLE_DFLT);
1907 if (table == NULL)
1908 return;
1910 restart:
1911 read_lock_bh(&table->tb6_lock);
1912 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1913 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1914 dst_hold(&rt->dst);
1915 read_unlock_bh(&table->tb6_lock);
1916 ip6_del_rt(rt);
1917 goto restart;
1920 read_unlock_bh(&table->tb6_lock);
1923 static void rtmsg_to_fib6_config(struct net *net,
1924 struct in6_rtmsg *rtmsg,
1925 struct fib6_config *cfg)
1927 memset(cfg, 0, sizeof(*cfg));
1929 cfg->fc_table = RT6_TABLE_MAIN;
1930 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1931 cfg->fc_metric = rtmsg->rtmsg_metric;
1932 cfg->fc_expires = rtmsg->rtmsg_info;
1933 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1934 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1935 cfg->fc_flags = rtmsg->rtmsg_flags;
1937 cfg->fc_nlinfo.nl_net = net;
1939 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1940 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1941 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
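/*
 * Legacy ioctl interface: SIOCADDRT/SIOCDELRT require CAP_NET_ADMIN,
 * copy an in6_rtmsg from user space, convert it to a fib6_config with
 * rtmsg_to_fib6_config() and call ip6_route_add()/ip6_route_del() under
 * the RTNL; any other command returns -EINVAL.
 */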
1944 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1946 struct fib6_config cfg;
1947 struct in6_rtmsg rtmsg;
1948 int err;
1950 switch(cmd) {
1951 case SIOCADDRT: /* Add a route */
1952 case SIOCDELRT: /* Delete a route */
1953 if (!capable(CAP_NET_ADMIN))
1954 return -EPERM;
1955 err = copy_from_user(&rtmsg, arg,
1956 sizeof(struct in6_rtmsg));
1957 if (err)
1958 return -EFAULT;
1960 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1962 rtnl_lock();
1963 switch (cmd) {
1964 case SIOCADDRT:
1965 err = ip6_route_add(&cfg);
1966 break;
1967 case SIOCDELRT:
1968 err = ip6_route_del(&cfg);
1969 break;
1970 default:
1971 err = -EINVAL;
1973 rtnl_unlock();
1975 return err;
1978 return -EINVAL;
1982 * Drop the packet on the floor
1985 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1987 int type;
1988 struct dst_entry *dst = skb_dst(skb);
1989 switch (ipstats_mib_noroutes) {
1990 case IPSTATS_MIB_INNOROUTES:
1991 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1992 if (type == IPV6_ADDR_ANY) {
1993 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1994 IPSTATS_MIB_INADDRERRORS);
1995 break;
1997 /* FALLTHROUGH */
1998 case IPSTATS_MIB_OUTNOROUTES:
1999 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2000 ipstats_mib_noroutes);
2001 break;
2003 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2004 kfree_skb(skb);
2005 return 0;
2008 static int ip6_pkt_discard(struct sk_buff *skb)
2010 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2013 static int ip6_pkt_discard_out(struct sk_buff *skb)
2015 skb->dev = skb_dst(skb)->dev;
2016 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2019 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2021 static int ip6_pkt_prohibit(struct sk_buff *skb)
2023 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2026 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2028 skb->dev = skb_dst(skb)->dev;
2029 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2032 #endif
2035 * Allocate a dst for local (unicast / anycast) address.
2038 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2039 const struct in6_addr *addr,
2040 int anycast)
2042 struct net *net = dev_net(idev->dev);
2043 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2044 net->loopback_dev, 0);
2045 struct neighbour *neigh;
2047 if (rt == NULL) {
2048 if (net_ratelimit())
2049 pr_warning("IPv6: Maximum number of routes reached,"
2050 " consider increasing route/max_size.\n");
2051 return ERR_PTR(-ENOMEM);
2054 in6_dev_hold(idev);
2056 rt->dst.flags |= DST_HOST;
2057 rt->dst.input = ip6_input;
2058 rt->dst.output = ip6_output;
2059 rt->rt6i_idev = idev;
2060 rt->dst.obsolete = -1;
2062 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2063 if (anycast)
2064 rt->rt6i_flags |= RTF_ANYCAST;
2065 else
2066 rt->rt6i_flags |= RTF_LOCAL;
2067 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2068 if (IS_ERR(neigh)) {
2069 dst_free(&rt->dst);
2071 return ERR_CAST(neigh);
2073 dst_set_neighbour(&rt->dst, neigh);
2075 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2076 rt->rt6i_dst.plen = 128;
2077 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2079 atomic_set(&rt->dst.__refcnt, 1);
2081 return rt;
2084 int ip6_route_get_saddr(struct net *net,
2085 struct rt6_info *rt,
2086 const struct in6_addr *daddr,
2087 unsigned int prefs,
2088 struct in6_addr *saddr)
2090 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2091 int err = 0;
2092 if (rt->rt6i_prefsrc.plen)
2093 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2094 else
2095 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2096 daddr, prefs, saddr);
2097 return err;
2100 /* remove deleted ip from prefsrc entries */
2101 struct arg_dev_net_ip {
2102 struct net_device *dev;
2103 struct net *net;
2104 struct in6_addr *addr;
2107 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2109 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2110 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2111 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2113 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2114 rt != net->ipv6.ip6_null_entry &&
2115 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2116 /* remove prefsrc entry */
2117 rt->rt6i_prefsrc.plen = 0;
2119 return 0;
2122 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2124 struct net *net = dev_net(ifp->idev->dev);
2125 struct arg_dev_net_ip adni = {
2126 .dev = ifp->idev->dev,
2127 .net = net,
2128 .addr = &ifp->addr,
2130 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2133 struct arg_dev_net {
2134 struct net_device *dev;
2135 struct net *net;
2138 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2140 const struct arg_dev_net *adn = arg;
2141 const struct net_device *dev = adn->dev;
2143 if ((rt->rt6i_dev == dev || dev == NULL) &&
2144 rt != adn->net->ipv6.ip6_null_entry) {
2145 RT6_TRACE("deleted by ifdown %p\n", rt);
2146 return -1;
2148 return 0;
2151 void rt6_ifdown(struct net *net, struct net_device *dev)
2153 struct arg_dev_net adn = {
2154 .dev = dev,
2155 .net = net,
2158 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2159 icmp6_clean_all(fib6_ifdown, &adn);
2162 struct rt6_mtu_change_arg
2164 struct net_device *dev;
2165 unsigned mtu;
2168 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2170 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2171 struct inet6_dev *idev;
2173 /* In IPv6 pmtu discovery is not optional,
2174 so that RTAX_MTU lock cannot disable it.
2175 We still use this lock to block changes
2176 caused by addrconf/ndisc.
2179 idev = __in6_dev_get(arg->dev);
2180 if (idev == NULL)
2181 return 0;
2183 /* For administrative MTU increase, there is no way to discover
2184 IPv6 PMTU increase, so PMTU increase should be updated here.
2185 Since RFC 1981 doesn't cover administrative MTU increases,
2186 updating the PMTU on an increase is a MUST here. (i.e. jumbo frame)
2189 If new MTU is less than route PMTU, this new MTU will be the
2190 lowest MTU in the path, update the route PMTU to reflect PMTU
2191 decreases; if new MTU is greater than route PMTU, and the
2192 old MTU is the lowest MTU in the path, update the route PMTU
2193 to reflect the increase. In this case if the other nodes' MTU
2194 also have the lowest MTU, a Packet Too Big message will trigger
2195 PMTU discovery.
2197 if (rt->rt6i_dev == arg->dev &&
2198 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2199 (dst_mtu(&rt->dst) >= arg->mtu ||
2200 (dst_mtu(&rt->dst) < arg->mtu &&
2201 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2202 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2204 return 0;
2207 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2209 struct rt6_mtu_change_arg arg = {
2210 .dev = dev,
2211 .mtu = mtu,
2214 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2217 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2218 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2219 [RTA_OIF] = { .type = NLA_U32 },
2220 [RTA_IIF] = { .type = NLA_U32 },
2221 [RTA_PRIORITY] = { .type = NLA_U32 },
2222 [RTA_METRICS] = { .type = NLA_NESTED },
2225 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2226 struct fib6_config *cfg)
2228 struct rtmsg *rtm;
2229 struct nlattr *tb[RTA_MAX+1];
2230 int err;
2232 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2233 if (err < 0)
2234 goto errout;
2236 err = -EINVAL;
2237 rtm = nlmsg_data(nlh);
2238 memset(cfg, 0, sizeof(*cfg));
2240 cfg->fc_table = rtm->rtm_table;
2241 cfg->fc_dst_len = rtm->rtm_dst_len;
2242 cfg->fc_src_len = rtm->rtm_src_len;
2243 cfg->fc_flags = RTF_UP;
2244 cfg->fc_protocol = rtm->rtm_protocol;
2246 if (rtm->rtm_type == RTN_UNREACHABLE)
2247 cfg->fc_flags |= RTF_REJECT;
2249 if (rtm->rtm_type == RTN_LOCAL)
2250 cfg->fc_flags |= RTF_LOCAL;
2252 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2253 cfg->fc_nlinfo.nlh = nlh;
2254 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2256 if (tb[RTA_GATEWAY]) {
2257 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2258 cfg->fc_flags |= RTF_GATEWAY;
2261 if (tb[RTA_DST]) {
2262 int plen = (rtm->rtm_dst_len + 7) >> 3;
2264 if (nla_len(tb[RTA_DST]) < plen)
2265 goto errout;
2267 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2270 if (tb[RTA_SRC]) {
2271 int plen = (rtm->rtm_src_len + 7) >> 3;
2273 if (nla_len(tb[RTA_SRC]) < plen)
2274 goto errout;
2276 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2279 if (tb[RTA_PREFSRC])
2280 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2282 if (tb[RTA_OIF])
2283 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2285 if (tb[RTA_PRIORITY])
2286 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2288 if (tb[RTA_METRICS]) {
2289 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2290 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2293 if (tb[RTA_TABLE])
2294 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2296 err = 0;
2297 errout:
2298 return err;
2301 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2303 struct fib6_config cfg;
2304 int err;
2306 err = rtm_to_fib6_config(skb, nlh, &cfg);
2307 if (err < 0)
2308 return err;
2310 return ip6_route_del(&cfg);
2313 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2315 struct fib6_config cfg;
2316 int err;
2318 err = rtm_to_fib6_config(skb, nlh, &cfg);
2319 if (err < 0)
2320 return err;
2322 return ip6_route_add(&cfg);
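These two handlers are what iproute2 ultimately reaches. The userspace sketch below (not part of this file) builds roughly the RTM_NEWROUTE request that `ip -6 route add 2001:db8::/64 dev <ifindex 2>` would send, limited to attributes rtm_to_fib6_config() parses; the prefix and ifindex are made up, CAP_NET_ADMIN is required, and error handling is minimal.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct sockaddr_nl sa;
	struct rtattr *rta;
	struct in6_addr dst;
	struct nlmsghdr *nlh;
	char ack[4096];
	ssize_t len;
	__u32 oif = 2;				/* hypothetical ifindex */
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;		/* pid 0 == the kernel */

	memset(&req, 0, sizeof(req));
	inet_pton(AF_INET6, "2001:db8::", &dst);

	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_NEWROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 64;		/* becomes cfg->fc_dst_len */
	req.rtm.rtm_table = RT_TABLE_MAIN;
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_type = RTN_UNICAST;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;

	/* RTA_DST: 16-byte address; the kernel uses the first rtm_dst_len bits */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	/* RTA_OIF: outgoing interface index, becomes cfg->fc_ifindex */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_OIF;
	rta->rta_len = RTA_LENGTH(sizeof(oif));
	memcpy(RTA_DATA(rta), &oif, sizeof(oif));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	/* NLM_F_ACK was set, so the kernel answers with an NLMSG_ERROR (0 = ok) */
	len = recv(fd, ack, sizeof(ack), 0);
	nlh = (struct nlmsghdr *)ack;
	if (len > 0 && NLMSG_OK(nlh, len) && nlh->nlmsg_type == NLMSG_ERROR) {
		struct nlmsgerr *e = NLMSG_DATA(nlh);
		printf("kernel replied: %d\n", e->error);
	}
	close(fd);
	return 0;
}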
2325 static inline size_t rt6_nlmsg_size(void)
2327 return NLMSG_ALIGN(sizeof(struct rtmsg))
2328 + nla_total_size(16) /* RTA_SRC */
2329 + nla_total_size(16) /* RTA_DST */
2330 + nla_total_size(16) /* RTA_GATEWAY */
2331 + nla_total_size(16) /* RTA_PREFSRC */
2332 + nla_total_size(4) /* RTA_TABLE */
2333 + nla_total_size(4) /* RTA_IIF */
2334 + nla_total_size(4) /* RTA_OIF */
2335 + nla_total_size(4) /* RTA_PRIORITY */
2336 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2337 + nla_total_size(sizeof(struct rta_cacheinfo));
2340 static int rt6_fill_node(struct net *net,
2341 struct sk_buff *skb, struct rt6_info *rt,
2342 struct in6_addr *dst, struct in6_addr *src,
2343 int iif, int type, u32 pid, u32 seq,
2344 int prefix, int nowait, unsigned int flags)
2346 struct rtmsg *rtm;
2347 struct nlmsghdr *nlh;
2348 long expires;
2349 u32 table;
2350 struct neighbour *n;
2352 if (prefix) { /* user wants prefix routes only */
2353 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2354 /* success since this is not a prefix route */
2355 return 1;
2359 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2360 if (nlh == NULL)
2361 return -EMSGSIZE;
2363 rtm = nlmsg_data(nlh);
2364 rtm->rtm_family = AF_INET6;
2365 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2366 rtm->rtm_src_len = rt->rt6i_src.plen;
2367 rtm->rtm_tos = 0;
2368 if (rt->rt6i_table)
2369 table = rt->rt6i_table->tb6_id;
2370 else
2371 table = RT6_TABLE_UNSPEC;
2372 rtm->rtm_table = table;
2373 NLA_PUT_U32(skb, RTA_TABLE, table);
2374 if (rt->rt6i_flags&RTF_REJECT)
2375 rtm->rtm_type = RTN_UNREACHABLE;
2376 else if (rt->rt6i_flags&RTF_LOCAL)
2377 rtm->rtm_type = RTN_LOCAL;
2378 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2379 rtm->rtm_type = RTN_LOCAL;
2380 else
2381 rtm->rtm_type = RTN_UNICAST;
2382 rtm->rtm_flags = 0;
2383 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2384 rtm->rtm_protocol = rt->rt6i_protocol;
2385 if (rt->rt6i_flags&RTF_DYNAMIC)
2386 rtm->rtm_protocol = RTPROT_REDIRECT;
2387 else if (rt->rt6i_flags & RTF_ADDRCONF)
2388 rtm->rtm_protocol = RTPROT_KERNEL;
2389 else if (rt->rt6i_flags&RTF_DEFAULT)
2390 rtm->rtm_protocol = RTPROT_RA;
2392 if (rt->rt6i_flags&RTF_CACHE)
2393 rtm->rtm_flags |= RTM_F_CLONED;
2395 if (dst) {
2396 NLA_PUT(skb, RTA_DST, 16, dst);
2397 rtm->rtm_dst_len = 128;
2398 } else if (rtm->rtm_dst_len)
2399 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2400 #ifdef CONFIG_IPV6_SUBTREES
2401 if (src) {
2402 NLA_PUT(skb, RTA_SRC, 16, src);
2403 rtm->rtm_src_len = 128;
2404 } else if (rtm->rtm_src_len)
2405 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2406 #endif
2407 if (iif) {
2408 #ifdef CONFIG_IPV6_MROUTE
2409 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2410 int err = ip6mr_get_route(net, skb, rtm, nowait);
2411 if (err <= 0) {
2412 if (!nowait) {
2413 if (err == 0)
2414 return 0;
2415 goto nla_put_failure;
2416 } else {
2417 if (err == -EMSGSIZE)
2418 goto nla_put_failure;
2421 } else
2422 #endif
2423 NLA_PUT_U32(skb, RTA_IIF, iif);
2424 } else if (dst) {
2425 struct in6_addr saddr_buf;
2426 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2427 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2430 if (rt->rt6i_prefsrc.plen) {
2431 struct in6_addr saddr_buf;
2432 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2433 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2436 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2437 goto nla_put_failure;
2439 rcu_read_lock();
2440 n = dst_get_neighbour(&rt->dst);
2441 if (n)
2442 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2443 rcu_read_unlock();
2445 if (rt->dst.dev)
2446 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2448 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2450 if (!(rt->rt6i_flags & RTF_EXPIRES))
2451 expires = 0;
2452 else if (rt->rt6i_expires - jiffies < INT_MAX)
2453 expires = rt->rt6i_expires - jiffies;
2454 else
2455 expires = INT_MAX;
2457 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2458 expires, rt->dst.error) < 0)
2459 goto nla_put_failure;
2461 return nlmsg_end(skb, nlh);
2463 nla_put_failure:
2464 nlmsg_cancel(skb, nlh);
2465 return -EMSGSIZE;
2468 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2470 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2471 int prefix;
2473 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2474 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2475 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2476 } else
2477 prefix = 0;
2479 return rt6_fill_node(arg->net,
2480 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2481 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2482 prefix, 0, NLM_F_MULTI);
2485 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2487 struct net *net = sock_net(in_skb->sk);
2488 struct nlattr *tb[RTA_MAX+1];
2489 struct rt6_info *rt;
2490 struct sk_buff *skb;
2491 struct rtmsg *rtm;
2492 struct flowi6 fl6;
2493 int err, iif = 0;
2495 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2496 if (err < 0)
2497 goto errout;
2499 err = -EINVAL;
2500 memset(&fl6, 0, sizeof(fl6));
2502 if (tb[RTA_SRC]) {
2503 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2504 goto errout;
2506 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2509 if (tb[RTA_DST]) {
2510 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2511 goto errout;
2513 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2516 if (tb[RTA_IIF])
2517 iif = nla_get_u32(tb[RTA_IIF]);
2519 if (tb[RTA_OIF])
2520 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2522 if (iif) {
2523 struct net_device *dev;
2524 dev = __dev_get_by_index(net, iif);
2525 if (!dev) {
2526 err = -ENODEV;
2527 goto errout;
2531 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2532 if (skb == NULL) {
2533 err = -ENOBUFS;
2534 goto errout;
2537 /* Reserve room for dummy headers; this skb can pass
2538 through a good chunk of the routing engine.
2540 skb_reset_mac_header(skb);
2541 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2543 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2544 skb_dst_set(skb, &rt->dst);
2546 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2547 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2548 nlh->nlmsg_seq, 0, 0, 0);
2549 if (err < 0) {
2550 kfree_skb(skb);
2551 goto errout;
2554 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2555 errout:
2556 return err;
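inet6_rtm_getroute() backs `ip -6 route get`. The hedged userspace sketch below (not part of this file) sends an RTM_GETROUTE request for a made-up destination and walks the attributes of the RTM_NEWROUTE reply that rt6_fill_node() produces; error handling is minimal.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct sockaddr_nl sa;
	struct rtattr *rta;
	struct in6_addr dst;
	struct nlmsghdr *nlh;
	char reply[4096];
	ssize_t len;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;

	memset(&req, 0, sizeof(req));
	inet_pton(AF_INET6, "2001:db8::1", &dst);	/* made-up destination */

	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;

	/* RTA_DST: a full in6_addr, see the nla_len() check in the code above */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	len = recv(fd, reply, sizeof(reply), 0);
	for (nlh = (struct nlmsghdr *)reply; len > 0 && NLMSG_OK(nlh, len);
	     nlh = NLMSG_NEXT(nlh, len)) {
		if (nlh->nlmsg_type != RTM_NEWROUTE)
			continue;
		struct rtmsg *r = NLMSG_DATA(nlh);
		struct rtattr *a = RTM_RTA(r);
		int alen = RTM_PAYLOAD(nlh);

		printf("dst_len %u:", r->rtm_dst_len);
		for (; RTA_OK(a, alen); a = RTA_NEXT(a, alen))
			if (a->rta_type == RTA_OIF)
				printf(" oif %u", *(unsigned int *)RTA_DATA(a));
			else if (a->rta_type == RTA_PRIORITY)
				printf(" metric %u", *(unsigned int *)RTA_DATA(a));
		printf("\n");
	}
	close(fd);
	return 0;
}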
2559 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2561 struct sk_buff *skb;
2562 struct net *net = info->nl_net;
2563 u32 seq;
2564 int err;
2566 err = -ENOBUFS;
2567 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2569 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2570 if (skb == NULL)
2571 goto errout;
2573 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2574 event, info->pid, seq, 0, 0, 0);
2575 if (err < 0) {
2576 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2577 WARN_ON(err == -EMSGSIZE);
2578 kfree_skb(skb);
2579 goto errout;
2581 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2582 info->nlh, gfp_any());
2583 return;
2584 errout:
2585 if (err < 0)
2586 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
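inet6_rt_notify() is what ultimately feeds tools like `ip -6 monitor route`. A hedged userspace sketch that joins the legacy RTMGRP_IPV6_ROUTE group bitmask and labels the notifications it receives:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>

int main(void)
{
	char buf[8192];
	struct sockaddr_nl sa;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	sa.nl_groups = RTMGRP_IPV6_ROUTE;	/* legacy mask for RTNLGRP_IPV6_ROUTE */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nlh;

		if (len <= 0)
			break;
		for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
		     nlh = NLMSG_NEXT(nlh, len))
			printf("%s\n", nlh->nlmsg_type == RTM_NEWROUTE ?
			       "IPv6 route added" : "IPv6 route deleted/other");
	}
	close(fd);
	return 0;
}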
2589 static int ip6_route_dev_notify(struct notifier_block *this,
2590 unsigned long event, void *data)
2592 struct net_device *dev = (struct net_device *)data;
2593 struct net *net = dev_net(dev);
2595 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2596 net->ipv6.ip6_null_entry->dst.dev = dev;
2597 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2598 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2599 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2600 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2601 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2602 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2603 #endif
2606 return NOTIFY_OK;
2610 * /proc
2613 #ifdef CONFIG_PROC_FS
2615 struct rt6_proc_arg
2617 char *buffer;
2618 int offset;
2619 int length;
2620 int skip;
2621 int len;
2624 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2626 struct seq_file *m = p_arg;
2627 struct neighbour *n;
2629 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2631 #ifdef CONFIG_IPV6_SUBTREES
2632 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2633 #else
2634 seq_puts(m, "00000000000000000000000000000000 00 ");
2635 #endif
2636 rcu_read_lock();
2637 n = dst_get_neighbour(&rt->dst);
2638 if (n) {
2639 seq_printf(m, "%pi6", n->primary_key);
2640 } else {
2641 seq_puts(m, "00000000000000000000000000000000");
2643 rcu_read_unlock();
2644 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2645 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2646 rt->dst.__use, rt->rt6i_flags,
2647 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2648 return 0;
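For reference, rt6_info_route() is what gives /proc/net/ipv6_route its column layout: destination address and prefix length, source address and prefix length (all zeros without CONFIG_IPV6_SUBTREES), next-hop address or zeros, then metric, refcount, use count and flags in hex, and finally the device name. A small hedged reader that trusts exactly that layout (and assumes every line ends with a non-empty device name):

#include <stdio.h>

int main(void)
{
	char dst[40], src[40], gw[40], dev[17];
	unsigned int dplen, splen, metric, refcnt, use, flags;
	FILE *f = fopen("/proc/net/ipv6_route", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%39s %x %39s %x %39s %x %x %x %x %16s",
		      dst, &dplen, src, &splen, gw,
		      &metric, &refcnt, &use, &flags, dev) == 10)
		printf("%s/%u dev %s flags %08x\n", dst, dplen, dev, flags);
	fclose(f);
	return 0;
}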
2651 static int ipv6_route_show(struct seq_file *m, void *v)
2653 struct net *net = (struct net *)m->private;
2654 fib6_clean_all(net, rt6_info_route, 0, m);
2655 return 0;
2658 static int ipv6_route_open(struct inode *inode, struct file *file)
2660 return single_open_net(inode, file, ipv6_route_show);
2663 static const struct file_operations ipv6_route_proc_fops = {
2664 .owner = THIS_MODULE,
2665 .open = ipv6_route_open,
2666 .read = seq_read,
2667 .llseek = seq_lseek,
2668 .release = single_release_net,
2671 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2673 struct net *net = (struct net *)seq->private;
2674 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2675 net->ipv6.rt6_stats->fib_nodes,
2676 net->ipv6.rt6_stats->fib_route_nodes,
2677 net->ipv6.rt6_stats->fib_rt_alloc,
2678 net->ipv6.rt6_stats->fib_rt_entries,
2679 net->ipv6.rt6_stats->fib_rt_cache,
2680 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2681 net->ipv6.rt6_stats->fib_discarded_routes);
2683 return 0;
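The counters above come out as a single line of hex words in /proc/net/rt6_stats (the file is created in ip6_route_net_init() below). A minimal reader that follows the format string visible here, nothing more:

#include <stdio.h>

int main(void)
{
	/* Order: fib nodes, route nodes, rt alloc, rt entries, rt cache,
	 * dst entries, discarded routes -- per rt6_stats_seq_show(). */
	unsigned int v[7];
	FILE *f = fopen("/proc/net/rt6_stats", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%x %x %x %x %x %x %x",
		   &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6]) == 7)
		printf("fib nodes: %u, route entries: %u, discarded: %u\n",
		       v[0], v[3], v[6]);
	fclose(f);
	return 0;
}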
2686 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2688 return single_open_net(inode, file, rt6_stats_seq_show);
2691 static const struct file_operations rt6_stats_seq_fops = {
2692 .owner = THIS_MODULE,
2693 .open = rt6_stats_seq_open,
2694 .read = seq_read,
2695 .llseek = seq_lseek,
2696 .release = single_release_net,
2698 #endif /* CONFIG_PROC_FS */
2700 #ifdef CONFIG_SYSCTL
2702 static
2703 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2704 void __user *buffer, size_t *lenp, loff_t *ppos)
2706 struct net *net;
2707 int delay;
2708 if (!write)
2709 return -EINVAL;
2711 net = (struct net *)ctl->extra1;
2712 delay = net->ipv6.sysctl.flush_delay;
2713 proc_dointvec(ctl, write, buffer, lenp, ppos);
2714 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2715 return 0;
2718 ctl_table ipv6_route_table_template[] = {
2720 .procname = "flush",
2721 .data = &init_net.ipv6.sysctl.flush_delay,
2722 .maxlen = sizeof(int),
2723 .mode = 0200,
2724 .proc_handler = ipv6_sysctl_rtcache_flush
2727 .procname = "gc_thresh",
2728 .data = &ip6_dst_ops_template.gc_thresh,
2729 .maxlen = sizeof(int),
2730 .mode = 0644,
2731 .proc_handler = proc_dointvec,
2734 .procname = "max_size",
2735 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2736 .maxlen = sizeof(int),
2737 .mode = 0644,
2738 .proc_handler = proc_dointvec,
2741 .procname = "gc_min_interval",
2742 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2743 .maxlen = sizeof(int),
2744 .mode = 0644,
2745 .proc_handler = proc_dointvec_jiffies,
2748 .procname = "gc_timeout",
2749 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2750 .maxlen = sizeof(int),
2751 .mode = 0644,
2752 .proc_handler = proc_dointvec_jiffies,
2755 .procname = "gc_interval",
2756 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2757 .maxlen = sizeof(int),
2758 .mode = 0644,
2759 .proc_handler = proc_dointvec_jiffies,
2762 .procname = "gc_elasticity",
2763 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2764 .maxlen = sizeof(int),
2765 .mode = 0644,
2766 .proc_handler = proc_dointvec,
2769 .procname = "mtu_expires",
2770 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2771 .maxlen = sizeof(int),
2772 .mode = 0644,
2773 .proc_handler = proc_dointvec_jiffies,
2776 .procname = "min_adv_mss",
2777 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2778 .maxlen = sizeof(int),
2779 .mode = 0644,
2780 .proc_handler = proc_dointvec,
2783 .procname = "gc_min_interval_ms",
2784 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2785 .maxlen = sizeof(int),
2786 .mode = 0644,
2787 .proc_handler = proc_dointvec_ms_jiffies,
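Per-namespace copies of this table are made by ipv6_route_sysctl_init() below and show up under /proc/sys/net/ipv6/route/ (the exact mount point is assumed from the usual registration path, which lives outside this file). The write-only "flush" entry is the odd one out: any write from root runs ipv6_sysctl_rtcache_flush() and hence fib6_run_gc(). A minimal sketch:

#include <stdio.h>

int main(void)
{
	/* Path assumed; the "flush" entry is mode 0200, so root is required. */
	FILE *f = fopen("/proc/sys/net/ipv6/route/flush", "w");

	if (!f)
		return 1;
	fprintf(f, "1\n");	/* triggers ipv6_sysctl_rtcache_flush() -> fib6_run_gc() */
	return fclose(f) ? 1 : 0;
}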
2792 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2794 struct ctl_table *table;
2796 table = kmemdup(ipv6_route_table_template,
2797 sizeof(ipv6_route_table_template),
2798 GFP_KERNEL);
2800 if (table) {
2801 table[0].data = &net->ipv6.sysctl.flush_delay;
2802 table[0].extra1 = net;
2803 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2804 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2805 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2806 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2807 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2808 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2809 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2810 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2811 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2814 return table;
2816 #endif
2818 static int __net_init ip6_route_net_init(struct net *net)
2820 int ret = -ENOMEM;
2822 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2823 sizeof(net->ipv6.ip6_dst_ops));
2825 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2826 goto out_ip6_dst_ops;
2828 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2829 sizeof(*net->ipv6.ip6_null_entry),
2830 GFP_KERNEL);
2831 if (!net->ipv6.ip6_null_entry)
2832 goto out_ip6_dst_entries;
2833 net->ipv6.ip6_null_entry->dst.path =
2834 (struct dst_entry *)net->ipv6.ip6_null_entry;
2835 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2836 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2837 ip6_template_metrics, true);
2839 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2840 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2841 sizeof(*net->ipv6.ip6_prohibit_entry),
2842 GFP_KERNEL);
2843 if (!net->ipv6.ip6_prohibit_entry)
2844 goto out_ip6_null_entry;
2845 net->ipv6.ip6_prohibit_entry->dst.path =
2846 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2847 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2848 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2849 ip6_template_metrics, true);
2851 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2852 sizeof(*net->ipv6.ip6_blk_hole_entry),
2853 GFP_KERNEL);
2854 if (!net->ipv6.ip6_blk_hole_entry)
2855 goto out_ip6_prohibit_entry;
2856 net->ipv6.ip6_blk_hole_entry->dst.path =
2857 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2858 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2859 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2860 ip6_template_metrics, true);
2861 #endif
2863 net->ipv6.sysctl.flush_delay = 0;
2864 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2865 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2866 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2867 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2868 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2869 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2870 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2872 #ifdef CONFIG_PROC_FS
2873 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2874 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2875 #endif
2876 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2878 ret = 0;
2879 out:
2880 return ret;
2882 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2883 out_ip6_prohibit_entry:
2884 kfree(net->ipv6.ip6_prohibit_entry);
2885 out_ip6_null_entry:
2886 kfree(net->ipv6.ip6_null_entry);
2887 #endif
2888 out_ip6_dst_entries:
2889 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2890 out_ip6_dst_ops:
2891 goto out;
2894 static void __net_exit ip6_route_net_exit(struct net *net)
2896 #ifdef CONFIG_PROC_FS
2897 proc_net_remove(net, "ipv6_route");
2898 proc_net_remove(net, "rt6_stats");
2899 #endif
2900 kfree(net->ipv6.ip6_null_entry);
2901 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2902 kfree(net->ipv6.ip6_prohibit_entry);
2903 kfree(net->ipv6.ip6_blk_hole_entry);
2904 #endif
2905 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2908 static struct pernet_operations ip6_route_net_ops = {
2909 .init = ip6_route_net_init,
2910 .exit = ip6_route_net_exit,
2913 static struct notifier_block ip6_route_dev_notifier = {
2914 .notifier_call = ip6_route_dev_notify,
2915 .priority = 0,
2918 int __init ip6_route_init(void)
2920 int ret;
2922 ret = -ENOMEM;
2923 ip6_dst_ops_template.kmem_cachep =
2924 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2925 SLAB_HWCACHE_ALIGN, NULL);
2926 if (!ip6_dst_ops_template.kmem_cachep)
2927 goto out;
2929 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2930 if (ret)
2931 goto out_kmem_cache;
2933 ret = register_pernet_subsys(&ip6_route_net_ops);
2934 if (ret)
2935 goto out_dst_entries;
2937 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2939 /* Registration of the loopback device happens before this portion of
2940 * code runs, so the loopback reference in rt6_info is not taken; do it
2941 * manually for init_net */
2942 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2943 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2944 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2945 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2946 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2947 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2948 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2949 #endif
2950 ret = fib6_init();
2951 if (ret)
2952 goto out_register_subsys;
2954 ret = xfrm6_init();
2955 if (ret)
2956 goto out_fib6_init;
2958 ret = fib6_rules_init();
2959 if (ret)
2960 goto xfrm6_init;
2962 ret = -ENOBUFS;
2963 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2964 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2965 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2966 goto fib6_rules_init;
2968 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2969 if (ret)
2970 goto fib6_rules_init;
2972 out:
2973 return ret;
2975 fib6_rules_init:
2976 fib6_rules_cleanup();
2977 xfrm6_init:
2978 xfrm6_fini();
2979 out_fib6_init:
2980 fib6_gc_cleanup();
2981 out_register_subsys:
2982 unregister_pernet_subsys(&ip6_route_net_ops);
2983 out_dst_entries:
2984 dst_entries_destroy(&ip6_dst_blackhole_ops);
2985 out_kmem_cache:
2986 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2987 goto out;
2990 void ip6_route_cleanup(void)
2992 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2993 fib6_rules_cleanup();
2994 xfrm6_fini();
2995 fib6_gc_cleanup();
2996 unregister_pernet_subsys(&ip6_route_net_ops);
2997 dst_entries_destroy(&ip6_dst_blackhole_ops);
2998 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);