/*
 * net/ipv6/route.c — commit "net: Fix IPv6 PMTU disc. w/ asymmetric routes"
 * Source: linux-2.6 gitweb view (linux-acpi-2.6/ibm-acpi-2.6.git),
 * blob 4d947c131937afd6fdabd21c1da12aae5050c720.
 * NOTE(review): this capture embeds the gitweb line numbers in each line
 * and has dropped blank lines and most brace-only lines; it is not
 * compilable as-is and should be re-fetched from the raw blob.
 */
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
75 #define CLONE_OFFLINK_ROUTE 0
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void ip6_dst_destroy(struct dst_entry *);
81 static void ip6_dst_ifdown(struct dst_entry *,
82 struct net_device *dev, int how);
83 static int ip6_dst_gc(struct dst_ops *ops);
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 struct in6_addr *prefix, int prefixlen,
93 struct in6_addr *gwaddr, int ifindex,
94 unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 struct in6_addr *prefix, int prefixlen,
97 struct in6_addr *gwaddr, int ifindex);
98 #endif
/*
 * Template dst_ops for IPv6 routing cache entries; copied per-netns.
 * Wires the gc/check/destroy/ifdown/PMTU callbacks defined below.
 * NOTE(review): the closing "};" was dropped by the scrape.
 */
100 static struct dst_ops ip6_dst_ops_template = {
101 .family = AF_INET6,
102 .protocol = cpu_to_be16(ETH_P_IPV6),
103 .gc = ip6_dst_gc,
104 .gc_thresh = 1024,
105 .check = ip6_dst_check,
106 .destroy = ip6_dst_destroy,
107 .ifdown = ip6_dst_ifdown,
108 .negative_advice = ip6_negative_advice,
109 .link_failure = ip6_link_failure,
110 .update_pmtu = ip6_rt_update_pmtu,
111 .local_out = __ip6_local_out,
112 .entries = ATOMIC_INIT(0),
/*
 * update_pmtu hook for blackhole dsts.
 * NOTE(review): the function body was dropped by the scrape; in the
 * upstream file this is an intentional no-op — confirm against the blob.
 */
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
/*
 * dst_ops for blackhole routes created by ip6_dst_blackhole(): no gc,
 * no link-failure handling, and a PMTU hook that (per the template
 * above) ignores updates.  Closing "};" lost in the scrape.
 */
119 static struct dst_ops ip6_dst_blackhole_ops = {
120 .family = AF_INET6,
121 .protocol = cpu_to_be16(ETH_P_IPV6),
122 .destroy = ip6_dst_destroy,
123 .check = ip6_dst_check,
124 .update_pmtu = ip6_rt_blackhole_update_pmtu,
125 .entries = ATOMIC_INIT(0),
/*
 * Template for the per-netns "null" route: matches when no real route
 * exists, drops packets (-ENETUNREACH) via ip6_pkt_discard.  Maximal
 * rt6i_metric keeps it at the bottom of any candidate list.
 */
128 static struct rt6_info ip6_null_entry_template = {
129 .u = {
130 .dst = {
131 .__refcnt = ATOMIC_INIT(1),
132 .__use = 1,
/* obsolete = -1: entry is never revalidated via the obsolete mechanism */
133 .obsolete = -1,
134 .error = -ENETUNREACH,
135 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
136 .input = ip6_pkt_discard,
137 .output = ip6_pkt_discard_out,
140 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
141 .rt6i_protocol = RTPROT_KERNEL,
142 .rt6i_metric = ~(u32) 0,
143 .rt6i_ref = ATOMIC_INIT(1),
146 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
148 static int ip6_pkt_prohibit(struct sk_buff *skb);
149 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/*
 * Template for the policy-routing "prohibit" route (CONFIG_IPV6_MULTIPLE_TABLES):
 * identical shape to the null entry but fails with -EACCES and sends
 * admin-prohibited ICMP via ip6_pkt_prohibit{,_out}.
 */
151 static struct rt6_info ip6_prohibit_entry_template = {
152 .u = {
153 .dst = {
154 .__refcnt = ATOMIC_INIT(1),
155 .__use = 1,
156 .obsolete = -1,
157 .error = -EACCES,
158 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
159 .input = ip6_pkt_prohibit,
160 .output = ip6_pkt_prohibit_out,
163 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
164 .rt6i_protocol = RTPROT_KERNEL,
165 .rt6i_metric = ~(u32) 0,
166 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Template for the policy-routing "blackhole" route: silently discards
 * in both directions (dst_discard) with error -EINVAL, no ICMP.
 */
169 static struct rt6_info ip6_blk_hole_entry_template = {
170 .u = {
171 .dst = {
172 .__refcnt = ATOMIC_INIT(1),
173 .__use = 1,
174 .obsolete = -1,
175 .error = -EINVAL,
176 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
177 .input = dst_discard,
178 .output = dst_discard,
181 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
182 .rt6i_protocol = RTPROT_KERNEL,
183 .rt6i_metric = ~(u32) 0,
184 .rt6i_ref = ATOMIC_INIT(1),
187 #endif
189 /* allocate dst with ip6_dst_ops */
/* Allocate a dst from the given ops and view it as an rt6_info
 * (rt6_info embeds a dst_entry as its first member, so the cast is safe). */
190 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
192 return (struct rt6_info *)dst_alloc(ops);
/*
 * dst_ops .destroy: release the route's reference on its inet6_dev.
 * Clearing rt6i_idev before in6_dev_put() avoids a dangling pointer.
 */
195 static void ip6_dst_destroy(struct dst_entry *dst)
197 struct rt6_info *rt = (struct rt6_info *)dst;
198 struct inet6_dev *idev = rt->rt6i_idev;
200 if (idev != NULL) {
201 rt->rt6i_idev = NULL;
202 in6_dev_put(idev);
/*
 * dst_ops .ifdown: the device behind this route is going away.
 * Re-point rt6i_idev at the netns loopback device so the dst remains
 * valid until it is garbage collected; the old idev ref is dropped.
 */
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 int how)
209 struct rt6_info *rt = (struct rt6_info *)dst;
210 struct inet6_dev *idev = rt->rt6i_idev;
211 struct net_device *loopback_dev =
212 dev_net(dev)->loopback_dev;
/* Nothing to do if the route already points at loopback or another dev. */
214 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215 struct inet6_dev *loopback_idev =
216 in6_dev_get(loopback_dev);
217 if (loopback_idev != NULL) {
218 rt->rt6i_idev = loopback_idev;
219 in6_dev_put(idev);
/* True if the route carries RTF_EXPIRES and its deadline has passed
 * (time_after handles jiffies wraparound). */
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
226 return (rt->rt6i_flags & RTF_EXPIRES &&
227 time_after(jiffies, rt->rt6i_expires));
/* Destinations whose scope is bound to an interface (multicast,
 * link-local, loopback) require strict output-interface matching. */
230 static inline int rt6_need_strict(struct in6_addr *daddr)
232 return (ipv6_addr_type(daddr) &
233 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
237 * Route lookup. Any table->tb6_lock is implied.
/*
 * Walk the sibling list starting at @rt and pick the entry matching the
 * requested output interface (@oif) and/or source address (@saddr).
 * Loopback entries are remembered as a fallback in @local.  Returns the
 * first exact match, a local fallback, the netns null entry when
 * RT6_LOOKUP_F_IFACE demands an interface match that failed, or @rt
 * itself when no constraints were given.  Caller holds tb6_lock.
 */
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241 struct rt6_info *rt,
242 struct in6_addr *saddr,
243 int oif,
244 int flags)
246 struct rt6_info *local = NULL;
247 struct rt6_info *sprt;
/* No constraints at all: the head entry is as good as any. */
249 if (!oif && ipv6_addr_any(saddr))
250 goto out;
252 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
253 struct net_device *dev = sprt->rt6i_dev;
255 if (oif) {
256 if (dev->ifindex == oif)
257 return sprt;
258 if (dev->flags & IFF_LOOPBACK) {
259 if (sprt->rt6i_idev == NULL ||
260 sprt->rt6i_idev->dev->ifindex != oif) {
261 if (flags & RT6_LOOKUP_F_IFACE && oif)
262 continue;
/* Prefer a loopback entry whose idev matches oif over one that doesn't. */
263 if (local && (!oif ||
264 local->rt6i_idev->dev->ifindex == oif))
265 continue;
267 local = sprt;
269 } else {
/* No oif constraint: match on the source address being local to dev. */
270 if (ipv6_chk_addr(net, saddr, dev,
271 flags & RT6_LOOKUP_F_IFACE))
272 return sprt;
276 if (oif) {
277 if (local)
278 return local;
280 if (flags & RT6_LOOKUP_F_IFACE)
281 return net->ipv6.ip6_null_entry;
283 out:
284 return rt;
287 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing (RFC 4191): if the route's next-hop
 * neighbour is not in a VALID state and the per-device probe interval
 * has elapsed, send a unicast-solicit NS to re-verify reachability.
 * neigh->updated is bumped first so the rate limit holds even if the
 * NS transmission itself fails.
 */
288 static void rt6_probe(struct rt6_info *rt)
290 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
292 * Okay, this does not seem to be appropriate
293 * for now, however, we need to check if it
294 * is really so; aka Router Reachability Probing.
296 * Router Reachability Probe MUST be rate-limited
297 * to no more than one per minute.
299 if (!neigh || (neigh->nud_state & NUD_VALID))
300 return;
301 read_lock_bh(&neigh->lock);
/* Re-check state under the lock; another CPU may have validated it. */
302 if (!(neigh->nud_state & NUD_VALID) &&
303 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
304 struct in6_addr mcaddr;
305 struct in6_addr *target;
307 neigh->updated = jiffies;
308 read_unlock_bh(&neigh->lock);
310 target = (struct in6_addr *)&neigh->primary_key;
311 addrconf_addr_solict_mult(target, &mcaddr);
312 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
313 } else
314 read_unlock_bh(&neigh->lock);
316 #else
/* No router preferences configured: probing is a no-op. */
317 static inline void rt6_probe(struct rt6_info *rt)
320 #endif
323 * Default Router Selection (RFC 2461 6.3.6)
/*
 * Device-match score for router selection: 2 = exact oif match (or no
 * oif requested), 1 = loopback route whose idev matches oif, 0 = no match.
 */
325 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
327 struct net_device *dev = rt->rt6i_dev;
328 if (!oif || dev->ifindex == oif)
329 return 2;
330 if ((dev->flags & IFF_LOOPBACK) &&
331 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
332 return 1;
333 return 0;
/*
 * Next-hop reachability score: 2 = neighbour in a NUD_VALID state,
 * 1 = no gateway needed (or state unknown), 0 = no neighbour entry
 * (or NUD_FAILED when router preferences are enabled).
 */
336 static inline int rt6_check_neigh(struct rt6_info *rt)
338 struct neighbour *neigh = rt->rt6i_nexthop;
339 int m;
/* Direct (non-gateway) routes have no next-hop to verify. */
340 if (rt->rt6i_flags & RTF_NONEXTHOP ||
341 !(rt->rt6i_flags & RTF_GATEWAY))
342 m = 1;
343 else if (neigh) {
344 read_lock_bh(&neigh->lock);
345 if (neigh->nud_state & NUD_VALID)
346 m = 2;
347 #ifdef CONFIG_IPV6_ROUTER_PREF
348 else if (neigh->nud_state & NUD_FAILED)
349 m = 0;
350 #endif
351 else
352 m = 1;
353 read_unlock_bh(&neigh->lock);
354 } else
355 m = 0;
356 return m;
/*
 * Combined route score: device match in the low bits, decoded router
 * preference (RFC 4191) shifted above them.  Returns -1 when a strict
 * requirement (matching iface / reachable next hop) is not met.
 */
359 static int rt6_score_route(struct rt6_info *rt, int oif,
360 int strict)
362 int m, n;
364 m = rt6_check_dev(rt, oif);
365 if (!m && (strict & RT6_LOOKUP_F_IFACE))
366 return -1;
367 #ifdef CONFIG_IPV6_ROUTER_PREF
/* << 2 places the preference above the 2-bit device score. */
368 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
369 #endif
370 n = rt6_check_neigh(rt);
371 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
372 return -1;
373 return m;
/*
 * Compare @rt against the current best (@match, score *mpri) and return
 * the better of the two.  When reachability is required, the loser of
 * each comparison is probed so its neighbour state can recover.
 */
376 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
377 int *mpri, struct rt6_info *match)
379 int m;
381 if (rt6_check_expired(rt))
382 goto out;
384 m = rt6_score_route(rt, oif, strict);
385 if (m < 0)
386 goto out;
388 if (m > *mpri) {
/* Probe the displaced previous best, not the new winner. */
389 if (strict & RT6_LOOKUP_F_REACHABLE)
390 rt6_probe(match);
391 *mpri = m;
392 match = rt;
393 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
394 rt6_probe(rt);
397 out:
398 return match;
/*
 * Scan all same-metric routes in round-robin order: first from the
 * rotation head (@rr_head) to the end of the metric run, then from the
 * leaf back up to @rr_head.  Returns the best-scoring entry or NULL.
 */
401 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
402 struct rt6_info *rr_head,
403 u32 metric, int oif, int strict)
405 struct rt6_info *rt, *match;
406 int mpri = -1;
408 match = NULL;
409 for (rt = rr_head; rt && rt->rt6i_metric == metric;
410 rt = rt->u.dst.rt6_next)
411 match = find_match(rt, oif, strict, &mpri, match);
412 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
413 rt = rt->u.dst.rt6_next)
414 match = find_match(rt, oif, strict, &mpri, match);
416 return match;
/*
 * Default router selection (RFC 2461 6.3.6) for fib node @fn: pick the
 * best route among equal-metric siblings, rotating fn->rr_ptr when
 * nothing reachable matched so subsequent lookups round-robin the list.
 * Returns the netns null entry when there is no usable route.
 */
419 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
421 struct rt6_info *match, *rt0;
422 struct net *net;
424 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
425 __func__, fn->leaf, oif);
427 rt0 = fn->rr_ptr;
428 if (!rt0)
429 fn->rr_ptr = rt0 = fn->leaf;
431 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
433 if (!match &&
434 (strict & RT6_LOOKUP_F_REACHABLE)) {
435 struct rt6_info *next = rt0->u.dst.rt6_next;
437 /* no entries matched; do round-robin */
438 if (!next || next->rt6i_metric != rt0->rt6i_metric)
439 next = fn->leaf;
441 if (next != rt0)
442 fn->rr_ptr = next;
445 RT6_TRACE("%s() => %p\n",
446 __func__, match);
448 net = dev_net(rt0->rt6i_dev);
449 return (match ? match : net->ipv6.ip6_null_entry);
452 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option (RFC 4191) received in a Router
 * Advertisement on @dev from gateway @gwaddr.  Validates option length
 * against prefix_len, then adds, updates, or (lifetime == 0) deletes
 * the corresponding RTF_ROUTEINFO route.  Returns 0 or -EINVAL.
 */
453 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
454 struct in6_addr *gwaddr)
456 struct net *net = dev_net(dev);
457 struct route_info *rinfo = (struct route_info *) opt;
458 struct in6_addr prefix_buf, *prefix;
459 unsigned int pref;
460 unsigned long lifetime;
461 struct rt6_info *rt;
463 if (len < sizeof(struct route_info)) {
464 return -EINVAL;
467 /* Sanity check for prefix_len and length */
/* rinfo->length is in units of 8 octets: 1, 2, or 3 are valid. */
468 if (rinfo->length > 3) {
469 return -EINVAL;
470 } else if (rinfo->prefix_len > 128) {
471 return -EINVAL;
472 } else if (rinfo->prefix_len > 64) {
473 if (rinfo->length < 2) {
474 return -EINVAL;
476 } else if (rinfo->prefix_len > 0) {
477 if (rinfo->length < 1) {
478 return -EINVAL;
482 pref = rinfo->route_pref;
483 if (pref == ICMPV6_ROUTER_PREF_INVALID)
484 return -EINVAL;
486 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* Short options carry a truncated prefix; expand it into prefix_buf. */
488 if (rinfo->length == 3)
489 prefix = (struct in6_addr *)rinfo->prefix;
490 else {
491 /* this function is safe */
492 ipv6_addr_prefix(&prefix_buf,
493 (struct in6_addr *)rinfo->prefix,
494 rinfo->prefix_len);
495 prefix = &prefix_buf;
498 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
499 dev->ifindex);
/* Zero lifetime withdraws an existing route-info route. */
501 if (rt && !lifetime) {
502 ip6_del_rt(rt);
503 rt = NULL;
506 if (!rt && lifetime)
507 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
508 pref);
509 else if (rt)
510 rt->rt6i_flags = RTF_ROUTEINFO |
511 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
513 if (rt) {
514 if (!addrconf_finite_timeout(lifetime)) {
515 rt->rt6i_flags &= ~RTF_EXPIRES;
516 } else {
517 rt->rt6i_expires = jiffies + HZ * lifetime;
518 rt->rt6i_flags |= RTF_EXPIRES;
/* Drop the reference taken by rt6_get/add_route_info. */
520 dst_release(&rt->u.dst);
522 return 0;
524 #endif
/*
 * BACKTRACK(net, saddr): used after a lookup that produced the null
 * entry.  Climbs the fib tree from @fn toward the root, descending into
 * source-address subtrees (FIB6_SUBTREE) where present, until a node
 * carrying route info is found (jumps to the caller's "restart" label)
 * or the root is reached (jumps to "out").  Relies on fn/rt and those
 * two labels existing at the expansion site.
 */
526 #define BACKTRACK(__net, saddr) \
527 do { \
528 if (rt == __net->ipv6.ip6_null_entry) { \
529 struct fib6_node *pn; \
530 while (1) { \
531 if (fn->fn_flags & RTN_TL_ROOT) \
532 goto out; \
533 pn = fn->parent; \
534 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
535 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
536 else \
537 fn = pn; \
538 if (fn->fn_flags & RTN_RTINFO) \
539 goto restart; \
542 } while(0)
/*
 * Simple (non-caching) policy-rule lookup in one table: find the fib
 * node for the flow's dst/src, match on device/source, and backtrack
 * toward the root if only the null entry was found.  Returns the route
 * with its use count bumped (dst_use).
 */
544 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
545 struct fib6_table *table,
546 struct flowi *fl, int flags)
548 struct fib6_node *fn;
549 struct rt6_info *rt;
551 read_lock_bh(&table->tb6_lock);
552 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
553 restart:
554 rt = fn->leaf;
555 rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
556 BACKTRACK(net, &fl->fl6_src);
557 out:
558 dst_use(&rt->u.dst, jiffies);
559 read_unlock_bh(&table->tb6_lock);
560 return rt;
/*
 * Public route lookup by address pair.  @strict forces output-interface
 * matching.  Returns a referenced rt6_info, or NULL when the resolved
 * dst carries an error; caller must dst_release() a non-NULL result.
 */
564 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
565 const struct in6_addr *saddr, int oif, int strict)
567 struct flowi fl = {
568 .oif = oif,
569 .nl_u = {
570 .ip6_u = {
571 .daddr = *daddr,
575 struct dst_entry *dst;
576 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
578 if (saddr) {
579 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
580 flags |= RT6_LOOKUP_F_HAS_SADDR;
583 dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
584 if (dst->error == 0)
585 return (struct rt6_info *) dst;
587 dst_release(dst);
589 return NULL;
592 EXPORT_SYMBOL(rt6_lookup);
594 /* ip6_ins_rt is called with FREE table->tb6_lock.
595 It takes new route entry, the addition fails by any reason the
596 route is freed. In any case, if caller does not hold it, it may
597 be destroyed.
/*
 * Insert @rt into its table under tb6_lock.  On failure fib6_add frees
 * the route (see the comment block above); caller must not touch @rt
 * afterwards unless it holds its own reference.
 */
600 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
602 int err;
603 struct fib6_table *table;
605 table = rt->rt6i_table;
606 write_lock_bh(&table->tb6_lock);
607 err = fib6_add(&table->tb6_root, rt, info);
608 write_unlock_bh(&table->tb6_lock);
610 return err;
/* Convenience wrapper: insert @rt with netlink info derived from its
 * device's network namespace (no notification socket/portid). */
613 int ip6_ins_rt(struct rt6_info *rt)
615 struct nl_info info = {
616 .nl_net = dev_net(rt->rt6i_dev),
618 return __ip6_ins_rt(rt, &info);
/*
 * Clone @ort into a host (/128) RTF_CACHE route for @daddr and bind a
 * neighbour entry for its next hop.  If neighbour allocation fails and
 * we are not in softirq context, one forced dst GC pass is attempted
 * (with temporarily relaxed gc sysctls) before giving up.  Returns the
 * new route or NULL.
 */
621 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
622 struct in6_addr *saddr)
624 struct rt6_info *rt;
627 * Clone the route.
630 rt = ip6_rt_copy(ort);
632 if (rt) {
633 struct neighbour *neigh;
/* Only retry the GC path when sleeping/preemption context allows it. */
634 int attempts = !in_softirq();
636 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
/* On-link destination: it becomes its own gateway; a non-/128
 * prefix route whose address equals daddr marks an anycast. */
637 if (rt->rt6i_dst.plen != 128 &&
638 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
639 rt->rt6i_flags |= RTF_ANYCAST;
640 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
643 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
644 rt->rt6i_dst.plen = 128;
645 rt->rt6i_flags |= RTF_CACHE;
646 rt->u.dst.flags |= DST_HOST;
648 #ifdef CONFIG_IPV6_SUBTREES
649 if (rt->rt6i_src.plen && saddr) {
650 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
651 rt->rt6i_src.plen = 128;
653 #endif
655 retry:
656 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
657 if (IS_ERR(neigh)) {
658 struct net *net = dev_net(rt->rt6i_dev);
659 int saved_rt_min_interval =
660 net->ipv6.sysctl.ip6_rt_gc_min_interval;
661 int saved_rt_elasticity =
662 net->ipv6.sysctl.ip6_rt_gc_elasticity;
664 if (attempts-- > 0) {
/* Make one aggressive GC pass, then restore the tunables. */
665 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
666 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
668 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
670 net->ipv6.sysctl.ip6_rt_gc_elasticity =
671 saved_rt_elasticity;
672 net->ipv6.sysctl.ip6_rt_gc_min_interval =
673 saved_rt_min_interval;
674 goto retry;
677 if (net_ratelimit())
678 printk(KERN_WARNING
679 "Neighbour table overflow.\n");
680 dst_free(&rt->u.dst);
681 return NULL;
683 rt->rt6i_nexthop = neigh;
687 return rt;
/*
 * Lightweight clone used for off-link cache entries: copy @ort into a
 * host (/128) RTF_CACHE route for @daddr, sharing (cloning) the
 * original's neighbour instead of resolving a new one.
 */
690 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
692 struct rt6_info *rt = ip6_rt_copy(ort);
693 if (rt) {
694 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695 rt->rt6i_dst.plen = 128;
696 rt->rt6i_flags |= RTF_CACHE;
697 rt->u.dst.flags |= DST_HOST;
698 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
700 return rt;
/*
 * Core caching route lookup for one table.  First pass requires a
 * reachable next hop (unless forwarding is enabled); if that yields
 * nothing, "reachable" is cleared and the walk restarts.  Non-cached
 * gateway routes are COW-cloned into /128 RTF_CACHE entries and
 * inserted; an insertion race with another CPU is resolved by
 * relooking up (bounded by "attempts").  Always returns a held route
 * (possibly the netns null entry).
 */
703 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
704 struct flowi *fl, int flags)
706 struct fib6_node *fn;
707 struct rt6_info *rt, *nrt;
708 int strict = 0;
709 int attempts = 3;
710 int err;
/* Routers don't insist on reachable next hops; hosts do (first pass). */
711 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
713 strict |= flags & RT6_LOOKUP_F_IFACE;
715 relookup:
716 read_lock_bh(&table->tb6_lock);
718 restart_2:
719 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
721 restart:
722 rt = rt6_select(fn, oif, strict | reachable);
724 BACKTRACK(net, &fl->fl6_src);
725 if (rt == net->ipv6.ip6_null_entry ||
726 rt->rt6i_flags & RTF_CACHE)
727 goto out;
/* Hold rt across the unlock; we'll clone it outside the lock. */
729 dst_hold(&rt->u.dst);
730 read_unlock_bh(&table->tb6_lock);
732 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
733 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
734 else {
735 #if CLONE_OFFLINK_ROUTE
736 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
737 #else
738 goto out2;
739 #endif
742 dst_release(&rt->u.dst);
743 rt = nrt ? : net->ipv6.ip6_null_entry;
745 dst_hold(&rt->u.dst);
746 if (nrt) {
747 err = ip6_ins_rt(nrt);
748 if (!err)
749 goto out2;
752 if (--attempts <= 0)
753 goto out2;
756 * Race condition! In the gap, when table->tb6_lock was
757 * released someone could insert this route. Relookup.
759 dst_release(&rt->u.dst);
760 goto relookup;
762 out:
/* First (reachable-required) pass failed: retry without that demand. */
763 if (reachable) {
764 reachable = 0;
765 goto restart_2;
767 dst_hold(&rt->u.dst);
768 read_unlock_bh(&table->tb6_lock);
769 out2:
770 rt->u.dst.lastuse = jiffies;
771 rt->u.dst.__use++;
773 return rt;
/* Input-path adapter for fib6_rule_lookup: route on the input iface. */
776 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
777 struct flowi *fl, int flags)
779 return ip6_pol_route(net, table, fl->iif, fl, flags);
/*
 * Route an incoming skb: build a flowi from the IPv6 header and attach
 * the resolved dst to the skb.  Scoped destinations force strict iface
 * matching, except on PIM register pseudo-devices.
 */
782 void ip6_route_input(struct sk_buff *skb)
784 struct ipv6hdr *iph = ipv6_hdr(skb);
785 struct net *net = dev_net(skb->dev);
786 int flags = RT6_LOOKUP_F_HAS_SADDR;
787 struct flowi fl = {
788 .iif = skb->dev->ifindex,
789 .nl_u = {
790 .ip6_u = {
791 .daddr = iph->daddr,
792 .saddr = iph->saddr,
/* First 4 bytes of the header masked down to the flow label/class. */
793 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
796 .mark = skb->mark,
797 .proto = iph->nexthdr,
800 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
801 flags |= RT6_LOOKUP_F_IFACE;
803 skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
/* Output-path adapter for fib6_rule_lookup: route on the output iface. */
806 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
807 struct flowi *fl, int flags)
809 return ip6_pol_route(net, table, fl->oif, fl, flags);
/*
 * Resolve an output dst for flow @fl.  A socket bound to a device or a
 * scoped destination forces strict iface matching; when no source is
 * set, the socket's address-selection preferences are folded in.
 */
812 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
813 struct flowi *fl)
815 int flags = 0;
817 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
818 flags |= RT6_LOOKUP_F_IFACE;
820 if (!ipv6_addr_any(&fl->fl6_src))
821 flags |= RT6_LOOKUP_F_HAS_SADDR;
822 else if (sk)
823 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
825 return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
828 EXPORT_SYMBOL(ip6_route_output);
/*
 * Replace *dstp with a "blackhole" copy of itself: same addressing and
 * metrics, but input/output discard packets and the ops table ignores
 * PMTU updates.  Used to keep a socket's dst valid while the real route
 * is torn down.  Releases the original dst; returns 0 or -ENOMEM.
 */
830 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
832 struct rt6_info *ort = (struct rt6_info *) *dstp;
833 struct rt6_info *rt = (struct rt6_info *)
834 dst_alloc(&ip6_dst_blackhole_ops);
835 struct dst_entry *new = NULL;
837 if (rt) {
838 new = &rt->u.dst;
840 atomic_set(&new->__refcnt, 1);
841 new->__use = 1;
842 new->input = dst_discard;
843 new->output = dst_discard;
845 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
846 new->dev = ort->u.dst.dev;
847 if (new->dev)
848 dev_hold(new->dev);
849 rt->rt6i_idev = ort->rt6i_idev;
850 if (rt->rt6i_idev)
851 in6_dev_hold(rt->rt6i_idev);
/* The copy never expires on its own. */
852 rt->rt6i_expires = 0;
854 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
855 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
856 rt->rt6i_metric = 0;
858 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
859 #ifdef CONFIG_IPV6_SUBTREES
860 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
861 #endif
863 dst_free(new);
866 dst_release(*dstp);
867 *dstp = new;
868 return (new ? 0 : -ENOMEM);
870 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
873 * Destination cache support functions
/*
 * dst_ops .check: the cached dst is still valid only while its fib node
 * exists and the node's serial number matches the caller's @cookie
 * (i.e. the tree has not changed under it); otherwise return NULL.
 */
876 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
878 struct rt6_info *rt;
880 rt = (struct rt6_info *) dst;
882 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
883 return dst;
885 return NULL;
/*
 * dst_ops .negative_advice: caller reports the dst is misbehaving.
 * Expired cache entries are deleted from the table; otherwise the
 * caller's reference is simply dropped.  Returns NULL when the caller
 * should discard the dst, or the (non-cache) dst unchanged.
 */
888 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
890 struct rt6_info *rt = (struct rt6_info *) dst;
892 if (rt) {
893 if (rt->rt6i_flags & RTF_CACHE) {
894 if (rt6_check_expired(rt)) {
895 ip6_del_rt(rt);
896 dst = NULL;
898 } else {
899 dst_release(dst);
900 dst = NULL;
903 return dst;
/*
 * dst_ops .link_failure: delivery through this dst failed.  Notify the
 * sender (ICMPv6 address-unreachable), then expire a cache entry
 * immediately or, for a default route, invalidate its fib node's
 * serial number so cached dsts referencing it fail ip6_dst_check().
 */
906 static void ip6_link_failure(struct sk_buff *skb)
908 struct rt6_info *rt;
910 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
912 rt = (struct rt6_info *) skb_dst(skb);
913 if (rt) {
914 if (rt->rt6i_flags&RTF_CACHE) {
915 dst_set_expires(&rt->u.dst, 0);
916 rt->rt6i_flags |= RTF_EXPIRES;
917 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
918 rt->rt6i_node->fn_sernum = -1;
/*
 * dst_ops .update_pmtu: lower the cached MTU after a Packet Too Big.
 * Only host (/128) routes are updated, and only downward.  MTUs below
 * IPV6_MIN_MTU are clamped and RTAX_FEATURE_ALLFRAG is set so every
 * packet carries a fragment header (RFC 2460 §5).
 */
922 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
924 struct rt6_info *rt6 = (struct rt6_info*)dst;
926 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
927 rt6->rt6i_flags |= RTF_MODIFIED;
928 if (mtu < IPV6_MIN_MTU) {
929 mtu = IPV6_MIN_MTU;
930 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
932 dst->metrics[RTAX_MTU-1] = mtu;
933 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
937 static int ipv6_get_mtu(struct net_device *dev);
/*
 * Derive the advertised MSS metric from an MTU: subtract IPv6 + TCP
 * header sizes, clamp to the ip6_rt_min_advmss sysctl floor, and cap
 * at IPV6_MAXPLEN ("rely on PMTU discovery") for jumbo-sized MTUs.
 */
939 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
941 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
943 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
944 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
947 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
948 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
949 * IPV6_MAXPLEN is also valid and means: "any MSS,
950 * rely only on pmtu discovery"
952 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
953 mtu = IPV6_MAXPLEN;
954 return mtu;
957 static struct dst_entry *icmp6_dst_gc_list;
958 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * Allocate a throwaway dst for sending an NDISC/ICMPv6 packet to @addr
 * on @dev, outside the routing tables.  The dst is chained on
 * icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once its refcount
 * drops.  Returns the dst, or (when idev is gone) NULL.
 * NOTE(review): if ip6_dst_alloc fails, "out:" still dereferences rt —
 * the upstream code has the same shape; confirm against the raw blob
 * before assuming a scrape artifact.
 */
960 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
961 struct neighbour *neigh,
962 const struct in6_addr *addr)
964 struct rt6_info *rt;
965 struct inet6_dev *idev = in6_dev_get(dev);
966 struct net *net = dev_net(dev);
968 if (unlikely(idev == NULL))
969 return NULL;
971 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
972 if (unlikely(rt == NULL)) {
973 in6_dev_put(idev);
974 goto out;
977 dev_hold(dev);
978 if (neigh)
979 neigh_hold(neigh);
980 else {
981 neigh = ndisc_get_neigh(dev, addr);
982 if (IS_ERR(neigh))
983 neigh = NULL;
986 rt->rt6i_dev = dev;
987 rt->rt6i_idev = idev;
988 rt->rt6i_nexthop = neigh;
989 atomic_set(&rt->u.dst.__refcnt, 1);
990 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
991 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
992 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
993 rt->u.dst.output = ip6_output;
995 #if 0 /* there's no chance to use these for ndisc */
996 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
997 ? DST_HOST
998 : 0;
999 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1000 rt->rt6i_dst.plen = 128;
1001 #endif
/* Chain onto the private GC list; these dsts are not in any fib table. */
1003 spin_lock_bh(&icmp6_dst_lock);
1004 rt->u.dst.next = icmp6_dst_gc_list;
1005 icmp6_dst_gc_list = &rt->u.dst;
1006 spin_unlock_bh(&icmp6_dst_lock);
1008 fib6_force_start_gc(net);
1010 out:
1011 return &rt->u.dst;
/*
 * Reap unreferenced entries from icmp6_dst_gc_list.  Returns the count
 * of entries still referenced (non-zero means "call me again later").
 */
1014 int icmp6_dst_gc(void)
1016 struct dst_entry *dst, *next, **pprev;
1017 int more = 0;
1019 next = NULL;
1021 spin_lock_bh(&icmp6_dst_lock);
1022 pprev = &icmp6_dst_gc_list;
/* Classic unlink-in-place walk via a pointer-to-pointer. */
1024 while ((dst = *pprev) != NULL) {
1025 if (!atomic_read(&dst->__refcnt)) {
1026 *pprev = dst->next;
1027 dst_free(dst);
1028 } else {
1029 pprev = &dst->next;
1030 ++more;
1034 spin_unlock_bh(&icmp6_dst_lock);
1036 return more;
/*
 * Remove every entry on icmp6_dst_gc_list for which @func(rt, arg)
 * returns non-zero (e.g. routes on a device being torn down).
 */
1039 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1040 void *arg)
1042 struct dst_entry *dst, **pprev;
1044 spin_lock_bh(&icmp6_dst_lock);
1045 pprev = &icmp6_dst_gc_list;
1046 while ((dst = *pprev) != NULL) {
1047 struct rt6_info *rt = (struct rt6_info *) dst;
1048 if (func(rt, arg)) {
1049 *pprev = dst->next;
1050 dst_free(dst);
1051 } else {
1052 pprev = &dst->next;
1055 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops .gc for the IPv6 routing cache.  Skips work if the minimum
 * interval hasn't elapsed and the cache is under rt_max_size; otherwise
 * runs fib6 GC with an expiry that grows on pressure (ip6_rt_gc_expire)
 * and decays by the elasticity shift.  Returns non-zero while the cache
 * is still over rt_max_size (tells dst_alloc the allocation must fail).
 */
1058 static int ip6_dst_gc(struct dst_ops *ops)
1060 unsigned long now = jiffies;
1061 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1062 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1063 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1064 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1065 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1066 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1068 if (time_after(rt_last_gc + rt_min_interval, now) &&
1069 atomic_read(&ops->entries) <= rt_max_size)
1070 goto out;
1072 net->ipv6.ip6_rt_gc_expire++;
1073 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1074 net->ipv6.ip6_rt_last_gc = now;
/* Below the threshold again: reset expire to half the gc timeout. */
1075 if (atomic_read(&ops->entries) < ops->gc_thresh)
1076 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1077 out:
1078 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1079 return (atomic_read(&ops->entries) > rt_max_size);
1082 /* Clean host part of a prefix. Not necessary in radix tree,
1083 but results in cleaner routing tables.
1085 Remove it only when all the things will work!
/* Device MTU as seen by IPv6 (idev->cnf.mtu6), falling back to
 * IPV6_MIN_MTU when the device has no inet6_dev. */
1088 static int ipv6_get_mtu(struct net_device *dev)
1090 int mtu = IPV6_MIN_MTU;
1091 struct inet6_dev *idev;
1093 idev = in6_dev_get(dev);
1094 if (idev) {
1095 mtu = idev->cnf.mtu6;
1096 in6_dev_put(idev);
1098 return mtu;
/*
 * Hop limit for packets using @dst: the route's RTAX_HOPLIMIT metric
 * if set (>= 0), else the egress device's configured hop limit, else
 * the netns-wide devconf_all default.
 */
1101 int ip6_dst_hoplimit(struct dst_entry *dst)
1103 int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1104 if (hoplimit < 0) {
1105 struct net_device *dev = dst->dev;
1106 struct inet6_dev *idev = in6_dev_get(dev);
1107 if (idev) {
1108 hoplimit = idev->cnf.hop_limit;
1109 in6_dev_put(idev);
1110 } else
1111 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1113 return hoplimit;
/*
 * Add a route described by @cfg (from netlink or ioctl).  Validates
 * prefix lengths, allocates the rt6_info, classifies it (multicast /
 * forward / reject), resolves the gateway and egress device, applies
 * user metrics, and inserts into the target table.  Returns 0 or a
 * negative errno; on failure the partially-built route is freed.
 */
1120 int ip6_route_add(struct fib6_config *cfg)
1122 int err;
1123 struct net *net = cfg->fc_nlinfo.nl_net;
1124 struct rt6_info *rt = NULL;
1125 struct net_device *dev = NULL;
1126 struct inet6_dev *idev = NULL;
1127 struct fib6_table *table;
1128 int addr_type;
1130 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1131 return -EINVAL;
1132 #ifndef CONFIG_IPV6_SUBTREES
/* Source-routed entries need subtree support compiled in. */
1133 if (cfg->fc_src_len)
1134 return -EINVAL;
1135 #endif
1136 if (cfg->fc_ifindex) {
1137 err = -ENODEV;
1138 dev = dev_get_by_index(net, cfg->fc_ifindex);
1139 if (!dev)
1140 goto out;
1141 idev = in6_dev_get(dev);
1142 if (!idev)
1143 goto out;
1146 if (cfg->fc_metric == 0)
1147 cfg->fc_metric = IP6_RT_PRIO_USER;
1149 table = fib6_new_table(net, cfg->fc_table);
1150 if (table == NULL) {
1151 err = -ENOBUFS;
1152 goto out;
1155 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1157 if (rt == NULL) {
1158 err = -ENOMEM;
1159 goto out;
1162 rt->u.dst.obsolete = -1;
1163 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1164 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1167 if (cfg->fc_protocol == RTPROT_UNSPEC)
1168 cfg->fc_protocol = RTPROT_BOOT;
1169 rt->rt6i_protocol = cfg->fc_protocol;
1171 addr_type = ipv6_addr_type(&cfg->fc_dst);
1173 if (addr_type & IPV6_ADDR_MULTICAST)
1174 rt->u.dst.input = ip6_mc_input;
1175 else
1176 rt->u.dst.input = ip6_forward;
1178 rt->u.dst.output = ip6_output;
1180 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1181 rt->rt6i_dst.plen = cfg->fc_dst_len;
1182 if (rt->rt6i_dst.plen == 128)
1183 rt->u.dst.flags = DST_HOST;
1185 #ifdef CONFIG_IPV6_SUBTREES
1186 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1187 rt->rt6i_src.plen = cfg->fc_src_len;
1188 #endif
1190 rt->rt6i_metric = cfg->fc_metric;
1192 /* We cannot add true routes via loopback here,
1193 they would result in kernel looping; promote them to reject routes
1195 if ((cfg->fc_flags & RTF_REJECT) ||
1196 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1197 /* hold loopback dev/idev if we haven't done so. */
1198 if (dev != net->loopback_dev) {
1199 if (dev) {
1200 dev_put(dev);
1201 in6_dev_put(idev);
1203 dev = net->loopback_dev;
1204 dev_hold(dev);
1205 idev = in6_dev_get(dev);
1206 if (!idev) {
1207 err = -ENODEV;
1208 goto out;
1211 rt->u.dst.output = ip6_pkt_discard_out;
1212 rt->u.dst.input = ip6_pkt_discard;
1213 rt->u.dst.error = -ENETUNREACH;
1214 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1215 goto install_route;
1218 if (cfg->fc_flags & RTF_GATEWAY) {
1219 struct in6_addr *gw_addr;
1220 int gwa_type;
1222 gw_addr = &cfg->fc_gateway;
1223 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1224 gwa_type = ipv6_addr_type(gw_addr);
/* Non-link-local gateway: verify it is reachable via an existing
 * non-gateway route (strict lookup) before accepting it. */
1226 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1227 struct rt6_info *grt;
1229 /* IPv6 strictly inhibits using not link-local
1230 addresses as nexthop address.
1231 Otherwise, router will not able to send redirects.
1232 It is very good, but in some (rare!) circumstances
1233 (SIT, PtP, NBMA NOARP links) it is handy to allow
1234 some exceptions. --ANK
1236 err = -EINVAL;
1237 if (!(gwa_type&IPV6_ADDR_UNICAST))
1238 goto out;
1240 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1242 err = -EHOSTUNREACH;
1243 if (grt == NULL)
1244 goto out;
1245 if (dev) {
1246 if (dev != grt->rt6i_dev) {
1247 dst_release(&grt->u.dst);
1248 goto out;
1250 } else {
/* Adopt the device/idev from the route that reaches the gateway. */
1251 dev = grt->rt6i_dev;
1252 idev = grt->rt6i_idev;
1253 dev_hold(dev);
1254 in6_dev_hold(grt->rt6i_idev);
1256 if (!(grt->rt6i_flags&RTF_GATEWAY))
1257 err = 0;
1258 dst_release(&grt->u.dst);
1260 if (err)
1261 goto out;
1263 err = -EINVAL;
1264 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1265 goto out;
1268 err = -ENODEV;
1269 if (dev == NULL)
1270 goto out;
1272 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1273 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1274 if (IS_ERR(rt->rt6i_nexthop)) {
1275 err = PTR_ERR(rt->rt6i_nexthop);
1276 rt->rt6i_nexthop = NULL;
1277 goto out;
1281 rt->rt6i_flags = cfg->fc_flags;
1283 install_route:
/* Apply user-supplied RTAX_* metrics from the netlink attribute blob. */
1284 if (cfg->fc_mx) {
1285 struct nlattr *nla;
1286 int remaining;
1288 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1289 int type = nla_type(nla);
1291 if (type) {
1292 if (type > RTAX_MAX) {
1293 err = -EINVAL;
1294 goto out;
1297 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
/* Fill defaults for metrics the user left unset. */
1302 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1303 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1304 if (!dst_mtu(&rt->u.dst))
1305 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1306 if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1307 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1308 rt->u.dst.dev = dev;
1309 rt->rt6i_idev = idev;
1310 rt->rt6i_table = table;
1312 cfg->fc_nlinfo.nl_net = dev_net(dev);
1314 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1316 out:
1317 if (dev)
1318 dev_put(dev);
1319 if (idev)
1320 in6_dev_put(idev);
1321 if (rt)
1322 dst_free(&rt->u.dst);
1323 return err;
1326 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1328 int err;
1329 struct fib6_table *table;
1330 struct net *net = dev_net(rt->rt6i_dev);
1332 if (rt == net->ipv6.ip6_null_entry)
1333 return -ENOENT;
1335 table = rt->rt6i_table;
1336 write_lock_bh(&table->tb6_lock);
1338 err = fib6_del(rt, info);
1339 dst_release(&rt->u.dst);
1341 write_unlock_bh(&table->tb6_lock);
1343 return err;
1346 int ip6_del_rt(struct rt6_info *rt)
1348 struct nl_info info = {
1349 .nl_net = dev_net(rt->rt6i_dev),
1351 return __ip6_del_rt(rt, &info);
1354 static int ip6_route_del(struct fib6_config *cfg)
1356 struct fib6_table *table;
1357 struct fib6_node *fn;
1358 struct rt6_info *rt;
1359 int err = -ESRCH;
1361 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1362 if (table == NULL)
1363 return err;
1365 read_lock_bh(&table->tb6_lock);
1367 fn = fib6_locate(&table->tb6_root,
1368 &cfg->fc_dst, cfg->fc_dst_len,
1369 &cfg->fc_src, cfg->fc_src_len);
1371 if (fn) {
1372 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1373 if (cfg->fc_ifindex &&
1374 (rt->rt6i_dev == NULL ||
1375 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1376 continue;
1377 if (cfg->fc_flags & RTF_GATEWAY &&
1378 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1379 continue;
1380 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1381 continue;
1382 dst_hold(&rt->u.dst);
1383 read_unlock_bh(&table->tb6_lock);
1385 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1388 read_unlock_bh(&table->tb6_lock);
1390 return err;
1394 * Handle redirects
1396 struct ip6rd_flowi {
1397 struct flowi fl;
1398 struct in6_addr gateway;
1401 static struct rt6_info *__ip6_route_redirect(struct net *net,
1402 struct fib6_table *table,
1403 struct flowi *fl,
1404 int flags)
1406 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1407 struct rt6_info *rt;
1408 struct fib6_node *fn;
1411 * Get the "current" route for this destination and
1412 * check if the redirect has come from approriate router.
1414 * RFC 2461 specifies that redirects should only be
1415 * accepted if they come from the nexthop to the target.
1416 * Due to the way the routes are chosen, this notion
1417 * is a bit fuzzy and one might need to check all possible
1418 * routes.
1421 read_lock_bh(&table->tb6_lock);
1422 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1423 restart:
1424 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1426 * Current route is on-link; redirect is always invalid.
1428 * Seems, previous statement is not true. It could
1429 * be node, which looks for us as on-link (f.e. proxy ndisc)
1430 * But then router serving it might decide, that we should
1431 * know truth 8)8) --ANK (980726).
1433 if (rt6_check_expired(rt))
1434 continue;
1435 if (!(rt->rt6i_flags & RTF_GATEWAY))
1436 continue;
1437 if (fl->oif != rt->rt6i_dev->ifindex)
1438 continue;
1439 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1440 continue;
1441 break;
1444 if (!rt)
1445 rt = net->ipv6.ip6_null_entry;
1446 BACKTRACK(net, &fl->fl6_src);
1447 out:
1448 dst_hold(&rt->u.dst);
1450 read_unlock_bh(&table->tb6_lock);
1452 return rt;
1455 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1456 struct in6_addr *src,
1457 struct in6_addr *gateway,
1458 struct net_device *dev)
1460 int flags = RT6_LOOKUP_F_HAS_SADDR;
1461 struct net *net = dev_net(dev);
1462 struct ip6rd_flowi rdfl = {
1463 .fl = {
1464 .oif = dev->ifindex,
1465 .nl_u = {
1466 .ip6_u = {
1467 .daddr = *dest,
1468 .saddr = *src,
1474 ipv6_addr_copy(&rdfl.gateway, gateway);
1476 if (rt6_need_strict(dest))
1477 flags |= RT6_LOOKUP_F_IFACE;
1479 return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1480 flags, __ip6_route_redirect);
1483 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1484 struct in6_addr *saddr,
1485 struct neighbour *neigh, u8 *lladdr, int on_link)
1487 struct rt6_info *rt, *nrt = NULL;
1488 struct netevent_redirect netevent;
1489 struct net *net = dev_net(neigh->dev);
1491 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493 if (rt == net->ipv6.ip6_null_entry) {
1494 if (net_ratelimit())
1495 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1496 "for redirect target\n");
1497 goto out;
1501 * We have finally decided to accept it.
1504 neigh_update(neigh, lladdr, NUD_STALE,
1505 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1506 NEIGH_UPDATE_F_OVERRIDE|
1507 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1508 NEIGH_UPDATE_F_ISROUTER))
1512 * Redirect received -> path was valid.
1513 * Look, redirects are sent only in response to data packets,
1514 * so that this nexthop apparently is reachable. --ANK
1516 dst_confirm(&rt->u.dst);
1518 /* Duplicate redirect: silently ignore. */
1519 if (neigh == rt->u.dst.neighbour)
1520 goto out;
1522 nrt = ip6_rt_copy(rt);
1523 if (nrt == NULL)
1524 goto out;
1526 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1527 if (on_link)
1528 nrt->rt6i_flags &= ~RTF_GATEWAY;
1530 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1531 nrt->rt6i_dst.plen = 128;
1532 nrt->u.dst.flags |= DST_HOST;
1534 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1535 nrt->rt6i_nexthop = neigh_clone(neigh);
1536 /* Reset pmtu, it may be better */
1537 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1538 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1539 dst_mtu(&nrt->u.dst));
1541 if (ip6_ins_rt(nrt))
1542 goto out;
1544 netevent.old = &rt->u.dst;
1545 netevent.new = &nrt->u.dst;
1546 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1548 if (rt->rt6i_flags&RTF_CACHE) {
1549 ip6_del_rt(rt);
1550 return;
1553 out:
1554 dst_release(&rt->u.dst);
1558 * Handle ICMP "packet too big" messages
1559 * i.e. Path MTU discovery
1562 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1563 struct net *net, u32 pmtu, int ifindex)
1565 struct rt6_info *rt, *nrt;
1566 int allfrag = 0;
1568 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1569 if (rt == NULL)
1570 return;
1572 if (pmtu >= dst_mtu(&rt->u.dst))
1573 goto out;
1575 if (pmtu < IPV6_MIN_MTU) {
1577 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1578 * MTU (1280) and a fragment header should always be included
1579 * after a node receiving Too Big message reporting PMTU is
1580 * less than the IPv6 Minimum Link MTU.
1582 pmtu = IPV6_MIN_MTU;
1583 allfrag = 1;
1586 /* New mtu received -> path was valid.
1587 They are sent only in response to data packets,
1588 so that this nexthop apparently is reachable. --ANK
1590 dst_confirm(&rt->u.dst);
1592 /* Host route. If it is static, it would be better
1593 not to override it, but add new one, so that
1594 when cache entry will expire old pmtu
1595 would return automatically.
1597 if (rt->rt6i_flags & RTF_CACHE) {
1598 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1599 if (allfrag)
1600 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1601 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1602 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1603 goto out;
1606 /* Network route.
1607 Two cases are possible:
1608 1. It is connected route. Action: COW
1609 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1611 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1612 nrt = rt6_alloc_cow(rt, daddr, saddr);
1613 else
1614 nrt = rt6_alloc_clone(rt, daddr);
1616 if (nrt) {
1617 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1618 if (allfrag)
1619 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1621 /* According to RFC 1981, detecting PMTU increase shouldn't be
1622 * happened within 5 mins, the recommended timer is 10 mins.
1623 * Here this route expiration time is set to ip6_rt_mtu_expires
1624 * which is 10 mins. After 10 mins the decreased pmtu is expired
1625 * and detecting PMTU increase will be automatically happened.
1627 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1628 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1630 ip6_ins_rt(nrt);
1632 out:
1633 dst_release(&rt->u.dst);
1636 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1637 struct net_device *dev, u32 pmtu)
1639 struct net *net = dev_net(dev);
1642 * RFC 1981 states that a node "MUST reduce the size of the packets it
1643 * is sending along the path" that caused the Packet Too Big message.
1644 * Since it's not possible in the general case to determine which
1645 * interface was used to send the original packet, we update the MTU
1646 * on the interface that will be used to send future packets. We also
1647 * update the MTU on the interface that received the Packet Too Big in
1648 * case the original packet was forced out that interface with
1649 * SO_BINDTODEVICE or similar. This is the next best thing to the
1650 * correct behaviour, which would be to update the MTU on all
1651 * interfaces.
1653 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1654 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1658 * Misc support functions
1661 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1663 struct net *net = dev_net(ort->rt6i_dev);
1664 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1666 if (rt) {
1667 rt->u.dst.input = ort->u.dst.input;
1668 rt->u.dst.output = ort->u.dst.output;
1670 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1671 rt->u.dst.error = ort->u.dst.error;
1672 rt->u.dst.dev = ort->u.dst.dev;
1673 if (rt->u.dst.dev)
1674 dev_hold(rt->u.dst.dev);
1675 rt->rt6i_idev = ort->rt6i_idev;
1676 if (rt->rt6i_idev)
1677 in6_dev_hold(rt->rt6i_idev);
1678 rt->u.dst.lastuse = jiffies;
1679 rt->rt6i_expires = 0;
1681 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1682 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1683 rt->rt6i_metric = 0;
1685 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1686 #ifdef CONFIG_IPV6_SUBTREES
1687 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1688 #endif
1689 rt->rt6i_table = ort->rt6i_table;
1691 return rt;
1694 #ifdef CONFIG_IPV6_ROUTE_INFO
1695 static struct rt6_info *rt6_get_route_info(struct net *net,
1696 struct in6_addr *prefix, int prefixlen,
1697 struct in6_addr *gwaddr, int ifindex)
1699 struct fib6_node *fn;
1700 struct rt6_info *rt = NULL;
1701 struct fib6_table *table;
1703 table = fib6_get_table(net, RT6_TABLE_INFO);
1704 if (table == NULL)
1705 return NULL;
1707 write_lock_bh(&table->tb6_lock);
1708 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1709 if (!fn)
1710 goto out;
1712 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1713 if (rt->rt6i_dev->ifindex != ifindex)
1714 continue;
1715 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1716 continue;
1717 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1718 continue;
1719 dst_hold(&rt->u.dst);
1720 break;
1722 out:
1723 write_unlock_bh(&table->tb6_lock);
1724 return rt;
1727 static struct rt6_info *rt6_add_route_info(struct net *net,
1728 struct in6_addr *prefix, int prefixlen,
1729 struct in6_addr *gwaddr, int ifindex,
1730 unsigned pref)
1732 struct fib6_config cfg = {
1733 .fc_table = RT6_TABLE_INFO,
1734 .fc_metric = IP6_RT_PRIO_USER,
1735 .fc_ifindex = ifindex,
1736 .fc_dst_len = prefixlen,
1737 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1738 RTF_UP | RTF_PREF(pref),
1739 .fc_nlinfo.pid = 0,
1740 .fc_nlinfo.nlh = NULL,
1741 .fc_nlinfo.nl_net = net,
1744 ipv6_addr_copy(&cfg.fc_dst, prefix);
1745 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1747 /* We should treat it as a default route if prefix length is 0. */
1748 if (!prefixlen)
1749 cfg.fc_flags |= RTF_DEFAULT;
1751 ip6_route_add(&cfg);
1753 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1755 #endif
1757 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1759 struct rt6_info *rt;
1760 struct fib6_table *table;
1762 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1763 if (table == NULL)
1764 return NULL;
1766 write_lock_bh(&table->tb6_lock);
1767 for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1768 if (dev == rt->rt6i_dev &&
1769 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1770 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1771 break;
1773 if (rt)
1774 dst_hold(&rt->u.dst);
1775 write_unlock_bh(&table->tb6_lock);
1776 return rt;
1779 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1780 struct net_device *dev,
1781 unsigned int pref)
1783 struct fib6_config cfg = {
1784 .fc_table = RT6_TABLE_DFLT,
1785 .fc_metric = IP6_RT_PRIO_USER,
1786 .fc_ifindex = dev->ifindex,
1787 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1788 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1789 .fc_nlinfo.pid = 0,
1790 .fc_nlinfo.nlh = NULL,
1791 .fc_nlinfo.nl_net = dev_net(dev),
1794 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1796 ip6_route_add(&cfg);
1798 return rt6_get_dflt_router(gwaddr, dev);
1801 void rt6_purge_dflt_routers(struct net *net)
1803 struct rt6_info *rt;
1804 struct fib6_table *table;
1806 /* NOTE: Keep consistent with rt6_get_dflt_router */
1807 table = fib6_get_table(net, RT6_TABLE_DFLT);
1808 if (table == NULL)
1809 return;
1811 restart:
1812 read_lock_bh(&table->tb6_lock);
1813 for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1814 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1815 dst_hold(&rt->u.dst);
1816 read_unlock_bh(&table->tb6_lock);
1817 ip6_del_rt(rt);
1818 goto restart;
1821 read_unlock_bh(&table->tb6_lock);
1824 static void rtmsg_to_fib6_config(struct net *net,
1825 struct in6_rtmsg *rtmsg,
1826 struct fib6_config *cfg)
1828 memset(cfg, 0, sizeof(*cfg));
1830 cfg->fc_table = RT6_TABLE_MAIN;
1831 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1832 cfg->fc_metric = rtmsg->rtmsg_metric;
1833 cfg->fc_expires = rtmsg->rtmsg_info;
1834 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1835 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1836 cfg->fc_flags = rtmsg->rtmsg_flags;
1838 cfg->fc_nlinfo.nl_net = net;
1840 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1841 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1842 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1845 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1847 struct fib6_config cfg;
1848 struct in6_rtmsg rtmsg;
1849 int err;
1851 switch(cmd) {
1852 case SIOCADDRT: /* Add a route */
1853 case SIOCDELRT: /* Delete a route */
1854 if (!capable(CAP_NET_ADMIN))
1855 return -EPERM;
1856 err = copy_from_user(&rtmsg, arg,
1857 sizeof(struct in6_rtmsg));
1858 if (err)
1859 return -EFAULT;
1861 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1863 rtnl_lock();
1864 switch (cmd) {
1865 case SIOCADDRT:
1866 err = ip6_route_add(&cfg);
1867 break;
1868 case SIOCDELRT:
1869 err = ip6_route_del(&cfg);
1870 break;
1871 default:
1872 err = -EINVAL;
1874 rtnl_unlock();
1876 return err;
1879 return -EINVAL;
1883 * Drop the packet on the floor
1886 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1888 int type;
1889 struct dst_entry *dst = skb_dst(skb);
1890 switch (ipstats_mib_noroutes) {
1891 case IPSTATS_MIB_INNOROUTES:
1892 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1893 if (type == IPV6_ADDR_ANY) {
1894 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1895 IPSTATS_MIB_INADDRERRORS);
1896 break;
1898 /* FALLTHROUGH */
1899 case IPSTATS_MIB_OUTNOROUTES:
1900 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1901 ipstats_mib_noroutes);
1902 break;
1904 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1905 kfree_skb(skb);
1906 return 0;
1909 static int ip6_pkt_discard(struct sk_buff *skb)
1911 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1914 static int ip6_pkt_discard_out(struct sk_buff *skb)
1916 skb->dev = skb_dst(skb)->dev;
1917 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1922 static int ip6_pkt_prohibit(struct sk_buff *skb)
1924 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1927 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1929 skb->dev = skb_dst(skb)->dev;
1930 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1933 #endif
1936 * Allocate a dst for local (unicast / anycast) address.
1939 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1940 const struct in6_addr *addr,
1941 int anycast)
1943 struct net *net = dev_net(idev->dev);
1944 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1945 struct neighbour *neigh;
1947 if (rt == NULL)
1948 return ERR_PTR(-ENOMEM);
1950 dev_hold(net->loopback_dev);
1951 in6_dev_hold(idev);
1953 rt->u.dst.flags = DST_HOST;
1954 rt->u.dst.input = ip6_input;
1955 rt->u.dst.output = ip6_output;
1956 rt->rt6i_dev = net->loopback_dev;
1957 rt->rt6i_idev = idev;
1958 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1959 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1960 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1961 rt->u.dst.obsolete = -1;
1963 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1964 if (anycast)
1965 rt->rt6i_flags |= RTF_ANYCAST;
1966 else
1967 rt->rt6i_flags |= RTF_LOCAL;
1968 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1969 if (IS_ERR(neigh)) {
1970 dst_free(&rt->u.dst);
1972 /* We are casting this because that is the return
1973 * value type. But an errno encoded pointer is the
1974 * same regardless of the underlying pointer type,
1975 * and that's what we are returning. So this is OK.
1977 return (struct rt6_info *) neigh;
1979 rt->rt6i_nexthop = neigh;
1981 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1982 rt->rt6i_dst.plen = 128;
1983 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1985 atomic_set(&rt->u.dst.__refcnt, 1);
1987 return rt;
/* Walker argument for fib6_ifdown(): the device going down (or NULL
 * for "all devices") plus its owning namespace. */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
1995 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1997 struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1998 struct net *net = ((struct arg_dev_net *)arg)->net;
2000 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2001 rt != net->ipv6.ip6_null_entry) {
2002 RT6_TRACE("deleted by ifdown %p\n", rt);
2003 return -1;
2005 return 0;
2008 void rt6_ifdown(struct net *net, struct net_device *dev)
2010 struct arg_dev_net adn = {
2011 .dev = dev,
2012 .net = net,
2015 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2016 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route(): the device whose MTU
 * changed and its new value. */
struct rt6_mtu_change_arg
{
	struct net_device *dev;
	unsigned mtu;
};
2025 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2027 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2028 struct inet6_dev *idev;
2029 struct net *net = dev_net(arg->dev);
2031 /* In IPv6 pmtu discovery is not optional,
2032 so that RTAX_MTU lock cannot disable it.
2033 We still use this lock to block changes
2034 caused by addrconf/ndisc.
2037 idev = __in6_dev_get(arg->dev);
2038 if (idev == NULL)
2039 return 0;
2041 /* For administrative MTU increase, there is no way to discover
2042 IPv6 PMTU increase, so PMTU increase should be updated here.
2043 Since RFC 1981 doesn't include administrative MTU increase
2044 update PMTU increase is a MUST. (i.e. jumbo frame)
2047 If new MTU is less than route PMTU, this new MTU will be the
2048 lowest MTU in the path, update the route PMTU to reflect PMTU
2049 decreases; if new MTU is greater than route PMTU, and the
2050 old MTU is the lowest MTU in the path, update the route PMTU
2051 to reflect the increase. In this case if the other nodes' MTU
2052 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2053 PMTU discouvery.
2055 if (rt->rt6i_dev == arg->dev &&
2056 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2057 (dst_mtu(&rt->u.dst) >= arg->mtu ||
2058 (dst_mtu(&rt->u.dst) < arg->mtu &&
2059 dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2060 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2061 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2063 return 0;
2066 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2068 struct rt6_mtu_change_arg arg = {
2069 .dev = dev,
2070 .mtu = mtu,
2073 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2076 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2077 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2078 [RTA_OIF] = { .type = NLA_U32 },
2079 [RTA_IIF] = { .type = NLA_U32 },
2080 [RTA_PRIORITY] = { .type = NLA_U32 },
2081 [RTA_METRICS] = { .type = NLA_NESTED },
2084 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2085 struct fib6_config *cfg)
2087 struct rtmsg *rtm;
2088 struct nlattr *tb[RTA_MAX+1];
2089 int err;
2091 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2092 if (err < 0)
2093 goto errout;
2095 err = -EINVAL;
2096 rtm = nlmsg_data(nlh);
2097 memset(cfg, 0, sizeof(*cfg));
2099 cfg->fc_table = rtm->rtm_table;
2100 cfg->fc_dst_len = rtm->rtm_dst_len;
2101 cfg->fc_src_len = rtm->rtm_src_len;
2102 cfg->fc_flags = RTF_UP;
2103 cfg->fc_protocol = rtm->rtm_protocol;
2105 if (rtm->rtm_type == RTN_UNREACHABLE)
2106 cfg->fc_flags |= RTF_REJECT;
2108 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2109 cfg->fc_nlinfo.nlh = nlh;
2110 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2112 if (tb[RTA_GATEWAY]) {
2113 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2114 cfg->fc_flags |= RTF_GATEWAY;
2117 if (tb[RTA_DST]) {
2118 int plen = (rtm->rtm_dst_len + 7) >> 3;
2120 if (nla_len(tb[RTA_DST]) < plen)
2121 goto errout;
2123 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2126 if (tb[RTA_SRC]) {
2127 int plen = (rtm->rtm_src_len + 7) >> 3;
2129 if (nla_len(tb[RTA_SRC]) < plen)
2130 goto errout;
2132 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2135 if (tb[RTA_OIF])
2136 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2138 if (tb[RTA_PRIORITY])
2139 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2141 if (tb[RTA_METRICS]) {
2142 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2143 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2146 if (tb[RTA_TABLE])
2147 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2149 err = 0;
2150 errout:
2151 return err;
2154 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2156 struct fib6_config cfg;
2157 int err;
2159 err = rtm_to_fib6_config(skb, nlh, &cfg);
2160 if (err < 0)
2161 return err;
2163 return ip6_route_del(&cfg);
2166 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2168 struct fib6_config cfg;
2169 int err;
2171 err = rtm_to_fib6_config(skb, nlh, &cfg);
2172 if (err < 0)
2173 return err;
2175 return ip6_route_add(&cfg);
2178 static inline size_t rt6_nlmsg_size(void)
2180 return NLMSG_ALIGN(sizeof(struct rtmsg))
2181 + nla_total_size(16) /* RTA_SRC */
2182 + nla_total_size(16) /* RTA_DST */
2183 + nla_total_size(16) /* RTA_GATEWAY */
2184 + nla_total_size(16) /* RTA_PREFSRC */
2185 + nla_total_size(4) /* RTA_TABLE */
2186 + nla_total_size(4) /* RTA_IIF */
2187 + nla_total_size(4) /* RTA_OIF */
2188 + nla_total_size(4) /* RTA_PRIORITY */
2189 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2190 + nla_total_size(sizeof(struct rta_cacheinfo));
2193 static int rt6_fill_node(struct net *net,
2194 struct sk_buff *skb, struct rt6_info *rt,
2195 struct in6_addr *dst, struct in6_addr *src,
2196 int iif, int type, u32 pid, u32 seq,
2197 int prefix, int nowait, unsigned int flags)
2199 struct rtmsg *rtm;
2200 struct nlmsghdr *nlh;
2201 long expires;
2202 u32 table;
2204 if (prefix) { /* user wants prefix routes only */
2205 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2206 /* success since this is not a prefix route */
2207 return 1;
2211 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2212 if (nlh == NULL)
2213 return -EMSGSIZE;
2215 rtm = nlmsg_data(nlh);
2216 rtm->rtm_family = AF_INET6;
2217 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2218 rtm->rtm_src_len = rt->rt6i_src.plen;
2219 rtm->rtm_tos = 0;
2220 if (rt->rt6i_table)
2221 table = rt->rt6i_table->tb6_id;
2222 else
2223 table = RT6_TABLE_UNSPEC;
2224 rtm->rtm_table = table;
2225 NLA_PUT_U32(skb, RTA_TABLE, table);
2226 if (rt->rt6i_flags&RTF_REJECT)
2227 rtm->rtm_type = RTN_UNREACHABLE;
2228 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2229 rtm->rtm_type = RTN_LOCAL;
2230 else
2231 rtm->rtm_type = RTN_UNICAST;
2232 rtm->rtm_flags = 0;
2233 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2234 rtm->rtm_protocol = rt->rt6i_protocol;
2235 if (rt->rt6i_flags&RTF_DYNAMIC)
2236 rtm->rtm_protocol = RTPROT_REDIRECT;
2237 else if (rt->rt6i_flags & RTF_ADDRCONF)
2238 rtm->rtm_protocol = RTPROT_KERNEL;
2239 else if (rt->rt6i_flags&RTF_DEFAULT)
2240 rtm->rtm_protocol = RTPROT_RA;
2242 if (rt->rt6i_flags&RTF_CACHE)
2243 rtm->rtm_flags |= RTM_F_CLONED;
2245 if (dst) {
2246 NLA_PUT(skb, RTA_DST, 16, dst);
2247 rtm->rtm_dst_len = 128;
2248 } else if (rtm->rtm_dst_len)
2249 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2250 #ifdef CONFIG_IPV6_SUBTREES
2251 if (src) {
2252 NLA_PUT(skb, RTA_SRC, 16, src);
2253 rtm->rtm_src_len = 128;
2254 } else if (rtm->rtm_src_len)
2255 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2256 #endif
2257 if (iif) {
2258 #ifdef CONFIG_IPV6_MROUTE
2259 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2260 int err = ip6mr_get_route(net, skb, rtm, nowait);
2261 if (err <= 0) {
2262 if (!nowait) {
2263 if (err == 0)
2264 return 0;
2265 goto nla_put_failure;
2266 } else {
2267 if (err == -EMSGSIZE)
2268 goto nla_put_failure;
2271 } else
2272 #endif
2273 NLA_PUT_U32(skb, RTA_IIF, iif);
2274 } else if (dst) {
2275 struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2276 struct in6_addr saddr_buf;
2277 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2278 dst, 0, &saddr_buf) == 0)
2279 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2282 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2283 goto nla_put_failure;
2285 if (rt->u.dst.neighbour)
2286 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2288 if (rt->u.dst.dev)
2289 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2291 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2293 if (!(rt->rt6i_flags & RTF_EXPIRES))
2294 expires = 0;
2295 else if (rt->rt6i_expires - jiffies < INT_MAX)
2296 expires = rt->rt6i_expires - jiffies;
2297 else
2298 expires = INT_MAX;
2300 if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2301 expires, rt->u.dst.error) < 0)
2302 goto nla_put_failure;
2304 return nlmsg_end(skb, nlh);
2306 nla_put_failure:
2307 nlmsg_cancel(skb, nlh);
2308 return -EMSGSIZE;
2311 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2313 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2314 int prefix;
2316 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2317 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2318 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2319 } else
2320 prefix = 0;
2322 return rt6_fill_node(arg->net,
2323 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2324 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2325 prefix, 0, NLM_F_MULTI);
2328 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2330 struct net *net = sock_net(in_skb->sk);
2331 struct nlattr *tb[RTA_MAX+1];
2332 struct rt6_info *rt;
2333 struct sk_buff *skb;
2334 struct rtmsg *rtm;
2335 struct flowi fl;
2336 int err, iif = 0;
2338 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2339 if (err < 0)
2340 goto errout;
2342 err = -EINVAL;
2343 memset(&fl, 0, sizeof(fl));
2345 if (tb[RTA_SRC]) {
2346 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2347 goto errout;
2349 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2352 if (tb[RTA_DST]) {
2353 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2354 goto errout;
2356 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2359 if (tb[RTA_IIF])
2360 iif = nla_get_u32(tb[RTA_IIF]);
2362 if (tb[RTA_OIF])
2363 fl.oif = nla_get_u32(tb[RTA_OIF]);
2365 if (iif) {
2366 struct net_device *dev;
2367 dev = __dev_get_by_index(net, iif);
2368 if (!dev) {
2369 err = -ENODEV;
2370 goto errout;
2374 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2375 if (skb == NULL) {
2376 err = -ENOBUFS;
2377 goto errout;
2380 /* Reserve room for dummy headers, this skb can pass
2381 through good chunk of routing engine.
2383 skb_reset_mac_header(skb);
2384 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2386 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2387 skb_dst_set(skb, &rt->u.dst);
2389 err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2390 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2391 nlh->nlmsg_seq, 0, 0, 0);
2392 if (err < 0) {
2393 kfree_skb(skb);
2394 goto errout;
2397 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2398 errout:
2399 return err;
2402 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2404 struct sk_buff *skb;
2405 struct net *net = info->nl_net;
2406 u32 seq;
2407 int err;
2409 err = -ENOBUFS;
2410 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2412 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2413 if (skb == NULL)
2414 goto errout;
2416 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2417 event, info->pid, seq, 0, 0, 0);
2418 if (err < 0) {
2419 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2420 WARN_ON(err == -EMSGSIZE);
2421 kfree_skb(skb);
2422 goto errout;
2424 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2425 info->nlh, gfp_any());
2426 return;
2427 errout:
2428 if (err < 0)
2429 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2432 static int ip6_route_dev_notify(struct notifier_block *this,
2433 unsigned long event, void *data)
2435 struct net_device *dev = (struct net_device *)data;
2436 struct net *net = dev_net(dev);
2438 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2439 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2440 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2441 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2442 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2443 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2444 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2445 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2446 #endif
2449 return NOTIFY_OK;
2453 * /proc
2456 #ifdef CONFIG_PROC_FS
2458 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2460 struct rt6_proc_arg
2462 char *buffer;
2463 int offset;
2464 int length;
2465 int skip;
2466 int len;
2469 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2471 struct seq_file *m = p_arg;
2473 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2475 #ifdef CONFIG_IPV6_SUBTREES
2476 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2477 #else
2478 seq_puts(m, "00000000000000000000000000000000 00 ");
2479 #endif
2481 if (rt->rt6i_nexthop) {
2482 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2483 } else {
2484 seq_puts(m, "00000000000000000000000000000000");
2486 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2487 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2488 rt->u.dst.__use, rt->rt6i_flags,
2489 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2490 return 0;
2493 static int ipv6_route_show(struct seq_file *m, void *v)
2495 struct net *net = (struct net *)m->private;
2496 fib6_clean_all(net, rt6_info_route, 0, m);
2497 return 0;
2500 static int ipv6_route_open(struct inode *inode, struct file *file)
2502 return single_open_net(inode, file, ipv6_route_show);
2505 static const struct file_operations ipv6_route_proc_fops = {
2506 .owner = THIS_MODULE,
2507 .open = ipv6_route_open,
2508 .read = seq_read,
2509 .llseek = seq_lseek,
2510 .release = single_release_net,
2513 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2515 struct net *net = (struct net *)seq->private;
2516 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2517 net->ipv6.rt6_stats->fib_nodes,
2518 net->ipv6.rt6_stats->fib_route_nodes,
2519 net->ipv6.rt6_stats->fib_rt_alloc,
2520 net->ipv6.rt6_stats->fib_rt_entries,
2521 net->ipv6.rt6_stats->fib_rt_cache,
2522 atomic_read(&net->ipv6.ip6_dst_ops.entries),
2523 net->ipv6.rt6_stats->fib_discarded_routes);
2525 return 0;
2528 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2530 return single_open_net(inode, file, rt6_stats_seq_show);
/*
 * File operations for /proc/net/rt6_stats: read-only, single-record
 * seq_file (see rt6_stats_seq_show above).
 */
2533 static const struct file_operations rt6_stats_seq_fops = {
2534 .owner = THIS_MODULE,
2535 .open = rt6_stats_seq_open,
2536 .read = seq_read,
2537 .llseek = seq_lseek,
2538 .release = single_release_net,
2540 #endif /* CONFIG_PROC_FS */
2542 #ifdef CONFIG_SYSCTL
2544 static
2545 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2546 void __user *buffer, size_t *lenp, loff_t *ppos)
2548 struct net *net = current->nsproxy->net_ns;
2549 int delay = net->ipv6.sysctl.flush_delay;
2550 if (write) {
2551 proc_dointvec(ctl, write, buffer, lenp, ppos);
2552 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2553 return 0;
2554 } else
2555 return -EINVAL;
2558 ctl_table ipv6_route_table_template[] = {
2560 .procname = "flush",
2561 .data = &init_net.ipv6.sysctl.flush_delay,
2562 .maxlen = sizeof(int),
2563 .mode = 0200,
2564 .proc_handler = ipv6_sysctl_rtcache_flush
2567 .procname = "gc_thresh",
2568 .data = &ip6_dst_ops_template.gc_thresh,
2569 .maxlen = sizeof(int),
2570 .mode = 0644,
2571 .proc_handler = proc_dointvec,
2574 .procname = "max_size",
2575 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2576 .maxlen = sizeof(int),
2577 .mode = 0644,
2578 .proc_handler = proc_dointvec,
2581 .procname = "gc_min_interval",
2582 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2583 .maxlen = sizeof(int),
2584 .mode = 0644,
2585 .proc_handler = proc_dointvec_jiffies,
2588 .procname = "gc_timeout",
2589 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2590 .maxlen = sizeof(int),
2591 .mode = 0644,
2592 .proc_handler = proc_dointvec_jiffies,
2595 .procname = "gc_interval",
2596 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2597 .maxlen = sizeof(int),
2598 .mode = 0644,
2599 .proc_handler = proc_dointvec_jiffies,
2602 .procname = "gc_elasticity",
2603 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2604 .maxlen = sizeof(int),
2605 .mode = 0644,
2606 .proc_handler = proc_dointvec_jiffies,
2609 .procname = "mtu_expires",
2610 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2611 .maxlen = sizeof(int),
2612 .mode = 0644,
2613 .proc_handler = proc_dointvec_jiffies,
2616 .procname = "min_adv_mss",
2617 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2618 .maxlen = sizeof(int),
2619 .mode = 0644,
2620 .proc_handler = proc_dointvec_jiffies,
2623 .procname = "gc_min_interval_ms",
2624 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2625 .maxlen = sizeof(int),
2626 .mode = 0644,
2627 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Clone the sysctl template for namespace @net and point each entry's
 * .data at the namespace-private copy of the variable.
 *
 * NOTE: the indices below must stay in lock-step with the entry order
 * of ipv6_route_table_template[]; table[3] and table[9] deliberately
 * share ip6_rt_gc_min_interval (seconds vs. milliseconds view).
 *
 * Returns the kmemdup()ed table — ownership passes to the caller, who
 * registers and eventually frees it — or NULL on allocation failure.
 */
2632 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2634 struct ctl_table *table;
2636 table = kmemdup(ipv6_route_table_template,
2637 sizeof(ipv6_route_table_template),
2638 GFP_KERNEL);
2640 if (table) {
2641 table[0].data = &net->ipv6.sysctl.flush_delay;
2642 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2643 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2644 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2645 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2646 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2647 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2648 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2649 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2650 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2653 return table;
2655 #endif
/*
 * Per-namespace constructor for the IPv6 routing layer.
 *
 * Copies the dst_ops template, allocates namespace-private copies of
 * the special routes (null; with CONFIG_IPV6_MULTIPLE_TABLES also
 * prohibit and blackhole), installs the sysctl defaults and creates
 * the /proc entries.  Returns 0 or -ENOMEM, unwinding any allocations
 * already made via the goto ladder at the bottom.
 *
 * NOTE: ->u.dst.dev / ->rt6i_idev of the special routes are filled in
 * later — by ip6_route_dev_notify() when the namespace's loopback
 * device registers, or directly in ip6_route_init() for init_net.
 */
2657 static int __net_init ip6_route_net_init(struct net *net)
2659 int ret = -ENOMEM;
2661 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2662 sizeof(net->ipv6.ip6_dst_ops));
/* Private copy of the "no route" sentinel; its dst path points back
 * at itself so dst_path walks terminate. */
2664 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2665 sizeof(*net->ipv6.ip6_null_entry),
2666 GFP_KERNEL);
2667 if (!net->ipv6.ip6_null_entry)
2668 goto out_ip6_dst_ops;
2669 net->ipv6.ip6_null_entry->u.dst.path =
2670 (struct dst_entry *)net->ipv6.ip6_null_entry;
2671 net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
/* Policy-routing builds additionally need prohibit and blackhole
 * sentinels, set up the same way. */
2673 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2674 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2675 sizeof(*net->ipv6.ip6_prohibit_entry),
2676 GFP_KERNEL);
2677 if (!net->ipv6.ip6_prohibit_entry)
2678 goto out_ip6_null_entry;
2679 net->ipv6.ip6_prohibit_entry->u.dst.path =
2680 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2681 net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2683 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2684 sizeof(*net->ipv6.ip6_blk_hole_entry),
2685 GFP_KERNEL);
2686 if (!net->ipv6.ip6_blk_hole_entry)
2687 goto out_ip6_prohibit_entry;
2688 net->ipv6.ip6_blk_hole_entry->u.dst.path =
2689 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2690 net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2691 #endif
/* Default sysctl values for this namespace (overridable via
 * /proc/sys/net/ipv6/route/). */
2693 net->ipv6.sysctl.flush_delay = 0;
2694 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2695 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2696 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2697 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2698 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2699 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus TCP/IPv6 header overhead. */
2700 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2702 #ifdef CONFIG_PROC_FS
2703 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2704 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2705 #endif
2706 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2708 ret = 0;
2709 out:
2710 return ret;
/* Error unwinding: free in reverse order of allocation. */
2712 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2713 out_ip6_prohibit_entry:
2714 kfree(net->ipv6.ip6_prohibit_entry);
2715 out_ip6_null_entry:
2716 kfree(net->ipv6.ip6_null_entry);
2717 #endif
2718 out_ip6_dst_ops:
2719 goto out;
2722 static void __net_exit ip6_route_net_exit(struct net *net)
2724 #ifdef CONFIG_PROC_FS
2725 proc_net_remove(net, "ipv6_route");
2726 proc_net_remove(net, "rt6_stats");
2727 #endif
2728 kfree(net->ipv6.ip6_null_entry);
2729 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2730 kfree(net->ipv6.ip6_prohibit_entry);
2731 kfree(net->ipv6.ip6_blk_hole_entry);
2732 #endif
/* Per-network-namespace setup/teardown hooks for IPv6 routing. */
2735 static struct pernet_operations ip6_route_net_ops = {
2736 .init = ip6_route_net_init,
2737 .exit = ip6_route_net_exit,
/*
 * Netdevice notifier: catches loopback registration in each namespace
 * so the special routes get their device/idev wired up (see
 * ip6_route_dev_notify above).
 */
2740 static struct notifier_block ip6_route_dev_notifier = {
2741 .notifier_call = ip6_route_dev_notify,
2742 .priority = 0,
/*
 * Boot-time initialisation of the IPv6 routing subsystem.
 *
 * Order matters: the dst slab cache must exist before the pernet init
 * (which allocates rt6_info copies), and the pernet subsystem must be
 * registered before init_net's special routes are touched below.
 * Returns 0 on success or a negative errno, unwinding every completed
 * step via the goto ladder at the bottom.
 *
 * NOTE(review): the error labels `fib6_rules_init:` and `xfrm6_init:`
 * shadow the names of the functions whose effects they undo, which is
 * confusing but functionally correct; consider renaming to the usual
 * out_* convention.
 */
2745 int __init ip6_route_init(void)
2747 int ret;
2749 ret = -ENOMEM;
2750 ip6_dst_ops_template.kmem_cachep =
2751 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2752 SLAB_HWCACHE_ALIGN, NULL);
2753 if (!ip6_dst_ops_template.kmem_cachep)
2754 goto out;
2756 ret = register_pernet_subsys(&ip6_route_net_ops);
2757 if (ret)
2758 goto out_kmem_cache;
/* Blackhole dsts share the same slab as regular rt6_info entries. */
2760 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2762 /* Registering of the loopback is done before this portion of code,
2763 * the loopback reference in rt6_info will not be taken, do it
2764 * manually for init_net */
2765 init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2766 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2767 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2768 init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2769 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2770 init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2771 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2772 #endif
2773 ret = fib6_init();
2774 if (ret)
2775 goto out_register_subsys;
2777 ret = xfrm6_init();
2778 if (ret)
2779 goto out_fib6_init;
2781 ret = fib6_rules_init();
2782 if (ret)
2783 goto xfrm6_init;
/* Register the rtnetlink route ops; any failure unwinds everything. */
2785 ret = -ENOBUFS;
2786 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2787 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2788 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2789 goto fib6_rules_init;
2791 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2792 if (ret)
2793 goto fib6_rules_init;
2795 out:
2796 return ret;
/* Error unwinding, in reverse order of the setup steps above. */
2798 fib6_rules_init:
2799 fib6_rules_cleanup();
2800 xfrm6_init:
2801 xfrm6_fini();
2802 out_fib6_init:
2803 fib6_gc_cleanup();
2804 out_register_subsys:
2805 unregister_pernet_subsys(&ip6_route_net_ops);
2806 out_kmem_cache:
2807 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2808 goto out;
2811 void ip6_route_cleanup(void)
2813 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2814 fib6_rules_cleanup();
2815 xfrm6_fini();
2816 fib6_gc_cleanup();
2817 unregister_pernet_subsys(&ip6_route_net_ops);
2818 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);