[NETNS][IPV6] ip6_fib - add net to gc timer parameter
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / ipv6 / route.c
blobfd44721abebbbf0b616228a4ca45054331cfd13a
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 /* Changes:
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
25 * Ville Nuorvala
26 * Fixed routing subtrees.
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only emit
 * output when RT6_DEBUG is 3 or higher; otherwise they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/* When non-zero, ip6_pol_route() clones routes via rt6_alloc_clone(). */
#define CLONE_OFFLINK_ROUTE 0
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void ip6_dst_destroy(struct dst_entry *);
81 static void ip6_dst_ifdown(struct dst_entry *,
82 struct net_device *dev, int how);
83 static int ip6_dst_gc(struct dst_ops *ops);
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
92 struct in6_addr *gwaddr, int ifindex,
93 unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
95 struct in6_addr *gwaddr, int ifindex);
96 #endif
98 static struct dst_ops ip6_dst_ops = {
99 .family = AF_INET6,
100 .protocol = __constant_htons(ETH_P_IPV6),
101 .gc = ip6_dst_gc,
102 .gc_thresh = 1024,
103 .check = ip6_dst_check,
104 .destroy = ip6_dst_destroy,
105 .ifdown = ip6_dst_ifdown,
106 .negative_advice = ip6_negative_advice,
107 .link_failure = ip6_link_failure,
108 .update_pmtu = ip6_rt_update_pmtu,
109 .local_out = ip6_local_out,
110 .entry_size = sizeof(struct rt6_info),
111 .entries = ATOMIC_INIT(0),
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
118 static struct dst_ops ip6_dst_blackhole_ops = {
119 .family = AF_INET6,
120 .protocol = __constant_htons(ETH_P_IPV6),
121 .destroy = ip6_dst_destroy,
122 .check = ip6_dst_check,
123 .update_pmtu = ip6_rt_blackhole_update_pmtu,
124 .entry_size = sizeof(struct rt6_info),
125 .entries = ATOMIC_INIT(0),
128 struct rt6_info ip6_null_entry = {
129 .u = {
130 .dst = {
131 .__refcnt = ATOMIC_INIT(1),
132 .__use = 1,
133 .obsolete = -1,
134 .error = -ENETUNREACH,
135 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
136 .input = ip6_pkt_discard,
137 .output = ip6_pkt_discard_out,
138 .ops = &ip6_dst_ops,
139 .path = (struct dst_entry*)&ip6_null_entry,
142 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
143 .rt6i_metric = ~(u32) 0,
144 .rt6i_ref = ATOMIC_INIT(1),
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* Static reject route carrying -EACCES, handled by ip6_pkt_prohibit*(). */
struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.__refcnt = ATOMIC_INIT(1),
			.__use = 1,
			.obsolete = -1,
			.error = -EACCES,
			.metrics = { [RTAX_HOPLIMIT - 1] = 255, },
			.input = ip6_pkt_prohibit,
			.output = ip6_pkt_prohibit_out,
			.ops = &ip6_dst_ops,
			.path = (struct dst_entry*)&ip6_prohibit_entry,
		}
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

/* Static reject route carrying -EINVAL; packets go to dst_discard(). */
struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.__refcnt = ATOMIC_INIT(1),
			.__use = 1,
			.obsolete = -1,
			.error = -EINVAL,
			.metrics = { [RTAX_HOPLIMIT - 1] = 255, },
			.input = dst_discard,
			.output = dst_discard,
			.ops = &ip6_dst_ops,
			.path = (struct dst_entry*)&ip6_blk_hole_entry,
		}
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

#endif
192 /* allocate dst with ip6_dst_ops */
193 static __inline__ struct rt6_info *ip6_dst_alloc(void)
195 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
198 static void ip6_dst_destroy(struct dst_entry *dst)
200 struct rt6_info *rt = (struct rt6_info *)dst;
201 struct inet6_dev *idev = rt->rt6i_idev;
203 if (idev != NULL) {
204 rt->rt6i_idev = NULL;
205 in6_dev_put(idev);
209 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
210 int how)
212 struct rt6_info *rt = (struct rt6_info *)dst;
213 struct inet6_dev *idev = rt->rt6i_idev;
214 struct net_device *loopback_dev =
215 dev->nd_net->loopback_dev;
217 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
218 struct inet6_dev *loopback_idev =
219 in6_dev_get(loopback_dev);
220 if (loopback_idev != NULL) {
221 rt->rt6i_idev = loopback_idev;
222 in6_dev_put(idev);
227 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
229 return (rt->rt6i_flags & RTF_EXPIRES &&
230 time_after(jiffies, rt->rt6i_expires));
233 static inline int rt6_need_strict(struct in6_addr *daddr)
235 return (ipv6_addr_type(daddr) &
236 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
240 * Route lookup. Any table->tb6_lock is implied.
243 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
244 int oif,
245 int strict)
247 struct rt6_info *local = NULL;
248 struct rt6_info *sprt;
250 if (oif) {
251 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
252 struct net_device *dev = sprt->rt6i_dev;
253 if (dev->ifindex == oif)
254 return sprt;
255 if (dev->flags & IFF_LOOPBACK) {
256 if (sprt->rt6i_idev == NULL ||
257 sprt->rt6i_idev->dev->ifindex != oif) {
258 if (strict && oif)
259 continue;
260 if (local && (!oif ||
261 local->rt6i_idev->dev->ifindex == oif))
262 continue;
264 local = sprt;
268 if (local)
269 return local;
271 if (strict)
272 return &ip6_null_entry;
274 return rt;
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation for the
 * next hop of @rt when its neighbour entry is not NUD_VALID.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute; here a probe is sent only once rtr_probe_interval has
 * elapsed since neigh->updated.  A NULL @rt is accepted and ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
314 * Default Router Selection (RFC 2461 6.3.6)
316 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
318 struct net_device *dev = rt->rt6i_dev;
319 if (!oif || dev->ifindex == oif)
320 return 2;
321 if ((dev->flags & IFF_LOOPBACK) &&
322 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
323 return 1;
324 return 0;
327 static inline int rt6_check_neigh(struct rt6_info *rt)
329 struct neighbour *neigh = rt->rt6i_nexthop;
330 int m;
331 if (rt->rt6i_flags & RTF_NONEXTHOP ||
332 !(rt->rt6i_flags & RTF_GATEWAY))
333 m = 1;
334 else if (neigh) {
335 read_lock_bh(&neigh->lock);
336 if (neigh->nud_state & NUD_VALID)
337 m = 2;
338 #ifdef CONFIG_IPV6_ROUTER_PREF
339 else if (neigh->nud_state & NUD_FAILED)
340 m = 0;
341 #endif
342 else
343 m = 1;
344 read_unlock_bh(&neigh->lock);
345 } else
346 m = 0;
347 return m;
350 static int rt6_score_route(struct rt6_info *rt, int oif,
351 int strict)
353 int m, n;
355 m = rt6_check_dev(rt, oif);
356 if (!m && (strict & RT6_LOOKUP_F_IFACE))
357 return -1;
358 #ifdef CONFIG_IPV6_ROUTER_PREF
359 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
360 #endif
361 n = rt6_check_neigh(rt);
362 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
363 return -1;
364 return m;
367 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
368 int *mpri, struct rt6_info *match)
370 int m;
372 if (rt6_check_expired(rt))
373 goto out;
375 m = rt6_score_route(rt, oif, strict);
376 if (m < 0)
377 goto out;
379 if (m > *mpri) {
380 if (strict & RT6_LOOKUP_F_REACHABLE)
381 rt6_probe(match);
382 *mpri = m;
383 match = rt;
384 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
385 rt6_probe(rt);
388 out:
389 return match;
392 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
393 struct rt6_info *rr_head,
394 u32 metric, int oif, int strict)
396 struct rt6_info *rt, *match;
397 int mpri = -1;
399 match = NULL;
400 for (rt = rr_head; rt && rt->rt6i_metric == metric;
401 rt = rt->u.dst.rt6_next)
402 match = find_match(rt, oif, strict, &mpri, match);
403 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
404 rt = rt->u.dst.rt6_next)
405 match = find_match(rt, oif, strict, &mpri, match);
407 return match;
410 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
412 struct rt6_info *match, *rt0;
414 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
415 __FUNCTION__, fn->leaf, oif);
417 rt0 = fn->rr_ptr;
418 if (!rt0)
419 fn->rr_ptr = rt0 = fn->leaf;
421 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
423 if (!match &&
424 (strict & RT6_LOOKUP_F_REACHABLE)) {
425 struct rt6_info *next = rt0->u.dst.rt6_next;
427 /* no entries matched; do round-robin */
428 if (!next || next->rt6i_metric != rt0->rt6i_metric)
429 next = fn->leaf;
431 if (next != rt0)
432 fn->rr_ptr = next;
435 RT6_TRACE("%s() => %p\n",
436 __FUNCTION__, match);
438 return (match ? match : &ip6_null_entry);
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from @gwaddr.
 *
 * @opt points at the option and @len is its length in bytes.  Returns
 * -EINVAL when the option is shorter than struct route_info or when its
 * length field is inconsistent with prefix_len; otherwise 0.
 *
 * A zero lifetime deletes an existing route, a non-zero lifetime adds a
 * missing one, and an existing route gets its preference refreshed.
 * A lifetime of 0xffffffff clears RTF_EXPIRES; any other value sets the
 * expiry time.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	lifetime = ntohl(rinfo->lifetime);
	/* 0xffffffff means infinity; clamp the rest to avoid arithmetic overflow */
	if (lifetime != 0xffffffff && lifetime > 0x7fffffff/HZ)
		lifetime = 0x7fffffff/HZ - 1;

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (lifetime == 0xffffffff) {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	} else {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
/*
 * Climb the fib6 tree after a failed lookup.
 *
 * Must be expanded inside a function that has locals `rt` and `fn` and
 * labels `out` and `restart`.  When rt is &ip6_null_entry it moves fn to
 * its parent (or into the parent's subtree via fib6_lookup() with
 * @saddr) until it reaches a node with RTN_RTINFO (goto restart) or the
 * node it starts from is flagged RTN_TL_ROOT (goto out).
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
537 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
538 struct flowi *fl, int flags)
540 struct fib6_node *fn;
541 struct rt6_info *rt;
543 read_lock_bh(&table->tb6_lock);
544 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
545 restart:
546 rt = fn->leaf;
547 rt = rt6_device_match(rt, fl->oif, flags);
548 BACKTRACK(&fl->fl6_src);
549 out:
550 dst_use(&rt->u.dst, jiffies);
551 read_unlock_bh(&table->tb6_lock);
552 return rt;
556 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
557 int oif, int strict)
559 struct flowi fl = {
560 .oif = oif,
561 .nl_u = {
562 .ip6_u = {
563 .daddr = *daddr,
567 struct dst_entry *dst;
568 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
570 if (saddr) {
571 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
572 flags |= RT6_LOOKUP_F_HAS_SADDR;
575 dst = fib6_rule_lookup(&init_net, &fl, flags, ip6_pol_route_lookup);
576 if (dst->error == 0)
577 return (struct rt6_info *) dst;
579 dst_release(dst);
581 return NULL;
584 EXPORT_SYMBOL(rt6_lookup);
586 /* ip6_ins_rt is called with FREE table->tb6_lock.
587 It takes new route entry, the addition fails by any reason the
588 route is freed. In any case, if caller does not hold it, it may
589 be destroyed.
592 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
594 int err;
595 struct fib6_table *table;
597 table = rt->rt6i_table;
598 write_lock_bh(&table->tb6_lock);
599 err = fib6_add(&table->tb6_root, rt, info);
600 write_unlock_bh(&table->tb6_lock);
602 return err;
605 int ip6_ins_rt(struct rt6_info *rt)
607 struct nl_info info = {
608 .nl_net = &init_net,
610 return __ip6_ins_rt(rt, &info);
613 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
614 struct in6_addr *saddr)
616 struct rt6_info *rt;
619 * Clone the route.
622 rt = ip6_rt_copy(ort);
624 if (rt) {
625 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
626 if (rt->rt6i_dst.plen != 128 &&
627 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
628 rt->rt6i_flags |= RTF_ANYCAST;
629 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
632 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
633 rt->rt6i_dst.plen = 128;
634 rt->rt6i_flags |= RTF_CACHE;
635 rt->u.dst.flags |= DST_HOST;
637 #ifdef CONFIG_IPV6_SUBTREES
638 if (rt->rt6i_src.plen && saddr) {
639 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
640 rt->rt6i_src.plen = 128;
642 #endif
644 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
648 return rt;
651 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
653 struct rt6_info *rt = ip6_rt_copy(ort);
654 if (rt) {
655 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
656 rt->rt6i_dst.plen = 128;
657 rt->rt6i_flags |= RTF_CACHE;
658 rt->u.dst.flags |= DST_HOST;
659 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
661 return rt;
664 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
665 struct flowi *fl, int flags)
667 struct fib6_node *fn;
668 struct rt6_info *rt, *nrt;
669 int strict = 0;
670 int attempts = 3;
671 int err;
672 int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
674 strict |= flags & RT6_LOOKUP_F_IFACE;
676 relookup:
677 read_lock_bh(&table->tb6_lock);
679 restart_2:
680 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
682 restart:
683 rt = rt6_select(fn, oif, strict | reachable);
684 BACKTRACK(&fl->fl6_src);
685 if (rt == &ip6_null_entry ||
686 rt->rt6i_flags & RTF_CACHE)
687 goto out;
689 dst_hold(&rt->u.dst);
690 read_unlock_bh(&table->tb6_lock);
692 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
693 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
694 else {
695 #if CLONE_OFFLINK_ROUTE
696 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
697 #else
698 goto out2;
699 #endif
702 dst_release(&rt->u.dst);
703 rt = nrt ? : &ip6_null_entry;
705 dst_hold(&rt->u.dst);
706 if (nrt) {
707 err = ip6_ins_rt(nrt);
708 if (!err)
709 goto out2;
712 if (--attempts <= 0)
713 goto out2;
716 * Race condition! In the gap, when table->tb6_lock was
717 * released someone could insert this route. Relookup.
719 dst_release(&rt->u.dst);
720 goto relookup;
722 out:
723 if (reachable) {
724 reachable = 0;
725 goto restart_2;
727 dst_hold(&rt->u.dst);
728 read_unlock_bh(&table->tb6_lock);
729 out2:
730 rt->u.dst.lastuse = jiffies;
731 rt->u.dst.__use++;
733 return rt;
736 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
737 struct flowi *fl, int flags)
739 return ip6_pol_route(table, fl->iif, fl, flags);
742 void ip6_route_input(struct sk_buff *skb)
744 struct ipv6hdr *iph = ipv6_hdr(skb);
745 int flags = RT6_LOOKUP_F_HAS_SADDR;
746 struct flowi fl = {
747 .iif = skb->dev->ifindex,
748 .nl_u = {
749 .ip6_u = {
750 .daddr = iph->daddr,
751 .saddr = iph->saddr,
752 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
755 .mark = skb->mark,
756 .proto = iph->nexthdr,
759 if (rt6_need_strict(&iph->daddr))
760 flags |= RT6_LOOKUP_F_IFACE;
762 skb->dst = fib6_rule_lookup(&init_net, &fl, flags, ip6_pol_route_input);
765 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
766 struct flowi *fl, int flags)
768 return ip6_pol_route(table, fl->oif, fl, flags);
771 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
773 int flags = 0;
775 if (rt6_need_strict(&fl->fl6_dst))
776 flags |= RT6_LOOKUP_F_IFACE;
778 if (!ipv6_addr_any(&fl->fl6_src))
779 flags |= RT6_LOOKUP_F_HAS_SADDR;
781 return fib6_rule_lookup(&init_net, fl, flags, ip6_pol_route_output);
784 EXPORT_SYMBOL(ip6_route_output);
786 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
788 struct rt6_info *ort = (struct rt6_info *) *dstp;
789 struct rt6_info *rt = (struct rt6_info *)
790 dst_alloc(&ip6_dst_blackhole_ops);
791 struct dst_entry *new = NULL;
793 if (rt) {
794 new = &rt->u.dst;
796 atomic_set(&new->__refcnt, 1);
797 new->__use = 1;
798 new->input = dst_discard;
799 new->output = dst_discard;
801 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
802 new->dev = ort->u.dst.dev;
803 if (new->dev)
804 dev_hold(new->dev);
805 rt->rt6i_idev = ort->rt6i_idev;
806 if (rt->rt6i_idev)
807 in6_dev_hold(rt->rt6i_idev);
808 rt->rt6i_expires = 0;
810 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
811 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
812 rt->rt6i_metric = 0;
814 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
815 #ifdef CONFIG_IPV6_SUBTREES
816 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
817 #endif
819 dst_free(new);
822 dst_release(*dstp);
823 *dstp = new;
824 return (new ? 0 : -ENOMEM);
826 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
829 * Destination cache support functions
832 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
834 struct rt6_info *rt;
836 rt = (struct rt6_info *) dst;
838 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
839 return dst;
841 return NULL;
844 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
846 struct rt6_info *rt = (struct rt6_info *) dst;
848 if (rt) {
849 if (rt->rt6i_flags & RTF_CACHE)
850 ip6_del_rt(rt);
851 else
852 dst_release(dst);
854 return NULL;
857 static void ip6_link_failure(struct sk_buff *skb)
859 struct rt6_info *rt;
861 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
863 rt = (struct rt6_info *) skb->dst;
864 if (rt) {
865 if (rt->rt6i_flags&RTF_CACHE) {
866 dst_set_expires(&rt->u.dst, 0);
867 rt->rt6i_flags |= RTF_EXPIRES;
868 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
869 rt->rt6i_node->fn_sernum = -1;
873 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
875 struct rt6_info *rt6 = (struct rt6_info*)dst;
877 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
878 rt6->rt6i_flags |= RTF_MODIFIED;
879 if (mtu < IPV6_MIN_MTU) {
880 mtu = IPV6_MIN_MTU;
881 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
883 dst->metrics[RTAX_MTU-1] = mtu;
884 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
888 static int ipv6_get_mtu(struct net_device *dev);
890 static inline unsigned int ipv6_advmss(unsigned int mtu)
892 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
894 if (mtu < init_net.ipv6.sysctl.ip6_rt_min_advmss)
895 mtu = init_net.ipv6.sysctl.ip6_rt_min_advmss;
898 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
899 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
900 * IPV6_MAXPLEN is also valid and means: "any MSS,
901 * rely only on pmtu discovery"
903 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
904 mtu = IPV6_MAXPLEN;
905 return mtu;
908 static struct dst_entry *icmp6_dst_gc_list;
909 static DEFINE_SPINLOCK(icmp6_dst_lock);
911 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
912 struct neighbour *neigh,
913 struct in6_addr *addr)
915 struct rt6_info *rt;
916 struct inet6_dev *idev = in6_dev_get(dev);
918 if (unlikely(idev == NULL))
919 return NULL;
921 rt = ip6_dst_alloc();
922 if (unlikely(rt == NULL)) {
923 in6_dev_put(idev);
924 goto out;
927 dev_hold(dev);
928 if (neigh)
929 neigh_hold(neigh);
930 else
931 neigh = ndisc_get_neigh(dev, addr);
933 rt->rt6i_dev = dev;
934 rt->rt6i_idev = idev;
935 rt->rt6i_nexthop = neigh;
936 atomic_set(&rt->u.dst.__refcnt, 1);
937 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
938 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
939 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
940 rt->u.dst.output = ip6_output;
942 #if 0 /* there's no chance to use these for ndisc */
943 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
944 ? DST_HOST
945 : 0;
946 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
947 rt->rt6i_dst.plen = 128;
948 #endif
950 spin_lock_bh(&icmp6_dst_lock);
951 rt->u.dst.next = icmp6_dst_gc_list;
952 icmp6_dst_gc_list = &rt->u.dst;
953 spin_unlock_bh(&icmp6_dst_lock);
955 fib6_force_start_gc();
957 out:
958 return &rt->u.dst;
961 int icmp6_dst_gc(int *more)
963 struct dst_entry *dst, *next, **pprev;
964 int freed;
966 next = NULL;
967 freed = 0;
969 spin_lock_bh(&icmp6_dst_lock);
970 pprev = &icmp6_dst_gc_list;
972 while ((dst = *pprev) != NULL) {
973 if (!atomic_read(&dst->__refcnt)) {
974 *pprev = dst->next;
975 dst_free(dst);
976 freed++;
977 } else {
978 pprev = &dst->next;
979 (*more)++;
983 spin_unlock_bh(&icmp6_dst_lock);
985 return freed;
988 static int ip6_dst_gc(struct dst_ops *ops)
990 static unsigned expire = 30*HZ;
991 static unsigned long last_gc;
992 unsigned long now = jiffies;
994 if (time_after(last_gc + init_net.ipv6.sysctl.ip6_rt_gc_min_interval, now) &&
995 atomic_read(&ip6_dst_ops.entries) <= init_net.ipv6.sysctl.ip6_rt_max_size)
996 goto out;
998 expire++;
999 fib6_run_gc(expire, &init_net);
1000 last_gc = now;
1001 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1002 expire = init_net.ipv6.sysctl.ip6_rt_gc_timeout>>1;
1004 out:
1005 expire -= expire>>init_net.ipv6.sysctl.ip6_rt_gc_elasticity;
1006 return (atomic_read(&ip6_dst_ops.entries) > init_net.ipv6.sysctl.ip6_rt_max_size);
1009 /* Clean host part of a prefix. Not necessary in radix tree,
1010 but results in cleaner routing tables.
1012 Remove it only when all the things will work!
1015 static int ipv6_get_mtu(struct net_device *dev)
1017 int mtu = IPV6_MIN_MTU;
1018 struct inet6_dev *idev;
1020 idev = in6_dev_get(dev);
1021 if (idev) {
1022 mtu = idev->cnf.mtu6;
1023 in6_dev_put(idev);
1025 return mtu;
1028 int ipv6_get_hoplimit(struct net_device *dev)
1030 int hoplimit = ipv6_devconf.hop_limit;
1031 struct inet6_dev *idev;
1033 idev = in6_dev_get(dev);
1034 if (idev) {
1035 hoplimit = idev->cnf.hop_limit;
1036 in6_dev_put(idev);
1038 return hoplimit;
1045 int ip6_route_add(struct fib6_config *cfg)
1047 int err;
1048 struct rt6_info *rt = NULL;
1049 struct net_device *dev = NULL;
1050 struct inet6_dev *idev = NULL;
1051 struct fib6_table *table;
1052 int addr_type;
1054 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1055 return -EINVAL;
1056 #ifndef CONFIG_IPV6_SUBTREES
1057 if (cfg->fc_src_len)
1058 return -EINVAL;
1059 #endif
1060 if (cfg->fc_ifindex) {
1061 err = -ENODEV;
1062 dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1063 if (!dev)
1064 goto out;
1065 idev = in6_dev_get(dev);
1066 if (!idev)
1067 goto out;
1070 if (cfg->fc_metric == 0)
1071 cfg->fc_metric = IP6_RT_PRIO_USER;
1073 table = fib6_new_table(&init_net, cfg->fc_table);
1074 if (table == NULL) {
1075 err = -ENOBUFS;
1076 goto out;
1079 rt = ip6_dst_alloc();
1081 if (rt == NULL) {
1082 err = -ENOMEM;
1083 goto out;
1086 rt->u.dst.obsolete = -1;
1087 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1089 if (cfg->fc_protocol == RTPROT_UNSPEC)
1090 cfg->fc_protocol = RTPROT_BOOT;
1091 rt->rt6i_protocol = cfg->fc_protocol;
1093 addr_type = ipv6_addr_type(&cfg->fc_dst);
1095 if (addr_type & IPV6_ADDR_MULTICAST)
1096 rt->u.dst.input = ip6_mc_input;
1097 else
1098 rt->u.dst.input = ip6_forward;
1100 rt->u.dst.output = ip6_output;
1102 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1103 rt->rt6i_dst.plen = cfg->fc_dst_len;
1104 if (rt->rt6i_dst.plen == 128)
1105 rt->u.dst.flags = DST_HOST;
1107 #ifdef CONFIG_IPV6_SUBTREES
1108 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1109 rt->rt6i_src.plen = cfg->fc_src_len;
1110 #endif
1112 rt->rt6i_metric = cfg->fc_metric;
1114 /* We cannot add true routes via loopback here,
1115 they would result in kernel looping; promote them to reject routes
1117 if ((cfg->fc_flags & RTF_REJECT) ||
1118 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1119 /* hold loopback dev/idev if we haven't done so. */
1120 if (dev != init_net.loopback_dev) {
1121 if (dev) {
1122 dev_put(dev);
1123 in6_dev_put(idev);
1125 dev = init_net.loopback_dev;
1126 dev_hold(dev);
1127 idev = in6_dev_get(dev);
1128 if (!idev) {
1129 err = -ENODEV;
1130 goto out;
1133 rt->u.dst.output = ip6_pkt_discard_out;
1134 rt->u.dst.input = ip6_pkt_discard;
1135 rt->u.dst.error = -ENETUNREACH;
1136 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1137 goto install_route;
1140 if (cfg->fc_flags & RTF_GATEWAY) {
1141 struct in6_addr *gw_addr;
1142 int gwa_type;
1144 gw_addr = &cfg->fc_gateway;
1145 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1146 gwa_type = ipv6_addr_type(gw_addr);
1148 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1149 struct rt6_info *grt;
1151 /* IPv6 strictly inhibits using not link-local
1152 addresses as nexthop address.
1153 Otherwise, router will not able to send redirects.
1154 It is very good, but in some (rare!) circumstances
1155 (SIT, PtP, NBMA NOARP links) it is handy to allow
1156 some exceptions. --ANK
1158 err = -EINVAL;
1159 if (!(gwa_type&IPV6_ADDR_UNICAST))
1160 goto out;
1162 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1164 err = -EHOSTUNREACH;
1165 if (grt == NULL)
1166 goto out;
1167 if (dev) {
1168 if (dev != grt->rt6i_dev) {
1169 dst_release(&grt->u.dst);
1170 goto out;
1172 } else {
1173 dev = grt->rt6i_dev;
1174 idev = grt->rt6i_idev;
1175 dev_hold(dev);
1176 in6_dev_hold(grt->rt6i_idev);
1178 if (!(grt->rt6i_flags&RTF_GATEWAY))
1179 err = 0;
1180 dst_release(&grt->u.dst);
1182 if (err)
1183 goto out;
1185 err = -EINVAL;
1186 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1187 goto out;
1190 err = -ENODEV;
1191 if (dev == NULL)
1192 goto out;
1194 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1195 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1196 if (IS_ERR(rt->rt6i_nexthop)) {
1197 err = PTR_ERR(rt->rt6i_nexthop);
1198 rt->rt6i_nexthop = NULL;
1199 goto out;
1203 rt->rt6i_flags = cfg->fc_flags;
1205 install_route:
1206 if (cfg->fc_mx) {
1207 struct nlattr *nla;
1208 int remaining;
1210 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1211 int type = nla_type(nla);
1213 if (type) {
1214 if (type > RTAX_MAX) {
1215 err = -EINVAL;
1216 goto out;
1219 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1224 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1225 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1226 if (!rt->u.dst.metrics[RTAX_MTU-1])
1227 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1228 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1229 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1230 rt->u.dst.dev = dev;
1231 rt->rt6i_idev = idev;
1232 rt->rt6i_table = table;
1233 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1235 out:
1236 if (dev)
1237 dev_put(dev);
1238 if (idev)
1239 in6_dev_put(idev);
1240 if (rt)
1241 dst_free(&rt->u.dst);
1242 return err;
1245 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1247 int err;
1248 struct fib6_table *table;
1250 if (rt == &ip6_null_entry)
1251 return -ENOENT;
1253 table = rt->rt6i_table;
1254 write_lock_bh(&table->tb6_lock);
1256 err = fib6_del(rt, info);
1257 dst_release(&rt->u.dst);
1259 write_unlock_bh(&table->tb6_lock);
1261 return err;
1264 int ip6_del_rt(struct rt6_info *rt)
1266 struct nl_info info = {
1267 .nl_net = &init_net,
1269 return __ip6_del_rt(rt, &info);
1272 static int ip6_route_del(struct fib6_config *cfg)
1274 struct fib6_table *table;
1275 struct fib6_node *fn;
1276 struct rt6_info *rt;
1277 int err = -ESRCH;
1279 table = fib6_get_table(&init_net, cfg->fc_table);
1280 if (table == NULL)
1281 return err;
1283 read_lock_bh(&table->tb6_lock);
1285 fn = fib6_locate(&table->tb6_root,
1286 &cfg->fc_dst, cfg->fc_dst_len,
1287 &cfg->fc_src, cfg->fc_src_len);
1289 if (fn) {
1290 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1291 if (cfg->fc_ifindex &&
1292 (rt->rt6i_dev == NULL ||
1293 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1294 continue;
1295 if (cfg->fc_flags & RTF_GATEWAY &&
1296 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1297 continue;
1298 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1299 continue;
1300 dst_hold(&rt->u.dst);
1301 read_unlock_bh(&table->tb6_lock);
1303 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1306 read_unlock_bh(&table->tb6_lock);
1308 return err;
1312 * Handle redirects
1314 struct ip6rd_flowi {
1315 struct flowi fl;
1316 struct in6_addr gateway;
1319 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1320 struct flowi *fl,
1321 int flags)
1323 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1324 struct rt6_info *rt;
1325 struct fib6_node *fn;
1328 * Get the "current" route for this destination and
1329 * check if the redirect has come from approriate router.
1331 * RFC 2461 specifies that redirects should only be
1332 * accepted if they come from the nexthop to the target.
1333 * Due to the way the routes are chosen, this notion
1334 * is a bit fuzzy and one might need to check all possible
1335 * routes.
1338 read_lock_bh(&table->tb6_lock);
1339 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1340 restart:
1341 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1343 * Current route is on-link; redirect is always invalid.
1345 * Seems, previous statement is not true. It could
1346 * be node, which looks for us as on-link (f.e. proxy ndisc)
1347 * But then router serving it might decide, that we should
1348 * know truth 8)8) --ANK (980726).
1350 if (rt6_check_expired(rt))
1351 continue;
1352 if (!(rt->rt6i_flags & RTF_GATEWAY))
1353 continue;
1354 if (fl->oif != rt->rt6i_dev->ifindex)
1355 continue;
1356 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1357 continue;
1358 break;
1361 if (!rt)
1362 rt = &ip6_null_entry;
1363 BACKTRACK(&fl->fl6_src);
1364 out:
1365 dst_hold(&rt->u.dst);
1367 read_unlock_bh(&table->tb6_lock);
1369 return rt;
1372 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1373 struct in6_addr *src,
1374 struct in6_addr *gateway,
1375 struct net_device *dev)
1377 int flags = RT6_LOOKUP_F_HAS_SADDR;
1378 struct ip6rd_flowi rdfl = {
1379 .fl = {
1380 .oif = dev->ifindex,
1381 .nl_u = {
1382 .ip6_u = {
1383 .daddr = *dest,
1384 .saddr = *src,
1388 .gateway = *gateway,
1391 if (rt6_need_strict(dest))
1392 flags |= RT6_LOOKUP_F_IFACE;
1394 return (struct rt6_info *)fib6_rule_lookup(&init_net,
1395 (struct flowi *)&rdfl,
1396 flags, __ip6_route_redirect);
1399 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1400 struct in6_addr *saddr,
1401 struct neighbour *neigh, u8 *lladdr, int on_link)
1403 struct rt6_info *rt, *nrt = NULL;
1404 struct netevent_redirect netevent;
1406 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1408 if (rt == &ip6_null_entry) {
1409 if (net_ratelimit())
1410 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1411 "for redirect target\n");
1412 goto out;
1416 * We have finally decided to accept it.
1419 neigh_update(neigh, lladdr, NUD_STALE,
1420 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1421 NEIGH_UPDATE_F_OVERRIDE|
1422 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1423 NEIGH_UPDATE_F_ISROUTER))
1427 * Redirect received -> path was valid.
1428 * Look, redirects are sent only in response to data packets,
1429 * so that this nexthop apparently is reachable. --ANK
1431 dst_confirm(&rt->u.dst);
1433 /* Duplicate redirect: silently ignore. */
1434 if (neigh == rt->u.dst.neighbour)
1435 goto out;
1437 nrt = ip6_rt_copy(rt);
1438 if (nrt == NULL)
1439 goto out;
1441 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1442 if (on_link)
1443 nrt->rt6i_flags &= ~RTF_GATEWAY;
1445 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1446 nrt->rt6i_dst.plen = 128;
1447 nrt->u.dst.flags |= DST_HOST;
1449 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1450 nrt->rt6i_nexthop = neigh_clone(neigh);
1451 /* Reset pmtu, it may be better */
1452 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1453 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1455 if (ip6_ins_rt(nrt))
1456 goto out;
1458 netevent.old = &rt->u.dst;
1459 netevent.new = &nrt->u.dst;
1460 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1462 if (rt->rt6i_flags&RTF_CACHE) {
1463 ip6_del_rt(rt);
1464 return;
1467 out:
1468 dst_release(&rt->u.dst);
1469 return;
1473 * Handle ICMP "packet too big" messages
1474 * i.e. Path MTU discovery
1477 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1478 struct net_device *dev, u32 pmtu)
1480 struct rt6_info *rt, *nrt;
1481 int allfrag = 0;
1483 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1484 if (rt == NULL)
1485 return;
1487 if (pmtu >= dst_mtu(&rt->u.dst))
1488 goto out;
1490 if (pmtu < IPV6_MIN_MTU) {
1492 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1493 * MTU (1280) and a fragment header should always be included
1494 * after a node receiving Too Big message reporting PMTU is
1495 * less than the IPv6 Minimum Link MTU.
1497 pmtu = IPV6_MIN_MTU;
1498 allfrag = 1;
1501 /* New mtu received -> path was valid.
1502 They are sent only in response to data packets,
1503 so that this nexthop apparently is reachable. --ANK
1505 dst_confirm(&rt->u.dst);
1507 /* Host route. If it is static, it would be better
1508 not to override it, but add new one, so that
1509 when cache entry will expire old pmtu
1510 would return automatically.
1512 if (rt->rt6i_flags & RTF_CACHE) {
1513 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1514 if (allfrag)
1515 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1516 dst_set_expires(&rt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires);
1517 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1518 goto out;
1521 /* Network route.
1522 Two cases are possible:
1523 1. It is connected route. Action: COW
1524 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1526 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1527 nrt = rt6_alloc_cow(rt, daddr, saddr);
1528 else
1529 nrt = rt6_alloc_clone(rt, daddr);
1531 if (nrt) {
1532 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1533 if (allfrag)
1534 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1536 /* According to RFC 1981, detecting PMTU increase shouldn't be
1537 * happened within 5 mins, the recommended timer is 10 mins.
1538 * Here this route expiration time is set to ip6_rt_mtu_expires
1539 * which is 10 mins. After 10 mins the decreased pmtu is expired
1540 * and detecting PMTU increase will be automatically happened.
1542 dst_set_expires(&nrt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires);
1543 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1545 ip6_ins_rt(nrt);
1547 out:
1548 dst_release(&rt->u.dst);
1552 * Misc support functions
1555 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1557 struct rt6_info *rt = ip6_dst_alloc();
1559 if (rt) {
1560 rt->u.dst.input = ort->u.dst.input;
1561 rt->u.dst.output = ort->u.dst.output;
1563 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1564 rt->u.dst.error = ort->u.dst.error;
1565 rt->u.dst.dev = ort->u.dst.dev;
1566 if (rt->u.dst.dev)
1567 dev_hold(rt->u.dst.dev);
1568 rt->rt6i_idev = ort->rt6i_idev;
1569 if (rt->rt6i_idev)
1570 in6_dev_hold(rt->rt6i_idev);
1571 rt->u.dst.lastuse = jiffies;
1572 rt->rt6i_expires = 0;
1574 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1575 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1576 rt->rt6i_metric = 0;
1578 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1579 #ifdef CONFIG_IPV6_SUBTREES
1580 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1581 #endif
1582 rt->rt6i_table = ort->rt6i_table;
1584 return rt;
1587 #ifdef CONFIG_IPV6_ROUTE_INFO
1588 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1589 struct in6_addr *gwaddr, int ifindex)
1591 struct fib6_node *fn;
1592 struct rt6_info *rt = NULL;
1593 struct fib6_table *table;
1595 table = fib6_get_table(&init_net, RT6_TABLE_INFO);
1596 if (table == NULL)
1597 return NULL;
1599 write_lock_bh(&table->tb6_lock);
1600 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1601 if (!fn)
1602 goto out;
1604 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1605 if (rt->rt6i_dev->ifindex != ifindex)
1606 continue;
1607 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1608 continue;
1609 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1610 continue;
1611 dst_hold(&rt->u.dst);
1612 break;
1614 out:
1615 write_unlock_bh(&table->tb6_lock);
1616 return rt;
1619 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1620 struct in6_addr *gwaddr, int ifindex,
1621 unsigned pref)
1623 struct fib6_config cfg = {
1624 .fc_table = RT6_TABLE_INFO,
1625 .fc_metric = IP6_RT_PRIO_USER,
1626 .fc_ifindex = ifindex,
1627 .fc_dst_len = prefixlen,
1628 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1629 RTF_UP | RTF_PREF(pref),
1632 ipv6_addr_copy(&cfg.fc_dst, prefix);
1633 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1635 /* We should treat it as a default route if prefix length is 0. */
1636 if (!prefixlen)
1637 cfg.fc_flags |= RTF_DEFAULT;
1639 ip6_route_add(&cfg);
1641 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1643 #endif
1645 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1647 struct rt6_info *rt;
1648 struct fib6_table *table;
1650 table = fib6_get_table(&init_net, RT6_TABLE_DFLT);
1651 if (table == NULL)
1652 return NULL;
1654 write_lock_bh(&table->tb6_lock);
1655 for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1656 if (dev == rt->rt6i_dev &&
1657 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1658 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1659 break;
1661 if (rt)
1662 dst_hold(&rt->u.dst);
1663 write_unlock_bh(&table->tb6_lock);
1664 return rt;
1667 EXPORT_SYMBOL(rt6_get_dflt_router);
1669 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1670 struct net_device *dev,
1671 unsigned int pref)
1673 struct fib6_config cfg = {
1674 .fc_table = RT6_TABLE_DFLT,
1675 .fc_metric = IP6_RT_PRIO_USER,
1676 .fc_ifindex = dev->ifindex,
1677 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1678 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1681 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1683 ip6_route_add(&cfg);
1685 return rt6_get_dflt_router(gwaddr, dev);
1688 void rt6_purge_dflt_routers(void)
1690 struct rt6_info *rt;
1691 struct fib6_table *table;
1693 /* NOTE: Keep consistent with rt6_get_dflt_router */
1694 table = fib6_get_table(&init_net, RT6_TABLE_DFLT);
1695 if (table == NULL)
1696 return;
1698 restart:
1699 read_lock_bh(&table->tb6_lock);
1700 for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1701 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1702 dst_hold(&rt->u.dst);
1703 read_unlock_bh(&table->tb6_lock);
1704 ip6_del_rt(rt);
1705 goto restart;
1708 read_unlock_bh(&table->tb6_lock);
1711 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1712 struct fib6_config *cfg)
1714 memset(cfg, 0, sizeof(*cfg));
1716 cfg->fc_table = RT6_TABLE_MAIN;
1717 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1718 cfg->fc_metric = rtmsg->rtmsg_metric;
1719 cfg->fc_expires = rtmsg->rtmsg_info;
1720 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1721 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1722 cfg->fc_flags = rtmsg->rtmsg_flags;
1724 cfg->fc_nlinfo.nl_net = &init_net;
1726 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1727 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1728 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1731 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1733 struct fib6_config cfg;
1734 struct in6_rtmsg rtmsg;
1735 int err;
1737 switch(cmd) {
1738 case SIOCADDRT: /* Add a route */
1739 case SIOCDELRT: /* Delete a route */
1740 if (!capable(CAP_NET_ADMIN))
1741 return -EPERM;
1742 err = copy_from_user(&rtmsg, arg,
1743 sizeof(struct in6_rtmsg));
1744 if (err)
1745 return -EFAULT;
1747 rtmsg_to_fib6_config(&rtmsg, &cfg);
1749 rtnl_lock();
1750 switch (cmd) {
1751 case SIOCADDRT:
1752 err = ip6_route_add(&cfg);
1753 break;
1754 case SIOCDELRT:
1755 err = ip6_route_del(&cfg);
1756 break;
1757 default:
1758 err = -EINVAL;
1760 rtnl_unlock();
1762 return err;
1765 return -EINVAL;
1769 * Drop the packet on the floor
1772 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1774 int type;
1775 switch (ipstats_mib_noroutes) {
1776 case IPSTATS_MIB_INNOROUTES:
1777 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1778 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1779 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1780 break;
1782 /* FALLTHROUGH */
1783 case IPSTATS_MIB_OUTNOROUTES:
1784 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1785 break;
1787 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1788 kfree_skb(skb);
1789 return 0;
1792 static int ip6_pkt_discard(struct sk_buff *skb)
1794 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1797 static int ip6_pkt_discard_out(struct sk_buff *skb)
1799 skb->dev = skb->dst->dev;
1800 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Input-path drop that replies with ICMPV6_ADM_PROHIBITED. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

/*
 * Output-path drop that replies with ICMPV6_ADM_PROHIBITED; skb->dev is set
 * to the route's device before the drop is processed.
 */
static int ip6_pkt_prohibit_out(struct sk_buff *skb)
{
	skb->dev = skb->dst->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

#endif
1819 * Allocate a dst for local (unicast / anycast) address.
1822 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1823 const struct in6_addr *addr,
1824 int anycast)
1826 struct rt6_info *rt = ip6_dst_alloc();
1828 if (rt == NULL)
1829 return ERR_PTR(-ENOMEM);
1831 dev_hold(init_net.loopback_dev);
1832 in6_dev_hold(idev);
1834 rt->u.dst.flags = DST_HOST;
1835 rt->u.dst.input = ip6_input;
1836 rt->u.dst.output = ip6_output;
1837 rt->rt6i_dev = init_net.loopback_dev;
1838 rt->rt6i_idev = idev;
1839 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1840 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1841 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1842 rt->u.dst.obsolete = -1;
1844 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1845 if (anycast)
1846 rt->rt6i_flags |= RTF_ANYCAST;
1847 else
1848 rt->rt6i_flags |= RTF_LOCAL;
1849 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1850 if (rt->rt6i_nexthop == NULL) {
1851 dst_free(&rt->u.dst);
1852 return ERR_PTR(-ENOMEM);
1855 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1856 rt->rt6i_dst.plen = 128;
1857 rt->rt6i_table = fib6_get_table(&init_net, RT6_TABLE_LOCAL);
1859 atomic_set(&rt->u.dst.__refcnt, 1);
1861 return rt;
1864 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1866 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1867 rt != &ip6_null_entry) {
1868 RT6_TRACE("deleted by ifdown %p\n", rt);
1869 return -1;
1871 return 0;
1874 void rt6_ifdown(struct net *net, struct net_device *dev)
1876 fib6_clean_all(net, fib6_ifdown, 0, dev);
/* Argument bundle passed to rt6_mtu_change_route() via fib6_clean_all(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1885 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1887 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1888 struct inet6_dev *idev;
1890 /* In IPv6 pmtu discovery is not optional,
1891 so that RTAX_MTU lock cannot disable it.
1892 We still use this lock to block changes
1893 caused by addrconf/ndisc.
1896 idev = __in6_dev_get(arg->dev);
1897 if (idev == NULL)
1898 return 0;
1900 /* For administrative MTU increase, there is no way to discover
1901 IPv6 PMTU increase, so PMTU increase should be updated here.
1902 Since RFC 1981 doesn't include administrative MTU increase
1903 update PMTU increase is a MUST. (i.e. jumbo frame)
1906 If new MTU is less than route PMTU, this new MTU will be the
1907 lowest MTU in the path, update the route PMTU to reflect PMTU
1908 decreases; if new MTU is greater than route PMTU, and the
1909 old MTU is the lowest MTU in the path, update the route PMTU
1910 to reflect the increase. In this case if the other nodes' MTU
1911 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1912 PMTU discouvery.
1914 if (rt->rt6i_dev == arg->dev &&
1915 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1916 (dst_mtu(&rt->u.dst) >= arg->mtu ||
1917 (dst_mtu(&rt->u.dst) < arg->mtu &&
1918 dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1919 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1920 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1922 return 0;
1925 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1927 struct rt6_mtu_change_arg arg = {
1928 .dev = dev,
1929 .mtu = mtu,
1932 fib6_clean_all(dev->nd_net, rt6_mtu_change_route, 0, &arg);
1935 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1936 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
1937 [RTA_OIF] = { .type = NLA_U32 },
1938 [RTA_IIF] = { .type = NLA_U32 },
1939 [RTA_PRIORITY] = { .type = NLA_U32 },
1940 [RTA_METRICS] = { .type = NLA_NESTED },
1943 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1944 struct fib6_config *cfg)
1946 struct rtmsg *rtm;
1947 struct nlattr *tb[RTA_MAX+1];
1948 int err;
1950 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1951 if (err < 0)
1952 goto errout;
1954 err = -EINVAL;
1955 rtm = nlmsg_data(nlh);
1956 memset(cfg, 0, sizeof(*cfg));
1958 cfg->fc_table = rtm->rtm_table;
1959 cfg->fc_dst_len = rtm->rtm_dst_len;
1960 cfg->fc_src_len = rtm->rtm_src_len;
1961 cfg->fc_flags = RTF_UP;
1962 cfg->fc_protocol = rtm->rtm_protocol;
1964 if (rtm->rtm_type == RTN_UNREACHABLE)
1965 cfg->fc_flags |= RTF_REJECT;
1967 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1968 cfg->fc_nlinfo.nlh = nlh;
1969 cfg->fc_nlinfo.nl_net = skb->sk->sk_net;
1971 if (tb[RTA_GATEWAY]) {
1972 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1973 cfg->fc_flags |= RTF_GATEWAY;
1976 if (tb[RTA_DST]) {
1977 int plen = (rtm->rtm_dst_len + 7) >> 3;
1979 if (nla_len(tb[RTA_DST]) < plen)
1980 goto errout;
1982 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1985 if (tb[RTA_SRC]) {
1986 int plen = (rtm->rtm_src_len + 7) >> 3;
1988 if (nla_len(tb[RTA_SRC]) < plen)
1989 goto errout;
1991 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1994 if (tb[RTA_OIF])
1995 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1997 if (tb[RTA_PRIORITY])
1998 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2000 if (tb[RTA_METRICS]) {
2001 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2002 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2005 if (tb[RTA_TABLE])
2006 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2008 err = 0;
2009 errout:
2010 return err;
2013 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2015 struct net *net = skb->sk->sk_net;
2016 struct fib6_config cfg;
2017 int err;
2019 if (net != &init_net)
2020 return -EINVAL;
2022 err = rtm_to_fib6_config(skb, nlh, &cfg);
2023 if (err < 0)
2024 return err;
2026 return ip6_route_del(&cfg);
2029 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2031 struct net *net = skb->sk->sk_net;
2032 struct fib6_config cfg;
2033 int err;
2035 if (net != &init_net)
2036 return -EINVAL;
2038 err = rtm_to_fib6_config(skb, nlh, &cfg);
2039 if (err < 0)
2040 return err;
2042 return ip6_route_add(&cfg);
2045 static inline size_t rt6_nlmsg_size(void)
2047 return NLMSG_ALIGN(sizeof(struct rtmsg))
2048 + nla_total_size(16) /* RTA_SRC */
2049 + nla_total_size(16) /* RTA_DST */
2050 + nla_total_size(16) /* RTA_GATEWAY */
2051 + nla_total_size(16) /* RTA_PREFSRC */
2052 + nla_total_size(4) /* RTA_TABLE */
2053 + nla_total_size(4) /* RTA_IIF */
2054 + nla_total_size(4) /* RTA_OIF */
2055 + nla_total_size(4) /* RTA_PRIORITY */
2056 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2057 + nla_total_size(sizeof(struct rta_cacheinfo));
2060 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2061 struct in6_addr *dst, struct in6_addr *src,
2062 int iif, int type, u32 pid, u32 seq,
2063 int prefix, unsigned int flags)
2065 struct rtmsg *rtm;
2066 struct nlmsghdr *nlh;
2067 long expires;
2068 u32 table;
2070 if (prefix) { /* user wants prefix routes only */
2071 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2072 /* success since this is not a prefix route */
2073 return 1;
2077 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2078 if (nlh == NULL)
2079 return -EMSGSIZE;
2081 rtm = nlmsg_data(nlh);
2082 rtm->rtm_family = AF_INET6;
2083 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2084 rtm->rtm_src_len = rt->rt6i_src.plen;
2085 rtm->rtm_tos = 0;
2086 if (rt->rt6i_table)
2087 table = rt->rt6i_table->tb6_id;
2088 else
2089 table = RT6_TABLE_UNSPEC;
2090 rtm->rtm_table = table;
2091 NLA_PUT_U32(skb, RTA_TABLE, table);
2092 if (rt->rt6i_flags&RTF_REJECT)
2093 rtm->rtm_type = RTN_UNREACHABLE;
2094 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2095 rtm->rtm_type = RTN_LOCAL;
2096 else
2097 rtm->rtm_type = RTN_UNICAST;
2098 rtm->rtm_flags = 0;
2099 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2100 rtm->rtm_protocol = rt->rt6i_protocol;
2101 if (rt->rt6i_flags&RTF_DYNAMIC)
2102 rtm->rtm_protocol = RTPROT_REDIRECT;
2103 else if (rt->rt6i_flags & RTF_ADDRCONF)
2104 rtm->rtm_protocol = RTPROT_KERNEL;
2105 else if (rt->rt6i_flags&RTF_DEFAULT)
2106 rtm->rtm_protocol = RTPROT_RA;
2108 if (rt->rt6i_flags&RTF_CACHE)
2109 rtm->rtm_flags |= RTM_F_CLONED;
2111 if (dst) {
2112 NLA_PUT(skb, RTA_DST, 16, dst);
2113 rtm->rtm_dst_len = 128;
2114 } else if (rtm->rtm_dst_len)
2115 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2116 #ifdef CONFIG_IPV6_SUBTREES
2117 if (src) {
2118 NLA_PUT(skb, RTA_SRC, 16, src);
2119 rtm->rtm_src_len = 128;
2120 } else if (rtm->rtm_src_len)
2121 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2122 #endif
2123 if (iif)
2124 NLA_PUT_U32(skb, RTA_IIF, iif);
2125 else if (dst) {
2126 struct in6_addr saddr_buf;
2127 if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2128 dst, &saddr_buf) == 0)
2129 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2132 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2133 goto nla_put_failure;
2135 if (rt->u.dst.neighbour)
2136 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2138 if (rt->u.dst.dev)
2139 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2141 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2143 expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2144 if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2145 expires, rt->u.dst.error) < 0)
2146 goto nla_put_failure;
2148 return nlmsg_end(skb, nlh);
2150 nla_put_failure:
2151 nlmsg_cancel(skb, nlh);
2152 return -EMSGSIZE;
2155 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2157 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2158 int prefix;
2160 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2161 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2162 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2163 } else
2164 prefix = 0;
2166 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2167 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2168 prefix, NLM_F_MULTI);
2171 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2173 struct net *net = in_skb->sk->sk_net;
2174 struct nlattr *tb[RTA_MAX+1];
2175 struct rt6_info *rt;
2176 struct sk_buff *skb;
2177 struct rtmsg *rtm;
2178 struct flowi fl;
2179 int err, iif = 0;
2181 if (net != &init_net)
2182 return -EINVAL;
2184 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2185 if (err < 0)
2186 goto errout;
2188 err = -EINVAL;
2189 memset(&fl, 0, sizeof(fl));
2191 if (tb[RTA_SRC]) {
2192 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2193 goto errout;
2195 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2198 if (tb[RTA_DST]) {
2199 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2200 goto errout;
2202 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2205 if (tb[RTA_IIF])
2206 iif = nla_get_u32(tb[RTA_IIF]);
2208 if (tb[RTA_OIF])
2209 fl.oif = nla_get_u32(tb[RTA_OIF]);
2211 if (iif) {
2212 struct net_device *dev;
2213 dev = __dev_get_by_index(&init_net, iif);
2214 if (!dev) {
2215 err = -ENODEV;
2216 goto errout;
2220 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2221 if (skb == NULL) {
2222 err = -ENOBUFS;
2223 goto errout;
2226 /* Reserve room for dummy headers, this skb can pass
2227 through good chunk of routing engine.
2229 skb_reset_mac_header(skb);
2230 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2232 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2233 skb->dst = &rt->u.dst;
2235 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2236 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2237 nlh->nlmsg_seq, 0, 0);
2238 if (err < 0) {
2239 kfree_skb(skb);
2240 goto errout;
2243 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2244 errout:
2245 return err;
2248 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2250 struct sk_buff *skb;
2251 u32 seq;
2252 int err;
2254 err = -ENOBUFS;
2255 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2257 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2258 if (skb == NULL)
2259 goto errout;
2261 err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2262 event, info->pid, seq, 0, 0);
2263 if (err < 0) {
2264 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2265 WARN_ON(err == -EMSGSIZE);
2266 kfree_skb(skb);
2267 goto errout;
2269 err = rtnl_notify(skb, &init_net, info->pid,
2270 RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any());
2271 errout:
2272 if (err < 0)
2273 rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_ROUTE, err);
2277 * /proc
2280 #ifdef CONFIG_PROC_FS
2282 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2284 struct rt6_proc_arg
2286 char *buffer;
2287 int offset;
2288 int length;
2289 int skip;
2290 int len;
2293 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2295 struct seq_file *m = p_arg;
2297 seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2298 rt->rt6i_dst.plen);
2300 #ifdef CONFIG_IPV6_SUBTREES
2301 seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2302 rt->rt6i_src.plen);
2303 #else
2304 seq_puts(m, "00000000000000000000000000000000 00 ");
2305 #endif
2307 if (rt->rt6i_nexthop) {
2308 seq_printf(m, NIP6_SEQFMT,
2309 NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2310 } else {
2311 seq_puts(m, "00000000000000000000000000000000");
2313 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2314 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2315 rt->u.dst.__use, rt->rt6i_flags,
2316 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2317 return 0;
2320 static int ipv6_route_show(struct seq_file *m, void *v)
2322 struct net *net = (struct net *)m->private;
2323 fib6_clean_all(net, rt6_info_route, 0, m);
2324 return 0;
2327 static int ipv6_route_open(struct inode *inode, struct file *file)
2329 struct net *net = get_proc_net(inode);
2330 if (!net)
2331 return -ENXIO;
2332 return single_open(file, ipv6_route_show, net);
2335 static int ipv6_route_release(struct inode *inode, struct file *file)
2337 struct seq_file *seq = file->private_data;
2338 struct net *net = seq->private;
2339 put_net(net);
2340 return single_release(inode, file);
2343 static const struct file_operations ipv6_route_proc_fops = {
2344 .owner = THIS_MODULE,
2345 .open = ipv6_route_open,
2346 .read = seq_read,
2347 .llseek = seq_lseek,
2348 .release = ipv6_route_release,
2351 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2353 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2354 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2355 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2356 rt6_stats.fib_rt_cache,
2357 atomic_read(&ip6_dst_ops.entries),
2358 rt6_stats.fib_discarded_routes);
2360 return 0;
2363 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2365 return single_open(file, rt6_stats_seq_show, NULL);
2368 static const struct file_operations rt6_stats_seq_fops = {
2369 .owner = THIS_MODULE,
2370 .open = rt6_stats_seq_open,
2371 .read = seq_read,
2372 .llseek = seq_lseek,
2373 .release = single_release,
2376 static int ipv6_route_proc_init(struct net *net)
2378 int ret = -ENOMEM;
2379 if (!proc_net_fops_create(net, "ipv6_route",
2380 0, &ipv6_route_proc_fops))
2381 goto out;
2383 if (!proc_net_fops_create(net, "rt6_stats",
2384 S_IRUGO, &rt6_stats_seq_fops))
2385 goto out_ipv6_route;
2387 ret = 0;
2388 out:
2389 return ret;
2390 out_ipv6_route:
2391 proc_net_remove(net, "ipv6_route");
2392 goto out;
/* Remove the proc entries created by ipv6_route_proc_init() for @net. */
static void ipv6_route_proc_fini(struct net *net)
{
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
}
2400 #else
2401 static inline int ipv6_route_proc_init(struct net *net)
2403 return 0;
2405 static inline void ipv6_route_proc_fini(struct net *net)
2407 return ;
2409 #endif /* CONFIG_PROC_FS */
2411 #ifdef CONFIG_SYSCTL
2413 static
2414 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2415 void __user *buffer, size_t *lenp, loff_t *ppos)
2417 struct net *net = current->nsproxy->net_ns;
2418 int delay = net->ipv6.sysctl.flush_delay;
2419 if (write) {
2420 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2421 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2422 return 0;
2423 } else
2424 return -EINVAL;
2427 ctl_table ipv6_route_table_template[] = {
2429 .procname = "flush",
2430 .data = &init_net.ipv6.sysctl.flush_delay,
2431 .maxlen = sizeof(int),
2432 .mode = 0200,
2433 .proc_handler = &ipv6_sysctl_rtcache_flush
2436 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2437 .procname = "gc_thresh",
2438 .data = &ip6_dst_ops.gc_thresh,
2439 .maxlen = sizeof(int),
2440 .mode = 0644,
2441 .proc_handler = &proc_dointvec,
2444 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2445 .procname = "max_size",
2446 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2447 .maxlen = sizeof(int),
2448 .mode = 0644,
2449 .proc_handler = &proc_dointvec,
2452 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2453 .procname = "gc_min_interval",
2454 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2455 .maxlen = sizeof(int),
2456 .mode = 0644,
2457 .proc_handler = &proc_dointvec_jiffies,
2458 .strategy = &sysctl_jiffies,
2461 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2462 .procname = "gc_timeout",
2463 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2464 .maxlen = sizeof(int),
2465 .mode = 0644,
2466 .proc_handler = &proc_dointvec_jiffies,
2467 .strategy = &sysctl_jiffies,
2470 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2471 .procname = "gc_interval",
2472 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2473 .maxlen = sizeof(int),
2474 .mode = 0644,
2475 .proc_handler = &proc_dointvec_jiffies,
2476 .strategy = &sysctl_jiffies,
2479 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2480 .procname = "gc_elasticity",
2481 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2482 .maxlen = sizeof(int),
2483 .mode = 0644,
2484 .proc_handler = &proc_dointvec_jiffies,
2485 .strategy = &sysctl_jiffies,
2488 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2489 .procname = "mtu_expires",
2490 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2491 .maxlen = sizeof(int),
2492 .mode = 0644,
2493 .proc_handler = &proc_dointvec_jiffies,
2494 .strategy = &sysctl_jiffies,
2497 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2498 .procname = "min_adv_mss",
2499 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2500 .maxlen = sizeof(int),
2501 .mode = 0644,
2502 .proc_handler = &proc_dointvec_jiffies,
2503 .strategy = &sysctl_jiffies,
2506 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2507 .procname = "gc_min_interval_ms",
2508 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2509 .maxlen = sizeof(int),
2510 .mode = 0644,
2511 .proc_handler = &proc_dointvec_ms_jiffies,
2512 .strategy = &sysctl_ms_jiffies,
2514 { .ctl_name = 0 }
2517 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2519 struct ctl_table *table;
2521 table = kmemdup(ipv6_route_table_template,
2522 sizeof(ipv6_route_table_template),
2523 GFP_KERNEL);
2525 if (table) {
2526 table[0].data = &net->ipv6.sysctl.flush_delay;
2527 /* table[1].data will be handled when we have
2528 routes per namespace */
2529 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2530 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2531 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2532 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2533 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2534 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2535 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2538 return table;
2540 #endif
2542 int __init ip6_route_init(void)
2544 int ret;
2546 ip6_dst_ops.kmem_cachep =
2547 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2548 SLAB_HWCACHE_ALIGN, NULL);
2549 if (!ip6_dst_ops.kmem_cachep)
2550 return -ENOMEM;
2552 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2554 ret = fib6_init();
2555 if (ret)
2556 goto out_kmem_cache;
2558 ret = ipv6_route_proc_init(&init_net);
2559 if (ret)
2560 goto out_fib6_init;
2562 ret = xfrm6_init();
2563 if (ret)
2564 goto out_proc_init;
2566 ret = fib6_rules_init();
2567 if (ret)
2568 goto xfrm6_init;
2570 ret = -ENOBUFS;
2571 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2572 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2573 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2574 goto fib6_rules_init;
2576 ret = 0;
2577 out:
2578 return ret;
2580 fib6_rules_init:
2581 fib6_rules_cleanup();
2582 xfrm6_init:
2583 xfrm6_fini();
2584 out_proc_init:
2585 ipv6_route_proc_fini(&init_net);
2586 out_fib6_init:
2587 rt6_ifdown(&init_net, NULL);
2588 fib6_gc_cleanup();
2589 out_kmem_cache:
2590 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2591 goto out;
2594 void ip6_route_cleanup(void)
2596 fib6_rules_cleanup();
2597 ipv6_route_proc_fini(&init_net);
2598 xfrm6_fini();
2599 rt6_ifdown(&init_net, NULL);
2600 fib6_gc_cleanup();
2601 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);