Merge with Linux 2.5.59.
[linux-2.6/linux-mips.git] / net / ipv6 / route.c
blob1ff9ddc6ccdd087ee713a7bd0eb257c92ca22077
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 /* Changes:
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/config.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/init.h>
37 #include <linux/netlink.h>
38 #include <linux/if_arp.h>
40 #ifdef CONFIG_PROC_FS
41 #include <linux/proc_fs.h>
42 #endif
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
53 #include <asm/uaccess.h>
55 #ifdef CONFIG_SYSCTL
56 #include <linux/sysctl.h>
57 #endif
/*
 * Debug tracing switches. RDBG() and RT6_TRACE() expand to printk calls only
 * when RT6_DEBUG is at least 3; at lower levels both expand to nothing.
 */
59 /* Set to 3 to get tracing. */
60 #define RT6_DEBUG 2
62 #if RT6_DEBUG >= 3
63 #define RDBG(x) printk x
64 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
65 #else
66 #define RDBG(x)
67 #define RT6_TRACE(x...) do { ; } while (0)
68 #endif
/*
 * Tunables for the IPv6 routing cache. The interval/timeout values are
 * expressed in jiffies (multiples of HZ). ip6_rt_min_advmss is the floor
 * applied wherever this file derives an advertised MSS from a path MTU.
 * ip6_rt_gc_interval is the only one without static linkage.
 */
71 static int ip6_rt_max_size = 4096;
72 static int ip6_rt_gc_min_interval = 5*HZ;
73 static int ip6_rt_gc_timeout = 60*HZ;
74 int ip6_rt_gc_interval = 30*HZ;
75 static int ip6_rt_gc_elasticity = 9;
76 static int ip6_rt_mtu_expires = 10*60*HZ;
77 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
/*
 * Forward declarations: the route-copy helper plus the callbacks referenced
 * by ip6_dst_ops and ip6_null_entry below. All are defined later in this file.
 */
79 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
80 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static int ip6_dst_gc(void);
84 static int ip6_pkt_discard(struct sk_buff *skb);
85 static void ip6_link_failure(struct sk_buff *skb);
86 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/*
 * dst_ops descriptor for IPv6 routes: wires in the gc, check,
 * negative_advice, link_failure and update_pmtu callbacks from this file and
 * sizes each entry as a struct rt6_info.
 */
88 static struct dst_ops ip6_dst_ops = {
89 .family = AF_INET6,
90 .protocol = __constant_htons(ETH_P_IPV6),
91 .gc = ip6_dst_gc,
92 .gc_thresh = 1024,
93 .check = ip6_dst_check,
94 .negative_advice = ip6_negative_advice,
95 .link_failure = ip6_link_failure,
96 .update_pmtu = ip6_rt_update_pmtu,
97 .entry_size = sizeof(struct rt6_info),
/*
 * Sentinel route handed out when no usable route exists. It is bound to the
 * loopback device, carries error -ENETUNREACH, is flagged
 * RTF_REJECT | RTF_NONEXTHOP, and discards packets on both input and output
 * via ip6_pkt_discard. Its reference counts start at 1.
 */
100 struct rt6_info ip6_null_entry = {
101 .u = {
102 .dst = {
103 .__refcnt = ATOMIC_INIT(1),
104 .__use = 1,
105 .dev = &loopback_dev,
106 .obsolete = -1,
107 .error = -ENETUNREACH,
108 .input = ip6_pkt_discard,
109 .output = ip6_pkt_discard,
110 .ops = &ip6_dst_ops,
111 .path = (struct dst_entry*)&ip6_null_entry,
114 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
115 .rt6i_metric = ~(u32) 0,
116 .rt6i_hoplimit = 255,
117 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Root node of the IPv6 routing tree, followed by the rwlock that guards it.
 * The positional initializer references ip6_null_entry and the
 * RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO flags; the meaning of each position depends
 * on the struct fib6_node layout, which is declared outside this file.
 */
120 struct fib6_node ip6_routing_table = {
121 NULL, NULL, NULL, NULL,
122 &ip6_null_entry,
123 0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0
126 /* Protects all the ip6 fib */
128 rwlock_t rt6_lock = RW_LOCK_UNLOCKED;
132 * Route lookup. Any rt6_lock is implied.
135 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
136 int oif,
137 int strict)
139 struct rt6_info *local = NULL;
140 struct rt6_info *sprt;
142 if (oif) {
143 for (sprt = rt; sprt; sprt = sprt->u.next) {
144 struct net_device *dev = sprt->rt6i_dev;
145 if (dev->ifindex == oif)
146 return sprt;
147 if (dev->flags&IFF_LOOPBACK)
148 local = sprt;
151 if (local)
152 return local;
154 if (strict)
155 return &ip6_null_entry;
157 return rt;
/*
 * rt6_best_dflt - choose a default router from the list headed by @rt.
 *
 * Each candidate is scored: +8 when @oif is 0 or matches the candidate's
 * device, +4 when it is the router remembered in rt6_dflt_pointer, and +3,
 * +2 or +1 depending on the nexthop neighbour's NUD state. Candidates with
 * no nexthop, or whose neighbour is in NUD_INCOMPLETE or any unlisted state,
 * are skipped. A score of 12 or more ends the scan immediately.
 *
 * When nothing scores, the list is walked round-robin starting after the
 * remembered router, accepting entries with obsolete <= 0 and no error.
 * Any match is recorded in rt6_dflt_pointer under rt6_dflt_lock.
 *
 * As a last resort the leaf list of ip6_routing_table is searched for an
 * RTF_DEFAULT route matching @oif (that choice is not recorded); if that
 * also fails, &ip6_null_entry is returned.
 *
 * NOTE(review): no rt6_lock is taken here; the only caller visible in this
 * file invokes it with rt6_lock read-held - confirm for other callers.
 */
161 * pointer to the last default router chosen. BH is disabled locally.
163 static struct rt6_info *rt6_dflt_pointer = NULL;
164 static spinlock_t rt6_dflt_lock = SPIN_LOCK_UNLOCKED;
166 /* Default Router Selection (RFC 2461 6.3.6) */
167 static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif)
169 struct rt6_info *match = NULL;
170 struct rt6_info *sprt;
171 int mpri = 0;
173 for (sprt = rt; sprt; sprt = sprt->u.next) {
174 struct neighbour *neigh;
175 int m = 0;
177 if (!oif ||
178 (sprt->rt6i_dev &&
179 sprt->rt6i_dev->ifindex == oif))
180 m += 8;
182 if (sprt == rt6_dflt_pointer)
183 m += 4;
185 if ((neigh = sprt->rt6i_nexthop) != NULL) {
186 read_lock_bh(&neigh->lock);
187 switch (neigh->nud_state) {
188 case NUD_REACHABLE:
189 m += 3;
190 break;
192 case NUD_STALE:
193 case NUD_DELAY:
194 case NUD_PROBE:
195 m += 2;
196 break;
198 case NUD_NOARP:
199 case NUD_PERMANENT:
200 m += 1;
201 break;
203 case NUD_INCOMPLETE:
204 default:
205 read_unlock_bh(&neigh->lock);
206 continue;
208 read_unlock_bh(&neigh->lock);
209 } else {
210 continue;
213 if (m > mpri || m >= 12) {
214 match = sprt;
215 mpri = m;
216 if (m >= 12) {
217 /* we choose the lastest default router if it
218 * is in (probably) reachable state.
219 * If route changed, we should do pmtu
220 * discovery. --yoshfuji
222 break;
227 spin_lock(&rt6_dflt_lock);
228 if (!match) {
230 * No default routers are known to be reachable.
231 * SHOULD round robin
233 if (rt6_dflt_pointer) {
234 for (sprt = rt6_dflt_pointer->u.next;
235 sprt; sprt = sprt->u.next) {
236 if (sprt->u.dst.obsolete <= 0 &&
237 sprt->u.dst.error == 0) {
238 match = sprt;
239 break;
242 for (sprt = rt;
243 !match && sprt && sprt != rt6_dflt_pointer;
244 sprt = sprt->u.next) {
245 if (sprt->u.dst.obsolete <= 0 &&
246 sprt->u.dst.error == 0) {
247 match = sprt;
248 break;
254 if (match) {
255 if (rt6_dflt_pointer != match)
256 RT6_TRACE("changed default router: %p->%p\n",
257 rt6_dflt_pointer, match);
258 rt6_dflt_pointer = match;
260 spin_unlock(&rt6_dflt_lock);
262 if (!match) {
264 * Last Resort: if no default routers found,
265 * use addrconf default route.
266 * We don't record this route.
268 for (sprt = ip6_routing_table.leaf;
269 sprt; sprt = sprt->u.next) {
270 if ((sprt->rt6i_flags & RTF_DEFAULT) &&
271 (!oif ||
272 (sprt->rt6i_dev &&
273 sprt->rt6i_dev->ifindex == oif))) {
274 match = sprt;
275 break;
278 if (!match) {
279 /* no default route. give up. */
280 match = &ip6_null_entry;
284 return match;
287 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
288 int oif, int strict)
290 struct fib6_node *fn;
291 struct rt6_info *rt;
293 read_lock_bh(&rt6_lock);
294 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
295 rt = rt6_device_match(fn->leaf, oif, strict);
296 dst_hold(&rt->u.dst);
297 rt->u.dst.__use++;
298 read_unlock_bh(&rt6_lock);
300 rt->u.dst.lastuse = jiffies;
301 if (rt->u.dst.error == 0)
302 return rt;
303 dst_release(&rt->u.dst);
304 return NULL;
307 /* rt6_ins is called with FREE rt6_lock.
308 It takes new route entry, the addition fails by any reason the
309 route is freed. In any case, if caller does not hold it, it may
310 be destroyed.
313 static int rt6_ins(struct rt6_info *rt)
315 int err;
317 write_lock_bh(&rt6_lock);
318 err = fib6_add(&ip6_routing_table, rt);
319 write_unlock_bh(&rt6_lock);
321 return err;
324 /* No rt6_lock! If COW faild, the function returns dead route entry
325 with dst->error set to errno value.
328 static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
329 struct in6_addr *saddr)
331 int err;
332 struct rt6_info *rt;
335 * Clone the route.
338 rt = ip6_rt_copy(ort);
340 if (rt) {
341 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
343 if (!(rt->rt6i_flags&RTF_GATEWAY))
344 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
346 rt->rt6i_dst.plen = 128;
347 rt->rt6i_flags |= RTF_CACHE;
348 rt->u.dst.flags |= DST_HOST;
350 #ifdef CONFIG_IPV6_SUBTREES
351 if (rt->rt6i_src.plen && saddr) {
352 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
353 rt->rt6i_src.plen = 128;
355 #endif
357 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
359 dst_clone(&rt->u.dst);
361 err = rt6_ins(rt);
362 if (err == 0)
363 return rt;
365 rt->u.dst.error = err;
367 return rt;
369 dst_clone(&ip6_null_entry.u.dst);
370 return &ip6_null_entry;
/*
 * BACKTRACK - climb towards the tree root after a strict lookup missed.
 *
 * Expects the locals 'rt', 'fn' and 'strict' and the labels 'out' and
 * 'restart' in the enclosing function. When the current result is
 * ip6_null_entry and strict matching is on, walk up the parents of fn:
 * reaching a root node takes a reference on rt and jumps to 'out'; reaching
 * a node that carries route info jumps to 'restart' to retry from there.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single statement
 * and cannot capture a following 'else'.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry && strict) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				dst_clone(&rt->u.dst); \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
/*
 * ip6_route_input - attach a route to a received packet.
 *
 * Looks up the packet's destination and source addresses in
 * ip6_routing_table under rt6_lock and stores the chosen route in skb->dst,
 * with a reference taken by dst_clone() or handed back by rt6_cow().
 *
 * 'strict' is non-zero for multicast and link-local destinations; it is
 * passed to rt6_device_match() for RTF_CACHE leaves and gates BACKTRACK().
 * A selected route that has neither a nexthop nor RTF_NONEXTHOP is cloned
 * into a host route with rt6_cow() after dropping rt6_lock. If that reports
 * -EEXIST the whole lookup is retried; 'attempts' bounds this to three
 * rt6_cow() calls.
 */
386 void ip6_route_input(struct sk_buff *skb)
388 struct fib6_node *fn;
389 struct rt6_info *rt;
390 int strict;
391 int attempts = 3;
393 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
395 relookup:
396 read_lock_bh(&rt6_lock);
398 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
399 &skb->nh.ipv6h->saddr);
401 restart:
402 rt = fn->leaf;
404 if ((rt->rt6i_flags & RTF_CACHE)) {
405 rt = rt6_device_match(rt, skb->dev->ifindex, strict);
406 BACKTRACK();
407 dst_clone(&rt->u.dst);
408 goto out;
411 rt = rt6_device_match(rt, skb->dev->ifindex, 0);
412 BACKTRACK();
414 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
415 read_unlock_bh(&rt6_lock);
417 rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
418 &skb->nh.ipv6h->saddr);
420 if (rt->u.dst.error != -EEXIST || --attempts <= 0)
421 goto out2;
422 /* Race condition! In the gap, when rt6_lock was
423 released someone could insert this route. Relookup.
425 goto relookup;
427 dst_clone(&rt->u.dst);
429 out:
430 read_unlock_bh(&rt6_lock);
431 out2:
432 rt->u.dst.lastuse = jiffies;
433 rt->u.dst.__use++;
434 skb->dst = (struct dst_entry *) rt;
/*
 * ip6_route_output - find the route for an outgoing flow.
 *
 * Looks up fl's destination and source addresses in ip6_routing_table under
 * rt6_lock and returns the chosen route's dst_entry with a reference held
 * (via dst_clone() or handed back by rt6_cow()). @sk is not used in the
 * body.
 *
 * 'strict' is non-zero for multicast and link-local destinations. For a
 * non-cache leaf flagged RTF_DEFAULT whose metric is at least
 * IP6_RT_PRIO_ADDRCONF the choice is delegated to rt6_best_dflt(); other
 * non-cache leaves go through rt6_device_match() and BACKTRACK(). A selected
 * route with neither a nexthop nor RTF_NONEXTHOP is cloned with rt6_cow()
 * after dropping rt6_lock; on -EEXIST the lookup is retried, with 'attempts'
 * bounding this to three rt6_cow() calls.
 */
437 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
439 struct fib6_node *fn;
440 struct rt6_info *rt;
441 int strict;
442 int attempts = 3;
444 strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
446 relookup:
447 read_lock_bh(&rt6_lock);
449 fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr,
450 fl->nl_u.ip6_u.saddr);
452 restart:
453 rt = fn->leaf;
455 if ((rt->rt6i_flags & RTF_CACHE)) {
456 rt = rt6_device_match(rt, fl->oif, strict);
457 BACKTRACK();
458 dst_clone(&rt->u.dst);
459 goto out;
461 if (rt->rt6i_flags & RTF_DEFAULT) {
462 if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF)
463 rt = rt6_best_dflt(rt, fl->oif);
464 } else {
465 rt = rt6_device_match(rt, fl->oif, strict);
466 BACKTRACK();
469 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
470 read_unlock_bh(&rt6_lock);
472 rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr,
473 fl->nl_u.ip6_u.saddr);
475 if (rt->u.dst.error != -EEXIST || --attempts <= 0)
476 goto out2;
478 /* Race condition! In the gap, when rt6_lock was
479 released someone could insert this route. Relookup.
481 goto relookup;
483 dst_clone(&rt->u.dst);
485 out:
486 read_unlock_bh(&rt6_lock);
487 out2:
488 rt->u.dst.lastuse = jiffies;
489 rt->u.dst.__use++;
490 return &rt->u.dst;
495 * Destination cache support functions
498 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
500 struct rt6_info *rt;
502 rt = (struct rt6_info *) dst;
504 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
505 return dst;
507 dst_release(dst);
508 return NULL;
511 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
513 struct rt6_info *rt = (struct rt6_info *) dst;
515 if (rt) {
516 if (rt->rt6i_flags & RTF_CACHE)
517 ip6_del_rt(rt);
518 else
519 dst_release(dst);
521 return NULL;
524 static void ip6_link_failure(struct sk_buff *skb)
526 struct rt6_info *rt;
528 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
530 rt = (struct rt6_info *) skb->dst;
531 if (rt) {
532 if (rt->rt6i_flags&RTF_CACHE) {
533 dst_set_expires(&rt->u.dst, 0);
534 rt->rt6i_flags |= RTF_EXPIRES;
535 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
536 rt->rt6i_node->fn_sernum = -1;
540 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
542 struct rt6_info *rt6 = (struct rt6_info*)dst;
544 if (mtu < dst_pmtu(dst) && rt6->rt6i_dst.plen == 128) {
545 rt6->rt6i_flags |= RTF_MODIFIED;
546 dst->metrics[RTAX_MTU-1] = mtu;
550 static int ip6_dst_gc()
552 static unsigned expire = 30*HZ;
553 static unsigned long last_gc;
554 unsigned long now = jiffies;
556 if ((long)(now - last_gc) < ip6_rt_gc_min_interval &&
557 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
558 goto out;
560 expire++;
561 fib6_run_gc(expire);
562 last_gc = now;
563 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
564 expire = ip6_rt_gc_timeout>>1;
566 out:
567 expire -= expire>>ip6_rt_gc_elasticity;
568 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
/*
 * Clean the host part of a prefix: every bit of @pfx beyond the first @plen
 * bits is cleared. Not necessary in the radix tree, but results in cleaner
 * routing tables.
 *
 * Remove it only when all the things will work!
 *
 * A @plen outside 0..128 is ignored (the address is left untouched) so that
 * a bad length can never index or memset outside the 16-byte address.
 */
static void ipv6_wash_prefix(struct in6_addr *pfx, int plen)
{
	int partial_bits;
	int used_bytes;

	if (plen < 0 || plen > 128)
		return;

	partial_bits = plen & 0x7;
	used_bytes = (plen + 7) >> 3;

	/* Zero every byte that lies wholly past the prefix. */
	if (used_bytes < 16)
		memset(pfx->s6_addr + used_bytes, 0, 16 - used_bytes);

	/* Mask the trailing bits of the byte the prefix ends in. */
	if (partial_bits != 0)
		pfx->s6_addr[plen >> 3] &= (0xFF << (8 - partial_bits));
}
588 static int ipv6_get_mtu(struct net_device *dev)
590 int mtu = IPV6_MIN_MTU;
591 struct inet6_dev *idev;
593 idev = in6_dev_get(dev);
594 if (idev) {
595 mtu = idev->cnf.mtu6;
596 in6_dev_put(idev);
598 return mtu;
601 static int ipv6_get_hoplimit(struct net_device *dev)
603 int hoplimit = ipv6_devconf.hop_limit;
604 struct inet6_dev *idev;
606 idev = in6_dev_get(dev);
607 if (idev) {
608 hoplimit = idev->cnf.hop_limit;
609 in6_dev_put(idev);
611 return hoplimit;
/*
 * ip6_route_add - build a route from @rtmsg and insert it into the table.
 *
 * Rejects prefix lengths above 128 (and any source prefix unless
 * CONFIG_IPV6_SUBTREES is set) with -EINVAL. A zero rtmsg_metric is replaced
 * by IP6_RT_PRIO_USER in the caller's structure. The destination (and,
 * with subtrees, source) prefix is copied and its host bits are cleared.
 *
 * RTF_REJECT requests, and routes through a loopback device to a
 * non-loopback destination, are turned into reject routes bound to
 * loopback_dev that discard packets with -ENETUNREACH.
 *
 * For RTF_GATEWAY routes whose gateway is not a link-local unicast address,
 * the gateway must be unicast and must itself be reachable through a
 * non-gateway route (checked with rt6_lookup()); the device is taken from
 * that route when none was given and must match it otherwise.
 *
 * On success the MTU and advertised-MSS metrics are derived from the device
 * and the result of rt6_ins() is returned. On failure the device reference
 * is dropped, the route is freed and a negative errno is returned
 * (-ENOMEM, -ENODEV, -EINVAL, -EHOSTUNREACH or the neighbour lookup error).
 */
618 int ip6_route_add(struct in6_rtmsg *rtmsg)
620 int err;
621 struct rt6_info *rt;
622 struct net_device *dev = NULL;
623 int addr_type;
625 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
626 return -EINVAL;
627 #ifndef CONFIG_IPV6_SUBTREES
628 if (rtmsg->rtmsg_src_len)
629 return -EINVAL;
630 #endif
631 if (rtmsg->rtmsg_metric == 0)
632 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
634 rt = dst_alloc(&ip6_dst_ops);
636 if (rt == NULL)
637 return -ENOMEM;
639 rt->u.dst.obsolete = -1;
640 rt->rt6i_expires = rtmsg->rtmsg_info;
642 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
644 if (addr_type & IPV6_ADDR_MULTICAST)
645 rt->u.dst.input = ip6_mc_input;
646 else
647 rt->u.dst.input = ip6_forward;
649 rt->u.dst.output = ip6_output;
651 if (rtmsg->rtmsg_ifindex) {
652 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
653 err = -ENODEV;
654 if (dev == NULL)
655 goto out;
658 ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst);
659 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
660 if (rt->rt6i_dst.plen == 128)
661 rt->u.dst.flags = DST_HOST;
662 ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen);
664 #ifdef CONFIG_IPV6_SUBTREES
665 ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src);
666 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
667 ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen);
668 #endif
670 rt->rt6i_metric = rtmsg->rtmsg_metric;
672 /* We cannot add true routes via loopback here,
673 they would result in kernel looping; promote them to reject routes
675 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
676 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
677 if (dev)
678 dev_put(dev);
679 dev = &loopback_dev;
680 dev_hold(dev);
681 rt->u.dst.output = ip6_pkt_discard;
682 rt->u.dst.input = ip6_pkt_discard;
683 rt->u.dst.error = -ENETUNREACH;
684 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
685 goto install_route;
688 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
689 struct in6_addr *gw_addr;
690 int gwa_type;
692 gw_addr = &rtmsg->rtmsg_gateway;
693 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
694 gwa_type = ipv6_addr_type(gw_addr);
696 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
697 struct rt6_info *grt;
699 /* IPv6 strictly inhibits using not link-local
700 addresses as nexthop address.
701 Otherwise, router will not able to send redirects.
702 It is very good, but in some (rare!) curcumstances
703 (SIT, PtP, NBMA NOARP links) it is handy to allow
704 some exceptions. --ANK
706 err = -EINVAL;
707 if (!(gwa_type&IPV6_ADDR_UNICAST))
708 goto out;
710 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
712 err = -EHOSTUNREACH;
713 if (grt == NULL)
714 goto out;
715 if (dev) {
716 if (dev != grt->rt6i_dev) {
717 dst_release(&grt->u.dst);
718 goto out;
720 } else {
721 dev = grt->rt6i_dev;
722 dev_hold(dev);
724 if (!(grt->rt6i_flags&RTF_GATEWAY))
725 err = 0;
726 dst_release(&grt->u.dst);
728 if (err)
729 goto out;
731 err = -EINVAL;
732 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
733 goto out;
736 err = -ENODEV;
737 if (dev == NULL)
738 goto out;
740 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
741 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
742 if (IS_ERR(rt->rt6i_nexthop)) {
743 err = PTR_ERR(rt->rt6i_nexthop);
744 rt->rt6i_nexthop = NULL;
745 goto out;
749 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
750 rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS;
751 else
752 rt->rt6i_hoplimit = ipv6_get_hoplimit(dev);
753 rt->rt6i_flags = rtmsg->rtmsg_flags;
755 install_route:
756 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
757 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, dst_pmtu(&rt->u.dst) - 60, ip6_rt_min_advmss);
758 /* Maximal non-jumbo IPv6 payload is 65535 and corresponding
759 MSS is 65535 - tcp_header_size. 65535 is also valid and
760 means: "any MSS, rely only on pmtu discovery"
762 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535-20)
763 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535;
764 rt->u.dst.dev = dev;
765 return rt6_ins(rt);
767 out:
768 if (dev)
769 dev_put(dev);
770 dst_free((struct dst_entry *) rt);
771 return err;
774 int ip6_del_rt(struct rt6_info *rt)
776 int err;
778 write_lock_bh(&rt6_lock);
780 spin_lock_bh(&rt6_dflt_lock);
781 rt6_dflt_pointer = NULL;
782 spin_unlock_bh(&rt6_dflt_lock);
784 dst_release(&rt->u.dst);
786 err = fib6_del(rt);
787 write_unlock_bh(&rt6_lock);
789 return err;
792 static int ip6_route_del(struct in6_rtmsg *rtmsg)
794 struct fib6_node *fn;
795 struct rt6_info *rt;
796 int err = -ESRCH;
798 read_lock_bh(&rt6_lock);
800 fn = fib6_locate(&ip6_routing_table,
801 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
802 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
804 if (fn) {
805 for (rt = fn->leaf; rt; rt = rt->u.next) {
806 if (rtmsg->rtmsg_ifindex &&
807 (rt->rt6i_dev == NULL ||
808 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
809 continue;
810 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
811 ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
812 continue;
813 if (rtmsg->rtmsg_metric &&
814 rtmsg->rtmsg_metric != rt->rt6i_metric)
815 continue;
816 dst_clone(&rt->u.dst);
817 read_unlock_bh(&rt6_lock);
819 return ip6_del_rt(rt);
822 read_unlock_bh(&rt6_lock);
824 return err;
828 * Handle redirects
/*
 * rt6_redirect - process a redirect for @dest that names @neigh as the new
 * next hop; @saddr is the address the redirect came from and @on_link is
 * non-zero when the target is reported as on-link.
 *
 * The current route to @dest is looked up strictly on neigh's device. The
 * redirect is ignored when no such route exists, when it is on a different
 * device, when @neigh already is the route's neighbour, or when the route is
 * not a gateway route. If @saddr is not the route's gateway, the redirect is
 * accepted only for RTF_DEFAULT routes and only when some entry on the root
 * leaf list has @saddr as its gateway; that entry then replaces 'rt'.
 *
 * On acceptance a /128 RTF_DYNAMIC|RTF_CACHE copy pointing at @neigh is
 * inserted (without RTF_GATEWAY when @on_link), with MTU, advertised MSS and
 * hop limit taken from neigh's device. If the old route was itself a cache
 * entry it is deleted. Every exit path gives up the reference held on 'rt',
 * either through dst_release() or through ip6_del_rt().
 */
830 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
831 struct neighbour *neigh, int on_link)
833 struct rt6_info *rt, *nrt;
835 /* Locate old route to this destination. */
836 rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
838 if (rt == NULL)
839 return;
841 if (neigh->dev != rt->rt6i_dev)
842 goto out;
844 /* Redirect received -> path was valid.
845 Look, redirects are sent only in response to data packets,
846 so that this nexthop apparently is reachable. --ANK
848 dst_confirm(&rt->u.dst);
850 /* Duplicate redirect: silently ignore. */
851 if (neigh == rt->u.dst.neighbour)
852 goto out;
854 /* Current route is on-link; redirect is always invalid.
856 Seems, previous statement is not true. It could
857 be node, which looks for us as on-link (f.e. proxy ndisc)
858 But then router serving it might decide, that we should
859 know truth 8)8) --ANK (980726).
861 if (!(rt->rt6i_flags&RTF_GATEWAY))
862 goto out;
865 * RFC 1970 specifies that redirects should only be
866 * accepted if they come from the nexthop to the target.
867 * Due to the way default routers are chosen, this notion
868 * is a bit fuzzy and one might need to check all default
869 * routers.
872 if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) {
873 if (rt->rt6i_flags & RTF_DEFAULT) {
874 struct rt6_info *rt1;
876 read_lock(&rt6_lock);
877 for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
878 if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) {
879 dst_clone(&rt1->u.dst);
880 dst_release(&rt->u.dst);
881 read_unlock(&rt6_lock);
882 rt = rt1;
883 goto source_ok;
886 read_unlock(&rt6_lock);
888 if (net_ratelimit())
889 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
890 "for redirect target\n");
891 goto out;
894 source_ok:
897 * We have finally decided to accept it.
900 nrt = ip6_rt_copy(rt);
901 if (nrt == NULL)
902 goto out;
904 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
905 if (on_link)
906 nrt->rt6i_flags &= ~RTF_GATEWAY;
908 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
909 nrt->rt6i_dst.plen = 128;
910 nrt->u.dst.flags |= DST_HOST;
912 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
913 nrt->rt6i_nexthop = neigh_clone(neigh);
914 /* Reset pmtu, it may be better */
915 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
916 nrt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, dst_pmtu(&nrt->u.dst) - 60, ip6_rt_min_advmss);
917 if (nrt->u.dst.metrics[RTAX_ADVMSS-1] > 65535-20)
918 nrt->u.dst.metrics[RTAX_ADVMSS-1] = 65535;
919 nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev);
921 if (rt6_ins(nrt))
922 goto out;
924 if (rt->rt6i_flags&RTF_CACHE) {
925 ip6_del_rt(rt);
926 return;
929 out:
930 dst_release(&rt->u.dst);
931 return;
935 * Handle ICMP "packet too big" messages
936 * i.e. Path MTU discovery
939 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
940 struct net_device *dev, u32 pmtu)
942 struct rt6_info *rt, *nrt;
944 if (pmtu < IPV6_MIN_MTU) {
945 if (net_ratelimit())
946 printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
947 pmtu);
948 /* According to RFC1981, the PMTU is set to the IPv6 minimum
949 link MTU if the node receives a Packet Too Big message
950 reporting next-hop MTU that is less than the IPv6 minimum MTU.
952 pmtu = IPV6_MIN_MTU;
955 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
957 if (rt == NULL)
958 return;
960 if (pmtu >= dst_pmtu(&rt->u.dst))
961 goto out;
963 /* New mtu received -> path was valid.
964 They are sent only in response to data packets,
965 so that this nexthop apparently is reachable. --ANK
967 dst_confirm(&rt->u.dst);
969 /* Host route. If it is static, it would be better
970 not to override it, but add new one, so that
971 when cache entry will expire old pmtu
972 would return automatically.
974 if (rt->rt6i_flags & RTF_CACHE) {
975 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
976 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
977 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
978 goto out;
981 /* Network route.
982 Two cases are possible:
983 1. It is connected route. Action: COW
984 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
986 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
987 nrt = rt6_cow(rt, daddr, saddr);
988 if (!nrt->u.dst.error) {
989 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
990 /* According to RFC 1981, detecting PMTU increase shouldn't be
991 happened within 5 mins, the recommended timer is 10 mins.
992 Here this route expiration time is set to ip6_rt_mtu_expires
993 which is 10 mins. After 10 mins the decreased pmtu is expired
994 and detecting PMTU increase will be automatically happened.
996 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
997 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
998 dst_release(&nrt->u.dst);
1000 } else {
1001 nrt = ip6_rt_copy(rt);
1002 if (nrt == NULL)
1003 goto out;
1004 ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
1005 nrt->rt6i_dst.plen = 128;
1006 nrt->u.dst.flags |= DST_HOST;
1007 nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
1008 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1009 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES;
1010 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1011 rt6_ins(nrt);
1014 out:
1015 dst_release(&rt->u.dst);
1019 * Misc support functions
1022 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1024 struct rt6_info *rt;
1026 rt = dst_alloc(&ip6_dst_ops);
1028 if (rt) {
1029 rt->u.dst.input = ort->u.dst.input;
1030 rt->u.dst.output = ort->u.dst.output;
1032 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1033 rt->u.dst.dev = ort->u.dst.dev;
1034 if (rt->u.dst.dev)
1035 dev_hold(rt->u.dst.dev);
1036 rt->u.dst.lastuse = jiffies;
1037 rt->rt6i_hoplimit = ort->rt6i_hoplimit;
1038 rt->rt6i_expires = 0;
1040 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1041 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1042 rt->rt6i_metric = 0;
1044 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1045 #ifdef CONFIG_IPV6_SUBTREES
1046 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1047 #endif
1049 return rt;
1052 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1054 struct rt6_info *rt;
1055 struct fib6_node *fn;
1057 fn = &ip6_routing_table;
1059 write_lock_bh(&rt6_lock);
1060 for (rt = fn->leaf; rt; rt=rt->u.next) {
1061 if (dev == rt->rt6i_dev &&
1062 ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0)
1063 break;
1065 if (rt)
1066 dst_clone(&rt->u.dst);
1067 write_unlock_bh(&rt6_lock);
1068 return rt;
1071 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1072 struct net_device *dev)
1074 struct in6_rtmsg rtmsg;
1076 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1077 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1078 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1079 rtmsg.rtmsg_metric = 1024;
1080 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP;
1082 rtmsg.rtmsg_ifindex = dev->ifindex;
1084 ip6_route_add(&rtmsg);
1085 return rt6_get_dflt_router(gwaddr, dev);
1088 void rt6_purge_dflt_routers(int last_resort)
1090 struct rt6_info *rt;
1091 u32 flags;
1093 if (last_resort)
1094 flags = RTF_ALLONLINK;
1095 else
1096 flags = RTF_DEFAULT | RTF_ADDRCONF;
1098 restart:
1099 read_lock_bh(&rt6_lock);
1100 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1101 if (rt->rt6i_flags & flags) {
1102 dst_hold(&rt->u.dst);
1104 spin_lock_bh(&rt6_dflt_lock);
1105 rt6_dflt_pointer = NULL;
1106 spin_unlock_bh(&rt6_dflt_lock);
1108 read_unlock_bh(&rt6_lock);
1110 ip6_del_rt(rt);
1112 goto restart;
1115 read_unlock_bh(&rt6_lock);
1118 int ipv6_route_ioctl(unsigned int cmd, void *arg)
1120 struct in6_rtmsg rtmsg;
1121 int err;
1123 switch(cmd) {
1124 case SIOCADDRT: /* Add a route */
1125 case SIOCDELRT: /* Delete a route */
1126 if (!capable(CAP_NET_ADMIN))
1127 return -EPERM;
1128 err = copy_from_user(&rtmsg, arg,
1129 sizeof(struct in6_rtmsg));
1130 if (err)
1131 return -EFAULT;
1133 rtnl_lock();
1134 switch (cmd) {
1135 case SIOCADDRT:
1136 err = ip6_route_add(&rtmsg);
1137 break;
1138 case SIOCDELRT:
1139 err = ip6_route_del(&rtmsg);
1140 break;
1141 default:
1142 err = -EINVAL;
1144 rtnl_unlock();
1146 return err;
1149 return -EINVAL;
1153 * Drop the packet on the floor
1156 int ip6_pkt_discard(struct sk_buff *skb)
1158 IP6_INC_STATS(Ip6OutNoRoutes);
1159 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
1160 kfree_skb(skb);
1161 return 0;
1165 * Add address
1168 int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev)
1170 struct rt6_info *rt;
1172 rt = dst_alloc(&ip6_dst_ops);
1173 if (rt == NULL)
1174 return -ENOMEM;
1176 rt->u.dst.flags = DST_HOST;
1177 rt->u.dst.input = ip6_input;
1178 rt->u.dst.output = ip6_output;
1179 rt->rt6i_dev = dev_get_by_name("lo");
1180 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1181 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, dst_pmtu(&rt->u.dst) - 60, ip6_rt_min_advmss);
1182 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535-20)
1183 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535;
1184 rt->rt6i_hoplimit = ipv6_get_hoplimit(rt->rt6i_dev);
1185 rt->u.dst.obsolete = -1;
1187 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1188 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1189 if (rt->rt6i_nexthop == NULL) {
1190 dst_free((struct dst_entry *) rt);
1191 return -ENOMEM;
1194 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1195 rt->rt6i_dst.plen = 128;
1196 rt6_ins(rt);
1198 return 0;
1201 /* Delete address. Warning: you should check that this address
1202 disappeared before calling this function.
1205 int ip6_rt_addr_del(struct in6_addr *addr, struct net_device *dev)
1207 struct rt6_info *rt;
1208 int err = -ENOENT;
1210 rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1);
1211 if (rt) {
1212 if (rt->rt6i_dst.plen == 128)
1213 err = ip6_del_rt(rt);
1214 else
1215 dst_release(&rt->u.dst);
1218 return err;
1221 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1223 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1224 rt != &ip6_null_entry) {
1225 RT6_TRACE("deleted by ifdown %p\n", rt);
1226 return -1;
1228 return 0;
1231 void rt6_ifdown(struct net_device *dev)
1233 write_lock_bh(&rt6_lock);
1234 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1235 write_unlock_bh(&rt6_lock);
1238 struct rt6_mtu_change_arg
1240 struct net_device *dev;
1241 unsigned mtu;
1244 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1246 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1247 struct inet6_dev *idev;
1249 /* In IPv6 pmtu discovery is not optional,
1250 so that RTAX_MTU lock cannot disable it.
1251 We still use this lock to block changes
1252 caused by addrconf/ndisc.
1255 idev = __in6_dev_get(arg->dev);
1256 if (idev == NULL)
1257 return 0;
1259 /* For administrative MTU increase, there is no way to discover
1260 IPv6 PMTU increase, so PMTU increase should be updated here.
1261 Since RFC 1981 doesn't include administrative MTU increase
1262 update PMTU increase is a MUST. (i.e. jumbo frame)
1265 If new MTU is less than route PMTU, this new MTU will be the
1266 lowest MTU in the path, update the route PMTU to refect PMTU
1267 decreases; if new MTU is greater than route PMTU, and the
1268 old MTU is the lowest MTU in the path, update the route PMTU
1269 to refect the increase. In this case if the other nodes' MTU
1270 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1271 PMTU discouvery.
1273 if (rt->rt6i_dev == arg->dev &&
1274 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1275 (dst_pmtu(&rt->u.dst) > arg->mtu ||
1276 (dst_pmtu(&rt->u.dst) < arg->mtu &&
1277 dst_pmtu(&rt->u.dst) == idev->cnf.mtu6)))
1278 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1279 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, arg->mtu - 60, ip6_rt_min_advmss);
1280 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535-20)
1281 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535;
1282 return 0;
1285 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1287 struct rt6_mtu_change_arg arg;
1289 arg.dev = dev;
1290 arg.mtu = mtu;
1291 read_lock_bh(&rt6_lock);
1292 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1293 read_unlock_bh(&rt6_lock);
1296 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1297 struct in6_rtmsg *rtmsg)
1299 memset(rtmsg, 0, sizeof(*rtmsg));
1301 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1302 rtmsg->rtmsg_src_len = r->rtm_src_len;
1303 rtmsg->rtmsg_flags = RTF_UP;
1304 if (r->rtm_type == RTN_UNREACHABLE)
1305 rtmsg->rtmsg_flags |= RTF_REJECT;
1307 if (rta[RTA_GATEWAY-1]) {
1308 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1309 return -EINVAL;
1310 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1311 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1313 if (rta[RTA_DST-1]) {
1314 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1315 return -EINVAL;
1316 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1318 if (rta[RTA_SRC-1]) {
1319 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1320 return -EINVAL;
1321 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1323 if (rta[RTA_OIF-1]) {
1324 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1325 return -EINVAL;
1326 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1328 if (rta[RTA_PRIORITY-1]) {
1329 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1330 return -EINVAL;
1331 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1333 return 0;
1336 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1338 struct rtmsg *r = NLMSG_DATA(nlh);
1339 struct in6_rtmsg rtmsg;
1341 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1342 return -EINVAL;
1343 return ip6_route_del(&rtmsg);
1346 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1348 struct rtmsg *r = NLMSG_DATA(nlh);
1349 struct in6_rtmsg rtmsg;
1351 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1352 return -EINVAL;
1353 return ip6_route_add(&rtmsg);
/* Context handed to the route dump callback through fib6_walker_t.args. */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;		/* buffer the route messages are written to */
	struct netlink_callback *cb;	/* supplies requester pid and sequence number */
};
1362 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1363 struct in6_addr *dst,
1364 struct in6_addr *src,
1365 int iif,
1366 int type, u32 pid, u32 seq)
1368 struct rtmsg *rtm;
1369 struct nlmsghdr *nlh;
1370 unsigned char *b = skb->tail;
1371 struct rta_cacheinfo ci;
1373 nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
1374 rtm = NLMSG_DATA(nlh);
1375 rtm->rtm_family = AF_INET6;
1376 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1377 rtm->rtm_src_len = rt->rt6i_src.plen;
1378 rtm->rtm_tos = 0;
1379 rtm->rtm_table = RT_TABLE_MAIN;
1380 if (rt->rt6i_flags&RTF_REJECT)
1381 rtm->rtm_type = RTN_UNREACHABLE;
1382 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1383 rtm->rtm_type = RTN_LOCAL;
1384 else
1385 rtm->rtm_type = RTN_UNICAST;
1386 rtm->rtm_flags = 0;
1387 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1388 rtm->rtm_protocol = RTPROT_BOOT;
1389 if (rt->rt6i_flags&RTF_DYNAMIC)
1390 rtm->rtm_protocol = RTPROT_REDIRECT;
1391 else if (rt->rt6i_flags&(RTF_ADDRCONF|RTF_ALLONLINK))
1392 rtm->rtm_protocol = RTPROT_KERNEL;
1393 else if (rt->rt6i_flags&RTF_DEFAULT)
1394 rtm->rtm_protocol = RTPROT_RA;
1396 if (rt->rt6i_flags&RTF_CACHE)
1397 rtm->rtm_flags |= RTM_F_CLONED;
1399 if (dst) {
1400 RTA_PUT(skb, RTA_DST, 16, dst);
1401 rtm->rtm_dst_len = 128;
1402 } else if (rtm->rtm_dst_len)
1403 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1404 #ifdef CONFIG_IPV6_SUBTREES
1405 if (src) {
1406 RTA_PUT(skb, RTA_SRC, 16, src);
1407 rtm->rtm_src_len = 128;
1408 } else if (rtm->rtm_src_len)
1409 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1410 #endif
1411 if (iif)
1412 RTA_PUT(skb, RTA_IIF, 4, &iif);
1413 else if (dst) {
1414 struct in6_addr saddr_buf;
1415 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1416 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1418 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1419 goto rtattr_failure;
1420 if (rt->u.dst.neighbour)
1421 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1422 if (rt->u.dst.dev)
1423 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1424 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1425 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1426 if (rt->rt6i_expires)
1427 ci.rta_expires = rt->rt6i_expires - jiffies;
1428 else
1429 ci.rta_expires = 0;
1430 ci.rta_used = rt->u.dst.__use;
1431 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1432 ci.rta_error = rt->u.dst.error;
1433 ci.rta_id = 0;
1434 ci.rta_ts = 0;
1435 ci.rta_tsage = 0;
1436 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1437 nlh->nlmsg_len = skb->tail - b;
1438 return skb->len;
1440 nlmsg_failure:
1441 rtattr_failure:
1442 skb_trim(skb, b - skb->data);
1443 return -1;
1446 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1448 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1450 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1451 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq);
1454 static int fib6_dump_node(struct fib6_walker_t *w)
1456 int res;
1457 struct rt6_info *rt;
1459 for (rt = w->leaf; rt; rt = rt->u.next) {
1460 res = rt6_dump_route(rt, w->args);
1461 if (res < 0) {
1462 /* Frame is full, suspend walking */
1463 w->leaf = rt;
1464 return 1;
1466 BUG_TRAP(res!=0);
1468 w->leaf = NULL;
1469 return 0;
1472 static void fib6_dump_end(struct netlink_callback *cb)
1474 struct fib6_walker_t *w = (void*)cb->args[0];
1476 if (w) {
1477 cb->args[0] = 0;
1478 fib6_walker_unlink(w);
1479 kfree(w);
1481 if (cb->args[1]) {
1482 cb->done = (void*)cb->args[1];
1483 cb->args[1] = 0;
1487 static int fib6_dump_done(struct netlink_callback *cb)
1489 fib6_dump_end(cb);
1490 return cb->done(cb);
1493 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1495 struct rt6_rtnl_dump_arg arg;
1496 struct fib6_walker_t *w;
1497 int res;
1499 arg.skb = skb;
1500 arg.cb = cb;
1502 w = (void*)cb->args[0];
1503 if (w == NULL) {
1504 /* New dump:
1506 * 1. hook callback destructor.
1508 cb->args[1] = (long)cb->done;
1509 cb->done = fib6_dump_done;
1512 * 2. allocate and initialize walker.
1514 w = kmalloc(sizeof(*w), GFP_ATOMIC);
1515 if (w == NULL)
1516 return -ENOMEM;
1517 RT6_TRACE("dump<%p", w);
1518 memset(w, 0, sizeof(*w));
1519 w->root = &ip6_routing_table;
1520 w->func = fib6_dump_node;
1521 w->args = &arg;
1522 cb->args[0] = (long)w;
1523 read_lock_bh(&rt6_lock);
1524 res = fib6_walk(w);
1525 read_unlock_bh(&rt6_lock);
1526 } else {
1527 w->args = &arg;
1528 read_lock_bh(&rt6_lock);
1529 res = fib6_walk_continue(w);
1530 read_unlock_bh(&rt6_lock);
1532 #if RT6_DEBUG >= 3
1533 if (res <= 0 && skb->len == 0)
1534 RT6_TRACE("%p>dump end\n", w);
1535 #endif
1536 res = res < 0 ? res : skb->len;
1537 /* res < 0 is an error. (really, impossible)
1538 res == 0 means that dump is complete, but skb still can contain data.
1539 res > 0 dump is not complete, but frame is full.
1541 /* Destroy walker, if dump of this table is complete. */
1542 if (res <= 0)
1543 fib6_dump_end(cb);
1544 return res;
1547 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1549 struct rtattr **rta = arg;
1550 int iif = 0;
1551 int err;
1552 struct sk_buff *skb;
1553 struct flowi fl;
1554 struct rt6_info *rt;
1556 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1557 if (skb == NULL)
1558 return -ENOBUFS;
1560 /* Reserve room for dummy headers, this skb can pass
1561 through good chunk of routing engine.
1563 skb->mac.raw = skb->data;
1564 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1566 fl.proto = 0;
1567 fl.nl_u.ip6_u.daddr = NULL;
1568 fl.nl_u.ip6_u.saddr = NULL;
1569 fl.uli_u.icmpt.type = 0;
1570 fl.uli_u.icmpt.code = 0;
1571 if (rta[RTA_SRC-1])
1572 fl.nl_u.ip6_u.saddr = (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]);
1573 if (rta[RTA_DST-1])
1574 fl.nl_u.ip6_u.daddr = (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]);
1576 if (rta[RTA_IIF-1])
1577 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1579 if (iif) {
1580 struct net_device *dev;
1581 dev = __dev_get_by_index(iif);
1582 if (!dev)
1583 return -ENODEV;
1586 fl.oif = 0;
1587 if (rta[RTA_OIF-1])
1588 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1590 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1592 skb->dst = &rt->u.dst;
1594 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1595 err = rt6_fill_node(skb, rt,
1596 fl.nl_u.ip6_u.daddr,
1597 fl.nl_u.ip6_u.saddr,
1598 iif,
1599 RTM_NEWROUTE, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq);
1600 if (err < 0)
1601 return -EMSGSIZE;
1603 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1604 if (err < 0)
1605 return err;
1606 return 0;
1609 void inet6_rt_notify(int event, struct rt6_info *rt)
1611 struct sk_buff *skb;
1612 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1614 skb = alloc_skb(size, gfp_any());
1615 if (!skb) {
1616 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
1617 return;
1619 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0) < 0) {
1620 kfree_skb(skb);
1621 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
1622 return;
1624 NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
1625 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
1629 * /proc
1632 #ifdef CONFIG_PROC_FS
1634 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested by the caller */
	int skip;	/* routes skipped so far */
	int len;	/* bytes written to buffer so far */
};
1645 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
1647 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
1648 int i;
1650 if (arg->skip < arg->offset / RT6_INFO_LEN) {
1651 arg->skip++;
1652 return 0;
1655 if (arg->len >= arg->length)
1656 return 0;
1658 for (i=0; i<16; i++) {
1659 sprintf(arg->buffer + arg->len, "%02x",
1660 rt->rt6i_dst.addr.s6_addr[i]);
1661 arg->len += 2;
1663 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1664 rt->rt6i_dst.plen);
1666 #ifdef CONFIG_IPV6_SUBTREES
1667 for (i=0; i<16; i++) {
1668 sprintf(arg->buffer + arg->len, "%02x",
1669 rt->rt6i_src.addr.s6_addr[i]);
1670 arg->len += 2;
1672 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1673 rt->rt6i_src.plen);
1674 #else
1675 sprintf(arg->buffer + arg->len,
1676 "00000000000000000000000000000000 00 ");
1677 arg->len += 36;
1678 #endif
1680 if (rt->rt6i_nexthop) {
1681 for (i=0; i<16; i++) {
1682 sprintf(arg->buffer + arg->len, "%02x",
1683 rt->rt6i_nexthop->primary_key[i]);
1684 arg->len += 2;
1686 } else {
1687 sprintf(arg->buffer + arg->len,
1688 "00000000000000000000000000000000");
1689 arg->len += 32;
1691 arg->len += sprintf(arg->buffer + arg->len,
1692 " %08x %08x %08x %08x %8s\n",
1693 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
1694 rt->u.dst.__use, rt->rt6i_flags,
1695 rt->rt6i_dev ? rt->rt6i_dev->name : "");
1696 return 0;
1699 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1701 struct rt6_proc_arg arg;
1702 arg.buffer = buffer;
1703 arg.offset = offset;
1704 arg.length = length;
1705 arg.skip = 0;
1706 arg.len = 0;
1708 read_lock_bh(&rt6_lock);
1709 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
1710 read_unlock_bh(&rt6_lock);
1712 *start = buffer;
1713 if (offset)
1714 *start += offset % RT6_INFO_LEN;
1716 arg.len -= offset % RT6_INFO_LEN;
1718 if (arg.len > length)
1719 arg.len = length;
1720 if (arg.len < 0)
1721 arg.len = 0;
1723 return arg.len;
1726 extern struct rt6_statistics rt6_stats;
1728 static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length)
1730 int len;
1732 len = sprintf(buffer, "%04x %04x %04x %04x %04x %04x\n",
1733 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
1734 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
1735 rt6_stats.fib_rt_cache,
1736 atomic_read(&ip6_dst_ops.entries));
1738 len -= offset;
1740 if (len > length)
1741 len = length;
1742 if(len < 0)
1743 len = 0;
1745 *start = buffer + offset;
1747 return len;
1749 #endif /* CONFIG_PROC_FS */
1751 #ifdef CONFIG_SYSCTL
1753 static int flush_delay;
1755 static
1756 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1757 void *buffer, size_t *lenp)
1759 if (write) {
1760 proc_dointvec(ctl, write, filp, buffer, lenp);
1761 if (flush_delay < 0)
1762 flush_delay = 0;
1763 fib6_run_gc((unsigned long)flush_delay);
1764 return 0;
1765 } else
1766 return -EINVAL;
1769 ctl_table ipv6_route_table[] = {
1770 {NET_IPV6_ROUTE_FLUSH, "flush",
1771 &flush_delay, sizeof(int), 0644, NULL,
1772 &ipv6_sysctl_rtcache_flush},
1773 {NET_IPV6_ROUTE_GC_THRESH, "gc_thresh",
1774 &ip6_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
1775 &proc_dointvec},
1776 {NET_IPV6_ROUTE_MAX_SIZE, "max_size",
1777 &ip6_rt_max_size, sizeof(int), 0644, NULL,
1778 &proc_dointvec},
1779 {NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
1780 &ip6_rt_gc_min_interval, sizeof(int), 0644, NULL,
1781 &proc_dointvec_jiffies, &sysctl_jiffies},
1782 {NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout",
1783 &ip6_rt_gc_timeout, sizeof(int), 0644, NULL,
1784 &proc_dointvec_jiffies, &sysctl_jiffies},
1785 {NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval",
1786 &ip6_rt_gc_interval, sizeof(int), 0644, NULL,
1787 &proc_dointvec_jiffies, &sysctl_jiffies},
1788 {NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity",
1789 &ip6_rt_gc_elasticity, sizeof(int), 0644, NULL,
1790 &proc_dointvec_jiffies, &sysctl_jiffies},
1791 {NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires",
1792 &ip6_rt_mtu_expires, sizeof(int), 0644, NULL,
1793 &proc_dointvec_jiffies, &sysctl_jiffies},
1794 {NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss",
1795 &ip6_rt_min_advmss, sizeof(int), 0644, NULL,
1796 &proc_dointvec_jiffies, &sysctl_jiffies},
1800 #endif
1803 void __init ip6_route_init(void)
1805 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
1806 sizeof(struct rt6_info),
1807 0, SLAB_HWCACHE_ALIGN,
1808 NULL, NULL);
1809 fib6_init();
1810 #ifdef CONFIG_PROC_FS
1811 proc_net_create("ipv6_route", 0, rt6_proc_info);
1812 proc_net_create("rt6_stats", 0, rt6_proc_stats);
1813 #endif
1816 #ifdef MODULE
1817 void ip6_route_cleanup(void)
1819 #ifdef CONFIG_PROC_FS
1820 proc_net_remove("ipv6_route");
1821 proc_net_remove("rt6_stats");
1822 #endif
1824 rt6_ifdown(NULL);
1825 fib6_gc_cleanup();
1827 #endif /* MODULE */