[NET]: Allow XFRM subsystem to be optional.
[linux-2.6/history.git] / net / ipv6 / route.c
blob692c0477837c738f4cb3bb931845f5f8b95afb11
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 /* Changes:
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/config.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/init.h>
37 #include <linux/netlink.h>
38 #include <linux/if_arp.h>
40 #ifdef CONFIG_PROC_FS
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #endif
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
56 #include <asm/uaccess.h>
58 #ifdef CONFIG_SYSCTL
59 #include <linux/sysctl.h>
60 #endif
62 /* Set to 3 to get tracing. */
63 #define RT6_DEBUG 2
65 #if RT6_DEBUG >= 3
66 #define RDBG(x) printk x
67 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
68 #else
69 #define RDBG(x)
70 #define RT6_TRACE(x...) do { ; } while (0)
71 #endif
/* Routing-cache tunables (jiffies-based intervals use HZ) and forward
 * declarations for the dst_ops callbacks defined later in this file.
 * NOTE(review): the scraped text drops blank/brace lines (original
 * numbering skips); code kept byte-identical. */
74 static int ip6_rt_max_size = 4096;
75 static int ip6_rt_gc_min_interval = HZ / 2;
76 static int ip6_rt_gc_timeout = 60*HZ;
77 int ip6_rt_gc_interval = 30*HZ;
78 static int ip6_rt_gc_elasticity = 9;
79 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Minimum advertised MSS: minimum MTU minus TCP (20) and IPv6 (40) headers. */
80 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
82 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
83 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static int ip6_dst_gc(void);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static void ip6_link_failure(struct sk_buff *skb);
89 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* dst_ops vtable for IPv6 routes: ties the generic dst cache to the
 * IPv6-specific check/gc/pmtu handlers defined in this file. */
91 static struct dst_ops ip6_dst_ops = {
92 .family = AF_INET6,
93 .protocol = __constant_htons(ETH_P_IPV6),
94 .gc = ip6_dst_gc,
95 .gc_thresh = 1024,
96 .check = ip6_dst_check,
97 .negative_advice = ip6_negative_advice,
98 .link_failure = ip6_link_failure,
99 .update_pmtu = ip6_rt_update_pmtu,
100 .entry_size = sizeof(struct rt6_info),
/* Catch-all "no route" entry: both input and output discard the packet
 * and dst.error carries -ENETUNREACH back to callers. */
103 struct rt6_info ip6_null_entry = {
104 .u = {
105 .dst = {
106 .__refcnt = ATOMIC_INIT(1),
107 .__use = 1,
108 .dev = &loopback_dev,
109 .obsolete = -1,
110 .error = -ENETUNREACH,
111 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
112 .input = ip6_pkt_discard,
113 .output = ip6_pkt_discard,
114 .ops = &ip6_dst_ops,
115 .path = (struct dst_entry*)&ip6_null_entry,
118 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
119 .rt6i_metric = ~(u32) 0,
120 .rt6i_ref = ATOMIC_INIT(1),
/* Root of the single IPv6 FIB radix tree; leaf falls back to the null entry. */
123 struct fib6_node ip6_routing_table = {
124 .leaf = &ip6_null_entry,
125 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
128 /* Protects all the ip6 fib */
130 rwlock_t rt6_lock = RW_LOCK_UNLOCKED;
133 /* allocate dst with ip6_dst_ops */
/* Allocate a fresh rt6_info-sized dst entry bound to ip6_dst_ops. */
134 static __inline__ struct rt6_info *ip6_dst_alloc(void)
136 return dst_alloc(&ip6_dst_ops);
140 * Route lookup. Any rt6_lock is implied.
/* Pick from the sibling chain 'rt' the entry whose device matches 'oif'.
 * Falls back to a loopback sibling, then to 'rt' itself, or (if 'strict')
 * to the unreachable ip6_null_entry.  Caller holds rt6_lock. */
143 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
144 int oif,
145 int strict)
147 struct rt6_info *local = NULL;
148 struct rt6_info *sprt;
150 if (oif) {
151 for (sprt = rt; sprt; sprt = sprt->u.next) {
152 struct net_device *dev = sprt->rt6i_dev;
153 if (dev->ifindex == oif)
154 return sprt;
/* Remember a loopback route as second choice. */
155 if (dev->flags&IFF_LOOPBACK)
156 local = sprt;
159 if (local)
160 return local;
162 if (strict)
163 return &ip6_null_entry;
165 return rt;
169 * pointer to the last default router chosen. BH is disabled locally.
/* Last default router chosen by rt6_best_dflt; guarded by rt6_dflt_lock
 * (taken with BH already disabled, see the comment above). */
171 static struct rt6_info *rt6_dflt_pointer;
172 static spinlock_t rt6_dflt_lock = SPIN_LOCK_UNLOCKED;
174 /* Default Router Selection (RFC 2461 6.3.6) */
174 /* Default Router Selection (RFC 2461 6.3.6) */
/* Score each default router in the chain: +8 matching interface, +4 being
 * the previously chosen router, +3/+2/+1 by neighbour reachability state.
 * A score >= 12 (matching iface + sticky + reachable) wins immediately.
 * If nothing scores, round-robin among non-dead entries, then fall back
 * to any RTF_DEFAULT addrconf route, then to ip6_null_entry. */
175 static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif)
177 struct rt6_info *match = NULL;
178 struct rt6_info *sprt;
179 int mpri = 0;
181 for (sprt = rt; sprt; sprt = sprt->u.next) {
182 struct neighbour *neigh;
183 int m = 0;
185 if (!oif ||
186 (sprt->rt6i_dev &&
187 sprt->rt6i_dev->ifindex == oif))
188 m += 8;
190 if (sprt == rt6_dflt_pointer)
191 m += 4;
193 if ((neigh = sprt->rt6i_nexthop) != NULL) {
194 read_lock_bh(&neigh->lock);
195 switch (neigh->nud_state) {
196 case NUD_REACHABLE:
197 m += 3;
198 break;
200 case NUD_STALE:
201 case NUD_DELAY:
202 case NUD_PROBE:
203 m += 2;
204 break;
206 case NUD_NOARP:
207 case NUD_PERMANENT:
208 m += 1;
209 break;
211 case NUD_INCOMPLETE:
212 default:
213 read_unlock_bh(&neigh->lock);
214 continue;
216 read_unlock_bh(&neigh->lock);
217 } else {
218 continue;
221 if (m > mpri || m >= 12) {
222 match = sprt;
223 mpri = m;
224 if (m >= 12) {
225 /* we choose the lastest default router if it
226 * is in (probably) reachable state.
227 * If route changed, we should do pmtu
228 * discovery. --yoshfuji
230 break;
235 spin_lock(&rt6_dflt_lock);
236 if (!match) {
238 * No default routers are known to be reachable.
239 * SHOULD round robin
/* Resume scanning after the last chosen router, wrapping to the head. */
241 if (rt6_dflt_pointer) {
242 for (sprt = rt6_dflt_pointer->u.next;
243 sprt; sprt = sprt->u.next) {
244 if (sprt->u.dst.obsolete <= 0 &&
245 sprt->u.dst.error == 0) {
246 match = sprt;
247 break;
250 for (sprt = rt;
251 !match && sprt;
252 sprt = sprt->u.next) {
253 if (sprt->u.dst.obsolete <= 0 &&
254 sprt->u.dst.error == 0) {
255 match = sprt;
256 break;
258 if (sprt == rt6_dflt_pointer)
259 break;
264 if (match) {
265 if (rt6_dflt_pointer != match)
266 RT6_TRACE("changed default router: %p->%p\n",
267 rt6_dflt_pointer, match);
268 rt6_dflt_pointer = match;
270 spin_unlock(&rt6_dflt_lock);
272 if (!match) {
274 * Last Resort: if no default routers found,
275 * use addrconf default route.
276 * We don't record this route.
278 for (sprt = ip6_routing_table.leaf;
279 sprt; sprt = sprt->u.next) {
280 if ((sprt->rt6i_flags & RTF_DEFAULT) &&
281 (!oif ||
282 (sprt->rt6i_dev &&
283 sprt->rt6i_dev->ifindex == oif))) {
284 match = sprt;
285 break;
288 if (!match) {
289 /* no default route. give up. */
290 match = &ip6_null_entry;
294 return match;
/* Look up a route for daddr/saddr restricted to interface 'oif'.
 * Returns a held rt6_info on success (caller must dst_release), or NULL
 * when the best match carries a dst error (e.g. the null entry). */
297 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
298 int oif, int strict)
300 struct fib6_node *fn;
301 struct rt6_info *rt;
303 read_lock_bh(&rt6_lock);
304 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
305 rt = rt6_device_match(fn->leaf, oif, strict);
/* Take the reference while still under rt6_lock. */
306 dst_hold(&rt->u.dst);
307 rt->u.dst.__use++;
308 read_unlock_bh(&rt6_lock);
310 rt->u.dst.lastuse = jiffies;
311 if (rt->u.dst.error == 0)
312 return rt;
313 dst_release(&rt->u.dst);
314 return NULL;
317 /* rt6_ins is called with FREE rt6_lock.
318 It takes new route entry, the addition fails by any reason the
319 route is freed. In any case, if caller does not hold it, it may
320 be destroyed.
/* Insert 'rt' into the FIB under the write lock; see the comment above:
 * called with rt6_lock FREE, and on failure the route is freed by fib6_add. */
323 static int rt6_ins(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
325 int err;
327 write_lock_bh(&rt6_lock);
328 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr);
329 write_unlock_bh(&rt6_lock);
331 return err;
334 /* No rt6_lock! If COW failed, the function returns dead route entry
335 with dst->error set to errno value.
/* Clone 'ort' into a /128 RTF_CACHE host route for daddr (copy-on-write)
 * and insert it.  On clone failure returns a held ip6_null_entry; on
 * insert failure returns the clone with dst.error set (see comment above). */
338 static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
339 struct in6_addr *saddr)
341 int err;
342 struct rt6_info *rt;
345 * Clone the route.
348 rt = ip6_rt_copy(ort);
350 if (rt) {
351 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
/* On-link (non-gateway) route: the destination itself is the nexthop. */
353 if (!(rt->rt6i_flags&RTF_GATEWAY))
354 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
356 rt->rt6i_dst.plen = 128;
357 rt->rt6i_flags |= RTF_CACHE;
358 rt->u.dst.flags |= DST_HOST;
360 #ifdef CONFIG_IPV6_SUBTREES
361 if (rt->rt6i_src.plen && saddr) {
362 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
363 rt->rt6i_src.plen = 128;
365 #endif
367 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Hold before rt6_ins: fib6_add frees the route on failure. */
369 dst_hold(&rt->u.dst);
371 err = rt6_ins(rt, NULL, NULL);
372 if (err == 0)
373 return rt;
375 rt->u.dst.error = err;
377 return rt;
379 dst_hold(&ip6_null_entry.u.dst);
380 return &ip6_null_entry;
/* On a strict-match miss (rt == ip6_null_entry), walk back up the fib tree
 * to the nearest ancestor carrying route info and retry at 'restart';
 * at the root, give up via 'out'.  Expects fn/rt/strict in scope. */
383 #define BACKTRACK() \
384 if (rt == &ip6_null_entry && strict) { \
385 while ((fn = fn->parent) != NULL) { \
386 if (fn->fn_flags & RTN_ROOT) { \
387 dst_hold(&rt->u.dst); \
388 goto out; \
390 if (fn->fn_flags & RTN_RTINFO) \
391 goto restart; \
/* Route an incoming skb: look up by daddr/saddr, prefer an existing cache
 * entry, otherwise COW a host route via rt6_cow.  Retries up to 3 times
 * if another CPU inserts the same route in the unlocked gap.  The chosen
 * route is stored (held) in skb->dst. */
396 void ip6_route_input(struct sk_buff *skb)
398 struct fib6_node *fn;
399 struct rt6_info *rt;
400 int strict;
401 int attempts = 3;
/* Strict device matching for multicast / link-local destinations. */
403 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
405 relookup:
406 read_lock_bh(&rt6_lock);
408 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
409 &skb->nh.ipv6h->saddr);
411 restart:
412 rt = fn->leaf;
414 if ((rt->rt6i_flags & RTF_CACHE)) {
415 rt = rt6_device_match(rt, skb->dev->ifindex, strict);
416 BACKTRACK();
417 dst_hold(&rt->u.dst);
418 goto out;
421 rt = rt6_device_match(rt, skb->dev->ifindex, 0);
422 BACKTRACK();
/* Connected route without a nexthop: clone a host route (drops rt6_lock). */
424 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
425 read_unlock_bh(&rt6_lock);
427 rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
428 &skb->nh.ipv6h->saddr);
430 if (rt->u.dst.error != -EEXIST || --attempts <= 0)
431 goto out2;
432 /* Race condition! In the gap, when rt6_lock was
433 released someone could insert this route. Relookup.
435 goto relookup;
437 dst_hold(&rt->u.dst);
439 out:
440 read_unlock_bh(&rt6_lock);
441 out2:
442 rt->u.dst.lastuse = jiffies;
443 rt->u.dst.__use++;
444 skb->dst = (struct dst_entry *) rt;
/* Route an outgoing flow: like ip6_route_input but keyed by the flowi,
 * and default routes additionally go through rt6_best_dflt for RFC 2461
 * router selection.  Returns a held dst_entry. */
447 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
449 struct fib6_node *fn;
450 struct rt6_info *rt;
451 int strict;
452 int attempts = 3;
454 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
456 relookup:
457 read_lock_bh(&rt6_lock);
459 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
461 restart:
462 rt = fn->leaf;
464 if ((rt->rt6i_flags & RTF_CACHE)) {
465 rt = rt6_device_match(rt, fl->oif, strict);
466 BACKTRACK();
467 dst_hold(&rt->u.dst);
468 goto out;
/* Only addrconf-priority default routes go through best-router selection. */
470 if (rt->rt6i_flags & RTF_DEFAULT) {
471 if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF)
472 rt = rt6_best_dflt(rt, fl->oif);
473 } else {
474 rt = rt6_device_match(rt, fl->oif, strict);
475 BACKTRACK();
478 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
479 read_unlock_bh(&rt6_lock);
481 rt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src);
483 if (rt->u.dst.error != -EEXIST || --attempts <= 0)
484 goto out2;
486 /* Race condition! In the gap, when rt6_lock was
487 released someone could insert this route. Relookup.
489 goto relookup;
491 dst_hold(&rt->u.dst);
493 out:
494 read_unlock_bh(&rt6_lock);
495 out2:
496 rt->u.dst.lastuse = jiffies;
497 rt->u.dst.__use++;
498 return &rt->u.dst;
503 * Destination cache support functions
/* dst_ops.check: a cached route is still valid only while its fib node's
 * serial number matches the cookie recorded at lookup time. */
506 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
508 struct rt6_info *rt;
510 rt = (struct rt6_info *) dst;
512 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
513 return dst;
515 dst_release(dst);
516 return NULL;
/* dst_ops.negative_advice: drop a misbehaving cache clone from the FIB;
 * non-cache routes just lose the caller's reference. */
519 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
521 struct rt6_info *rt = (struct rt6_info *) dst;
523 if (rt) {
524 if (rt->rt6i_flags & RTF_CACHE)
525 ip6_del_rt(rt, NULL, NULL);
526 else
527 dst_release(dst);
529 return NULL;
/* dst_ops.link_failure: report unreachability via ICMPv6 and expire the
 * cache entry (or invalidate the fib node serial for default routes so
 * ip6_dst_check rejects cached users). */
532 static void ip6_link_failure(struct sk_buff *skb)
534 struct rt6_info *rt;
536 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
538 rt = (struct rt6_info *) skb->dst;
539 if (rt) {
540 if (rt->rt6i_flags&RTF_CACHE) {
541 dst_set_expires(&rt->u.dst, 0);
542 rt->rt6i_flags |= RTF_EXPIRES;
543 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
544 rt->rt6i_node->fn_sernum = -1;
/* dst_ops.update_pmtu: lower the cached MTU on host (/128) routes only;
 * shrinking a prefix route's MTU would affect unrelated destinations. */
548 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
550 struct rt6_info *rt6 = (struct rt6_info*)dst;
552 if (mtu < dst_pmtu(dst) && rt6->rt6i_dst.plen == 128) {
553 rt6->rt6i_flags |= RTF_MODIFIED;
554 dst->metrics[RTAX_MTU-1] = mtu;
558 /* Protected by rt6_lock. */
/* Singly-linked (via dst.next) list of ndisc-allocated dst entries awaiting gc. */
559 static struct dst_entry *ndisc_dst_gc_list;
/* Allocate a standalone RTF_LOCAL dst for neighbour discovery output.
 * The entry is never inserted into the FIB; it is chained onto
 * ndisc_dst_gc_list and reclaimed by ndisc_dst_gc once unreferenced.
 * Returns NULL on allocation failure. */
561 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
562 struct neighbour *neigh,
563 int (*output)(struct sk_buff *))
565 struct rt6_info *rt = ip6_dst_alloc();
567 if (unlikely(rt == NULL))
568 goto out;
570 if (dev)
571 dev_hold(dev);
572 if (neigh)
573 neigh_hold(neigh);
575 rt->rt6i_dev = dev;
576 rt->rt6i_nexthop = neigh;
577 rt->rt6i_expires = 0;
578 rt->rt6i_flags = RTF_LOCAL;
579 rt->rt6i_metric = 0;
580 atomic_set(&rt->u.dst.__refcnt, 1);
581 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
582 rt->u.dst.output = output;
/* Chain onto the gc list under the fib write lock. */
584 write_lock_bh(&rt6_lock);
585 rt->u.dst.next = ndisc_dst_gc_list;
586 ndisc_dst_gc_list = &rt->u.dst;
587 write_unlock_bh(&rt6_lock);
589 fib6_force_start_gc();
591 out:
592 return (struct dst_entry *)rt;
/* Reap unreferenced entries from ndisc_dst_gc_list.  Returns the number
 * freed; '*more' is incremented per entry still in use.  Locking is the
 * caller's responsibility (list is protected by rt6_lock, see above). */
595 int ndisc_dst_gc(int *more)
597 struct dst_entry *dst, *next, **pprev;
598 int freed;
600 next = NULL;
601 pprev = &ndisc_dst_gc_list;
602 freed = 0;
603 while ((dst = *pprev) != NULL) {
604 if (!atomic_read(&dst->__refcnt)) {
/* Unlink and free; pprev keeps pointing at the link we just rewrote. */
605 *pprev = dst->next;
606 dst_free(dst);
607 freed++;
608 } else {
609 pprev = &dst->next;
610 (*more)++;
614 return freed;
/* dst_ops.gc: rate-limited garbage collection of the routing cache with an
 * adaptive expiry ('expire' decays each call, is reset when the cache drops
 * below gc_thresh).  Returns nonzero while the cache is still over limit. */
617 static int ip6_dst_gc(void)
619 static unsigned expire = 30*HZ;
620 static unsigned long last_gc;
621 unsigned long now = jiffies;
/* Skip the sweep when we ran recently and are not over the hard limit. */
623 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
624 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
625 goto out;
627 expire++;
628 fib6_run_gc(expire);
629 last_gc = now;
630 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
631 expire = ip6_rt_gc_timeout>>1;
633 out:
634 expire -= expire>>ip6_rt_gc_elasticity;
635 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
638 /* Clean host part of a prefix. Not necessary in radix tree,
639 but results in cleaner routing tables.
641 Remove it only when all the things will work!
/* Return the device's IPv6 MTU (cnf.mtu6), or IPV6_MIN_MTU when the
 * device has no inet6 state. */
644 static int ipv6_get_mtu(struct net_device *dev)
646 int mtu = IPV6_MIN_MTU;
647 struct inet6_dev *idev;
649 idev = in6_dev_get(dev);
650 if (idev) {
651 mtu = idev->cnf.mtu6;
652 in6_dev_put(idev);
654 return mtu;
/* Derive the advertised MSS from a path MTU: strip IPv6+TCP headers,
 * clamp below to ip6_rt_min_advmss and above to the IPV6_MAXPLEN
 * sentinel meaning "any MSS, rely on pmtu discovery". */
657 static inline unsigned int ipv6_advmss(unsigned int mtu)
659 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
661 if (mtu < ip6_rt_min_advmss)
662 mtu = ip6_rt_min_advmss;
665 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
666 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
667 * IPV6_MAXPLEN is also valid and means: "any MSS,
668 * rely only on pmtu discovery"
670 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
671 mtu = IPV6_MAXPLEN;
672 return mtu;
/* Return the device's configured hop limit, falling back to the global
 * ipv6_devconf default when the device has no inet6 state. */
675 static int ipv6_get_hoplimit(struct net_device *dev)
677 int hoplimit = ipv6_devconf.hop_limit;
678 struct inet6_dev *idev;
680 idev = in6_dev_get(dev);
681 if (idev) {
682 hoplimit = idev->cnf.hop_limit;
683 in6_dev_put(idev);
685 return hoplimit;
/* Build an rt6_info from an in6_rtmsg (ioctl or netlink) and insert it.
 * Validates prefix lengths, resolves the output device and gateway
 * (gateway must be link-local unicast, or reachable via a non-gateway
 * route for the PtP/SIT/NBMA exceptions noted below), applies optional
 * RTA_METRICS, and fills in MTU/advmss/hoplimit defaults.  Loopback and
 * RTF_REJECT routes are demoted to discard/reject entries.  Returns 0
 * or a negative errno; on error the partially built route is freed. */
692 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
694 int err;
695 struct rtmsg *r;
696 struct rtattr **rta;
697 struct rt6_info *rt;
698 struct net_device *dev = NULL;
699 int addr_type;
701 rta = (struct rtattr **) _rtattr;
703 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
704 return -EINVAL;
705 #ifndef CONFIG_IPV6_SUBTREES
706 if (rtmsg->rtmsg_src_len)
707 return -EINVAL;
708 #endif
709 if (rtmsg->rtmsg_metric == 0)
710 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
712 rt = ip6_dst_alloc();
714 if (rt == NULL)
715 return -ENOMEM;
717 rt->u.dst.obsolete = -1;
718 rt->rt6i_expires = rtmsg->rtmsg_info;
719 if (nlh && (r = NLMSG_DATA(nlh))) {
720 rt->rt6i_protocol = r->rtm_protocol;
721 } else {
722 rt->rt6i_protocol = RTPROT_BOOT;
725 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
727 if (addr_type & IPV6_ADDR_MULTICAST)
728 rt->u.dst.input = ip6_mc_input;
729 else
730 rt->u.dst.input = ip6_forward;
732 rt->u.dst.output = ip6_output;
734 if (rtmsg->rtmsg_ifindex) {
735 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
736 err = -ENODEV;
737 if (dev == NULL)
738 goto out;
741 ipv6_addr_prefix(&rt->rt6i_dst.addr,
742 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
743 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
744 if (rt->rt6i_dst.plen == 128)
745 rt->u.dst.flags = DST_HOST;
747 #ifdef CONFIG_IPV6_SUBTREES
748 ipv6_addr_prefix(&rt->rt6i_src.addr,
749 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
750 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
751 #endif
753 rt->rt6i_metric = rtmsg->rtmsg_metric;
755 /* We cannot add true routes via loopback here,
756 they would result in kernel looping; promote them to reject routes
758 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
759 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
760 if (dev)
761 dev_put(dev);
762 dev = &loopback_dev;
763 dev_hold(dev);
764 rt->u.dst.output = ip6_pkt_discard;
765 rt->u.dst.input = ip6_pkt_discard;
766 rt->u.dst.error = -ENETUNREACH;
767 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
768 goto install_route;
771 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
772 struct in6_addr *gw_addr;
773 int gwa_type;
775 gw_addr = &rtmsg->rtmsg_gateway;
776 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
777 gwa_type = ipv6_addr_type(gw_addr);
779 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
780 struct rt6_info *grt;
782 /* IPv6 strictly inhibits using not link-local
783 addresses as nexthop address.
784 Otherwise, router will not able to send redirects.
785 It is very good, but in some (rare!) curcumstances
786 (SIT, PtP, NBMA NOARP links) it is handy to allow
787 some exceptions. --ANK
789 err = -EINVAL;
790 if (!(gwa_type&IPV6_ADDR_UNICAST))
791 goto out;
/* The non-link-local gateway must itself be reachable on-link. */
793 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
795 err = -EHOSTUNREACH;
796 if (grt == NULL)
797 goto out;
798 if (dev) {
799 if (dev != grt->rt6i_dev) {
800 dst_release(&grt->u.dst);
801 goto out;
803 } else {
804 dev = grt->rt6i_dev;
805 dev_hold(dev);
807 if (!(grt->rt6i_flags&RTF_GATEWAY))
808 err = 0;
809 dst_release(&grt->u.dst);
811 if (err)
812 goto out;
814 err = -EINVAL;
815 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
816 goto out;
819 err = -ENODEV;
820 if (dev == NULL)
821 goto out;
/* Resolve the neighbour entry for gatewayed / pinned-nexthop routes. */
823 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
824 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
825 if (IS_ERR(rt->rt6i_nexthop)) {
826 err = PTR_ERR(rt->rt6i_nexthop);
827 rt->rt6i_nexthop = NULL;
828 goto out;
832 rt->rt6i_flags = rtmsg->rtmsg_flags;
834 install_route:
835 if (rta && rta[RTA_METRICS-1]) {
836 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
837 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
839 while (RTA_OK(attr, attrlen)) {
840 unsigned flavor = attr->rta_type;
841 if (flavor) {
842 if (flavor > RTAX_MAX) {
843 err = -EINVAL;
844 goto out;
846 rt->u.dst.metrics[flavor-1] =
847 *(u32 *)RTA_DATA(attr);
849 attr = RTA_NEXT(attr, attrlen);
/* Fill in defaults for metrics the caller did not supply. */
853 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) {
854 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
855 rt->u.dst.metrics[RTAX_HOPLIMIT-1] =
856 IPV6_DEFAULT_MCASTHOPS;
857 else
858 rt->u.dst.metrics[RTAX_HOPLIMIT-1] =
859 ipv6_get_hoplimit(dev);
862 if (!rt->u.dst.metrics[RTAX_MTU-1])
863 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
864 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
865 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
866 rt->u.dst.dev = dev;
867 return rt6_ins(rt, nlh, _rtattr);
869 out:
870 if (dev)
871 dev_put(dev);
872 dst_free((struct dst_entry *) rt);
873 return err;
/* Remove 'rt' from the FIB, consuming the caller's reference.  Clears the
 * cached default-router pointer so rt6_best_dflt cannot keep pointing at
 * a deleted route. */
876 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
878 int err;
880 write_lock_bh(&rt6_lock);
882 spin_lock_bh(&rt6_dflt_lock);
883 rt6_dflt_pointer = NULL;
884 spin_unlock_bh(&rt6_dflt_lock);
886 dst_release(&rt->u.dst);
888 err = fib6_del(rt, nlh, _rtattr);
889 write_unlock_bh(&rt6_lock);
891 return err;
/* Delete the route described by 'rtmsg': locate the exact prefix node,
 * then match on interface, gateway and metric (each only if specified).
 * Returns -ESRCH when nothing matches. */
894 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
896 struct fib6_node *fn;
897 struct rt6_info *rt;
898 int err = -ESRCH;
900 read_lock_bh(&rt6_lock);
902 fn = fib6_locate(&ip6_routing_table,
903 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
904 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
906 if (fn) {
907 for (rt = fn->leaf; rt; rt = rt->u.next) {
908 if (rtmsg->rtmsg_ifindex &&
909 (rt->rt6i_dev == NULL ||
910 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
911 continue;
912 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
913 ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
914 continue;
915 if (rtmsg->rtmsg_metric &&
916 rtmsg->rtmsg_metric != rt->rt6i_metric)
917 continue;
/* Hold and drop the read lock: ip6_del_rt takes the write lock itself. */
918 dst_hold(&rt->u.dst);
919 read_unlock_bh(&rt6_lock);
921 return ip6_del_rt(rt, nlh, _rtattr);
924 read_unlock_bh(&rt6_lock);
926 return err;
930 * Handle redirects
/* Process an NDISC redirect for 'dest' received from 'saddr' via 'neigh'.
 * Validates that the redirect came from our current nexthop (checking all
 * default routers per RFC 2461, see comment below), then installs a /128
 * RTF_CACHE clone pointing at the new neighbour and removes the stale
 * cache entry if there was one. */
932 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
933 struct neighbour *neigh, int on_link)
935 struct rt6_info *rt, *nrt;
937 /* Locate old route to this destination. */
938 rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
940 if (rt == NULL)
941 return;
943 if (neigh->dev != rt->rt6i_dev)
944 goto out;
946 /* Redirect received -> path was valid.
947 Look, redirects are sent only in response to data packets,
948 so that this nexthop apparently is reachable. --ANK
950 dst_confirm(&rt->u.dst);
952 /* Duplicate redirect: silently ignore. */
953 if (neigh == rt->u.dst.neighbour)
954 goto out;
956 /* Current route is on-link; redirect is always invalid.
958 Seems, previous statement is not true. It could
959 be node, which looks for us as on-link (f.e. proxy ndisc)
960 But then router serving it might decide, that we should
961 know truth 8)8) --ANK (980726).
963 if (!(rt->rt6i_flags&RTF_GATEWAY))
964 goto out;
967 * RFC 2461 specifies that redirects should only be
968 * accepted if they come from the nexthop to the target.
969 * Due to the way default routers are chosen, this notion
970 * is a bit fuzzy and one might need to check all default
971 * routers.
974 if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) {
975 if (rt->rt6i_flags & RTF_DEFAULT) {
976 struct rt6_info *rt1;
978 read_lock(&rt6_lock);
979 for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
980 if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) {
/* Swap our held reference over to the matching default router. */
981 dst_hold(&rt1->u.dst);
982 dst_release(&rt->u.dst);
983 read_unlock(&rt6_lock);
984 rt = rt1;
985 goto source_ok;
988 read_unlock(&rt6_lock);
990 if (net_ratelimit())
991 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
992 "for redirect target\n");
993 goto out;
996 source_ok:
999 * We have finally decided to accept it.
1002 nrt = ip6_rt_copy(rt);
1003 if (nrt == NULL)
1004 goto out;
1006 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1007 if (on_link)
1008 nrt->rt6i_flags &= ~RTF_GATEWAY;
1010 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1011 nrt->rt6i_dst.plen = 128;
1012 nrt->u.dst.flags |= DST_HOST;
1014 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1015 nrt->rt6i_nexthop = neigh_clone(neigh);
1016 /* Reset pmtu, it may be better */
1017 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1018 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&nrt->u.dst));
1020 if (rt6_ins(nrt, NULL, NULL))
1021 goto out;
/* ip6_del_rt consumes our reference to the old cache entry. */
1023 if (rt->rt6i_flags&RTF_CACHE) {
1024 ip6_del_rt(rt, NULL, NULL);
1025 return;
1028 out:
1029 dst_release(&rt->u.dst);
1030 return;
1034 * Handle ICMP "packet too big" messages
1035 * i.e. Path MTU discovery
/* Handle an ICMPv6 "packet too big" for daddr: clamp pmtu to IPV6_MIN_MTU
 * (RFC 1981), update an existing cache entry in place, or create an
 * expiring /128 clone (COW for connected routes, plain clone for
 * gatewayed/NONEXTHOP ones) carrying the reduced MTU. */
1038 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1039 struct net_device *dev, u32 pmtu)
1041 struct rt6_info *rt, *nrt;
1043 if (pmtu < IPV6_MIN_MTU) {
1044 if (net_ratelimit())
1045 printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
1046 pmtu);
1047 /* According to RFC1981, the PMTU is set to the IPv6 minimum
1048 link MTU if the node receives a Packet Too Big message
1049 reporting next-hop MTU that is less than the IPv6 minimum MTU.
1051 pmtu = IPV6_MIN_MTU;
1054 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1056 if (rt == NULL)
1057 return;
/* Only ever decrease the path MTU here. */
1059 if (pmtu >= dst_pmtu(&rt->u.dst))
1060 goto out;
1062 /* New mtu received -> path was valid.
1063 They are sent only in response to data packets,
1064 so that this nexthop apparently is reachable. --ANK
1066 dst_confirm(&rt->u.dst);
1068 /* Host route. If it is static, it would be better
1069 not to override it, but add new one, so that
1070 when cache entry will expire old pmtu
1071 would return automatically.
1073 if (rt->rt6i_flags & RTF_CACHE) {
1074 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1075 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1076 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1077 goto out;
1080 /* Network route.
1081 Two cases are possible:
1082 1. It is connected route. Action: COW
1083 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1085 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
1086 nrt = rt6_cow(rt, daddr, saddr);
1087 if (!nrt->u.dst.error) {
1088 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1089 /* According to RFC 1981, detecting PMTU increase shouldn't be
1090 happened within 5 mins, the recommended timer is 10 mins.
1091 Here this route expiration time is set to ip6_rt_mtu_expires
1092 which is 10 mins. After 10 mins the decreased pmtu is expired
1093 and detecting PMTU increase will be automatically happened.
1095 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1096 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1097 dst_release(&nrt->u.dst);
1099 } else {
1100 nrt = ip6_rt_copy(rt);
1101 if (nrt == NULL)
1102 goto out;
1103 ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
1104 nrt->rt6i_dst.plen = 128;
1105 nrt->u.dst.flags |= DST_HOST;
1106 nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
1107 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1108 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES;
1109 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1110 rt6_ins(nrt, NULL, NULL);
1113 out:
1114 dst_release(&rt->u.dst);
1118 * Misc support functions
/* Shallow-copy 'ort' into a freshly allocated rt6_info: handlers, metrics,
 * device (held), gateway and destination keys.  Expiry and RTF_EXPIRES are
 * deliberately cleared; the nexthop neighbour is NOT copied (callers set it).
 * Returns NULL on allocation failure. */
1121 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1123 struct rt6_info *rt = ip6_dst_alloc();
1125 if (rt) {
1126 rt->u.dst.input = ort->u.dst.input;
1127 rt->u.dst.output = ort->u.dst.output;
1129 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1130 rt->u.dst.dev = ort->u.dst.dev;
1131 if (rt->u.dst.dev)
1132 dev_hold(rt->u.dst.dev);
1133 rt->u.dst.lastuse = jiffies;
1134 rt->rt6i_expires = 0;
1136 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1137 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1138 rt->rt6i_metric = 0;
1140 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1141 #ifdef CONFIG_IPV6_SUBTREES
1142 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1143 #endif
1145 return rt;
/* Find the default-router entry for gateway 'addr' on 'dev' by scanning
 * the root node's leaf chain.  Returns a held route or NULL. */
1148 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1150 struct rt6_info *rt;
1151 struct fib6_node *fn;
1153 fn = &ip6_routing_table;
1155 write_lock_bh(&rt6_lock);
1156 for (rt = fn->leaf; rt; rt=rt->u.next) {
1157 if (dev == rt->rt6i_dev &&
1158 ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0)
1159 break;
1161 if (rt)
1162 dst_hold(&rt->u.dst);
1163 write_unlock_bh(&rt6_lock);
1164 return rt;
/* Install an addrconf default route (::/0, metric 1024) via 'gwaddr' on
 * 'dev', then look it up again to return a held reference (or NULL if
 * the add failed). */
1167 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1168 struct net_device *dev)
1170 struct in6_rtmsg rtmsg;
1172 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1173 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1174 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1175 rtmsg.rtmsg_metric = 1024;
1176 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP;
1178 rtmsg.rtmsg_ifindex = dev->ifindex;
1180 ip6_route_add(&rtmsg, NULL, NULL);
1181 return rt6_get_dflt_router(gwaddr, dev);
/* Delete every default (or, with 'last_resort', RTF_ALLONLINK) route.
 * Each deletion drops rt6_lock, so the scan restarts from the head
 * after every removal. */
1184 void rt6_purge_dflt_routers(int last_resort)
1186 struct rt6_info *rt;
1187 u32 flags;
1189 if (last_resort)
1190 flags = RTF_ALLONLINK;
1191 else
1192 flags = RTF_DEFAULT | RTF_ADDRCONF;
1194 restart:
1195 read_lock_bh(&rt6_lock);
1196 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1197 if (rt->rt6i_flags & flags) {
1198 dst_hold(&rt->u.dst);
1200 spin_lock_bh(&rt6_dflt_lock);
1201 rt6_dflt_pointer = NULL;
1202 spin_unlock_bh(&rt6_dflt_lock);
1204 read_unlock_bh(&rt6_lock);
1206 ip6_del_rt(rt, NULL, NULL);
1208 goto restart;
1211 read_unlock_bh(&rt6_lock);
/* ioctl entry point for SIOCADDRT/SIOCDELRT: requires CAP_NET_ADMIN,
 * copies the in6_rtmsg from userspace and dispatches under rtnl_lock.
 * Any other cmd returns -EINVAL. */
1214 int ipv6_route_ioctl(unsigned int cmd, void *arg)
1216 struct in6_rtmsg rtmsg;
1217 int err;
1219 switch(cmd) {
1220 case SIOCADDRT: /* Add a route */
1221 case SIOCDELRT: /* Delete a route */
1222 if (!capable(CAP_NET_ADMIN))
1223 return -EPERM;
1224 err = copy_from_user(&rtmsg, arg,
1225 sizeof(struct in6_rtmsg));
1226 if (err)
1227 return -EFAULT;
1229 rtnl_lock();
1230 switch (cmd) {
1231 case SIOCADDRT:
1232 err = ip6_route_add(&rtmsg, NULL, NULL);
1233 break;
1234 case SIOCDELRT:
1235 err = ip6_route_del(&rtmsg, NULL, NULL);
1236 break;
1237 default:
1238 err = -EINVAL;
1240 rtnl_unlock();
1242 return err;
1245 return -EINVAL;
1249 * Drop the packet on the floor
/* Drop handler for reject/null routes: count it, send "no route to
 * destination" back to the sender, and free the skb. */
1252 int ip6_pkt_discard(struct sk_buff *skb)
1254 IP6_INC_STATS(Ip6OutNoRoutes);
1255 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1256 kfree_skb(skb);
1257 return 0;
1261 * Add address
/* Install the local /128 route for address 'addr' (via loopback, as local
 * and anycast addresses are delivered there).  'anycast' suppresses the
 * RTF_LOCAL flag.  Returns 0 or -ENOMEM.
 * NOTE(review): 'dev' is only implicitly related to the address; the route
 * device is always loopback_dev — confirm against callers. */
1264 int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev, int anycast)
1266 struct rt6_info *rt = ip6_dst_alloc();
1268 if (rt == NULL)
1269 return -ENOMEM;
1271 dev_hold(&loopback_dev);
1273 rt->u.dst.flags = DST_HOST;
1274 rt->u.dst.input = ip6_input;
1275 rt->u.dst.output = ip6_output;
1276 rt->rt6i_dev = &loopback_dev;
1277 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1278 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
1279 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ipv6_get_hoplimit(rt->rt6i_dev);
1280 rt->u.dst.obsolete = -1;
1282 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1283 if (!anycast)
1284 rt->rt6i_flags |= RTF_LOCAL;
1285 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1286 if (rt->rt6i_nexthop == NULL) {
1287 dst_free((struct dst_entry *) rt);
1288 return -ENOMEM;
1291 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1292 rt->rt6i_dst.plen = 128;
1293 rt6_ins(rt, NULL, NULL);
1295 return 0;
1298 /* Delete address. Warning: you should check that this address
1299 disappeared before calling this function.
/* Remove the local /128 route for 'addr' (see warning comment above:
 * the address must already be gone).  Returns 0 or -ENOENT when no
 * exact host route exists. */
1302 int ip6_rt_addr_del(struct in6_addr *addr, struct net_device *dev)
1304 struct rt6_info *rt;
1305 int err = -ENOENT;
1307 rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1);
1308 if (rt) {
1309 if (rt->rt6i_dst.plen == 128)
1310 err = ip6_del_rt(rt, NULL, NULL);
1311 else
1312 dst_release(&rt->u.dst);
1315 return err;
/* fib6_clean_tree callback: returning -1 asks the walker to delete the
 * route.  Matches routes on the given device (or all routes when arg is
 * NULL), but never the shared null entry. */
1318 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1320 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1321 rt != &ip6_null_entry) {
1322 RT6_TRACE("deleted by ifdown %p\n", rt);
1323 return -1;
1325 return 0;
/* Purge all routes through 'dev' when the interface goes down. */
1328 void rt6_ifdown(struct net_device *dev)
1330 write_lock_bh(&rt6_lock);
1331 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1332 write_unlock_bh(&rt6_lock);
/* Argument bundle passed through fib6_clean_tree to rt6_mtu_change_route. */
1335 struct rt6_mtu_change_arg
1337 struct net_device *dev;
1338 unsigned mtu;
/* fib6_clean_tree callback for a device MTU change: update a route's MTU
 * metric when it shrinks below the current PMTU, or when it grows and the
 * old device MTU was the path bottleneck (see the long rationale below).
 * Always returns 0 (never deletes routes).
 * NOTE(review): the advmss update on the last line appears unconditional
 * in this scrape — the brace placement around the if-body is lost; verify
 * against the original before relying on that. */
1341 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1343 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1344 struct inet6_dev *idev;
1346 /* In IPv6 pmtu discovery is not optional,
1347 so that RTAX_MTU lock cannot disable it.
1348 We still use this lock to block changes
1349 caused by addrconf/ndisc.
1352 idev = __in6_dev_get(arg->dev);
1353 if (idev == NULL)
1354 return 0;
1356 /* For administrative MTU increase, there is no way to discover
1357 IPv6 PMTU increase, so PMTU increase should be updated here.
1358 Since RFC 1981 doesn't include administrative MTU increase
1359 update PMTU increase is a MUST. (i.e. jumbo frame)
1362 If new MTU is less than route PMTU, this new MTU will be the
1363 lowest MTU in the path, update the route PMTU to refect PMTU
1364 decreases; if new MTU is greater than route PMTU, and the
1365 old MTU is the lowest MTU in the path, update the route PMTU
1366 to refect the increase. In this case if the other nodes' MTU
1367 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1368 PMTU discouvery.
1370 if (rt->rt6i_dev == arg->dev &&
1371 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1372 (dst_pmtu(&rt->u.dst) > arg->mtu ||
1373 (dst_pmtu(&rt->u.dst) < arg->mtu &&
1374 dst_pmtu(&rt->u.dst) == idev->cnf.mtu6)))
1375 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1376 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1377 return 0;
1380 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1382 struct rt6_mtu_change_arg arg;
1384 arg.dev = dev;
1385 arg.mtu = mtu;
1386 read_lock_bh(&rt6_lock);
1387 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1388 read_unlock_bh(&rt6_lock);
1391 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1392 struct in6_rtmsg *rtmsg)
1394 memset(rtmsg, 0, sizeof(*rtmsg));
1396 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1397 rtmsg->rtmsg_src_len = r->rtm_src_len;
1398 rtmsg->rtmsg_flags = RTF_UP;
1399 if (r->rtm_type == RTN_UNREACHABLE)
1400 rtmsg->rtmsg_flags |= RTF_REJECT;
1402 if (rta[RTA_GATEWAY-1]) {
1403 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1404 return -EINVAL;
1405 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1406 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1408 if (rta[RTA_DST-1]) {
1409 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1410 return -EINVAL;
1411 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1413 if (rta[RTA_SRC-1]) {
1414 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1415 return -EINVAL;
1416 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1418 if (rta[RTA_OIF-1]) {
1419 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1420 return -EINVAL;
1421 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1423 if (rta[RTA_PRIORITY-1]) {
1424 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1425 return -EINVAL;
1426 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1428 return 0;
1431 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1433 struct rtmsg *r = NLMSG_DATA(nlh);
1434 struct in6_rtmsg rtmsg;
1436 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1437 return -EINVAL;
1438 return ip6_route_del(&rtmsg, nlh, arg);
1441 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1443 struct rtmsg *r = NLMSG_DATA(nlh);
1444 struct in6_rtmsg rtmsg;
1446 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1447 return -EINVAL;
1448 return ip6_route_add(&rtmsg, nlh, arg);
/* State carried through a fib walk while dumping routes over netlink. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* skb being filled with route messages */
	struct netlink_callback *cb;	/* dump callback (pid/seq/request flags) */
};
1457 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1458 struct in6_addr *dst,
1459 struct in6_addr *src,
1460 int iif,
1461 int type, u32 pid, u32 seq,
1462 struct nlmsghdr *in_nlh, int prefix)
1464 struct rtmsg *rtm;
1465 struct nlmsghdr *nlh;
1466 unsigned char *b = skb->tail;
1467 struct rta_cacheinfo ci;
1469 if (prefix) { /* user wants prefix routes only */
1470 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1471 /* success since this is not a prefix route */
1472 return 1;
1476 if (!pid && in_nlh) {
1477 pid = in_nlh->nlmsg_pid;
1480 nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
1481 rtm = NLMSG_DATA(nlh);
1482 rtm->rtm_family = AF_INET6;
1483 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1484 rtm->rtm_src_len = rt->rt6i_src.plen;
1485 rtm->rtm_tos = 0;
1486 rtm->rtm_table = RT_TABLE_MAIN;
1487 if (rt->rt6i_flags&RTF_REJECT)
1488 rtm->rtm_type = RTN_UNREACHABLE;
1489 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1490 rtm->rtm_type = RTN_LOCAL;
1491 else
1492 rtm->rtm_type = RTN_UNICAST;
1493 rtm->rtm_flags = 0;
1494 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1495 rtm->rtm_protocol = rt->rt6i_protocol;
1496 if (rt->rt6i_flags&RTF_DYNAMIC)
1497 rtm->rtm_protocol = RTPROT_REDIRECT;
1498 else if (rt->rt6i_flags&(RTF_ADDRCONF|RTF_ALLONLINK))
1499 rtm->rtm_protocol = RTPROT_KERNEL;
1500 else if (rt->rt6i_flags&RTF_DEFAULT)
1501 rtm->rtm_protocol = RTPROT_RA;
1503 if (rt->rt6i_flags&RTF_CACHE)
1504 rtm->rtm_flags |= RTM_F_CLONED;
1506 if (dst) {
1507 RTA_PUT(skb, RTA_DST, 16, dst);
1508 rtm->rtm_dst_len = 128;
1509 } else if (rtm->rtm_dst_len)
1510 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1511 #ifdef CONFIG_IPV6_SUBTREES
1512 if (src) {
1513 RTA_PUT(skb, RTA_SRC, 16, src);
1514 rtm->rtm_src_len = 128;
1515 } else if (rtm->rtm_src_len)
1516 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1517 #endif
1518 if (iif)
1519 RTA_PUT(skb, RTA_IIF, 4, &iif);
1520 else if (dst) {
1521 struct in6_addr saddr_buf;
1522 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1523 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1525 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1526 goto rtattr_failure;
1527 if (rt->u.dst.neighbour)
1528 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1529 if (rt->u.dst.dev)
1530 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1531 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1532 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1533 if (rt->rt6i_expires)
1534 ci.rta_expires = rt->rt6i_expires - jiffies;
1535 else
1536 ci.rta_expires = 0;
1537 ci.rta_used = rt->u.dst.__use;
1538 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1539 ci.rta_error = rt->u.dst.error;
1540 ci.rta_id = 0;
1541 ci.rta_ts = 0;
1542 ci.rta_tsage = 0;
1543 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1544 nlh->nlmsg_len = skb->tail - b;
1545 return skb->len;
1547 nlmsg_failure:
1548 rtattr_failure:
1549 skb_trim(skb, b - skb->data);
1550 return -1;
1553 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1555 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1556 struct rtmsg *rtm;
1557 int prefix;
1559 rtm = NLMSG_DATA(arg->cb->nlh);
1560 if (rtm)
1561 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1562 else prefix = 0;
1564 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1565 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1566 NULL, prefix);
1569 static int fib6_dump_node(struct fib6_walker_t *w)
1571 int res;
1572 struct rt6_info *rt;
1574 for (rt = w->leaf; rt; rt = rt->u.next) {
1575 res = rt6_dump_route(rt, w->args);
1576 if (res < 0) {
1577 /* Frame is full, suspend walking */
1578 w->leaf = rt;
1579 return 1;
1581 BUG_TRAP(res!=0);
1583 w->leaf = NULL;
1584 return 0;
1587 static void fib6_dump_end(struct netlink_callback *cb)
1589 struct fib6_walker_t *w = (void*)cb->args[0];
1591 if (w) {
1592 cb->args[0] = 0;
1593 fib6_walker_unlink(w);
1594 kfree(w);
1596 if (cb->args[1]) {
1597 cb->done = (void*)cb->args[1];
1598 cb->args[1] = 0;
1602 static int fib6_dump_done(struct netlink_callback *cb)
1604 fib6_dump_end(cb);
1605 return cb->done(cb);
1608 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1610 struct rt6_rtnl_dump_arg arg;
1611 struct fib6_walker_t *w;
1612 int res;
1614 arg.skb = skb;
1615 arg.cb = cb;
1617 w = (void*)cb->args[0];
1618 if (w == NULL) {
1619 /* New dump:
1621 * 1. hook callback destructor.
1623 cb->args[1] = (long)cb->done;
1624 cb->done = fib6_dump_done;
1627 * 2. allocate and initialize walker.
1629 w = kmalloc(sizeof(*w), GFP_ATOMIC);
1630 if (w == NULL)
1631 return -ENOMEM;
1632 RT6_TRACE("dump<%p", w);
1633 memset(w, 0, sizeof(*w));
1634 w->root = &ip6_routing_table;
1635 w->func = fib6_dump_node;
1636 w->args = &arg;
1637 cb->args[0] = (long)w;
1638 read_lock_bh(&rt6_lock);
1639 res = fib6_walk(w);
1640 read_unlock_bh(&rt6_lock);
1641 } else {
1642 w->args = &arg;
1643 read_lock_bh(&rt6_lock);
1644 res = fib6_walk_continue(w);
1645 read_unlock_bh(&rt6_lock);
1647 #if RT6_DEBUG >= 3
1648 if (res <= 0 && skb->len == 0)
1649 RT6_TRACE("%p>dump end\n", w);
1650 #endif
1651 res = res < 0 ? res : skb->len;
1652 /* res < 0 is an error. (really, impossible)
1653 res == 0 means that dump is complete, but skb still can contain data.
1654 res > 0 dump is not complete, but frame is full.
1656 /* Destroy walker, if dump of this table is complete. */
1657 if (res <= 0)
1658 fib6_dump_end(cb);
1659 return res;
1662 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1664 struct rtattr **rta = arg;
1665 int iif = 0;
1666 int err = -ENOBUFS;
1667 struct sk_buff *skb;
1668 struct flowi fl;
1669 struct rt6_info *rt;
1671 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1672 if (skb == NULL)
1673 goto out;
1675 /* Reserve room for dummy headers, this skb can pass
1676 through good chunk of routing engine.
1678 skb->mac.raw = skb->data;
1679 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1681 memset(&fl, 0, sizeof(fl));
1682 if (rta[RTA_SRC-1])
1683 ipv6_addr_copy(&fl.fl6_src,
1684 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1685 if (rta[RTA_DST-1])
1686 ipv6_addr_copy(&fl.fl6_dst,
1687 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1689 if (rta[RTA_IIF-1])
1690 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1692 if (iif) {
1693 struct net_device *dev;
1694 dev = __dev_get_by_index(iif);
1695 if (!dev) {
1696 err = -ENODEV;
1697 goto out_free;
1701 fl.oif = 0;
1702 if (rta[RTA_OIF-1])
1703 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1705 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1707 skb->dst = &rt->u.dst;
1709 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1710 err = rt6_fill_node(skb, rt,
1711 &fl.fl6_dst, &fl.fl6_src,
1712 iif,
1713 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1714 nlh->nlmsg_seq, nlh, 0);
1715 if (err < 0) {
1716 err = -EMSGSIZE;
1717 goto out_free;
1720 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1721 if (err > 0)
1722 err = 0;
1723 out:
1724 return err;
1725 out_free:
1726 kfree_skb(skb);
1727 goto out;
1730 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh)
1732 struct sk_buff *skb;
1733 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1735 skb = alloc_skb(size, gfp_any());
1736 if (!skb) {
1737 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
1738 return;
1740 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) {
1741 kfree_skb(skb);
1742 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
1743 return;
1745 NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
1746 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
1750 * /proc
1753 #ifdef CONFIG_PROC_FS
1755 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/* Cursor state for the legacy /proc/net/ipv6_route read callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output page supplied by procfs */
	int offset;	/* byte offset requested by the reader */
	int length;	/* capacity of buffer */
	int skip;	/* number of whole entries already skipped */
	int len;	/* bytes produced so far */
};
1766 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
1768 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
1769 int i;
1771 if (arg->skip < arg->offset / RT6_INFO_LEN) {
1772 arg->skip++;
1773 return 0;
1776 if (arg->len >= arg->length)
1777 return 0;
1779 for (i=0; i<16; i++) {
1780 sprintf(arg->buffer + arg->len, "%02x",
1781 rt->rt6i_dst.addr.s6_addr[i]);
1782 arg->len += 2;
1784 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1785 rt->rt6i_dst.plen);
1787 #ifdef CONFIG_IPV6_SUBTREES
1788 for (i=0; i<16; i++) {
1789 sprintf(arg->buffer + arg->len, "%02x",
1790 rt->rt6i_src.addr.s6_addr[i]);
1791 arg->len += 2;
1793 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1794 rt->rt6i_src.plen);
1795 #else
1796 sprintf(arg->buffer + arg->len,
1797 "00000000000000000000000000000000 00 ");
1798 arg->len += 36;
1799 #endif
1801 if (rt->rt6i_nexthop) {
1802 for (i=0; i<16; i++) {
1803 sprintf(arg->buffer + arg->len, "%02x",
1804 rt->rt6i_nexthop->primary_key[i]);
1805 arg->len += 2;
1807 } else {
1808 sprintf(arg->buffer + arg->len,
1809 "00000000000000000000000000000000");
1810 arg->len += 32;
1812 arg->len += sprintf(arg->buffer + arg->len,
1813 " %08x %08x %08x %08x %8s\n",
1814 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
1815 rt->u.dst.__use, rt->rt6i_flags,
1816 rt->rt6i_dev ? rt->rt6i_dev->name : "");
1817 return 0;
1820 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1822 struct rt6_proc_arg arg;
1823 arg.buffer = buffer;
1824 arg.offset = offset;
1825 arg.length = length;
1826 arg.skip = 0;
1827 arg.len = 0;
1829 read_lock_bh(&rt6_lock);
1830 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
1831 read_unlock_bh(&rt6_lock);
1833 *start = buffer;
1834 if (offset)
1835 *start += offset % RT6_INFO_LEN;
1837 arg.len -= offset % RT6_INFO_LEN;
1839 if (arg.len > length)
1840 arg.len = length;
1841 if (arg.len < 0)
1842 arg.len = 0;
1844 return arg.len;
1847 extern struct rt6_statistics rt6_stats;
1849 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
1851 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
1852 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
1853 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
1854 rt6_stats.fib_rt_cache,
1855 atomic_read(&ip6_dst_ops.entries),
1856 rt6_stats.fib_discarded_routes);
1858 return 0;
1861 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
1863 return single_open(file, rt6_stats_seq_show, NULL);
1866 static struct file_operations rt6_stats_seq_fops = {
1867 .owner = THIS_MODULE,
1868 .open = rt6_stats_seq_open,
1869 .read = seq_read,
1870 .llseek = seq_lseek,
1871 .release = single_release,
1873 #endif /* CONFIG_PROC_FS */
1875 #ifdef CONFIG_SYSCTL
1877 static int flush_delay;
1879 static
1880 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1881 void *buffer, size_t *lenp)
1883 if (write) {
1884 proc_dointvec(ctl, write, filp, buffer, lenp);
1885 if (flush_delay < 0)
1886 flush_delay = 0;
1887 fib6_run_gc((unsigned long)flush_delay);
1888 return 0;
1889 } else
1890 return -EINVAL;
1893 ctl_table ipv6_route_table[] = {
1895 .ctl_name = NET_IPV6_ROUTE_FLUSH,
1896 .procname = "flush",
1897 .data = &flush_delay,
1898 .maxlen = sizeof(int),
1899 .mode = 0644,
1900 .proc_handler = &ipv6_sysctl_rtcache_flush
1903 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
1904 .procname = "gc_thresh",
1905 .data = &ip6_dst_ops.gc_thresh,
1906 .maxlen = sizeof(int),
1907 .mode = 0644,
1908 .proc_handler = &proc_dointvec,
1911 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
1912 .procname = "max_size",
1913 .data = &ip6_rt_max_size,
1914 .maxlen = sizeof(int),
1915 .mode = 0644,
1916 .proc_handler = &proc_dointvec,
1919 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
1920 .procname = "gc_min_interval",
1921 .data = &ip6_rt_gc_min_interval,
1922 .maxlen = sizeof(int),
1923 .mode = 0644,
1924 .proc_handler = &proc_dointvec_jiffies,
1925 .strategy = &sysctl_jiffies,
1928 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
1929 .procname = "gc_timeout",
1930 .data = &ip6_rt_gc_timeout,
1931 .maxlen = sizeof(int),
1932 .mode = 0644,
1933 .proc_handler = &proc_dointvec_jiffies,
1934 .strategy = &sysctl_jiffies,
1937 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
1938 .procname = "gc_interval",
1939 .data = &ip6_rt_gc_interval,
1940 .maxlen = sizeof(int),
1941 .mode = 0644,
1942 .proc_handler = &proc_dointvec_jiffies,
1943 .strategy = &sysctl_jiffies,
1946 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
1947 .procname = "gc_elasticity",
1948 .data = &ip6_rt_gc_elasticity,
1949 .maxlen = sizeof(int),
1950 .mode = 0644,
1951 .proc_handler = &proc_dointvec_jiffies,
1952 .strategy = &sysctl_jiffies,
1955 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
1956 .procname = "mtu_expires",
1957 .data = &ip6_rt_mtu_expires,
1958 .maxlen = sizeof(int),
1959 .mode = 0644,
1960 .proc_handler = &proc_dointvec_jiffies,
1961 .strategy = &sysctl_jiffies,
1964 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
1965 .procname = "min_adv_mss",
1966 .data = &ip6_rt_min_advmss,
1967 .maxlen = sizeof(int),
1968 .mode = 0644,
1969 .proc_handler = &proc_dointvec_jiffies,
1970 .strategy = &sysctl_jiffies,
1974 #endif
1976 void __init ip6_route_init(void)
1978 struct proc_dir_entry *p;
1980 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
1981 sizeof(struct rt6_info),
1982 0, SLAB_HWCACHE_ALIGN,
1983 NULL, NULL);
1984 fib6_init();
1985 #ifdef CONFIG_PROC_FS
1986 proc_net_create("ipv6_route", 0, rt6_proc_info);
1987 p = create_proc_entry("rt6_stats", S_IRUGO, proc_net);
1988 if (p)
1989 p->proc_fops = &rt6_stats_seq_fops;
1990 #endif
1991 #ifdef CONFIG_XFRM
1992 xfrm6_init();
1993 #endif
#ifdef MODULE
/* Module unload path: tear everything down in reverse of ip6_route_init().
 *
 * Fix: xfrm6_fini() was called unconditionally even though xfrm6_init()
 * is guarded by CONFIG_XFRM in ip6_route_init() (XFRM is optional), so a
 * modular build with CONFIG_XFRM=n failed to link.  Mirror the guard.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove("ipv6_route");
	proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
	xfrm6_fini();
#endif
	rt6_ifdown(NULL);
	fib6_gc_cleanup();
	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}
#endif /* MODULE */