RT-AC66 3.0.0.4.374.130 core
[tomato.git] / release / src-rt-6.x / linux / linux-2.6 / net / ipv6 / route.c
blobb7c7eb689be3a23372f8760db940ebc9e1b8fb32
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/mroute6.h>
40 #include <linux/init.h>
41 #include <linux/if_arp.h>
43 #ifdef CONFIG_PROC_FS
44 #include <linux/proc_fs.h>
45 #include <linux/seq_file.h>
46 #endif
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
/*
 * Debug verbosity for this file.  Set to 3 to get tracing.
 *
 * With RT6_DEBUG >= 3, RDBG(x) expands to "printk x" (so x must be a
 * parenthesised argument list) and RT6_TRACE() prints at KERN_DEBUG.
 * Otherwise both expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

78 #define CLONE_OFFLINK_ROUTE 0
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(void);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct sk_buff *skb);
98 static void ip6_link_failure(struct sk_buff *skb);
99 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
104 unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
107 #endif
109 static struct dst_ops ip6_dst_ops = {
110 .family = AF_INET6,
111 .protocol = __constant_htons(ETH_P_IPV6),
112 .gc = ip6_dst_gc,
113 .gc_thresh = 1024,
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
123 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
127 static struct dst_ops ip6_dst_blackhole_ops = {
128 .family = AF_INET6,
129 .protocol = __constant_htons(ETH_P_IPV6),
130 .destroy = ip6_dst_destroy,
131 .check = ip6_dst_check,
132 .update_pmtu = ip6_rt_blackhole_update_pmtu,
133 .entry_size = sizeof(struct rt6_info),
136 struct rt6_info ip6_null_entry = {
137 .u = {
138 .dst = {
139 .__refcnt = ATOMIC_INIT(1),
140 .__use = 1,
141 .dev = &loopback_dev,
142 .obsolete = -1,
143 .error = -ENETUNREACH,
144 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
145 .input = ip6_pkt_discard,
146 .output = ip6_pkt_discard_out,
147 .ops = &ip6_dst_ops,
148 .path = (struct dst_entry*)&ip6_null_entry,
151 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
152 .rt6i_protocol = RTPROT_KERNEL,
153 .rt6i_metric = ~(u32) 0,
154 .rt6i_ref = ATOMIC_INIT(1),
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
static int ip6_pkt_blk_hole(struct sk_buff *skb);

/*
 * Static "prohibit" entry: same layout as ip6_null_entry but with
 * dst.error = -EACCES and the ip6_pkt_prohibit*() handlers.
 */
struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -EACCES,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_prohibit,
			.output		= ip6_pkt_prohibit_out,
			.ops		= &ip6_dst_ops,
			.path		= (struct dst_entry*)&ip6_prohibit_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Static "blackhole" entry: dst.error = -EINVAL and both input and
 * output are handled by ip6_pkt_blk_hole().
 */
struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -EINVAL,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_blk_hole,
			.output		= ip6_pkt_blk_hole,
			.ops		= &ip6_dst_ops,
			.path		= (struct dst_entry*)&ip6_blk_hole_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

207 /* allocate dst with ip6_dst_ops */
208 static __inline__ struct rt6_info *ip6_dst_alloc(void)
210 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
213 static void ip6_dst_destroy(struct dst_entry *dst)
215 struct rt6_info *rt = (struct rt6_info *)dst;
216 struct inet6_dev *idev = rt->rt6i_idev;
218 if (idev != NULL) {
219 rt->rt6i_idev = NULL;
220 in6_dev_put(idev);
224 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
225 int how)
227 struct rt6_info *rt = (struct rt6_info *)dst;
228 struct inet6_dev *idev = rt->rt6i_idev;
230 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
231 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
232 if (loopback_idev != NULL) {
233 rt->rt6i_idev = loopback_idev;
234 in6_dev_put(idev);
239 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
241 return (rt->rt6i_flags & RTF_EXPIRES &&
242 time_after(jiffies, rt->rt6i_expires));
245 static inline int rt6_need_strict(struct in6_addr *daddr)
247 return (ipv6_addr_type(daddr) &
248 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
252 * Route lookup. Any table->tb6_lock is implied.
255 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
256 int oif,
257 int strict)
259 struct rt6_info *local = NULL;
260 struct rt6_info *sprt;
262 if (oif) {
263 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
264 struct net_device *dev = sprt->rt6i_dev;
265 if (dev->ifindex == oif)
266 return sprt;
267 if (dev->flags & IFF_LOOPBACK) {
268 if (sprt->rt6i_idev == NULL ||
269 sprt->rt6i_idev->dev->ifindex != oif) {
270 if (strict && oif)
271 continue;
272 if (local && (!oif ||
273 local->rt6i_idev->dev->ifindex == oif))
274 continue;
276 local = sprt;
280 if (local)
281 return local;
283 if (strict)
284 return &ip6_null_entry;
286 return rt;
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Send a neighbour solicitation towards the nexthop of @rt when its
 * neighbour entry is not in a NUD_VALID state and more than
 * cnf.rtr_probe_interval jiffies have passed since neigh->updated.
 * A NULL @rt, or a route without a nexthop, is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;
	read_lock_bh(&neigh->lock);
	/* Re-check the state now that the neighbour lock is held. */
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		/* Stamp the entry so the next probe is rate-limited. */
		neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		read_unlock_bh(&neigh->lock);
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif

326 * Default Router Selection (RFC 2461 6.3.6)
328 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
330 struct net_device *dev = rt->rt6i_dev;
331 if (!oif || dev->ifindex == oif)
332 return 2;
333 if ((dev->flags & IFF_LOOPBACK) &&
334 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
335 return 1;
336 return 0;
339 static inline int rt6_check_neigh(struct rt6_info *rt)
341 struct neighbour *neigh = rt->rt6i_nexthop;
342 int m;
343 if (rt->rt6i_flags & RTF_NONEXTHOP ||
344 !(rt->rt6i_flags & RTF_GATEWAY))
345 m = 1;
346 else if (neigh) {
347 read_lock_bh(&neigh->lock);
348 if (neigh->nud_state & NUD_VALID)
349 m = 2;
350 #ifdef CONFIG_IPV6_ROUTER_PREF
351 else if (neigh->nud_state & NUD_FAILED)
352 m = 0;
353 #endif
354 else
355 m = 1;
356 read_unlock_bh(&neigh->lock);
357 } else
358 m = 0;
359 return m;
362 static int rt6_score_route(struct rt6_info *rt, int oif,
363 int strict)
365 int m, n;
367 m = rt6_check_dev(rt, oif);
368 if (!m && (strict & RT6_LOOKUP_F_IFACE))
369 return -1;
370 #ifdef CONFIG_IPV6_ROUTER_PREF
371 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
372 #endif
373 n = rt6_check_neigh(rt);
374 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
375 return -1;
376 return m;
379 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
380 int *mpri, struct rt6_info *match)
382 int m;
384 if (rt6_check_expired(rt))
385 goto out;
387 m = rt6_score_route(rt, oif, strict);
388 if (m < 0)
389 goto out;
391 if (m > *mpri) {
392 if (strict & RT6_LOOKUP_F_REACHABLE)
393 rt6_probe(match);
394 *mpri = m;
395 match = rt;
396 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
397 rt6_probe(rt);
400 out:
401 return match;
404 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
405 struct rt6_info *rr_head,
406 u32 metric, int oif, int strict)
408 struct rt6_info *rt, *match;
409 int mpri = -1;
411 match = NULL;
412 for (rt = rr_head; rt && rt->rt6i_metric == metric;
413 rt = rt->u.dst.rt6_next)
414 match = find_match(rt, oif, strict, &mpri, match);
415 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
416 rt = rt->u.dst.rt6_next)
417 match = find_match(rt, oif, strict, &mpri, match);
419 return match;
422 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
424 struct rt6_info *match, *rt0;
426 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
427 __FUNCTION__, fn->leaf, oif);
429 rt0 = fn->rr_ptr;
430 if (!rt0)
431 fn->rr_ptr = rt0 = fn->leaf;
433 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
435 if (!match &&
436 (strict & RT6_LOOKUP_F_REACHABLE)) {
437 struct rt6_info *next = rt0->u.dst.rt6_next;
439 /* no entries matched; do round-robin */
440 if (!next || next->rt6i_metric != rt0->rt6i_metric)
441 next = fn->leaf;
443 if (next != rt0)
444 fn->rr_ptr = next;
447 RT6_TRACE("%s() => %p\n",
448 __FUNCTION__, match);
450 return (match ? match : &ip6_null_entry);
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from @gwaddr.
 *
 * @opt/@len describe the raw option, interpreted as struct route_info.
 * Returns -EINVAL when the option is shorter than the structure or its
 * length field is inconsistent with prefix_len; otherwise 0.
 *
 * A zero lifetime deletes an existing matching route; a non-zero
 * lifetime adds the route if it does not exist, or refreshes the
 * preference bits of the existing one.  A lifetime of 0xffffffff means
 * "never expires".
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif

/*
 * BACKTRACK(saddr) - retry a failed lookup further up the tree.
 *
 * Expects the expanding function to provide locals "rt" and "fn" and
 * labels "out" and "restart".  When rt is &ip6_null_entry the macro
 * climbs from fn towards the root: it jumps to "out" on reaching a node
 * flagged RTN_TL_ROOT, descends into a parent's source subtree (looked
 * up with @saddr) when one exists and is not where we came from, and
 * jumps to "restart" at the first node flagged RTN_RTINFO.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)

549 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
550 struct flowi *fl, int flags)
552 struct fib6_node *fn;
553 struct rt6_info *rt;
555 read_lock_bh(&table->tb6_lock);
556 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
557 restart:
558 rt = fn->leaf;
559 rt = rt6_device_match(rt, fl->oif, flags);
560 BACKTRACK(&fl->fl6_src);
561 out:
562 dst_hold(&rt->u.dst);
563 read_unlock_bh(&table->tb6_lock);
565 rt->u.dst.lastuse = jiffies;
566 rt->u.dst.__use++;
568 return rt;
572 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
573 int oif, int strict)
575 struct flowi fl = {
576 .oif = oif,
577 .nl_u = {
578 .ip6_u = {
579 .daddr = *daddr,
583 struct dst_entry *dst;
584 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
586 if (saddr) {
587 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
588 flags |= RT6_LOOKUP_F_HAS_SADDR;
591 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
592 if (dst->error == 0)
593 return (struct rt6_info *) dst;
595 dst_release(dst);
597 return NULL;
600 EXPORT_SYMBOL(rt6_lookup);
602 /* ip6_ins_rt is called with FREE table->tb6_lock.
603 It takes new route entry, the addition fails by any reason the
604 route is freed. In any case, if caller does not hold it, it may
605 be destroyed.
608 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
610 int err;
611 struct fib6_table *table;
613 table = rt->rt6i_table;
614 write_lock_bh(&table->tb6_lock);
615 err = fib6_add(&table->tb6_root, rt, info);
616 write_unlock_bh(&table->tb6_lock);
618 return err;
621 int ip6_ins_rt(struct rt6_info *rt)
623 return __ip6_ins_rt(rt, NULL);
626 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
627 struct in6_addr *saddr)
629 struct rt6_info *rt;
632 * Clone the route.
635 rt = ip6_rt_copy(ort);
637 if (rt) {
638 struct neighbour *neigh;
639 int attempts = !in_softirq();
641 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
642 if (rt->rt6i_dst.plen != 128 &&
643 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
644 rt->rt6i_flags |= RTF_ANYCAST;
645 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
648 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
649 rt->rt6i_dst.plen = 128;
650 rt->rt6i_flags |= RTF_CACHE;
651 rt->u.dst.flags |= DST_HOST;
653 #ifdef CONFIG_IPV6_SUBTREES
654 if (rt->rt6i_src.plen && saddr) {
655 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
656 rt->rt6i_src.plen = 128;
658 #endif
660 retry:
661 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
662 if (IS_ERR(neigh)) {
663 int saved_rt_min_interval = ip6_rt_gc_min_interval;
664 int saved_rt_elasticity = ip6_rt_gc_elasticity;
666 if (attempts-- > 0) {
667 ip6_rt_gc_elasticity = 1;
668 ip6_rt_gc_min_interval = 0;
670 ip6_dst_gc();
672 ip6_rt_gc_elasticity = saved_rt_elasticity;
673 ip6_rt_gc_min_interval = saved_rt_min_interval;
674 goto retry;
677 if (net_ratelimit())
678 printk(KERN_WARNING
679 "Neighbour table overflow.\n");
680 dst_free(&rt->u.dst);
681 return NULL;
683 rt->rt6i_nexthop = neigh;
687 return rt;
690 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
692 struct rt6_info *rt = ip6_rt_copy(ort);
693 if (rt) {
694 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695 rt->rt6i_dst.plen = 128;
696 rt->rt6i_flags |= RTF_CACHE;
697 rt->u.dst.flags |= DST_HOST;
698 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
700 return rt;
703 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
704 struct flowi *fl, int flags)
706 struct fib6_node *fn;
707 struct rt6_info *rt, *nrt;
708 int strict = 0;
709 int attempts = 3;
710 int err;
711 int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
713 strict |= flags & RT6_LOOKUP_F_IFACE;
715 relookup:
716 read_lock_bh(&table->tb6_lock);
718 restart_2:
719 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
721 restart:
722 rt = rt6_select(fn, fl->iif, strict | reachable);
723 BACKTRACK(&fl->fl6_src);
724 if (rt == &ip6_null_entry ||
725 rt->rt6i_flags & RTF_CACHE)
726 goto out;
728 dst_hold(&rt->u.dst);
729 read_unlock_bh(&table->tb6_lock);
731 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
732 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
733 else {
734 #if CLONE_OFFLINK_ROUTE
735 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
736 #else
737 goto out2;
738 #endif
741 dst_release(&rt->u.dst);
742 rt = nrt ? : &ip6_null_entry;
744 dst_hold(&rt->u.dst);
745 if (nrt) {
746 err = ip6_ins_rt(nrt);
747 if (!err)
748 goto out2;
751 if (--attempts <= 0)
752 goto out2;
755 * Race condition! In the gap, when table->tb6_lock was
756 * released someone could insert this route. Relookup.
758 dst_release(&rt->u.dst);
759 goto relookup;
761 out:
762 if (reachable) {
763 reachable = 0;
764 goto restart_2;
766 dst_hold(&rt->u.dst);
767 read_unlock_bh(&table->tb6_lock);
768 out2:
769 rt->u.dst.lastuse = jiffies;
770 rt->u.dst.__use++;
772 return rt;
775 void ip6_route_input(struct sk_buff *skb)
777 struct ipv6hdr *iph = ipv6_hdr(skb);
778 int flags = RT6_LOOKUP_F_HAS_SADDR;
779 struct flowi fl = {
780 .iif = skb->dev->ifindex,
781 .nl_u = {
782 .ip6_u = {
783 .daddr = iph->daddr,
784 .saddr = iph->saddr,
785 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
788 .mark = skb->mark,
789 .proto = iph->nexthdr,
792 if (rt6_need_strict(&iph->daddr))
793 flags |= RT6_LOOKUP_F_IFACE;
795 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
798 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
799 struct flowi *fl, int flags)
801 struct fib6_node *fn;
802 struct rt6_info *rt, *nrt;
803 int strict = 0;
804 int attempts = 3;
805 int err;
806 int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
808 strict |= flags & RT6_LOOKUP_F_IFACE;
810 relookup:
811 read_lock_bh(&table->tb6_lock);
813 restart_2:
814 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
816 restart:
817 rt = rt6_select(fn, fl->oif, strict | reachable);
818 BACKTRACK(&fl->fl6_src);
819 if (rt == &ip6_null_entry ||
820 rt->rt6i_flags & RTF_CACHE)
821 goto out;
823 dst_hold(&rt->u.dst);
824 read_unlock_bh(&table->tb6_lock);
826 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
827 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
828 else {
829 #if CLONE_OFFLINK_ROUTE
830 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
831 #else
832 goto out2;
833 #endif
836 dst_release(&rt->u.dst);
837 rt = nrt ? : &ip6_null_entry;
839 dst_hold(&rt->u.dst);
840 if (nrt) {
841 err = ip6_ins_rt(nrt);
842 if (!err)
843 goto out2;
846 if (--attempts <= 0)
847 goto out2;
850 * Race condition! In the gap, when table->tb6_lock was
851 * released someone could insert this route. Relookup.
853 dst_release(&rt->u.dst);
854 goto relookup;
856 out:
857 if (reachable) {
858 reachable = 0;
859 goto restart_2;
861 dst_hold(&rt->u.dst);
862 read_unlock_bh(&table->tb6_lock);
863 out2:
864 rt->u.dst.lastuse = jiffies;
865 rt->u.dst.__use++;
866 return rt;
869 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
871 int flags = 0;
873 if (rt6_need_strict(&fl->fl6_dst))
874 flags |= RT6_LOOKUP_F_IFACE;
876 if (!ipv6_addr_any(&fl->fl6_src))
877 flags |= RT6_LOOKUP_F_HAS_SADDR;
879 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
882 EXPORT_SYMBOL(ip6_route_output);
/* Input/output handler for blackhole dsts: free @skb and report success. */
static int ip6_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

890 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
892 struct rt6_info *ort = (struct rt6_info *) *dstp;
893 struct rt6_info *rt = (struct rt6_info *)
894 dst_alloc(&ip6_dst_blackhole_ops);
895 struct dst_entry *new = NULL;
897 if (rt) {
898 new = &rt->u.dst;
900 atomic_set(&new->__refcnt, 1);
901 new->__use = 1;
902 new->input = ip6_blackhole_output;
903 new->output = ip6_blackhole_output;
905 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
906 new->dev = ort->u.dst.dev;
907 if (new->dev)
908 dev_hold(new->dev);
909 rt->rt6i_idev = ort->rt6i_idev;
910 if (rt->rt6i_idev)
911 in6_dev_hold(rt->rt6i_idev);
912 rt->rt6i_expires = 0;
914 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
915 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
916 rt->rt6i_metric = 0;
918 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
919 #ifdef CONFIG_IPV6_SUBTREES
920 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
921 #endif
923 dst_free(new);
926 dst_release(*dstp);
927 *dstp = new;
928 return (new ? 0 : -ENOMEM);
930 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
933 * Destination cache support functions
936 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
938 struct rt6_info *rt;
940 rt = (struct rt6_info *) dst;
942 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
943 return dst;
945 return NULL;
948 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
950 struct rt6_info *rt = (struct rt6_info *) dst;
952 if (rt) {
953 if (rt->rt6i_flags & RTF_CACHE)
954 ip6_del_rt(rt);
955 else
956 dst_release(dst);
958 return NULL;
961 static void ip6_link_failure(struct sk_buff *skb)
963 struct rt6_info *rt;
965 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
967 rt = (struct rt6_info *) skb->dst;
968 if (rt) {
969 if (rt->rt6i_flags&RTF_CACHE) {
970 dst_set_expires(&rt->u.dst, 0);
971 rt->rt6i_flags |= RTF_EXPIRES;
972 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
973 rt->rt6i_node->fn_sernum = -1;
977 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
979 struct rt6_info *rt6 = (struct rt6_info*)dst;
981 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
982 rt6->rt6i_flags |= RTF_MODIFIED;
983 if (mtu < IPV6_MIN_MTU) {
984 mtu = IPV6_MIN_MTU;
985 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
987 dst->metrics[RTAX_MTU-1] = mtu;
988 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
992 static int ipv6_get_mtu(struct net_device *dev);
994 static inline unsigned int ipv6_advmss(unsigned int mtu)
996 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
998 if (mtu < ip6_rt_min_advmss)
999 mtu = ip6_rt_min_advmss;
1002 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1003 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1004 * IPV6_MAXPLEN is also valid and means: "any MSS,
1005 * rely only on pmtu discovery"
1007 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1008 mtu = IPV6_MAXPLEN;
1009 return mtu;
1012 static struct dst_entry *ndisc_dst_gc_list;
1013 static DEFINE_SPINLOCK(ndisc_lock);
1015 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
1016 struct neighbour *neigh,
1017 struct in6_addr *addr,
1018 int (*output)(struct sk_buff *))
1020 struct rt6_info *rt;
1021 struct inet6_dev *idev = in6_dev_get(dev);
1023 if (unlikely(idev == NULL))
1024 return NULL;
1026 rt = ip6_dst_alloc();
1027 if (unlikely(rt == NULL)) {
1028 in6_dev_put(idev);
1029 goto out;
1032 dev_hold(dev);
1033 if (neigh)
1034 neigh_hold(neigh);
1035 else {
1036 neigh = ndisc_get_neigh(dev, addr);
1037 if (IS_ERR(neigh))
1038 neigh = NULL;
1041 rt->rt6i_dev = dev;
1042 rt->rt6i_idev = idev;
1043 rt->rt6i_nexthop = neigh;
1044 atomic_set(&rt->u.dst.__refcnt, 1);
1045 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
1046 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1047 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1048 rt->u.dst.output = output;
1050 #if 0 /* there's no chance to use these for ndisc */
1051 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1052 ? DST_HOST
1053 : 0;
1054 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1055 rt->rt6i_dst.plen = 128;
1056 #endif
1058 spin_lock_bh(&ndisc_lock);
1059 rt->u.dst.next = ndisc_dst_gc_list;
1060 ndisc_dst_gc_list = &rt->u.dst;
1061 spin_unlock_bh(&ndisc_lock);
1063 fib6_force_start_gc();
1065 out:
1066 return &rt->u.dst;
1069 int ndisc_dst_gc(int *more)
1071 struct dst_entry *dst, *next, **pprev;
1072 int freed;
1074 next = NULL;
1075 freed = 0;
1077 spin_lock_bh(&ndisc_lock);
1078 pprev = &ndisc_dst_gc_list;
1080 while ((dst = *pprev) != NULL) {
1081 if (!atomic_read(&dst->__refcnt)) {
1082 *pprev = dst->next;
1083 dst_free(dst);
1084 freed++;
1085 } else {
1086 pprev = &dst->next;
1087 (*more)++;
1091 spin_unlock_bh(&ndisc_lock);
1093 return freed;
1096 static int ip6_dst_gc(void)
1098 static unsigned expire = 30*HZ;
1099 static unsigned long last_gc;
1100 unsigned long now = jiffies;
1102 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
1103 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
1104 goto out;
1106 expire++;
1107 fib6_run_gc(expire);
1108 last_gc = now;
1109 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1110 expire = ip6_rt_gc_timeout>>1;
1112 out:
1113 expire -= expire>>ip6_rt_gc_elasticity;
1114 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1117 /* Clean host part of a prefix. Not necessary in radix tree,
1118 but results in cleaner routing tables.
1120 Remove it only when all the things will work!
1123 static int ipv6_get_mtu(struct net_device *dev)
1125 int mtu = IPV6_MIN_MTU;
1126 struct inet6_dev *idev;
1128 idev = in6_dev_get(dev);
1129 if (idev) {
1130 mtu = idev->cnf.mtu6;
1131 in6_dev_put(idev);
1133 return mtu;
1136 int ipv6_get_hoplimit(struct net_device *dev)
1138 int hoplimit = ipv6_devconf.hop_limit;
1139 struct inet6_dev *idev;
1141 idev = in6_dev_get(dev);
1142 if (idev) {
1143 hoplimit = idev->cnf.hop_limit;
1144 in6_dev_put(idev);
1146 return hoplimit;
1153 int ip6_route_add(struct fib6_config *cfg)
1155 int err;
1156 struct rt6_info *rt = NULL;
1157 struct net_device *dev = NULL;
1158 struct inet6_dev *idev = NULL;
1159 struct fib6_table *table;
1160 int addr_type;
1162 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1163 return -EINVAL;
1164 #ifndef CONFIG_IPV6_SUBTREES
1165 if (cfg->fc_src_len)
1166 return -EINVAL;
1167 #endif
1168 if (cfg->fc_ifindex) {
1169 err = -ENODEV;
1170 dev = dev_get_by_index(cfg->fc_ifindex);
1171 if (!dev)
1172 goto out;
1173 idev = in6_dev_get(dev);
1174 if (!idev)
1175 goto out;
1178 if (cfg->fc_metric == 0)
1179 cfg->fc_metric = IP6_RT_PRIO_USER;
1181 table = fib6_new_table(cfg->fc_table);
1182 if (table == NULL) {
1183 err = -ENOBUFS;
1184 goto out;
1187 rt = ip6_dst_alloc();
1189 if (rt == NULL) {
1190 err = -ENOMEM;
1191 goto out;
1194 rt->u.dst.obsolete = -1;
1195 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1196 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1199 if (cfg->fc_protocol == RTPROT_UNSPEC)
1200 cfg->fc_protocol = RTPROT_BOOT;
1201 rt->rt6i_protocol = cfg->fc_protocol;
1203 addr_type = ipv6_addr_type(&cfg->fc_dst);
1205 if (addr_type & IPV6_ADDR_MULTICAST)
1206 rt->u.dst.input = ip6_mc_input;
1207 else
1208 rt->u.dst.input = ip6_forward;
1210 rt->u.dst.output = ip6_output;
1212 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1213 rt->rt6i_dst.plen = cfg->fc_dst_len;
1214 if (rt->rt6i_dst.plen == 128)
1215 rt->u.dst.flags = DST_HOST;
1217 #ifdef CONFIG_IPV6_SUBTREES
1218 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1219 rt->rt6i_src.plen = cfg->fc_src_len;
1220 #endif
1222 rt->rt6i_metric = cfg->fc_metric;
1224 /* We cannot add true routes via loopback here,
1225 they would result in kernel looping; promote them to reject routes
1227 if ((cfg->fc_flags & RTF_REJECT) ||
1228 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1229 /* hold loopback dev/idev if we haven't done so. */
1230 if (dev != &loopback_dev) {
1231 if (dev) {
1232 dev_put(dev);
1233 in6_dev_put(idev);
1235 dev = &loopback_dev;
1236 dev_hold(dev);
1237 idev = in6_dev_get(dev);
1238 if (!idev) {
1239 err = -ENODEV;
1240 goto out;
1243 rt->u.dst.output = ip6_pkt_discard_out;
1244 rt->u.dst.input = ip6_pkt_discard;
1245 rt->u.dst.error = -ENETUNREACH;
1246 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1247 goto install_route;
1250 if (cfg->fc_flags & RTF_GATEWAY) {
1251 struct in6_addr *gw_addr;
1252 int gwa_type;
1254 gw_addr = &cfg->fc_gateway;
1255 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1256 gwa_type = ipv6_addr_type(gw_addr);
1258 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1259 struct rt6_info *grt;
1261 /* IPv6 strictly inhibits using not link-local
1262 addresses as nexthop address.
1263 Otherwise, router will not able to send redirects.
1264 It is very good, but in some (rare!) circumstances
1265 (SIT, PtP, NBMA NOARP links) it is handy to allow
1266 some exceptions. --ANK
1268 err = -EINVAL;
1269 if (!(gwa_type&IPV6_ADDR_UNICAST))
1270 goto out;
1272 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1274 err = -EHOSTUNREACH;
1275 if (grt == NULL)
1276 goto out;
1277 if (dev) {
1278 if (dev != grt->rt6i_dev) {
1279 dst_release(&grt->u.dst);
1280 goto out;
1282 } else {
1283 dev = grt->rt6i_dev;
1284 idev = grt->rt6i_idev;
1285 dev_hold(dev);
1286 in6_dev_hold(grt->rt6i_idev);
1288 if (!(grt->rt6i_flags&RTF_GATEWAY))
1289 err = 0;
1290 dst_release(&grt->u.dst);
1292 if (err)
1293 goto out;
1295 err = -EINVAL;
1296 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1297 goto out;
1300 err = -ENODEV;
1301 if (dev == NULL)
1302 goto out;
1304 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1305 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1306 if (IS_ERR(rt->rt6i_nexthop)) {
1307 err = PTR_ERR(rt->rt6i_nexthop);
1308 rt->rt6i_nexthop = NULL;
1309 goto out;
1313 rt->rt6i_flags = cfg->fc_flags;
1315 install_route:
1316 if (cfg->fc_mx) {
1317 struct nlattr *nla;
1318 int remaining;
1320 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1321 int type = nla_type(nla);
1323 if (type) {
1324 if (type > RTAX_MAX) {
1325 err = -EINVAL;
1326 goto out;
1329 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1334 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1335 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1336 if (!rt->u.dst.metrics[RTAX_MTU-1])
1337 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1338 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1339 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1340 rt->u.dst.dev = dev;
1341 rt->rt6i_idev = idev;
1342 rt->rt6i_table = table;
1343 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1345 out:
1346 if (dev)
1347 dev_put(dev);
1348 if (idev)
1349 in6_dev_put(idev);
1350 if (rt)
1351 dst_free(&rt->u.dst);
1352 return err;
1355 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1357 int err;
1358 struct fib6_table *table;
1360 if (rt == &ip6_null_entry)
1361 return -ENOENT;
1363 table = rt->rt6i_table;
1364 write_lock_bh(&table->tb6_lock);
1366 err = fib6_del(rt, info);
1367 dst_release(&rt->u.dst);
1369 write_unlock_bh(&table->tb6_lock);
1371 return err;
1374 int ip6_del_rt(struct rt6_info *rt)
1376 return __ip6_del_rt(rt, NULL);
1379 static int ip6_route_del(struct fib6_config *cfg)
1381 struct fib6_table *table;
1382 struct fib6_node *fn;
1383 struct rt6_info *rt;
1384 int err = -ESRCH;
1386 table = fib6_get_table(cfg->fc_table);
1387 if (table == NULL)
1388 return err;
1390 read_lock_bh(&table->tb6_lock);
1392 fn = fib6_locate(&table->tb6_root,
1393 &cfg->fc_dst, cfg->fc_dst_len,
1394 &cfg->fc_src, cfg->fc_src_len);
1396 if (fn) {
1397 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1398 if (cfg->fc_ifindex &&
1399 (rt->rt6i_dev == NULL ||
1400 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1401 continue;
1402 if (cfg->fc_flags & RTF_GATEWAY &&
1403 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1404 continue;
1405 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1406 continue;
1407 dst_hold(&rt->u.dst);
1408 read_unlock_bh(&table->tb6_lock);
1410 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1413 read_unlock_bh(&table->tb6_lock);
1415 return err;
1419 * Handle redirects
1421 struct ip6rd_flowi {
1422 struct flowi fl;
1423 struct in6_addr gateway;
/*
 * __ip6_route_redirect - per-table lookup used to validate a redirect.
 *
 * @fl is really the first member of a struct ip6rd_flowi and is cast back
 * to it, so the gateway to match travels alongside the flow.
 *
 * Under the table read lock, the leaf list of the node returned by
 * fib6_lookup() is scanned for the first route that
 *   - has not expired (rt6_check_expired()),
 *   - has RTF_GATEWAY set,
 *   - uses the device whose ifindex equals fl->oif, and
 *   - has a gateway equal to rdfl->gateway.
 * If the scan finds nothing, rt becomes &ip6_null_entry.
 *
 * The returned route always has a reference taken with dst_hold().
 *
 * NOTE(review): BACKTRACK() is a macro defined outside this chunk;
 * presumably it is what jumps to the "restart" and "out" labels - confirm
 * against its definition.  @flags is not referenced by the statements
 * visible here.
 */
1426 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1427 struct flowi *fl,
1428 int flags)
1430 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1431 struct rt6_info *rt;
1432 struct fib6_node *fn;
1435 * Get the "current" route for this destination and
1436 * check if the redirect has come from approriate router.
1438 * RFC 2461 specifies that redirects should only be
1439 * accepted if they come from the nexthop to the target.
1440 * Due to the way the routes are chosen, this notion
1441 * is a bit fuzzy and one might need to check all possible
1442 * routes.
1445 read_lock_bh(&table->tb6_lock);
1446 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1447 restart:
1448 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1450 * Current route is on-link; redirect is always invalid.
1452 * Seems, previous statement is not true. It could
1453 * be node, which looks for us as on-link (f.e. proxy ndisc)
1454 * But then router serving it might decide, that we should
1455 * know truth 8)8) --ANK (980726).
1457 if (rt6_check_expired(rt))
1458 continue;
1459 if (!(rt->rt6i_flags & RTF_GATEWAY))
1460 continue;
1461 if (fl->oif != rt->rt6i_dev->ifindex)
1462 continue;
1463 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1464 continue;
1465 break;
1468 if (!rt)
1469 rt = &ip6_null_entry;
1470 BACKTRACK(&fl->fl6_src);
1471 out:
1472 dst_hold(&rt->u.dst);
1474 read_unlock_bh(&table->tb6_lock);
1476 return rt;
1479 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1480 struct in6_addr *src,
1481 struct in6_addr *gateway,
1482 struct net_device *dev)
1484 int flags = RT6_LOOKUP_F_HAS_SADDR;
1485 struct ip6rd_flowi rdfl = {
1486 .fl = {
1487 .oif = dev->ifindex,
1488 .nl_u = {
1489 .ip6_u = {
1490 .daddr = *dest,
1491 .saddr = *src,
1495 .gateway = *gateway,
1498 if (rt6_need_strict(dest))
1499 flags |= RT6_LOOKUP_F_IFACE;
1501 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1504 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1505 struct in6_addr *saddr,
1506 struct neighbour *neigh, u8 *lladdr, int on_link)
1508 struct rt6_info *rt, *nrt = NULL;
1509 struct netevent_redirect netevent;
1511 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1513 if (rt == &ip6_null_entry) {
1514 if (net_ratelimit())
1515 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1516 "for redirect target\n");
1517 goto out;
1521 * We have finally decided to accept it.
1524 neigh_update(neigh, lladdr, NUD_STALE,
1525 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1526 NEIGH_UPDATE_F_OVERRIDE|
1527 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1528 NEIGH_UPDATE_F_ISROUTER))
1532 * Redirect received -> path was valid.
1533 * Look, redirects are sent only in response to data packets,
1534 * so that this nexthop apparently is reachable. --ANK
1536 dst_confirm(&rt->u.dst);
1538 /* Duplicate redirect: silently ignore. */
1539 if (neigh == rt->u.dst.neighbour)
1540 goto out;
1542 nrt = ip6_rt_copy(rt);
1543 if (nrt == NULL)
1544 goto out;
1546 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1547 if (on_link)
1548 nrt->rt6i_flags &= ~RTF_GATEWAY;
1550 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1551 nrt->rt6i_dst.plen = 128;
1552 nrt->u.dst.flags |= DST_HOST;
1554 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1555 nrt->rt6i_nexthop = neigh_clone(neigh);
1556 /* Reset pmtu, it may be better */
1557 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1558 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1560 if (ip6_ins_rt(nrt))
1561 goto out;
1563 netevent.old = &rt->u.dst;
1564 netevent.new = &nrt->u.dst;
1565 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1567 if (rt->rt6i_flags&RTF_CACHE) {
1568 ip6_del_rt(rt);
1569 return;
1572 out:
1573 dst_release(&rt->u.dst);
1574 return;
1578 * Handle ICMP "packet too big" messages
1579 * i.e. Path MTU discovery
1582 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1583 u32 pmtu, int ifindex)
1585 struct rt6_info *rt, *nrt;
1586 int allfrag = 0;
1588 rt = rt6_lookup(daddr, saddr, ifindex, 0);
1589 if (rt == NULL)
1590 return;
1592 if (pmtu >= dst_mtu(&rt->u.dst))
1593 goto out;
1595 if (pmtu < IPV6_MIN_MTU) {
1597 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1598 * MTU (1280) and a fragment header should always be included
1599 * after a node receiving Too Big message reporting PMTU is
1600 * less than the IPv6 Minimum Link MTU.
1602 pmtu = IPV6_MIN_MTU;
1603 allfrag = 1;
1606 /* New mtu received -> path was valid.
1607 They are sent only in response to data packets,
1608 so that this nexthop apparently is reachable. --ANK
1610 dst_confirm(&rt->u.dst);
1612 /* Host route. If it is static, it would be better
1613 not to override it, but add new one, so that
1614 when cache entry will expire old pmtu
1615 would return automatically.
1617 if (rt->rt6i_flags & RTF_CACHE) {
1618 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1619 if (allfrag)
1620 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1621 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1622 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1623 goto out;
1626 /* Network route.
1627 Two cases are possible:
1628 1. It is connected route. Action: COW
1629 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1631 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1632 nrt = rt6_alloc_cow(rt, daddr, saddr);
1633 else
1634 nrt = rt6_alloc_clone(rt, daddr);
1636 if (nrt) {
1637 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1638 if (allfrag)
1639 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1641 /* According to RFC 1981, detecting PMTU increase shouldn't be
1642 * happened within 5 mins, the recommended timer is 10 mins.
1643 * Here this route expiration time is set to ip6_rt_mtu_expires
1644 * which is 10 mins. After 10 mins the decreased pmtu is expired
1645 * and detecting PMTU increase will be automatically happened.
1647 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1648 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1650 ip6_ins_rt(nrt);
1652 out:
1653 dst_release(&rt->u.dst);
1656 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1657 struct net_device *dev, u32 pmtu)
1660 * RFC 1981 states that a node "MUST reduce the size of the packets it
1661 * is sending along the path" that caused the Packet Too Big message.
1662 * Since it's not possible in the general case to determine which
1663 * interface was used to send the original packet, we update the MTU
1664 * on the interface that will be used to send future packets. We also
1665 * update the MTU on the interface that received the Packet Too Big in
1666 * case the original packet was forced out that interface with
1667 * SO_BINDTODEVICE or similar. This is the next best thing to the
1668 * correct behaviour, which would be to update the MTU on all
1669 * interfaces.
1671 rt6_do_pmtu_disc(daddr, saddr, pmtu, 0);
1672 rt6_do_pmtu_disc(daddr, saddr, pmtu, dev->ifindex);
1676 * Misc support functions
1679 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1681 struct rt6_info *rt = ip6_dst_alloc();
1683 if (rt) {
1684 rt->u.dst.input = ort->u.dst.input;
1685 rt->u.dst.output = ort->u.dst.output;
1687 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1688 rt->u.dst.error = ort->u.dst.error;
1689 rt->u.dst.dev = ort->u.dst.dev;
1690 if (rt->u.dst.dev)
1691 dev_hold(rt->u.dst.dev);
1692 rt->rt6i_idev = ort->rt6i_idev;
1693 if (rt->rt6i_idev)
1694 in6_dev_hold(rt->rt6i_idev);
1695 rt->u.dst.lastuse = jiffies;
1696 rt->rt6i_expires = 0;
1698 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1699 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1700 rt->rt6i_metric = 0;
1702 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1703 #ifdef CONFIG_IPV6_SUBTREES
1704 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1705 #endif
1706 rt->rt6i_table = ort->rt6i_table;
1708 return rt;
1711 #ifdef CONFIG_IPV6_ROUTE_INFO
1712 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1713 struct in6_addr *gwaddr, int ifindex)
1715 struct fib6_node *fn;
1716 struct rt6_info *rt = NULL;
1717 struct fib6_table *table;
1719 table = fib6_get_table(RT6_TABLE_INFO);
1720 if (table == NULL)
1721 return NULL;
1723 write_lock_bh(&table->tb6_lock);
1724 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1725 if (!fn)
1726 goto out;
1728 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1729 if (rt->rt6i_dev->ifindex != ifindex)
1730 continue;
1731 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1732 continue;
1733 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1734 continue;
1735 dst_hold(&rt->u.dst);
1736 break;
1738 out:
1739 write_unlock_bh(&table->tb6_lock);
1740 return rt;
1743 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1744 struct in6_addr *gwaddr, int ifindex,
1745 unsigned pref)
1747 struct fib6_config cfg = {
1748 .fc_table = RT6_TABLE_INFO,
1749 .fc_metric = 1024,
1750 .fc_ifindex = ifindex,
1751 .fc_dst_len = prefixlen,
1752 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1753 RTF_UP | RTF_PREF(pref),
1756 ipv6_addr_copy(&cfg.fc_dst, prefix);
1757 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1759 /* We should treat it as a default route if prefix length is 0. */
1760 if (!prefixlen)
1761 cfg.fc_flags |= RTF_DEFAULT;
1763 ip6_route_add(&cfg);
1765 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1767 #endif
1769 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1771 struct rt6_info *rt;
1772 struct fib6_table *table;
1774 table = fib6_get_table(RT6_TABLE_DFLT);
1775 if (table == NULL)
1776 return NULL;
1778 write_lock_bh(&table->tb6_lock);
1779 for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1780 if (dev == rt->rt6i_dev &&
1781 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1782 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1783 break;
1785 if (rt)
1786 dst_hold(&rt->u.dst);
1787 write_unlock_bh(&table->tb6_lock);
1788 return rt;
1791 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1792 struct net_device *dev,
1793 unsigned int pref)
1795 struct fib6_config cfg = {
1796 .fc_table = RT6_TABLE_DFLT,
1797 .fc_metric = 1024,
1798 .fc_ifindex = dev->ifindex,
1799 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1800 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1803 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1805 ip6_route_add(&cfg);
1807 return rt6_get_dflt_router(gwaddr, dev);
1810 void rt6_purge_dflt_routers(void)
1812 struct rt6_info *rt;
1813 struct fib6_table *table;
1815 /* NOTE: Keep consistent with rt6_get_dflt_router */
1816 table = fib6_get_table(RT6_TABLE_DFLT);
1817 if (table == NULL)
1818 return;
1820 restart:
1821 read_lock_bh(&table->tb6_lock);
1822 for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1823 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1824 dst_hold(&rt->u.dst);
1825 read_unlock_bh(&table->tb6_lock);
1826 ip6_del_rt(rt);
1827 goto restart;
1830 read_unlock_bh(&table->tb6_lock);
1833 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1834 struct fib6_config *cfg)
1836 memset(cfg, 0, sizeof(*cfg));
1838 cfg->fc_table = RT6_TABLE_MAIN;
1839 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1840 cfg->fc_metric = rtmsg->rtmsg_metric;
1841 cfg->fc_expires = rtmsg->rtmsg_info;
1842 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1843 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1844 cfg->fc_flags = rtmsg->rtmsg_flags;
1846 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1847 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1848 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1851 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1853 struct fib6_config cfg;
1854 struct in6_rtmsg rtmsg;
1855 int err;
1857 switch(cmd) {
1858 case SIOCADDRT: /* Add a route */
1859 case SIOCDELRT: /* Delete a route */
1860 if (!capable(CAP_NET_ADMIN))
1861 return -EPERM;
1862 err = copy_from_user(&rtmsg, arg,
1863 sizeof(struct in6_rtmsg));
1864 if (err)
1865 return -EFAULT;
1867 rtmsg_to_fib6_config(&rtmsg, &cfg);
1869 rtnl_lock();
1870 switch (cmd) {
1871 case SIOCADDRT:
1872 err = ip6_route_add(&cfg);
1873 break;
1874 case SIOCDELRT:
1875 err = ip6_route_del(&cfg);
1876 break;
1877 default:
1878 err = -EINVAL;
1880 rtnl_unlock();
1882 return err;
1885 return -EINVAL;
1889 * Drop the packet on the floor
1892 static inline int ip6_pkt_drop(struct sk_buff *skb, int code,
1893 int ipstats_mib_noroutes)
1895 int type;
1896 switch (ipstats_mib_noroutes) {
1897 case IPSTATS_MIB_INNOROUTES:
1898 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1899 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1900 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1901 break;
1903 /* FALLTHROUGH */
1904 case IPSTATS_MIB_OUTNOROUTES:
1905 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1906 break;
1908 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1909 kfree_skb(skb);
1910 return 0;
1913 static int ip6_pkt_discard(struct sk_buff *skb)
1915 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1918 static int ip6_pkt_discard_out(struct sk_buff *skb)
1920 skb->dev = skb->dst->dev;
1921 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1924 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1926 static int ip6_pkt_prohibit(struct sk_buff *skb)
1928 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1931 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1933 skb->dev = skb->dst->dev;
1934 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
/* Free the packet without sending ICMP or touching any counter. */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
1943 #endif
1946 * Allocate a dst for local (unicast / anycast) address.
1949 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1950 const struct in6_addr *addr,
1951 int anycast)
1953 struct rt6_info *rt = ip6_dst_alloc();
1954 struct neighbour *neigh;
1956 if (rt == NULL)
1957 return ERR_PTR(-ENOMEM);
1959 dev_hold(&loopback_dev);
1960 in6_dev_hold(idev);
1962 rt->u.dst.flags = DST_HOST;
1963 rt->u.dst.input = ip6_input;
1964 rt->u.dst.output = ip6_output;
1965 rt->rt6i_dev = &loopback_dev;
1966 rt->rt6i_idev = idev;
1967 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1968 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1969 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1970 rt->u.dst.obsolete = -1;
1972 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1973 if (anycast)
1974 rt->rt6i_flags |= RTF_ANYCAST;
1975 else
1976 rt->rt6i_flags |= RTF_LOCAL;
1977 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1978 if (IS_ERR(neigh)) {
1979 dst_free(&rt->u.dst);
1981 /* We are casting this because that is the return
1982 * value type. But an errno encoded pointer is the
1983 * same regardless of the underlying pointer type,
1984 * and that's what we are returning. So this is OK.
1986 return (struct rt6_info *) neigh;
1988 rt->rt6i_nexthop = neigh;
1990 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1991 rt->rt6i_dst.plen = 128;
1992 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1994 atomic_set(&rt->u.dst.__refcnt, 1);
1996 return rt;
1999 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2001 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
2002 rt != &ip6_null_entry) {
2003 RT6_TRACE("deleted by ifdown %p\n", rt);
2004 return -1;
2006 return 0;
2009 void rt6_ifdown(struct net_device *dev)
2011 fib6_clean_all(fib6_ifdown, 0, dev);
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned		mtu;	/* its new MTU */
};
2020 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2022 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2023 struct inet6_dev *idev;
2025 /* In IPv6 pmtu discovery is not optional,
2026 so that RTAX_MTU lock cannot disable it.
2027 We still use this lock to block changes
2028 caused by addrconf/ndisc.
2031 idev = __in6_dev_get(arg->dev);
2032 if (idev == NULL)
2033 return 0;
2035 /* For administrative MTU increase, there is no way to discover
2036 IPv6 PMTU increase, so PMTU increase should be updated here.
2037 Since RFC 1981 doesn't include administrative MTU increase
2038 update PMTU increase is a MUST. (i.e. jumbo frame)
2041 If new MTU is less than route PMTU, this new MTU will be the
2042 lowest MTU in the path, update the route PMTU to reflect PMTU
2043 decreases; if new MTU is greater than route PMTU, and the
2044 old MTU is the lowest MTU in the path, update the route PMTU
2045 to reflect the increase. In this case if the other nodes' MTU
2046 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2047 PMTU discouvery.
2049 if (rt->rt6i_dev == arg->dev &&
2050 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2051 (dst_mtu(&rt->u.dst) >= arg->mtu ||
2052 (dst_mtu(&rt->u.dst) < arg->mtu &&
2053 dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2054 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2055 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
2057 return 0;
2060 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2062 struct rt6_mtu_change_arg arg = {
2063 .dev = dev,
2064 .mtu = mtu,
2067 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
2070 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2071 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2072 [RTA_OIF] = { .type = NLA_U32 },
2073 [RTA_IIF] = { .type = NLA_U32 },
2074 [RTA_PRIORITY] = { .type = NLA_U32 },
2075 [RTA_METRICS] = { .type = NLA_NESTED },
2078 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2079 struct fib6_config *cfg)
2081 struct rtmsg *rtm;
2082 struct nlattr *tb[RTA_MAX+1];
2083 int err;
2085 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2086 if (err < 0)
2087 goto errout;
2089 err = -EINVAL;
2090 rtm = nlmsg_data(nlh);
2091 memset(cfg, 0, sizeof(*cfg));
2093 cfg->fc_table = rtm->rtm_table;
2094 cfg->fc_dst_len = rtm->rtm_dst_len;
2095 cfg->fc_src_len = rtm->rtm_src_len;
2096 cfg->fc_flags = RTF_UP;
2097 cfg->fc_protocol = rtm->rtm_protocol;
2099 if (rtm->rtm_type == RTN_UNREACHABLE)
2100 cfg->fc_flags |= RTF_REJECT;
2102 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2103 cfg->fc_nlinfo.nlh = nlh;
2105 if (tb[RTA_GATEWAY]) {
2106 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2107 cfg->fc_flags |= RTF_GATEWAY;
2110 if (tb[RTA_DST]) {
2111 int plen = (rtm->rtm_dst_len + 7) >> 3;
2113 if (nla_len(tb[RTA_DST]) < plen)
2114 goto errout;
2116 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2119 if (tb[RTA_SRC]) {
2120 int plen = (rtm->rtm_src_len + 7) >> 3;
2122 if (nla_len(tb[RTA_SRC]) < plen)
2123 goto errout;
2125 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2128 if (tb[RTA_OIF])
2129 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2131 if (tb[RTA_PRIORITY])
2132 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2134 if (tb[RTA_METRICS]) {
2135 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2136 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2139 if (tb[RTA_TABLE])
2140 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2142 err = 0;
2143 errout:
2144 return err;
2147 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2149 struct fib6_config cfg;
2150 int err;
2152 err = rtm_to_fib6_config(skb, nlh, &cfg);
2153 if (err < 0)
2154 return err;
2156 return ip6_route_del(&cfg);
2159 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2161 struct fib6_config cfg;
2162 int err;
2164 err = rtm_to_fib6_config(skb, nlh, &cfg);
2165 if (err < 0)
2166 return err;
2168 return ip6_route_add(&cfg);
2171 static inline size_t rt6_nlmsg_size(void)
2173 return NLMSG_ALIGN(sizeof(struct rtmsg))
2174 + nla_total_size(16) /* RTA_SRC */
2175 + nla_total_size(16) /* RTA_DST */
2176 + nla_total_size(16) /* RTA_GATEWAY */
2177 + nla_total_size(16) /* RTA_PREFSRC */
2178 + nla_total_size(4) /* RTA_TABLE */
2179 + nla_total_size(4) /* RTA_IIF */
2180 + nla_total_size(4) /* RTA_OIF */
2181 + nla_total_size(4) /* RTA_PRIORITY */
2182 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2183 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node - append one netlink route message describing @rt to @skb.
 *
 * @dst, @src: when non-NULL they are emitted as full 16-byte RTA_DST /
 *             RTA_SRC attributes and the corresponding prefix length in the
 *             header is forced to 128; when NULL the route's own prefix is
 *             emitted if its length is non-zero (RTA_SRC handling is only
 *             compiled with CONFIG_IPV6_SUBTREES).
 * @iif:       non-zero emits RTA_IIF (or, with CONFIG_IPV6_MROUTE and a
 *             multicast destination, defers to ip6mr_get_route()); zero
 *             with @dst set emits RTA_PREFSRC when ipv6_get_saddr() succeeds.
 * @type, @pid, @seq, @flags: passed to nlmsg_put() for the message header.
 * @prefix:    non-zero means "prefix routes only": a route without
 *             RTF_PREFIX_RT is skipped and 1 is returned.
 * @nowait:    only consulted in the multicast branch.
 *
 * Returns the value of nlmsg_end() on success, 1 for a skipped route, 0
 * when ip6mr_get_route() returned 0 with @nowait clear, or -EMSGSIZE when
 * nlmsg_put() fails or the nla_put_failure label is reached (the partial
 * message is cancelled first).
 *
 * NOTE(review): NLA_PUT() / NLA_PUT_U32() are macros defined outside this
 * chunk; presumably they jump to nla_put_failure when the attribute does
 * not fit - confirm against their definition.
 */
2186 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2187 struct in6_addr *dst, struct in6_addr *src,
2188 int iif, int type, u32 pid, u32 seq,
2189 int prefix, int nowait, unsigned int flags)
2191 struct rtmsg *rtm;
2192 struct nlmsghdr *nlh;
2193 long expires;
2194 u32 table;
2196 if (prefix) { /* user wants prefix routes only */
2197 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2198 /* success since this is not a prefix route */
2199 return 1;
2203 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2204 if (nlh == NULL)
2205 return -EMSGSIZE;
2207 rtm = nlmsg_data(nlh);
2208 rtm->rtm_family = AF_INET6;
2209 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2210 rtm->rtm_src_len = rt->rt6i_src.plen;
2211 rtm->rtm_tos = 0;
2212 if (rt->rt6i_table)
2213 table = rt->rt6i_table->tb6_id;
2214 else
2215 table = RT6_TABLE_UNSPEC;
/* The table id goes both into the header field and into an RTA_TABLE attribute. */
2216 rtm->rtm_table = table;
2217 NLA_PUT_U32(skb, RTA_TABLE, table);
2218 if (rt->rt6i_flags&RTF_REJECT)
2219 rtm->rtm_type = RTN_UNREACHABLE;
2220 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2221 rtm->rtm_type = RTN_LOCAL;
2222 else
2223 rtm->rtm_type = RTN_UNICAST;
2224 rtm->rtm_flags = 0;
2225 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
/* Start from the stored protocol, then override it from the route flags (first match wins). */
2226 rtm->rtm_protocol = rt->rt6i_protocol;
2227 if (rt->rt6i_flags&RTF_DYNAMIC)
2228 rtm->rtm_protocol = RTPROT_REDIRECT;
2229 else if (rt->rt6i_flags & RTF_ADDRCONF)
2230 rtm->rtm_protocol = RTPROT_KERNEL;
2231 else if (rt->rt6i_flags&RTF_DEFAULT)
2232 rtm->rtm_protocol = RTPROT_RA;
2234 if (rt->rt6i_flags&RTF_CACHE)
2235 rtm->rtm_flags |= RTM_F_CLONED;
2237 if (dst) {
2238 NLA_PUT(skb, RTA_DST, 16, dst);
2239 rtm->rtm_dst_len = 128;
2240 } else if (rtm->rtm_dst_len)
2241 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2242 #ifdef CONFIG_IPV6_SUBTREES
2243 if (src) {
2244 NLA_PUT(skb, RTA_SRC, 16, src);
2245 rtm->rtm_src_len = 128;
2246 } else if (rtm->rtm_src_len)
2247 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2248 #endif
2249 if (iif) {
2250 #ifdef CONFIG_IPV6_MROUTE
2251 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2252 int err = ip6mr_get_route(skb, rtm, nowait);
2253 if (err <= 0) {
2254 if (!nowait) {
2255 if (err == 0)
2256 return 0;
2257 goto nla_put_failure;
2258 } else {
2259 if (err == -EMSGSIZE)
2260 goto nla_put_failure;
2263 } else
2264 #endif
2265 NLA_PUT_U32(skb, RTA_IIF, iif);
2266 } else if (dst) {
2267 struct in6_addr saddr_buf;
2268 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2269 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2272 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2273 goto nla_put_failure;
2275 if (rt->u.dst.neighbour)
2276 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2278 if (rt->u.dst.dev)
2279 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2281 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Remaining lifetime relative to jiffies; 0 when rt6i_expires is unset. */
2283 expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2284 if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2285 expires, rt->u.dst.error) < 0)
2286 goto nla_put_failure;
2288 return nlmsg_end(skb, nlh);
2290 nla_put_failure:
2291 nlmsg_cancel(skb, nlh);
2292 return -EMSGSIZE;
2295 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2297 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2298 int prefix;
2300 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2301 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2302 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2303 } else
2304 prefix = 0;
2306 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2307 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2308 prefix, 0, NLM_F_MULTI);
2311 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2313 struct nlattr *tb[RTA_MAX+1];
2314 struct rt6_info *rt;
2315 struct sk_buff *skb;
2316 struct rtmsg *rtm;
2317 struct flowi fl;
2318 int err, iif = 0;
2320 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2321 if (err < 0)
2322 goto errout;
2324 err = -EINVAL;
2325 memset(&fl, 0, sizeof(fl));
2327 if (tb[RTA_SRC]) {
2328 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2329 goto errout;
2331 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2334 if (tb[RTA_DST]) {
2335 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2336 goto errout;
2338 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2341 if (tb[RTA_IIF])
2342 iif = nla_get_u32(tb[RTA_IIF]);
2344 if (tb[RTA_OIF])
2345 fl.oif = nla_get_u32(tb[RTA_OIF]);
2347 if (iif) {
2348 struct net_device *dev;
2349 dev = __dev_get_by_index(iif);
2350 if (!dev) {
2351 err = -ENODEV;
2352 goto errout;
2356 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2357 if (skb == NULL) {
2358 err = -ENOBUFS;
2359 goto errout;
2362 /* Reserve room for dummy headers, this skb can pass
2363 through good chunk of routing engine.
2365 skb_reset_mac_header(skb);
2366 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2368 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2369 skb->dst = &rt->u.dst;
2371 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2372 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2373 nlh->nlmsg_seq, 0, 0, 0);
2374 if (err < 0) {
2375 kfree_skb(skb);
2376 goto errout;
2379 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2380 errout:
2381 return err;
2384 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2386 struct sk_buff *skb;
2387 u32 pid = 0, seq = 0;
2388 struct nlmsghdr *nlh = NULL;
2389 int err = -ENOBUFS;
2391 if (info) {
2392 pid = info->pid;
2393 nlh = info->nlh;
2394 if (nlh)
2395 seq = nlh->nlmsg_seq;
2398 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2399 if (skb == NULL)
2400 goto errout;
2402 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0, 0);
2403 if (err < 0) {
2404 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2405 WARN_ON(err == -EMSGSIZE);
2406 kfree_skb(skb);
2407 goto errout;
2409 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2410 errout:
2411 if (err < 0)
2412 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2416 * /proc
2419 #ifdef CONFIG_PROC_FS
2421 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/* Cursor state shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char	*buffer;	/* output buffer */
	int	offset;		/* read offset requested by the caller */
	int	length;		/* bytes requested by the caller */
	int	skip;		/* routes skipped so far */
	int	len;		/* bytes written to buffer so far */
};
2432 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2434 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2436 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2437 arg->skip++;
2438 return 0;
2441 if (arg->len >= arg->length)
2442 return 0;
2444 arg->len += sprintf(arg->buffer + arg->len,
2445 NIP6_SEQFMT " %02x ",
2446 NIP6(rt->rt6i_dst.addr),
2447 rt->rt6i_dst.plen);
2449 #ifdef CONFIG_IPV6_SUBTREES
2450 arg->len += sprintf(arg->buffer + arg->len,
2451 NIP6_SEQFMT " %02x ",
2452 NIP6(rt->rt6i_src.addr),
2453 rt->rt6i_src.plen);
2454 #else
2455 arg->len += sprintf(arg->buffer + arg->len,
2456 "00000000000000000000000000000000 00 ");
2457 #endif
2459 if (rt->rt6i_nexthop) {
2460 arg->len += sprintf(arg->buffer + arg->len,
2461 NIP6_SEQFMT,
2462 NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2463 } else {
2464 arg->len += sprintf(arg->buffer + arg->len,
2465 "00000000000000000000000000000000");
2467 arg->len += sprintf(arg->buffer + arg->len,
2468 " %08x %08x %08x %08x %8s\n",
2469 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2470 rt->u.dst.__use, rt->rt6i_flags,
2471 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2472 return 0;
2475 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2477 struct rt6_proc_arg arg = {
2478 .buffer = buffer,
2479 .offset = offset,
2480 .length = length,
2483 fib6_clean_all(rt6_info_route, 0, &arg);
2485 *start = buffer;
2486 if (offset)
2487 *start += offset % RT6_INFO_LEN;
2489 arg.len -= offset % RT6_INFO_LEN;
2491 if (arg.len > length)
2492 arg.len = length;
2493 if (arg.len < 0)
2494 arg.len = 0;
2496 return arg.len;
2499 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2501 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2502 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2503 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2504 rt6_stats.fib_rt_cache,
2505 atomic_read(&ip6_dst_ops.entries),
2506 rt6_stats.fib_discarded_routes);
2508 return 0;
2511 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2513 return single_open(file, rt6_stats_seq_show, NULL);
2516 static const struct file_operations rt6_stats_seq_fops = {
2517 .owner = THIS_MODULE,
2518 .open = rt6_stats_seq_open,
2519 .read = seq_read,
2520 .llseek = seq_lseek,
2521 .release = single_release,
2523 #endif /* CONFIG_PROC_FS */
2525 #ifdef CONFIG_SYSCTL
2527 static int flush_delay;
2529 static
2530 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2531 void __user *buffer, size_t *lenp, loff_t *ppos)
2533 if (write) {
2534 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2535 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2536 return 0;
2537 } else
2538 return -EINVAL;
2541 ctl_table ipv6_route_table[] = {
2543 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2544 .procname = "flush",
2545 .data = &flush_delay,
2546 .maxlen = sizeof(int),
2547 .mode = 0200,
2548 .proc_handler = &ipv6_sysctl_rtcache_flush
2551 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2552 .procname = "gc_thresh",
2553 .data = &ip6_dst_ops.gc_thresh,
2554 .maxlen = sizeof(int),
2555 .mode = 0644,
2556 .proc_handler = &proc_dointvec,
2559 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2560 .procname = "max_size",
2561 .data = &ip6_rt_max_size,
2562 .maxlen = sizeof(int),
2563 .mode = 0644,
2564 .proc_handler = &proc_dointvec,
2567 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2568 .procname = "gc_min_interval",
2569 .data = &ip6_rt_gc_min_interval,
2570 .maxlen = sizeof(int),
2571 .mode = 0644,
2572 .proc_handler = &proc_dointvec_jiffies,
2573 .strategy = &sysctl_jiffies,
2576 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2577 .procname = "gc_timeout",
2578 .data = &ip6_rt_gc_timeout,
2579 .maxlen = sizeof(int),
2580 .mode = 0644,
2581 .proc_handler = &proc_dointvec_jiffies,
2582 .strategy = &sysctl_jiffies,
2585 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2586 .procname = "gc_interval",
2587 .data = &ip6_rt_gc_interval,
2588 .maxlen = sizeof(int),
2589 .mode = 0644,
2590 .proc_handler = &proc_dointvec_jiffies,
2591 .strategy = &sysctl_jiffies,
2594 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2595 .procname = "gc_elasticity",
2596 .data = &ip6_rt_gc_elasticity,
2597 .maxlen = sizeof(int),
2598 .mode = 0644,
2599 .proc_handler = &proc_dointvec_jiffies,
2600 .strategy = &sysctl_jiffies,
2603 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2604 .procname = "mtu_expires",
2605 .data = &ip6_rt_mtu_expires,
2606 .maxlen = sizeof(int),
2607 .mode = 0644,
2608 .proc_handler = &proc_dointvec_jiffies,
2609 .strategy = &sysctl_jiffies,
2612 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2613 .procname = "min_adv_mss",
2614 .data = &ip6_rt_min_advmss,
2615 .maxlen = sizeof(int),
2616 .mode = 0644,
2617 .proc_handler = &proc_dointvec_jiffies,
2618 .strategy = &sysctl_jiffies,
2621 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2622 .procname = "gc_min_interval_ms",
2623 .data = &ip6_rt_gc_min_interval,
2624 .maxlen = sizeof(int),
2625 .mode = 0644,
2626 .proc_handler = &proc_dointvec_ms_jiffies,
2627 .strategy = &sysctl_ms_jiffies,
2629 { .ctl_name = 0 }
2632 #endif
2634 void __init ip6_route_init(void)
2636 #ifdef CONFIG_PROC_FS
2637 struct proc_dir_entry *p;
2638 #endif
2639 ip6_dst_ops.kmem_cachep =
2640 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2641 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2642 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2644 fib6_init();
2645 #ifdef CONFIG_PROC_FS
2646 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2647 if (p)
2648 p->owner = THIS_MODULE;
2650 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2651 #endif
2652 #ifdef CONFIG_XFRM
2653 xfrm6_init();
2654 #endif
2655 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2656 fib6_rules_init();
2657 #endif
2659 __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
2660 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
2661 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);
2664 void ip6_route_cleanup(void)
2666 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2667 fib6_rules_cleanup();
2668 #endif
2669 #ifdef CONFIG_PROC_FS
2670 proc_net_remove("ipv6_route");
2671 proc_net_remove("rt6_stats");
2672 #endif
2673 #ifdef CONFIG_XFRM
2674 xfrm6_fini();
2675 #endif
2676 rt6_ifdown(NULL);
2677 fib6_gc_cleanup();
2678 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);