ipv6: drop metadata dst in ip6_route_input
[linux-2.6/btrfs-unstable.git] / net / ipv6 / route.c
blob0947ad0b3de8a9a8abee2abcfc98e00886438826
1 /*
2 * Linux INET6 implementation
3 * FIB front-end.
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
14 /* Changes:
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
63 #include <asm/uaccess.h>
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
/* Grades of next-hop (NUD) reachability used when scoring candidate
 * routes; the negative values are failures of increasing severity.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
76 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops);
86 static int ip6_pkt_discard(struct sk_buff *skb);
87 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88 static int ip6_pkt_prohibit(struct sk_buff *skb);
89 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
90 static void ip6_link_failure(struct sk_buff *skb);
91 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
92 struct sk_buff *skb, u32 mtu);
93 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
94 struct sk_buff *skb);
95 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
96 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
98 #ifdef CONFIG_IPV6_ROUTE_INFO
99 static struct rt6_info *rt6_add_route_info(struct net *net,
100 const struct in6_addr *prefix, int prefixlen,
101 const struct in6_addr *gwaddr, int ifindex,
102 unsigned int pref);
103 static struct rt6_info *rt6_get_route_info(struct net *net,
104 const struct in6_addr *prefix, int prefixlen,
105 const struct in6_addr *gwaddr, int ifindex);
106 #endif
108 struct uncached_list {
109 spinlock_t lock;
110 struct list_head head;
113 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
115 static void rt6_uncached_list_add(struct rt6_info *rt)
117 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
119 rt->dst.flags |= DST_NOCACHE;
120 rt->rt6i_uncached_list = ul;
122 spin_lock_bh(&ul->lock);
123 list_add_tail(&rt->rt6i_uncached, &ul->head);
124 spin_unlock_bh(&ul->lock);
127 static void rt6_uncached_list_del(struct rt6_info *rt)
129 if (!list_empty(&rt->rt6i_uncached)) {
130 struct uncached_list *ul = rt->rt6i_uncached_list;
132 spin_lock_bh(&ul->lock);
133 list_del(&rt->rt6i_uncached);
134 spin_unlock_bh(&ul->lock);
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
140 struct net_device *loopback_dev = net->loopback_dev;
141 int cpu;
143 for_each_possible_cpu(cpu) {
144 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
145 struct rt6_info *rt;
147 spin_lock_bh(&ul->lock);
148 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149 struct inet6_dev *rt_idev = rt->rt6i_idev;
150 struct net_device *rt_dev = rt->dst.dev;
152 if (rt_idev && (rt_idev->dev == dev || !dev) &&
153 rt_idev->dev != loopback_dev) {
154 rt->rt6i_idev = in6_dev_get(loopback_dev);
155 in6_dev_put(rt_idev);
158 if (rt_dev && (rt_dev == dev || !dev) &&
159 rt_dev != loopback_dev) {
160 rt->dst.dev = loopback_dev;
161 dev_hold(rt->dst.dev);
162 dev_put(rt_dev);
165 spin_unlock_bh(&ul->lock);
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
171 return dst_metrics_write_ptr(rt->dst.from);
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
176 struct rt6_info *rt = (struct rt6_info *)dst;
178 if (rt->rt6i_flags & RTF_PCPU)
179 return rt6_pcpu_cow_metrics(rt);
180 else if (rt->rt6i_flags & RTF_CACHE)
181 return NULL;
182 else
183 return dst_cow_metrics_generic(dst, old);
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
187 struct sk_buff *skb,
188 const void *daddr)
190 struct in6_addr *p = &rt->rt6i_gateway;
192 if (!ipv6_addr_any(p))
193 return (const void *) p;
194 else if (skb)
195 return &ipv6_hdr(skb)->daddr;
196 return daddr;
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
200 struct sk_buff *skb,
201 const void *daddr)
203 struct rt6_info *rt = (struct rt6_info *) dst;
204 struct neighbour *n;
206 daddr = choose_neigh_daddr(rt, skb, daddr);
207 n = __ipv6_neigh_lookup(dst->dev, daddr);
208 if (n)
209 return n;
210 return neigh_create(&nd_tbl, daddr, dst->dev);
213 static struct dst_ops ip6_dst_ops_template = {
214 .family = AF_INET6,
215 .gc = ip6_dst_gc,
216 .gc_thresh = 1024,
217 .check = ip6_dst_check,
218 .default_advmss = ip6_default_advmss,
219 .mtu = ip6_mtu,
220 .cow_metrics = ipv6_cow_metrics,
221 .destroy = ip6_dst_destroy,
222 .ifdown = ip6_dst_ifdown,
223 .negative_advice = ip6_negative_advice,
224 .link_failure = ip6_link_failure,
225 .update_pmtu = ip6_rt_update_pmtu,
226 .redirect = rt6_do_redirect,
227 .local_out = __ip6_local_out,
228 .neigh_lookup = ip6_neigh_lookup,
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
233 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
235 return mtu ? : dst->dev->mtu;
238 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
239 struct sk_buff *skb, u32 mtu)
243 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
244 struct sk_buff *skb)
248 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
249 unsigned long old)
251 return NULL;
254 static struct dst_ops ip6_dst_blackhole_ops = {
255 .family = AF_INET6,
256 .destroy = ip6_dst_destroy,
257 .check = ip6_dst_check,
258 .mtu = ip6_blackhole_mtu,
259 .default_advmss = ip6_default_advmss,
260 .update_pmtu = ip6_rt_blackhole_update_pmtu,
261 .redirect = ip6_rt_blackhole_redirect,
262 .cow_metrics = ip6_rt_blackhole_cow_metrics,
263 .neigh_lookup = ip6_neigh_lookup,
266 static const u32 ip6_template_metrics[RTAX_MAX] = {
267 [RTAX_HOPLIMIT - 1] = 0,
270 static const struct rt6_info ip6_null_entry_template = {
271 .dst = {
272 .__refcnt = ATOMIC_INIT(1),
273 .__use = 1,
274 .obsolete = DST_OBSOLETE_FORCE_CHK,
275 .error = -ENETUNREACH,
276 .input = ip6_pkt_discard,
277 .output = ip6_pkt_discard_out,
279 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
280 .rt6i_protocol = RTPROT_KERNEL,
281 .rt6i_metric = ~(u32) 0,
282 .rt6i_ref = ATOMIC_INIT(1),
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287 static const struct rt6_info ip6_prohibit_entry_template = {
288 .dst = {
289 .__refcnt = ATOMIC_INIT(1),
290 .__use = 1,
291 .obsolete = DST_OBSOLETE_FORCE_CHK,
292 .error = -EACCES,
293 .input = ip6_pkt_prohibit,
294 .output = ip6_pkt_prohibit_out,
296 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
297 .rt6i_protocol = RTPROT_KERNEL,
298 .rt6i_metric = ~(u32) 0,
299 .rt6i_ref = ATOMIC_INIT(1),
302 static const struct rt6_info ip6_blk_hole_entry_template = {
303 .dst = {
304 .__refcnt = ATOMIC_INIT(1),
305 .__use = 1,
306 .obsolete = DST_OBSOLETE_FORCE_CHK,
307 .error = -EINVAL,
308 .input = dst_discard,
309 .output = dst_discard_sk,
311 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
312 .rt6i_protocol = RTPROT_KERNEL,
313 .rt6i_metric = ~(u32) 0,
314 .rt6i_ref = ATOMIC_INIT(1),
317 #endif
319 /* allocate dst with ip6_dst_ops */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321 struct net_device *dev,
322 int flags,
323 struct fib6_table *table)
325 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326 0, DST_OBSOLETE_FORCE_CHK, flags);
328 if (rt) {
329 struct dst_entry *dst = &rt->dst;
331 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332 INIT_LIST_HEAD(&rt->rt6i_siblings);
333 INIT_LIST_HEAD(&rt->rt6i_uncached);
335 return rt;
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339 struct net_device *dev,
340 int flags,
341 struct fib6_table *table)
343 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
345 if (rt) {
346 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
347 if (rt->rt6i_pcpu) {
348 int cpu;
350 for_each_possible_cpu(cpu) {
351 struct rt6_info **p;
353 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354 /* no one shares rt */
355 *p = NULL;
357 } else {
358 dst_destroy((struct dst_entry *)rt);
359 return NULL;
363 return rt;
366 static void ip6_dst_destroy(struct dst_entry *dst)
368 struct rt6_info *rt = (struct rt6_info *)dst;
369 struct dst_entry *from = dst->from;
370 struct inet6_dev *idev;
372 dst_destroy_metrics_generic(dst);
373 free_percpu(rt->rt6i_pcpu);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
377 if (idev) {
378 rt->rt6i_idev = NULL;
379 in6_dev_put(idev);
382 dst->from = NULL;
383 dst_release(from);
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
387 int how)
389 struct rt6_info *rt = (struct rt6_info *)dst;
390 struct inet6_dev *idev = rt->rt6i_idev;
391 struct net_device *loopback_dev =
392 dev_net(dev)->loopback_dev;
394 if (dev != loopback_dev) {
395 if (idev && idev->dev == dev) {
396 struct inet6_dev *loopback_idev =
397 in6_dev_get(loopback_dev);
398 if (loopback_idev) {
399 rt->rt6i_idev = loopback_idev;
400 in6_dev_put(idev);
406 static bool rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES) {
409 if (time_after(jiffies, rt->dst.expires))
410 return true;
411 } else if (rt->dst.from) {
412 return rt6_check_expired((struct rt6_info *) rt->dst.from);
414 return false;
417 /* Multipath route selection:
418 * Hash based function using packet header and flowlabel.
419 * Adapted from fib_info_hashfn()
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422 const struct flowi6 *fl6)
424 unsigned int val = fl6->flowi6_proto;
426 val ^= ipv6_addr_hash(&fl6->daddr);
427 val ^= ipv6_addr_hash(&fl6->saddr);
429 /* Work only if this not encapsulated */
430 switch (fl6->flowi6_proto) {
431 case IPPROTO_UDP:
432 case IPPROTO_TCP:
433 case IPPROTO_SCTP:
434 val ^= (__force u16)fl6->fl6_sport;
435 val ^= (__force u16)fl6->fl6_dport;
436 break;
438 case IPPROTO_ICMPV6:
439 val ^= (__force u16)fl6->fl6_icmp_type;
440 val ^= (__force u16)fl6->fl6_icmp_code;
441 break;
443 /* RFC6438 recommands to use flowlabel */
444 val ^= (__force u32)fl6->flowlabel;
446 /* Perhaps, we need to tune, this function? */
447 val = val ^ (val >> 7) ^ (val >> 12);
448 return val % candidate_count;
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 struct flowi6 *fl6, int oif,
453 int strict)
455 struct rt6_info *sibling, *next_sibling;
456 int route_choosen;
458 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459 /* Don't change the route, if route_choosen == 0
460 * (siblings does not include ourself)
462 if (route_choosen)
463 list_for_each_entry_safe(sibling, next_sibling,
464 &match->rt6i_siblings, rt6i_siblings) {
465 route_choosen--;
466 if (route_choosen == 0) {
467 if (rt6_score_route(sibling, oif, strict) < 0)
468 break;
469 match = sibling;
470 break;
473 return match;
477 * Route lookup. Any table->tb6_lock is implied.
480 static inline struct rt6_info *rt6_device_match(struct net *net,
481 struct rt6_info *rt,
482 const struct in6_addr *saddr,
483 int oif,
484 int flags)
486 struct rt6_info *local = NULL;
487 struct rt6_info *sprt;
489 if (!oif && ipv6_addr_any(saddr))
490 goto out;
492 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493 struct net_device *dev = sprt->dst.dev;
495 if (oif) {
496 if (dev->ifindex == oif)
497 return sprt;
498 if (dev->flags & IFF_LOOPBACK) {
499 if (!sprt->rt6i_idev ||
500 sprt->rt6i_idev->dev->ifindex != oif) {
501 if (flags & RT6_LOOKUP_F_IFACE && oif)
502 continue;
503 if (local && (!oif ||
504 local->rt6i_idev->dev->ifindex == oif))
505 continue;
507 local = sprt;
509 } else {
510 if (ipv6_chk_addr(net, saddr, dev,
511 flags & RT6_LOOKUP_F_IFACE))
512 return sprt;
516 if (oif) {
517 if (local)
518 return local;
520 if (flags & RT6_LOOKUP_F_IFACE)
521 return net->ipv6.ip6_null_entry;
523 out:
524 return rt;
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

/* Deferred worker: send the actual neighbour solicitation outside of
 * the rcu/bh context rt6_probe() runs in.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 */

	/*
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
599 * Default Router Selection (RFC 2461 6.3.6)
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
603 struct net_device *dev = rt->dst.dev;
604 if (!oif || dev->ifindex == oif)
605 return 2;
606 if ((dev->flags & IFF_LOOPBACK) &&
607 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
608 return 1;
609 return 0;
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
614 struct neighbour *neigh;
615 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
617 if (rt->rt6i_flags & RTF_NONEXTHOP ||
618 !(rt->rt6i_flags & RTF_GATEWAY))
619 return RT6_NUD_SUCCEED;
621 rcu_read_lock_bh();
622 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
623 if (neigh) {
624 read_lock(&neigh->lock);
625 if (neigh->nud_state & NUD_VALID)
626 ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628 else if (!(neigh->nud_state & NUD_FAILED))
629 ret = RT6_NUD_SUCCEED;
630 else
631 ret = RT6_NUD_FAIL_PROBE;
632 #endif
633 read_unlock(&neigh->lock);
634 } else {
635 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
638 rcu_read_unlock_bh();
640 return ret;
643 static int rt6_score_route(struct rt6_info *rt, int oif,
644 int strict)
646 int m;
648 m = rt6_check_dev(rt, oif);
649 if (!m && (strict & RT6_LOOKUP_F_IFACE))
650 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
653 #endif
654 if (strict & RT6_LOOKUP_F_REACHABLE) {
655 int n = rt6_check_neigh(rt);
656 if (n < 0)
657 return n;
659 return m;
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663 int *mpri, struct rt6_info *match,
664 bool *do_rr)
666 int m;
667 bool match_do_rr = false;
668 struct inet6_dev *idev = rt->rt6i_idev;
669 struct net_device *dev = rt->dst.dev;
671 if (dev && !netif_carrier_ok(dev) &&
672 idev->cnf.ignore_routes_with_linkdown)
673 goto out;
675 if (rt6_check_expired(rt))
676 goto out;
678 m = rt6_score_route(rt, oif, strict);
679 if (m == RT6_NUD_FAIL_DO_RR) {
680 match_do_rr = true;
681 m = 0; /* lowest valid score */
682 } else if (m == RT6_NUD_FAIL_HARD) {
683 goto out;
686 if (strict & RT6_LOOKUP_F_REACHABLE)
687 rt6_probe(rt);
689 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
690 if (m > *mpri) {
691 *do_rr = match_do_rr;
692 *mpri = m;
693 match = rt;
695 out:
696 return match;
699 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
700 struct rt6_info *rr_head,
701 u32 metric, int oif, int strict,
702 bool *do_rr)
704 struct rt6_info *rt, *match, *cont;
705 int mpri = -1;
707 match = NULL;
708 cont = NULL;
709 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
710 if (rt->rt6i_metric != metric) {
711 cont = rt;
712 break;
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
718 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
719 if (rt->rt6i_metric != metric) {
720 cont = rt;
721 break;
724 match = find_match(rt, oif, strict, &mpri, match, do_rr);
727 if (match || !cont)
728 return match;
730 for (rt = cont; rt; rt = rt->dst.rt6_next)
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
733 return match;
736 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
738 struct rt6_info *match, *rt0;
739 struct net *net;
740 bool do_rr = false;
742 rt0 = fn->rr_ptr;
743 if (!rt0)
744 fn->rr_ptr = rt0 = fn->leaf;
746 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
747 &do_rr);
749 if (do_rr) {
750 struct rt6_info *next = rt0->dst.rt6_next;
752 /* no entries matched; do round-robin */
753 if (!next || next->rt6i_metric != rt0->rt6i_metric)
754 next = fn->leaf;
756 if (next != rt0)
757 fn->rr_ptr = next;
760 net = dev_net(rt0->dst.dev);
761 return match ? match : net->ipv6.ip6_null_entry;
764 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
766 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router
 * Advertisement (RFC 4191): validate it, then add, update or delete
 * the corresponding RTF_ROUTEINFO route.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	if (rt && !lifetime) {
		/* Zero lifetime withdraws the route. */
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
845 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
846 struct in6_addr *saddr)
848 struct fib6_node *pn;
849 while (1) {
850 if (fn->fn_flags & RTN_TL_ROOT)
851 return NULL;
852 pn = fn->parent;
853 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
854 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
855 else
856 fn = pn;
857 if (fn->fn_flags & RTN_RTINFO)
858 return fn;
862 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
863 struct fib6_table *table,
864 struct flowi6 *fl6, int flags)
866 struct fib6_node *fn;
867 struct rt6_info *rt;
869 read_lock_bh(&table->tb6_lock);
870 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
871 restart:
872 rt = fn->leaf;
873 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
874 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
875 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
876 if (rt == net->ipv6.ip6_null_entry) {
877 fn = fib6_backtrack(fn, &fl6->saddr);
878 if (fn)
879 goto restart;
881 dst_use(&rt->dst, jiffies);
882 read_unlock_bh(&table->tb6_lock);
883 return rt;
887 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
888 int flags)
890 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
892 EXPORT_SYMBOL_GPL(ip6_route_lookup);
894 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
895 const struct in6_addr *saddr, int oif, int strict)
897 struct flowi6 fl6 = {
898 .flowi6_oif = oif,
899 .daddr = *daddr,
901 struct dst_entry *dst;
902 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
904 if (saddr) {
905 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
906 flags |= RT6_LOOKUP_F_HAS_SADDR;
909 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
910 if (dst->error == 0)
911 return (struct rt6_info *) dst;
913 dst_release(dst);
915 return NULL;
917 EXPORT_SYMBOL(rt6_lookup);
919 /* ip6_ins_rt is called with FREE table->tb6_lock.
920 It takes new route entry, the addition fails by any reason the
921 route is freed. In any case, if caller does not hold it, it may
922 be destroyed.
925 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
926 struct mx6_config *mxc)
928 int err;
929 struct fib6_table *table;
931 table = rt->rt6i_table;
932 write_lock_bh(&table->tb6_lock);
933 err = fib6_add(&table->tb6_root, rt, info, mxc);
934 write_unlock_bh(&table->tb6_lock);
936 return err;
939 int ip6_ins_rt(struct rt6_info *rt)
941 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
942 struct mx6_config mxc = { .mx = NULL, };
944 return __ip6_ins_rt(rt, &info, &mxc);
947 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
948 const struct in6_addr *daddr,
949 const struct in6_addr *saddr)
951 struct rt6_info *rt;
954 * Clone the route.
957 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
958 ort = (struct rt6_info *)ort->dst.from;
960 rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
961 0, ort->rt6i_table);
963 if (!rt)
964 return NULL;
966 ip6_rt_copy_init(rt, ort);
967 rt->rt6i_flags |= RTF_CACHE;
968 rt->rt6i_metric = 0;
969 rt->dst.flags |= DST_HOST;
970 rt->rt6i_dst.addr = *daddr;
971 rt->rt6i_dst.plen = 128;
973 if (!rt6_is_gw_or_nonexthop(ort)) {
974 if (ort->rt6i_dst.plen != 128 &&
975 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
976 rt->rt6i_flags |= RTF_ANYCAST;
977 #ifdef CONFIG_IPV6_SUBTREES
978 if (rt->rt6i_src.plen && saddr) {
979 rt->rt6i_src.addr = *saddr;
980 rt->rt6i_src.plen = 128;
982 #endif
985 return rt;
988 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
990 struct rt6_info *pcpu_rt;
992 pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
993 rt->dst.dev, rt->dst.flags,
994 rt->rt6i_table);
996 if (!pcpu_rt)
997 return NULL;
998 ip6_rt_copy_init(pcpu_rt, rt);
999 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1000 pcpu_rt->rt6i_flags |= RTF_PCPU;
1001 return pcpu_rt;
1004 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1005 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1007 struct rt6_info *pcpu_rt, *prev, **p;
1009 p = this_cpu_ptr(rt->rt6i_pcpu);
1010 pcpu_rt = *p;
1012 if (pcpu_rt)
1013 goto done;
1015 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016 if (!pcpu_rt) {
1017 struct net *net = dev_net(rt->dst.dev);
1019 pcpu_rt = net->ipv6.ip6_null_entry;
1020 goto done;
1023 prev = cmpxchg(p, NULL, pcpu_rt);
1024 if (prev) {
1025 /* If someone did it before us, return prev instead */
1026 dst_destroy(&pcpu_rt->dst);
1027 pcpu_rt = prev;
1030 done:
1031 dst_hold(&pcpu_rt->dst);
1032 rt6_dst_from_metrics_check(pcpu_rt);
1033 return pcpu_rt;
1036 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1037 struct flowi6 *fl6, int flags)
1039 struct fib6_node *fn, *saved_fn;
1040 struct rt6_info *rt;
1041 int strict = 0;
1043 strict |= flags & RT6_LOOKUP_F_IFACE;
1044 if (net->ipv6.devconf_all->forwarding == 0)
1045 strict |= RT6_LOOKUP_F_REACHABLE;
1047 read_lock_bh(&table->tb6_lock);
1049 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1050 saved_fn = fn;
1052 redo_rt6_select:
1053 rt = rt6_select(fn, oif, strict);
1054 if (rt->rt6i_nsiblings)
1055 rt = rt6_multipath_select(rt, fl6, oif, strict);
1056 if (rt == net->ipv6.ip6_null_entry) {
1057 fn = fib6_backtrack(fn, &fl6->saddr);
1058 if (fn)
1059 goto redo_rt6_select;
1060 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1061 /* also consider unreachable route */
1062 strict &= ~RT6_LOOKUP_F_REACHABLE;
1063 fn = saved_fn;
1064 goto redo_rt6_select;
1069 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1070 dst_use(&rt->dst, jiffies);
1071 read_unlock_bh(&table->tb6_lock);
1073 rt6_dst_from_metrics_check(rt);
1074 return rt;
1075 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1076 !(rt->rt6i_flags & RTF_GATEWAY))) {
1077 /* Create a RTF_CACHE clone which will not be
1078 * owned by the fib6 tree. It is for the special case where
1079 * the daddr in the skb during the neighbor look-up is different
1080 * from the fl6->daddr used to look-up route here.
1083 struct rt6_info *uncached_rt;
1085 dst_use(&rt->dst, jiffies);
1086 read_unlock_bh(&table->tb6_lock);
1088 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1089 dst_release(&rt->dst);
1091 if (uncached_rt)
1092 rt6_uncached_list_add(uncached_rt);
1093 else
1094 uncached_rt = net->ipv6.ip6_null_entry;
1096 dst_hold(&uncached_rt->dst);
1097 return uncached_rt;
1099 } else {
1100 /* Get a percpu copy */
1102 struct rt6_info *pcpu_rt;
1104 rt->dst.lastuse = jiffies;
1105 rt->dst.__use++;
1106 pcpu_rt = rt6_get_pcpu_route(rt);
1107 read_unlock_bh(&table->tb6_lock);
1109 return pcpu_rt;
1113 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1114 struct flowi6 *fl6, int flags)
1116 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1119 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1120 struct net_device *dev,
1121 struct flowi6 *fl6, int flags)
1123 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1124 flags |= RT6_LOOKUP_F_IFACE;
1126 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1129 void ip6_route_input(struct sk_buff *skb)
1131 const struct ipv6hdr *iph = ipv6_hdr(skb);
1132 struct net *net = dev_net(skb->dev);
1133 int flags = RT6_LOOKUP_F_HAS_SADDR;
1134 struct flowi6 fl6 = {
1135 .flowi6_iif = skb->dev->ifindex,
1136 .daddr = iph->daddr,
1137 .saddr = iph->saddr,
1138 .flowlabel = ip6_flowinfo(iph),
1139 .flowi6_mark = skb->mark,
1140 .flowi6_proto = iph->nexthdr,
1143 skb_dst_drop(skb);
1144 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1147 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1148 struct flowi6 *fl6, int flags)
1150 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1153 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1154 struct flowi6 *fl6)
1156 int flags = 0;
1158 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1160 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1161 flags |= RT6_LOOKUP_F_IFACE;
1163 if (!ipv6_addr_any(&fl6->saddr))
1164 flags |= RT6_LOOKUP_F_HAS_SADDR;
1165 else if (sk)
1166 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1168 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1170 EXPORT_SYMBOL(ip6_route_output);
1172 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1174 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1175 struct dst_entry *new = NULL;
1177 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1178 if (rt) {
1179 new = &rt->dst;
1181 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1183 new->__use = 1;
1184 new->input = dst_discard;
1185 new->output = dst_discard_sk;
1187 if (dst_metrics_read_only(&ort->dst))
1188 new->_metrics = ort->dst._metrics;
1189 else
1190 dst_copy_metrics(new, &ort->dst);
1191 rt->rt6i_idev = ort->rt6i_idev;
1192 if (rt->rt6i_idev)
1193 in6_dev_hold(rt->rt6i_idev);
1195 rt->rt6i_gateway = ort->rt6i_gateway;
1196 rt->rt6i_flags = ort->rt6i_flags;
1197 rt->rt6i_metric = 0;
1199 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1200 #ifdef CONFIG_IPV6_SUBTREES
1201 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1202 #endif
1204 dst_free(new);
1207 dst_release(dst_orig);
1208 return new ? new : ERR_PTR(-ENOMEM);
1212 * Destination cache support functions
1215 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1217 if (rt->dst.from &&
1218 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1219 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1222 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1224 if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1225 return NULL;
1227 if (rt6_check_expired(rt))
1228 return NULL;
1230 return &rt->dst;
1233 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1235 if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1236 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1237 return &rt->dst;
1238 else
1239 return NULL;
1242 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1244 struct rt6_info *rt;
1246 rt = (struct rt6_info *) dst;
1248 /* All IPV6 dsts are created with ->obsolete set to the value
1249 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1250 * into this function always.
1253 rt6_dst_from_metrics_check(rt);
1255 if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1256 return rt6_dst_from_check(rt, cookie);
1257 else
1258 return rt6_check(rt, cookie);
1261 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1263 struct rt6_info *rt = (struct rt6_info *) dst;
1265 if (rt) {
1266 if (rt->rt6i_flags & RTF_CACHE) {
1267 if (rt6_check_expired(rt)) {
1268 ip6_del_rt(rt);
1269 dst = NULL;
1271 } else {
1272 dst_release(dst);
1273 dst = NULL;
1276 return dst;
1279 static void ip6_link_failure(struct sk_buff *skb)
1281 struct rt6_info *rt;
1283 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1285 rt = (struct rt6_info *) skb_dst(skb);
1286 if (rt) {
1287 if (rt->rt6i_flags & RTF_CACHE) {
1288 dst_hold(&rt->dst);
1289 if (ip6_del_rt(rt))
1290 dst_free(&rt->dst);
1291 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1292 rt->rt6i_node->fn_sernum = -1;
1297 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1299 struct net *net = dev_net(rt->dst.dev);
1301 rt->rt6i_flags |= RTF_MODIFIED;
1302 rt->rt6i_pmtu = mtu;
1303 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1306 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1307 const struct ipv6hdr *iph, u32 mtu)
1309 struct rt6_info *rt6 = (struct rt6_info *)dst;
1311 if (rt6->rt6i_flags & RTF_LOCAL)
1312 return;
1314 dst_confirm(dst);
1315 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1316 if (mtu >= dst_mtu(dst))
1317 return;
1319 if (rt6->rt6i_flags & RTF_CACHE) {
1320 rt6_do_update_pmtu(rt6, mtu);
1321 } else {
1322 const struct in6_addr *daddr, *saddr;
1323 struct rt6_info *nrt6;
1325 if (iph) {
1326 daddr = &iph->daddr;
1327 saddr = &iph->saddr;
1328 } else if (sk) {
1329 daddr = &sk->sk_v6_daddr;
1330 saddr = &inet6_sk(sk)->saddr;
1331 } else {
1332 return;
1334 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1335 if (nrt6) {
1336 rt6_do_update_pmtu(nrt6, mtu);
1338 /* ip6_ins_rt(nrt6) will bump the
1339 * rt6->rt6i_node->fn_sernum
1340 * which will fail the next rt6_check() and
1341 * invalidate the sk->sk_dst_cache.
1343 ip6_ins_rt(nrt6);
1348 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1349 struct sk_buff *skb, u32 mtu)
1351 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1354 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1355 int oif, u32 mark)
1357 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1358 struct dst_entry *dst;
1359 struct flowi6 fl6;
1361 memset(&fl6, 0, sizeof(fl6));
1362 fl6.flowi6_oif = oif;
1363 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1364 fl6.daddr = iph->daddr;
1365 fl6.saddr = iph->saddr;
1366 fl6.flowlabel = ip6_flowinfo(iph);
1368 dst = ip6_route_output(net, NULL, &fl6);
1369 if (!dst->error)
1370 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1371 dst_release(dst);
1373 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1375 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1377 ip6_update_pmtu(skb, sock_net(sk), mtu,
1378 sk->sk_bound_dev_if, sk->sk_mark);
1380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1382 /* Handle redirects */
1383 struct ip6rd_flowi {
1384 struct flowi6 fl6;
1385 struct in6_addr gateway;
1388 static struct rt6_info *__ip6_route_redirect(struct net *net,
1389 struct fib6_table *table,
1390 struct flowi6 *fl6,
1391 int flags)
1393 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1394 struct rt6_info *rt;
1395 struct fib6_node *fn;
1397 /* Get the "current" route for this destination and
1398 * check if the redirect has come from approriate router.
1400 * RFC 4861 specifies that redirects should only be
1401 * accepted if they come from the nexthop to the target.
1402 * Due to the way the routes are chosen, this notion
1403 * is a bit fuzzy and one might need to check all possible
1404 * routes.
1407 read_lock_bh(&table->tb6_lock);
1408 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1409 restart:
1410 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1411 if (rt6_check_expired(rt))
1412 continue;
1413 if (rt->dst.error)
1414 break;
1415 if (!(rt->rt6i_flags & RTF_GATEWAY))
1416 continue;
1417 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1418 continue;
1419 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1420 continue;
1421 break;
1424 if (!rt)
1425 rt = net->ipv6.ip6_null_entry;
1426 else if (rt->dst.error) {
1427 rt = net->ipv6.ip6_null_entry;
1428 goto out;
1431 if (rt == net->ipv6.ip6_null_entry) {
1432 fn = fib6_backtrack(fn, &fl6->saddr);
1433 if (fn)
1434 goto restart;
1437 out:
1438 dst_hold(&rt->dst);
1440 read_unlock_bh(&table->tb6_lock);
1442 return rt;
1445 static struct dst_entry *ip6_route_redirect(struct net *net,
1446 const struct flowi6 *fl6,
1447 const struct in6_addr *gateway)
1449 int flags = RT6_LOOKUP_F_HAS_SADDR;
1450 struct ip6rd_flowi rdfl;
1452 rdfl.fl6 = *fl6;
1453 rdfl.gateway = *gateway;
1455 return fib6_rule_lookup(net, &rdfl.fl6,
1456 flags, __ip6_route_redirect);
1459 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1461 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1462 struct dst_entry *dst;
1463 struct flowi6 fl6;
1465 memset(&fl6, 0, sizeof(fl6));
1466 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1467 fl6.flowi6_oif = oif;
1468 fl6.flowi6_mark = mark;
1469 fl6.daddr = iph->daddr;
1470 fl6.saddr = iph->saddr;
1471 fl6.flowlabel = ip6_flowinfo(iph);
1473 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1474 rt6_do_redirect(dst, NULL, skb);
1475 dst_release(dst);
1477 EXPORT_SYMBOL_GPL(ip6_redirect);
1479 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1480 u32 mark)
1482 const struct ipv6hdr *iph = ipv6_hdr(skb);
1483 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1484 struct dst_entry *dst;
1485 struct flowi6 fl6;
1487 memset(&fl6, 0, sizeof(fl6));
1488 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1489 fl6.flowi6_oif = oif;
1490 fl6.flowi6_mark = mark;
1491 fl6.daddr = msg->dest;
1492 fl6.saddr = iph->daddr;
1494 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1495 rt6_do_redirect(dst, NULL, skb);
1496 dst_release(dst);
1499 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1501 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1503 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1505 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1507 struct net_device *dev = dst->dev;
1508 unsigned int mtu = dst_mtu(dst);
1509 struct net *net = dev_net(dev);
1511 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1513 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1514 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1517 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1518 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1519 * IPV6_MAXPLEN is also valid and means: "any MSS,
1520 * rely only on pmtu discovery"
1522 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1523 mtu = IPV6_MAXPLEN;
1524 return mtu;
1527 static unsigned int ip6_mtu(const struct dst_entry *dst)
1529 const struct rt6_info *rt = (const struct rt6_info *)dst;
1530 unsigned int mtu = rt->rt6i_pmtu;
1531 struct inet6_dev *idev;
1533 if (mtu)
1534 goto out;
1536 mtu = dst_metric_raw(dst, RTAX_MTU);
1537 if (mtu)
1538 goto out;
1540 mtu = IPV6_MIN_MTU;
1542 rcu_read_lock();
1543 idev = __in6_dev_get(dst->dev);
1544 if (idev)
1545 mtu = idev->cnf.mtu6;
1546 rcu_read_unlock();
1548 out:
1549 return min_t(unsigned int, mtu, IP6_MAX_MTU);
1552 static struct dst_entry *icmp6_dst_gc_list;
1553 static DEFINE_SPINLOCK(icmp6_dst_lock);
1555 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1556 struct flowi6 *fl6)
1558 struct dst_entry *dst;
1559 struct rt6_info *rt;
1560 struct inet6_dev *idev = in6_dev_get(dev);
1561 struct net *net = dev_net(dev);
1563 if (unlikely(!idev))
1564 return ERR_PTR(-ENODEV);
1566 rt = ip6_dst_alloc(net, dev, 0, NULL);
1567 if (unlikely(!rt)) {
1568 in6_dev_put(idev);
1569 dst = ERR_PTR(-ENOMEM);
1570 goto out;
1573 rt->dst.flags |= DST_HOST;
1574 rt->dst.output = ip6_output;
1575 atomic_set(&rt->dst.__refcnt, 1);
1576 rt->rt6i_gateway = fl6->daddr;
1577 rt->rt6i_dst.addr = fl6->daddr;
1578 rt->rt6i_dst.plen = 128;
1579 rt->rt6i_idev = idev;
1580 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1582 spin_lock_bh(&icmp6_dst_lock);
1583 rt->dst.next = icmp6_dst_gc_list;
1584 icmp6_dst_gc_list = &rt->dst;
1585 spin_unlock_bh(&icmp6_dst_lock);
1587 fib6_force_start_gc(net);
1589 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1591 out:
1592 return dst;
1595 int icmp6_dst_gc(void)
1597 struct dst_entry *dst, **pprev;
1598 int more = 0;
1600 spin_lock_bh(&icmp6_dst_lock);
1601 pprev = &icmp6_dst_gc_list;
1603 while ((dst = *pprev) != NULL) {
1604 if (!atomic_read(&dst->__refcnt)) {
1605 *pprev = dst->next;
1606 dst_free(dst);
1607 } else {
1608 pprev = &dst->next;
1609 ++more;
1613 spin_unlock_bh(&icmp6_dst_lock);
1615 return more;
1618 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1619 void *arg)
1621 struct dst_entry *dst, **pprev;
1623 spin_lock_bh(&icmp6_dst_lock);
1624 pprev = &icmp6_dst_gc_list;
1625 while ((dst = *pprev) != NULL) {
1626 struct rt6_info *rt = (struct rt6_info *) dst;
1627 if (func(rt, arg)) {
1628 *pprev = dst->next;
1629 dst_free(dst);
1630 } else {
1631 pprev = &dst->next;
1634 spin_unlock_bh(&icmp6_dst_lock);
1637 static int ip6_dst_gc(struct dst_ops *ops)
1639 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1640 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1641 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1642 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1643 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1644 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1645 int entries;
1647 entries = dst_entries_get_fast(ops);
1648 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1649 entries <= rt_max_size)
1650 goto out;
1652 net->ipv6.ip6_rt_gc_expire++;
1653 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1654 entries = dst_entries_get_slow(ops);
1655 if (entries < ops->gc_thresh)
1656 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1657 out:
1658 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1659 return entries > rt_max_size;
1662 static int ip6_convert_metrics(struct mx6_config *mxc,
1663 const struct fib6_config *cfg)
1665 struct nlattr *nla;
1666 int remaining;
1667 u32 *mp;
1669 if (!cfg->fc_mx)
1670 return 0;
1672 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1673 if (unlikely(!mp))
1674 return -ENOMEM;
1676 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1677 int type = nla_type(nla);
1679 if (type) {
1680 u32 val;
1682 if (unlikely(type > RTAX_MAX))
1683 goto err;
1684 if (type == RTAX_CC_ALGO) {
1685 char tmp[TCP_CA_NAME_MAX];
1687 nla_strlcpy(tmp, nla, sizeof(tmp));
1688 val = tcp_ca_get_key_by_name(tmp);
1689 if (val == TCP_CA_UNSPEC)
1690 goto err;
1691 } else {
1692 val = nla_get_u32(nla);
1695 mp[type - 1] = val;
1696 __set_bit(type - 1, mxc->mx_valid);
1700 mxc->mx = mp;
1702 return 0;
1703 err:
1704 kfree(mp);
1705 return -EINVAL;
1708 int ip6_route_add(struct fib6_config *cfg)
1710 int err;
1711 struct net *net = cfg->fc_nlinfo.nl_net;
1712 struct rt6_info *rt = NULL;
1713 struct net_device *dev = NULL;
1714 struct inet6_dev *idev = NULL;
1715 struct fib6_table *table;
1716 struct mx6_config mxc = { .mx = NULL, };
1717 int addr_type;
1719 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1720 return -EINVAL;
1721 #ifndef CONFIG_IPV6_SUBTREES
1722 if (cfg->fc_src_len)
1723 return -EINVAL;
1724 #endif
1725 if (cfg->fc_ifindex) {
1726 err = -ENODEV;
1727 dev = dev_get_by_index(net, cfg->fc_ifindex);
1728 if (!dev)
1729 goto out;
1730 idev = in6_dev_get(dev);
1731 if (!idev)
1732 goto out;
1735 if (cfg->fc_metric == 0)
1736 cfg->fc_metric = IP6_RT_PRIO_USER;
1738 err = -ENOBUFS;
1739 if (cfg->fc_nlinfo.nlh &&
1740 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1741 table = fib6_get_table(net, cfg->fc_table);
1742 if (!table) {
1743 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1744 table = fib6_new_table(net, cfg->fc_table);
1746 } else {
1747 table = fib6_new_table(net, cfg->fc_table);
1750 if (!table)
1751 goto out;
1753 rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1755 if (!rt) {
1756 err = -ENOMEM;
1757 goto out;
1760 if (cfg->fc_flags & RTF_EXPIRES)
1761 rt6_set_expires(rt, jiffies +
1762 clock_t_to_jiffies(cfg->fc_expires));
1763 else
1764 rt6_clean_expires(rt);
1766 if (cfg->fc_protocol == RTPROT_UNSPEC)
1767 cfg->fc_protocol = RTPROT_BOOT;
1768 rt->rt6i_protocol = cfg->fc_protocol;
1770 addr_type = ipv6_addr_type(&cfg->fc_dst);
1772 if (addr_type & IPV6_ADDR_MULTICAST)
1773 rt->dst.input = ip6_mc_input;
1774 else if (cfg->fc_flags & RTF_LOCAL)
1775 rt->dst.input = ip6_input;
1776 else
1777 rt->dst.input = ip6_forward;
1779 rt->dst.output = ip6_output;
1781 if (cfg->fc_encap) {
1782 struct lwtunnel_state *lwtstate;
1784 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1785 cfg->fc_encap, &lwtstate);
1786 if (err)
1787 goto out;
1788 rt->dst.lwtstate = lwtstate_get(lwtstate);
1789 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1790 rt->dst.lwtstate->orig_output = rt->dst.output;
1791 rt->dst.output = lwtunnel_output;
1793 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1794 rt->dst.lwtstate->orig_input = rt->dst.input;
1795 rt->dst.input = lwtunnel_input;
1799 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1800 rt->rt6i_dst.plen = cfg->fc_dst_len;
1801 if (rt->rt6i_dst.plen == 128)
1802 rt->dst.flags |= DST_HOST;
1804 #ifdef CONFIG_IPV6_SUBTREES
1805 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1806 rt->rt6i_src.plen = cfg->fc_src_len;
1807 #endif
1809 rt->rt6i_metric = cfg->fc_metric;
1811 /* We cannot add true routes via loopback here,
1812 they would result in kernel looping; promote them to reject routes
1814 if ((cfg->fc_flags & RTF_REJECT) ||
1815 (dev && (dev->flags & IFF_LOOPBACK) &&
1816 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1817 !(cfg->fc_flags & RTF_LOCAL))) {
1818 /* hold loopback dev/idev if we haven't done so. */
1819 if (dev != net->loopback_dev) {
1820 if (dev) {
1821 dev_put(dev);
1822 in6_dev_put(idev);
1824 dev = net->loopback_dev;
1825 dev_hold(dev);
1826 idev = in6_dev_get(dev);
1827 if (!idev) {
1828 err = -ENODEV;
1829 goto out;
1832 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1833 switch (cfg->fc_type) {
1834 case RTN_BLACKHOLE:
1835 rt->dst.error = -EINVAL;
1836 rt->dst.output = dst_discard_sk;
1837 rt->dst.input = dst_discard;
1838 break;
1839 case RTN_PROHIBIT:
1840 rt->dst.error = -EACCES;
1841 rt->dst.output = ip6_pkt_prohibit_out;
1842 rt->dst.input = ip6_pkt_prohibit;
1843 break;
1844 case RTN_THROW:
1845 default:
1846 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1847 : -ENETUNREACH;
1848 rt->dst.output = ip6_pkt_discard_out;
1849 rt->dst.input = ip6_pkt_discard;
1850 break;
1852 goto install_route;
1855 if (cfg->fc_flags & RTF_GATEWAY) {
1856 const struct in6_addr *gw_addr;
1857 int gwa_type;
1859 gw_addr = &cfg->fc_gateway;
1860 gwa_type = ipv6_addr_type(gw_addr);
1862 /* if gw_addr is local we will fail to detect this in case
1863 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1864 * will return already-added prefix route via interface that
1865 * prefix route was assigned to, which might be non-loopback.
1867 err = -EINVAL;
1868 if (ipv6_chk_addr_and_flags(net, gw_addr,
1869 gwa_type & IPV6_ADDR_LINKLOCAL ?
1870 dev : NULL, 0, 0))
1871 goto out;
1873 rt->rt6i_gateway = *gw_addr;
1875 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1876 struct rt6_info *grt;
1878 /* IPv6 strictly inhibits using not link-local
1879 addresses as nexthop address.
1880 Otherwise, router will not able to send redirects.
1881 It is very good, but in some (rare!) circumstances
1882 (SIT, PtP, NBMA NOARP links) it is handy to allow
1883 some exceptions. --ANK
1885 if (!(gwa_type & IPV6_ADDR_UNICAST))
1886 goto out;
1888 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1890 err = -EHOSTUNREACH;
1891 if (!grt)
1892 goto out;
1893 if (dev) {
1894 if (dev != grt->dst.dev) {
1895 ip6_rt_put(grt);
1896 goto out;
1898 } else {
1899 dev = grt->dst.dev;
1900 idev = grt->rt6i_idev;
1901 dev_hold(dev);
1902 in6_dev_hold(grt->rt6i_idev);
1904 if (!(grt->rt6i_flags & RTF_GATEWAY))
1905 err = 0;
1906 ip6_rt_put(grt);
1908 if (err)
1909 goto out;
1911 err = -EINVAL;
1912 if (!dev || (dev->flags & IFF_LOOPBACK))
1913 goto out;
1916 err = -ENODEV;
1917 if (!dev)
1918 goto out;
1920 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1921 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1922 err = -EINVAL;
1923 goto out;
1925 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1926 rt->rt6i_prefsrc.plen = 128;
1927 } else
1928 rt->rt6i_prefsrc.plen = 0;
1930 rt->rt6i_flags = cfg->fc_flags;
1932 install_route:
1933 rt->dst.dev = dev;
1934 rt->rt6i_idev = idev;
1935 rt->rt6i_table = table;
1937 cfg->fc_nlinfo.nl_net = dev_net(dev);
1939 err = ip6_convert_metrics(&mxc, cfg);
1940 if (err)
1941 goto out;
1943 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1945 kfree(mxc.mx);
1946 return err;
1947 out:
1948 if (dev)
1949 dev_put(dev);
1950 if (idev)
1951 in6_dev_put(idev);
1952 if (rt)
1953 dst_free(&rt->dst);
1954 return err;
1957 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1959 int err;
1960 struct fib6_table *table;
1961 struct net *net = dev_net(rt->dst.dev);
1963 if (rt == net->ipv6.ip6_null_entry) {
1964 err = -ENOENT;
1965 goto out;
1968 table = rt->rt6i_table;
1969 write_lock_bh(&table->tb6_lock);
1970 err = fib6_del(rt, info);
1971 write_unlock_bh(&table->tb6_lock);
1973 out:
1974 ip6_rt_put(rt);
1975 return err;
1978 int ip6_del_rt(struct rt6_info *rt)
1980 struct nl_info info = {
1981 .nl_net = dev_net(rt->dst.dev),
1983 return __ip6_del_rt(rt, &info);
1986 static int ip6_route_del(struct fib6_config *cfg)
1988 struct fib6_table *table;
1989 struct fib6_node *fn;
1990 struct rt6_info *rt;
1991 int err = -ESRCH;
1993 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1994 if (!table)
1995 return err;
1997 read_lock_bh(&table->tb6_lock);
1999 fn = fib6_locate(&table->tb6_root,
2000 &cfg->fc_dst, cfg->fc_dst_len,
2001 &cfg->fc_src, cfg->fc_src_len);
2003 if (fn) {
2004 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2005 if ((rt->rt6i_flags & RTF_CACHE) &&
2006 !(cfg->fc_flags & RTF_CACHE))
2007 continue;
2008 if (cfg->fc_ifindex &&
2009 (!rt->dst.dev ||
2010 rt->dst.dev->ifindex != cfg->fc_ifindex))
2011 continue;
2012 if (cfg->fc_flags & RTF_GATEWAY &&
2013 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2014 continue;
2015 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2016 continue;
2017 dst_hold(&rt->dst);
2018 read_unlock_bh(&table->tb6_lock);
2020 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2023 read_unlock_bh(&table->tb6_lock);
2025 return err;
2028 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2030 struct net *net = dev_net(skb->dev);
2031 struct netevent_redirect netevent;
2032 struct rt6_info *rt, *nrt = NULL;
2033 struct ndisc_options ndopts;
2034 struct inet6_dev *in6_dev;
2035 struct neighbour *neigh;
2036 struct rd_msg *msg;
2037 int optlen, on_link;
2038 u8 *lladdr;
2040 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2041 optlen -= sizeof(*msg);
2043 if (optlen < 0) {
2044 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2045 return;
2048 msg = (struct rd_msg *)icmp6_hdr(skb);
2050 if (ipv6_addr_is_multicast(&msg->dest)) {
2051 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2052 return;
2055 on_link = 0;
2056 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2057 on_link = 1;
2058 } else if (ipv6_addr_type(&msg->target) !=
2059 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2060 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2061 return;
2064 in6_dev = __in6_dev_get(skb->dev);
2065 if (!in6_dev)
2066 return;
2067 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2068 return;
2070 /* RFC2461 8.1:
2071 * The IP source address of the Redirect MUST be the same as the current
2072 * first-hop router for the specified ICMP Destination Address.
2075 if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2076 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2077 return;
2080 lladdr = NULL;
2081 if (ndopts.nd_opts_tgt_lladdr) {
2082 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2083 skb->dev);
2084 if (!lladdr) {
2085 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2086 return;
2090 rt = (struct rt6_info *) dst;
2091 if (rt == net->ipv6.ip6_null_entry) {
2092 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2093 return;
2096 /* Redirect received -> path was valid.
2097 * Look, redirects are sent only in response to data packets,
2098 * so that this nexthop apparently is reachable. --ANK
2100 dst_confirm(&rt->dst);
2102 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2103 if (!neigh)
2104 return;
2107 * We have finally decided to accept it.
2110 neigh_update(neigh, lladdr, NUD_STALE,
2111 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2112 NEIGH_UPDATE_F_OVERRIDE|
2113 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2114 NEIGH_UPDATE_F_ISROUTER))
2117 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2118 if (!nrt)
2119 goto out;
2121 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2122 if (on_link)
2123 nrt->rt6i_flags &= ~RTF_GATEWAY;
2125 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2127 if (ip6_ins_rt(nrt))
2128 goto out;
2130 netevent.old = &rt->dst;
2131 netevent.new = &nrt->dst;
2132 netevent.daddr = &msg->dest;
2133 netevent.neigh = neigh;
2134 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2136 if (rt->rt6i_flags & RTF_CACHE) {
2137 rt = (struct rt6_info *) dst_clone(&rt->dst);
2138 ip6_del_rt(rt);
2141 out:
2142 neigh_release(neigh);
2146 * Misc support functions
2149 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2151 BUG_ON(from->dst.from);
2153 rt->rt6i_flags &= ~RTF_EXPIRES;
2154 dst_hold(&from->dst);
2155 rt->dst.from = &from->dst;
2156 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2159 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2161 rt->dst.input = ort->dst.input;
2162 rt->dst.output = ort->dst.output;
2163 rt->rt6i_dst = ort->rt6i_dst;
2164 rt->dst.error = ort->dst.error;
2165 rt->rt6i_idev = ort->rt6i_idev;
2166 if (rt->rt6i_idev)
2167 in6_dev_hold(rt->rt6i_idev);
2168 rt->dst.lastuse = jiffies;
2169 rt->rt6i_gateway = ort->rt6i_gateway;
2170 rt->rt6i_flags = ort->rt6i_flags;
2171 rt6_set_from(rt, ort);
2172 rt->rt6i_metric = ort->rt6i_metric;
2173 #ifdef CONFIG_IPV6_SUBTREES
2174 rt->rt6i_src = ort->rt6i_src;
2175 #endif
2176 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2177 rt->rt6i_table = ort->rt6i_table;
2178 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2181 #ifdef CONFIG_IPV6_ROUTE_INFO
2182 static struct rt6_info *rt6_get_route_info(struct net *net,
2183 const struct in6_addr *prefix, int prefixlen,
2184 const struct in6_addr *gwaddr, int ifindex)
2186 struct fib6_node *fn;
2187 struct rt6_info *rt = NULL;
2188 struct fib6_table *table;
2190 table = fib6_get_table(net, RT6_TABLE_INFO);
2191 if (!table)
2192 return NULL;
2194 read_lock_bh(&table->tb6_lock);
2195 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2196 if (!fn)
2197 goto out;
2199 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2200 if (rt->dst.dev->ifindex != ifindex)
2201 continue;
2202 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2203 continue;
2204 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2205 continue;
2206 dst_hold(&rt->dst);
2207 break;
2209 out:
2210 read_unlock_bh(&table->tb6_lock);
2211 return rt;
2214 static struct rt6_info *rt6_add_route_info(struct net *net,
2215 const struct in6_addr *prefix, int prefixlen,
2216 const struct in6_addr *gwaddr, int ifindex,
2217 unsigned int pref)
2219 struct fib6_config cfg = {
2220 .fc_table = RT6_TABLE_INFO,
2221 .fc_metric = IP6_RT_PRIO_USER,
2222 .fc_ifindex = ifindex,
2223 .fc_dst_len = prefixlen,
2224 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2225 RTF_UP | RTF_PREF(pref),
2226 .fc_nlinfo.portid = 0,
2227 .fc_nlinfo.nlh = NULL,
2228 .fc_nlinfo.nl_net = net,
2231 cfg.fc_dst = *prefix;
2232 cfg.fc_gateway = *gwaddr;
2234 /* We should treat it as a default route if prefix length is 0. */
2235 if (!prefixlen)
2236 cfg.fc_flags |= RTF_DEFAULT;
2238 ip6_route_add(&cfg);
2240 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2242 #endif
2244 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2246 struct rt6_info *rt;
2247 struct fib6_table *table;
2249 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2250 if (!table)
2251 return NULL;
2253 read_lock_bh(&table->tb6_lock);
2254 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2255 if (dev == rt->dst.dev &&
2256 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2257 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2258 break;
2260 if (rt)
2261 dst_hold(&rt->dst);
2262 read_unlock_bh(&table->tb6_lock);
2263 return rt;
2266 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2267 struct net_device *dev,
2268 unsigned int pref)
2270 struct fib6_config cfg = {
2271 .fc_table = RT6_TABLE_DFLT,
2272 .fc_metric = IP6_RT_PRIO_USER,
2273 .fc_ifindex = dev->ifindex,
2274 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2275 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2276 .fc_nlinfo.portid = 0,
2277 .fc_nlinfo.nlh = NULL,
2278 .fc_nlinfo.nl_net = dev_net(dev),
2281 cfg.fc_gateway = *gwaddr;
2283 ip6_route_add(&cfg);
2285 return rt6_get_dflt_router(gwaddr, dev);
2288 void rt6_purge_dflt_routers(struct net *net)
2290 struct rt6_info *rt;
2291 struct fib6_table *table;
2293 /* NOTE: Keep consistent with rt6_get_dflt_router */
2294 table = fib6_get_table(net, RT6_TABLE_DFLT);
2295 if (!table)
2296 return;
2298 restart:
2299 read_lock_bh(&table->tb6_lock);
2300 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2301 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2302 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2303 dst_hold(&rt->dst);
2304 read_unlock_bh(&table->tb6_lock);
2305 ip6_del_rt(rt);
2306 goto restart;
2309 read_unlock_bh(&table->tb6_lock);
2312 static void rtmsg_to_fib6_config(struct net *net,
2313 struct in6_rtmsg *rtmsg,
2314 struct fib6_config *cfg)
2316 memset(cfg, 0, sizeof(*cfg));
2318 cfg->fc_table = RT6_TABLE_MAIN;
2319 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2320 cfg->fc_metric = rtmsg->rtmsg_metric;
2321 cfg->fc_expires = rtmsg->rtmsg_info;
2322 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2323 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2324 cfg->fc_flags = rtmsg->rtmsg_flags;
2326 cfg->fc_nlinfo.nl_net = net;
2328 cfg->fc_dst = rtmsg->rtmsg_dst;
2329 cfg->fc_src = rtmsg->rtmsg_src;
2330 cfg->fc_gateway = rtmsg->rtmsg_gateway;
2333 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2335 struct fib6_config cfg;
2336 struct in6_rtmsg rtmsg;
2337 int err;
2339 switch (cmd) {
2340 case SIOCADDRT: /* Add a route */
2341 case SIOCDELRT: /* Delete a route */
2342 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2343 return -EPERM;
2344 err = copy_from_user(&rtmsg, arg,
2345 sizeof(struct in6_rtmsg));
2346 if (err)
2347 return -EFAULT;
2349 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2351 rtnl_lock();
2352 switch (cmd) {
2353 case SIOCADDRT:
2354 err = ip6_route_add(&cfg);
2355 break;
2356 case SIOCDELRT:
2357 err = ip6_route_del(&cfg);
2358 break;
2359 default:
2360 err = -EINVAL;
2362 rtnl_unlock();
2364 return err;
2367 return -EINVAL;
2371 * Drop the packet on the floor
2374 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2376 int type;
2377 struct dst_entry *dst = skb_dst(skb);
2378 switch (ipstats_mib_noroutes) {
2379 case IPSTATS_MIB_INNOROUTES:
2380 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2381 if (type == IPV6_ADDR_ANY) {
2382 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2383 IPSTATS_MIB_INADDRERRORS);
2384 break;
2386 /* FALLTHROUGH */
2387 case IPSTATS_MIB_OUTNOROUTES:
2388 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2389 ipstats_mib_noroutes);
2390 break;
2392 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2393 kfree_skb(skb);
2394 return 0;
2397 static int ip6_pkt_discard(struct sk_buff *skb)
2399 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2402 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2404 skb->dev = skb_dst(skb)->dev;
2405 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2408 static int ip6_pkt_prohibit(struct sk_buff *skb)
2410 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2413 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2415 skb->dev = skb_dst(skb)->dev;
2416 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2420 * Allocate a dst for local (unicast / anycast) address.
2423 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2424 const struct in6_addr *addr,
2425 bool anycast)
2427 struct net *net = dev_net(idev->dev);
2428 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2429 DST_NOCOUNT, NULL);
2430 if (!rt)
2431 return ERR_PTR(-ENOMEM);
2433 in6_dev_hold(idev);
2435 rt->dst.flags |= DST_HOST;
2436 rt->dst.input = ip6_input;
2437 rt->dst.output = ip6_output;
2438 rt->rt6i_idev = idev;
2440 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2441 if (anycast)
2442 rt->rt6i_flags |= RTF_ANYCAST;
2443 else
2444 rt->rt6i_flags |= RTF_LOCAL;
2446 rt->rt6i_gateway = *addr;
2447 rt->rt6i_dst.addr = *addr;
2448 rt->rt6i_dst.plen = 128;
2449 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2451 atomic_set(&rt->dst.__refcnt, 1);
2453 return rt;
2456 int ip6_route_get_saddr(struct net *net,
2457 struct rt6_info *rt,
2458 const struct in6_addr *daddr,
2459 unsigned int prefs,
2460 struct in6_addr *saddr)
2462 struct inet6_dev *idev =
2463 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2464 int err = 0;
2465 if (rt && rt->rt6i_prefsrc.plen)
2466 *saddr = rt->rt6i_prefsrc.addr;
2467 else
2468 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2469 daddr, prefs, saddr);
2470 return err;
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};
2480 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2482 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2483 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2484 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2486 if (((void *)rt->dst.dev == dev || !dev) &&
2487 rt != net->ipv6.ip6_null_entry &&
2488 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2489 /* remove prefsrc entry */
2490 rt->rt6i_prefsrc.plen = 0;
2492 return 0;
2495 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2497 struct net *net = dev_net(ifp->idev->dev);
2498 struct arg_dev_net_ip adni = {
2499 .dev = ifp->idev->dev,
2500 .net = net,
2501 .addr = &ifp->addr,
2503 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2506 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2507 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2509 /* Remove routers and update dst entries when gateway turn into host. */
2510 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2512 struct in6_addr *gateway = (struct in6_addr *)arg;
2514 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2515 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2516 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2517 return -1;
2519 return 0;
2522 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2524 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Per-walk argument for fib6_ifdown(): match routes on @dev in @net. */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
2532 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2534 const struct arg_dev_net *adn = arg;
2535 const struct net_device *dev = adn->dev;
2537 if ((rt->dst.dev == dev || !dev) &&
2538 rt != adn->net->ipv6.ip6_null_entry)
2539 return -1;
2541 return 0;
2544 void rt6_ifdown(struct net *net, struct net_device *dev)
2546 struct arg_dev_net adn = {
2547 .dev = dev,
2548 .net = net,
2551 fib6_clean_all(net, fib6_ifdown, &adn);
2552 icmp6_clean_all(fib6_ifdown, &adn);
2553 rt6_uncached_list_flush_dev(net, dev);
/*
 * Argument bundle for rt6_mtu_change_route(): the device whose MTU
 * changed and its new MTU value.
 */
2556 struct rt6_mtu_change_arg {
2557 struct net_device *dev;
2558 unsigned int mtu;
/*
 * fib6_clean_all() callback applying a device MTU change to one route.
 * Always returns 0: routes are updated in place, never deleted.
 */
2561 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2563 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2564 struct inet6_dev *idev;
2566 /* In IPv6 pmtu discovery is not optional,
2567 so that RTAX_MTU lock cannot disable it.
2568 We still use this lock to block changes
2569 caused by addrconf/ndisc.
/* Device has no IPv6 state: nothing to update. */
2572 idev = __in6_dev_get(arg->dev);
2573 if (!idev)
2574 return 0;
2576 /* For administrative MTU increase, there is no way to discover
2577 IPv6 PMTU increase, so PMTU increase should be updated here.
2578 Since RFC 1981 doesn't include administrative MTU increase
2579 update PMTU increase is a MUST. (i.e. jumbo frame)
2582 If new MTU is less than route PMTU, this new MTU will be the
2583 lowest MTU in the path, update the route PMTU to reflect PMTU
2584 decreases; if new MTU is greater than route PMTU, and the
2585 old MTU is the lowest MTU in the path, update the route PMTU
2586 to reflect the increase. In this case if the other nodes' MTU
2587 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2588 PMTU discovery.
/* Only touch routes on this device whose MTU metric isn't locked. */
2590 if (rt->dst.dev == arg->dev &&
2591 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2592 if (rt->rt6i_flags & RTF_CACHE) {
2593 /* For RTF_CACHE with rt6i_pmtu == 0
2594 * (i.e. a redirected route),
2595 * the metrics of its rt->dst.from has already
2596 * been updated.
/* Cached route: only shrink a recorded PMTU, never grow it here. */
2598 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2599 rt->rt6i_pmtu = arg->mtu;
2600 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2601 (dst_mtu(&rt->dst) < arg->mtu &&
2602 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2603 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2606 return 0;
2609 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2611 struct rt6_mtu_change_arg arg = {
2612 .dev = dev,
2613 .mtu = mtu,
2616 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/*
 * Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE.  Attributes not
 * listed here (RTA_DST, RTA_SRC, RTA_PREFSRC, RTA_TABLE, RTA_MARK) are
 * length-checked manually where they are consumed.
 */
2619 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2620 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2621 [RTA_OIF] = { .type = NLA_U32 },
2622 [RTA_IIF] = { .type = NLA_U32 },
2623 [RTA_PRIORITY] = { .type = NLA_U32 },
2624 [RTA_METRICS] = { .type = NLA_NESTED },
2625 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2626 [RTA_PREF] = { .type = NLA_U8 },
2627 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2628 [RTA_ENCAP] = { .type = NLA_NESTED },
/*
 * Translate an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * struct fib6_config.  Returns 0 on success or a negative errno
 * (-EINVAL for malformed/short attributes).
 *
 * Note: fc_mx, fc_mp and fc_encap end up pointing INTO the netlink
 * message; @cfg must not outlive @nlh.
 */
2631 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2632 struct fib6_config *cfg)
2634 struct rtmsg *rtm;
2635 struct nlattr *tb[RTA_MAX+1];
2636 unsigned int pref;
2637 int err;
2639 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2640 if (err < 0)
2641 goto errout;
2643 err = -EINVAL;
2644 rtm = nlmsg_data(nlh);
2645 memset(cfg, 0, sizeof(*cfg));
2647 cfg->fc_table = rtm->rtm_table;
2648 cfg->fc_dst_len = rtm->rtm_dst_len;
2649 cfg->fc_src_len = rtm->rtm_src_len;
2650 cfg->fc_flags = RTF_UP;
2651 cfg->fc_protocol = rtm->rtm_protocol;
2652 cfg->fc_type = rtm->rtm_type;
/* Special route types all map onto RTF_REJECT; the specific type is
 * preserved in fc_type for error-code selection at insertion time. */
2654 if (rtm->rtm_type == RTN_UNREACHABLE ||
2655 rtm->rtm_type == RTN_BLACKHOLE ||
2656 rtm->rtm_type == RTN_PROHIBIT ||
2657 rtm->rtm_type == RTN_THROW)
2658 cfg->fc_flags |= RTF_REJECT;
2660 if (rtm->rtm_type == RTN_LOCAL)
2661 cfg->fc_flags |= RTF_LOCAL;
2663 if (rtm->rtm_flags & RTM_F_CLONED)
2664 cfg->fc_flags |= RTF_CACHE;
2666 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2667 cfg->fc_nlinfo.nlh = nlh;
2668 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2670 if (tb[RTA_GATEWAY]) {
2671 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2672 cfg->fc_flags |= RTF_GATEWAY;
/* DST/SRC carry only prefix-length bytes; validate against rtm_*_len. */
2675 if (tb[RTA_DST]) {
2676 int plen = (rtm->rtm_dst_len + 7) >> 3;
2678 if (nla_len(tb[RTA_DST]) < plen)
2679 goto errout;
2681 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2684 if (tb[RTA_SRC]) {
2685 int plen = (rtm->rtm_src_len + 7) >> 3;
2687 if (nla_len(tb[RTA_SRC]) < plen)
2688 goto errout;
2690 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2693 if (tb[RTA_PREFSRC])
2694 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2696 if (tb[RTA_OIF])
2697 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2699 if (tb[RTA_PRIORITY])
2700 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2702 if (tb[RTA_METRICS]) {
2703 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2704 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* RTA_TABLE overrides the 8-bit rtm_table field when present. */
2707 if (tb[RTA_TABLE])
2708 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2710 if (tb[RTA_MULTIPATH]) {
2711 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2712 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/* Unknown preference values fall back to MEDIUM (RFC 4191 behaviour). */
2715 if (tb[RTA_PREF]) {
2716 pref = nla_get_u8(tb[RTA_PREF]);
2717 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2718 pref != ICMPV6_ROUTER_PREF_HIGH)
2719 pref = ICMPV6_ROUTER_PREF_MEDIUM;
2720 cfg->fc_flags |= RTF_PREF(pref);
2723 if (tb[RTA_ENCAP])
2724 cfg->fc_encap = tb[RTA_ENCAP];
2726 if (tb[RTA_ENCAP_TYPE])
2727 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2729 err = 0;
2730 errout:
2731 return err;
/*
 * Add (@add != 0) or delete each nexthop of an RTA_MULTIPATH route as an
 * individual single-nexthop route.  Returns 0 or the last error seen.
 *
 * Rollback: if an add fails midway, the loop restarts from the first
 * nexthop in delete mode, limited to the entries already added
 * (remaining is inverted to count only the processed prefix).
 * Deletions, by contrast, continue past failures so every nexthop gets
 * a removal attempt.
 */
2734 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2736 struct fib6_config r_cfg;
2737 struct rtnexthop *rtnh;
2738 int remaining;
2739 int attrlen;
2740 int err = 0, last_err = 0;
2742 remaining = cfg->fc_mp_len;
2743 beginning:
2744 rtnh = (struct rtnexthop *)cfg->fc_mp;
2746 /* Parse a Multipath Entry */
2747 while (rtnh_ok(rtnh, remaining)) {
/* Each nexthop starts from a fresh copy of the base config. */
2748 memcpy(&r_cfg, cfg, sizeof(*cfg));
2749 if (rtnh->rtnh_ifindex)
2750 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2752 attrlen = rtnh_attrlen(rtnh);
2753 if (attrlen > 0) {
2754 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2756 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2757 if (nla) {
2758 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2759 r_cfg.fc_flags |= RTF_GATEWAY;
2761 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2762 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2763 if (nla)
2764 r_cfg.fc_encap_type = nla_get_u16(nla);
2766 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2767 if (err) {
2768 last_err = err;
2769 /* If we are trying to remove a route, do not stop the
2770 * loop when ip6_route_del() fails (because next hop is
2771 * already gone), we should try to remove all next hops.
2773 if (add) {
2774 /* If add fails, we should try to delete all
2775 * next hops that have been already added.
2777 add = 0;
2778 remaining = cfg->fc_mp_len - remaining;
2779 goto beginning;
2782 /* Because each route is added like a single route we remove
2783 * these flags after the first nexthop: if there is a collision,
2784 * we have already failed to add the first nexthop:
2785 * fib6_add_rt2node() has rejected it; when replacing, old
2786 * nexthops have been replaced by first new, the rest should
2787 * be added to it.
/* NOTE(review): assumes cfg->fc_nlinfo.nlh is non-NULL here; true for
 * the netlink entry points, but verify for any other caller. */
2789 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2790 NLM_F_REPLACE);
2791 rtnh = rtnh_next(rtnh, &remaining);
2794 return last_err;
2797 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2799 struct fib6_config cfg;
2800 int err;
2802 err = rtm_to_fib6_config(skb, nlh, &cfg);
2803 if (err < 0)
2804 return err;
2806 if (cfg.fc_mp)
2807 return ip6_route_multipath(&cfg, 0);
2808 else
2809 return ip6_route_del(&cfg);
2812 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2814 struct fib6_config cfg;
2815 int err;
2817 err = rtm_to_fib6_config(skb, nlh, &cfg);
2818 if (err < 0)
2819 return err;
2821 if (cfg.fc_mp)
2822 return ip6_route_multipath(&cfg, 1);
2823 else
2824 return ip6_route_add(&cfg);
/*
 * Worst-case payload size of one RTM_NEWROUTE message for @rt, used to
 * size the notification skb in inet6_rt_notify().  Must stay in sync
 * with the attributes emitted by rt6_fill_node(); undersizing here
 * triggers the -EMSGSIZE WARN_ON in inet6_rt_notify().
 */
2827 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2829 return NLMSG_ALIGN(sizeof(struct rtmsg))
2830 + nla_total_size(16) /* RTA_SRC */
2831 + nla_total_size(16) /* RTA_DST */
2832 + nla_total_size(16) /* RTA_GATEWAY */
2833 + nla_total_size(16) /* RTA_PREFSRC */
2834 + nla_total_size(4) /* RTA_TABLE */
2835 + nla_total_size(4) /* RTA_IIF */
2836 + nla_total_size(4) /* RTA_OIF */
2837 + nla_total_size(4) /* RTA_PRIORITY */
2838 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2839 + nla_total_size(sizeof(struct rta_cacheinfo))
2840 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2841 + nla_total_size(1) /* RTA_PREF */
2842 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/*
 * Serialize one IPv6 route into an rtnetlink message on @skb.
 *
 * @dst/@src: when non-NULL (RTM_GETROUTE replies), report the queried
 *            addresses as /128 instead of the route's own prefix.
 * @iif:      input interface index for input-path queries (0 otherwise).
 * @prefix:   when set, emit only RTF_PREFIX_RT routes; others "succeed"
 *            silently (return 1).
 * @nowait:   forwarded to ip6mr_get_route() for multicast resolution.
 *
 * Returns 0 on success, 1 when skipped by @prefix, -EMSGSIZE when @skb
 * has no room (the partial message is cancelled).
 */
2845 static int rt6_fill_node(struct net *net,
2846 struct sk_buff *skb, struct rt6_info *rt,
2847 struct in6_addr *dst, struct in6_addr *src,
2848 int iif, int type, u32 portid, u32 seq,
2849 int prefix, int nowait, unsigned int flags)
2851 u32 metrics[RTAX_MAX];
2852 struct rtmsg *rtm;
2853 struct nlmsghdr *nlh;
2854 long expires;
2855 u32 table;
2857 if (prefix) { /* user wants prefix routes only */
2858 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2859 /* success since this is not a prefix route */
2860 return 1;
2864 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2865 if (!nlh)
2866 return -EMSGSIZE;
2868 rtm = nlmsg_data(nlh);
2869 rtm->rtm_family = AF_INET6;
2870 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2871 rtm->rtm_src_len = rt->rt6i_src.plen;
2872 rtm->rtm_tos = 0;
2873 if (rt->rt6i_table)
2874 table = rt->rt6i_table->tb6_id;
2875 else
2876 table = RT6_TABLE_UNSPEC;
2877 rtm->rtm_table = table;
2878 if (nla_put_u32(skb, RTA_TABLE, table))
2879 goto nla_put_failure;
/* Map the dst error of reject routes back to the route type the user
 * configured (inverse of the mapping in rtm_to_fib6_config()). */
2880 if (rt->rt6i_flags & RTF_REJECT) {
2881 switch (rt->dst.error) {
2882 case -EINVAL:
2883 rtm->rtm_type = RTN_BLACKHOLE;
2884 break;
2885 case -EACCES:
2886 rtm->rtm_type = RTN_PROHIBIT;
2887 break;
2888 case -EAGAIN:
2889 rtm->rtm_type = RTN_THROW;
2890 break;
2891 default:
2892 rtm->rtm_type = RTN_UNREACHABLE;
2893 break;
2896 else if (rt->rt6i_flags & RTF_LOCAL)
2897 rtm->rtm_type = RTN_LOCAL;
2898 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2899 rtm->rtm_type = RTN_LOCAL;
2900 else
2901 rtm->rtm_type = RTN_UNICAST;
2902 rtm->rtm_flags = 0;
/* NOTE(review): rt->dst.dev is not NULL-checked here, unlike the
 * RTA_OIF emission below — confirm all callers guarantee a device. */
2903 if (!netif_carrier_ok(rt->dst.dev)) {
2904 rtm->rtm_flags |= RTNH_F_LINKDOWN;
2905 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
2906 rtm->rtm_flags |= RTNH_F_DEAD;
2908 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2909 rtm->rtm_protocol = rt->rt6i_protocol;
2910 if (rt->rt6i_flags & RTF_DYNAMIC)
2911 rtm->rtm_protocol = RTPROT_REDIRECT;
2912 else if (rt->rt6i_flags & RTF_ADDRCONF) {
2913 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2914 rtm->rtm_protocol = RTPROT_RA;
2915 else
2916 rtm->rtm_protocol = RTPROT_KERNEL;
2919 if (rt->rt6i_flags & RTF_CACHE)
2920 rtm->rtm_flags |= RTM_F_CLONED;
/* Queried destination (if any) is reported as a host route. */
2922 if (dst) {
2923 if (nla_put_in6_addr(skb, RTA_DST, dst))
2924 goto nla_put_failure;
2925 rtm->rtm_dst_len = 128;
2926 } else if (rtm->rtm_dst_len)
2927 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2928 goto nla_put_failure;
2929 #ifdef CONFIG_IPV6_SUBTREES
2930 if (src) {
2931 if (nla_put_in6_addr(skb, RTA_SRC, src))
2932 goto nla_put_failure;
2933 rtm->rtm_src_len = 128;
2934 } else if (rtm->rtm_src_len &&
2935 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2936 goto nla_put_failure;
2937 #endif
2938 if (iif) {
2939 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through ip6mr instead of RTA_IIF. */
2940 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2941 int err = ip6mr_get_route(net, skb, rtm, nowait);
2942 if (err <= 0) {
2943 if (!nowait) {
2944 if (err == 0)
2945 return 0;
2946 goto nla_put_failure;
2947 } else {
2948 if (err == -EMSGSIZE)
2949 goto nla_put_failure;
2952 } else
2953 #endif
2954 if (nla_put_u32(skb, RTA_IIF, iif))
2955 goto nla_put_failure;
2956 } else if (dst) {
/* Output-path query: report the source address that would be chosen. */
2957 struct in6_addr saddr_buf;
2958 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2959 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2960 goto nla_put_failure;
2963 if (rt->rt6i_prefsrc.plen) {
2964 struct in6_addr saddr_buf;
2965 saddr_buf = rt->rt6i_prefsrc.addr;
2966 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2967 goto nla_put_failure;
/* A per-route PMTU overrides the generic MTU metric in the dump. */
2970 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2971 if (rt->rt6i_pmtu)
2972 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2973 if (rtnetlink_put_metrics(skb, metrics) < 0)
2974 goto nla_put_failure;
2976 if (rt->rt6i_flags & RTF_GATEWAY) {
2977 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2978 goto nla_put_failure;
2981 if (rt->dst.dev &&
2982 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2983 goto nla_put_failure;
2984 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2985 goto nla_put_failure;
2987 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2989 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2990 goto nla_put_failure;
2992 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2993 goto nla_put_failure;
2995 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
2997 nlmsg_end(skb, nlh);
2998 return 0;
3000 nla_put_failure:
3001 nlmsg_cancel(skb, nlh);
3002 return -EMSGSIZE;
3005 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3007 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3008 int prefix;
3010 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3011 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3012 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3013 } else
3014 prefix = 0;
3016 return rt6_fill_node(arg->net,
3017 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3018 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3019 prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: perform a one-off route lookup for the addresses
 * in the request and unicast the result back to the caller.
 * RTA_IIF selects an input-path lookup; otherwise an output lookup with
 * optional RTA_OIF is done.
 */
3022 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3024 struct net *net = sock_net(in_skb->sk);
3025 struct nlattr *tb[RTA_MAX+1];
3026 struct rt6_info *rt;
3027 struct sk_buff *skb;
3028 struct rtmsg *rtm;
3029 struct flowi6 fl6;
3030 int err, iif = 0, oif = 0;
3032 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3033 if (err < 0)
3034 goto errout;
3036 err = -EINVAL;
3037 memset(&fl6, 0, sizeof(fl6));
3039 if (tb[RTA_SRC]) {
3040 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3041 goto errout;
3043 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3046 if (tb[RTA_DST]) {
3047 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3048 goto errout;
3050 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3053 if (tb[RTA_IIF])
3054 iif = nla_get_u32(tb[RTA_IIF]);
3056 if (tb[RTA_OIF])
3057 oif = nla_get_u32(tb[RTA_OIF]);
3059 if (tb[RTA_MARK])
3060 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3062 if (iif) {
3063 struct net_device *dev;
3064 int flags = 0;
3066 dev = __dev_get_by_index(net, iif);
3067 if (!dev) {
3068 err = -ENODEV;
3069 goto errout;
3072 fl6.flowi6_iif = iif;
3074 if (!ipv6_addr_any(&fl6.saddr))
3075 flags |= RT6_LOOKUP_F_HAS_SADDR;
3077 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3078 flags);
3079 } else {
3080 fl6.flowi6_oif = oif;
/* NOTE(review): lookups never return NULL (failures come back as an
 * error dst such as ip6_null_entry); rt->dst.error is reported to the
 * caller via rtnl_put_cacheinfo() in rt6_fill_node() — confirm. */
3082 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3085 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3086 if (!skb) {
3087 ip6_rt_put(rt);
3088 err = -ENOBUFS;
3089 goto errout;
3092 /* Reserve room for dummy headers, this skb can pass
3093 through good chunk of routing engine.
3095 skb_reset_mac_header(skb);
3096 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* The skb takes over the route reference; kfree_skb() releases it. */
3098 skb_dst_set(skb, &rt->dst);
3100 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3101 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3102 nlh->nlmsg_seq, 0, 0, 0);
3103 if (err < 0) {
3104 kfree_skb(skb);
3105 goto errout;
3108 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3109 errout:
3110 return err;
/*
 * Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group.  Errors are reported to group
 * listeners via rtnl_set_sk_err() rather than returned.
 */
3113 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3115 struct sk_buff *skb;
3116 struct net *net = info->nl_net;
3117 u32 seq;
3118 int err;
3120 err = -ENOBUFS;
3121 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any(): callers may be in softirq context. */
3123 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3124 if (!skb)
3125 goto errout;
3127 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3128 event, info->portid, seq, 0, 0, 0);
3129 if (err < 0) {
3130 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3131 WARN_ON(err == -EMSGSIZE);
3132 kfree_skb(skb);
3133 goto errout;
3135 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3136 info->nlh, gfp_any());
3137 return;
3138 errout:
3139 if (err < 0)
3140 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3143 static int ip6_route_dev_notify(struct notifier_block *this,
3144 unsigned long event, void *ptr)
3146 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3147 struct net *net = dev_net(dev);
3149 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3150 net->ipv6.ip6_null_entry->dst.dev = dev;
3151 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3152 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3153 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3154 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3155 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3156 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3157 #endif
3160 return NOTIFY_OK;
3164 * /proc
3167 #ifdef CONFIG_PROC_FS
/*
 * /proc/net/ipv6_route: seq_file of the routing table.
 * NOTE(review): ipv6_route_open is defined earlier in this file.
 */
3169 static const struct file_operations ipv6_route_proc_fops = {
3170 .owner = THIS_MODULE,
3171 .open = ipv6_route_open,
3172 .read = seq_read,
3173 .llseek = seq_lseek,
3174 .release = seq_release_net,
3177 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3179 struct net *net = (struct net *)seq->private;
3180 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3181 net->ipv6.rt6_stats->fib_nodes,
3182 net->ipv6.rt6_stats->fib_route_nodes,
3183 net->ipv6.rt6_stats->fib_rt_alloc,
3184 net->ipv6.rt6_stats->fib_rt_entries,
3185 net->ipv6.rt6_stats->fib_rt_cache,
3186 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3187 net->ipv6.rt6_stats->fib_discarded_routes);
3189 return 0;
/* open() for /proc/net/rt6_stats: single-shot, netns-aware seq_file. */
3192 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3194 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
3197 static const struct file_operations rt6_stats_seq_fops = {
3198 .owner = THIS_MODULE,
3199 .open = rt6_stats_seq_open,
3200 .read = seq_read,
3201 .llseek = seq_lseek,
3202 .release = single_release_net,
3204 #endif /* CONFIG_PROC_FS */
3206 #ifdef CONFIG_SYSCTL
3208 static
3209 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3210 void __user *buffer, size_t *lenp, loff_t *ppos)
3212 struct net *net;
3213 int delay;
3214 if (!write)
3215 return -EINVAL;
3217 net = (struct net *)ctl->extra1;
3218 delay = net->ipv6.sysctl.flush_delay;
3219 proc_dointvec(ctl, write, buffer, lenp, ppos);
3220 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3221 return 0;
/*
 * Template for the per-netns net.ipv6.route sysctl directory.
 * WARNING: entry ORDER is load-bearing — ipv6_route_sysctl_init()
 * rewrites .data by index (table[0] ... table[9]); keep both in sync.
 */
3224 struct ctl_table ipv6_route_table_template[] = {
/* [0] "flush" is write-only (0200) and handled specially. */
3226 .procname = "flush",
3227 .data = &init_net.ipv6.sysctl.flush_delay,
3228 .maxlen = sizeof(int),
3229 .mode = 0200,
3230 .proc_handler = ipv6_sysctl_rtcache_flush
3233 .procname = "gc_thresh",
3234 .data = &ip6_dst_ops_template.gc_thresh,
3235 .maxlen = sizeof(int),
3236 .mode = 0644,
3237 .proc_handler = proc_dointvec,
3240 .procname = "max_size",
3241 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3242 .maxlen = sizeof(int),
3243 .mode = 0644,
3244 .proc_handler = proc_dointvec,
3247 .procname = "gc_min_interval",
3248 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3249 .maxlen = sizeof(int),
3250 .mode = 0644,
3251 .proc_handler = proc_dointvec_jiffies,
3254 .procname = "gc_timeout",
3255 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3256 .maxlen = sizeof(int),
3257 .mode = 0644,
3258 .proc_handler = proc_dointvec_jiffies,
3261 .procname = "gc_interval",
3262 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3263 .maxlen = sizeof(int),
3264 .mode = 0644,
3265 .proc_handler = proc_dointvec_jiffies,
3268 .procname = "gc_elasticity",
3269 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3270 .maxlen = sizeof(int),
3271 .mode = 0644,
3272 .proc_handler = proc_dointvec,
3275 .procname = "mtu_expires",
3276 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3277 .maxlen = sizeof(int),
3278 .mode = 0644,
3279 .proc_handler = proc_dointvec_jiffies,
3282 .procname = "min_adv_mss",
3283 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3284 .maxlen = sizeof(int),
3285 .mode = 0644,
3286 .proc_handler = proc_dointvec,
/* [9] same variable as gc_min_interval, exposed in milliseconds. */
3289 .procname = "gc_min_interval_ms",
3290 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3291 .maxlen = sizeof(int),
3292 .mode = 0644,
3293 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Clone ipv6_route_table_template for a new netns and repoint every
 * .data member at that namespace's own variables.  Indices must match
 * the template's entry order exactly.  Returns NULL on allocation
 * failure (caller treats the sysctls as absent).
 */
3298 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3300 struct ctl_table *table;
3302 table = kmemdup(ipv6_route_table_template,
3303 sizeof(ipv6_route_table_template),
3304 GFP_KERNEL);
3306 if (table) {
3307 table[0].data = &net->ipv6.sysctl.flush_delay;
3308 table[0].extra1 = net;
3309 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3310 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3311 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3312 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3313 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3314 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3315 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3316 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3317 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3319 /* Don't export sysctls to unprivileged users */
3320 if (net->user_ns != &init_user_ns)
3321 table[0].procname = NULL;
3324 return table;
3326 #endif
/*
 * Per-netns setup of the IPv6 routing core: dst ops and counters, the
 * special template routes (null, and with multiple tables also
 * prohibit/blackhole), and default GC/MTU tunables.  Uses the standard
 * goto-unwind ladder: each failure label frees everything allocated so
 * far, in reverse order.  Returns 0 or -ENOMEM.
 */
3328 static int __net_init ip6_route_net_init(struct net *net)
3330 int ret = -ENOMEM;
3332 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3333 sizeof(net->ipv6.ip6_dst_ops));
3335 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3336 goto out_ip6_dst_ops;
/* Template routes start as copies; dst.path points back at the route
 * itself.  The loopback device is attached later by the netdev
 * notifier (ip6_route_dev_notify). */
3338 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3339 sizeof(*net->ipv6.ip6_null_entry),
3340 GFP_KERNEL);
3341 if (!net->ipv6.ip6_null_entry)
3342 goto out_ip6_dst_entries;
3343 net->ipv6.ip6_null_entry->dst.path =
3344 (struct dst_entry *)net->ipv6.ip6_null_entry;
3345 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3346 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3347 ip6_template_metrics, true);
3349 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3350 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3351 sizeof(*net->ipv6.ip6_prohibit_entry),
3352 GFP_KERNEL);
3353 if (!net->ipv6.ip6_prohibit_entry)
3354 goto out_ip6_null_entry;
3355 net->ipv6.ip6_prohibit_entry->dst.path =
3356 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3357 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3358 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3359 ip6_template_metrics, true);
3361 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3362 sizeof(*net->ipv6.ip6_blk_hole_entry),
3363 GFP_KERNEL);
3364 if (!net->ipv6.ip6_blk_hole_entry)
3365 goto out_ip6_prohibit_entry;
3366 net->ipv6.ip6_blk_hole_entry->dst.path =
3367 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3368 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3369 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3370 ip6_template_metrics, true);
3371 #endif
/* Default tunables; overridable via the net.ipv6.route sysctls. */
3373 net->ipv6.sysctl.flush_delay = 0;
3374 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3375 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3376 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3377 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3378 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3379 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3380 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3382 net->ipv6.ip6_rt_gc_expire = 30*HZ;
3384 ret = 0;
3385 out:
3386 return ret;
3388 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3389 out_ip6_prohibit_entry:
3390 kfree(net->ipv6.ip6_prohibit_entry);
3391 out_ip6_null_entry:
3392 kfree(net->ipv6.ip6_null_entry);
3393 #endif
3394 out_ip6_dst_entries:
3395 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3396 out_ip6_dst_ops:
3397 goto out;
/*
 * Per-netns teardown: free the template routes and the dst entry
 * counter — the exact inverse of ip6_route_net_init().
 */
3400 static void __net_exit ip6_route_net_exit(struct net *net)
3402 kfree(net->ipv6.ip6_null_entry);
3403 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3404 kfree(net->ipv6.ip6_prohibit_entry);
3405 kfree(net->ipv6.ip6_blk_hole_entry);
3406 #endif
3407 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Late per-netns init: register the /proc/net entries once the rest of
 * the routing state exists.  proc_create() failures are deliberately
 * ignored (the files are merely absent).
 */
3410 static int __net_init ip6_route_net_init_late(struct net *net)
3412 #ifdef CONFIG_PROC_FS
3413 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3414 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3415 #endif
3416 return 0;
/* Late per-netns teardown: remove the /proc/net entries. */
3419 static void __net_exit ip6_route_net_exit_late(struct net *net)
3421 #ifdef CONFIG_PROC_FS
3422 remove_proc_entry("ipv6_route", net->proc_net);
3423 remove_proc_entry("rt6_stats", net->proc_net);
3424 #endif
/* Main per-netns init/exit hooks for the IPv6 routing core. */
3427 static struct pernet_operations ip6_route_net_ops = {
3428 .init = ip6_route_net_init,
3429 .exit = ip6_route_net_exit,
3432 static int __net_init ipv6_inetpeer_init(struct net *net)
3434 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3436 if (!bp)
3437 return -ENOMEM;
3438 inet_peer_base_init(bp);
3439 net->ipv6.peers = bp;
3440 return 0;
3443 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3445 struct inet_peer_base *bp = net->ipv6.peers;
3447 net->ipv6.peers = NULL;
3448 inetpeer_invalidate_tree(bp);
3449 kfree(bp);
/* Per-netns hooks for the IPv6 inetpeer base. */
3452 static struct pernet_operations ipv6_inetpeer_ops = {
3453 .init = ipv6_inetpeer_init,
3454 .exit = ipv6_inetpeer_exit,
/* Late per-netns hooks (procfs entries). */
3457 static struct pernet_operations ip6_route_net_late_ops = {
3458 .init = ip6_route_net_init_late,
3459 .exit = ip6_route_net_exit_late,
/* Netdevice notifier wiring the per-netns loopback into template routes. */
3462 static struct notifier_block ip6_route_dev_notifier = {
3463 .notifier_call = ip6_route_dev_notify,
3464 .priority = 0,
/*
 * Module init for the IPv6 routing subsystem: slab cache, dst counters,
 * pernet subsystems, FIB6 core, xfrm6, policy rules, rtnetlink handlers,
 * netdev notifier and the per-cpu uncached-route lists.  Uses a
 * goto-unwind ladder that undoes each completed step in reverse order;
 * returns 0 or a negative errno.
 */
3467 int __init ip6_route_init(void)
3469 int ret;
3470 int cpu;
3472 ret = -ENOMEM;
3473 ip6_dst_ops_template.kmem_cachep =
3474 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3475 SLAB_HWCACHE_ALIGN, NULL);
3476 if (!ip6_dst_ops_template.kmem_cachep)
3477 goto out;
3479 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3480 if (ret)
3481 goto out_kmem_cache;
3483 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3484 if (ret)
3485 goto out_dst_entries;
3487 ret = register_pernet_subsys(&ip6_route_net_ops);
3488 if (ret)
3489 goto out_register_inetpeer;
/* Blackhole dsts share the same slab as regular rt6_info objects. */
3491 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3493 /* Registering of the loopback is done before this portion of code,
3494 * the loopback reference in rt6_info will not be taken, do it
3495 * manually for init_net */
3496 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3497 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3498 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3499 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3500 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3501 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3502 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3503 #endif
3504 ret = fib6_init();
3505 if (ret)
3506 goto out_register_subsys;
3508 ret = xfrm6_init();
3509 if (ret)
3510 goto out_fib6_init;
3512 ret = fib6_rules_init();
3513 if (ret)
3514 goto xfrm6_init;
3516 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3517 if (ret)
3518 goto fib6_rules_init;
3520 ret = -ENOBUFS;
3521 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3522 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3523 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3524 goto out_register_late_subsys;
3526 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3527 if (ret)
3528 goto out_register_late_subsys;
/* Per-cpu lists holding dsts not attached to the FIB tree. */
3530 for_each_possible_cpu(cpu) {
3531 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3533 INIT_LIST_HEAD(&ul->head);
3534 spin_lock_init(&ul->lock);
3537 out:
3538 return ret;
3540 out_register_late_subsys:
3541 unregister_pernet_subsys(&ip6_route_net_late_ops);
3542 fib6_rules_init:
3543 fib6_rules_cleanup();
3544 xfrm6_init:
3545 xfrm6_fini();
3546 out_fib6_init:
3547 fib6_gc_cleanup();
3548 out_register_subsys:
3549 unregister_pernet_subsys(&ip6_route_net_ops);
3550 out_register_inetpeer:
3551 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3552 out_dst_entries:
3553 dst_entries_destroy(&ip6_dst_blackhole_ops);
3554 out_kmem_cache:
3555 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3556 goto out;
/*
 * Module teardown: undo every step of ip6_route_init() in exact reverse
 * order.
 */
3559 void ip6_route_cleanup(void)
3561 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3562 unregister_pernet_subsys(&ip6_route_net_late_ops);
3563 fib6_rules_cleanup();
3564 xfrm6_fini();
3565 fib6_gc_cleanup();
3566 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3567 unregister_pernet_subsys(&ip6_route_net_ops);
3568 dst_entries_destroy(&ip6_dst_blackhole_ops);
3569 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);