net/ipv6/route.c
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
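
/* Note on the selection above: this is hash-threshold multipath. Each
 * sibling carries a precomputed upper bound (fib6_nh.nh_upper_bound), and
 * the first sibling whose bound covers fl6->mp_hash and whose next hop
 * scores as usable wins; a sibling that fails rt6_score_route() makes the
 * walk stop and keeps the original match.
 */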
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
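
/* A note on rt6_probe() above: it runs in the packet path under
 * rcu_read_lock_bh(), so the actual neighbour solicitation is deferred to
 * rt6_probe_deferred() on the system workqueue. __neigh_set_probe_once()
 * together with the rtr_probe_interval check keeps probes rate-limited to
 * roughly one per interval, as required by the comment in the function.
 */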
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
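
/* Round-robin note: when find_match() reports RT6_NUD_FAIL_DO_RR for the
 * current rr_ptr, rt6_select() advances fn->rr_ptr to the next route of
 * the same metric (wrapping back to the leaf), so subsequent lookups try
 * the other candidate routers first.
 */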
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]		= 0,
	[RTN_UNICAST]		= 0,
	[RTN_LOCAL]		= 0,
	[RTN_BROADCAST]		= 0,
	[RTN_ANYCAST]		= 0,
	[RTN_MULTICAST]		= 0,
	[RTN_BLACKHOLE]		= -EINVAL,
	[RTN_UNREACHABLE]	= -EHOSTUNREACH,
	[RTN_PROHIBIT]		= -EACCES,
	[RTN_THROW]		= -EAGAIN,
	[RTN_NAT]		= -EINVAL,
	[RTN_XRESOLVE]		= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
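
/* Backtracking note: fib6_backtrack() walks back up the fib6 trie from a
 * node whose routes did not match, descending into a source-address
 * subtree (FIB6_SUBTREE) when the parent has one, until it finds an
 * ancestor that carries routes (RTN_RTINFO) or hits the root, in which
 * case the lookup has failed and NULL is returned.
 */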
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
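
/* The cmpxchg() above should never observe a non-NULL slot: callers run
 * with bottom halves disabled (see ip6_pol_route()), so nothing else on
 * this CPU can populate its rt6i_pcpu slot between rt6_get_pcpu_route()
 * returning NULL and the store here, and other CPUs only touch their own
 * slots; hence the BUG_ON(prev).
 */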
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
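
/* Hashing note: the bucket index mixes the destination (and, with
 * CONFIG_IPV6_SUBTREES, the source) through jhash with a lazily
 * initialized random seed (net_get_random_once()), then folds the result
 * down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits via hash_32(). The random
 * seed makes bucket placement hard for a remote sender to predict.
 */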
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
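
/* Worked example for the rules above (illustrative): with a link MTU of
 * 1500 and a route PMTU of 1500, raising the link MTU to 9000 updates the
 * route (second test). If a remote hop had already lowered the route PMTU
 * to 1280, raising the local link MTU leaves it untouched, while lowering
 * the link MTU below 1280 still propagates (first test).
 */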
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
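
/* Policy summary for the switch above: policy 0 hashes the L3 fields
 * (addresses, flow label, protocol), falling back to the inner header for
 * ICMPv6 errors via ip6_multipath_l3_keys(); policy 1 hashes the L4
 * five-tuple. The final right shift halves the hash so it stays within
 * the 31-bit range compared against the atomic nh_upper_bound values in
 * fib6_multipath_select().
 */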
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}
2164 static struct dst_entry *rt6_check(struct rt6_info *rt,
2165 struct fib6_info *from,
2166 u32 cookie)
2168 u32 rt_cookie = 0;
2170 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2171 rt_cookie != cookie)
2172 return NULL;
2174 if (rt6_check_expired(rt))
2175 return NULL;
2177 return &rt->dst;
2180 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2181 struct fib6_info *from,
2182 u32 cookie)
2184 if (!__rt6_check_expired(rt) &&
2185 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2186 fib6_check(from, cookie))
2187 return &rt->dst;
2188 else
2189 return NULL;
2192 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2194 struct dst_entry *dst_ret;
2195 struct fib6_info *from;
2196 struct rt6_info *rt;
2198 rt = container_of(dst, struct rt6_info, dst);
2200 rcu_read_lock();
2202 /* All IPV6 dsts are created with ->obsolete set to the value
2203 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2204 * into this function always. */
2207 from = rcu_dereference(rt->from);
2209 if (from && (rt->rt6i_flags & RTF_PCPU ||
2210 unlikely(!list_empty(&rt->rt6i_uncached))))
2211 dst_ret = rt6_dst_from_check(rt, from, cookie);
2212 else
2213 dst_ret = rt6_check(rt, from, cookie);
2215 rcu_read_unlock();
2217 return dst_ret;
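/* The cookie compared above carries the fib6 node's serial number from
 * the time the dst was handed out; any later change to that part of
 * the tree bumps the sernum, the cookie stops matching, and callers
 * must re-resolve the route.
 */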
2220 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2222 struct rt6_info *rt = (struct rt6_info *) dst;
2224 if (rt) {
2225 if (rt->rt6i_flags & RTF_CACHE) {
2226 rcu_read_lock();
2227 if (rt6_check_expired(rt)) {
2228 rt6_remove_exception_rt(rt);
2229 dst = NULL;
2231 rcu_read_unlock();
2232 } else {
2233 dst_release(dst);
2234 dst = NULL;
2237 return dst;
2240 static void ip6_link_failure(struct sk_buff *skb)
2242 struct rt6_info *rt;
2244 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2246 rt = (struct rt6_info *) skb_dst(skb);
2247 if (rt) {
2248 rcu_read_lock();
2249 if (rt->rt6i_flags & RTF_CACHE) {
2250 if (dst_hold_safe(&rt->dst))
2251 rt6_remove_exception_rt(rt);
2252 } else {
2253 struct fib6_info *from;
2254 struct fib6_node *fn;
2256 from = rcu_dereference(rt->from);
2257 if (from) {
2258 fn = rcu_dereference(from->fib6_node);
2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2260 fn->fn_sernum = -1;
2263 rcu_read_unlock();
2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2270 struct fib6_info *from;
2272 rcu_read_lock();
2273 from = rcu_dereference(rt0->from);
2274 if (from)
2275 rt0->dst.expires = from->expires;
2276 rcu_read_unlock();
2279 dst_set_expires(&rt0->dst, timeout);
2280 rt0->rt6i_flags |= RTF_EXPIRES;
2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2285 struct net *net = dev_net(rt->dst.dev);
2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2288 rt->rt6i_flags |= RTF_MODIFIED;
2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2294 bool from_set;
2296 rcu_read_lock();
2297 from_set = !!rcu_dereference(rt->from);
2298 rcu_read_unlock();
2300 return !(rt->rt6i_flags & RTF_CACHE) &&
2301 (rt->rt6i_flags & RTF_PCPU || from_set);
2304 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2305 const struct ipv6hdr *iph, u32 mtu)
2307 const struct in6_addr *daddr, *saddr;
2308 struct rt6_info *rt6 = (struct rt6_info *)dst;
2310 if (dst_metric_locked(dst, RTAX_MTU))
2311 return;
2313 if (iph) {
2314 daddr = &iph->daddr;
2315 saddr = &iph->saddr;
2316 } else if (sk) {
2317 daddr = &sk->sk_v6_daddr;
2318 saddr = &inet6_sk(sk)->saddr;
2319 } else {
2320 daddr = NULL;
2321 saddr = NULL;
2323 dst_confirm_neigh(dst, daddr);
2324 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2325 if (mtu >= dst_mtu(dst))
2326 return;
2328 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2329 rt6_do_update_pmtu(rt6, mtu);
2330 /* update rt6_ex->stamp for cache */
2331 if (rt6->rt6i_flags & RTF_CACHE)
2332 rt6_update_exception_stamp_rt(rt6);
2333 } else if (daddr) {
2334 struct fib6_info *from;
2335 struct rt6_info *nrt6;
2337 rcu_read_lock();
2338 from = rcu_dereference(rt6->from);
2339 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2340 if (nrt6) {
2341 rt6_do_update_pmtu(nrt6, mtu);
2342 if (rt6_insert_exception(nrt6, from))
2343 dst_release_immediate(&nrt6->dst);
2345 rcu_read_unlock();
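/* A learned PMTU must not be written into the shared FIB entry, so
 * when the dst is not already a private copy the code above allocates
 * a cached clone, applies the reduced MTU to it, and stores it in the
 * exception table keyed by destination (and source, with subtrees).
 */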
2349 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2350 struct sk_buff *skb, u32 mtu)
2352 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2355 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2356 int oif, u32 mark, kuid_t uid)
2358 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2359 struct dst_entry *dst;
2360 struct flowi6 fl6;
2362 memset(&fl6, 0, sizeof(fl6));
2363 fl6.flowi6_oif = oif;
2364 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2365 fl6.daddr = iph->daddr;
2366 fl6.saddr = iph->saddr;
2367 fl6.flowlabel = ip6_flowinfo(iph);
2368 fl6.flowi6_uid = uid;
2370 dst = ip6_route_output(net, NULL, &fl6);
2371 if (!dst->error)
2372 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2373 dst_release(dst);
2375 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2377 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2379 struct dst_entry *dst;
2381 ip6_update_pmtu(skb, sock_net(sk), mtu,
2382 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2384 dst = __sk_dst_get(sk);
2385 if (!dst || !dst->obsolete ||
2386 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2387 return;
2389 bh_lock_sock(sk);
2390 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2391 ip6_datagram_dst_update(sk, false);
2392 bh_unlock_sock(sk);
2394 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2396 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2397 const struct flowi6 *fl6)
2399 #ifdef CONFIG_IPV6_SUBTREES
2400 struct ipv6_pinfo *np = inet6_sk(sk);
2401 #endif
2403 ip6_dst_store(sk, dst,
2404 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2405 &sk->sk_v6_daddr : NULL,
2406 #ifdef CONFIG_IPV6_SUBTREES
2407 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2408 &np->saddr :
2409 #endif
2410 NULL);
2413 /* Handle redirects */
2414 struct ip6rd_flowi {
2415 struct flowi6 fl6;
2416 struct in6_addr gateway;
2419 static struct rt6_info *__ip6_route_redirect(struct net *net,
2420 struct fib6_table *table,
2421 struct flowi6 *fl6,
2422 const struct sk_buff *skb,
2423 int flags)
2425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2426 struct rt6_info *ret = NULL, *rt_cache;
2427 struct fib6_info *rt;
2428 struct fib6_node *fn;
2430 /* Get the "current" route for this destination and
2431 * check if the redirect has come from the appropriate router.
2433 * RFC 4861 specifies that redirects should only be
2434 * accepted if they come from the nexthop to the target.
2435 * Due to the way the routes are chosen, this notion
2436 * is a bit fuzzy and one might need to check all possible
2437 * routes. */
2440 rcu_read_lock();
2441 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2442 restart:
2443 for_each_fib6_node_rt_rcu(fn) {
2444 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2445 continue;
2446 if (fib6_check_expired(rt))
2447 continue;
2448 if (rt->fib6_flags & RTF_REJECT)
2449 break;
2450 if (!(rt->fib6_flags & RTF_GATEWAY))
2451 continue;
2452 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2453 continue;
2454 /* rt_cache's gateway might be different from its 'parent'
2455 * in the case of an ip redirect.
2456 * So we keep searching in the exception table if the gateway
2457 * is different. */
2459 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2460 rt_cache = rt6_find_cached_rt(rt,
2461 &fl6->daddr,
2462 &fl6->saddr);
2463 if (rt_cache &&
2464 ipv6_addr_equal(&rdfl->gateway,
2465 &rt_cache->rt6i_gateway)) {
2466 ret = rt_cache;
2467 break;
2469 continue;
2471 break;
2474 if (!rt)
2475 rt = net->ipv6.fib6_null_entry;
2476 else if (rt->fib6_flags & RTF_REJECT) {
2477 ret = net->ipv6.ip6_null_entry;
2478 goto out;
2481 if (rt == net->ipv6.fib6_null_entry) {
2482 fn = fib6_backtrack(fn, &fl6->saddr);
2483 if (fn)
2484 goto restart;
2487 out:
2488 if (ret)
2489 dst_hold(&ret->dst);
2490 else
2491 ret = ip6_create_rt_rcu(rt);
2493 rcu_read_unlock();
2495 trace_fib6_table_lookup(net, rt, table, fl6);
2496 return ret;
2499 static struct dst_entry *ip6_route_redirect(struct net *net,
2500 const struct flowi6 *fl6,
2501 const struct sk_buff *skb,
2502 const struct in6_addr *gateway)
2504 int flags = RT6_LOOKUP_F_HAS_SADDR;
2505 struct ip6rd_flowi rdfl;
2507 rdfl.fl6 = *fl6;
2508 rdfl.gateway = *gateway;
2510 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2511 flags, __ip6_route_redirect);
2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2515 kuid_t uid)
2517 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2518 struct dst_entry *dst;
2519 struct flowi6 fl6;
2521 memset(&fl6, 0, sizeof(fl6));
2522 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2523 fl6.flowi6_oif = oif;
2524 fl6.flowi6_mark = mark;
2525 fl6.daddr = iph->daddr;
2526 fl6.saddr = iph->saddr;
2527 fl6.flowlabel = ip6_flowinfo(iph);
2528 fl6.flowi6_uid = uid;
2530 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2531 rt6_do_redirect(dst, NULL, skb);
2532 dst_release(dst);
2534 EXPORT_SYMBOL_GPL(ip6_redirect);
2536 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2537 u32 mark)
2539 const struct ipv6hdr *iph = ipv6_hdr(skb);
2540 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2541 struct dst_entry *dst;
2542 struct flowi6 fl6;
2544 memset(&fl6, 0, sizeof(fl6));
2545 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2546 fl6.flowi6_oif = oif;
2547 fl6.flowi6_mark = mark;
2548 fl6.daddr = msg->dest;
2549 fl6.saddr = iph->daddr;
2550 fl6.flowi6_uid = sock_net_uid(net, NULL);
2552 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2553 rt6_do_redirect(dst, NULL, skb);
2554 dst_release(dst);
2557 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2559 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2560 sk->sk_uid);
2562 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2564 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2566 struct net_device *dev = dst->dev;
2567 unsigned int mtu = dst_mtu(dst);
2568 struct net *net = dev_net(dev);
2570 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2572 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2573 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2576 /* Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2577 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2578 * IPV6_MAXPLEN is also valid and means: "any MSS,
2579 * rely only on pmtu discovery" */
2581 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2582 mtu = IPV6_MAXPLEN;
2583 return mtu;
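/* Example: with a 1500 byte link MTU the advertised MSS becomes
 * 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440 bytes, unless
 * ip6_rt_min_advmss imposes a larger floor.
 */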
2586 static unsigned int ip6_mtu(const struct dst_entry *dst)
2588 struct inet6_dev *idev;
2589 unsigned int mtu;
2591 mtu = dst_metric_raw(dst, RTAX_MTU);
2592 if (mtu)
2593 goto out;
2595 mtu = IPV6_MIN_MTU;
2597 rcu_read_lock();
2598 idev = __in6_dev_get(dst->dev);
2599 if (idev)
2600 mtu = idev->cnf.mtu6;
2601 rcu_read_unlock();
2603 out:
2604 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2606 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2609 /* MTU selection:
2610 * 1. mtu on route is locked - use it
2611 * 2. mtu from nexthop exception
2612 * 3. mtu from egress device
2614 * based on ip6_dst_mtu_forward and exception logic of
2615 * rt6_find_cached_rt; called with rcu_read_lock */
2617 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2618 struct in6_addr *saddr)
2620 struct rt6_exception_bucket *bucket;
2621 struct rt6_exception *rt6_ex;
2622 struct in6_addr *src_key;
2623 struct inet6_dev *idev;
2624 u32 mtu = 0;
2626 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2627 mtu = f6i->fib6_pmtu;
2628 if (mtu)
2629 goto out;
2632 src_key = NULL;
2633 #ifdef CONFIG_IPV6_SUBTREES
2634 if (f6i->fib6_src.plen)
2635 src_key = saddr;
2636 #endif
2638 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2639 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2640 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2641 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2643 if (likely(!mtu)) {
2644 struct net_device *dev = fib6_info_nh_dev(f6i);
2646 mtu = IPV6_MIN_MTU;
2647 idev = __in6_dev_get(dev);
2648 if (idev && idev->cnf.mtu6 > mtu)
2649 mtu = idev->cnf.mtu6;
2652 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2653 out:
2654 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2657 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2658 struct flowi6 *fl6)
2660 struct dst_entry *dst;
2661 struct rt6_info *rt;
2662 struct inet6_dev *idev = in6_dev_get(dev);
2663 struct net *net = dev_net(dev);
2665 if (unlikely(!idev))
2666 return ERR_PTR(-ENODEV);
2668 rt = ip6_dst_alloc(net, dev, 0);
2669 if (unlikely(!rt)) {
2670 in6_dev_put(idev);
2671 dst = ERR_PTR(-ENOMEM);
2672 goto out;
2675 rt->dst.flags |= DST_HOST;
2676 rt->dst.input = ip6_input;
2677 rt->dst.output = ip6_output;
2678 rt->rt6i_gateway = fl6->daddr;
2679 rt->rt6i_dst.addr = fl6->daddr;
2680 rt->rt6i_dst.plen = 128;
2681 rt->rt6i_idev = idev;
2682 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2684 /* Add this dst into uncached_list so that rt6_disable_ip() can
2685 * do proper release of the net_device */
2687 rt6_uncached_list_add(rt);
2688 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2690 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2692 out:
2693 return dst;
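/* icmp6_dst_alloc() builds a standalone host dst so that ndisc can
 * transmit even without a matching FIB entry; keeping it on the
 * uncached list (see above) lets it be torn down cleanly when the
 * device disappears.
 */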
2696 static int ip6_dst_gc(struct dst_ops *ops)
2698 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2699 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2700 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2701 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2702 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2703 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2704 int entries;
2706 entries = dst_entries_get_fast(ops);
2707 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2708 entries <= rt_max_size)
2709 goto out;
2711 net->ipv6.ip6_rt_gc_expire++;
2712 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2713 entries = dst_entries_get_slow(ops);
2714 if (entries < ops->gc_thresh)
2715 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2716 out:
2717 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2718 return entries > rt_max_size;
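/* The expire value adapts to pressure: it grows by one on every pass
 * beyond the rate limit, is reset to half of ip6_rt_gc_timeout after a
 * successful collection, and is decayed by expire >> elasticity at the
 * end, i.e. by roughly 1/512 per call with the default elasticity of 9.
 */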
2721 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2722 struct fib6_config *cfg)
2724 struct dst_metrics *p;
2726 if (!cfg->fc_mx)
2727 return 0;
2729 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2730 if (unlikely(!p))
2731 return -ENOMEM;
2733 refcount_set(&p->refcnt, 1);
2734 rt->fib6_metrics = p;
2736 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2739 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2740 struct fib6_config *cfg,
2741 const struct in6_addr *gw_addr,
2742 u32 tbid, int flags)
2744 struct flowi6 fl6 = {
2745 .flowi6_oif = cfg->fc_ifindex,
2746 .daddr = *gw_addr,
2747 .saddr = cfg->fc_prefsrc,
2749 struct fib6_table *table;
2750 struct rt6_info *rt;
2752 table = fib6_get_table(net, tbid);
2753 if (!table)
2754 return NULL;
2756 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2757 flags |= RT6_LOOKUP_F_HAS_SADDR;
2759 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2760 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2762 /* if table lookup failed, fall back to full lookup */
2763 if (rt == net->ipv6.ip6_null_entry) {
2764 ip6_rt_put(rt);
2765 rt = NULL;
2768 return rt;
2771 static int ip6_route_check_nh_onlink(struct net *net,
2772 struct fib6_config *cfg,
2773 const struct net_device *dev,
2774 struct netlink_ext_ack *extack)
2776 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2777 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2778 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2779 struct rt6_info *grt;
2780 int err;
2782 err = 0;
2783 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2784 if (grt) {
2785 if (!grt->dst.error &&
2786 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2787 NL_SET_ERR_MSG(extack,
2788 "Nexthop has invalid gateway or device mismatch");
2789 err = -EINVAL;
2792 ip6_rt_put(grt);
2795 return err;
2798 static int ip6_route_check_nh(struct net *net,
2799 struct fib6_config *cfg,
2800 struct net_device **_dev,
2801 struct inet6_dev **idev)
2803 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2804 struct net_device *dev = _dev ? *_dev : NULL;
2805 struct rt6_info *grt = NULL;
2806 int err = -EHOSTUNREACH;
2808 if (cfg->fc_table) {
2809 int flags = RT6_LOOKUP_F_IFACE;
2811 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2812 cfg->fc_table, flags);
2813 if (grt) {
2814 if (grt->rt6i_flags & RTF_GATEWAY ||
2815 (dev && dev != grt->dst.dev)) {
2816 ip6_rt_put(grt);
2817 grt = NULL;
2822 if (!grt)
2823 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2825 if (!grt)
2826 goto out;
2828 if (dev) {
2829 if (dev != grt->dst.dev) {
2830 ip6_rt_put(grt);
2831 goto out;
2833 } else {
2834 *_dev = dev = grt->dst.dev;
2835 *idev = grt->rt6i_idev;
2836 dev_hold(dev);
2837 in6_dev_hold(grt->rt6i_idev);
2840 if (!(grt->rt6i_flags & RTF_GATEWAY))
2841 err = 0;
2843 ip6_rt_put(grt);
2845 out:
2846 return err;
2849 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2850 struct net_device **_dev, struct inet6_dev **idev,
2851 struct netlink_ext_ack *extack)
2853 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2854 int gwa_type = ipv6_addr_type(gw_addr);
2855 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2856 const struct net_device *dev = *_dev;
2857 bool need_addr_check = !dev;
2858 int err = -EINVAL;
2860 /* if gw_addr is local we will fail to detect this in case the
2861 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2862 * will return the already-added prefix route via the interface
2863 * that the prefix route was assigned to, which might be non-loopback. */
2865 if (dev &&
2866 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2867 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2868 goto out;
2871 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2872 /* IPv6 strictly inhibits using non-link-local
2873 * addresses as a nexthop address.
2874 * Otherwise, a router will not be able to send redirects.
2875 * It is very good, but in some (rare!) circumstances
2876 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2877 * some exceptions. --ANK
2878 * We allow IPv4-mapped nexthops to support RFC4798-type
2879 * addressing. */
2881 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2882 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2883 goto out;
2886 if (cfg->fc_flags & RTNH_F_ONLINK)
2887 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2888 else
2889 err = ip6_route_check_nh(net, cfg, _dev, idev);
2891 if (err)
2892 goto out;
2895 /* reload in case device was changed */
2896 dev = *_dev;
2898 err = -EINVAL;
2899 if (!dev) {
2900 NL_SET_ERR_MSG(extack, "Egress device not specified");
2901 goto out;
2902 } else if (dev->flags & IFF_LOOPBACK) {
2903 NL_SET_ERR_MSG(extack,
2904 "Egress device can not be loopback device for this route");
2905 goto out;
2908 /* if we did not check gw_addr above, do so now that the
2909 * egress device has been resolved. */
2911 if (need_addr_check &&
2912 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2913 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2914 goto out;
2917 err = 0;
2918 out:
2919 return err;
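/* Summary of the checks above: the gateway must not be one of our own
 * addresses, must be link-local unicast (or a permitted exception such
 * as an IPv4-mapped address), and must resolve to a usable,
 * non-loopback egress device; the local-address check is repeated once
 * the device is finally known.
 */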
2922 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2923 gfp_t gfp_flags,
2924 struct netlink_ext_ack *extack)
2926 struct net *net = cfg->fc_nlinfo.nl_net;
2927 struct fib6_info *rt = NULL;
2928 struct net_device *dev = NULL;
2929 struct inet6_dev *idev = NULL;
2930 struct fib6_table *table;
2931 int addr_type;
2932 int err = -EINVAL;
2934 /* RTF_PCPU is an internal flag; it cannot be set by userspace */
2935 if (cfg->fc_flags & RTF_PCPU) {
2936 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2937 goto out;
2940 /* RTF_CACHE is an internal flag; it cannot be set by userspace */
2941 if (cfg->fc_flags & RTF_CACHE) {
2942 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2943 goto out;
2946 if (cfg->fc_type > RTN_MAX) {
2947 NL_SET_ERR_MSG(extack, "Invalid route type");
2948 goto out;
2951 if (cfg->fc_dst_len > 128) {
2952 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2953 goto out;
2955 if (cfg->fc_src_len > 128) {
2956 NL_SET_ERR_MSG(extack, "Invalid source address length");
2957 goto out;
2959 #ifndef CONFIG_IPV6_SUBTREES
2960 if (cfg->fc_src_len) {
2961 NL_SET_ERR_MSG(extack,
2962 "Specifying source address requires IPV6_SUBTREES to be enabled");
2963 goto out;
2965 #endif
2966 if (cfg->fc_ifindex) {
2967 err = -ENODEV;
2968 dev = dev_get_by_index(net, cfg->fc_ifindex);
2969 if (!dev)
2970 goto out;
2971 idev = in6_dev_get(dev);
2972 if (!idev)
2973 goto out;
2976 if (cfg->fc_metric == 0)
2977 cfg->fc_metric = IP6_RT_PRIO_USER;
2979 if (cfg->fc_flags & RTNH_F_ONLINK) {
2980 if (!dev) {
2981 NL_SET_ERR_MSG(extack,
2982 "Nexthop device required for onlink");
2983 err = -ENODEV;
2984 goto out;
2987 if (!(dev->flags & IFF_UP)) {
2988 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2989 err = -ENETDOWN;
2990 goto out;
2994 err = -ENOBUFS;
2995 if (cfg->fc_nlinfo.nlh &&
2996 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2997 table = fib6_get_table(net, cfg->fc_table);
2998 if (!table) {
2999 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3000 table = fib6_new_table(net, cfg->fc_table);
3002 } else {
3003 table = fib6_new_table(net, cfg->fc_table);
3006 if (!table)
3007 goto out;
3009 err = -ENOMEM;
3010 rt = fib6_info_alloc(gfp_flags);
3011 if (!rt)
3012 goto out;
3014 if (cfg->fc_flags & RTF_ADDRCONF)
3015 rt->dst_nocount = true;
3017 err = ip6_convert_metrics(net, rt, cfg);
3018 if (err < 0)
3019 goto out;
3021 if (cfg->fc_flags & RTF_EXPIRES)
3022 fib6_set_expires(rt, jiffies +
3023 clock_t_to_jiffies(cfg->fc_expires));
3024 else
3025 fib6_clean_expires(rt);
3027 if (cfg->fc_protocol == RTPROT_UNSPEC)
3028 cfg->fc_protocol = RTPROT_BOOT;
3029 rt->fib6_protocol = cfg->fc_protocol;
3031 addr_type = ipv6_addr_type(&cfg->fc_dst);
3033 if (cfg->fc_encap) {
3034 struct lwtunnel_state *lwtstate;
3036 err = lwtunnel_build_state(cfg->fc_encap_type,
3037 cfg->fc_encap, AF_INET6, cfg,
3038 &lwtstate, extack);
3039 if (err)
3040 goto out;
3041 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3044 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3045 rt->fib6_dst.plen = cfg->fc_dst_len;
3046 if (rt->fib6_dst.plen == 128)
3047 rt->dst_host = true;
3049 #ifdef CONFIG_IPV6_SUBTREES
3050 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3051 rt->fib6_src.plen = cfg->fc_src_len;
3052 #endif
3054 rt->fib6_metric = cfg->fc_metric;
3055 rt->fib6_nh.nh_weight = 1;
3057 rt->fib6_type = cfg->fc_type;
3059 /* We cannot add true routes via loopback here;
3060 * they would result in kernel looping. Promote them to reject routes. */
3062 if ((cfg->fc_flags & RTF_REJECT) ||
3063 (dev && (dev->flags & IFF_LOOPBACK) &&
3064 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3065 !(cfg->fc_flags & RTF_LOCAL))) {
3066 /* hold loopback dev/idev if we haven't done so. */
3067 if (dev != net->loopback_dev) {
3068 if (dev) {
3069 dev_put(dev);
3070 in6_dev_put(idev);
3072 dev = net->loopback_dev;
3073 dev_hold(dev);
3074 idev = in6_dev_get(dev);
3075 if (!idev) {
3076 err = -ENODEV;
3077 goto out;
3080 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3081 goto install_route;
3084 if (cfg->fc_flags & RTF_GATEWAY) {
3085 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3086 if (err)
3087 goto out;
3089 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3092 err = -ENODEV;
3093 if (!dev)
3094 goto out;
3096 if (idev->cnf.disable_ipv6) {
3097 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3098 err = -EACCES;
3099 goto out;
3102 if (!(dev->flags & IFF_UP)) {
3103 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3104 err = -ENETDOWN;
3105 goto out;
3108 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3109 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3110 NL_SET_ERR_MSG(extack, "Invalid source address");
3111 err = -EINVAL;
3112 goto out;
3114 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3115 rt->fib6_prefsrc.plen = 128;
3116 } else
3117 rt->fib6_prefsrc.plen = 0;
3119 rt->fib6_flags = cfg->fc_flags;
3121 install_route:
3122 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3123 !netif_carrier_ok(dev))
3124 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3125 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3126 rt->fib6_nh.nh_dev = dev;
3127 rt->fib6_table = table;
3129 cfg->fc_nlinfo.nl_net = dev_net(dev);
3131 if (idev)
3132 in6_dev_put(idev);
3134 return rt;
3135 out:
3136 if (dev)
3137 dev_put(dev);
3138 if (idev)
3139 in6_dev_put(idev);
3141 fib6_info_release(rt);
3142 return ERR_PTR(err);
3145 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3146 struct netlink_ext_ack *extack)
3148 struct fib6_info *rt;
3149 int err;
3151 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3152 if (IS_ERR(rt))
3153 return PTR_ERR(rt);
3155 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3156 fib6_info_release(rt);
3158 return err;
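/* ip6_route_add() is create-then-insert: ip6_route_info_create()
 * returns a fib6_info holding one reference, __ip6_ins_rt() links it
 * into the table under the table lock, and the initial reference is
 * dropped here whether or not the insert succeeded.
 */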
3161 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3163 struct net *net = info->nl_net;
3164 struct fib6_table *table;
3165 int err;
3167 if (rt == net->ipv6.fib6_null_entry) {
3168 err = -ENOENT;
3169 goto out;
3172 table = rt->fib6_table;
3173 spin_lock_bh(&table->tb6_lock);
3174 err = fib6_del(rt, info);
3175 spin_unlock_bh(&table->tb6_lock);
3177 out:
3178 fib6_info_release(rt);
3179 return err;
3182 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3184 struct nl_info info = { .nl_net = net };
3186 return __ip6_del_rt(rt, &info);
3189 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3191 struct nl_info *info = &cfg->fc_nlinfo;
3192 struct net *net = info->nl_net;
3193 struct sk_buff *skb = NULL;
3194 struct fib6_table *table;
3195 int err = -ENOENT;
3197 if (rt == net->ipv6.fib6_null_entry)
3198 goto out_put;
3199 table = rt->fib6_table;
3200 spin_lock_bh(&table->tb6_lock);
3202 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3203 struct fib6_info *sibling, *next_sibling;
3205 /* prefer to send a single notification with all hops */
3206 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3207 if (skb) {
3208 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3210 if (rt6_fill_node(net, skb, rt, NULL,
3211 NULL, NULL, 0, RTM_DELROUTE,
3212 info->portid, seq, 0) < 0) {
3213 kfree_skb(skb);
3214 skb = NULL;
3215 } else
3216 info->skip_notify = 1;
3219 list_for_each_entry_safe(sibling, next_sibling,
3220 &rt->fib6_siblings,
3221 fib6_siblings) {
3222 err = fib6_del(sibling, info);
3223 if (err)
3224 goto out_unlock;
3228 err = fib6_del(rt, info);
3229 out_unlock:
3230 spin_unlock_bh(&table->tb6_lock);
3231 out_put:
3232 fib6_info_release(rt);
3234 if (skb) {
3235 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3236 info->nlh, gfp_any());
3238 return err;
3241 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3243 int rc = -ESRCH;
3245 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3246 goto out;
3248 if (cfg->fc_flags & RTF_GATEWAY &&
3249 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3250 goto out;
3251 if (dst_hold_safe(&rt->dst))
3252 rc = rt6_remove_exception_rt(rt);
3253 out:
3254 return rc;
3257 static int ip6_route_del(struct fib6_config *cfg,
3258 struct netlink_ext_ack *extack)
3260 struct rt6_info *rt_cache;
3261 struct fib6_table *table;
3262 struct fib6_info *rt;
3263 struct fib6_node *fn;
3264 int err = -ESRCH;
3266 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3267 if (!table) {
3268 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3269 return err;
3272 rcu_read_lock();
3274 fn = fib6_locate(&table->tb6_root,
3275 &cfg->fc_dst, cfg->fc_dst_len,
3276 &cfg->fc_src, cfg->fc_src_len,
3277 !(cfg->fc_flags & RTF_CACHE));
3279 if (fn) {
3280 for_each_fib6_node_rt_rcu(fn) {
3281 if (cfg->fc_flags & RTF_CACHE) {
3282 int rc;
3284 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3285 &cfg->fc_src);
3286 if (rt_cache) {
3287 rc = ip6_del_cached_rt(rt_cache, cfg);
3288 if (rc != -ESRCH) {
3289 rcu_read_unlock();
3290 return rc;
3293 continue;
3295 if (cfg->fc_ifindex &&
3296 (!rt->fib6_nh.nh_dev ||
3297 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3298 continue;
3299 if (cfg->fc_flags & RTF_GATEWAY &&
3300 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3301 continue;
3302 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3303 continue;
3304 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3305 continue;
3306 fib6_info_hold(rt);
3307 rcu_read_unlock();
3309 /* if a gateway was specified, only delete the one hop */
3310 if (cfg->fc_flags & RTF_GATEWAY)
3311 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3313 return __ip6_del_rt_siblings(rt, cfg);
3316 rcu_read_unlock();
3318 return err;
3321 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3323 struct netevent_redirect netevent;
3324 struct rt6_info *rt, *nrt = NULL;
3325 struct ndisc_options ndopts;
3326 struct inet6_dev *in6_dev;
3327 struct neighbour *neigh;
3328 struct fib6_info *from;
3329 struct rd_msg *msg;
3330 int optlen, on_link;
3331 u8 *lladdr;
3333 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3334 optlen -= sizeof(*msg);
3336 if (optlen < 0) {
3337 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3338 return;
3341 msg = (struct rd_msg *)icmp6_hdr(skb);
3343 if (ipv6_addr_is_multicast(&msg->dest)) {
3344 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3345 return;
3348 on_link = 0;
3349 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3350 on_link = 1;
3351 } else if (ipv6_addr_type(&msg->target) !=
3352 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3353 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3354 return;
3357 in6_dev = __in6_dev_get(skb->dev);
3358 if (!in6_dev)
3359 return;
3360 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3361 return;
3363 /* RFC2461 8.1:
3364 * The IP source address of the Redirect MUST be the same as the current
3365 * first-hop router for the specified ICMP Destination Address. */
3368 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3369 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3370 return;
3373 lladdr = NULL;
3374 if (ndopts.nd_opts_tgt_lladdr) {
3375 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3376 skb->dev);
3377 if (!lladdr) {
3378 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3379 return;
3383 rt = (struct rt6_info *) dst;
3384 if (rt->rt6i_flags & RTF_REJECT) {
3385 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3386 return;
3389 /* Redirect received -> path was valid.
3390 * Look, redirects are sent only in response to data packets,
3391 * so that this nexthop apparently is reachable. --ANK */
3393 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3395 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3396 if (!neigh)
3397 return;
3400 /* We have finally decided to accept it. */
3403 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3404 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3405 NEIGH_UPDATE_F_OVERRIDE|
3406 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3407 NEIGH_UPDATE_F_ISROUTER)),
3408 NDISC_REDIRECT, &ndopts);
3410 rcu_read_lock();
3411 from = rcu_dereference(rt->from);
3412 fib6_info_hold(from);
3413 rcu_read_unlock();
3415 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3416 if (!nrt)
3417 goto out;
3419 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3420 if (on_link)
3421 nrt->rt6i_flags &= ~RTF_GATEWAY;
3423 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3425 /* No need to remove rt from the exception table if rt is
3426 * a cached route because rt6_insert_exception()
3427 * takes care of it. */
3429 if (rt6_insert_exception(nrt, from)) {
3430 dst_release_immediate(&nrt->dst);
3431 goto out;
3434 netevent.old = &rt->dst;
3435 netevent.new = &nrt->dst;
3436 netevent.daddr = &msg->dest;
3437 netevent.neigh = neigh;
3438 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3440 out:
3441 fib6_info_release(from);
3442 neigh_release(neigh);
3445 #ifdef CONFIG_IPV6_ROUTE_INFO
3446 static struct fib6_info *rt6_get_route_info(struct net *net,
3447 const struct in6_addr *prefix, int prefixlen,
3448 const struct in6_addr *gwaddr,
3449 struct net_device *dev)
3451 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3452 int ifindex = dev->ifindex;
3453 struct fib6_node *fn;
3454 struct fib6_info *rt = NULL;
3455 struct fib6_table *table;
3457 table = fib6_get_table(net, tb_id);
3458 if (!table)
3459 return NULL;
3461 rcu_read_lock();
3462 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3463 if (!fn)
3464 goto out;
3466 for_each_fib6_node_rt_rcu(fn) {
3467 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3468 continue;
3469 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3470 continue;
3471 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3472 continue;
3473 fib6_info_hold(rt);
3474 break;
3476 out:
3477 rcu_read_unlock();
3478 return rt;
3481 static struct fib6_info *rt6_add_route_info(struct net *net,
3482 const struct in6_addr *prefix, int prefixlen,
3483 const struct in6_addr *gwaddr,
3484 struct net_device *dev,
3485 unsigned int pref)
3487 struct fib6_config cfg = {
3488 .fc_metric = IP6_RT_PRIO_USER,
3489 .fc_ifindex = dev->ifindex,
3490 .fc_dst_len = prefixlen,
3491 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3492 RTF_UP | RTF_PREF(pref),
3493 .fc_protocol = RTPROT_RA,
3494 .fc_type = RTN_UNICAST,
3495 .fc_nlinfo.portid = 0,
3496 .fc_nlinfo.nlh = NULL,
3497 .fc_nlinfo.nl_net = net,
3500 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3501 cfg.fc_dst = *prefix;
3502 cfg.fc_gateway = *gwaddr;
3504 /* We should treat it as a default route if prefix length is 0. */
3505 if (!prefixlen)
3506 cfg.fc_flags |= RTF_DEFAULT;
3508 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3510 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3512 #endif
3514 struct fib6_info *rt6_get_dflt_router(struct net *net,
3515 const struct in6_addr *addr,
3516 struct net_device *dev)
3518 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3519 struct fib6_info *rt;
3520 struct fib6_table *table;
3522 table = fib6_get_table(net, tb_id);
3523 if (!table)
3524 return NULL;
3526 rcu_read_lock();
3527 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3528 if (dev == rt->fib6_nh.nh_dev &&
3529 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3530 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3531 break;
3533 if (rt)
3534 fib6_info_hold(rt);
3535 rcu_read_unlock();
3536 return rt;
3539 struct fib6_info *rt6_add_dflt_router(struct net *net,
3540 const struct in6_addr *gwaddr,
3541 struct net_device *dev,
3542 unsigned int pref)
3544 struct fib6_config cfg = {
3545 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3546 .fc_metric = IP6_RT_PRIO_USER,
3547 .fc_ifindex = dev->ifindex,
3548 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3549 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3550 .fc_protocol = RTPROT_RA,
3551 .fc_type = RTN_UNICAST,
3552 .fc_nlinfo.portid = 0,
3553 .fc_nlinfo.nlh = NULL,
3554 .fc_nlinfo.nl_net = net,
3557 cfg.fc_gateway = *gwaddr;
3559 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3560 struct fib6_table *table;
3562 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3563 if (table)
3564 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3567 return rt6_get_dflt_router(net, gwaddr, dev);
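/* Default routers learned from Router Advertisements land in
 * RT6_TABLE_DFLT with RTF_ADDRCONF | RTF_DEFAULT set, which is exactly
 * what __rt6_purge_dflt_routers() below matches on when accept_ra no
 * longer permits them.
 */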
3570 static void __rt6_purge_dflt_routers(struct net *net,
3571 struct fib6_table *table)
3573 struct fib6_info *rt;
3575 restart:
3576 rcu_read_lock();
3577 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3578 struct net_device *dev = fib6_info_nh_dev(rt);
3579 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3581 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3582 (!idev || idev->cnf.accept_ra != 2)) {
3583 fib6_info_hold(rt);
3584 rcu_read_unlock();
3585 ip6_del_rt(net, rt);
3586 goto restart;
3589 rcu_read_unlock();
3591 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3594 void rt6_purge_dflt_routers(struct net *net)
3596 struct fib6_table *table;
3597 struct hlist_head *head;
3598 unsigned int h;
3600 rcu_read_lock();
3602 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3603 head = &net->ipv6.fib_table_hash[h];
3604 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3605 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3606 __rt6_purge_dflt_routers(net, table);
3610 rcu_read_unlock();
3613 static void rtmsg_to_fib6_config(struct net *net,
3614 struct in6_rtmsg *rtmsg,
3615 struct fib6_config *cfg)
3617 memset(cfg, 0, sizeof(*cfg));
3619 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3620 : RT6_TABLE_MAIN;
3621 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3622 cfg->fc_metric = rtmsg->rtmsg_metric;
3623 cfg->fc_expires = rtmsg->rtmsg_info;
3624 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3625 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3626 cfg->fc_flags = rtmsg->rtmsg_flags;
3627 cfg->fc_type = rtmsg->rtmsg_type;
3629 cfg->fc_nlinfo.nl_net = net;
3631 cfg->fc_dst = rtmsg->rtmsg_dst;
3632 cfg->fc_src = rtmsg->rtmsg_src;
3633 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3636 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3638 struct fib6_config cfg;
3639 struct in6_rtmsg rtmsg;
3640 int err;
3642 switch (cmd) {
3643 case SIOCADDRT: /* Add a route */
3644 case SIOCDELRT: /* Delete a route */
3645 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3646 return -EPERM;
3647 err = copy_from_user(&rtmsg, arg,
3648 sizeof(struct in6_rtmsg));
3649 if (err)
3650 return -EFAULT;
3652 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3654 rtnl_lock();
3655 switch (cmd) {
3656 case SIOCADDRT:
3657 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3658 break;
3659 case SIOCDELRT:
3660 err = ip6_route_del(&cfg, NULL);
3661 break;
3662 default:
3663 err = -EINVAL;
3665 rtnl_unlock();
3667 return err;
3670 return -EINVAL;
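/* A minimal userspace sketch of driving this ioctl; illustrative only,
 * the concrete prefix, interface and metric are assumptions:
 *
 *	struct in6_rtmsg rtmsg = { 0 };
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rtmsg.rtmsg_dst);
 *	rtmsg.rtmsg_dst_len = 32;
 *	rtmsg.rtmsg_flags = RTF_UP;
 *	rtmsg.rtmsg_metric = 1;
 *	rtmsg.rtmsg_ifindex = if_nametoindex("eth0");
 *	ioctl(fd, SIOCADDRT, &rtmsg);	// fd: any AF_INET6 socket
 *
 * CAP_NET_ADMIN is required, as checked above.
 */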
3674 /* Drop the packet on the floor */
3677 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3679 int type;
3680 struct dst_entry *dst = skb_dst(skb);
3681 switch (ipstats_mib_noroutes) {
3682 case IPSTATS_MIB_INNOROUTES:
3683 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3684 if (type == IPV6_ADDR_ANY) {
3685 IP6_INC_STATS(dev_net(dst->dev),
3686 __in6_dev_get_safely(skb->dev),
3687 IPSTATS_MIB_INADDRERRORS);
3688 break;
3690 /* FALLTHROUGH */
3691 case IPSTATS_MIB_OUTNOROUTES:
3692 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3693 ipstats_mib_noroutes);
3694 break;
3696 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3697 kfree_skb(skb);
3698 return 0;
3701 static int ip6_pkt_discard(struct sk_buff *skb)
3703 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3706 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3708 skb->dev = skb_dst(skb)->dev;
3709 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3712 static int ip6_pkt_prohibit(struct sk_buff *skb)
3714 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3717 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3719 skb->dev = skb_dst(skb)->dev;
3720 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3724 /* Allocate a dst for a local (unicast / anycast) address. */
3727 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3728 struct inet6_dev *idev,
3729 const struct in6_addr *addr,
3730 bool anycast, gfp_t gfp_flags)
3732 u32 tb_id;
3733 struct net_device *dev = idev->dev;
3734 struct fib6_info *f6i;
3736 f6i = fib6_info_alloc(gfp_flags);
3737 if (!f6i)
3738 return ERR_PTR(-ENOMEM);
3740 f6i->dst_nocount = true;
3741 f6i->dst_host = true;
3742 f6i->fib6_protocol = RTPROT_KERNEL;
3743 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3744 if (anycast) {
3745 f6i->fib6_type = RTN_ANYCAST;
3746 f6i->fib6_flags |= RTF_ANYCAST;
3747 } else {
3748 f6i->fib6_type = RTN_LOCAL;
3749 f6i->fib6_flags |= RTF_LOCAL;
3752 f6i->fib6_nh.nh_gw = *addr;
3753 dev_hold(dev);
3754 f6i->fib6_nh.nh_dev = dev;
3755 f6i->fib6_dst.addr = *addr;
3756 f6i->fib6_dst.plen = 128;
3757 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3758 f6i->fib6_table = fib6_get_table(net, tb_id);
3760 return f6i;
3763 /* remove deleted ip from prefsrc entries */
3764 struct arg_dev_net_ip {
3765 struct net_device *dev;
3766 struct net *net;
3767 struct in6_addr *addr;
3770 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3772 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3773 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3774 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3776 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3777 rt != net->ipv6.fib6_null_entry &&
3778 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3779 spin_lock_bh(&rt6_exception_lock);
3780 /* remove prefsrc entry */
3781 rt->fib6_prefsrc.plen = 0;
3782 /* need to update cache as well */
3783 rt6_exceptions_remove_prefsrc(rt);
3784 spin_unlock_bh(&rt6_exception_lock);
3786 return 0;
3789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3791 struct net *net = dev_net(ifp->idev->dev);
3792 struct arg_dev_net_ip adni = {
3793 .dev = ifp->idev->dev,
3794 .net = net,
3795 .addr = &ifp->addr,
3797 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3800 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3802 /* Remove routers and update dst entries when a gateway turns into a host. */
3803 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3805 struct in6_addr *gateway = (struct in6_addr *)arg;
3807 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3808 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3809 return -1;
3812 /* Further clean up cached routes in exception table.
3813 * This is needed because a cached route may have a different
3814 * gateway than its 'parent' in the case of an ip redirect. */
3816 rt6_exceptions_clean_tohost(rt, gateway);
3818 return 0;
3821 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3823 fib6_clean_all(net, fib6_clean_tohost, gateway);
3826 struct arg_netdev_event {
3827 const struct net_device *dev;
3828 union {
3829 unsigned int nh_flags;
3830 unsigned long event;
3834 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3836 struct fib6_info *iter;
3837 struct fib6_node *fn;
3839 fn = rcu_dereference_protected(rt->fib6_node,
3840 lockdep_is_held(&rt->fib6_table->tb6_lock));
3841 iter = rcu_dereference_protected(fn->leaf,
3842 lockdep_is_held(&rt->fib6_table->tb6_lock));
3843 while (iter) {
3844 if (iter->fib6_metric == rt->fib6_metric &&
3845 iter->fib6_nsiblings)
3846 return iter;
3847 iter = rcu_dereference_protected(iter->fib6_next,
3848 lockdep_is_held(&rt->fib6_table->tb6_lock));
3851 return NULL;
3854 static bool rt6_is_dead(const struct fib6_info *rt)
3856 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3857 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3858 fib6_ignore_linkdown(rt)))
3859 return true;
3861 return false;
3864 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3866 struct fib6_info *iter;
3867 int total = 0;
3869 if (!rt6_is_dead(rt))
3870 total += rt->fib6_nh.nh_weight;
3872 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3873 if (!rt6_is_dead(iter))
3874 total += iter->fib6_nh.nh_weight;
3877 return total;
3880 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3882 int upper_bound = -1;
3884 if (!rt6_is_dead(rt)) {
3885 *weight += rt->fib6_nh.nh_weight;
3886 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3887 total) - 1;
3889 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3892 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3894 struct fib6_info *iter;
3895 int weight = 0;
3897 rt6_upper_bound_set(rt, &weight, total);
3899 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3900 rt6_upper_bound_set(iter, &weight, total);
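/* Worked example: two live siblings with weights 1 and 3 give
 * total = 4, so the bounds become (1 << 31)/4 - 1 = 0x1fffffff and
 * (4 << 31)/4 - 1 = 0x7fffffff; a 31-bit flow hash at or below the
 * first bound selects the first nexthop, anything above it selects
 * the second, yielding the intended 1:3 split.
 */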
3903 void rt6_multipath_rebalance(struct fib6_info *rt)
3905 struct fib6_info *first;
3906 int total;
3908 /* If the entire multipath route was marked for flushing,
3909 * there is no need to rebalance upon the removal of every
3910 * sibling route. */
3912 if (!rt->fib6_nsiblings || rt->should_flush)
3913 return;
3915 /* During lookup routes are evaluated in order, so we need to
3916 * make sure upper bounds are assigned from the first sibling
3917 * onwards. */
3919 first = rt6_multipath_first_sibling(rt);
3920 if (WARN_ON_ONCE(!first))
3921 return;
3923 total = rt6_multipath_total_weight(first);
3924 rt6_multipath_upper_bound_set(first, total);
3927 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3929 const struct arg_netdev_event *arg = p_arg;
3930 struct net *net = dev_net(arg->dev);
3932 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3933 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3934 fib6_update_sernum_upto_root(net, rt);
3935 rt6_multipath_rebalance(rt);
3938 return 0;
3941 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3943 struct arg_netdev_event arg = {
3944 .dev = dev,
3946 .nh_flags = nh_flags,
3950 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3951 arg.nh_flags |= RTNH_F_LINKDOWN;
3953 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3956 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3957 const struct net_device *dev)
3959 struct fib6_info *iter;
3961 if (rt->fib6_nh.nh_dev == dev)
3962 return true;
3963 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3964 if (iter->fib6_nh.nh_dev == dev)
3965 return true;
3967 return false;
3970 static void rt6_multipath_flush(struct fib6_info *rt)
3972 struct fib6_info *iter;
3974 rt->should_flush = 1;
3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976 iter->should_flush = 1;
3979 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3980 const struct net_device *down_dev)
3982 struct fib6_info *iter;
3983 unsigned int dead = 0;
3985 if (rt->fib6_nh.nh_dev == down_dev ||
3986 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3987 dead++;
3988 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3989 if (iter->fib6_nh.nh_dev == down_dev ||
3990 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3991 dead++;
3993 return dead;
3996 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3997 const struct net_device *dev,
3998 unsigned int nh_flags)
4000 struct fib6_info *iter;
4002 if (rt->fib6_nh.nh_dev == dev)
4003 rt->fib6_nh.nh_flags |= nh_flags;
4004 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4005 if (iter->fib6_nh.nh_dev == dev)
4006 iter->fib6_nh.nh_flags |= nh_flags;
4009 /* called with write lock held for table with rt */
4010 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4012 const struct arg_netdev_event *arg = p_arg;
4013 const struct net_device *dev = arg->dev;
4014 struct net *net = dev_net(dev);
4016 if (rt == net->ipv6.fib6_null_entry)
4017 return 0;
4019 switch (arg->event) {
4020 case NETDEV_UNREGISTER:
4021 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4022 case NETDEV_DOWN:
4023 if (rt->should_flush)
4024 return -1;
4025 if (!rt->fib6_nsiblings)
4026 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4027 if (rt6_multipath_uses_dev(rt, dev)) {
4028 unsigned int count;
4030 count = rt6_multipath_dead_count(rt, dev);
4031 if (rt->fib6_nsiblings + 1 == count) {
4032 rt6_multipath_flush(rt);
4033 return -1;
4035 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4036 RTNH_F_LINKDOWN);
4037 fib6_update_sernum(net, rt);
4038 rt6_multipath_rebalance(rt);
4040 return -2;
4041 case NETDEV_CHANGE:
4042 if (rt->fib6_nh.nh_dev != dev ||
4043 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4044 break;
4045 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4046 rt6_multipath_rebalance(rt);
4047 break;
4050 return 0;
4053 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4055 struct arg_netdev_event arg = {
4056 .dev = dev,
4058 .event = event,
4062 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4065 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4067 rt6_sync_down_dev(dev, event);
4068 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4069 neigh_ifdown(&nd_tbl, dev);
4072 struct rt6_mtu_change_arg {
4073 struct net_device *dev;
4074 unsigned int mtu;
4077 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4079 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4080 struct inet6_dev *idev;
4082 /* In IPv6 pmtu discovery is not optional,
4083 * so the RTAX_MTU lock cannot disable it.
4084 * We still use this lock to block changes
4085 * caused by addrconf/ndisc. */
4088 idev = __in6_dev_get(arg->dev);
4089 if (!idev)
4090 return 0;
4092 /* For an administrative MTU increase, there is no way to discover
4093 * the IPv6 PMTU increase, so the PMTU increase should be updated here.
4094 * Since RFC 1981 doesn't include administrative MTU increases,
4095 * updating the PMTU on increase is a MUST (e.g. for jumbo frames). */
4097 if (rt->fib6_nh.nh_dev == arg->dev &&
4098 !fib6_metric_locked(rt, RTAX_MTU)) {
4099 u32 mtu = rt->fib6_pmtu;
4101 if (mtu >= arg->mtu ||
4102 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4103 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4105 spin_lock_bh(&rt6_exception_lock);
4106 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4107 spin_unlock_bh(&rt6_exception_lock);
4109 return 0;
4112 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4114 struct rt6_mtu_change_arg arg = {
4115 .dev = dev,
4116 .mtu = mtu,
4119 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4122 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4123 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4124 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4125 [RTA_OIF] = { .type = NLA_U32 },
4126 [RTA_IIF] = { .type = NLA_U32 },
4127 [RTA_PRIORITY] = { .type = NLA_U32 },
4128 [RTA_METRICS] = { .type = NLA_NESTED },
4129 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4130 [RTA_PREF] = { .type = NLA_U8 },
4131 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4132 [RTA_ENCAP] = { .type = NLA_NESTED },
4133 [RTA_EXPIRES] = { .type = NLA_U32 },
4134 [RTA_UID] = { .type = NLA_U32 },
4135 [RTA_MARK] = { .type = NLA_U32 },
4136 [RTA_TABLE] = { .type = NLA_U32 },
4137 [RTA_IP_PROTO] = { .type = NLA_U8 },
4138 [RTA_SPORT] = { .type = NLA_U16 },
4139 [RTA_DPORT] = { .type = NLA_U16 },
4142 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4143 struct fib6_config *cfg,
4144 struct netlink_ext_ack *extack)
4146 struct rtmsg *rtm;
4147 struct nlattr *tb[RTA_MAX+1];
4148 unsigned int pref;
4149 int err;
4151 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4152 NULL);
4153 if (err < 0)
4154 goto errout;
4156 err = -EINVAL;
4157 rtm = nlmsg_data(nlh);
4158 memset(cfg, 0, sizeof(*cfg));
4160 cfg->fc_table = rtm->rtm_table;
4161 cfg->fc_dst_len = rtm->rtm_dst_len;
4162 cfg->fc_src_len = rtm->rtm_src_len;
4163 cfg->fc_flags = RTF_UP;
4164 cfg->fc_protocol = rtm->rtm_protocol;
4165 cfg->fc_type = rtm->rtm_type;
4167 if (rtm->rtm_type == RTN_UNREACHABLE ||
4168 rtm->rtm_type == RTN_BLACKHOLE ||
4169 rtm->rtm_type == RTN_PROHIBIT ||
4170 rtm->rtm_type == RTN_THROW)
4171 cfg->fc_flags |= RTF_REJECT;
4173 if (rtm->rtm_type == RTN_LOCAL)
4174 cfg->fc_flags |= RTF_LOCAL;
4176 if (rtm->rtm_flags & RTM_F_CLONED)
4177 cfg->fc_flags |= RTF_CACHE;
4179 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4181 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4182 cfg->fc_nlinfo.nlh = nlh;
4183 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4185 if (tb[RTA_GATEWAY]) {
4186 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4187 cfg->fc_flags |= RTF_GATEWAY;
4190 if (tb[RTA_DST]) {
4191 int plen = (rtm->rtm_dst_len + 7) >> 3;
4193 if (nla_len(tb[RTA_DST]) < plen)
4194 goto errout;
4196 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4199 if (tb[RTA_SRC]) {
4200 int plen = (rtm->rtm_src_len + 7) >> 3;
4202 if (nla_len(tb[RTA_SRC]) < plen)
4203 goto errout;
4205 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4208 if (tb[RTA_PREFSRC])
4209 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4211 if (tb[RTA_OIF])
4212 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4214 if (tb[RTA_PRIORITY])
4215 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4217 if (tb[RTA_METRICS]) {
4218 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4219 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4222 if (tb[RTA_TABLE])
4223 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4225 if (tb[RTA_MULTIPATH]) {
4226 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4227 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4229 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4230 cfg->fc_mp_len, extack);
4231 if (err < 0)
4232 goto errout;
4235 if (tb[RTA_PREF]) {
4236 pref = nla_get_u8(tb[RTA_PREF]);
4237 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4238 pref != ICMPV6_ROUTER_PREF_HIGH)
4239 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4240 cfg->fc_flags |= RTF_PREF(pref);
4243 if (tb[RTA_ENCAP])
4244 cfg->fc_encap = tb[RTA_ENCAP];
4246 if (tb[RTA_ENCAP_TYPE]) {
4247 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4249 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4250 if (err < 0)
4251 goto errout;
4254 if (tb[RTA_EXPIRES]) {
4255 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4257 if (addrconf_finite_timeout(timeout)) {
4258 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4259 cfg->fc_flags |= RTF_EXPIRES;
4263 err = 0;
4264 errout:
4265 return err;
4268 struct rt6_nh {
4269 struct fib6_info *fib6_info;
4270 struct fib6_config r_cfg;
4271 struct list_head next;
4274 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4276 struct rt6_nh *nh;
4278 list_for_each_entry(nh, rt6_nh_list, next) {
4279 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4280 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4281 nh->r_cfg.fc_ifindex);
4285 static int ip6_route_info_append(struct net *net,
4286 struct list_head *rt6_nh_list,
4287 struct fib6_info *rt,
4288 struct fib6_config *r_cfg)
4290 struct rt6_nh *nh;
4291 int err = -EEXIST;
4293 list_for_each_entry(nh, rt6_nh_list, next) {
4294 /* check if fib6_info already exists */
4295 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4296 return err;
4299 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4300 if (!nh)
4301 return -ENOMEM;
4302 nh->fib6_info = rt;
4303 err = ip6_convert_metrics(net, rt, r_cfg);
4304 if (err) {
4305 kfree(nh);
4306 return err;
4308 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4309 list_add_tail(&nh->next, rt6_nh_list);
4311 return 0;
4314 static void ip6_route_mpath_notify(struct fib6_info *rt,
4315 struct fib6_info *rt_last,
4316 struct nl_info *info,
4317 __u16 nlflags)
4319 /* if this is an APPEND route, then rt points to the first route
4320 * inserted and rt_last points to last route inserted. Userspace
4321 * wants a consistent dump of the route which starts at the first
4322 * nexthop. Since sibling routes are always added at the end of
4323 * the list, find the first sibling of the last route appended. */
4325 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4326 rt = list_first_entry(&rt_last->fib6_siblings,
4327 struct fib6_info,
4328 fib6_siblings);
4331 if (rt)
4332 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4335 static int ip6_route_multipath_add(struct fib6_config *cfg,
4336 struct netlink_ext_ack *extack)
4338 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4339 struct nl_info *info = &cfg->fc_nlinfo;
4340 struct fib6_config r_cfg;
4341 struct rtnexthop *rtnh;
4342 struct fib6_info *rt;
4343 struct rt6_nh *err_nh;
4344 struct rt6_nh *nh, *nh_safe;
4345 __u16 nlflags;
4346 int remaining;
4347 int attrlen;
4348 int err = 1;
4349 int nhn = 0;
4350 int replace = (cfg->fc_nlinfo.nlh &&
4351 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4352 LIST_HEAD(rt6_nh_list);
4354 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4355 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4356 nlflags |= NLM_F_APPEND;
4358 remaining = cfg->fc_mp_len;
4359 rtnh = (struct rtnexthop *)cfg->fc_mp;
4361 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4362 * fib6_info structs per nexthop */
4364 while (rtnh_ok(rtnh, remaining)) {
4365 memcpy(&r_cfg, cfg, sizeof(*cfg));
4366 if (rtnh->rtnh_ifindex)
4367 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4369 attrlen = rtnh_attrlen(rtnh);
4370 if (attrlen > 0) {
4371 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4373 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4374 if (nla) {
4375 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4376 r_cfg.fc_flags |= RTF_GATEWAY;
4378 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4379 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4380 if (nla)
4381 r_cfg.fc_encap_type = nla_get_u16(nla);
4384 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4385 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4386 if (IS_ERR(rt)) {
4387 err = PTR_ERR(rt);
4388 rt = NULL;
4389 goto cleanup;
4392 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4394 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4395 rt, &r_cfg);
4396 if (err) {
4397 fib6_info_release(rt);
4398 goto cleanup;
4401 rtnh = rtnh_next(rtnh, &remaining);
4404 /* for add and replace send one notification with all nexthops.
4405 * Skip the notification in fib6_add_rt2node and send one with
4406 * the full route when done
4408 info->skip_notify = 1;
4410 err_nh = NULL;
4411 list_for_each_entry(nh, &rt6_nh_list, next) {
4412 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4413 fib6_info_release(nh->fib6_info);
4415 if (!err) {
4416 /* save reference to last route successfully inserted */
4417 rt_last = nh->fib6_info;
4419 /* save reference to first route for notification */
4420 if (!rt_notif)
4421 rt_notif = nh->fib6_info;
4424 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4425 nh->fib6_info = NULL;
4426 if (err) {
4427 if (replace && nhn)
4428 ip6_print_replace_route_err(&rt6_nh_list);
4429 err_nh = nh;
4430 goto add_errout;
4433 /* Because each route is added like a single route we remove
4434 * these flags after the first nexthop: if there is a collision,
4435 * we have already failed to add the first nexthop:
4436 * fib6_add_rt2node() has rejected it; when replacing, old
4437 * nexthops have been replaced by first new, the rest should
4438 * be added to it.
4440 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4441 NLM_F_REPLACE);
4442 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
4443 nhn++;
4446 /* success ... tell user about new route */
4447 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4448 goto cleanup;
4450 add_errout:
4451 /* send notification for routes that were added so that
4452 * the delete notifications sent by ip6_route_del are
4453 * coherent
4455 if (rt_notif)
4456 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4458 /* Delete routes that were already added */
4459 list_for_each_entry(nh, &rt6_nh_list, next) {
4460 if (err_nh == nh)
4461 break;
4462 ip6_route_del(&nh->r_cfg, extack);
4465 cleanup:
4466 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4467 if (nh->fib6_info)
4468 fib6_info_release(nh->fib6_info);
4469 list_del(&nh->next);
4470 kfree(nh);
4473 return err;
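/* Multipath delete walks the RTA_MULTIPATH entries and removes each
 * nexthop independently; the last error seen is what gets returned.
 */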
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

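/* Upper bound on the netlink message size needed for this route; for
 * multipath routes every sibling contributes one rtnexthop block.
 */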
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

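/* Emit the attributes describing a single nexthop (gateway, oif,
 * lwtunnel encap) and fold its state into RTNH_F_* flags. skip_oif is
 * set for multipath encoding, where the ifindex already lives in
 * struct rtnexthop.
 */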
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

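/* Build one RTM_NEWROUTE message for @rt. @dst, @dest and @src are
 * only set when reporting the result of a specific lookup
 * (RTM_GETROUTE); dumps and notifications pass NULL and describe the
 * FIB entry itself.
 */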
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

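/* Callback for RTM_GETROUTE dumps, invoked once per FIB entry; skips
 * the null entry and, when the request set RTM_F_PREFIX, any route
 * that is not a prefix route.
 */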
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

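/* RTM_GETROUTE: build a flow from the request attributes, run an input
 * or output route lookup, and unicast the result (or, with
 * RTM_F_FIB_MATCH, the matching FIB entry) back to the requester.
 */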
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

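/* Notify RTNLGRP_IPV6_ROUTE listeners about @rt; @event is RTM_NEWROUTE
 * or RTM_DELROUTE.
 */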
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

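/* Keep the per-netns special routes (null, and with multiple tables
 * prohibit/blackhole) bound to the loopback device as it comes and
 * goes.
 */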
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

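/* Per-netns init: clone the dst_ops and special-route templates for
 * this namespace and seed the routing sysctl defaults.
 */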
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

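/* Subsystem init: set up the dst kmem cache, the pernet subsystems,
 * the fib6 core, xfrm and policy rules, the rtnetlink handlers and the
 * per-cpu uncached-route lists, unwinding in reverse order on error.
 */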
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}