4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
26 * This file contains consumer routines of the IPv4 forwarding engine
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
35 #include <sys/cmn_err.h>
36 #include <sys/policy.h>
38 #include <sys/systm.h>
39 #include <sys/strsun.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <sys/strsubr.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
47 #include <net/if_dl.h>
48 #include <netinet/ip6.h>
49 #include <netinet/icmp6.h>
51 #include <inet/ipsec_impl.h>
52 #include <inet/common.h>
54 #include <inet/mib2.h>
56 #include <inet/ip_impl.h>
58 #include <inet/ip_ndp.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_ftable.h>
63 #include <inet/ip_rts.h>
66 #include <net/pfkeyv2.h>
67 #include <inet/sadb.h>
69 #include <inet/ipclassifier.h>
71 #include <net/radix.h>
/*
 * Evaluates to non-zero when ire is treated as a default route: either its
 * ire_type has IRE_DEFAULT set, or it has IRE_INTERFACE set and its ire_addr
 * is 0.  The argument is expanded more than once, so it must be free of side
 * effects.
 */
#define	IS_DEFAULT_ROUTE(ire)						\
	((((ire)->ire_type & IRE_DEFAULT) != 0) ||			\
	((((ire)->ire_type & IRE_INTERFACE) != 0) &&			\
	((ire)->ire_addr == 0)))
/*
 * Selects the strict source multihoming setting from the given IP stack:
 * ips_ipv6_strict_src_multihoming when isv6 is non-zero, otherwise
 * ips_ip_strict_src_multihoming.
 *
 * Both macro parameters are parenthesized in the expansion so that callers
 * may pass arbitrary expressions (e.g. "a || b" for isv6, or "&stack" for
 * ipst) without operator-precedence surprises.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)					\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :		\
	(ipst)->ips_ip_strict_src_multihoming)
81 static ire_t
*route_to_dst(const struct sockaddr
*, zoneid_t
, ip_stack_t
*);
82 static void ire_del_host_redir(ire_t
*, char *);
83 static boolean_t
ire_find_best_route(struct radix_node
*, void *);
86 * Lookup a route in forwarding table. A specific lookup is indicated by
87 * passing the required parameters and indicating the match required in the
90 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
93 ire_ftable_lookup_v4(ipaddr_t addr
, ipaddr_t mask
, ipaddr_t gateway
,
94 int type
, const ill_t
*ill
, zoneid_t zoneid
, int flags
, uint32_t xmit_hint
,
95 ip_stack_t
*ipst
, uint_t
*generationp
)
98 struct rt_sockaddr rdst
, rmask
;
100 ire_ftable_args_t margs
;
102 ASSERT(ill
== NULL
|| !ill
->ill_isv6
);
105 * ire_match_args() will dereference ill if MATCH_IRE_ILL
108 if ((flags
& (MATCH_IRE_ILL
|MATCH_IRE_SRC_ILL
)) && (ill
== NULL
))
111 bzero(&rdst
, sizeof (rdst
));
112 rdst
.rt_sin_len
= sizeof (rdst
);
113 rdst
.rt_sin_family
= AF_INET
;
114 rdst
.rt_sin_addr
.s_addr
= addr
;
116 bzero(&rmask
, sizeof (rmask
));
117 rmask
.rt_sin_len
= sizeof (rmask
);
118 rmask
.rt_sin_family
= AF_INET
;
119 rmask
.rt_sin_addr
.s_addr
= mask
;
121 bzero(&margs
, sizeof (margs
));
122 margs
.ift_addr
= addr
;
123 margs
.ift_mask
= mask
;
124 margs
.ift_gateway
= gateway
;
125 margs
.ift_type
= type
;
127 margs
.ift_zoneid
= zoneid
;
128 margs
.ift_flags
= flags
;
131 * The flags argument passed to ire_ftable_lookup may cause the
132 * search to return, not the longest matching prefix, but the
133 * "best matching prefix", i.e., the longest prefix that also
134 * satisfies constraints imposed via the permutation of flags
135 * passed in. To achieve this, we invoke ire_match_args() on
136 * each matching leaf in the radix tree. ire_match_args is
137 * invoked by the callback function ire_find_best_route()
138 * We hold the global tree lock in read mode when calling
139 * rn_match_args. Before dropping the global tree lock, ensure
140 * that the radix node can't be deleted by incrementing ire_refcnt.
142 RADIX_NODE_HEAD_RLOCK(ipst
->ips_ip_ftable
);
143 rt
= (struct rt_entry
*)ipst
->ips_ip_ftable
->rnh_matchaddr_args(&rdst
,
144 ipst
->ips_ip_ftable
, ire_find_best_route
, &margs
);
145 ire
= margs
.ift_best_ire
;
147 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
152 DTRACE_PROBE2(ire__found
, ire_ftable_args_t
*, &margs
, ire_t
*, ire
);
155 * round-robin only if we have more than one route in the bucket.
156 * ips_ip_ecmp_behavior controls when we do ECMP
158 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
161 if (ire
->ire_bucket
->irb_ire_cnt
> 1 && !(flags
& MATCH_IRE_GW
)) {
162 if (ipst
->ips_ip_ecmp_behavior
== 2 ||
163 (ipst
->ips_ip_ecmp_behavior
== 1 &&
164 IS_DEFAULT_ROUTE(ire
))) {
167 margs
.ift_best_ire
= NULL
;
168 next_ire
= ire_round_robin(ire
->ire_bucket
, &margs
,
169 xmit_hint
, ire
, ipst
);
170 if (next_ire
== NULL
) {
171 /* keep ire if next_ire is null */
180 /* Return generation before dropping lock */
181 if (generationp
!= NULL
)
182 *generationp
= ire
->ire_generation
;
184 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
187 * For shared-IP zones we need additional checks to what was
188 * done in ire_match_args to make sure IRE_LOCALs are handled.
190 * When ip_restrict_interzone_loopback is set, then
191 * we ensure that IRE_LOCAL are only used for loopback
192 * between zones when the logical "Ethernet" would
193 * have looped them back. That is, if in the absence of
194 * the IRE_LOCAL we would have sent the packet out the
197 if ((ire
->ire_type
& IRE_LOCAL
) && zoneid
!= ALL_ZONES
&&
198 ire
->ire_zoneid
!= zoneid
&& ire
->ire_zoneid
!= ALL_ZONES
&&
199 ipst
->ips_ip_restrict_interzone_loopback
) {
200 ire
= ire_alt_local(ire
, zoneid
, ill
, generationp
);
207 * This function is called by
208 * ip_input/ire_route_recursive when doing a route lookup on only the
209 * destination address.
211 * The optimizations of this function over ire_ftable_lookup are:
212 * o removing unnecessary flag matching
213 * o doing longest prefix match instead of overloading it further
214 * with the unnecessary "best_prefix_match"
216 * If no route is found we return IRE_NOROUTE.
219 ire_ftable_lookup_simple_v4(ipaddr_t addr
, uint32_t xmit_hint
, ip_stack_t
*ipst
,
223 struct rt_sockaddr rdst
;
227 rdst
.rt_sin_len
= sizeof (rdst
);
228 rdst
.rt_sin_family
= AF_INET
;
229 rdst
.rt_sin_addr
.s_addr
= addr
;
232 * This is basically inlining a simpler version of ire_match_args
234 RADIX_NODE_HEAD_RLOCK(ipst
->ips_ip_ftable
);
236 rt
= (struct rt_entry
*)ipst
->ips_ip_ftable
->rnh_matchaddr_args(&rdst
,
237 ipst
->ips_ip_ftable
, NULL
, NULL
);
243 if (irb
->irb_ire_cnt
== 0)
246 rw_enter(&irb
->irb_lock
, RW_READER
);
249 rw_exit(&irb
->irb_lock
);
252 while (IRE_IS_CONDEMNED(ire
)) {
255 rw_exit(&irb
->irb_lock
);
260 /* we have a ire that matches */
262 rw_exit(&irb
->irb_lock
);
265 * round-robin only if we have more than one route in the bucket.
266 * ips_ip_ecmp_behavior controls when we do ECMP
268 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
271 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
272 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
273 * and the IRE_INTERFACEs are likely to be shorter matches.
275 if (ire
->ire_bucket
->irb_ire_cnt
> 1) {
276 if (ipst
->ips_ip_ecmp_behavior
== 2 ||
277 (ipst
->ips_ip_ecmp_behavior
== 1 &&
278 IS_DEFAULT_ROUTE(ire
))) {
280 ire_ftable_args_t margs
;
282 bzero(&margs
, sizeof (margs
));
283 margs
.ift_addr
= addr
;
284 margs
.ift_zoneid
= ALL_ZONES
;
286 next_ire
= ire_round_robin(ire
->ire_bucket
, &margs
,
287 xmit_hint
, ire
, ipst
);
288 if (next_ire
== NULL
) {
289 /* keep ire if next_ire is null */
290 if (generationp
!= NULL
)
291 *generationp
= ire
->ire_generation
;
292 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
299 /* Return generation before dropping lock */
300 if (generationp
!= NULL
)
301 *generationp
= ire
->ire_generation
;
303 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
306 * Since we only did ALL_ZONES matches there is no special handling
307 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
312 if (generationp
!= NULL
)
313 *generationp
= IRE_GENERATION_VERIFY
;
315 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
316 return (ire_reject(ipst
, B_FALSE
));
320 * Find the ill matching a multicast group.
321 * Allows different routes for multicast addresses
322 * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
323 * which point at different interfaces. This is used when IP_MULTICAST_IF
324 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
325 * specify the interface to join on.
327 * Supports link-local addresses by using ire_route_recursive which follows
328 * the ill when recursing.
330 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
331 * We have a setsrcp argument for the same reason.
334 ire_lookup_multi_ill_v4(ipaddr_t group
, zoneid_t zoneid
, ip_stack_t
*ipst
,
340 ire
= ire_route_recursive_v4(group
, 0, NULL
, zoneid
, MATCH_IRE_DSTONLY
,
341 IRR_NONE
, 0, ipst
, setsrcp
, NULL
);
343 if (ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
)) {
348 ill
= ire_nexthop_ill(ire
);
354 * Delete the passed in ire if the gateway addr matches
357 ire_del_host_redir(ire_t
*ire
, char *gateway
)
359 if ((ire
->ire_flags
& RTF_DYNAMIC
) &&
360 (ire
->ire_gateway_addr
== *(ipaddr_t
*)gateway
))
365 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
366 * pointing at the specified gateway and
367 * delete them. This routine is called only
368 * when a default gateway is going away.
371 ire_delete_host_redirects(ipaddr_t gateway
, ip_stack_t
*ipst
)
373 struct rtfuncarg rtfarg
;
375 bzero(&rtfarg
, sizeof (rtfarg
));
376 rtfarg
.rt_func
= ire_del_host_redir
;
377 rtfarg
.rt_arg
= (void *)&gateway
;
378 rtfarg
.rt_zoneid
= ALL_ZONES
;
379 rtfarg
.rt_ipst
= ipst
;
380 (void) ipst
->ips_ip_ftable
->rnh_walktree_mt(ipst
->ips_ip_ftable
,
381 rtfunc
, &rtfarg
, irb_refhold_rn
, irb_refrele_rn
);
385 * Obtain the rt_entry and rt_irb for the route to be added to
387 * First attempt to add a node to the radix tree via rn_addroute. If the
388 * route already exists, return the bucket for the existing route.
390 * Locking notes: Need to hold the global radix tree lock in write mode to
391 * add a radix node. To prevent the node from being deleted, ire_get_bucket()
392 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
393 * while holding the irb_lock, but not the radix tree lock.
396 ire_get_bucket(ire_t
*ire
)
398 struct radix_node
*rn
;
400 struct rt_sockaddr rmask
, rdst
;
402 ip_stack_t
*ipst
= ire
->ire_ipst
;
404 ASSERT(ipst
->ips_ip_ftable
!= NULL
);
406 /* first try to see if route exists (based on rtalloc1) */
407 bzero(&rdst
, sizeof (rdst
));
408 rdst
.rt_sin_len
= sizeof (rdst
);
409 rdst
.rt_sin_family
= AF_INET
;
410 rdst
.rt_sin_addr
.s_addr
= ire
->ire_addr
;
412 bzero(&rmask
, sizeof (rmask
));
413 rmask
.rt_sin_len
= sizeof (rmask
);
414 rmask
.rt_sin_family
= AF_INET
;
415 rmask
.rt_sin_addr
.s_addr
= ire
->ire_mask
;
418 * add the route. based on BSD's rtrequest1(RTM_ADD)
420 R_Malloc(rt
, rt_entry_cache
, sizeof (*rt
));
421 /* kmem_alloc failed */
425 bzero(rt
, sizeof (*rt
));
426 rt
->rt_nodes
->rn_key
= (char *)&rt
->rt_dst
;
429 irb
->irb_marks
|= IRB_MARK_DYNAMIC
; /* dynamically allocated/freed */
430 irb
->irb_ipst
= ipst
;
431 rw_init(&irb
->irb_lock
, NULL
, RW_DEFAULT
, NULL
);
432 RADIX_NODE_HEAD_WLOCK(ipst
->ips_ip_ftable
);
433 rn
= ipst
->ips_ip_ftable
->rnh_addaddr(&rt
->rt_dst
, &rmask
,
434 ipst
->ips_ip_ftable
, (struct radix_node
*)rt
);
436 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
437 Free(rt
, rt_entry_cache
);
440 RADIX_NODE_HEAD_RLOCK(ipst
->ips_ip_ftable
);
441 rn
= ipst
->ips_ip_ftable
->rnh_lookup(&rdst
, &rmask
,
442 ipst
->ips_ip_ftable
);
443 if (rn
!= NULL
&& ((rn
->rn_flags
& RNF_ROOT
) == 0)) {
444 /* found a non-root match */
445 rt
= (struct rt_entry
*)rn
;
452 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
457 * This function is used when the caller wants to know the outbound
458 * interface for a packet given only the address.
459 * If this is an offlink IP address and there are multiple
460 * routes to this destination, this routine will utilise the
461 * first route it finds to IP address
467 ifindex_lookup(const struct sockaddr
*ipaddr
, zoneid_t zoneid
)
475 if (zoneid
== ALL_ZONES
)
476 ns
= netstack_find_by_zoneid(GLOBAL_ZONEID
);
478 ns
= netstack_find_by_zoneid(zoneid
);
482 * For exclusive stacks we set the zoneid to zero
483 * since IP uses the global zoneid in the exclusive stacks.
485 if (ns
->netstack_stackid
!= GLOBAL_NETSTACKID
)
486 zoneid
= GLOBAL_ZONEID
;
487 ipst
= ns
->netstack_ip
;
489 ASSERT(ipaddr
->sa_family
== AF_INET
|| ipaddr
->sa_family
== AF_INET6
);
491 if ((ire
= route_to_dst(ipaddr
, zoneid
, ipst
)) != NULL
) {
492 ill
= ire_nexthop_ill(ire
);
494 ifindex
= ill
->ill_phyint
->phyint_ifindex
;
504 * Routine to find the route to a destination. If an ifindex is supplied
505 * it tries to match the route to the corresponding ipif for the ifindex
508 route_to_dst(const struct sockaddr
*dst_addr
, zoneid_t zoneid
, ip_stack_t
*ipst
)
513 match_flags
= MATCH_IRE_DSTONLY
;
515 if (dst_addr
->sa_family
== AF_INET
) {
516 ire
= ire_route_recursive_v4(
517 ((struct sockaddr_in
*)dst_addr
)->sin_addr
.s_addr
, 0, NULL
,
518 zoneid
, match_flags
, IRR_ALLOCATE
, 0, ipst
, NULL
, NULL
);
520 ire
= ire_route_recursive_v6(
521 &((struct sockaddr_in6
*)dst_addr
)->sin6_addr
, 0, NULL
,
522 zoneid
, match_flags
, IRR_ALLOCATE
, 0, ipst
, NULL
, NULL
);
525 if (ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
)) {
533 * This routine is called by IP Filter to send a packet out on the wire
534 * to a specified destination (which may be onlink or offlink). The ifindex may
535 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
536 * an outgoing interface and requires the nexthop to be on that interface.
537 * IP WILL NOT DO the following to the data packet before sending it out:
542 * If the packet has been prepared for hardware checksum then it will be
543 * passed off to ip_send_align_cksum() to check that the flags set on the
544 * packet are in alignment with the capabilities of the new outgoing NIC.
547 * 0: IP was able to send off the data pkt
548 * ECOMM: Could not send packet
549 * ENONET No route to dst. It is up to the caller
550 * to send icmp unreachable error message,
551 * EINPROGRESS The macaddr of the onlink dst or that
552 * of the offlink dst's nexthop needs to get
553 * resolved before packet can be sent to dst.
554 * Thus transmission is not guaranteed.
555 * Note: No longer have visibility to the ARP queue
556 * hence no EINPROGRESS.
559 ipfil_sendpkt(const struct sockaddr
*dst_addr
, mblk_t
*mp
, uint_t ifindex
,
570 if (zoneid
== ALL_ZONES
)
571 ns
= netstack_find_by_zoneid(GLOBAL_ZONEID
);
573 ns
= netstack_find_by_zoneid(zoneid
);
577 * For exclusive stacks we set the zoneid to zero
578 * since IP uses the global zoneid in the exclusive stacks.
580 if (ns
->netstack_stackid
!= GLOBAL_NETSTACKID
)
581 zoneid
= GLOBAL_ZONEID
;
582 ipst
= ns
->netstack_ip
;
584 ASSERT(dst_addr
->sa_family
== AF_INET
||
585 dst_addr
->sa_family
== AF_INET6
);
587 bzero(&ixas
, sizeof (ixas
));
589 * No IPsec, no fragmentation, and don't let any hooks see
592 ixas
.ixa_flags
= IXAF_NO_IPSEC
| IXAF_DONTFRAG
| IXAF_NO_PFHOOK
;
593 ixas
.ixa_cred
= kcred
;
594 ixas
.ixa_cpid
= NOPID
;
595 ixas
.ixa_ipst
= ipst
;
596 ixas
.ixa_ifindex
= ifindex
;
598 if (dst_addr
->sa_family
== AF_INET
) {
599 ipha_t
*ipha
= (ipha_t
*)mp
->b_rptr
;
601 ixas
.ixa_flags
|= IXAF_IS_IPV4
;
602 nexthop
= ((struct sockaddr_in
*)dst_addr
)->sin_addr
.s_addr
;
603 if (nexthop
!= ipha
->ipha_dst
) {
604 ixas
.ixa_flags
|= IXAF_NEXTHOP_SET
;
605 ixas
.ixa_nexthop_v4
= nexthop
;
607 ixas
.ixa_multicast_ttl
= ipha
->ipha_ttl
;
609 ip6_t
*ip6h
= (ip6_t
*)mp
->b_rptr
;
610 in6_addr_t
*nexthop6
;
612 nexthop6
= &((struct sockaddr_in6
*)dst_addr
)->sin6_addr
;
613 if (!IN6_ARE_ADDR_EQUAL(nexthop6
, &ip6h
->ip6_dst
)) {
614 ixas
.ixa_flags
|= IXAF_NEXTHOP_SET
;
615 ixas
.ixa_nexthop_v6
= *nexthop6
;
617 ixas
.ixa_multicast_ttl
= ip6h
->ip6_hops
;
619 error
= ip_output_simple(mp
, &ixas
);
640 * callback function provided by ire_ftable_lookup when calling
641 * rn_match_args(). Invoke ire_match_args on each matching leaf node in
645 ire_find_best_route(struct radix_node
*rn
, void *arg
)
647 struct rt_entry
*rt
= (struct rt_entry
*)rn
;
650 ire_ftable_args_t
*margs
= arg
;
655 irb_ptr
= &rt
->rt_irb
;
657 if (irb_ptr
->irb_ire_cnt
== 0)
660 rw_enter(&irb_ptr
->irb_lock
, RW_READER
);
661 for (ire
= irb_ptr
->irb_ire
; ire
!= NULL
; ire
= ire
->ire_next
) {
662 if (IRE_IS_CONDEMNED(ire
))
664 ASSERT((margs
->ift_flags
& MATCH_IRE_SHORTERMASK
) == 0);
665 if (margs
->ift_flags
& MATCH_IRE_MASK
)
666 match_mask
= margs
->ift_mask
;
668 match_mask
= ire
->ire_mask
;
670 if (ire_match_args(ire
, margs
->ift_addr
, match_mask
,
671 margs
->ift_gateway
, margs
->ift_type
, margs
->ift_ill
,
672 margs
->ift_zoneid
, margs
->ift_flags
)) {
674 rw_exit(&irb_ptr
->irb_lock
);
675 margs
->ift_best_ire
= ire
;
679 rw_exit(&irb_ptr
->irb_lock
);
684 * ftable irb_t structures are dynamically allocated, and we need to
685 * check if the irb_t (and associated ftable tree attachment) needs to
686 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
688 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
689 * - no other threads holding references to ire's in the bucket,
690 * i.e., irb_nire == 0
691 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
692 * - need to hold the global tree lock and irb_lock in write mode.
695 irb_refrele_ftable(irb_t
*irb
)
698 rw_enter(&irb
->irb_lock
, RW_WRITER
);
699 ASSERT(irb
->irb_refcnt
!= 0);
700 if (irb
->irb_refcnt
!= 1) {
702 * Someone has a reference to this radix node
703 * or there is some bucket walker.
706 rw_exit(&irb
->irb_lock
);
710 * There is no other walker, nor is there any
711 * other thread that holds a direct ref to this
712 * radix node. Do the clean up if needed. Call
713 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
715 if (irb
->irb_marks
& IRB_MARK_CONDEMNED
) {
718 ire_list
= ire_unlink(irb
);
719 rw_exit(&irb
->irb_lock
);
721 if (ire_list
!= NULL
)
722 ire_cleanup(ire_list
);
724 * more CONDEMNED entries could have
725 * been added while we dropped the lock,
726 * so we have to re-check.
732 * Now check if there are still any ires
733 * associated with this radix node.
735 if (irb
->irb_nire
!= 0) {
737 * someone is still holding on
738 * to ires in this bucket
741 rw_exit(&irb
->irb_lock
);
745 * Everything is clear. Zero walkers,
746 * Zero threads with a ref to this
747 * radix node, Zero ires associated with
748 * this radix node. Due to lock order,
749 * check the above conditions again
750 * after grabbing all locks in the right order
752 rw_exit(&irb
->irb_lock
);
753 if (irb_inactive(irb
))
756 * irb_inactive could not free the irb.
757 * See if there are any walkers, if not
758 * try to clean up again.
766 * IRE iterator used by ire_ftable_lookup to process multiple equal
767 * routes. Given a starting point in the hash list (hash), walk the IREs
768 * in the bucket skipping deleted entries. We treat the bucket as a circular
769 * list for the purposes of walking it.
770 * Returns the IRE (held) that corresponds to the hash value. If that IRE is
771 * not applicable (ire_match_args failed) then it returns a subsequent one.
772 * If we fail to find an IRE we return NULL.
774 * Assumes that the caller holds a reference on the IRE bucket and a read lock
775 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
777 * Applies to IPv4 and IPv6.
779 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
780 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
781 * in which the zone has an IP address. We check this for the global zone
782 * even if no shared-IP zones are configured.
785 ire_round_robin(irb_t
*irb_ptr
, ire_ftable_args_t
*margs
, uint_t hash
,
786 ire_t
*orig_ire
, ip_stack_t
*ipst
)
788 ire_t
*ire
, *maybe_ire
= NULL
;
792 /* Fold in more bits from the hint/hash */
793 hash
= hash
^ (hash
>> 8) ^ (hash
>> 16);
795 rw_enter(&irb_ptr
->irb_lock
, RW_WRITER
);
796 maxwalk
= irb_ptr
->irb_ire_cnt
; /* Excludes condemned */
798 rw_exit(&irb_ptr
->irb_lock
);
803 irb_refhold_locked(irb_ptr
);
804 rw_exit(&irb_ptr
->irb_lock
);
807 * Round-robin the routers list looking for a route that
808 * matches the passed in parameters.
809 * First we skip "hash" number of non-condemned IREs.
810 * Then we match the IRE.
811 * If we find an ire which has a non-zero ire_badcnt then we remember
812 * it and keep on looking for a lower ire_badcnt.
813 * If we come to the end of the list we continue (treat the
814 * bucket list as a circular list) but we match less than "max"
817 ire
= irb_ptr
->irb_ire
;
818 while (maxwalk
> 0) {
819 if (IRE_IS_CONDEMNED(ire
))
822 /* Skip the first "hash" entries to do ECMP */
829 * Note: Since IPv6 has hash buckets instead of radix
830 * buckets we need to explicitly compare the addresses.
831 * That makes this less efficient since we will be called
832 * even if there are no alternatives just because the
833 * bucket has multiple IREs for different addresses.
835 if (ire
->ire_ipversion
== IPV6_VERSION
) {
836 if (!IN6_ARE_ADDR_EQUAL(&orig_ire
->ire_addr_v6
,
842 * For some reason find_best_route uses ire_mask. We do
845 if (ire
->ire_ipversion
== IPV4_VERSION
?
846 !ire_match_args(ire
, margs
->ift_addr
,
847 ire
->ire_mask
, margs
->ift_gateway
,
848 margs
->ift_type
, margs
->ift_ill
, margs
->ift_zoneid
,
850 !ire_match_args_v6(ire
, &margs
->ift_addr_v6
,
851 &ire
->ire_mask_v6
, &margs
->ift_gateway_v6
,
852 margs
->ift_type
, margs
->ift_ill
, margs
->ift_zoneid
,
856 if (margs
->ift_zoneid
!= ALL_ZONES
&&
857 (ire
->ire_type
& IRE_OFFLINK
)) {
859 * When we're in a zone, we're only
860 * interested in routers that are
861 * reachable through ipifs within our zone.
863 if (ire
->ire_ipversion
== IPV4_VERSION
) {
864 if (!ire_gateway_ok_zone_v4(
865 ire
->ire_gateway_addr
, margs
->ift_zoneid
,
866 ire
->ire_ill
, ipst
, B_TRUE
))
869 if (!ire_gateway_ok_zone_v6(
870 &ire
->ire_gateway_addr_v6
,
871 margs
->ift_zoneid
, ire
->ire_ill
,
876 mutex_enter(&ire
->ire_lock
);
877 /* Look for stale ire_badcnt and clear */
878 if (ire
->ire_badcnt
!= 0 &&
879 (TICK_TO_SEC(ddi_get_lbolt64()) - ire
->ire_last_badcnt
>
880 ipst
->ips_ip_ire_badcnt_lifetime
))
882 mutex_exit(&ire
->ire_lock
);
884 if (ire
->ire_badcnt
== 0) {
885 /* We found one with a zero badcnt; done */
888 * Care needed since irb_refrele grabs WLOCK to free
891 if (ire
->ire_ipversion
== IPV4_VERSION
) {
892 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
893 irb_refrele(irb_ptr
);
894 RADIX_NODE_HEAD_RLOCK(ipst
->ips_ip_ftable
);
896 rw_exit(&ipst
->ips_ip6_ire_head_lock
);
897 irb_refrele(irb_ptr
);
898 rw_enter(&ipst
->ips_ip6_ire_head_lock
,
904 * keep looking to see if there is a better (lower
905 * badcnt) matching IRE, but save this one as a last resort.
906 * If we find a lower badcnt pick that one as the last resort.
908 if (maybe_ire
== NULL
) {
910 maybe_badcnt
= ire
->ire_badcnt
;
911 } else if (ire
->ire_badcnt
< maybe_badcnt
) {
913 maybe_badcnt
= ire
->ire_badcnt
;
921 ire
= irb_ptr
->irb_ire
;
923 if (maybe_ire
!= NULL
)
924 ire_refhold(maybe_ire
);
926 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
927 if (ire
->ire_ipversion
== IPV4_VERSION
) {
928 RADIX_NODE_HEAD_UNLOCK(ipst
->ips_ip_ftable
);
929 irb_refrele(irb_ptr
);
930 RADIX_NODE_HEAD_RLOCK(ipst
->ips_ip_ftable
);
932 rw_exit(&ipst
->ips_ip6_ire_head_lock
);
933 irb_refrele(irb_ptr
);
934 rw_enter(&ipst
->ips_ip6_ire_head_lock
, RW_READER
);
940 irb_refhold_rn(struct radix_node
*rn
)
942 if ((rn
->rn_flags
& RNF_ROOT
) == 0)
943 irb_refhold(&((rt_t
*)(rn
))->rt_irb
);
947 irb_refrele_rn(struct radix_node
*rn
)
949 if ((rn
->rn_flags
& RNF_ROOT
) == 0)
950 irb_refrele_ftable(&((rt_t
*)(rn
))->rt_irb
);
955 * ip_select_src_ill() is used by ip_select_route() to find the src_ill
956 * to be used for source-aware routing table lookup. This function will
957 * ignore IPIF_UNNUMBERED interface addresses, and will only return a
958 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
962 ip_select_src_ill(const in6_addr_t
*v6src
, zoneid_t zoneid
, ip_stack_t
*ipst
)
966 boolean_t isv6
= !IN6_IS_ADDR_V4MAPPED(v6src
);
970 ipif
= ipif_lookup_addr_nondup_v6(v6src
, NULL
, zoneid
, ipst
);
972 IN6_V4MAPPED_TO_IPADDR(v6src
, v4src
);
973 ipif
= ipif_lookup_addr_nondup(v4src
, NULL
, zoneid
, ipst
);
977 ill
= ipif
->ipif_ill
;
984 * verify that v6src is configured on ill
987 ip_verify_src_on_ill(const in6_addr_t v6src
, ill_t
*ill
, zoneid_t zoneid
)
995 ipst
= ill
->ill_ipst
;
998 ipif
= ipif_lookup_addr_nondup_v6(&v6src
, ill
, zoneid
, ipst
);
1000 IN6_V4MAPPED_TO_IPADDR(&v6src
, v4src
);
1001 ipif
= ipif_lookup_addr_nondup(v4src
, ill
, zoneid
, ipst
);
1013 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
1014 * routes this routine sets up a ire_nce_cache as well. The caller needs to
1015 * lookup an nce for the multicast case.
1017 * When src_multihoming is set to 2 (strict src multihoming) we use the source
1018 * address to select the interface and route. If IP_BOUND_IF etc are
1019 * specified, we require that they specify an interface on which the
1020 * source address is assigned.
1022 * When src_multihoming is set to 1 (preferred src aware route
1023 * selection) the unicast lookup prefers a matching source
1024 * (i.e., that the route points out an ill on which the source is assigned), but
1025 * if no such route is found we fallback to not considering the source in the
1028 * We skip the src_multihoming check when the source isn't (yet) set, and
1029 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
1030 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
1031 * when secpolicy_net_rawaccess().
1034 ip_select_route(const in6_addr_t
*v6dst
, const in6_addr_t v6src
,
1035 ip_xmit_attr_t
*ixa
, uint_t
*generationp
, in6_addr_t
*setsrcp
,
1042 ip_stack_t
*ipst
= ixa
->ixa_ipst
;
1044 in6_addr_t v6nexthop
;
1045 iaflags_t ixaflags
= ixa
->ixa_flags
;
1047 boolean_t preferred_src_aware
= B_FALSE
;
1048 boolean_t verify_src
;
1049 boolean_t isv6
= !(ixa
->ixa_flags
& IXAF_IS_IPV4
);
1050 int src_multihoming
= IP_SRC_MULTIHOMING(isv6
, ipst
);
1053 * We only verify that the src has been configured on a selected
1054 * interface if the src is not :: or INADDR_ANY, and if the
1055 * IXAF_VERIFY_SOURCE flag is set.
1057 verify_src
= (!V6_OR_V4_INADDR_ANY(v6src
) &&
1058 (ixa
->ixa_flags
& IXAF_VERIFY_SOURCE
));
1061 IN6_V4MAPPED_TO_IPADDR(v6dst
, v4dst
);
1062 if (setsrcp
!= NULL
)
1063 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp
));
1065 ASSERT(*errorp
== 0);
1068 * The content of the ixa will be different if IP_NEXTHOP,
1069 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1072 if (isv6
? IN6_IS_ADDR_MULTICAST(v6dst
) : CLASSD(v4dst
)) {
1073 /* Pick up the IRE_MULTICAST for the ill */
1074 if (ixa
->ixa_multicast_ifindex
!= 0) {
1075 ill
= ill_lookup_on_ifindex(ixa
->ixa_multicast_ifindex
,
1077 } else if (ixaflags
& IXAF_SCOPEID_SET
) {
1078 /* sin6_scope_id takes precedence over ixa_ifindex */
1079 ASSERT(ixa
->ixa_scopeid
!= 0);
1080 ill
= ill_lookup_on_ifindex(ixa
->ixa_scopeid
,
1082 } else if (ixa
->ixa_ifindex
!= 0) {
1084 * In the ipmp case, the ixa_ifindex is set to
1085 * point at an under_ill and we would return the
1086 * ire_multicast() corresponding to that under_ill.
1088 ill
= ill_lookup_on_ifindex(ixa
->ixa_ifindex
,
1090 } else if (src_multihoming
!= 0 && verify_src
) {
1091 /* Look up the ill based on the source address */
1092 ill
= ip_select_src_ill(&v6src
, ixa
->ixa_zoneid
, ipst
);
1094 * Since we looked up the ill from the source there
1095 * is no need to verify that the source is on the ill
1098 verify_src
= B_FALSE
;
1099 if (ill
!= NULL
&& IS_VNI(ill
)) {
1100 ill_t
*usesrc
= ill
;
1102 ill
= ill_lookup_usesrc(usesrc
);
1103 ill_refrele(usesrc
);
1106 ipaddr_t v4setsrc
= INADDR_ANY
;
1108 ill
= ill_lookup_group_v4(v4dst
, ixa
->ixa_zoneid
,
1110 if (setsrcp
!= NULL
)
1111 IN6_IPADDR_TO_V4MAPPED(v4setsrc
, setsrcp
);
1113 ill
= ill_lookup_group_v6(v6dst
, ixa
->ixa_zoneid
,
1116 if (ill
!= NULL
&& IS_VNI(ill
)) {
1123 /* Get a hold on the IRE_NOROUTE */
1124 ire
= ire_reject(ipst
, isv6
);
1127 if (!(ill
->ill_flags
& ILLF_MULTICAST
)) {
1130 *errorp
= EHOSTUNREACH
;
1131 /* Get a hold on the IRE_NOROUTE */
1132 ire
= ire_reject(ipst
, isv6
);
1136 * If we are doing the strictest src_multihoming, then
1137 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
1138 * an interface that is consistent with the source address.
1140 if (verify_src
&& src_multihoming
== 2 &&
1141 !ip_verify_src_on_ill(v6src
, ill
, ixa
->ixa_zoneid
)) {
1143 *errorp
= EADDRNOTAVAIL
;
1145 /* Get a hold on the IRE_NOROUTE */
1146 ire
= ire_reject(ipst
, isv6
);
1149 /* Get a refcnt on the single IRE_MULTICAST per ill */
1150 ire
= ire_multicast(ill
);
1152 if (generationp
!= NULL
)
1153 *generationp
= ire
->ire_generation
;
1154 if (errorp
!= NULL
&&
1155 (ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
))) {
1156 *errorp
= EHOSTUNREACH
;
1161 /* Now for unicast */
1162 if (ixa
->ixa_ifindex
!= 0 || (ixaflags
& IXAF_SCOPEID_SET
)) {
1163 if (ixaflags
& IXAF_SCOPEID_SET
) {
1164 /* sin6_scope_id takes precedence over ixa_ifindex */
1165 ASSERT(ixa
->ixa_scopeid
!= 0);
1166 ill
= ill_lookup_on_ifindex(ixa
->ixa_scopeid
,
1169 ASSERT(ixa
->ixa_ifindex
!= 0);
1170 ill
= ill_lookup_on_ifindex(ixa
->ixa_ifindex
,
1173 if (ill
!= NULL
&& IS_VNI(ill
)) {
1180 /* Get a hold on the IRE_NOROUTE */
1181 ire
= ire_reject(ipst
, isv6
);
1185 match_args
|= MATCH_IRE_ILL
;
1188 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1189 * so for both of them we need to be able look for an under
1192 if (IS_UNDER_IPMP(ill
))
1193 match_args
|= MATCH_IRE_TESTHIDDEN
;
1196 * If we are doing the strictest src_multihoming, then
1197 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
1198 * an interface that is consistent with the source address.
1200 if (src_multihoming
== 2 &&
1201 !ip_verify_src_on_ill(v6src
, ill
, ixa
->ixa_zoneid
)) {
1203 *errorp
= EADDRNOTAVAIL
;
1205 /* Get a hold on the IRE_NOROUTE */
1206 ire
= ire_reject(ipst
, isv6
);
1209 } else if (src_multihoming
!= 0 && verify_src
) {
1210 /* Look up the ill based on the source address */
1211 ill
= ip_select_src_ill(&v6src
, ixa
->ixa_zoneid
, ipst
);
1213 char addrbuf
[INET6_ADDRSTRLEN
];
1215 ip3dbg(("%s not a valid src for unicast",
1216 inet_ntop(AF_INET6
, &v6src
, addrbuf
,
1217 sizeof (addrbuf
))));
1219 *errorp
= EADDRNOTAVAIL
;
1220 /* Get a hold on the IRE_NOROUTE */
1221 ire
= ire_reject(ipst
, isv6
);
1224 match_args
|= MATCH_IRE_SRC_ILL
;
1225 preferred_src_aware
= (src_multihoming
== 1);
1228 if (ixaflags
& IXAF_NEXTHOP_SET
) {
1229 /* IP_NEXTHOP was set */
1230 v6nexthop
= ixa
->ixa_nexthop_v6
;
1238 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1239 * we only look for an onlink IRE.
1241 if (ixaflags
& (IXAF_DONTROUTE
|IXAF_NEXTHOP_SET
)) {
1242 match_args
|= MATCH_IRE_TYPE
;
1243 ire_type
= IRE_ONLINK
;
1249 ipaddr_t v4setsrc
= INADDR_ANY
;
1251 IN6_V4MAPPED_TO_IPADDR(&v6nexthop
, v4nexthop
);
1252 ire
= ire_route_recursive_v4(v4nexthop
, ire_type
, ill
,
1253 ixa
->ixa_zoneid
, match_args
, IRR_ALLOCATE
,
1254 ixa
->ixa_xmit_hint
, ipst
, &v4setsrc
, generationp
);
1255 if (setsrcp
!= NULL
)
1256 IN6_IPADDR_TO_V4MAPPED(v4setsrc
, setsrcp
);
1258 ire
= ire_route_recursive_v6(&v6nexthop
, ire_type
, ill
,
1259 ixa
->ixa_zoneid
, match_args
, IRR_ALLOCATE
,
1260 ixa
->ixa_xmit_hint
, ipst
, setsrcp
, generationp
);
1264 if (match_args
& MATCH_IRE_TESTHIDDEN
) {
1265 ip3dbg(("looking for hidden; dst %x ire %p\n",
1266 v4dst
, (void *)ire
));
1273 if ((ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
)) ||
1274 (ire
->ire_type
& IRE_MULTICAST
)) {
1275 if (preferred_src_aware
) {
1277 * "Preferred Source Aware" send mode. If we cannot
1278 * find an ire whose ire_ill had the desired source
1279 * address retry after relaxing the ill matching
1283 preferred_src_aware
= B_FALSE
;
1284 match_args
&= ~MATCH_IRE_SRC_ILL
;
1287 /* No ire_nce_cache */
1291 /* Setup ire_nce_cache if it doesn't exist or is condemned. */
1292 mutex_enter(&ire
->ire_lock
);
1293 nce
= ire
->ire_nce_cache
;
1294 if (nce
== NULL
|| nce
->nce_is_condemned
) {
1295 mutex_exit(&ire
->ire_lock
);
1296 (void) ire_revalidate_nce(ire
);
1298 mutex_exit(&ire
->ire_lock
);
1304 * Find a route given some xmit attributes and a packet.
1305 * Generic for IPv4 and IPv6
1307 * This never returns NULL. But when it returns the IRE_NOROUTE
1308 * it might set errorp.
1311 ip_select_route_pkt(mblk_t
*mp
, ip_xmit_attr_t
*ixa
, uint_t
*generationp
,
1314 if (ixa
->ixa_flags
& IXAF_IS_IPV4
) {
1315 ipha_t
*ipha
= (ipha_t
*)mp
->b_rptr
;
1316 in6_addr_t v6dst
, v6src
;
1318 IN6_IPADDR_TO_V4MAPPED(ipha
->ipha_dst
, &v6dst
);
1319 IN6_IPADDR_TO_V4MAPPED(ipha
->ipha_src
, &v6src
);
1321 return (ip_select_route(&v6dst
, v6src
, ixa
, generationp
,
1324 ip6_t
*ip6h
= (ip6_t
*)mp
->b_rptr
;
1326 return (ip_select_route(&ip6h
->ip6_dst
, ip6h
->ip6_src
,
1327 ixa
, generationp
, NULL
, errorp
));
1332 ip_select_route_v4(ipaddr_t dst
, ipaddr_t src
, ip_xmit_attr_t
*ixa
,
1333 uint_t
*generationp
, ipaddr_t
*v4setsrcp
, int *errorp
)
1335 in6_addr_t v6dst
, v6src
;
1339 ASSERT(ixa
->ixa_flags
& IXAF_IS_IPV4
);
1341 IN6_IPADDR_TO_V4MAPPED(dst
, &v6dst
);
1342 IN6_IPADDR_TO_V4MAPPED(src
, &v6src
);
1344 setsrc
= ipv6_all_zeros
;
1345 ire
= ip_select_route(&v6dst
, v6src
, ixa
, generationp
, &setsrc
, errorp
);
1346 if (v4setsrcp
!= NULL
)
1347 IN6_V4MAPPED_TO_IPADDR(&setsrc
, *v4setsrcp
);
1352 * Recursively look for a route to the destination. Can also match on
1353 * the zoneid and ill. Used for the data paths. See also
1354 * ire_route_recursive.
1356 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1357 * create an IRE_IF_CLONE. This is used on the receive side when we are not
1359 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1360 * resolve the gateway.
1362 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1365 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1367 * Allow at most one RTF_INDIRECT.
1370 ire_route_recursive_impl_v4(ire_t
*ire
,
1371 ipaddr_t nexthop
, uint_t ire_type
, const ill_t
*ill_arg
,
1372 zoneid_t zoneid
, uint_t match_args
, uint_t irr_flags
, uint32_t xmit_hint
,
1373 ip_stack_t
*ipst
, ipaddr_t
*setsrcp
, uint_t
*generationp
)
1376 ire_t
*ires
[MAX_IRE_RECURSION
];
1378 uint_t generations
[MAX_IRE_RECURSION
];
1379 boolean_t need_refrele
= B_FALSE
;
1380 boolean_t invalidate
= B_FALSE
;
1382 uint_t maskoff
= (IRE_LOCAL
|IRE_LOOPBACK
|IRE_BROADCAST
);
1384 if (setsrcp
!= NULL
)
1385 ASSERT(*setsrcp
== INADDR_ANY
);
1388 * We iterate up to three times to resolve a route, even though
1389 * we have four slots in the array. The extra slot is for an
1390 * IRE_IF_CLONE we might need to create.
1393 while (i
< MAX_IRE_RECURSION
- 1) {
1394 /* ire_ftable_lookup handles round-robin/ECMP */
1396 ire
= ire_ftable_lookup_v4(nexthop
, 0, 0, ire_type
,
1397 (ill
!= NULL
? ill
: ill_arg
), zoneid
, match_args
,
1398 xmit_hint
, ipst
, &generation
);
1400 /* Caller passed it; extra hold since we will rele */
1402 if (generationp
!= NULL
)
1403 generation
= *generationp
;
1405 generation
= IRE_GENERATION_VERIFY
;
1408 if (i
> 0 && (irr_flags
& IRR_INCOMPLETE
)) {
1412 ire
= ire_reject(ipst
, B_FALSE
);
1417 /* Need to return the ire with RTF_REJECT|BLACKHOLE */
1418 if (ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
))
1421 ASSERT(!(ire
->ire_type
& IRE_MULTICAST
)); /* Not in ftable */
1423 * Verify that the IRE_IF_CLONE has a consistent generation
1426 if ((ire
->ire_type
& IRE_IF_CLONE
) && !ire_clone_verify(ire
)) {
1433 * Don't allow anything unusual past the first iteration.
1434 * After the first lookup, we should no longer look for
1435 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1438 * In addition, after we have found a direct IRE_OFFLINK,
1439 * we should only look for interface or clone routes.
1441 match_args
|= MATCH_IRE_DIRECT
; /* no more RTF_INDIRECTs */
1443 if ((ire
->ire_type
& IRE_OFFLINK
) &&
1444 !(ire
->ire_flags
& RTF_INDIRECT
)) {
1445 ire_type
= IRE_IF_ALL
;
1448 * no more local, loopback, broadcast routes
1450 if (!(match_args
& MATCH_IRE_TYPE
))
1451 ire_type
= (IRE_OFFLINK
|IRE_ONLINK
);
1452 ire_type
&= ~maskoff
;
1454 match_args
|= MATCH_IRE_TYPE
;
1456 /* We have a usable IRE */
1458 generations
[i
] = generation
;
1461 /* The first RTF_SETSRC address is passed back if setsrcp */
1462 if ((ire
->ire_flags
& RTF_SETSRC
) &&
1463 setsrcp
!= NULL
&& *setsrcp
== INADDR_ANY
) {
1464 ASSERT(ire
->ire_setsrc_addr
!= INADDR_ANY
);
1465 *setsrcp
= ire
->ire_setsrc_addr
;
1469 * Check if we have a short-cut pointer to an IRE for this
1470 * destination, and that the cached dependency isn't stale.
1471 * In that case we've rejoined an existing tree towards a
1472 * parent, thus we don't need to continue the loop to
1473 * discover the rest of the tree.
1475 mutex_enter(&ire
->ire_lock
);
1476 if (ire
->ire_dep_parent
!= NULL
&&
1477 ire
->ire_dep_parent
->ire_generation
==
1478 ire
->ire_dep_parent_generation
) {
1479 mutex_exit(&ire
->ire_lock
);
1483 mutex_exit(&ire
->ire_lock
);
1486 * If this type should have an ire_nce_cache (even if it
1487 * doesn't yet have one) then we are done. Includes
1488 * IRE_INTERFACE with a full 32 bit mask.
1490 if (ire
->ire_nce_capable
) {
1494 ASSERT(!(ire
->ire_type
& IRE_IF_CLONE
));
1496 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1497 * particular destination
1499 if (ire
->ire_type
& IRE_INTERFACE
) {
1500 in6_addr_t v6nexthop
;
1503 ASSERT(ire
->ire_masklen
!= IPV4_ABITS
);
1506 * In the case of ip_input and ILLF_FORWARDING not
1507 * being set, and in the case of RTM_GET, there is
1508 * no point in allocating an IRE_IF_CLONE. We return
1509 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1510 * result in a ire_dep_parent which is IRE_IF_*
1511 * without an IRE_IF_CLONE.
1512 * We recover from that when we need to send packets
1513 * by ensuring that the generations become
1514 * IRE_GENERATION_VERIFY in this case.
1516 if (!(irr_flags
& IRR_ALLOCATE
)) {
1517 invalidate
= B_TRUE
;
1522 IN6_IPADDR_TO_V4MAPPED(nexthop
, &v6nexthop
);
1524 clone
= ire_create_if_clone(ire
, &v6nexthop
,
1526 if (clone
== NULL
) {
1528 * Temporary failure - no memory.
1529 * Don't want caller to cache IRE_NOROUTE.
1531 invalidate
= B_TRUE
;
1532 ire
= ire_blackhole(ipst
, B_FALSE
);
1536 * Make clone next to last entry and the
1537 * IRE_INTERFACE the last in the dependency
1538 * chain since the clone depends on the
1542 ASSERT(i
< MAX_IRE_RECURSION
);
1544 ires
[i
] = ires
[i
-1];
1545 generations
[i
] = generations
[i
-1];
1547 generations
[i
-1] = generation
;
1555 * We only match on the type and optionally ILL when
1556 * recursing. The type match is used by some callers
1557 * to exclude certain types (such as IRE_IF_CLONE or
1558 * IRE_LOCAL|IRE_LOOPBACK).
1560 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1561 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1562 * ire_ill, so we set ill to the ire_ill;
1564 match_args
&= (MATCH_IRE_TYPE
| MATCH_IRE_DIRECT
);
1565 nexthop
= ire
->ire_gateway_addr
;
1566 if (ill
== NULL
&& ire
->ire_ill
!= NULL
) {
1568 need_refrele
= B_TRUE
;
1570 match_args
|= MATCH_IRE_ILL
;
1574 ASSERT(ire
== NULL
);
1575 ire
= ire_reject(ipst
, B_FALSE
);
1578 ASSERT(ire
!= NULL
);
1583 /* cleanup ires[i] */
1584 ire_dep_unbuild(ires
, i
);
1585 for (j
= 0; j
< i
; j
++)
1586 ire_refrele(ires
[j
]);
1588 ASSERT((ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
)) ||
1589 (irr_flags
& IRR_INCOMPLETE
));
1591 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1592 * ip_select_route since the reject or lack of memory might be gone.
1594 if (generationp
!= NULL
)
1595 *generationp
= IRE_GENERATION_VERIFY
;
1599 ASSERT(ire
== NULL
);
1605 /* Build dependencies */
1606 if (i
> 1 && !ire_dep_build(ires
, generations
, i
)) {
1607 /* Something in chain was condemned; tear it apart */
1608 ire
= ire_reject(ipst
, B_FALSE
);
1613 * Release all refholds except the one for ires[0] that we
1614 * will return to the caller.
1616 for (j
= 1; j
< i
; j
++)
1617 ire_refrele(ires
[j
]);
1621 * Since we needed to allocate but couldn't we need to make
1622 * sure that the dependency chain is rebuilt the next time.
1624 ire_dep_invalidate_generations(ires
[0]);
1625 generation
= IRE_GENERATION_VERIFY
;
1628 * IREs can have been added or deleted while we did the
1629 * recursive lookup and we can't catch those until we've built
1630 * the dependencies. We verify the stored
1631 * ire_dep_parent_generation to catch any such changes and
1632 * return IRE_GENERATION_VERIFY (which will cause
1633 * ip_select_route to be called again so we can redo the
1634 * recursive lookup next time we send a packet.
1636 if (ires
[0]->ire_dep_parent
== NULL
)
1637 generation
= ires
[0]->ire_generation
;
1639 generation
= ire_dep_validate_generations(ires
[0]);
1640 if (generations
[0] != ires
[0]->ire_generation
) {
1641 /* Something changed at the top */
1642 generation
= IRE_GENERATION_VERIFY
;
1645 if (generationp
!= NULL
)
1646 *generationp
= generation
;
1652 ire_route_recursive_v4(ipaddr_t nexthop
, uint_t ire_type
, const ill_t
*ill
,
1653 zoneid_t zoneid
, uint_t match_args
, uint_t irr_flags
, uint32_t xmit_hint
,
1654 ip_stack_t
*ipst
, ipaddr_t
*setsrcp
, uint_t
*generationp
)
1656 return (ire_route_recursive_impl_v4(NULL
, nexthop
, ire_type
, ill
,
1657 zoneid
, match_args
, irr_flags
, xmit_hint
, ipst
, setsrcp
,
1662 * Recursively look for a route to the destination.
1663 * We only handle a destination match here, yet we have the same arguments
1664 * as the full match to allow function pointers to select between the two.
1666 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1669 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1671 * Allow at most one RTF_INDIRECT.
1674 ire_route_recursive_dstonly_v4(ipaddr_t nexthop
, uint_t irr_flags
,
1675 uint32_t xmit_hint
, ip_stack_t
*ipst
)
1681 /* ire_ftable_lookup handles round-robin/ECMP */
1682 ire
= ire_ftable_lookup_simple_v4(nexthop
, xmit_hint
, ipst
,
1684 ASSERT(ire
!= NULL
);
1686 * If the IRE has a current cached parent we know that the whole
1687 * parent chain is current, hence we don't need to discover and
1688 * build any dependencies by doing a recursive lookup.
1690 mutex_enter(&ire
->ire_lock
);
1691 if (ire
->ire_dep_parent
!= NULL
) {
1692 if (ire
->ire_dep_parent
->ire_generation
==
1693 ire
->ire_dep_parent_generation
) {
1694 mutex_exit(&ire
->ire_lock
);
1697 mutex_exit(&ire
->ire_lock
);
1699 mutex_exit(&ire
->ire_lock
);
1701 * If this type should have an ire_nce_cache (even if it
1702 * doesn't yet have one) then we are done. Includes
1703 * IRE_INTERFACE with a full 32 bit mask.
1705 if (ire
->ire_nce_capable
)
1710 * Fallback to loop in the normal code starting with the ire
1711 * we found. Normally this would return the same ire.
1713 ire1
= ire_route_recursive_impl_v4(ire
, nexthop
, 0, NULL
, ALL_ZONES
,
1714 MATCH_IRE_DSTONLY
, irr_flags
, xmit_hint
, ipst
, NULL
, &generation
);
1720 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1721 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1722 * are not consistent, and TRUE otherwise.
1725 ire_clone_verify(ire_t
*ire
)
1727 ASSERT((ire
->ire_type
& IRE_IF_CLONE
) != 0);
1728 mutex_enter(&ire
->ire_lock
);
1729 if (ire
->ire_dep_parent
!= NULL
&&
1730 ire
->ire_dep_parent
->ire_generation
!=
1731 ire
->ire_dep_parent_generation
) {
1732 mutex_exit(&ire
->ire_lock
);
1736 mutex_exit(&ire
->ire_lock
);