kill tsol ("Trusted Solaris") aka TX ("Trusted Extensions")
[unleashed.git] / kernel / net / ip / ip_ftable.c
blob8f43f0eafba377e4fa6fc0f59c21cd8e08bb40ad
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
26 * This file contains consumer routines of the IPv4 forwarding engine
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #include <sys/dlpi.h>
34 #include <sys/ddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/policy.h>
38 #include <sys/systm.h>
39 #include <sys/strsun.h>
40 #include <sys/kmem.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <sys/strsubr.h>
44 #include <net/if.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
47 #include <net/if_dl.h>
48 #include <netinet/ip6.h>
49 #include <netinet/icmp6.h>
51 #include <inet/ipsec_impl.h>
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_ndp.h>
59 #include <inet/arp.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_ftable.h>
63 #include <inet/ip_rts.h>
64 #include <inet/nd.h>
66 #include <net/pfkeyv2.h>
67 #include <inet/sadb.h>
68 #include <inet/tcp.h>
69 #include <inet/ipclassifier.h>
70 #include <sys/zone.h>
71 #include <net/radix.h>
/*
 * True when the ire is a default route: either its type has IRE_DEFAULT
 * set, or it is an IRE_INTERFACE whose ire_addr is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Pick the strict source multihoming setting for the address family:
 * the IPv6 value when isv6 is true, the IPv4 value otherwise.
 * Both arguments are parenthesized so that any expression (for example
 * "&stack" or "a = b") can be passed without changing how the macro parses.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)				\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :	\
	    (ipst)->ips_ip_strict_src_multihoming)
81 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
82 static void ire_del_host_redir(ire_t *, char *);
83 static boolean_t ire_find_best_route(struct radix_node *, void *);
86 * Lookup a route in forwarding table. A specific lookup is indicated by
87 * passing the required parameters and indicating the match required in the
88 * flag field.
90 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
92 ire_t *
93 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
94 int type, const ill_t *ill, zoneid_t zoneid, int flags, uint32_t xmit_hint,
95 ip_stack_t *ipst, uint_t *generationp)
97 ire_t *ire;
98 struct rt_sockaddr rdst, rmask;
99 struct rt_entry *rt;
100 ire_ftable_args_t margs;
102 ASSERT(ill == NULL || !ill->ill_isv6);
105 * ire_match_args() will dereference ill if MATCH_IRE_ILL
106 * is set.
108 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
109 return (NULL);
111 bzero(&rdst, sizeof (rdst));
112 rdst.rt_sin_len = sizeof (rdst);
113 rdst.rt_sin_family = AF_INET;
114 rdst.rt_sin_addr.s_addr = addr;
116 bzero(&rmask, sizeof (rmask));
117 rmask.rt_sin_len = sizeof (rmask);
118 rmask.rt_sin_family = AF_INET;
119 rmask.rt_sin_addr.s_addr = mask;
121 bzero(&margs, sizeof (margs));
122 margs.ift_addr = addr;
123 margs.ift_mask = mask;
124 margs.ift_gateway = gateway;
125 margs.ift_type = type;
126 margs.ift_ill = ill;
127 margs.ift_zoneid = zoneid;
128 margs.ift_flags = flags;
131 * The flags argument passed to ire_ftable_lookup may cause the
132 * search to return, not the longest matching prefix, but the
133 * "best matching prefix", i.e., the longest prefix that also
134 * satisfies constraints imposed via the permutation of flags
135 * passed in. To achieve this, we invoke ire_match_args() on
136 * each matching leaf in the radix tree. ire_match_args is
137 * invoked by the callback function ire_find_best_route()
138 * We hold the global tree lock in read mode when calling
139 * rn_match_args. Before dropping the global tree lock, ensure
140 * that the radix node can't be deleted by incrementing ire_refcnt.
142 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
143 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
144 ipst->ips_ip_ftable, ire_find_best_route, &margs);
145 ire = margs.ift_best_ire;
146 if (rt == NULL) {
147 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
148 return (NULL);
150 ASSERT(ire != NULL);
152 DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
155 * round-robin only if we have more than one route in the bucket.
156 * ips_ip_ecmp_behavior controls when we do ECMP
157 * 2: always
158 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
159 * 0: never
161 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
162 if (ipst->ips_ip_ecmp_behavior == 2 ||
163 (ipst->ips_ip_ecmp_behavior == 1 &&
164 IS_DEFAULT_ROUTE(ire))) {
165 ire_t *next_ire;
167 margs.ift_best_ire = NULL;
168 next_ire = ire_round_robin(ire->ire_bucket, &margs,
169 xmit_hint, ire, ipst);
170 if (next_ire == NULL) {
171 /* keep ire if next_ire is null */
172 goto done;
174 ire_refrele(ire);
175 ire = next_ire;
179 done:
180 /* Return generation before dropping lock */
181 if (generationp != NULL)
182 *generationp = ire->ire_generation;
184 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
187 * For shared-IP zones we need additional checks to what was
188 * done in ire_match_args to make sure IRE_LOCALs are handled.
190 * When ip_restrict_interzone_loopback is set, then
191 * we ensure that IRE_LOCAL are only used for loopback
192 * between zones when the logical "Ethernet" would
193 * have looped them back. That is, if in the absense of
194 * the IRE_LOCAL we would have sent to packet out the
195 * same ill.
197 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
198 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
199 ipst->ips_ip_restrict_interzone_loopback) {
200 ire = ire_alt_local(ire, zoneid, ill, generationp);
201 ASSERT(ire != NULL);
203 return (ire);
207 * This function is called by
208 * ip_input/ire_route_recursive when doing a route lookup on only the
209 * destination address.
211 * The optimizations of this function over ire_ftable_lookup are:
212 * o removing unnecessary flag matching
213 * o doing longest prefix match instead of overloading it further
214 * with the unnecessary "best_prefix_match"
216 * If no route is found we return IRE_NOROUTE.
218 ire_t *
219 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
220 uint_t *generationp)
222 ire_t *ire;
223 struct rt_sockaddr rdst;
224 struct rt_entry *rt;
225 irb_t *irb;
227 rdst.rt_sin_len = sizeof (rdst);
228 rdst.rt_sin_family = AF_INET;
229 rdst.rt_sin_addr.s_addr = addr;
232 * This is basically inlining a simpler version of ire_match_args
234 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
236 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
237 ipst->ips_ip_ftable, NULL, NULL);
239 if (rt == NULL)
240 goto bad;
242 irb = &rt->rt_irb;
243 if (irb->irb_ire_cnt == 0)
244 goto bad;
246 rw_enter(&irb->irb_lock, RW_READER);
247 ire = irb->irb_ire;
248 if (ire == NULL) {
249 rw_exit(&irb->irb_lock);
250 goto bad;
252 while (IRE_IS_CONDEMNED(ire)) {
253 ire = ire->ire_next;
254 if (ire == NULL) {
255 rw_exit(&irb->irb_lock);
256 goto bad;
260 /* we have a ire that matches */
261 ire_refhold(ire);
262 rw_exit(&irb->irb_lock);
265 * round-robin only if we have more than one route in the bucket.
266 * ips_ip_ecmp_behavior controls when we do ECMP
267 * 2: always
268 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
269 * 0: never
271 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
272 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
273 * and the IRE_INTERFACESs are likely to be shorter matches.
275 if (ire->ire_bucket->irb_ire_cnt > 1) {
276 if (ipst->ips_ip_ecmp_behavior == 2 ||
277 (ipst->ips_ip_ecmp_behavior == 1 &&
278 IS_DEFAULT_ROUTE(ire))) {
279 ire_t *next_ire;
280 ire_ftable_args_t margs;
282 bzero(&margs, sizeof (margs));
283 margs.ift_addr = addr;
284 margs.ift_zoneid = ALL_ZONES;
286 next_ire = ire_round_robin(ire->ire_bucket, &margs,
287 xmit_hint, ire, ipst);
288 if (next_ire == NULL) {
289 /* keep ire if next_ire is null */
290 if (generationp != NULL)
291 *generationp = ire->ire_generation;
292 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
293 return (ire);
295 ire_refrele(ire);
296 ire = next_ire;
299 /* Return generation before dropping lock */
300 if (generationp != NULL)
301 *generationp = ire->ire_generation;
303 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
306 * Since we only did ALL_ZONES matches there is no special handling
307 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
309 return (ire);
311 bad:
312 if (generationp != NULL)
313 *generationp = IRE_GENERATION_VERIFY;
315 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
316 return (ire_reject(ipst, B_FALSE));
320 * Find the ill matching a multicast group.
321 * Allows different routes for multicast addresses
322 * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
323 * which point at different interfaces. This is used when IP_MULTICAST_IF
324 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
325 * specify the interface to join on.
327 * Supports link-local addresses by using ire_route_recursive which follows
328 * the ill when recursing.
330 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
331 * and the MULTIRT property can be different for different groups, we
332 * extract RTF_MULTIRT from the special unicast route added for a group
333 * with CGTP and pass that back in the multirtp argument.
334 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
335 * We have a setsrcp argument for the same reason.
337 ill_t *
338 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
339 boolean_t *multirtp, ipaddr_t *setsrcp)
341 ire_t *ire;
342 ill_t *ill;
344 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, MATCH_IRE_DSTONLY,
345 IRR_NONE, 0, ipst, setsrcp, NULL);
346 ASSERT(ire != NULL);
347 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
348 ire_refrele(ire);
349 return (NULL);
352 if (multirtp != NULL)
353 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
355 ill = ire_nexthop_ill(ire);
356 ire_refrele(ire);
357 return (ill);
361 * Delete the passed in ire if the gateway addr matches
363 void
364 ire_del_host_redir(ire_t *ire, char *gateway)
366 if ((ire->ire_flags & RTF_DYNAMIC) &&
367 (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
368 ire_delete(ire);
372 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
373 * pointing at the specified gateway and
374 * delete them. This routine is called only
375 * when a default gateway is going away.
377 void
378 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
380 struct rtfuncarg rtfarg;
382 bzero(&rtfarg, sizeof (rtfarg));
383 rtfarg.rt_func = ire_del_host_redir;
384 rtfarg.rt_arg = (void *)&gateway;
385 rtfarg.rt_zoneid = ALL_ZONES;
386 rtfarg.rt_ipst = ipst;
387 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
388 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
392 * Obtain the rt_entry and rt_irb for the route to be added to
393 * the ips_ip_ftable.
394 * First attempt to add a node to the radix tree via rn_addroute. If the
395 * route already exists, return the bucket for the existing route.
397 * Locking notes: Need to hold the global radix tree lock in write mode to
398 * add a radix node. To prevent the node from being deleted, ire_get_bucket()
399 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
400 * while holding the irb_lock, but not the radix tree lock.
402 irb_t *
403 ire_get_bucket(ire_t *ire)
405 struct radix_node *rn;
406 struct rt_entry *rt;
407 struct rt_sockaddr rmask, rdst;
408 irb_t *irb = NULL;
409 ip_stack_t *ipst = ire->ire_ipst;
411 ASSERT(ipst->ips_ip_ftable != NULL);
413 /* first try to see if route exists (based on rtalloc1) */
414 bzero(&rdst, sizeof (rdst));
415 rdst.rt_sin_len = sizeof (rdst);
416 rdst.rt_sin_family = AF_INET;
417 rdst.rt_sin_addr.s_addr = ire->ire_addr;
419 bzero(&rmask, sizeof (rmask));
420 rmask.rt_sin_len = sizeof (rmask);
421 rmask.rt_sin_family = AF_INET;
422 rmask.rt_sin_addr.s_addr = ire->ire_mask;
425 * add the route. based on BSD's rtrequest1(RTM_ADD)
427 R_Malloc(rt, rt_entry_cache, sizeof (*rt));
428 /* kmem_alloc failed */
429 if (rt == NULL)
430 return (NULL);
432 bzero(rt, sizeof (*rt));
433 rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
434 rt->rt_dst = rdst;
435 irb = &rt->rt_irb;
436 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
437 irb->irb_ipst = ipst;
438 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
439 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
440 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
441 ipst->ips_ip_ftable, (struct radix_node *)rt);
442 if (rn == NULL) {
443 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
444 Free(rt, rt_entry_cache);
445 rt = NULL;
446 irb = NULL;
447 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
448 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
449 ipst->ips_ip_ftable);
450 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
451 /* found a non-root match */
452 rt = (struct rt_entry *)rn;
455 if (rt != NULL) {
456 irb = &rt->rt_irb;
457 irb_refhold(irb);
459 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
460 return (irb);
464 * This function is used when the caller wants to know the outbound
465 * interface for a packet given only the address.
466 * If this is a offlink IP address and there are multiple
467 * routes to this destination, this routine will utilise the
468 * first route it finds to IP address
469 * Return values:
470 * 0 - FAILURE
471 * nonzero - ifindex
473 uint_t
474 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
476 uint_t ifindex = 0;
477 ire_t *ire;
478 ill_t *ill;
479 netstack_t *ns;
480 ip_stack_t *ipst;
482 if (zoneid == ALL_ZONES)
483 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
484 else
485 ns = netstack_find_by_zoneid(zoneid);
486 ASSERT(ns != NULL);
489 * For exclusive stacks we set the zoneid to zero
490 * since IP uses the global zoneid in the exclusive stacks.
492 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
493 zoneid = GLOBAL_ZONEID;
494 ipst = ns->netstack_ip;
496 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
498 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
499 ill = ire_nexthop_ill(ire);
500 if (ill != NULL) {
501 ifindex = ill->ill_phyint->phyint_ifindex;
502 ill_refrele(ill);
504 ire_refrele(ire);
506 netstack_rele(ns);
507 return (ifindex);
511 * Routine to find the route to a destination. If a ifindex is supplied
512 * it tries to match the route to the corresponding ipif for the ifindex
514 static ire_t *
515 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
517 ire_t *ire = NULL;
518 int match_flags;
520 match_flags = MATCH_IRE_DSTONLY;
522 if (dst_addr->sa_family == AF_INET) {
523 ire = ire_route_recursive_v4(
524 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
525 zoneid, match_flags, IRR_ALLOCATE, 0, ipst, NULL, NULL);
526 } else {
527 ire = ire_route_recursive_v6(
528 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
529 zoneid, match_flags, IRR_ALLOCATE, 0, ipst, NULL, NULL);
531 ASSERT(ire != NULL);
532 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
533 ire_refrele(ire);
534 return (NULL);
536 return (ire);
540 * This routine is called by IP Filter to send a packet out on the wire
541 * to a specified dstination (which may be onlink or offlink). The ifindex may
542 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
543 * an outgoing interface and requires the nexthop to be on that interface.
544 * IP WILL NOT DO the following to the data packet before sending it out:
545 * a. manipulate ttl
546 * b. ipsec work
547 * c. fragmentation
549 * If the packet has been prepared for hardware checksum then it will be
550 * passed off to ip_send_align_cksum() to check that the flags set on the
551 * packet are in alignment with the capabilities of the new outgoing NIC.
553 * Return values:
554 * 0: IP was able to send of the data pkt
555 * ECOMM: Could not send packet
556 * ENONET No route to dst. It is up to the caller
557 * to send icmp unreachable error message,
558 * EINPROGRESS The macaddr of the onlink dst or that
559 * of the offlink dst's nexthop needs to get
560 * resolved before packet can be sent to dst.
561 * Thus transmission is not guaranteed.
562 * Note: No longer have visibility to the ARP queue
563 * hence no EINPROGRESS.
566 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
567 zoneid_t zoneid)
569 ipaddr_t nexthop;
570 netstack_t *ns;
571 ip_stack_t *ipst;
572 ip_xmit_attr_t ixas;
573 int error;
575 ASSERT(mp != NULL);
577 if (zoneid == ALL_ZONES)
578 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
579 else
580 ns = netstack_find_by_zoneid(zoneid);
581 ASSERT(ns != NULL);
584 * For exclusive stacks we set the zoneid to zero
585 * since IP uses the global zoneid in the exclusive stacks.
587 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
588 zoneid = GLOBAL_ZONEID;
589 ipst = ns->netstack_ip;
591 ASSERT(dst_addr->sa_family == AF_INET ||
592 dst_addr->sa_family == AF_INET6);
594 bzero(&ixas, sizeof (ixas));
596 * No IPsec, no fragmentation, and don't let any hooks see
597 * the packet.
599 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
600 ixas.ixa_cred = kcred;
601 ixas.ixa_cpid = NOPID;
602 ixas.ixa_ipst = ipst;
603 ixas.ixa_ifindex = ifindex;
605 if (dst_addr->sa_family == AF_INET) {
606 ipha_t *ipha = (ipha_t *)mp->b_rptr;
608 ixas.ixa_flags |= IXAF_IS_IPV4;
609 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
610 if (nexthop != ipha->ipha_dst) {
611 ixas.ixa_flags |= IXAF_NEXTHOP_SET;
612 ixas.ixa_nexthop_v4 = nexthop;
614 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
615 } else {
616 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
617 in6_addr_t *nexthop6;
619 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
620 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
621 ixas.ixa_flags |= IXAF_NEXTHOP_SET;
622 ixas.ixa_nexthop_v6 = *nexthop6;
624 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
626 error = ip_output_simple(mp, &ixas);
627 ixa_cleanup(&ixas);
629 netstack_rele(ns);
630 switch (error) {
631 case 0:
632 break;
634 case EHOSTUNREACH:
635 case ENETUNREACH:
636 error = ENONET;
637 break;
639 default:
640 error = ECOMM;
641 break;
643 return (error);
647 * callback function provided by ire_ftable_lookup when calling
648 * rn_match_args(). Invoke ire_match_args on each matching leaf node in
649 * the radix tree.
651 boolean_t
652 ire_find_best_route(struct radix_node *rn, void *arg)
654 struct rt_entry *rt = (struct rt_entry *)rn;
655 irb_t *irb_ptr;
656 ire_t *ire;
657 ire_ftable_args_t *margs = arg;
658 ipaddr_t match_mask;
660 ASSERT(rt != NULL);
662 irb_ptr = &rt->rt_irb;
664 if (irb_ptr->irb_ire_cnt == 0)
665 return (B_FALSE);
667 rw_enter(&irb_ptr->irb_lock, RW_READER);
668 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
669 if (IRE_IS_CONDEMNED(ire))
670 continue;
671 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
672 if (margs->ift_flags & MATCH_IRE_MASK)
673 match_mask = margs->ift_mask;
674 else
675 match_mask = ire->ire_mask;
677 if (ire_match_args(ire, margs->ift_addr, match_mask,
678 margs->ift_gateway, margs->ift_type, margs->ift_ill,
679 margs->ift_zoneid, margs->ift_flags)) {
680 ire_refhold(ire);
681 rw_exit(&irb_ptr->irb_lock);
682 margs->ift_best_ire = ire;
683 return (B_TRUE);
686 rw_exit(&irb_ptr->irb_lock);
687 return (B_FALSE);
691 * ftable irb_t structures are dynamically allocated, and we need to
692 * check if the irb_t (and associated ftable tree attachment) needs to
693 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
694 * be verified are:
695 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
696 * - no other threads holding references to ire's in the bucket,
697 * i.e., irb_nire == 0
698 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
699 * - need to hold the global tree lock and irb_lock in write mode.
701 void
702 irb_refrele_ftable(irb_t *irb)
704 for (;;) {
705 rw_enter(&irb->irb_lock, RW_WRITER);
706 ASSERT(irb->irb_refcnt != 0);
707 if (irb->irb_refcnt != 1) {
709 * Someone has a reference to this radix node
710 * or there is some bucket walker.
712 irb->irb_refcnt--;
713 rw_exit(&irb->irb_lock);
714 return;
715 } else {
717 * There is no other walker, nor is there any
718 * other thread that holds a direct ref to this
719 * radix node. Do the clean up if needed. Call
720 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
722 if (irb->irb_marks & IRB_MARK_CONDEMNED) {
723 ire_t *ire_list;
725 ire_list = ire_unlink(irb);
726 rw_exit(&irb->irb_lock);
728 if (ire_list != NULL)
729 ire_cleanup(ire_list);
731 * more CONDEMNED entries could have
732 * been added while we dropped the lock,
733 * so we have to re-check.
735 continue;
739 * Now check if there are still any ires
740 * associated with this radix node.
742 if (irb->irb_nire != 0) {
744 * someone is still holding on
745 * to ires in this bucket
747 irb->irb_refcnt--;
748 rw_exit(&irb->irb_lock);
749 return;
750 } else {
752 * Everything is clear. Zero walkers,
753 * Zero threads with a ref to this
754 * radix node, Zero ires associated with
755 * this radix node. Due to lock order,
756 * check the above conditions again
757 * after grabbing all locks in the right order
759 rw_exit(&irb->irb_lock);
760 if (irb_inactive(irb))
761 return;
763 * irb_inactive could not free the irb.
764 * See if there are any walkers, if not
765 * try to clean up again.
773 * IRE iterator used by ire_ftable_lookup to process multiple equal
774 * routes. Given a starting point in the hash list (hash), walk the IREs
775 * in the bucket skipping deleted entries. We treat the bucket as a circular
776 * list for the purposes of walking it.
777 * Returns the IRE (held) that corresponds to the hash value. If that IRE is
778 * not applicable (ire_match_args failed) then it returns a subsequent one.
779 * If we fail to find an IRE we return NULL.
781 * Assumes that the caller holds a reference on the IRE bucket and a read lock
782 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
784 * Applies to IPv4 and IPv6.
786 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
787 * address and bucket, we compare against ire_type for the orig_ire. We also
788 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
789 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
791 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
792 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
793 * in which the zone has an IP address. We check this for the global zone
794 * even if no shared-IP zones are configured.
796 ire_t *
797 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
798 ire_t *orig_ire, ip_stack_t *ipst)
800 ire_t *ire, *maybe_ire = NULL;
801 uint_t maybe_badcnt;
802 uint_t maxwalk;
804 /* Fold in more bits from the hint/hash */
805 hash = hash ^ (hash >> 8) ^ (hash >> 16);
807 rw_enter(&irb_ptr->irb_lock, RW_WRITER);
808 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
809 if (maxwalk == 0) {
810 rw_exit(&irb_ptr->irb_lock);
811 return (NULL);
814 hash %= maxwalk;
815 irb_refhold_locked(irb_ptr);
816 rw_exit(&irb_ptr->irb_lock);
819 * Round-robin the routers list looking for a route that
820 * matches the passed in parameters.
821 * First we skip "hash" number of non-condemned IREs.
822 * Then we match the IRE.
823 * If we find an ire which has a non-zero ire_badcnt then we remember
824 * it and keep on looking for a lower ire_badcnt.
825 * If we come to the end of the list we continue (treat the
826 * bucket list as a circular list) but we match less than "max"
827 * entries.
829 ire = irb_ptr->irb_ire;
830 while (maxwalk > 0) {
831 if (IRE_IS_CONDEMNED(ire))
832 goto next_ire_skip;
834 /* Skip the first "hash" entries to do ECMP */
835 if (hash != 0) {
836 hash--;
837 goto next_ire_skip;
840 /* See CGTP comment above */
841 if (ire->ire_type != orig_ire->ire_type ||
842 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
843 goto next_ire;
846 * Note: Since IPv6 has hash buckets instead of radix
847 * buckers we need to explicitly compare the addresses.
848 * That makes this less efficient since we will be called
849 * even if there is no alternatives just because the
850 * bucket has multiple IREs for different addresses.
852 if (ire->ire_ipversion == IPV6_VERSION) {
853 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
854 &ire->ire_addr_v6))
855 goto next_ire;
859 * For some reason find_best_route uses ire_mask. We do
860 * the same.
862 if (ire->ire_ipversion == IPV4_VERSION ?
863 !ire_match_args(ire, margs->ift_addr,
864 ire->ire_mask, margs->ift_gateway,
865 margs->ift_type, margs->ift_ill, margs->ift_zoneid,
866 margs->ift_flags) :
867 !ire_match_args_v6(ire, &margs->ift_addr_v6,
868 &ire->ire_mask_v6, &margs->ift_gateway_v6,
869 margs->ift_type, margs->ift_ill, margs->ift_zoneid,
870 margs->ift_flags))
871 goto next_ire;
873 if (margs->ift_zoneid != ALL_ZONES &&
874 (ire->ire_type & IRE_OFFLINK)) {
876 * When we're in a zone, we're only
877 * interested in routers that are
878 * reachable through ipifs within our zone.
880 if (ire->ire_ipversion == IPV4_VERSION) {
881 if (!ire_gateway_ok_zone_v4(
882 ire->ire_gateway_addr, margs->ift_zoneid,
883 ire->ire_ill, ipst, B_TRUE))
884 goto next_ire;
885 } else {
886 if (!ire_gateway_ok_zone_v6(
887 &ire->ire_gateway_addr_v6,
888 margs->ift_zoneid, ire->ire_ill,
889 ipst, B_TRUE))
890 goto next_ire;
893 mutex_enter(&ire->ire_lock);
894 /* Look for stale ire_badcnt and clear */
895 if (ire->ire_badcnt != 0 &&
896 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
897 ipst->ips_ip_ire_badcnt_lifetime))
898 ire->ire_badcnt = 0;
899 mutex_exit(&ire->ire_lock);
901 if (ire->ire_badcnt == 0) {
902 /* We found one with a zero badcnt; done */
903 ire_refhold(ire);
905 * Care needed since irb_refrele grabs WLOCK to free
906 * the irb_t.
908 if (ire->ire_ipversion == IPV4_VERSION) {
909 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
910 irb_refrele(irb_ptr);
911 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
912 } else {
913 rw_exit(&ipst->ips_ip6_ire_head_lock);
914 irb_refrele(irb_ptr);
915 rw_enter(&ipst->ips_ip6_ire_head_lock,
916 RW_READER);
918 return (ire);
921 * keep looking to see if there is a better (lower
922 * badcnt) matching IRE, but save this one as a last resort.
923 * If we find a lower badcnt pick that one as the last* resort.
925 if (maybe_ire == NULL) {
926 maybe_ire = ire;
927 maybe_badcnt = ire->ire_badcnt;
928 } else if (ire->ire_badcnt < maybe_badcnt) {
929 maybe_ire = ire;
930 maybe_badcnt = ire->ire_badcnt;
933 next_ire:
934 maxwalk--;
935 next_ire_skip:
936 ire = ire->ire_next;
937 if (ire == NULL)
938 ire = irb_ptr->irb_ire;
940 if (maybe_ire != NULL)
941 ire_refhold(maybe_ire);
943 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
944 if (ire->ire_ipversion == IPV4_VERSION) {
945 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
946 irb_refrele(irb_ptr);
947 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
948 } else {
949 rw_exit(&ipst->ips_ip6_ire_head_lock);
950 irb_refrele(irb_ptr);
951 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
953 return (maybe_ire);
956 void
957 irb_refhold_rn(struct radix_node *rn)
959 if ((rn->rn_flags & RNF_ROOT) == 0)
960 irb_refhold(&((rt_t *)(rn))->rt_irb);
963 void
964 irb_refrele_rn(struct radix_node *rn)
966 if ((rn->rn_flags & RNF_ROOT) == 0)
967 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
972 * ip_select_src_ill() is used by ip_select_route() to find the src_ill
973 * to be used for source-aware routing table lookup. This function will
974 * ignore IPIF_UNNUMBERED interface addresses, and will only return a
975 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
976 * interfaces).
978 static ill_t *
979 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
981 ipif_t *ipif;
982 ill_t *ill;
983 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
984 ipaddr_t v4src;
986 if (isv6) {
987 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
988 } else {
989 IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
990 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
992 if (ipif == NULL)
993 return (NULL);
994 ill = ipif->ipif_ill;
995 ill_refhold(ill);
996 ipif_refrele(ipif);
997 return (ill);
1001 * verify that v6src is configured on ill
1003 static boolean_t
1004 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
1006 ipif_t *ipif;
1007 ip_stack_t *ipst;
1008 ipaddr_t v4src;
1010 if (ill == NULL)
1011 return (B_FALSE);
1012 ipst = ill->ill_ipst;
1014 if (ill->ill_isv6) {
1015 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
1016 } else {
1017 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
1018 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
1021 if (ipif != NULL) {
1022 ipif_refrele(ipif);
1023 return (B_TRUE);
1024 } else {
1025 return (B_FALSE);
1030 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
1031 * routes this routine sets up a ire_nce_cache as well. The caller needs to
1032 * lookup an nce for the multicast case.
1034 * When src_multihoming is set to 2 (strict src multihoming) we use the source
1035 * address to select the interface and route. If IP_BOUND_IF etc are
1036 * specified, we require that they specify an interface on which the
1037 * source address is assigned.
1039 * When src_multihoming is set to 1 (preferred src aware route
1040 * selection) the unicast lookup prefers a matching source
1041 * (i.e., that the route points out an ill on which the source is assigned), but
1042 * if no such route is found we fallback to not considering the source in the
1043 * route lookup.
1045 * We skip the src_multihoming check when the source isn't (yet) set, and
1046 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
1047 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
1048 * when secpolicy_net_rawaccess().
1050 ire_t *
1051 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
1052 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1053 int *errorp, boolean_t *multirtp)
1055 uint_t match_args;
1056 uint_t ire_type;
1057 ill_t *ill = NULL;
1058 ire_t *ire;
1059 ip_stack_t *ipst = ixa->ixa_ipst;
1060 ipaddr_t v4dst;
1061 in6_addr_t v6nexthop;
1062 iaflags_t ixaflags = ixa->ixa_flags;
1063 nce_t *nce;
1064 boolean_t preferred_src_aware = B_FALSE;
1065 boolean_t verify_src;
1066 boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
1067 int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
1070 * We only verify that the src has been configured on a selected
1071 * interface if the src is not :: or INADDR_ANY, and if the
1072 * IXAF_VERIFY_SOURCE flag is set.
1074 verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
1075 (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
1077 match_args = 0;
1078 IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
1079 if (setsrcp != NULL)
1080 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1081 if (errorp != NULL)
1082 ASSERT(*errorp == 0);
1085 * The content of the ixa will be different if IP_NEXTHOP,
1086 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1089 if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
1090 /* Pick up the IRE_MULTICAST for the ill */
1091 if (ixa->ixa_multicast_ifindex != 0) {
1092 ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1093 isv6, ipst);
1094 } else if (ixaflags & IXAF_SCOPEID_SET) {
1095 /* sin6_scope_id takes precedence over ixa_ifindex */
1096 ASSERT(ixa->ixa_scopeid != 0);
1097 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1098 isv6, ipst);
1099 } else if (ixa->ixa_ifindex != 0) {
1101 * In the ipmp case, the ixa_ifindex is set to
1102 * point at an under_ill and we would return the
1103 * ire_multicast() corresponding to that under_ill.
1105 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1106 isv6, ipst);
1107 } else if (src_multihoming != 0 && verify_src) {
1108 /* Look up the ill based on the source address */
1109 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1111 * Since we looked up the ill from the source there
1112 * is no need to verify that the source is on the ill
1113 * below.
1115 verify_src = B_FALSE;
1116 if (ill != NULL && IS_VNI(ill)) {
1117 ill_t *usesrc = ill;
1119 ill = ill_lookup_usesrc(usesrc);
1120 ill_refrele(usesrc);
1122 } else if (!isv6) {
1123 ipaddr_t v4setsrc = INADDR_ANY;
1125 ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
1126 ipst, multirtp, &v4setsrc);
1127 if (setsrcp != NULL)
1128 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1129 } else {
1130 ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
1131 ipst, multirtp, setsrcp);
1133 if (ill != NULL && IS_VNI(ill)) {
1134 ill_refrele(ill);
1135 ill = NULL;
1137 if (ill == NULL) {
1138 if (errorp != NULL)
1139 *errorp = ENXIO;
1140 /* Get a hold on the IRE_NOROUTE */
1141 ire = ire_reject(ipst, isv6);
1142 return (ire);
1144 if (!(ill->ill_flags & ILLF_MULTICAST)) {
1145 ill_refrele(ill);
1146 if (errorp != NULL)
1147 *errorp = EHOSTUNREACH;
1148 /* Get a hold on the IRE_NOROUTE */
1149 ire = ire_reject(ipst, isv6);
1150 return (ire);
1153 * If we are doing the strictest src_multihoming, then
1154 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
1155 * an interface that is consistent with the source address.
1157 if (verify_src && src_multihoming == 2 &&
1158 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1159 if (errorp != NULL)
1160 *errorp = EADDRNOTAVAIL;
1161 ill_refrele(ill);
1162 /* Get a hold on the IRE_NOROUTE */
1163 ire = ire_reject(ipst, isv6);
1164 return (ire);
1166 /* Get a refcnt on the single IRE_MULTICAST per ill */
1167 ire = ire_multicast(ill);
1168 ill_refrele(ill);
1169 if (generationp != NULL)
1170 *generationp = ire->ire_generation;
1171 if (errorp != NULL &&
1172 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1173 *errorp = EHOSTUNREACH;
1175 return (ire);
1178 /* Now for unicast */
1179 if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1180 if (ixaflags & IXAF_SCOPEID_SET) {
1181 /* sin6_scope_id takes precedence over ixa_ifindex */
1182 ASSERT(ixa->ixa_scopeid != 0);
1183 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1184 isv6, ipst);
1185 } else {
1186 ASSERT(ixa->ixa_ifindex != 0);
1187 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1188 isv6, ipst);
1190 if (ill != NULL && IS_VNI(ill)) {
1191 ill_refrele(ill);
1192 ill = NULL;
1194 if (ill == NULL) {
1195 if (errorp != NULL)
1196 *errorp = ENXIO;
1197 /* Get a hold on the IRE_NOROUTE */
1198 ire = ire_reject(ipst, isv6);
1199 return (ire);
1202 match_args |= MATCH_IRE_ILL;
1205 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1206 * so for both of them we need to be able look for an under
1207 * interface.
1209 if (IS_UNDER_IPMP(ill))
1210 match_args |= MATCH_IRE_TESTHIDDEN;
1213 * If we are doing the strictest src_multihoming, then
1214 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
1215 * an interface that is consistent with the source address.
1217 if (src_multihoming == 2 &&
1218 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1219 if (errorp != NULL)
1220 *errorp = EADDRNOTAVAIL;
1221 ill_refrele(ill);
1222 /* Get a hold on the IRE_NOROUTE */
1223 ire = ire_reject(ipst, isv6);
1224 return (ire);
1226 } else if (src_multihoming != 0 && verify_src) {
1227 /* Look up the ill based on the source address */
1228 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1229 if (ill == NULL) {
1230 char addrbuf[INET6_ADDRSTRLEN];
1232 ip3dbg(("%s not a valid src for unicast",
1233 inet_ntop(AF_INET6, &v6src, addrbuf,
1234 sizeof (addrbuf))));
1235 if (errorp != NULL)
1236 *errorp = EADDRNOTAVAIL;
1237 /* Get a hold on the IRE_NOROUTE */
1238 ire = ire_reject(ipst, isv6);
1239 return (ire);
1241 match_args |= MATCH_IRE_SRC_ILL;
1242 preferred_src_aware = (src_multihoming == 1);
1245 if (ixaflags & IXAF_NEXTHOP_SET) {
1246 /* IP_NEXTHOP was set */
1247 v6nexthop = ixa->ixa_nexthop_v6;
1248 } else {
1249 v6nexthop = *v6dst;
1252 ire_type = 0;
1255 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1256 * we only look for an onlink IRE.
1258 if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1259 match_args |= MATCH_IRE_TYPE;
1260 ire_type = IRE_ONLINK;
1263 retry:
1264 if (!isv6) {
1265 ipaddr_t v4nexthop;
1266 ipaddr_t v4setsrc = INADDR_ANY;
1268 IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1269 ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
1270 ixa->ixa_zoneid, match_args, IRR_ALLOCATE,
1271 ixa->ixa_xmit_hint, ipst, &v4setsrc, generationp);
1272 if (setsrcp != NULL)
1273 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1274 } else {
1275 ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
1276 ixa->ixa_zoneid, match_args, IRR_ALLOCATE,
1277 ixa->ixa_xmit_hint, ipst, setsrcp, generationp);
1280 #ifdef DEBUG
1281 if (match_args & MATCH_IRE_TESTHIDDEN) {
1282 ip3dbg(("looking for hidden; dst %x ire %p\n",
1283 v4dst, (void *)ire));
1285 #endif
1286 if (ill != NULL) {
1287 ill_refrele(ill);
1288 ill = NULL;
1290 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1291 (ire->ire_type & IRE_MULTICAST)) {
1292 if (preferred_src_aware) {
1294 * "Preferred Source Aware" send mode. If we cannot
1295 * find an ire whose ire_ill had the desired source
1296 * address retry after relaxing the ill matching
1297 * constraint.
1299 ire_refrele(ire);
1300 preferred_src_aware = B_FALSE;
1301 match_args &= ~MATCH_IRE_SRC_ILL;
1302 goto retry;
1304 /* No ire_nce_cache */
1305 return (ire);
1308 /* Setup ire_nce_cache if it doesn't exist or is condemned. */
1309 mutex_enter(&ire->ire_lock);
1310 nce = ire->ire_nce_cache;
1311 if (nce == NULL || nce->nce_is_condemned) {
1312 mutex_exit(&ire->ire_lock);
1313 (void) ire_revalidate_nce(ire);
1314 } else {
1315 mutex_exit(&ire->ire_lock);
1317 return (ire);
1321 * Find a route given some xmit attributes and a packet.
1322 * Generic for IPv4 and IPv6
1324 * This never returns NULL. But when it returns the IRE_NOROUTE
1325 * it might set errorp.
1327 ire_t *
1328 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1329 int *errorp, boolean_t *multirtp)
1331 if (ixa->ixa_flags & IXAF_IS_IPV4) {
1332 ipha_t *ipha = (ipha_t *)mp->b_rptr;
1333 in6_addr_t v6dst, v6src;
1335 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1336 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
1338 return (ip_select_route(&v6dst, v6src, ixa, generationp,
1339 NULL, errorp, multirtp));
1340 } else {
1341 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1343 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
1344 ixa, generationp, NULL, errorp, multirtp));
1348 ire_t *
1349 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
1350 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1352 in6_addr_t v6dst, v6src;
1353 ire_t *ire;
1354 in6_addr_t setsrc;
1356 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1358 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1359 IN6_IPADDR_TO_V4MAPPED(src, &v6src);
1361 setsrc = ipv6_all_zeros;
1362 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
1363 multirtp);
1364 if (v4setsrcp != NULL)
1365 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1366 return (ire);
1370 * Recursively look for a route to the destination. Can also match on
1371 * the zoneid and ill. Used for the data paths. See also
1372 * ire_route_recursive.
1374 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1375 * create an IRE_IF_CLONE. This is used on the receive side when we are not
1376 * forwarding.
1377 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1378 * resolve the gateway.
1380 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1381 * instead.
1383 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1384 * is an error.
1385 * Allow at most one RTF_INDIRECT.
1387 ire_t *
1388 ire_route_recursive_impl_v4(ire_t *ire,
1389 ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
1390 zoneid_t zoneid, uint_t match_args, uint_t irr_flags, uint32_t xmit_hint,
1391 ip_stack_t *ipst, ipaddr_t *setsrcp, uint_t *generationp)
1393 int i, j;
1394 ire_t *ires[MAX_IRE_RECURSION];
1395 uint_t generation;
1396 uint_t generations[MAX_IRE_RECURSION];
1397 boolean_t need_refrele = B_FALSE;
1398 boolean_t invalidate = B_FALSE;
1399 ill_t *ill = NULL;
1400 uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1402 if (setsrcp != NULL)
1403 ASSERT(*setsrcp == INADDR_ANY);
1406 * We iterate up to three times to resolve a route, even though
1407 * we have four slots in the array. The extra slot is for an
1408 * IRE_IF_CLONE we might need to create.
1410 i = 0;
1411 while (i < MAX_IRE_RECURSION - 1) {
1412 /* ire_ftable_lookup handles round-robin/ECMP */
1413 if (ire == NULL) {
1414 ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
1415 (ill != NULL? ill : ill_arg), zoneid, match_args,
1416 xmit_hint, ipst, &generation);
1417 } else {
1418 /* Caller passed it; extra hold since we will rele */
1419 ire_refhold(ire);
1420 if (generationp != NULL)
1421 generation = *generationp;
1422 else
1423 generation = IRE_GENERATION_VERIFY;
1425 if (ire == NULL) {
1426 if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1427 ire = ires[0];
1428 ire_refhold(ire);
1429 } else {
1430 ire = ire_reject(ipst, B_FALSE);
1432 goto error;
1435 /* Need to return the ire with RTF_REJECT|BLACKHOLE */
1436 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1437 goto error;
1439 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1441 * Verify that the IRE_IF_CLONE has a consistent generation
1442 * number.
1444 if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1445 ire_refrele(ire);
1446 ire = NULL;
1447 continue;
1451 * Don't allow anything unusual past the first iteration.
1452 * After the first lookup, we should no longer look for
1453 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1454 * routes.
1456 * In addition, after we have found a direct IRE_OFFLINK,
1457 * we should only look for interface or clone routes.
1459 match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1461 if ((ire->ire_type & IRE_OFFLINK) &&
1462 !(ire->ire_flags & RTF_INDIRECT)) {
1463 ire_type = IRE_IF_ALL;
1464 } else {
1466 * no more local, loopback, broadcast routes
1468 if (!(match_args & MATCH_IRE_TYPE))
1469 ire_type = (IRE_OFFLINK|IRE_ONLINK);
1470 ire_type &= ~maskoff;
1472 match_args |= MATCH_IRE_TYPE;
1474 /* We have a usable IRE */
1475 ires[i] = ire;
1476 generations[i] = generation;
1477 i++;
1479 /* The first RTF_SETSRC address is passed back if setsrcp */
1480 if ((ire->ire_flags & RTF_SETSRC) &&
1481 setsrcp != NULL && *setsrcp == INADDR_ANY) {
1482 ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1483 *setsrcp = ire->ire_setsrc_addr;
1487 * Check if we have a short-cut pointer to an IRE for this
1488 * destination, and that the cached dependency isn't stale.
1489 * In that case we've rejoined an existing tree towards a
1490 * parent, thus we don't need to continue the loop to
1491 * discover the rest of the tree.
1493 mutex_enter(&ire->ire_lock);
1494 if (ire->ire_dep_parent != NULL &&
1495 ire->ire_dep_parent->ire_generation ==
1496 ire->ire_dep_parent_generation) {
1497 mutex_exit(&ire->ire_lock);
1498 ire = NULL;
1499 goto done;
1501 mutex_exit(&ire->ire_lock);
1504 * If this type should have an ire_nce_cache (even if it
1505 * doesn't yet have one) then we are done. Includes
1506 * IRE_INTERFACE with a full 32 bit mask.
1508 if (ire->ire_nce_capable) {
1509 ire = NULL;
1510 goto done;
1512 ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1514 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1515 * particular destination
1517 if (ire->ire_type & IRE_INTERFACE) {
1518 in6_addr_t v6nexthop;
1519 ire_t *clone;
1521 ASSERT(ire->ire_masklen != IPV4_ABITS);
1524 * In the case of ip_input and ILLF_FORWARDING not
1525 * being set, and in the case of RTM_GET, there is
1526 * no point in allocating an IRE_IF_CLONE. We return
1527 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1528 * result in a ire_dep_parent which is IRE_IF_*
1529 * without an IRE_IF_CLONE.
1530 * We recover from that when we need to send packets
1531 * by ensuring that the generations become
1532 * IRE_GENERATION_VERIFY in this case.
1534 if (!(irr_flags & IRR_ALLOCATE)) {
1535 invalidate = B_TRUE;
1536 ire = NULL;
1537 goto done;
1540 IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
1542 clone = ire_create_if_clone(ire, &v6nexthop,
1543 &generation);
1544 if (clone == NULL) {
1546 * Temporary failure - no memory.
1547 * Don't want caller to cache IRE_NOROUTE.
1549 invalidate = B_TRUE;
1550 ire = ire_blackhole(ipst, B_FALSE);
1551 goto error;
1554 * Make clone next to last entry and the
1555 * IRE_INTERFACE the last in the dependency
1556 * chain since the clone depends on the
1557 * IRE_INTERFACE.
1559 ASSERT(i >= 1);
1560 ASSERT(i < MAX_IRE_RECURSION);
1562 ires[i] = ires[i-1];
1563 generations[i] = generations[i-1];
1564 ires[i-1] = clone;
1565 generations[i-1] = generation;
1566 i++;
1568 ire = NULL;
1569 goto done;
1573 * We only match on the type and optionally ILL when
1574 * recursing. The type match is used by some callers
1575 * to exclude certain types (such as IRE_IF_CLONE or
1576 * IRE_LOCAL|IRE_LOOPBACK).
1578 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1579 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1580 * ire_ill, so we set ill to the ire_ill;
1582 match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1583 nexthop = ire->ire_gateway_addr;
1584 if (ill == NULL && ire->ire_ill != NULL) {
1585 ill = ire->ire_ill;
1586 need_refrele = B_TRUE;
1587 ill_refhold(ill);
1588 match_args |= MATCH_IRE_ILL;
1590 ire = NULL;
1592 ASSERT(ire == NULL);
1593 ire = ire_reject(ipst, B_FALSE);
1595 error:
1596 ASSERT(ire != NULL);
1597 if (need_refrele)
1598 ill_refrele(ill);
1601 * In the case of MULTIRT we want to try a different IRE the next
1602 * time. We let the next packet retry in that case.
1604 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1605 (void) ire_no_good(ires[0]);
1607 cleanup:
1608 /* cleanup ires[i] */
1609 ire_dep_unbuild(ires, i);
1610 for (j = 0; j < i; j++)
1611 ire_refrele(ires[j]);
1613 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1614 (irr_flags & IRR_INCOMPLETE));
1616 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1617 * ip_select_route since the reject or lack of memory might be gone.
1619 if (generationp != NULL)
1620 *generationp = IRE_GENERATION_VERIFY;
1621 return (ire);
1623 done:
1624 ASSERT(ire == NULL);
1625 if (need_refrele) {
1626 ill_refrele(ill);
1627 ill = NULL;
1630 /* Build dependencies */
1631 if (i > 1 && !ire_dep_build(ires, generations, i)) {
1632 /* Something in chain was condemned; tear it apart */
1633 ire = ire_reject(ipst, B_FALSE);
1634 goto cleanup;
1638 * Release all refholds except the one for ires[0] that we
1639 * will return to the caller.
1641 for (j = 1; j < i; j++)
1642 ire_refrele(ires[j]);
1644 if (invalidate) {
1646 * Since we needed to allocate but couldn't we need to make
1647 * sure that the dependency chain is rebuilt the next time.
1649 ire_dep_invalidate_generations(ires[0]);
1650 generation = IRE_GENERATION_VERIFY;
1651 } else {
1653 * IREs can have been added or deleted while we did the
1654 * recursive lookup and we can't catch those until we've built
1655 * the dependencies. We verify the stored
1656 * ire_dep_parent_generation to catch any such changes and
1657 * return IRE_GENERATION_VERIFY (which will cause
1658 * ip_select_route to be called again so we can redo the
1659 * recursive lookup next time we send a packet.
1661 if (ires[0]->ire_dep_parent == NULL)
1662 generation = ires[0]->ire_generation;
1663 else
1664 generation = ire_dep_validate_generations(ires[0]);
1665 if (generations[0] != ires[0]->ire_generation) {
1666 /* Something changed at the top */
1667 generation = IRE_GENERATION_VERIFY;
1670 if (generationp != NULL)
1671 *generationp = generation;
1673 return (ires[0]);
1676 ire_t *
1677 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1678 zoneid_t zoneid, uint_t match_args, uint_t irr_flags, uint32_t xmit_hint,
1679 ip_stack_t *ipst, ipaddr_t *setsrcp, uint_t *generationp)
1681 return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1682 zoneid, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1683 generationp));
1687 * Recursively look for a route to the destination.
1688 * We only handle a destination match here, yet we have the same arguments
1689 * as the full match to allow function pointers to select between the two.
1691 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1692 * instead.
1694 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1695 * is an error.
1696 * Allow at most one RTF_INDIRECT.
1698 ire_t *
1699 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1700 uint32_t xmit_hint, ip_stack_t *ipst)
1702 ire_t *ire;
1703 ire_t *ire1;
1704 uint_t generation;
1706 /* ire_ftable_lookup handles round-robin/ECMP */
1707 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1708 &generation);
1709 ASSERT(ire != NULL);
1711 * If the IRE has a current cached parent we know that the whole
1712 * parent chain is current, hence we don't need to discover and
1713 * build any dependencies by doing a recursive lookup.
1715 mutex_enter(&ire->ire_lock);
1716 if (ire->ire_dep_parent != NULL) {
1717 if (ire->ire_dep_parent->ire_generation ==
1718 ire->ire_dep_parent_generation) {
1719 mutex_exit(&ire->ire_lock);
1720 return (ire);
1722 mutex_exit(&ire->ire_lock);
1723 } else {
1724 mutex_exit(&ire->ire_lock);
1726 * If this type should have an ire_nce_cache (even if it
1727 * doesn't yet have one) then we are done. Includes
1728 * IRE_INTERFACE with a full 32 bit mask.
1730 if (ire->ire_nce_capable)
1731 return (ire);
1735 * Fallback to loop in the normal code starting with the ire
1736 * we found. Normally this would return the same ire.
1738 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1739 MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, &generation);
1740 ire_refrele(ire);
1741 return (ire1);
1745 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1746 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1747 * are not consistent, and TRUE otherwise.
1749 boolean_t
1750 ire_clone_verify(ire_t *ire)
1752 ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1753 mutex_enter(&ire->ire_lock);
1754 if (ire->ire_dep_parent != NULL &&
1755 ire->ire_dep_parent->ire_generation !=
1756 ire->ire_dep_parent_generation) {
1757 mutex_exit(&ire->ire_lock);
1758 ire_delete(ire);
1759 return (B_FALSE);
1761 mutex_exit(&ire->ire_lock);
1762 return (B_TRUE);