Unleashed v1.4
[unleashed.git] / kernel / net / ip / ip_output.c
blob7e1f5397804bc75ad27a7cd94c3870863526414a
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1990 Mentat Inc. */
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsubr.h>
30 #include <sys/dlpi.h>
31 #include <sys/strsun.h>
32 #include <sys/zone.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/atomic.h>
39 #include <sys/systm.h>
40 #include <sys/param.h>
41 #include <sys/kmem.h>
42 #include <sys/sdt.h>
43 #include <sys/socket.h>
44 #include <sys/mac.h>
45 #include <net/if.h>
46 #include <net/if_arp.h>
47 #include <net/route.h>
48 #include <sys/sockio.h>
49 #include <netinet/in.h>
50 #include <net/if_dl.h>
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/arp.h>
57 #include <inet/snmpcom.h>
58 #include <inet/kstatcom.h>
60 #include <netinet/igmp_var.h>
61 #include <netinet/ip6.h>
62 #include <netinet/icmp6.h>
63 #include <netinet/sctp.h>
65 #include <inet/ip.h>
66 #include <inet/ip_impl.h>
67 #include <inet/ip6.h>
68 #include <inet/ip6_asp.h>
69 #include <inet/tcp.h>
70 #include <inet/ip_multi.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_ire.h>
73 #include <inet/ip_ftable.h>
74 #include <inet/ip_rts.h>
75 #include <inet/optcom.h>
76 #include <inet/ip_ndp.h>
77 #include <inet/ip_listutils.h>
78 #include <netinet/igmp.h>
79 #include <netinet/ip_mroute.h>
80 #include <inet/ipp_common.h>
82 #include <net/pfkeyv2.h>
83 #include <inet/sadb.h>
84 #include <inet/ipsec_impl.h>
85 #include <inet/ipdrop.h>
86 #include <inet/ip_netinfo.h>
88 #include <sys/pattr.h>
89 #include <inet/ipclassifier.h>
90 #include <inet/sctp_ip.h>
91 #include <inet/sctp/sctp_impl.h>
92 #include <inet/udp_impl.h>
93 #include <sys/sunddi.h>
95 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
97 #ifdef DEBUG
98 extern boolean_t skip_sctp_cksum;
99 #endif
101 static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
102 static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
103 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
104 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
105 static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
108 * There are two types of output functions for IP used for different
109 * purposes:
110 * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there
111 * is no context in the form of a conn_t. However, there is a
112 * ip_xmit_attr_t that the callers use to influence interface selection
113 * (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
115 * - conn_ip_output() is used when sending packets with a conn_t and
116 * ip_set_destination has been called to cache information. In that case
117 * various socket options are recorded in the ip_xmit_attr_t and should
118 * be taken into account.
122 * The caller *must* have called conn_connect() or ip_attr_connect()
123 * before calling conn_ip_output(). The caller needs to redo that each time
124 * the destination IP address or port changes, as well as each time there is
125 * a change to any socket option that would modify how packets are routed out
126 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
128 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
129 * We assert for that here.
/*
 * Send a packet on behalf of a conn_t: revalidates the cached ire/nce/dce
 * state in the ip_xmit_attr_t against current generation numbers, then
 * dispatches via ire_sendfn.  Returns zero or an errno.
 * Consumes mp in all cases (either handed to ire_sendfn or freed on drop).
 */
int
conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	iaflags_t	ixaflags = ixa->ixa_flags;
	ire_t		*ire;
	nce_t		*nce;
	dce_t		*dce;
	ill_t		*ill;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	int		error;

	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */

	ASSERT(ixa->ixa_ire != NULL);
	/* Note there is no ixa_nce when reject and blackhole routes */
	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */

#ifdef DEBUG
	/* Enforce the "single threaded per ixa" contract documented above */
	ASSERT(ixa->ixa_curthread == NULL);
	ixa->ixa_curthread = curthread;
#endif

	ire = ixa->ixa_ire;

	/*
	 * If the ULP says the (old) IRE resulted in reachability we
	 * record this before determining whether to use a new IRE.
	 * No locking for performance reasons.
	 */
	if (ixaflags & IXAF_REACH_CONF)
		ire->ire_badcnt = 0;

	/*
	 * Has routing changed since we cached the results of the lookup?
	 *
	 * This check captures all of:
	 * - the cached ire being deleted (by means of the special
	 *   IRE_GENERATION_CONDEMNED)
	 * - A potentially better ire being added (ire_generation being
	 *   increased)
	 * - A deletion of the nexthop ire that was used when we did the
	 *   lookup.
	 * - An addition of a potentially better nexthop ire.
	 * The last two are handled by walking and increasing the generation
	 * number on all dependent IREs in ire_flush_cache().
	 *
	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
	 * since we ensure that each time we set ixa_ire to such an IRE we
	 * make sure the ixa_ire_generation does not match (by using
	 * IRE_GENERATION_VERIFY).
	 */
	if (ire->ire_generation != ixa->ixa_ire_generation) {
		error = ip_verify_ire(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify ire",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
			    &ipst->ips_dce_default->dce_ident));
		}
		/*
		 * If the ncec changed then ip_verify_ire already set
		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		 * so we can recheck the interface mtu.
		 */

		/*
		 * Note that ire->ire_generation could already have changed.
		 * We catch that next time we send a packet.
		 */
	}

	/*
	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
	 * is single threaded.
	 */
	ASSERT(ixa->ixa_nce != NULL);
	nce = ixa->ixa_nce;
	if (nce->nce_is_condemned) {
		error = ip_verify_nce(mp, ixa);
		/*
		 * In case the ZEROCOPY capability has become unavailable, we
		 * copy the message and free the original one. We might
		 * be copying more data than needed but it doesn't hurt
		 * since such a change rarely happens.
		 */
		switch (error) {
		case 0:
			break;
		case ENOTSUP: { /* ZEROCOPY */
			mblk_t *nmp;

			if ((nmp = copymsg(mp)) != NULL) {
				freemsg(mp);
				mp = nmp;

				break;
			}
		}
		/* FALLTHROUGH */
		default:
			ip_drop_output("ipIfStatsOutDiscards - verify nce",
			    mp, NULL);
			goto drop;
		}
		ire = ixa->ixa_ire;
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
#ifdef DEBUG
			ASSERT(ixa->ixa_curthread == curthread);
			ixa->ixa_curthread = NULL;
#endif
			ire->ire_ob_pkt_count++;
			/* ixa_dce might be condemned; use default one */
			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
			    ixa, &ipst->ips_dce_default->dce_ident));
		}
		ASSERT(ixa->ixa_nce != NULL);
		nce = ixa->ixa_nce;

		/*
		 * Note that some other event could already have made
		 * the new nce condemned. We catch that next time we
		 * try to send a packet.
		 */
	}

	/*
	 * If there is no per-destination dce_t then we have a reference to
	 * the default dce_t (which merely contains the dce_ipid).
	 * The generation check captures both the introduction of a
	 * per-destination dce_t (e.g., due to ICMP packet too big) and
	 * any change to the per-destination dce (including it becoming
	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
	 */
	dce = ixa->ixa_dce;

	/*
	 * To avoid a periodic timer to increase the path MTU we
	 * look at dce_last_change_time each time we send a packet.
	 */
	if (dce->dce_flags & DCEF_PMTU) {
		int64_t now = LBOLT_FASTPATH64;

		if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval)) {
			/*
			 * Older than 20 minutes. Drop the path MTU information.
			 * Since the path MTU changes as a result of this,
			 * twiddle ixa_dce_generation to make us go through the
			 * dce verification code in conn_ip_output.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(now);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		}
	}

	if (dce->dce_generation != ixa->ixa_dce_generation) {
		error = ip_verify_dce(mp, ixa);
		if (error != 0) {
			ip_drop_output("ipIfStatsOutDiscards - verify dce",
			    mp, NULL);
			goto drop;
		}
		dce = ixa->ixa_dce;

		/*
		 * Note that some other event could already have made the
		 * new dce's generation number change.
		 * We catch that next time we try to send a packet.
		 */
	}

	ill = nce->nce_ill;

	/*
	 * An initial ixa_fragsize was set in ip_set_destination
	 * and we update it if any routing changes above.
	 * A change to ill_mtu with ifconfig will increase all dce_generation
	 * so that we will detect that with the generation check. Ditto for
	 * ill_mc_mtu.
	 */

	/*
	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
	 * conn_unspec_src.
	 */
	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
	    ixa->ixa_src_generation != ipst->ips_src_generation) {
		/* Check if the IP source is still assigned to the host. */
		uint_t gen;

		if (!ip_verify_src(mp, ixa, &gen)) {
			/* Don't send a packet with a source that isn't ours */
			error = EADDRNOTAVAIL;
			ip_drop_output("ipIfStatsOutDiscards - invalid src",
			    mp, NULL);
			goto drop;
		}
		/* The source is still valid - update the generation number */
		ixa->ixa_src_generation = gen;
	}

	/*
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v* - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v* - for the rest.
	 */
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));

drop:
	if (ixaflags & IXAF_IS_IPV4) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
	} else {
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
	}
	freemsg(mp);
#ifdef DEBUG
	ASSERT(ixa->ixa_curthread == curthread);
	ixa->ixa_curthread = NULL;
#endif
	return (error);
}
385 * Handle both IPv4 and IPv6. Sets the generation number
386 * to allow the caller to know when to call us again.
387 * Returns true if the source address in the packet is a valid source.
388 * We handle callers which try to send with a zero address (since we only
389 * get here if UNSPEC_SRC is not set).
391 boolean_t
392 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
394 ip_stack_t *ipst = ixa->ixa_ipst;
397 * Need to grab the generation number before we check to
398 * avoid a race with a change to the set of local addresses.
399 * No lock needed since the thread which updates the set of local
400 * addresses use ipif/ill locks and exit those (hence a store memory
401 * barrier) before doing the atomic increase of ips_src_generation.
403 if (generationp != NULL)
404 *generationp = ipst->ips_src_generation;
406 if (ixa->ixa_flags & IXAF_IS_IPV4) {
407 ipha_t *ipha = (ipha_t *)mp->b_rptr;
409 if (ipha->ipha_src == INADDR_ANY)
410 return (B_FALSE);
412 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
413 ipst, B_FALSE) != IPVL_BAD);
414 } else {
415 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
416 uint_t scopeid;
418 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
419 return (B_FALSE);
421 if (ixa->ixa_flags & IXAF_SCOPEID_SET)
422 scopeid = ixa->ixa_scopeid;
423 else
424 scopeid = 0;
426 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
427 ipst, B_FALSE, scopeid) != IPVL_BAD);
/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
 * On success the new ire (and its generation) is installed in the ixa;
 * for reject/blackhole routes and on allocation failure the generation
 * numbers are poisoned so the next send re-verifies.
 */
int
ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	uint_t		gen;
	ire_t		*ire;
	nce_t		*nce;
	int		error;

	/*
	 * Redo ip_select_route.
	 * Need to grab generation number as part of the lookup to
	 * avoid race.
	 */
	error = 0;
	ire = ip_select_route_pkt(mp, ixa, &gen, &error);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0) {
		ire_refrele(ire);
		return (error);
	}

	/* Swap the ixa's ire hold to the new ire (untracked hold) */
	if (ixa->ixa_ire != NULL)
		ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
	/* Convert the tracked lookup hold into an untracked one */
	ire_refhold_notr(ire);
	ire_refrele(ire);
#endif
	ixa->ixa_ire = ire;
	ixa->ixa_ire_generation = gen;
	ixa->ixa_postfragfn = ire->ire_postfragfn;

	/*
	 * Don't look for an nce for reject or blackhole.
	 * They have ire_generation set to IRE_GENERATION_VERIFY which
	 * makes conn_ip_output avoid references to ixa_nce.
	 */
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (0);
	}

	/* The NCE could now be different */
	nce = ire_to_nce_pkt(ire, mp);
	if (nce == NULL) {
		/*
		 * Allocation failure. Make sure we redo ire/nce selection
		 * next time we send.
		 */
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
		return (ENOBUFS);
	}
	if (nce == ixa->ixa_nce) {
		/* No change */
		nce_refrele(nce);
		return (0);
	}

	/*
	 * Since the path MTU might change as a result of this
	 * route change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	/* Transfer the nce hold from the lookup into the ixa */
	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (0);
}
/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
 * Replaces a condemned ixa_nce with a fresh one (falling back to a full
 * ip_verify_ire when no nce can be found), and re-checks hardware offload
 * capabilities (LSO, ZEROCOPY) of the new interface, notifying the ULP of
 * any change. Returns 0, ENOTSUP (capability lost), or an ire-lookup errno.
 */
static int
ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ire_t		*ire = ixa->ixa_ire;
	nce_t		*nce;
	int		error = 0;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;

	if (ire->ire_ipversion == IPV4_VERSION)
		ipha = (ipha_t *)mp->b_rptr;
	else
		ip6h = (ip6_t *)mp->b_rptr;

	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
	if (nce == NULL) {
		/* Try to find a better ire */
		return (ip_verify_ire(mp, ixa));
	}

	/*
	 * The hardware offloading capabilities, for example LSO, of the
	 * interface might have changed, so do sanity verification here.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
		if (!ip_verify_lso(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_LSO, 0);
			error = ENOTSUP;
		}
	}

	/*
	 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
	 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
	 * any more, return error so that conn_ip_output() can take care of
	 * the ZEROCOPY message properly. It's safe to continue sending the
	 * message when ZEROCOPY newly becomes available.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_ZCOPY, 0);
			/* Only an error when the capability was lost */
			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
				error = ENOTSUP;
		}
	}

	/*
	 * Since the path MTU might change as a result of this
	 * change, we twiddle ixa_dce_generation to
	 * make conn_ip_output go through the ip_verify_dce code.
	 */
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;

	nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = nce;
	return (error);
}
/*
 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
 * Installs the freshly looked up dce (and its generation) in the ixa and
 * recomputes the path MTU. Returns EMSGSIZE only when IXAF_VERIFY_PMTU is
 * set and the path MTU decreased; the ULP is notified of any PMTU change.
 */
static int
ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	dce_t		*dce;
	uint_t		gen;
	uint_t		pmtu;

	dce = dce_lookup_pkt(mp, ixa, &gen);
	ASSERT(dce != NULL);

	/* Swap the ixa's (untracked) dce hold to the new dce */
	dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
	dce_refhold_notr(dce);
	dce_refrele(dce);
#endif
	ixa->ixa_dce = dce;
	ixa->ixa_dce_generation = gen;

	/* Extract the (path) mtu from the dce, ncec_ill etc */
	pmtu = ip_get_pmtu(ixa);

	/*
	 * Tell ULP about PMTU changes - increase or decrease - by returning
	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
	 * both ixa_pmtu and ixa_fragsize appropriately.
	 *
	 * If ULP doesn't set that flag then we need to update ixa_fragsize
	 * since routing could have changed the ill after ixa_fragsize
	 * was set previously in the conn_ip_output path or in
	 * ip_set_destination.
	 *
	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
	 *
	 * In the case of a path MTU increase we send the packet after the
	 * notify to the ULP.
	 */
	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
		if (ixa->ixa_pmtu != pmtu) {
			uint_t oldmtu = ixa->ixa_pmtu;

			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
			    uint32_t, ixa->ixa_pmtu);
			ASSERT(ixa->ixa_notify != NULL);
			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
			    IXAN_PMTU, pmtu);
			/* Only a decrease is an error for the caller */
			if (pmtu < oldmtu)
				return (EMSGSIZE);
		}
	} else {
		ixa->ixa_fragsize = pmtu;
	}
	return (0);
}
629 * Verify LSO usability. Keep the return value simple to indicate whether
630 * the LSO capability has changed. Handle both IPv4 and IPv6.
632 static boolean_t
633 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
635 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab;
636 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab;
638 if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
640 * Not unsable any more.
642 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
643 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
644 ((ixa->ixa_flags & IXAF_IS_IPV4) ?
645 !ILL_LSO_TCP_IPV4_USABLE(ill) :
646 !ILL_LSO_TCP_IPV6_USABLE(ill))) {
647 ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
649 return (B_FALSE);
653 * Capability has changed, refresh the copy in ixa.
655 if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
656 *lsoc = *new_lsoc;
658 return (B_FALSE);
660 } else { /* Was not usable */
661 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
662 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
663 ((ixa->ixa_flags & IXAF_IS_IPV4) ?
664 ILL_LSO_TCP_IPV4_USABLE(ill) :
665 ILL_LSO_TCP_IPV6_USABLE(ill))) {
666 *lsoc = *new_lsoc;
667 ixa->ixa_flags |= IXAF_LSO_CAPAB;
669 return (B_FALSE);
673 return (B_TRUE);
677 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
678 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
680 static boolean_t
681 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
683 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
685 * Not unsable any more.
687 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
688 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
689 !ILL_ZCOPY_USABLE(ill)) {
690 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
692 return (B_FALSE);
694 } else { /* Was not usable */
695 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
696 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
697 ILL_ZCOPY_USABLE(ill)) {
698 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
700 return (B_FALSE);
704 return (B_TRUE);
709 * When there is no conn_t context, this will send a packet.
710 * The caller must *not* have called conn_connect() or ip_attr_connect()
711 * before calling ip_output_simple().
712 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
713 * Honors IXAF_SET_SOURCE.
715 * We acquire the ire and after calling ire_sendfn we release
716 * the hold on the ire. Ditto for the nce and dce.
718 * This assumes that the caller has set the following in ip_xmit_attr_t:
719 * ixa_zoneid, and ixa_ipst must always be set.
720 * If ixa_ifindex is non-zero it means send out that ill. (If it is
721 * an upper IPMP ill we load balance across the group; if a lower we send
722 * on that lower ill without load balancing.)
723 * IXAF_IS_IPV4 must be set correctly.
724 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
725 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
726 * If neither of those two are set we do an IPsec policy lookup.
728 * We handle setting things like
729 * ixa_pktlen
730 * ixa_ip_hdr_length
731 * ixa->ixa_protocol
733 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
734 * transmit ring selecting in GLD.
736 * The caller must do an ixa_cleanup() to release any IPsec references
737 * after we return.
740 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
742 int err;
744 ASSERT(ixa->ixa_ipst != NULL);
746 if (ixa->ixa_flags & IXAF_IS_IPV4)
747 return (ip_output_simple_v4(mp, ixa));
748 else
749 return (ip_output_simple_v6(mp, ixa));
/*
 * IPv4 leg of ip_output_simple(): selects a route, nce, and dce for the
 * packet, applies path-MTU aging, source selection/verification, and
 * global IPsec policy, then dispatches via ire_sendfn. Consumes mp.
 * Returns zero or an errno (ENOBUFS, EADDRNOTAVAIL, EHOSTUNREACH, ...).
 */
int
ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
{
	ipha_t		*ipha;
	ipaddr_t	firsthop; /* In IP header */
	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
	ire_t		*ire;
	ipaddr_t	setsrc;	/* RTF_SETSRC */
	int		error;
	ill_t		*ill = NULL;
	dce_t		*dce = NULL;
	nce_t		*nce;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	boolean_t	repeat = B_FALSE;
	int64_t		now;

	ipha = (ipha_t *)mp->b_rptr;
	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);

	/* Caller already set flags */
	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);

	ASSERT(ixa->ixa_nce == NULL);

	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
	ixa->ixa_protocol = ipha->ipha_protocol;

	/*
	 * Assumes that source routed packets have already been massaged by
	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
	 * hop in the source route. The final destination is used for IPsec
	 * policy and DCE lookup.
	 */
	firsthop = ipha->ipha_dst;
	dst = ip_get_dst(ipha);

repeat_ire:
	error = 0;
	setsrc = INADDR_ANY;
	ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
	    &setsrc, &error);
	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
	if (error != 0) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
		freemsg(mp);
		goto done;
	}

	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
		/* ire_ill might be NULL hence need to skip some code */
		if (ixaflags & IXAF_SET_SOURCE)
			ipha->ipha_src = htonl(INADDR_LOOPBACK);
		ixa->ixa_fragsize = IP_MAXPACKET;
		ill = NULL;
		nce = NULL;
		ire->ire_ob_pkt_count++;
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
		/* No dce yet; use default one */
		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
		    &ipst->ips_dce_default->dce_ident);
		goto done;
	}

	/* Note that ipha_dst is only used for IRE_MULTICAST */
	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
	if (nce == NULL) {
		/* Allocation failure? */
		ip_drop_output("ire_to_nce", mp, ill);
		freemsg(mp);
		error = ENOBUFS;
		goto done;
	}
	if (nce->nce_is_condemned) {
		nce_t *nce1;

		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
		nce_refrele(nce);
		if (nce1 == NULL) {
			if (!repeat) {
				/* Try finding a better IRE */
				repeat = B_TRUE;
				ire_refrele(ire);
				goto repeat_ire;
			}
			/* Tried twice - drop packet */
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("No nce", mp, ill);
			freemsg(mp);
			error = ENOBUFS;
			goto done;
		}
		nce = nce1;
	}

	ixa->ixa_postfragfn = ire->ire_postfragfn;

	ASSERT(ixa->ixa_nce == NULL);
	ixa->ixa_nce = nce;

	/*
	 * Check for a dce_t with a path mtu.
	 */
	dce = dce_lookup_v4(dst, ipst, NULL);
	ASSERT(dce != NULL);

	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
	} else if (dce->dce_flags & DCEF_PMTU) {
		/*
		 * To avoid a periodic timer to increase the path MTU we
		 * look at dce_last_change_time each time we send a packet.
		 */
		now = ddi_get_lbolt64();
		if (TICK_TO_SEC(now) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			/*
			 * Older than 20 minutes. Drop the path MTU information.
			 */
			mutex_enter(&dce->dce_lock);
			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
			dce->dce_last_change_time = TICK_TO_SEC(now);
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
		} else {
			uint_t fragsize;

			/* Path MTU is smaller than the interface MTU */
			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
			if (fragsize > dce->dce_pmtu)
				fragsize = dce->dce_pmtu;
			ixa->ixa_fragsize = fragsize;
		}
	} else {
		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
	}

	/*
	 * We use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
	 * interface for source address selection.
	 */
	ill = ire_nexthop_ill(ire);

	if (ixaflags & IXAF_SET_SOURCE) {
		ipaddr_t	src;

		/*
		 * We use the final destination to get
		 * correct selection for source routed packets
		 */

		/* If unreachable we have no ill but need some source */
		if (ill == NULL) {
			src = htonl(INADDR_LOOPBACK);
			error = 0;
		} else {
			error = ip_select_source_v4(ill, setsrc, dst,
			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
			    &src, NULL, NULL);
		}
		if (error != 0) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no source",
			    mp, ill);
			freemsg(mp);
			goto done;
		}
		ipha->ipha_src = src;
	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
		/* Check if the IP source is assigned to the host. */
		if (!ip_verify_src(mp, ixa, NULL)) {
			/* Don't send a packet with a source that isn't ours */
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - invalid source",
			    mp, ill);
			freemsg(mp);
			error = EADDRNOTAVAIL;
			goto done;
		}
	}

	/*
	 * Check against global IPsec policy to set the AH/ESP attributes.
	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
	 */
	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
		ASSERT(ixa->ixa_ipsec_policy == NULL);
		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
		if (mp == NULL) {
			/* MIB and ip_drop_packet already done */
			/*
			 * NOTE(review): this early return bypasses the
			 * "done:" cleanup, so the local holds on ire, dce
			 * and ill (ixa_nce is released by the caller's
			 * ixa_cleanup) are not released here — verify
			 * against upstream whether this leaks references.
			 */
			return (EHOSTUNREACH);	/* IPsec policy failure */
		}
	}

	if (ill != NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
	} else {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
	}

	/*
	 * We update the statistics on the most specific IRE i.e., the first
	 * one we found.
	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
	 * can only count the use prior to fragmentation. However the MIB
	 * counters on the ill will be incremented in post fragmentation.
	 */
	ire->ire_ob_pkt_count++;

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_send_multicast_v4 - for IRE_MULTICAST
	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
	 *	ire_send_wire_v4 - for the rest.
	 */
	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
done:
	ire_refrele(ire);
	if (dce != NULL)
		dce_refrele(dce);
	if (ill != NULL)
		ill_refrele(ill);
	if (ixa->ixa_nce != NULL)
		nce_refrele(ixa->ixa_nce);
	ixa->ixa_nce = NULL;
	return (error);
}
990 * ire_sendfn() functions.
991 * These functions use the following xmit_attr:
992 * - ixa_fragsize - read to determine whether or not to fragment
993 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
994 * - ixa_ipsec_* are used inside IPsec
995 * - IXAF_SET_SOURCE - replace IP source in broadcast case.
996 * - IXAF_LOOPBACK_COPY - for multicast and broadcast
/*
 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
 *
 * Loops the packet back into the local IP input path: runs the loopback-out
 * and loopback-in firewall hooks and observability taps, converts the
 * transmit attributes into receive attributes, and hands the packet to
 * ip_fanout_v4. Always returns 0 unless a firewall hook consumed the packet.
 *
 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
 */
/* ARGSUSED4 */
int
ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	uint_t		pktlen = ixa->ixa_pktlen;

	/*
	 * No fragmentation, no nce, no application of IPsec,
	 * and no ipha_ident assignment.
	 *
	 * Note different order between IP provider and FW_HOOKS than in
	 * send_wire case.
	 */

	/*
	 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the
	 * send probe, but not the receive probe.
	 */
	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
	    int, 1);

	if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
		int error;

		DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
		    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
		FW_HOOKS(ipst->ips_ip4_loopback_out_event,
		    ipst->ips_ipv4firewall_loopback_out,
		    NULL, ill, ipha, mp, mp, 0, ipst, error);
		DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		/* A hook may consume (or replace) the message */
		if (mp == NULL)
			return (error);

		/*
		 * Even if the destination was changed by the filter we use the
		 * forwarding decision that was made based on the address
		 * in ip_output/ip_set_destination.
		 */
		/* Length could be different */
		ipha = (ipha_t *)mp->b_rptr;
		pktlen = ntohs(ipha->ipha_length);
	}

	/*
	 * If a callback is enabled then we need to know the
	 * source and destination zoneids for the packet. We already
	 * have those handy.
	 */
	if (ipst->ips_ip4_observe.he_interested) {
		zoneid_t szone, dzone;
		zoneid_t stackzoneid;

		stackzoneid = netstackid_to_zoneid(
		    ipst->ips_netstack->netstack_stackid);

		if (stackzoneid == GLOBAL_ZONEID) {
			/* Shared-IP zone */
			dzone = ire->ire_zoneid;
			szone = ixa->ixa_zoneid;
		} else {
			szone = dzone = stackzoneid;
		}
		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
	}

	/* Handle lo0 stats */
	ipst->ips_loopback_packets++;

	/* Map ixa to ira including IPsec policies */
	ipsec_out_to_in(ixa, ill, &iras);
	iras.ira_pktlen = pktlen;

	if (!IS_SIMPLE_IPH(ipha)) {
		/* Process IP options destined for the local system */
		ip_output_local_options(ipha, ipst);
		iras.ira_flags |= IRAF_IPV4_OPTIONS;
	}

	if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
		int error;

		DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
		    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
		FW_HOOKS(ipst->ips_ip4_loopback_in_event,
		    ipst->ips_ipv4firewall_loopback_in,
		    ill, NULL, ipha, mp, mp, 0, ipst, error);
		DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		if (mp == NULL) {
			/* Drop the IPsec refs moved into iras above */
			ira_cleanup(&iras, B_FALSE);
			return (error);
		}
		/*
		 * Even if the destination was changed by the filter we use the
		 * forwarding decision that was made based on the address
		 * in ip_output/ip_set_destination.
		 */
		/* Length could be different */
		ipha = (ipha_t *)mp->b_rptr;
		pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
	}

	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
	    int, 1);

	ire->ire_ib_pkt_count++;
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);

	/* Destined to ire_zoneid - use that for fanout */
	iras.ira_zoneid = ire->ire_zoneid;

	ip_fanout_v4(mp, ipha, &iras);

	/* We moved any IPsec refs from ixa to iras */
	ira_cleanup(&iras, B_FALSE);
	return (0);
}
/*
 * ire_sendfn for IRE_BROADCAST
 *
 * If the broadcast address is present on multiple ills and ixa_ifindex
 * isn't set, then we generate a separate datagram (potentially with a
 * different source address) for each of those ills. In any case, only
 * one copy is looped back to ip_input_v4.
 *
 * Returns an errno from the wire send of the main (caller's) IRE; errors
 * on the per-ill extra copies are deliberately ignored.
 */
int
ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	irb_t		*irb = ire->ire_bucket;
	ire_t		*ire1;
	mblk_t		*mp1;
	ipha_t		*ipha1;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce1, *nce_orig;

	/*
	 * Unless someone already set a ttl, force the ttl to a smallish
	 * value.
	 */
	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
		/*
		 * To avoid broadcast storms, we usually set the TTL to 1 for
		 * broadcasts. This can be overridden stack-wide through the
		 * ip_broadcast_ttl ndd tunable, or on a per-connection basis
		 * through the IP_BROADCAST_TTL socket option.
		 *
		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
		 * will force ttl to one after we've set this.
		 */
		if (ixaflags & IXAF_BROADCAST_TTL_SET)
			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
		else
			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	}

	/*
	 * Make sure we get a loopback copy (after IPsec and frag)
	 * Skip hardware checksum so that loopback copy is checksumed.
	 */
	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;

	/* Do we need to potentially generate multiple copies? */
	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));

	/*
	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
	 * Note that everything in the bucket has the same destination address.
	 */
	irb_refhold(irb);
	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		/* We do the main IRE after the end of the loop */
		if (ire1 == ire)
			continue;

		/*
		 * Only IREs for the same IP address should be in the same
		 * bucket.
		 */
		ASSERT(ire1->ire_addr == ire->ire_addr);
		if (!(ire1->ire_type & IRE_BROADCAST))
			continue;

		if (IRE_IS_CONDEMNED(ire1))
			continue;

		if (ixa->ixa_zoneid != ALL_ZONES &&
		    ire->ire_zoneid != ire1->ire_zoneid)
			continue;

		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);

		/*
		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
		 * ensure that this goes out on the cast_ill.
		 */
		if (IS_UNDER_IPMP(ire1->ire_ill))
			continue;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards",
			    mp, ire1->ire_ill);
			continue;
		}

		ipha1 = (ipha_t *)mp1->b_rptr;
		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
			/*
			 * Need to pick a different source address for each
			 * interface. If we have a global IPsec policy and
			 * no per-socket policy then we punt to
			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
			 */
			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
				ip_output_simple_broadcast(ixa, mp1);
				continue;
			}
			/* Pick a new source address for each interface */
			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
			    &ipha1->ipha_src, NULL, NULL) != 0) {
				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards - select "
				    "broadcast source", mp1, ire1->ire_ill);
				freemsg(mp1);
				continue;
			}
			/*
			 * Check against global IPsec policy to set the AH/ESP
			 * attributes. IPsec will set IXAF_IPSEC_* and
			 * ixa_ipsec_* as appropriate.
			 *
			 * NOTE(review): 'ipha' (the original header) is
			 * passed here rather than 'ipha1' (the copy whose
			 * source address was just rewritten) — confirm this
			 * is intentional for the policy lookup.
			 */
			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
				ASSERT(ixa->ixa_ipsec_policy == NULL);
				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
				    NULL, ixa);
				if (mp1 == NULL) {
					/*
					 * MIB and ip_drop_packet already
					 * done
					 */
					continue;
				}
			}
		}

		/* Make sure we have an NCE on this ill */
		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
		    ire1->ire_type);
		if (nce1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
			    mp1, ire1->ire_ill);
			freemsg(mp1);
			continue;
		}
		/* Temporarily swap in this ill's NCE for the wire send */
		nce_orig = ixa->ixa_nce;
		ixa->ixa_nce = nce1;

		ire_refhold(ire1);
		/*
		 * Ignore any errors here. We just collect the errno for
		 * the main ire below
		 */
		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
		ire_refrele(ire1);

		ixa->ixa_nce = nce_orig;
		nce_refrele(nce1);

		/* Only one copy is ever looped back to ip_input_v4 */
		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
	}
	irb_refrele(irb);
	/* Finally, the main one */

	/*
	 * For IPMP we only send broadcasts on the ipmp_ill.
	 */
	if (IS_UNDER_IPMP(ire->ire_ill)) {
		freemsg(mp);
		return (0);
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}
1306 * Send a packet using a different source address and different
1307 * IPsec policy.
1309 static void
1310 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1312 ip_xmit_attr_t ixas;
1314 bzero(&ixas, sizeof (ixas));
1315 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1316 ixas.ixa_zoneid = ixa->ixa_zoneid;
1317 ixas.ixa_ifindex = 0;
1318 ixas.ixa_ipst = ixa->ixa_ipst;
1319 ixas.ixa_cred = ixa->ixa_cred;
1320 ixas.ixa_cpid = ixa->ixa_cpid;
1321 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1323 (void) ip_output_simple(mp, &ixas);
1324 ixa_cleanup(&ixas);
/*
 * ire_sendfn for IRE_MULTICAST
 *
 * Decides whether a loopback copy of the outgoing multicast packet is
 * needed, forces the multicast TTL, then hands off to ire_send_wire_v4.
 */
int
ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	iaflags_t	ixaflags = ixa->ixa_flags;

	/*
	 * Check if anything in ip_input_v4 wants a copy of the transmitted
	 * packet (after IPsec and fragmentation)
	 *
	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set.
	 *    RSVP and the rsvp daemon is an example of a protocol and user
	 *    level process that handles it's own routing. Hence, it uses the
	 *    SO_DONTROUTE option to accomplish this.
	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
	 *    check whether there are any receivers for the group on the ill
	 *    (ignoring the zoneid).
	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
	 *    any members in other shared-IP zones.
	 *    If such members exist, then we indicate that the sending zone
	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
	 *    behavior.
	 *
	 * When we loopback we skip hardware checksum to make sure loopback
	 * copy is checksumed.
	 *
	 * Note that ire_ill is the upper in the case of IPMP.
	 */
	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
	    !(ixaflags & IXAF_DONTROUTE)) {
		/* Case 1: a multicast router is running on this ill */
		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
		/*
		 * Case 2: if this zone or any other zone has members then
		 * loopback a copy.
		 */
		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ipst->ips_netstack->netstack_numzones > 1) {
		/*
		 * Case 3: this zone should not have a copy. But there are
		 * some other zones which might have members.
		 */
		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
		    ixa->ixa_zoneid)) {
			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
		}
	}

	/*
	 * Unless icmp_output_hdrincl already set a ttl, force the ttl to
	 * the IP_MULTICAST_TTL value
	 */
	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}
1398 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1401 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1402 ip_xmit_attr_t *ixa, uint32_t *identp)
1404 ip_stack_t *ipst = ixa->ixa_ipst;
1405 ipha_t *ipha = (ipha_t *)iph_arg;
1406 ill_t *ill;
1407 ip_recv_attr_t iras;
1408 boolean_t dummy;
1410 /* We assign an IP ident for nice errors */
1411 ipha->ipha_ident = atomic_inc_32_nv(identp);
1413 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1415 if (ire->ire_type & IRE_NOROUTE) {
1416 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1417 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1418 RTA_DST, ipst);
1421 if (ire->ire_flags & RTF_BLACKHOLE) {
1422 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1423 freemsg(mp);
1424 /* No error even for local senders - silent blackhole */
1425 return (0);
1427 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1430 * We need an ill_t for the ip_recv_attr_t even though this packet
1431 * was never received and icmp_unreachable doesn't currently use
1432 * ira_ill.
1434 ill = ill_lookup_on_name("lo0", B_FALSE,
1435 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1436 if (ill == NULL) {
1437 freemsg(mp);
1438 return (EHOSTUNREACH);
1441 bzero(&iras, sizeof (iras));
1442 /* Map ixa to ira including IPsec policies */
1443 ipsec_out_to_in(ixa, ill, &iras);
1445 if (ip_source_routed(ipha, ipst)) {
1446 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1447 } else {
1448 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1450 /* We moved any IPsec refs from ixa to iras */
1451 ira_cleanup(&iras, B_FALSE);
1452 ill_refrele(ill);
1453 return (EHOSTUNREACH);
/*
 * Calculate a checksum ignoring any hardware capabilities
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 * NOTE(review): the visible body only ever returns B_TRUE; the B_FALSE
 * path described above is not present here — confirm against callers.
 */
static boolean_t
ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
	ipaddr_t	dst = ipha->ipha_dst;
	ipaddr_t	src = ipha->ipha_src;

	/* Just in case it contained garbage */
	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;

	/*
	 * Calculate ULP checksum. Non-TCP/UDP/SCTP protocols only get
	 * the IP header checksum below.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
		goto ip_hdr_cksum;
	}

	/* ULP puts the checksum field is in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * We accumulate the pseudo header checksum in cksum.
	 * This is pretty hairy code, so watch close. One
	 * thing to keep in mind is that UDP and TCP have
	 * stored their respective datagram lengths in their
	 * checksum fields. This lines things up real nice.
	 */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);

	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
	/*
	 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
	 * Change to 0xffff
	 */
	if (protocol == IPPROTO_UDP && cksum == 0)
		*cksump = ~cksum;
	else
		*cksump = cksum;

	IP_STAT(ipst, ip_out_sw_cksum);
	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);

ip_hdr_cksum:
	/* Calculate IPv4 header checksum */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (B_TRUE);
}
/*
 * Calculate the ULP checksum - try to use hardware.
 *
 * If the hardware supports IP header checksum offload; then clear the
 * contents of IP header checksum field as expected by NIC.
 * Do this only if we offloaded either full or partial sum.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 */
static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
    ip_xmit_attr_t *ixa, ill_t *ill)
{
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint16_t	hck_flags;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;

	/* Fall back to software when offload is disallowed or unavailable */
	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
	    !dohwcksum) {
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	}

	/*
	 * Calculate ULP checksum. Note that we don't use cksump and cksum
	 * if the ill has FULL support.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
		/* Other protocols: only the IP header checksum is needed */
	ip_hdr_cksum:
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		return (B_TRUE);
	}

	/* ULP puts the checksum field is in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * Underlying interface supports hardware checksum offload for
	 * the payload; leave the payload checksum for the hardware to
	 * calculate. N.B: We only need to set up checksum info on the
	 * first mblk.
	 */
	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;

	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
	if (hck_flags & HCKSUM_INET_FULL_V4) {
		/*
		 * Hardware calculates pseudo-header, header and the
		 * payload checksums, so clear the checksum field in
		 * the protocol header.
		 */
		*cksump = 0;
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	if ((hck_flags) & HCKSUM_INET_PARTIAL) {
		ipaddr_t	dst = ipha->ipha_dst;
		ipaddr_t	src = ipha->ipha_src;

		/*
		 * Partial checksum offload has been enabled. Fill
		 * the checksum field in the protocol header with the
		 * pseudo-header checksum value.
		 *
		 * We accumulate the pseudo header checksum in cksum.
		 * This is pretty hairy code, so watch close. One
		 * thing to keep in mind is that UDP and TCP have
		 * stored their respective datagram lengths in their
		 * checksum fields. This lines things up real nice.
		 */
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum += *(cksump);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Offsets are relative to beginning of IP header.
		 */
		DB_CKSUMSTART(mp) = ip_hdr_length;
		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
		DB_CKSUMEND(mp) = pktlen;
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	/* Hardware capabilities include neither full nor partial IPv4 */
	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
}
/*
 * ire_sendfn for offlink and onlink destinations.
 * Also called from the multicast and broadcast send functions.
 *
 * Assumes that the caller has a hold on the ire.
 *
 * This function doesn't care if the IRE just became condemned since that
 * can happen at any time.
 */
/* ARGSUSED */
int
ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ill_t		*ill;

	ASSERT(ixa->ixa_nce != NULL);
	ill = ixa->ixa_nce->nce_ill;

	/* SO_DONTROUTE: never leave the local subnet */
	if (ixaflags & IXAF_DONTROUTE)
		ipha->ipha_ttl = 1;

	/*
	 * Assign an ident value for this packet. There could be other
	 * threads targeting the same destination, so we have to arrange
	 * for a atomic increment. Note that we use a 32-bit atomic add
	 * because it has better performance than its 16-bit sibling.
	 *
	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
	 * be the number of TCP segments that the driver/hardware will
	 * extraly construct.
	 */
	ipha->ipha_ident = atomic_add_32_nv(identp,
	    ixa->ixa_extra_ident + 1);
#ifndef _BIG_ENDIAN
	ipha->ipha_ident = htons(ipha->ipha_ident);
#endif

	/*
	 * This might set b_band, thus the IPsec and fragmentation
	 * code in IP ensures that b_band is updated in the first mblk.
	 */
	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			return (0);	/* Might just be delayed */
		}
	}

	/*
	 * Verify any IPv4 options.
	 *
	 * The presense of IP options also forces the network stack to
	 * calculate the checksum in software. This is because:
	 *
	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
	 * the size of "start offset" width to 6-bit. This effectively
	 * sets the largest value of the offset to 64-bytes, starting
	 * from the MAC header. When the cumulative MAC and IP headers
	 * exceed such limit, the offset will wrap around. This causes
	 * the checksum to be calculated at the wrong place.
	 *
	 * IPv4 source routing: none of the full-checksum capable NICs
	 * is capable of correctly handling the IPv4 source-routing
	 * option for purposes of calculating the pseudo-header; the
	 * actual destination is different from the destination in the
	 * header which is that of the next-hop. (This case may not be
	 * true for NICs which can parse IPv6 extension headers, but
	 * we choose to simplify the implementation by not offloading
	 * checksum when they are present.)
	 */
	if (!IS_SIMPLE_IPH(ipha)) {
		/* Update both the local copy and the attribute flags */
		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
		/* An IS_UNDER_IPMP ill is ok here */
		if (ip_output_options(mp, ipha, ixa, ill)) {
			/* Packet has been consumed and ICMP error sent */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			return (EINVAL);
		}
	}

	/* Fragmentation and/or IPsec path */
	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
	    (ixaflags & IXAF_IPSEC_SECURE)) {
		uint32_t pktlen;

		pktlen = ixa->ixa_pktlen;
		if (ixaflags & IXAF_IPSEC_SECURE)
			pktlen += ipsec_out_extra_length(ixa);

		if (pktlen > IP_MAXPACKET)
			return (EMSGSIZE);

		if (ixaflags & IXAF_SET_ULP_CKSUM) {
			/*
			 * Compute ULP checksum and IP header checksum
			 * using software
			 */
			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
				freemsg(mp);
				return (EINVAL);
			}
		} else {
			/* Calculate IPv4 header checksum */
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}

		/*
		 * If this packet would generate a icmp_frag_needed
		 * message, we need to handle it before we do the IPsec
		 * processing. Otherwise, we need to strip the IPsec
		 * headers before we send up the message to the ULPs
		 * which becomes messy and difficult.
		 *
		 * We check using IXAF_DONTFRAG. The DF bit in the header
		 * is not inspected - it will be copied to any generated
		 * fragments.
		 */
		if ((pktlen > ixa->ixa_fragsize) &&
		    (ixaflags & IXAF_DONTFRAG)) {
			/* Generate ICMP and return error */
			ip_recv_attr_t iras;

			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
			    uint_t, ixa->ixa_pmtu);

			bzero(&iras, sizeof (iras));
			/* Map ixa to ira including IPsec policies */
			ipsec_out_to_in(ixa, ill, &iras);

			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
			/* We moved any IPsec refs from ixa to iras */
			ira_cleanup(&iras, B_FALSE);
			return (EMSGSIZE);
		}
		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
		    uint_t, ixa->ixa_pmtu);

		if (ixaflags & IXAF_IPSEC_SECURE) {
			/*
			 * Pass in sufficient information so that
			 * IPsec can determine whether to fragment, and
			 * which function to call after fragmentation.
			 */
			return (ipsec_out_process(mp, ixa));
		}
		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
	}

	/* Fast path: no fragmentation, no IPsec */
	if (ixaflags & IXAF_SET_ULP_CKSUM) {
		/* Compute ULP checksum and IP header checksum */
		/* An IS_UNDER_IPMP ill is ok here */
		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
			freemsg(mp);
			return (EINVAL);
		}
	} else {
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	}
	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
}
/*
 * Send mp into ip_input
 * Common for IPv4 and IPv6
 *
 * Builds an ip_recv_attr_t from scratch (field by field rather than a
 * bzero, for performance) and invokes the ill's input function as if
 * the packet had arrived on the wire.
 */
void
ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
    uint_t pkt_len, zoneid_t nolzid)
{
	rtc_t		rtc;
	ill_t		*ill = nce->nce_ill;
	ip_recv_attr_t	iras;	/* NOTE: No bzero for performance */
	ncec_t		*ncec;

	ncec = nce->nce_common;
	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
	/* Record the L2 destination class from the neighbor cache entry */
	if (ncec->ncec_flags & NCE_F_BCAST)
		iras.ira_flags |= IRAF_L2DST_BROADCAST;
	else if (ncec->ncec_flags & NCE_F_MCAST)
		iras.ira_flags |= IRAF_L2DST_MULTICAST;

	iras.ira_free_flags = 0;
	iras.ira_cred = NULL;
	iras.ira_cpid = NOPID;
	iras.ira_zoneid = ALL_ZONES;
	iras.ira_pktlen = pkt_len;
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);

	if (ixaflags & IXAF_IS_IPV4)
		iras.ira_flags |= IRAF_IS_IPV4;

	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;
	iras.ira_mhip = NULL;

	/* Carry over the flags shared between xmit and recv attributes */
	iras.ira_flags |= ixaflags & IAF_MASK;
	iras.ira_no_loop_zoneid = nolzid;

	/* Broadcast and multicast doesn't care about the squeue */
	iras.ira_sqp = NULL;

	rtc.rtc_ire = NULL;
	if (ixaflags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)mp->b_rptr;

		rtc.rtc_ipaddr = INADDR_ANY;

		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
		if (rtc.rtc_ire != NULL) {
			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
			ire_refrele(rtc.rtc_ire);
		}
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

		rtc.rtc_ip6addr = ipv6_all_zeros;

		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
		if (rtc.rtc_ire != NULL) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
			ire_refrele(rtc.rtc_ire);
		}
	}
	/* Any references to clean up? No hold on ira */
	if (iras.ira_flags & (IRAF_IPSEC_SECURE))
		ira_cleanup(&iras, B_FALSE);
}
1916 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
1917 * looks at the IXAF_LOOPBACK_COPY flag.
1918 * Common for IPv4 and IPv6.
1920 * If the loopback copy fails (due to no memory) but we send the packet out
1921 * on the wire we return no failure. Only in the case we supress the wire
1922 * sending do we take the loopback failure into account.
1924 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
1925 * Those operations are performed on this packet in ip_xmit() and it would
1926 * be odd to do it twice for the same packet.
1929 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
1930 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
1931 uintptr_t *ixacookie)
1933 ill_t *ill = nce->nce_ill;
1934 int error = 0;
1937 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
1938 * had looped it back
1940 if (ixaflags & IXAF_LOOPBACK_COPY) {
1941 mblk_t *mp1;
1943 mp1 = copymsg(mp);
1944 if (mp1 == NULL) {
1945 /* Failed to deliver the loopback copy. */
1946 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1947 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1948 error = ENOBUFS;
1949 } else {
1950 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
1951 nolzid);
1956 * If TTL = 0 then only do the loopback to this host i.e. we are
1957 * done. We are also done if this was the
1958 * loopback interface since it is sufficient
1959 * to loopback one copy of a multicast packet.
1961 if (ixaflags & IXAF_IS_IPV4) {
1962 ipha_t *ipha = (ipha_t *)mp->b_rptr;
1964 if (ipha->ipha_ttl == 0) {
1965 ip_drop_output("multicast ipha_ttl not sent to wire",
1966 mp, ill);
1967 freemsg(mp);
1968 return (error);
1970 } else {
1971 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1973 if (ip6h->ip6_hops == 0) {
1974 ip_drop_output("multicast ipha_ttl not sent to wire",
1975 mp, ill);
1976 freemsg(mp);
1977 return (error);
1980 if (nce->nce_ill->ill_wq == NULL) {
1981 /* Loopback interface */
1982 ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
1983 freemsg(mp);
1984 return (error);
1987 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
1988 ixacookie));
1992 * Verify local connectivity. This check is called by ULP fusion code.
1993 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
1994 * the interface is brought down and back up. So we simply fail the local
1995 * process. The caller, TCP Fusion, should unfuse the connection.
1997 boolean_t
1998 ip_output_verify_local(ip_xmit_attr_t *ixa)
2000 ire_t *ire = ixa->ixa_ire;
2002 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2003 return (B_FALSE);
2005 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
/*
 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
 *
 * The caller must call ip_output_verify_local() first. This function handles
 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
 *
 * Returns the (possibly replaced) mblk, or NULL if a hook or inbound
 * IPsec policy consumed/rejected the packet.
 */
mblk_t *
ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
    boolean_t hooks_in, conn_t *peer_connp)
{
	ill_t		*ill = ixa->ixa_ire->ire_ill;
	ipha_t		*ipha = NULL;
	ip6_t		*ip6h = NULL;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	iaflags_t	ixaflags = ixa->ixa_flags;
	ip_recv_attr_t	iras;
	int		error;

	ASSERT(mp != NULL);

	if (ixaflags & IXAF_IS_IPV4) {
		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip4_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_out_event,
			    ipst->ips_ipv4firewall_loopback_out,
			    NULL, ill, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
			FW_HOOKS(ipst->ips_ip4_loopback_in_event,
			    ipst->ips_ipv4firewall_loopback_in,
			    ill, NULL, ipha, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
		    NULL, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
			    NULL, &iras);
		}
	} else {
		ip6h = (ip6_t *)mp->b_rptr;

		/*
		 * If a callback is enabled then we need to know the
		 * source and destination zoneids for the packet. We already
		 * have those handy.
		 */
		if (ipst->ips_ip6_observe.he_interested) {
			zoneid_t szone, dzone;
			zoneid_t stackzoneid;

			stackzoneid = netstackid_to_zoneid(
			    ipst->ips_netstack->netstack_stackid);

			if (stackzoneid == GLOBAL_ZONEID) {
				/* Shared-IP zone */
				dzone = ixa->ixa_ire->ire_zoneid;
				szone = ixa->ixa_zoneid;
			} else {
				szone = dzone = stackzoneid;
			}
			ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
			    ipst);
		}
		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* FW_HOOKS: LOOPBACK_OUT */
		if (hooks_out) {
			DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
			    ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
			    ipst->ips_ipv6firewall_loopback_out,
			    NULL, ill, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		/* FW_HOOKS: LOOPBACK_IN */
		if (hooks_in) {
			DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
			    ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
			FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
			    ipst->ips_ipv6firewall_loopback_in,
			    ill, NULL, ip6h, mp, mp, 0, ipst, error);
			DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
		}
		if (mp == NULL)
			return (NULL);

		DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
		    ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
		    ip6h, int, 1);

		/* Inbound IPsec policies */
		if (peer_connp != NULL) {
			/* Map ixa to ira including IPsec policies. */
			ipsec_out_to_in(ixa, ill, &iras);
			mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
			    ip6h, &iras);
		}
	}

	if (mp == NULL) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards", NULL, ill);
	}

	return (mp);
}