Merge commit '80d5689f5d4588adc071138e25e9d0d5252d9b55'
[unleashed.git] / kernel / net / ip / ip6_output.c
blobeeeaaf77ae02d141ad36f43eb84cd8d703f1f005
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
27 /* Copyright (c) 1990 Mentat Inc. */
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsubr.h>
32 #include <sys/dlpi.h>
33 #include <sys/strsun.h>
34 #include <sys/zone.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/cmn_err.h>
38 #include <sys/debug.h>
39 #include <sys/atomic.h>
41 #include <sys/systm.h>
42 #include <sys/param.h>
43 #include <sys/kmem.h>
44 #include <sys/sdt.h>
45 #include <sys/socket.h>
46 #include <sys/mac.h>
47 #include <net/if.h>
48 #include <net/if_arp.h>
49 #include <net/route.h>
50 #include <sys/sockio.h>
51 #include <netinet/in.h>
52 #include <net/if_dl.h>
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/arp.h>
59 #include <inet/snmpcom.h>
60 #include <inet/kstatcom.h>
62 #include <netinet/igmp_var.h>
63 #include <netinet/ip6.h>
64 #include <netinet/icmp6.h>
65 #include <netinet/sctp.h>
67 #include <inet/ip.h>
68 #include <inet/ip_impl.h>
69 #include <inet/ip6.h>
70 #include <inet/ip6_asp.h>
71 #include <inet/tcp.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip_if.h>
74 #include <inet/ip_ire.h>
75 #include <inet/ip_ftable.h>
76 #include <inet/ip_rts.h>
77 #include <inet/optcom.h>
78 #include <inet/ip_ndp.h>
79 #include <inet/ip_listutils.h>
80 #include <netinet/igmp.h>
81 #include <netinet/ip_mroute.h>
82 #include <inet/ipp_common.h>
84 #include <net/pfkeyv2.h>
85 #include <inet/sadb.h>
86 #include <inet/ipsec_impl.h>
87 #include <inet/ipdrop.h>
88 #include <inet/ip_netinfo.h>
90 #include <sys/pattr.h>
91 #include <inet/ipclassifier.h>
92 #include <inet/sctp_ip.h>
93 #include <inet/sctp/sctp_impl.h>
94 #include <inet/udp_impl.h>
95 #include <sys/sunddi.h>
#ifdef	DEBUG
/*
 * Debug-only switch defined outside this file.  When it is set, the checksum
 * routines below leave the SCTP checksum field zeroed instead of calling
 * sctp_cksum().
 */
extern boolean_t skip_sctp_cksum;
#endif
102 ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa)
104 ip6_t *ip6h;
105 in6_addr_t firsthop; /* In IP header */
106 in6_addr_t dst; /* End of source route, or ip6_dst if none */
107 ire_t *ire;
108 in6_addr_t setsrc;
109 int error;
110 ill_t *ill = NULL;
111 dce_t *dce = NULL;
112 nce_t *nce;
113 iaflags_t ixaflags = ixa->ixa_flags;
114 ip_stack_t *ipst = ixa->ixa_ipst;
115 uint8_t *nexthdrp;
116 boolean_t repeat = B_FALSE;
117 uint_t ifindex;
118 int64_t now;
120 ip6h = (ip6_t *)mp->b_rptr;
121 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
123 ASSERT(ixa->ixa_nce == NULL);
125 ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
126 ASSERT(ixa->ixa_pktlen == msgdsize(mp));
127 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length,
128 &nexthdrp)) {
129 /* Malformed packet */
130 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
131 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
132 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
133 freemsg(mp);
134 return (EINVAL);
136 ixa->ixa_protocol = *nexthdrp;
139 * Assumes that source routed packets have already been massaged by
140 * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next
141 * hop in the source route. The final destination is used for IPsec
142 * policy and DCE lookup.
144 firsthop = ip6h->ip6_dst;
145 dst = ip_get_dst_v6(ip6h, mp, NULL);
147 repeat_ire:
148 error = 0;
149 setsrc = ipv6_all_zeros;
150 ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc,
151 &error);
152 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
153 if (error != 0) {
154 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
155 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
156 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
157 freemsg(mp);
158 goto done;
161 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
162 /* ire_ill might be NULL hence need to skip some code */
163 if (ixaflags & IXAF_SET_SOURCE)
164 ip6h->ip6_src = ipv6_loopback;
165 ixa->ixa_fragsize = IP_MAXPACKET;
166 ire->ire_ob_pkt_count++;
167 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
168 /* No dce yet; use default one */
169 error = (ire->ire_sendfn)(ire, mp, ip6h, ixa,
170 &ipst->ips_dce_default->dce_ident);
171 goto done;
174 /* Note that ip6_dst is only used for IRE_MULTICAST */
175 nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
176 if (nce == NULL) {
177 /* Allocation failure? */
178 ip_drop_output("ire_to_nce", mp, ill);
179 freemsg(mp);
180 error = ENOBUFS;
181 goto done;
183 if (nce->nce_is_condemned) {
184 nce_t *nce1;
186 nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE);
187 nce_refrele(nce);
188 if (nce1 == NULL) {
189 if (!repeat) {
190 /* Try finding a better IRE */
191 repeat = B_TRUE;
192 ire_refrele(ire);
193 goto repeat_ire;
195 /* Tried twice - drop packet */
196 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
197 ip_drop_output("No nce", mp, ill);
198 freemsg(mp);
199 error = ENOBUFS;
200 goto done;
202 nce = nce1;
205 ixa->ixa_postfragfn = ire->ire_postfragfn;
207 ASSERT(ixa->ixa_nce == NULL);
208 ixa->ixa_nce = nce;
211 * Check for a dce_t with a path mtu.
213 ifindex = 0;
214 if (IN6_IS_ADDR_LINKSCOPE(&dst))
215 ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex;
217 dce = dce_lookup_v6(&dst, ifindex, ipst, NULL);
218 ASSERT(dce != NULL);
220 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
221 ixa->ixa_fragsize = IPV6_MIN_MTU;
222 } else if (dce->dce_flags & DCEF_PMTU) {
224 * To avoid a periodic timer to increase the path MTU we
225 * look at dce_last_change_time each time we send a packet.
227 now = ddi_get_lbolt64();
228 if (TICK_TO_SEC(now) - dce->dce_last_change_time >
229 ipst->ips_ip_pathmtu_interval) {
231 * Older than 20 minutes. Drop the path MTU information.
233 mutex_enter(&dce->dce_lock);
234 dce->dce_flags &= ~DCEF_PMTU;
235 dce->dce_last_change_time = TICK_TO_SEC(now);
236 mutex_exit(&dce->dce_lock);
237 dce_increment_generation(dce);
238 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
239 } else {
240 uint_t fragsize;
242 fragsize = ip_get_base_mtu(nce->nce_ill, ire);
243 if (fragsize > dce->dce_pmtu)
244 fragsize = dce->dce_pmtu;
245 ixa->ixa_fragsize = fragsize;
247 } else {
248 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
252 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
253 * interface for source address selection.
255 ill = ire_nexthop_ill(ire);
257 if (ixaflags & IXAF_SET_SOURCE) {
258 in6_addr_t src;
261 * We use the final destination to get
262 * correct selection for source routed packets
265 /* If unreachable we have no ill but need some source */
266 if (ill == NULL) {
267 src = ipv6_loopback;
268 error = 0;
269 } else {
270 error = ip_select_source_v6(ill, &setsrc, &dst,
271 ixa->ixa_zoneid, ipst, B_FALSE,
272 ixa->ixa_src_preferences, &src, NULL, NULL);
274 if (error != 0) {
275 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
276 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
277 ip_drop_output("ipIfStatsOutDiscards - no source",
278 mp, ill);
279 freemsg(mp);
280 goto done;
282 ip6h->ip6_src = src;
283 } else if (ixaflags & IXAF_VERIFY_SOURCE) {
284 /* Check if the IP source is assigned to the host. */
285 if (!ip_verify_src(mp, ixa, NULL)) {
286 /* Don't send a packet with a source that isn't ours */
287 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
288 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
289 ip_drop_output("ipIfStatsOutDiscards - invalid source",
290 mp, ill);
291 freemsg(mp);
292 error = EADDRNOTAVAIL;
293 goto done;
298 * Check against global IPsec policy to set the AH/ESP attributes.
299 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
301 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
302 ASSERT(ixa->ixa_ipsec_policy == NULL);
303 mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa);
304 if (mp == NULL) {
305 /* MIB and ip_drop_packet already done */
306 return (EHOSTUNREACH); /* IPsec policy failure */
310 if (ill != NULL) {
311 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
312 } else {
313 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
317 * We update the statistics on the most specific IRE i.e., the first
318 * one we found.
319 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
320 * can only count the use prior to fragmentation. However the MIB
321 * counters on the ill will be incremented in post fragmentation.
323 ire->ire_ob_pkt_count++;
326 * Based on ire_type and ire_flags call one of:
327 * ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK
328 * ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACHOLE
329 * ire_send_multicast_v6 - for IRE_MULTICAST
330 * ire_send_wire_v6 - for the rest.
332 error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident);
333 done:
334 ire_refrele(ire);
335 if (dce != NULL)
336 dce_refrele(dce);
337 if (ill != NULL)
338 ill_refrele(ill);
339 if (ixa->ixa_nce != NULL)
340 nce_refrele(ixa->ixa_nce);
341 ixa->ixa_nce = NULL;
342 return (error);
/*
 * ire_sendfn() functions.
 * These functions use the following xmit_attr:
 *  - ixa_fragsize - read to determine whether or not to fragment
 *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
 *  - ixa_ipsec_* are used inside IPsec
 *  - IXAF_LOOPBACK_COPY - for multicast
 */
356 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
358 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
360 /* ARGSUSED4 */
362 ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
363 ip_xmit_attr_t *ixa, uint32_t *identp)
365 ip6_t *ip6h = (ip6_t *)iph_arg;
366 ip_stack_t *ipst = ixa->ixa_ipst;
367 ill_t *ill = ire->ire_ill;
368 ip_recv_attr_t iras; /* NOTE: No bzero for performance */
369 uint_t pktlen = ixa->ixa_pktlen;
372 * No fragmentation, no nce, and no application of IPsec.
375 * Note different order between IP provider and FW_HOOKS than in
376 * send_wire case.
380 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the
381 * send probe, but not the receive probe.
383 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
384 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
385 int, 1);
387 DTRACE_PROBE4(ip6__loopback__out__start,
388 ill_t *, NULL, ill_t *, ill,
389 ip6_t *, ip6h, mblk_t *, mp);
391 if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) {
392 int error;
394 FW_HOOKS(ipst->ips_ip6_loopback_out_event,
395 ipst->ips_ipv6firewall_loopback_out,
396 NULL, ill, ip6h, mp, mp, 0, ipst, error);
398 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
399 if (mp == NULL)
400 return (error);
403 * Even if the destination was changed by the filter we use the
404 * forwarding decision that was made based on the address
405 * in ip_output/ip_set_destination.
407 /* Length could be different */
408 ip6h = (ip6_t *)mp->b_rptr;
409 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
413 * If a callback is enabled then we need to know the
414 * source and destination zoneids for the packet. We already
415 * have those handy.
417 if (ipst->ips_ip6_observe.he_interested) {
418 zoneid_t szone, dzone;
419 zoneid_t stackzoneid;
421 stackzoneid = netstackid_to_zoneid(
422 ipst->ips_netstack->netstack_stackid);
424 if (stackzoneid == GLOBAL_ZONEID) {
425 /* Shared-IP zone */
426 dzone = ire->ire_zoneid;
427 szone = ixa->ixa_zoneid;
428 } else {
429 szone = dzone = stackzoneid;
431 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
434 /* Handle lo0 stats */
435 ipst->ips_loopback_packets++;
438 * Update output mib stats. Note that we can't move into the icmp
439 * sender (icmp_output etc) since they don't know the ill and the
440 * stats are per ill.
442 if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
443 icmp6_t *icmp6;
445 icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
446 icmp_update_out_mib_v6(ill, icmp6);
449 DTRACE_PROBE4(ip6__loopback__in__start,
450 ill_t *, ill, ill_t *, NULL,
451 ip6_t *, ip6h, mblk_t *, mp);
453 if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) {
454 int error;
456 FW_HOOKS(ipst->ips_ip6_loopback_in_event,
457 ipst->ips_ipv6firewall_loopback_in,
458 ill, NULL, ip6h, mp, mp, 0, ipst, error);
460 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
461 if (mp == NULL)
462 return (error);
465 * Even if the destination was changed by the filter we use the
466 * forwarding decision that was made based on the address
467 * in ip_output/ip_set_destination.
469 /* Length could be different */
470 ip6h = (ip6_t *)mp->b_rptr;
471 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
474 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
475 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
476 int, 1);
478 /* Map ixa to ira including IPsec policies */
479 ipsec_out_to_in(ixa, ill, &iras);
480 iras.ira_pktlen = pktlen;
482 ire->ire_ib_pkt_count++;
483 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
484 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
486 /* Destined to ire_zoneid - use that for fanout */
487 iras.ira_zoneid = ire->ire_zoneid;
489 ip_fanout_v6(mp, ip6h, &iras);
491 /* We moved any IPsec refs from ixa to iras */
492 ira_cleanup(&iras, B_FALSE);
493 return (0);
497 * ire_sendfn for IRE_MULTICAST
499 * Note that we do path MTU discovery by default for IPv6 multicast. But
500 * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY
501 * only connected sockets get this by default.
504 ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
505 ip_xmit_attr_t *ixa, uint32_t *identp)
507 ip6_t *ip6h = (ip6_t *)iph_arg;
508 ip_stack_t *ipst = ixa->ixa_ipst;
509 ill_t *ill = ire->ire_ill;
510 iaflags_t ixaflags = ixa->ixa_flags;
513 * Check if anything in ip_input_v6 wants a copy of the transmitted
514 * packet (after IPsec and fragmentation)
516 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
517 * RSVP and the rsvp daemon is an example of a
518 * protocol and user level process that
519 * handles it's own routing. Hence, it uses the
520 * SO_DONTROUTE option to accomplish this.
521 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
522 * check whether there are any receivers for the group on the ill
523 * (ignoring the zoneid).
524 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
525 * any members in other shared-IP zones.
526 * If such members exist, then we indicate that the sending zone
527 * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
528 * behavior.
530 * When we loopback we skip hardware checksum to make sure loopback
531 * copy is checksumed.
533 * Note that ire_ill is the upper in the case of IPMP.
535 ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
536 if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
537 !(ixaflags & IXAF_DONTROUTE)) {
538 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
539 } else if (ixaflags & IXAF_MULTICAST_LOOP) {
541 * If this zone or any other zone has members then loopback
542 * a copy.
544 if (ill_hasmembers_v6(ill, &ip6h->ip6_dst))
545 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
546 } else if (ipst->ips_netstack->netstack_numzones > 1) {
548 * This zone should not have a copy. But there are some other
549 * zones which might have members.
551 if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
552 ixa->ixa_zoneid)) {
553 ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
554 ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
555 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
560 * Unless IPV6_HOPLIMIT already set a ttl, force the ttl to the
561 * IP_MULTICAST_TTL value
563 if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
564 ip6h->ip6_hops = ixa->ixa_multicast_ttl;
567 return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp));
571 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
573 /* ARGSUSED4 */
575 ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
576 ip_xmit_attr_t *ixa, uint32_t *identp)
578 ip6_t *ip6h = (ip6_t *)iph_arg;
579 ip_stack_t *ipst = ixa->ixa_ipst;
580 ill_t *ill;
581 ip_recv_attr_t iras;
582 boolean_t dummy;
584 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
586 if (ire->ire_type & IRE_NOROUTE) {
587 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
588 ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0,
589 RTA_DST, ipst);
592 if (ire->ire_flags & RTF_BLACKHOLE) {
593 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
594 freemsg(mp);
595 /* No error even for local senders - silent blackhole */
596 return (0);
598 ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
601 * We need an ill_t for the ip_recv_attr_t even though this packet
602 * was never received and icmp_unreachable doesn't currently use
603 * ira_ill.
605 ill = ill_lookup_on_name("lo0", B_FALSE,
606 !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
607 if (ill == NULL) {
608 freemsg(mp);
609 return (EHOSTUNREACH);
612 bzero(&iras, sizeof (iras));
613 /* Map ixa to ira including IPsec policies */
614 ipsec_out_to_in(ixa, ill, &iras);
616 icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras);
617 /* We moved any IPsec refs from ixa to iras */
618 ira_cleanup(&iras, B_FALSE);
620 ill_refrele(ill);
621 return (EHOSTUNREACH);
625 * Calculate a checksum ignoring any hardware capabilities
627 * Returns B_FALSE if the packet was too short for the checksum. Caller
628 * should free and do stats.
630 static boolean_t
631 ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa)
633 ip_stack_t *ipst = ixa->ixa_ipst;
634 uint_t pktlen = ixa->ixa_pktlen;
635 uint16_t *cksump;
636 uint32_t cksum;
637 uint8_t protocol = ixa->ixa_protocol;
638 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
640 #define iphs ((uint16_t *)ip6h)
642 /* Just in case it contained garbage */
643 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
646 * Calculate ULP checksum
648 if (protocol == IPPROTO_TCP) {
649 cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
650 cksum = IP_TCP_CSUM_COMP;
651 } else if (protocol == IPPROTO_UDP) {
652 cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
653 cksum = IP_UDP_CSUM_COMP;
654 } else if (protocol == IPPROTO_SCTP) {
655 sctp_hdr_t *sctph;
657 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
658 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
660 * Zero out the checksum field to ensure proper
661 * checksum calculation.
663 sctph->sh_chksum = 0;
664 #ifdef DEBUG
665 if (!skip_sctp_cksum)
666 #endif
667 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
668 return (B_TRUE);
669 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
671 * icmp has placed length and routing
672 * header adjustment in the checksum field.
674 cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
675 ixa->ixa_raw_cksum_offset);
676 cksum = htons(protocol);
677 } else if (protocol == IPPROTO_ICMPV6) {
678 cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
679 cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */
680 } else {
681 return (B_TRUE);
684 /* ULP puts the checksum field is in the first mblk */
685 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
688 * We accumulate the pseudo header checksum in cksum.
689 * This is pretty hairy code, so watch close. One
690 * thing to keep in mind is that UDP and TCP have
691 * stored their respective datagram lengths in their
692 * checksum fields. This lines things up real nice.
694 cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
695 iphs[8] + iphs[9] + iphs[10] + iphs[11] +
696 iphs[12] + iphs[13] + iphs[14] + iphs[15] +
697 iphs[16] + iphs[17] + iphs[18] + iphs[19];
698 cksum = IP_CSUM(mp, ip_hdr_length, cksum);
701 * For UDP/IPv6 a zero UDP checksum is not allowed.
702 * Change to 0xffff
704 if (protocol == IPPROTO_UDP && cksum == 0)
705 *cksump = ~cksum;
706 else
707 *cksump = cksum;
709 IP6_STAT(ipst, ip6_out_sw_cksum);
710 IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen);
712 /* No IP header checksum for IPv6 */
714 return (B_TRUE);
715 #undef iphs
/*
 * There are drivers that can't do partial checksum for ICMPv6.  While this
 * is non-zero, ip_output_cksum_v6() never requests partial hardware checksum
 * for ICMPv6 packets and computes the checksum in software instead.
 */
int nxge_cksum_workaround = 1;
722 * Calculate the ULP checksum - try to use hardware.
723 * In the case of multicast the IXAF_NO_HW_CKSUM is set in which case we use
724 * software.
726 * Returns B_FALSE if the packet was too short for the checksum. Caller
727 * should free and do stats.
729 static boolean_t
730 ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h,
731 ip_xmit_attr_t *ixa, ill_t *ill)
733 uint_t pktlen = ixa->ixa_pktlen;
734 uint16_t *cksump;
735 uint16_t hck_flags;
736 uint32_t cksum;
737 uint8_t protocol = ixa->ixa_protocol;
738 uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
740 #define iphs ((uint16_t *)ip6h)
742 if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
743 !dohwcksum) {
744 return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
748 * Calculate ULP checksum. Note that we don't use cksump and cksum
749 * if the ill has FULL support.
751 if (protocol == IPPROTO_TCP) {
752 cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
753 cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */
754 } else if (protocol == IPPROTO_UDP) {
755 cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
756 cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */
757 } else if (protocol == IPPROTO_SCTP) {
758 sctp_hdr_t *sctph;
760 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
761 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
763 * Zero out the checksum field to ensure proper
764 * checksum calculation.
766 sctph->sh_chksum = 0;
767 #ifdef DEBUG
768 if (!skip_sctp_cksum)
769 #endif
770 sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
771 goto ip_hdr_cksum;
772 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
774 * icmp has placed length and routing
775 * header adjustment in the checksum field.
777 cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
778 ixa->ixa_raw_cksum_offset);
779 cksum = htons(protocol);
780 } else if (protocol == IPPROTO_ICMPV6) {
781 cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
782 cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */
783 } else {
784 ip_hdr_cksum:
785 /* No IP header checksum for IPv6 */
786 return (B_TRUE);
789 /* ULP puts the checksum field is in the first mblk */
790 ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
793 * Underlying interface supports hardware checksum offload for
794 * the payload; leave the payload checksum for the hardware to
795 * calculate. N.B: We only need to set up checksum info on the
796 * first mblk.
798 hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
800 DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
801 if (hck_flags & HCKSUM_INET_FULL_V6) {
803 * Hardware calculates pseudo-header, header and the
804 * payload checksums, so clear the checksum field in
805 * the protocol header.
807 *cksump = 0;
808 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
809 return (B_TRUE);
811 if (((hck_flags) & HCKSUM_INET_PARTIAL) &&
812 (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) {
814 * Partial checksum offload has been enabled. Fill
815 * the checksum field in the protocol header with the
816 * pseudo-header checksum value.
818 * We accumulate the pseudo header checksum in cksum.
819 * This is pretty hairy code, so watch close. One
820 * thing to keep in mind is that UDP and TCP have
821 * stored their respective datagram lengths in their
822 * checksum fields. This lines things up real nice.
824 cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
825 iphs[8] + iphs[9] + iphs[10] + iphs[11] +
826 iphs[12] + iphs[13] + iphs[14] + iphs[15] +
827 iphs[16] + iphs[17] + iphs[18] + iphs[19];
828 cksum += *(cksump);
829 cksum = (cksum & 0xFFFF) + (cksum >> 16);
830 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
833 * Offsets are relative to beginning of IP header.
835 DB_CKSUMSTART(mp) = ip_hdr_length;
836 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h;
837 DB_CKSUMEND(mp) = pktlen;
838 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
839 return (B_TRUE);
841 /* Hardware capabilities include neither full nor partial IPv6 */
842 return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
843 #undef iphs
847 * ire_sendfn for offlink and onlink destinations.
848 * Also called from the multicast send function.
850 * Assumes that the caller has a hold on the ire.
852 * This function doesn't care if the IRE just became condemned since that
853 * can happen at any time.
855 /* ARGSUSED */
857 ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
858 ip_xmit_attr_t *ixa, uint32_t *identp)
860 ip_stack_t *ipst = ixa->ixa_ipst;
861 ip6_t *ip6h = (ip6_t *)iph_arg;
862 iaflags_t ixaflags = ixa->ixa_flags;
863 ill_t *ill;
864 uint32_t pktlen = ixa->ixa_pktlen;
866 ASSERT(ixa->ixa_nce != NULL);
867 ill = ixa->ixa_nce->nce_ill;
870 * Update output mib stats. Note that we can't move into the icmp
871 * sender (icmp_output etc) since they don't know the ill and the
872 * stats are per ill.
874 * With IPMP we record the stats on the upper ill.
876 if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
877 icmp6_t *icmp6;
879 icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
880 icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill,
881 icmp6);
884 if (ixaflags & IXAF_DONTROUTE)
885 ip6h->ip6_hops = 1;
888 * This might set b_band, thus the IPsec and fragmentation
889 * code in IP ensures that b_band is updated in the first mblk.
891 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
892 /* ip_process translates an IS_UNDER_IPMP */
893 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
894 if (mp == NULL) {
895 /* ip_drop_packet and MIB done */
896 return (0); /* Might just be delayed */
900 if (pktlen > ixa->ixa_fragsize ||
901 (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) {
902 uint32_t ident;
904 if (ixaflags & IXAF_IPSEC_SECURE)
905 pktlen += ipsec_out_extra_length(ixa);
907 if (pktlen > IP_MAXPACKET)
908 return (EMSGSIZE);
910 if (ixaflags & IXAF_SET_ULP_CKSUM) {
912 * Compute ULP checksum using software
914 if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) {
915 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
916 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
917 freemsg(mp);
918 return (EINVAL);
920 /* Avoid checksum again below if we only add fraghdr */
921 ixaflags &= ~IXAF_SET_ULP_CKSUM;
925 * If we need a fragment header, pick the ident and insert
926 * the header before IPsec to we have a place to store
927 * the ident value.
929 if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) ||
930 pktlen > ixa->ixa_fragsize) {
932 * If this packet would generate a icmp_frag_needed
933 * message, we need to handle it before we do the IPsec
934 * processing. Otherwise, we need to strip the IPsec
935 * headers before we send up the message to the ULPs
936 * which becomes messy and difficult.
938 if ((pktlen > ixa->ixa_fragsize) &&
939 (ixaflags & IXAF_DONTFRAG)) {
940 /* Generate ICMP and return error */
941 ip_recv_attr_t iras;
943 DTRACE_PROBE4(ip6__fragsize__fail,
944 uint_t, pktlen, uint_t, ixa->ixa_fragsize,
945 uint_t, ixa->ixa_pktlen,
946 uint_t, ixa->ixa_pmtu);
948 bzero(&iras, sizeof (iras));
949 /* Map ixa to ira including IPsec policies */
950 ipsec_out_to_in(ixa, ill, &iras);
952 ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill);
953 icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE,
954 &iras);
955 /* We moved any IPsec refs from ixa to iras */
956 ira_cleanup(&iras, B_FALSE);
957 return (EMSGSIZE);
959 DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen,
960 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
961 uint_t, ixa->ixa_pmtu);
963 * Assign an ident value for this packet. There could
964 * be other threads targeting the same destination, so
965 * we have to arrange for a atomic increment.
966 * Normally ixa_extra_ident is 0, but in the case of
967 * LSO it will be the number of TCP segments that the
968 * driver/hardware will extraly construct.
970 ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident +
972 ixa->ixa_ident = ident; /* In case we do IPsec */
974 if (ixaflags & IXAF_IPSEC_SECURE) {
976 * Pass in sufficient information so that
977 * IPsec can determine whether to fragment, and
978 * which function to call after fragmentation.
980 return (ipsec_out_process(mp, ixa));
983 mp = ip_fraghdr_add_v6(mp, ident, ixa);
984 if (mp == NULL) {
985 /* MIB and ip_drop_output already done */
986 return (ENOMEM);
988 ASSERT(pktlen == ixa->ixa_pktlen);
989 pktlen += sizeof (ip6_frag_t);
991 if (pktlen > ixa->ixa_fragsize) {
992 return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags,
993 pktlen, ixa->ixa_fragsize,
994 ixa->ixa_xmit_hint, ixa->ixa_zoneid,
995 ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
996 &ixa->ixa_cookie));
999 if (ixaflags & IXAF_SET_ULP_CKSUM) {
1000 /* Compute ULP checksum and IP header checksum */
1001 /* An IS_UNDER_IPMP ill is ok here */
1002 if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) {
1003 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1004 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1005 freemsg(mp);
1006 return (EINVAL);
1009 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
1010 pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
1011 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));